Wikipedia Diskussion:WikiProjekt Georeferenzierung/Hauptseite/Wikipedia-World/news

The CSV file linked from this template is, as of 2006-12-29, encoded in a curious way. First, its original content was encoded in UTF-8: secondly, it was then decoded to Unicode as if the resulting byte-stream was encoded in Windows-1252: finally, the resulting Unicode was re-encoded as UTF-8. Finally the file was compressed in .zip format.

After unzipping, applying the following UNIX command-line filter program in Python will remove these layers of encoding, and output a valid UTF-8 encoding of the original Unicode content of the CSV file. I hope this is useful.

import sys, string

def explodify(ch):
    if ord(ch) in [0x81, 0x8d, 0x8f, 0x90, 0x9d]:
       return chr(ord(ch))
    return chr(ord(ch.encode("windows-1252")))

# Stuff has been multiply encoded: remove these layers
while 1:
    text = sys.stdin.readline()
    if not text:
      break
    # Decode as UTF-8
    text = text.decode("utf-8")
    # Rip off a layer of Windows-1252 -> Unicode
    text = string.join([explodify(ch) for ch in text], "")
    # What is left is a valid UTF-8 string: which is what we want
    sys.stdout.write(text)

Note: even after running this decoder, a considerable number of HTML entity escapes, both numeric and otherwise, and %XX urlencode()-style hex escapes remain in the source data fields. However, these are easily dealt with without any detective work on the encoding of the file, and can be dealt with after CSV decoding of the valid UTF-8 data produced by this filter.

-- The Anome 19:16, 29. Dez. 2006 (CET)Beantworten

Database- Structure

Bearbeiten
CREATE TABLE IF NOT EXISTS `pub_C_geo_id` (
 `ID` int(9) NOT NULL AUTO_INCREMENT,
 `lang` varchar(10) COLLATE utf8_bin NOT NULL,
 `Titel` varchar(180) COLLATE utf8_bin NOT NULL,
 `Titel_en` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_de` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_es` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_fr` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_it` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_ja` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_nl` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_pl` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_pt` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_ru` varchar(200) COLLATE utf8_bin DEFAULT NULL,
 `Titel_sv` varchar(90) COLLATE utf8_bin DEFAULT NULL,
 `Titel_fi` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_no` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_eo` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_sk` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_da` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_cs` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_tr` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_zh` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_ca` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `Titel_is` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 `lat` double NOT NULL DEFAULT '0',
 `lon` double NOT NULL DEFAULT '0',
 `type` varchar(10) COLLATE utf8_bin NOT NULL,
 `pop` double NOT NULL DEFAULT '0',
 `Height` double NOT NULL DEFAULT '0',
 `Country` varchar(10) COLLATE utf8_bin NOT NULL,
 `Subregion` varchar(10) COLLATE utf8_bin NOT NULL,
 `Scale` varchar(10) COLLATE utf8_bin NOT NULL,
 `dim` varchar(10) COLLATE utf8_bin NOT NULL,
 `psize` double NOT NULL,
 `style` varchar(12) COLLATE utf8_bin NOT NULL,
 `t` varchar(10) COLLATE utf8_bin NOT NULL,
 `image` varchar(160) COLLATE utf8_bin NOT NULL,
 `imagejpg` varchar(160) COLLATE utf8_bin NOT NULL,
 `name` varchar(160) COLLATE utf8_bin DEFAULT NULL,
 PRIMARY KEY (`ID`),
 UNIQUE KEY `Titel_en` (`Titel_en`),
 UNIQUE KEY `Titel_es` (`Titel_es`),
 UNIQUE KEY `Titel_fr` (`Titel_fr`),
 UNIQUE KEY `Titel_it` (`Titel_it`),
 UNIQUE KEY `Titel_ja` (`Titel_ja`),
 UNIQUE KEY `Titel_nl` (`Titel_nl`),
 UNIQUE KEY `Titel_pl` (`Titel_pl`),
 UNIQUE KEY `Titel_pt` (`Titel_pt`),
 UNIQUE KEY `Titel_ru` (`Titel_ru`),
 UNIQUE KEY `Titel_sv` (`Titel_sv`),
 UNIQUE KEY `Titel_fi` (`Titel_fi`),
 UNIQUE KEY `Titel_no` (`Titel_no`),
 UNIQUE KEY `Titel_eo` (`Titel_eo`),
 UNIQUE KEY `Titel_sk` (`Titel_sk`),
 UNIQUE KEY `Titel_da` (`Titel_da`),
 UNIQUE KEY `Titel_cs` (`Titel_cs`),
 UNIQUE KEY `Titel_de` (`Titel_de`,`name`),
 UNIQUE KEY `Titel_tr` (`Titel_tr`),
 UNIQUE KEY `Titel_zh` (`Titel_zh`),
 UNIQUE KEY `Titel_is` (`Titel_is`),
 UNIQUE KEY `Titel_ca` (`Titel_ca`),
 KEY `Pop` (`pop`),
 KEY `type` (`type`),
 KEY `Character` (`psize`),
 KEY `lang` (`lang`),
 KEY `lon` (`lon`),
 KEY `lat` (`lat`),
 KEY `image` (`image`),
 KEY `Country` (`Country`),
 KEY `Subregion` (`Subregion`),
 KEY `Titel` (`Titel`),
 KEY `imagejpg` (`imagejpg`),
 KEY `name` (`name`),
 KEY `dim` (`dim`)
) ENGINE=InnoDB  DEFAULT CHARSET=utf8 COLLATE=utf8_bin  ; 

--Kolossos 19:09, 10. Feb. 2010 (CET)Beantworten

Database Structure postGIS

Bearbeiten
CREATE TABLE wp_coords_red0
(
  lang character varying(10),
  "Titel" character varying(255),
  lat double precision,
  lon double precision,
  types character varying(50),
  pop character varying(50),
  "Height" character varying(50),
  "Country" character varying(10),
  "Subregion" character varying(255),
  "Scale" character varying(10),
  dim character varying(30),
  psize double precision,
  style character varying(50),
  t character varying(10),
  image character varying(255),
  imagejpg character varying(255),
  name character varying(255),
  page_id double precision,
  "T_wikidata" character varying(255),
  "T_aa" character varying(255),
  "T_ab" character varying(255),
  "T_ace" character varying(255),
  "T_af" character varying(255),
  "T_ak" character varying(255),
  "T_als" character varying(255),
  "T_am" character varying(255),
  "T_an" character varying(255),
  "T_ang" character varying(255),
  "T_ar" character varying(255),
  "T_arc" character varying(255),
  "T_arz" character varying(255),
  "T_ast" character varying(255),
  "T_av" character varying(255),
  "T_ay" character varying(255),
  "T_az" character varying(255),
  "T_ba" character varying(255),
  "T_bar" character varying(255),
  "T_bat-smg" character varying(255),
  "T_bcl" character varying(255),
  "T_be" character varying(255),
  "T_be-x-old" character varying(255),
  "T_bg" character varying(255),
  "T_bh" character varying(255),
  "T_bi" character varying(255),
  "T_bm" character varying(255),
  "T_bn" character varying(255),
  "T_bo" character varying(255),
  "T_bpy" character varying(255),
  "T_br" character varying(255),
  "T_bs" character varying(255),
  "T_bug" character varying(255),
  "T_bxr" character varying(255),
  "T_ca" character varying(255),
  "T_cbk-zam" character varying(255),
  "T_cdo" character varying(255),
  "T_ce" character varying(255),
  "T_ceb" character varying(255),
  "T_ch" character varying(255),
  "T_cho" character varying(255),
  "T_chr" character varying(255),
  "T_chy" character varying(255),
  "T_ckb" character varying(255),
  "T_co" character varying(255),
  "T_cr" character varying(255),
  "T_crh" character varying(255),
  "T_cs" character varying(255),
  "T_csb" character varying(255),
  "T_cu" character varying(255),
  "T_cv" character varying(255),
  "T_cy" character varying(255),
  "T_da" character varying(255),
  "T_de" character varying(255),
  "T_diq" character varying(255),
  "T_dsb" character varying(255),
  "T_dv" character varying(255),
  "T_dz" character varying(255),
  "T_ee" character varying(255),
  "T_el" character varying(255),
  "T_eml" character varying(255),
  "T_en" character varying(255),
  "T_en-simple" character varying(255),
  "T_eo" character varying(255),
  "T_es" character varying(255),
  "T_et" character varying(255),
  "T_eu" character varying(255),
  "T_ext" character varying(255),
  "T_fa" character varying(255),
  "T_ff" character varying(255),
  "T_fi" character varying(255),
  "T_fiu-vro" character varying(255),
  "T_fj" character varying(255),
  "T_fo" character varying(255),
  "T_fr" character varying(255),
  "T_frp" character varying(255),
  "T_fur" character varying(255),
  "T_fy" character varying(255),
  "T_ga" character varying(255),
  "T_gan" character varying(255),
  "T_gd" character varying(255),
  "T_gl" character varying(255),
  "T_glk" character varying(255),
  "T_gn" character varying(255),
  "T_got" character varying(255),
  "T_gu" character varying(255),
  "T_gv" character varying(255),
  "T_ha" character varying(255),
  "T_hak" character varying(255),
  "T_haw" character varying(255),
  "T_he" character varying(255),
  "T_hi" character varying(255),
  "T_hif" character varying(255),
  "T_ho" character varying(255),
  "T_hr" character varying(255),
  "T_hsb" character varying(255),
  "T_ht" character varying(255),
  "T_hu" character varying(255),
  "T_hy" character varying(255),
  "T_hz" character varying(255),
  "T_ia" character varying(255),
  "T_id" character varying(255),
  "T_ie" character varying(255),
  "T_ig" character varying(255),
  "T_ii" character varying(255),
  "T_ik" character varying(255),
  "T_ilo" character varying(255),
  "T_io" character varying(255),
  "T_is" character varying(255),
  "T_it" character varying(255),
  "T_iu" character varying(255),
  "T_ja" character varying(255),
  "T_jbo" character varying(255),
  "T_jv" character varying(255),
  "T_ka" character varying(255),
  "T_kaa" character varying(255),
  "T_kab" character varying(255),
  "T_kg" character varying(255),
  "T_ki" character varying(255),
  "T_kj" character varying(255),
  "T_kk" character varying(255),
  "T_kl" character varying(255),
  "T_km" character varying(255),
  "T_kn" character varying(255),
  "T_ko" character varying(255),
  "T_kr" character varying(255),
  "T_ks" character varying(255),
  "T_ksh" character varying(255),
  "T_ku" character varying(255),
  "T_kv" character varying(255),
  "T_kw" character varying(255),
  "T_ky" character varying(255),
  "T_la" character varying(255),
  "T_lad" character varying(255),
  "T_lb" character varying(255),
  "T_lbe" character varying(255),
  "T_lg" character varying(255),
  "T_li" character varying(255),
  "T_lij" character varying(255),
  "T_lmo" character varying(255),
  "T_ln" character varying(255),
  "T_lo" character varying(255),
  "T_lt" character varying(255),
  "T_lv" character varying(255),
  "T_map-bms" character varying(255),
  "T_mdf" character varying(255),
  "T_mg" character varying(255),
  "T_mh" character varying(255),
  "T_mhr" character varying(255),
  "T_mi" character varying(255),
  "T_mk" character varying(255),
  "T_ml" character varying(255),
  "T_mn" character varying(255),
  "T_mo" character varying(255),
  "T_mr" character varying(255),
  "T_ms" character varying(255),
  "T_mt" character varying(255),
  "T_mus" character varying(255),
  "T_mwl" character varying(255),
  "T_my" character varying(255),
  "T_myv" character varying(255),
  "T_mzn" character varying(255),
  "T_na" character varying(255),
  "T_nah" character varying(255),
  "T_nap" character varying(255),
  "T_nds" character varying(255),
  "T_nds-nl" character varying(255),
  "T_ne" character varying(255),
  "T_new" character varying(255),
  "T_ng" character varying(255),
  "T_nl" character varying(255),
  "T_nn" character varying(255),
  "T_no" character varying(255),
  "T_nostalgia" character varying(255),
  "T_nov" character varying(255),
  "T_nrm" character varying(255),
  "T_nv" character varying(255),
  "T_ny" character varying(255),
  "T_oc" character varying(255),
  "T_om" character varying(255),
  "T_or" character varying(255),
  "T_os" character varying(255),
  "T_pa" character varying(255),
  "T_pag" character varying(255),
  "T_pam" character varying(255),
  "T_pap" character varying(255),
  "T_pcd" character varying(255),
  "T_pdc" character varying(255),
  "T_pi" character varying(255),
  "T_pih" character varying(255),
  "T_pl" character varying(255),
  "T_pms" character varying(255),
  "T_pnb" character varying(255),
  "T_pnt" character varying(255),
  "T_ps" character varying(255),
  "T_pt" character varying(255),
  "T_qu" character varying(255),
  "T_rm" character varying(255),
  "T_rmy" character varying(255),
  "T_rn" character varying(255),
  "T_ro" character varying(255),
  "T_roa-rup" character varying(255),
  "T_roa-tara" character varying(255),
  "T_ru" character varying(255),
  "T_rw" character varying(255),
  "T_sa" character varying(255),
  "T_sah" character varying(255),
  "T_sc" character varying(255),
  "T_scn" character varying(255),
  "T_sco" character varying(255),
  "T_sd" character varying(255),
  "T_se" character varying(255),
  "T_sg" character varying(255),
  "T_sh" character varying(255),
  "T_si" character varying(255),
  "T_simple" character varying(255),
  "T_sk" character varying(255),
  "T_sl" character varying(255),
  "T_sm" character varying(255),
  "T_sn" character varying(255),
  "T_so" character varying(255),
  "T_sq" character varying(255),
  "T_sr" character varying(255),
  "T_srn" character varying(255),
  "T_ss" character varying(255),
  "T_st" character varying(255),
  "T_stq" character varying(255),
  "T_su" character varying(255),
  "T_sv" character varying(255),
  "T_sw" character varying(255),
  "T_szl" character varying(255),
  "T_ta" character varying(255),
  "T_te" character varying(255),
  "T_tet" character varying(255),
  "T_tg" character varying(255),
  "T_th" character varying(255),
  "T_ti" character varying(255),
  "T_tk" character varying(255),
  "T_tl" character varying(255),
  "T_tlh" character varying(255),
  "T_tn" character varying(255),
  "T_to" character varying(255),
  "T_tokipona" character varying(255),
  "T_tpi" character varying(255),
  "T_tr" character varying(255),
  "T_ts" character varying(255),
  "T_tt" character varying(255),
  "T_tum" character varying(255),
  "T_tw" character varying(255),
  "T_ty" character varying(255),
  "T_udm" character varying(255),
  "T_ug" character varying(255),
  "T_uk" character varying(255),
  "T_ur" character varying(255),
  "T_uz" character varying(255),
  "T_ve" character varying(255),
  "T_vec" character varying(255),
  "T_vi" character varying(255),
  "T_vls" character varying(255),
  "T_vo" character varying(255),
  "T_wa" character varying(255),
  "T_war" character varying(255),
  "T_wo" character varying(255),
  "T_wuu" character varying(255),
  "T_xal" character varying(255),
  "T_xh" character varying(255),
  "T_yi" character varying(255),
  "T_yo" character varying(255),
  "T_za" character varying(255),
  "T_zea" character varying(255),
  "T_zh" character varying(255),
  "T_zh-classical" character varying(255),
  "T_zh-min-nan" character varying(255),
  "T_zh-yue" character varying(255),
  "T_zu" character varying(255),
  the_geom geometry,
  arms character varying(255),
  instance character varying(40),
  classes integer[],
  superclasses integer[]
)
WITH (
  OIDS=FALSE
);
ALTER TABLE wp_coords_red0
  OWNER TO osm;

-- Index: "WD"

-- DROP INDEX "WD";

CREATE INDEX "WD"
  ON wp_coords_red0
  USING btree
  ("T_wikidata" COLLATE pg_catalog."default" );

-- Index: idx_the_geomnew0ds5n5bt

-- DROP INDEX idx_the_geomnew0ds5n5bt;

CREATE INDEX idx_the_geomnew0ds5n5bt
  ON wp_coords_red0
  USING gist
  (the_geom );

-- Index: klj

-- DROP INDEX klj;

CREATE INDEX klj
  ON wp_coords_red0
  USING gin
  (superclasses );



-- Daten einlesen aus entpackter Datei:
copy wp_coords_red0 from '/home/wp-world/new_red0';


a short description of each column
  • lang : For which language is the coordinate
  • "Titel": article name
  • lat : latitude in WGS84
  • lon : longitude
  • types : http://en.wikipedia.org/wiki/Wikipedia:GEO#type:T
  • pop : population
  • "Height" : height mostly unused
  • "Country" : http://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
  • "Subregion" : ISO_3166-2
  • "Scale" : mostly unused
  • dim : mostly unused dimension in meters
  • psize: very important to define relevance (Bytes of the article)
  • STYLE : mostly unused
  • t : unused
  • image : image-name of a random image
  • imagejpg : image-name of a random JPG-image
  • "name" : Object name if more than one object is in an article http://en.wikipedia.org/wiki/Wikipedia:GEO#Name
  • page_id : article-id in Wikipedia-DB unused
  • "T_aa" article names in 273 language via interwikilink
  • ...
  • the_geom : PostGIS geometry-index