├── .gitignore ├── tests ├── linktargets.txt ├── filter_pagelinks.test1expected.txt ├── filter_wikidata_geo_tags.test1expected.txt ├── filter_pagelinks.test1.txt ├── filter_langlinks.test1.txt ├── filter_langlinks.test1expected.txt ├── filter_wikidata_geo_tags.test1.txt └── run.sh ├── bin ├── mysqldump_to_csv.readme.txt ├── mysqldump_to_csv.LICENSE ├── filter_redirect.py ├── filter_langlinks.py ├── filter_wikidata_wb_items_per_site.py ├── filter_pagelinks.py ├── filter_page.py ├── filter_wikidata_page.py ├── filter_wikidata_geo_tags.py └── mysqldump_to_csv.py ├── config ├── languages.txt ├── wikidata_place_type_levels.csv └── wikidata_place_types.txt ├── lib └── languages.py ├── install_dependencies.sh ├── steps ├── report_database_size.sh ├── cleanup.sh ├── wikidata_download.sh ├── wikipedia_download.sh ├── wikipedia_import.sh ├── wikidata_import.sh ├── wikipedia_process.sh ├── latest_available_data.sh ├── output.sh ├── wikidata_sql2csv.sh ├── wikidata_api_fetch_placetypes.sh ├── wikidata_process.sh └── wikipedia_sql2csv.sh ├── complete_run.sh ├── .github └── workflows │ └── ci.yml ├── README.md ├── LICENSE └── wikidata_places.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /tests/linktargets.txt: -------------------------------------------------------------------------------- 1 | 11,title1 2 | 22,title2 3 | 33,"title3,with,comma" 4 | 44,title4 -------------------------------------------------------------------------------- /tests/filter_pagelinks.test1expected.txt: -------------------------------------------------------------------------------- 1 | title1,5 2 | title2,2 3 | "title3,with,comma",1 4 | title4,1 5 | -------------------------------------------------------------------------------- /tests/filter_wikidata_geo_tags.test1expected.txt: -------------------------------------------------------------------------------- 1 | 5009,25.13333,56.33333 2 | 5010,-34.35806,18.47194 3 | 5018,54.08333,13.38333 4 | 5020,48.76194,8.24083 5 | 5030,54.67639,13.43778 6 | 5034,55.9214,-3.53665 7 | -------------------------------------------------------------------------------- /tests/filter_pagelinks.test1.txt: -------------------------------------------------------------------------------- 1 | enwiki,0,11 2 | enwiki,0,11 3 | enwiki,0,11 4 | enwiki,0,22 5 | enwiki,0,22 6 | enwiki,0,33 7 | enwiki,0,11 8 | enwiki,0,11 9 | enwiki,0,44 10 | enwiki,1,44 11 | enwiki,0,55 -------------------------------------------------------------------------------- /tests/filter_langlinks.test1.txt: -------------------------------------------------------------------------------- 1 | 2074847,tr,Berlin dövlət kitabxanası 2 | 291145,tr,Berlin döyüşü (1945) 3 | 52637892,tr,Berlin hücumu (2016) 4 | 494808,tr,Berlin kafedralı 5 | 438617,tr,Berlin konqresi 6 | 1234,de,"Berlin, Berlin" -------------------------------------------------------------------------------- /tests/filter_langlinks.test1expected.txt: -------------------------------------------------------------------------------- 1 | Berlin_dövlət_kitabxanası,2074847,tr 2 | Berlin_döyüşü_(1945),291145,tr 3 | Berlin_hücumu_(2016),52637892,tr 4 | Berlin_kafedralı,494808,tr 5 | Berlin_konqresi,438617,tr 6 | "Berlin,_Berlin",1234,de 7 | -------------------------------------------------------------------------------- /bin/mysqldump_to_csv.readme.txt: -------------------------------------------------------------------------------- 1 | 
https://github.com/jamesmishra/mysqldump-to-csv 2 | 3 | * Added errors=surrogateescape to open(), otherwise the script threw UnicodeDecodeError for langlinks files 4 | * Use python3 in first line 5 | * Explicitly set escapechar for csv.writer 6 | * Don't print \x0 for NULL values, print '' instead. 7 | -------------------------------------------------------------------------------- /config/languages.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/List_of_Wikipedias 2 | ar 3 | bg 4 | ca 5 | cs 6 | da 7 | de 8 | en 9 | es 10 | eo 11 | eu 12 | fa 13 | fr 14 | ko 15 | hi 16 | hr 17 | id 18 | it 19 | he 20 | lt 21 | hu 22 | ms 23 | nl 24 | ja 25 | no 26 | pl 27 | pt 28 | kk 29 | ro 30 | ru 31 | sk 32 | sl 33 | sr 34 | fi 35 | sv 36 | tr 37 | uk 38 | vi 39 | war 40 | zh -------------------------------------------------------------------------------- /tests/filter_wikidata_geo_tags.test1.txt: -------------------------------------------------------------------------------- 1 | 158103,15923968,moon,1,29.63771000,111.17787000,,,,,,,NULL 2 | 158108,5009,earth,1,25.13333300,56.33333300,,,,,,,NULL 3 | 158109,5010,earth,1,-34.35805556,18.47194444,,,,,,,NULL 4 | 158112,5018,earth,1,54.08333333,13.38333333,,,,,,,NULL 5 | 158113,5020,earth,1,48.76194444,8.24083333,,,,,,,NULL 6 | 158120,5030,earth,1,54.67638889,13.43777778,,,,,,,NULL 7 | 158124,5034,earth,1,55.92140000,-3.53665000,,,,,,,NULL -------------------------------------------------------------------------------- /lib/languages.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class Languages: 4 | def get_languages(): 5 | if 'LANGUAGES' in os.environ: 6 | return os.environ['LANGUAGES'].split(',') 7 | 8 | with open('config/languages.txt', 'r') as file: 9 | languages = file.readlines() 10 | languages = map(lambda line: line.strip('\n'), languages) 11 | languages = filter(lambda line: not line.startswith('#'), languages) 12 | return list(languages) 13 | 14 | return [] 15 | -------------------------------------------------------------------------------- /install_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Tested on Ubuntu-24 5 | # 6 | 7 | sudo apt-get install -y postgresql-16 8 | sudo -u postgres createuser -s $USER 9 | 10 | # No significant performance increase above 250MB 11 | sudo -u postgres mkdir -p /etc/postgresql/16/main/conf.d/ 12 | echo " 13 | work_mem = 250MB 14 | " | sudo -u postgres tee /etc/postgresql/16/main/conf.d/wikipedia.conf 15 | 16 | sudo systemctl restart postgresql 17 | 18 | sudo apt-get install -y wget coreutils nodejs jq moreutils pigz 19 | sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential 20 | 21 | # https://wdtaxonomy.readthedocs.io/ 22 | sudo apt-get install -y nodejs 23 | node --version 24 | sudo npm install -g wikidata-taxonomy 25 | wdtaxonomy --version 26 | -------------------------------------------------------------------------------- /steps/report_database_size.sh: -------------------------------------------------------------------------------- 1 | cat < tests/linktargets.txt.gz 16 | cat tests/filter_pagelinks.test1.txt | bin/filter_pagelinks.py tests/linktargets.txt.gz > out.txt 17 | diff --brief out.txt tests/filter_pagelinks.test1expected.txt || exit 1 18 | rm -f tests/linktargets.txt.gz 19 | 20 | cat tests/filter_langlinks.test1.txt | bin/filter_langlinks.py >
out.txt 21 | diff --brief out.txt tests/filter_langlinks.test1expected.txt || exit 1 22 | 23 | cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py > out.txt 24 | diff --brief out.txt tests/filter_wikidata_geo_tags.test1expected.txt || exit 1 25 | 26 | rm -f out.txt -------------------------------------------------------------------------------- /bin/mysqldump_to_csv.LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 James Mishra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /bin/filter_redirect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `redirect` ( 6 | # `rd_from` int(8) unsigned NOT NULL DEFAULT 0, 7 | # `rd_namespace` int(11) NOT NULL DEFAULT 0, 8 | # `rd_title` varbinary(255) NOT NULL DEFAULT '', 9 | # `rd_interwiki` varbinary(32) DEFAULT NULL, 10 | # `rd_fragment` varbinary(255) DEFAULT NULL, 11 | 12 | Output to STDOUT: rd_from_page_id, rd_title 13 | 14 | Same for linktarget table 15 | # CREATE TABLE `linktarget` ( 16 | # `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 17 | # `lt_namespace` int(11) NOT NULL, 18 | # `lt_title` varbinary(255) NOT NULL, 19 | ''' 20 | 21 | import sys 22 | import csv 23 | 24 | reader = csv.reader(sys.stdin) 25 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 26 | 27 | for row in reader: 28 | # namespace: 0 are articles 29 | if (row[1] != '0'): 30 | continue 31 | 32 | title = row[2].replace('\r', '') 33 | if len(title) == 0: 34 | continue 35 | 36 | writer.writerow([row[0], title]) 37 | -------------------------------------------------------------------------------- /bin/filter_langlinks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `langlinks` ( 6 | # `ll_from` int(8) unsigned NOT NULL DEFAULT 0, 7 | # `ll_lang` varbinary(35) NOT NULL DEFAULT '', 8 | # `ll_title` varbinary(255) NOT NULL DEFAULT '', 9 | 10 | Output to STDOUT: ll_title, ll_from_page_id, ll_lang 11 | ''' 12 | 13 | import os 14 | import sys 15 | 16 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | sys.path.append(parent_dir) 18 | 19 | from lib.languages import 
Languages 20 | 21 | languages_set = set(Languages.get_languages()) 22 | 23 | 24 | # We don't need CSV parsing here because the first two columns never 25 | # contain commas. 26 | for line in sys.stdin: 27 | line = line.rstrip().replace('\r', '') 28 | 29 | columns = line.split(',', 2) 30 | 31 | # ll_lang, e.g. 'en' 32 | language = columns[1] 33 | if language not in languages_set: 34 | continue 35 | 36 | # The langlinks table contains titles with spaces, e.g. 'one (two)', while the pages and 37 | # pagelinkcount tables contain titles with underscores, e.g. 'one_(two)' 38 | title = columns[2].replace(' ', '_') 39 | 40 | print(','.join([title, columns[0], language])) 41 | -------------------------------------------------------------------------------- /complete_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Single script to do all processing from scratch. Run it, or 5 | # use it as a guide for running the individual steps. 6 | # 7 | # Example to add timestamps and create a logfile: 8 | # time ./complete_run.sh 2>&1 | ts -s "[%H:%M:%S]" | tee "$(date +"%Y%m%d").$$.log" 9 | 10 | 11 | ./install_dependencies.sh 12 | 13 | # checks https://mirror.clarkson.edu/wikimedia/enwiki/ 14 | # and https://mirror.clarkson.edu/wikimedia/wikidatawiki/ 15 | LATEST_DATE=$(./steps/latest_available_data.sh) # yyyymmdd 16 | 17 | export WIKIPEDIA_DATE=$LATEST_DATE 18 | export WIKIDATA_DATE=$LATEST_DATE 19 | export BUILDID=wikimedia_build_$(date +"%Y%m%d") 20 | export LANGUAGES=$(grep -v '^#' config/languages.txt | tr "\n" ",") 21 | # export LANGUAGES=de,nl 22 | export DATABASE_NAME=$BUILDID 23 | 24 | ./steps/wikipedia_download.sh 25 | ./steps/wikidata_download.sh 26 | ./steps/wikidata_api_fetch_placetypes.sh 27 | 28 | ./steps/wikipedia_sql2csv.sh 29 | ./steps/wikidata_sql2csv.sh 30 | 31 | # dropdb --if-exists $DATABASE_NAME 32 | createdb $DATABASE_NAME 33 | ./steps/wikipedia_import.sh 34 | ./steps/wikidata_import.sh 35 | 36 | ./steps/wikipedia_process.sh 37 | ./steps/wikidata_process.sh 38 | 39 | ./steps/report_database_size.sh 40 | ./steps/output.sh 41 | # ./steps/cleanup.sh 42 | 43 | echo "Finished."
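The snippet below is a minimal usage sketch, not a file from this repository: it shows how the exported variables above can drive a reduced run that only processes the wikipedia side for two small languages, the same subset of steps the CI workflow below exercises. The dump date 20240601, the language list de,nl and the build id are illustrative assumptions, not values taken from the project.

    # hypothetical reduced run (assumed date, languages and build id)
    export WIKIPEDIA_DATE=20240601
    export WIKIDATA_DATE=20240601
    export LANGUAGES=de,nl
    export BUILDID=wikipedia_only_test
    export DATABASE_NAME=$BUILDID

    ./steps/wikipedia_download.sh
    ./steps/wikipedia_sql2csv.sh
    createdb $DATABASE_NAME
    ./steps/wikipedia_import.sh
    ./steps/wikipedia_process.sh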
-------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: "Continuous Integration" 2 | 3 | on: [ push, pull_request ] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Install PostgreSQL 11 | run: | 12 | sudo apt-get update -qq 13 | sudo apt-get install -y -qq postgresql postgresql-client 14 | sudo systemctl restart postgresql 15 | sudo -u postgres createuser -s runner 16 | - name: Install dependencies 17 | run: ./install_dependencies.sh 18 | - name: Create database 19 | run: createdb wikiprocessingdb 20 | - name: Build for languages Limburgish (li), Bavarian (bar) 21 | run: | 22 | LATEST_DATE=$(./steps/latest_available_data.sh) 23 | export WIKIPEDIA_DATE=$LATEST_DATE 24 | export WIKIDATA_DATE=$LATEST_DATE 25 | 26 | ./steps/wikipedia_download.sh 27 | ./steps/wikipedia_sql2csv.sh 28 | ./steps/wikipedia_import.sh 29 | ./steps/wikipedia_process.sh 30 | 31 | grep county config/wikidata_place_types.txt > new.txt 32 | mv new.txt config/wikidata_place_types.txt 33 | ./steps/wikidata_api_fetch_placetypes.sh 34 | env: 35 | BUILDID: ci_test_build 36 | LANGUAGES: li,bar 37 | 38 | - name: Test output 39 | run: tests/run.sh 40 | -------------------------------------------------------------------------------- /bin/filter_wikidata_wb_items_per_site.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # MySQL schema inside the sql.gz file: 6 | # 7 | # CREATE TABLE `wb_items_per_site` ( 8 | # `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 9 | # `ips_item_id` int(10) unsigned NOT NULL, 10 | # `ips_site_id` varbinary(32) NOT NULL, 11 | # `ips_site_page` varbinary(310) NOT NULL, 12 | 13 | Output to STDOUT: item_id, site_id, site_page (title) 14 | ''' 15 | 16 | import os 17 | import sys 18 | import csv 19 | 20 | # Add the parent directory to sys.path 21 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 22 | sys.path.append(parent_dir) 23 | 24 | from lib.languages import Languages; 25 | 26 | languages_set = set(Languages.get_languages()) 27 | # print(languages_set, file=sys.stderr) 28 | 29 | 30 | reader = csv.reader(sys.stdin) 31 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 32 | 33 | for row in reader: 34 | # ips_site_page is the title 35 | title = row[3].replace('\r', '') 36 | if len(title) == 0: 37 | continue 38 | 39 | # ips_site_id, e.g. 'enwiki' 40 | language = row[2].replace('wiki', '') 41 | if language not in languages_set: 42 | continue 43 | 44 | writer.writerow([row[1], row[2], title]) 45 | -------------------------------------------------------------------------------- /steps/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | 7 | # Languages as comma-separated string, e.g. 
'en,fr,de' 8 | : ${LANGUAGES:=bar,cy} 9 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 10 | 11 | psqlcmd() { 12 | psql --quiet $DATABASE_NAME 13 | } 14 | 15 | 16 | 17 | echo "=====================================================================" 18 | echo "Dropping intermediate wikipedia tables to conserve space" 19 | echo "=====================================================================" 20 | 21 | for LANG in "${LANGUAGES_ARRAY[@]}" 22 | do 23 | echo "DROP TABLE ${LANG}pagelinks;" | psqlcmd 24 | echo "DROP TABLE ${LANG}page;" | psqlcmd 25 | echo "DROP TABLE ${LANG}langlinks;" | psqlcmd 26 | echo "DROP TABLE ${LANG}redirect;" | psqlcmd 27 | echo "DROP TABLE ${LANG}pagelinkcount;" | psqlcmd 28 | done 29 | 30 | 31 | echo "=====================================================================" 32 | echo "Dropping intermediate wikidata tables" 33 | echo "=====================================================================" 34 | 35 | echo "DROP TABLE wikidata_place_dump;" | psqlcmd 36 | echo "DROP TABLE geo_earth_primary;" | psqlcmd 37 | for LANG in "${LANGUAGES_ARRAY[@]}" 38 | do 39 | echo "DROP TABLE wikidata_${LANG}_pages;" | psqlcmd 40 | done 41 | -------------------------------------------------------------------------------- /bin/filter_pagelinks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `pagelinks` ( 6 | # `pl_from` int(8) unsigned NOT NULL DEFAULT 0, 7 | # `pl_namespace` int(11) NOT NULL DEFAULT 0, 8 | # `pl_target_id` bigint(20) unsigned NOT NULL, 9 | 10 | Output to STDOUT: pl_title, count 11 | ''' 12 | 13 | import sys 14 | import csv 15 | import gzip 16 | 17 | if len(sys.argv) < 2: 18 | print("Usage: filter_pagelinks.py linktarget.csv.gz") 19 | exit(1) 20 | 21 | linktarget_filename = sys.argv[1] 22 | linktarget_id_to_title = dict() 23 | 24 | with gzip.open(linktarget_filename, 'rt') as gzfile: 25 | reader = csv.reader(gzfile) 26 | for row in reader: 27 | linktarget_id_to_title[row[0]] = row[1] 28 | 29 | reader = csv.reader(sys.stdin) 30 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 31 | 32 | counts = {} 33 | for row in reader: 34 | # pl_namespace: 0 are articles 35 | if (row[1] != '0'): 36 | continue 37 | 38 | title = linktarget_id_to_title.get(row[2]) 39 | if title is None: 40 | continue 41 | 42 | if title not in counts: 43 | counts[title] = 1 44 | else: 45 | counts[title] += 1 46 | 47 | # for title in sorted(counts.keys()): 48 | for title in counts.keys(): 49 | writer.writerow([title, counts[title]]) 50 | -------------------------------------------------------------------------------- /bin/filter_page.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `page` ( 6 | # `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 7 | # `page_namespace` int(11) NOT NULL DEFAULT 0, 8 | # `page_title` varbinary(255) NOT NULL DEFAULT '', 9 | # `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0, 10 | # `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0, 11 | # `page_random` double unsigned NOT NULL DEFAULT 0, 12 | # `page_touched` varbinary(14) NOT NULL DEFAULT '', 13 | # `page_links_updated` varbinary(14) DEFAULT NULL, 14 | # `page_latest` int(8) unsigned NOT NULL DEFAULT 0, 15 | # `page_len` int(8) unsigned NOT NULL DEFAULT 0, 16 | # `page_content_model` varbinary(32) DEFAULT NULL, 17 | # `page_lang` varbinary(35) 
DEFAULT NULL, 18 | 19 | Output to STDOUT: page_id, page_title 20 | ''' 21 | 22 | import sys 23 | import csv 24 | 25 | reader = csv.reader(sys.stdin) 26 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 27 | 28 | for row in reader: 29 | # 0 are articles 30 | if (row[1] != '0'): 31 | continue 32 | 33 | title = row[2].replace('\r', '') 34 | if len(title) == 0: 35 | continue 36 | 37 | writer.writerow([row[0], title]) 38 | -------------------------------------------------------------------------------- /bin/filter_wikidata_page.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # MySQL schema inside the sql.gz file: 6 | # 7 | # CREATE TABLE `page` ( 8 | # `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 9 | # `page_namespace` int(11) NOT NULL, 10 | # `page_title` varbinary(255) NOT NULL, 11 | # `page_restrictions` tinyblob DEFAULT NULL, 12 | # `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0, 13 | # `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0, 14 | # `page_random` double unsigned NOT NULL, 15 | # `page_touched` binary(14) NOT NULL, 16 | # `page_links_updated` varbinary(14) DEFAULT NULL, 17 | # `page_latest` int(10) unsigned NOT NULL, 18 | # `page_len` int(10) unsigned NOT NULL, 19 | # `page_content_model` varbinary(32) DEFAULT NULL, 20 | # `page_lang` varbinary(35) DEFAULT NULL, 21 | 22 | # page_lang isn't interesting, 'NULL' 99.999% of the time 23 | 24 | Output to STDOUT: page_id, page_title 25 | ''' 26 | 27 | import sys 28 | import csv 29 | 30 | reader = csv.reader(sys.stdin) 31 | 32 | for row in reader: 33 | # page_namespace: 0 are articles (99% of the input lines) 34 | if (row[1] != '0'): 35 | continue 36 | 37 | # page_title are actually ids. Some are special pages, not articles 38 | if (row[2][0] != 'Q'): 39 | continue 40 | 41 | print(row[0] + ',' + row[2]) 42 | -------------------------------------------------------------------------------- /bin/filter_wikidata_geo_tags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # MySQL schema inside the sql.gz file: 6 | # 7 | # CREATE TABLE `geo_tags` ( 8 | # `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 9 | # `gt_page_id` int(10) unsigned NOT NULL, 10 | # `gt_globe` varbinary(32) NOT NULL, 11 | # `gt_primary` tinyint(1) NOT NULL, 12 | # `gt_lat` decimal(11,8) DEFAULT NULL, 13 | # `gt_lon` decimal(11,8) DEFAULT NULL, 14 | # `gt_dim` int(11) DEFAULT NULL, 15 | # `gt_type` varbinary(32) DEFAULT NULL, 16 | # `gt_name` varbinary(255) DEFAULT NULL, 17 | # `gt_country` binary(2) DEFAULT NULL, 18 | # `gt_region` varbinary(3) DEFAULT NULL, 19 | 20 | Output to STDOUT: gt_page_id, gt_lat, gt_lon 21 | ''' 22 | 23 | import sys 24 | import csv 25 | 26 | reader = csv.reader(sys.stdin) 27 | 28 | for row in reader: 29 | # gt_globe: There are places e.g. 
on the moon with coordinates 30 | if (row[2] != 'earth'): 31 | continue 32 | 33 | # gt_primary 34 | if (row[3] != '1'): 35 | continue 36 | 37 | lat = float(row[4]) 38 | lon = float(row[5]) 39 | 40 | if (lat == 0 and lon == 0): 41 | # print('skipping 0,0', file=sys.stderr) 42 | continue 43 | 44 | if (lat < -90 or lat > 90 or lon < -180 or lon > 180): 45 | # print('skipping out of bounds', file=sys.stderr) 46 | # print(lat, file=sys.stderr) 47 | # print(lon, file=sys.stderr) 48 | continue 49 | 50 | lat = round(lat, 5) 51 | lon = round(lon, 5) 52 | 53 | print(row[1] + ',' + str(lat) + ',' + str(lon)) 54 | -------------------------------------------------------------------------------- /steps/wikidata_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "=====================================================================" 4 | echo "Download wikidata dump tables" 5 | echo "=====================================================================" 6 | 7 | # set defaults 8 | : ${BUILDID:=latest} 9 | # List of mirrors https://dumps.wikimedia.org/mirrors.html 10 | # Download using main dumps.wikimedia.org: 60 minutes, mirror: 20 minutes 11 | : ${WIKIMEDIA_HOST:=wikidata.aerotechnet.com} 12 | # See list on https://wikidata.aerotechnet.com/wikidatawiki/ 13 | : ${WIKIDATA_DATE:=20220701} 14 | 15 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 16 | mkdir -p $DOWNLOADED_PATH 17 | 18 | download() { 19 | echo "Downloading $1 > $2" 20 | if [ -e "$2" ]; then 21 | echo "file $2 already exists, skipping" 22 | return 23 | fi 24 | header='--header=User-Agent:Osm-search-Bot/1(https://github.com/osm-search/wikipedia-wikidata)' 25 | wget -O "$2" --quiet $header --no-clobber --tries=3 "$1" 26 | if [ ! -s "$2" ]; then 27 | echo "downloaded file $2 is empty, please retry later" 28 | rm -f "$2" 29 | exit 1 30 | fi 31 | } 32 | 33 | for FN in geo_tags.sql.gz page.sql.gz wb_items_per_site.sql.gz; do 34 | 35 | # https://wikidata.aerotechnet.com/wikidatawiki/20250501/wikidatawiki-20250501-geo_tags.sql.gz 36 | # https://wikidata.aerotechnet.com/wikidatawiki/20250501/md5sums-wikidatawiki-20250501-geo_tags.sql.gz.txt 37 | download https://$WIKIMEDIA_HOST/wikidatawiki/$WIKIDATA_DATE/wikidatawiki-$WIKIDATA_DATE-$FN "$DOWNLOADED_PATH/$FN" 38 | download https://$WIKIMEDIA_HOST/wikidatawiki/$WIKIDATA_DATE/md5sums-wikidatawiki-$WIKIDATA_DATE-$FN.txt "$DOWNLOADED_PATH/$FN.md5" 39 | 40 | EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$FN.md5" | cut -d\ -f1) 41 | CALCULATED_MD5=$(md5sum "$DOWNLOADED_PATH/$FN" | cut -d\ -f1) 42 | 43 | if [[ "$EXPECTED_MD5" != "$CALCULATED_MD5" ]]; then 44 | echo "$FN - md5 checksum doesn't match, download broken" 45 | exit 1 46 | fi 47 | 48 | done 49 | du -h $DOWNLOADED_PATH/* 50 | 51 | # 114M downloaded/geo_tags.sql.gz 52 | # 1.7G downloaded/page.sql.gz 53 | # 1.2G downloaded/wb_items_per_site.sql.gz 54 | -------------------------------------------------------------------------------- /steps/wikipedia_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "=====================================================================" 4 | echo "Download individual wikipedia language tables dumps" 5 | echo "=====================================================================" 6 | 7 | # set defaults 8 | : ${BUILDID:=latest} 9 | # Languages as comma-separated string, e.g. 
'en,fr,de' 10 | : ${LANGUAGES:=bar,cy} 11 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 12 | # List of mirrors https://dumps.wikimedia.org/mirrors.html 13 | # Download using main dumps.wikimedia.org: 150 minutes, mirror: 40 minutes 14 | : ${WIKIMEDIA_HOST:=wikidata.aerotechnet.com} 15 | # See list on https://wikidata.aerotechnet.com/enwiki/ 16 | : ${WIKIPEDIA_DATE:=20220620} 17 | 18 | DOWNLOADED_PATH="$BUILDID/downloaded/wikipedia" 19 | 20 | download() { 21 | echo "Downloading $1 > $2" 22 | if [ -e "$2" ]; then 23 | echo "file $2 already exists, skipping" 24 | return 25 | fi 26 | header='--header=User-Agent:Osm-search-Bot/1(https://github.com/osm-search/wikipedia-wikidata)' 27 | wget -O "$2" --quiet $header --no-clobber --tries=3 "$1" 28 | if [ ! -s "$2" ]; then 29 | echo "downloaded file $2 is empty, please retry later" 30 | rm -f "$2" 31 | exit 1 32 | fi 33 | du -h "$2" | cut -f1 34 | } 35 | 36 | for LANG in "${LANGUAGES_ARRAY[@]}"; do 37 | echo "Language: $LANG" 38 | 39 | mkdir -p "$DOWNLOADED_PATH/$LANG" 40 | 41 | # English is the largest 42 | # 2.1G downloaded/en/page.sql.gz 43 | # 6.4G downloaded/en/pagelinks.sql.gz 44 | # 492M downloaded/en/langlinks.sql.gz 45 | # 992M downloaded/en/linktarget.sql.gz 46 | # 160M downloaded/en/redirect.sql.gz 47 | 48 | # Smaller language Turkish 49 | # 90M downloaded/tr/page.sql.gz 50 | # 255M downloaded/tr/pagelinks.sql.gz 51 | # 166M downloaded/tr/langlinks.sql.gz 52 | # 62M downloaded/tr/linktarget.sql.gz 53 | # 4.2M downloaded/tr/redirect.sql.gz 54 | 55 | for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz linktarget.sql.gz redirect.sql.gz; do 56 | 57 | download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/${LANG}wiki-$WIKIPEDIA_DATE-$FN "$DOWNLOADED_PATH/$LANG/$FN" 58 | download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/md5sums-${LANG}wiki-$WIKIPEDIA_DATE-$FN.txt "$DOWNLOADED_PATH/$LANG/$FN.md5" 59 | 60 | EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$LANG/$FN.md5" | cut -d\ -f1) 61 | CALCULATED_MD5=$(md5sum "$DOWNLOADED_PATH/$LANG/$FN" | cut -d\ -f1) 62 | 63 | if [[ "$EXPECTED_MD5" != "$CALCULATED_MD5" ]]; then 64 | echo "$FN for language $LANG - md5 checksum doesn't match, download broken" 65 | exit 1 66 | fi 67 | done 68 | done 69 | -------------------------------------------------------------------------------- /steps/wikipedia_import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | : ${LANGUAGES:=bar,cy} 7 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 8 | 9 | CONVERTED_PATH="$BUILDID/converted/wikipedia" 10 | # postgresql's COPY requires full path 11 | CONVERTED_PATH_ABS=$(realpath "$CONVERTED_PATH") 12 | 13 | psqlcmd() { 14 | psql --quiet $DATABASE_NAME |& \ 15 | grep -v 'does not exist, skipping' 16 | } 17 | 18 | echo "=====================================================================" 19 | echo "Import wikipedia CSV tables" 20 | echo "=====================================================================" 21 | 22 | for LANG in "${LANGUAGES_ARRAY[@]}" 23 | do 24 | echo "$LANG" 25 | 26 | # ----------------------------------------------------------- 27 | echo "* ${LANG}page from $CONVERTED_PATH_ABS/$LANG/pages.csv.gz"; 28 | 29 | echo "DROP TABLE IF EXISTS ${LANG}page;" | psqlcmd 30 | echo "CREATE TABLE ${LANG}page ( 31 | page_id integer, 32 | page_title text 33 | );" | psqlcmd 34 | 35 | 36 | echo "COPY ${LANG}page (page_id, page_title) 37 | FROM PROGRAM 'unpigz -c 
$CONVERTED_PATH_ABS/$LANG/pages.csv.gz' 38 | CSV 39 | ;" | psqlcmd 40 | 41 | 42 | 43 | # ----------------------------------------------------------- 44 | echo "* ${LANG}pagelinks from $CONVERTED_PATH_ABS/$LANG/pagelinks.csv.gz"; 45 | 46 | echo "DROP TABLE IF EXISTS ${LANG}pagelinks;" | psqlcmd 47 | echo "CREATE TABLE ${LANG}pagelinks ( 48 | pl_title text, 49 | langcount integer, 50 | othercount integer DEFAULT 0 51 | );" | psqlcmd 52 | 53 | echo "COPY ${LANG}pagelinks (pl_title, langcount) 54 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/$LANG/pagelinks.csv.gz' 55 | CSV 56 | ;" | psqlcmd 57 | 58 | 59 | # ----------------------------------------------------------- 60 | echo "* ${LANG}langlinks from $CONVERTED_PATH_ABS/$LANG/langlinks.csv.gz"; 61 | 62 | echo "DROP TABLE IF EXISTS ${LANG}langlinks;" | psqlcmd 63 | echo "CREATE TABLE ${LANG}langlinks ( 64 | ll_from integer, 65 | ll_lang text, 66 | ll_title text 67 | );" | psqlcmd 68 | 69 | echo "COPY ${LANG}langlinks (ll_title, ll_from, ll_lang) 70 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/$LANG/langlinks.csv.gz' 71 | CSV 72 | ;" | psqlcmd 73 | 74 | 75 | # ----------------------------------------------------------- 76 | echo "* ${LANG}redirect from $CONVERTED_PATH_ABS/$LANG/redirects.csv.gz"; 77 | 78 | echo "DROP TABLE IF EXISTS ${LANG}redirect;" | psqlcmd 79 | echo "CREATE TABLE ${LANG}redirect ( 80 | rd_from integer, 81 | rd_title text 82 | );" | psqlcmd 83 | 84 | echo "COPY ${LANG}redirect (rd_from, rd_title) 85 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/$LANG/redirect.csv.gz' 86 | CSV 87 | ;" | psqlcmd 88 | 89 | done -------------------------------------------------------------------------------- /config/wikidata_place_type_levels.csv: -------------------------------------------------------------------------------- 1 | place_type,level 2 | Q9842,4 3 | Q9430,3 4 | Q928830,4 5 | Q9259,1 6 | Q91028,5 7 | Q8514,2 8 | Q8502,2 9 | Q83405,3 10 | Q82794,2 11 | Q820477,1 12 | Q811979,1 13 | Q8072,2 14 | Q79007,2 15 | Q786014,3 16 | Q75848,2 17 | Q75520,2 18 | Q728937,4 19 | Q7275,2 20 | Q719456,3 21 | Q7075,3 22 | Q697295,4 23 | Q6852233,2 24 | Q682943,3 25 | Q665487,5 26 | Q655686,3 27 | Q643589,5 28 | Q641226,2 29 | Q631305,2 30 | Q6256,2 31 | Q6023295,2 32 | Q5773747,5 33 | Q56061,1 34 | Q55659167,4 35 | Q55488,4 36 | Q55465477,3 37 | Q54050,2 38 | Q532,3 39 | Q53060,2 40 | Q52177058,4 41 | Q515716,5 42 | Q5153984,4 43 | Q515,3 44 | Q5144960,5 45 | Q5119,4 46 | Q5119,4 47 | Q5107,2 48 | Q5084,4 49 | Q5031071,4 50 | Q5003624,2 51 | Q4989906,1 52 | Q4976993,3 53 | Q486972,1 54 | Q486972,2 55 | Q483110,3 56 | Q4830453,4 57 | Q47521,3 58 | Q473972,1 59 | Q46831,2 60 | Q46614560,5 61 | Q44782,3 62 | Q44613,4 63 | Q44539,4 64 | Q44494,2 65 | Q44377,2 66 | Q4421,2 67 | Q43501,2 68 | Q4286337,3 69 | Q42523,3 70 | Q41176,2 71 | Q40357,3 72 | Q4022,4 73 | Q40080,2 74 | Q39816,2 75 | Q39715,3 76 | Q39614,1 77 | Q3957,3 78 | Q3947,4 79 | Q3914,3 80 | Q38723,2 81 | Q38720,3 82 | Q3623867,5 83 | Q35666,2 84 | Q355304,3 85 | Q35509,2 86 | Q35112127,3 87 | Q34985575,4 88 | Q34876,5 89 | Q34763,2 90 | Q34627,4 91 | Q3455524,3 92 | Q34442,4 93 | Q33837,2 94 | Q33506,3 95 | Q32815,4 96 | Q3257686,2 97 | Q3240715,2 98 | Q3191695,5 99 | Q3153117,2 100 | Q30198,2 101 | Q30139652,3 102 | Q294422,3 103 | Q2870166,3 104 | Q27686,3 105 | Q274153,3 106 | Q271669,1 107 | Q2659904,2 108 | Q24529780,2 109 | Q24354,3 110 | Q2354973,4 111 | Q23442,2 112 | Q23413,3 113 | Q23397,3 114 | Q2327515,4 115 | Q2311958,5 116 | Q22927291,6 117 | Q22698,1 118 | Q2175765,4 119 | 
Q205495,4 120 | Q204832,3 121 | Q2042028,2 122 | Q202216,6 123 | Q1970725,3 124 | Q194203,5 125 | Q194195,2 126 | Q190429,2 127 | Q185187,3 128 | Q185113,2 129 | Q183366,2 130 | Q1799794,1 131 | Q1788454,4 132 | Q1785071,3 133 | Q1777138,3 134 | Q177634,2 135 | Q177380,2 136 | Q174814,4 137 | Q174782,2 138 | Q17350442,2 139 | Q17343829,3 140 | Q17334923,0 141 | Q17018380,3 142 | Q16970,4 143 | Q16917,3 144 | Q16831714,4 145 | Q165,3 146 | Q160742,4 147 | Q159719,3 148 | Q159334,4 149 | Q15640612,5 150 | Q15324,2 151 | Q15284,5 152 | Q15243209,6 153 | Q152081,1 154 | Q15195406,4 155 | Q1500350,5 156 | Q149621,5 157 | Q14757767,4 158 | Q14350,3 159 | Q1410668,3 160 | Q1394476,3 161 | Q1377575,2 162 | Q1353183,3 163 | Q134447,4 164 | Q133215,3 165 | Q133056,2 166 | Q13221722,3 167 | Q13220204,2 168 | Q1311958,4 169 | Q1303167,3 170 | Q130003,3 171 | Q12518,2 172 | Q12516,3 173 | Q1248784,3 174 | Q123705,3 175 | Q12323,3 176 | Q12284,4 177 | Q12280,4 178 | Q121359,2 179 | Q1210950,2 180 | Q11755880,3 181 | Q11707,3 182 | Q11315,3 183 | Q11303,3 184 | Q1115575,4 185 | Q1107656,1 186 | Q10864048,1 187 | Q1076486,2 188 | Q105731,3 189 | Q105190,3 190 | Q1048525,3 191 | Q102496,5 192 | Q28872924,1 193 | Q15617994,1 194 | Q159313,2 195 | Q24398318,3 196 | Q327333,2 197 | Q43229,1 198 | Q860861,1 199 | Q4989906,1 200 | -------------------------------------------------------------------------------- /steps/wikidata_import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | 7 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 8 | CONVERTED_PATH="$BUILDID/converted/wikidata" 9 | # postgresql's COPY requires full path 10 | DOWNLOADED_PATH_ABS=$(realpath "$DOWNLOADED_PATH") 11 | CONVERTED_PATH_ABS=$(realpath "$CONVERTED_PATH") 12 | 13 | psqlcmd() { 14 | psql --quiet $DATABASE_NAME |& \ 15 | grep -v 'does not exist, skipping' 16 | } 17 | 18 | 19 | echo "=====================================================================" 20 | echo "Import wikidata tables" 21 | echo "=====================================================================" 22 | 23 | 24 | # ----------------------------------------------------------- 25 | echo "Importing geotags from $CONVERTED_PATH_ABS/geo_tags.csv.gz"; 26 | 27 | echo "DROP TABLE IF EXISTS geo_tags;" | psqlcmd 28 | echo "CREATE TABLE geo_tags ( 29 | gt_page_id bigint, 30 | gt_lat numeric(11,8), 31 | gt_lon numeric(11,8) 32 | );" | psqlcmd 33 | 34 | 35 | echo "COPY geo_tags (gt_page_id, gt_lat, gt_lon) 36 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/geo_tags.csv.gz' 37 | CSV 38 | ;" | psqlcmd 39 | 40 | 41 | 42 | # ----------------------------------------------------------- 43 | echo "Importing page from $CONVERTED_PATH_ABS/page.csv.gz"; 44 | 45 | echo "DROP TABLE IF EXISTS page;" | psqlcmd 46 | echo "CREATE TABLE page ( 47 | page_id bigint, 48 | page_title text 49 | );" | psqlcmd 50 | 51 | 52 | echo "COPY page (page_id, page_title) 53 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/page.csv.gz' 54 | CSV 55 | ;" | psqlcmd 56 | 57 | 58 | 59 | # ----------------------------------------------------------- 60 | echo "Importing wb_items_per_site from $CONVERTED_PATH_ABS/wb_items_per_site.csv.gz"; 61 | 62 | echo "DROP TABLE IF EXISTS wb_items_per_site;" | psqlcmd 63 | echo "CREATE TABLE wb_items_per_site ( 64 | ips_item_id integer, 65 | ips_site_id text, 66 | ips_site_page text 67 | );" | psqlcmd 68 | 69 | echo "COPY wb_items_per_site 
(ips_item_id, ips_site_id, ips_site_page) 70 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/wb_items_per_site.csv.gz' 71 | CSV 72 | ;" | psqlcmd 73 | 74 | 75 | 76 | # ----------------------------------------------------------- 77 | echo "Importing wikidata_place_dump from $DOWNLOADED_PATH_ABS/wikidata_place_dump.csv.gz"; 78 | 79 | echo "DROP TABLE IF EXISTS wikidata_place_dump;" | psqlcmd 80 | echo "CREATE TABLE wikidata_place_dump ( 81 | item text, 82 | instance_of text 83 | );" | psqlcmd 84 | 85 | echo "COPY wikidata_place_dump (item, instance_of) 86 | FROM PROGRAM 'unpigz -c $DOWNLOADED_PATH_ABS/wikidata_place_dump.csv.gz' 87 | CSV 88 | ;" | psqlcmd 89 | 90 | 91 | 92 | # ----------------------------------------------------------- 93 | echo "Importing wikidata_place_type_levels from $DOWNLOADED_PATH_ABS/wikidata_place_type_levels.csv"; 94 | 95 | echo "DROP TABLE IF EXISTS wikidata_place_type_levels;" | psqlcmd 96 | echo "CREATE TABLE wikidata_place_type_levels ( 97 | place_type text, 98 | level integer 99 | );" | psqlcmd 100 | 101 | echo "COPY wikidata_place_type_levels (place_type, level) 102 | FROM '$DOWNLOADED_PATH_ABS/wikidata_place_type_levels.csv' 103 | CSV 104 | HEADER 105 | ;" | psqlcmd 106 | 107 | -------------------------------------------------------------------------------- /steps/wikipedia_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | : ${LANGUAGES:=bar,cy} 7 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 8 | 9 | 10 | psqlcmd() { 11 | psql --quiet $DATABASE_NAME |& \ 12 | grep -v 'does not exist, skipping' 13 | } 14 | 15 | 16 | echo "=====================================================================" 17 | echo "Create and fill wikipedia_redirect_full" 18 | echo "=====================================================================" 19 | echo "DROP TABLE IF EXISTS wikipedia_redirect_full;" | psqlcmd 20 | echo "CREATE TABLE wikipedia_redirect_full ( 21 | language text, 22 | from_title text, 23 | to_title text 24 | );" | psqlcmd 25 | 26 | for LANG in "${LANGUAGES_ARRAY[@]}" 27 | do 28 | echo "INSERT INTO wikipedia_redirect_full 29 | SELECT '${LANG}', 30 | page_title, 31 | rd_title 32 | FROM ${LANG}redirect 33 | JOIN ${LANG}page ON (rd_from = page_id) 34 | ;" | psqlcmd 35 | done 36 | 37 | 38 | 39 | 40 | 41 | echo "=====================================================================" 42 | echo "Process language tables and associated pagelink counts" 43 | echo "=====================================================================" 44 | 45 | echo "set othercounts" 46 | # Creating indexes on title, ll_title didn't have any positive effect on 47 | # query performance and added another 1 hour and 35GB of data. 
48 | # echo "CREATE INDEX idx_${LANG}langlinks ON ${LANG}langlinks (ll_lang, ll_title);" | psqlcmd 49 | # echo "CREATE INDEX idx_${LANG}langlinks2 ON ${LANG}langlinks (ll_title);" | psqlcmd 50 | # echo "CREATE INDEX idx_${LANG}page ON ${LANG}page (page_id);" | psqlcmd 51 | # echo "CREATE INDEX idx_${LANG}page2 ON ${LANG}page (page_title);" | psqlcmd 52 | for LANG in "${LANGUAGES_ARRAY[@]}" 53 | do 54 | echo "Language: $LANG" 55 | 56 | for OTHERLANG in "${LANGUAGES_ARRAY[@]}" 57 | do 58 | echo "UPDATE ${LANG}pagelinks 59 | SET othercount = othercount + x.count 60 | FROM ( 61 | SELECT ${LANG}page.page_title AS title, 62 | ${OTHERLANG}pagelinks.langcount AS count 63 | FROM ${LANG}langlinks 64 | JOIN ${LANG}page ON (ll_from = page_id) 65 | JOIN ${OTHERLANG}pagelinks ON (ll_lang = '${OTHERLANG}' AND ll_title = pl_title) 66 | ) AS x 67 | WHERE x.title = ${LANG}pagelinks.pl_title 68 | ;" | psqlcmd 69 | done 70 | 71 | done 72 | 73 | 74 | 75 | echo "=====================================================================" 76 | echo "Create and fill wikipedia_article_full" 77 | echo "=====================================================================" 78 | 79 | echo "DROP TABLE IF EXISTS wikipedia_article_full;" | psqlcmd 80 | echo "CREATE TABLE wikipedia_article_full ( 81 | language text NOT NULL, 82 | title text NOT NULL, 83 | langcount integer, 84 | othercount integer, 85 | totalcount integer, 86 | lat double precision, 87 | lon double precision, 88 | importance double precision, 89 | title_en text, 90 | wd_page_title text, 91 | instance_of text 92 | );" | psqlcmd 93 | 94 | for LANG in "${LANGUAGES_ARRAY[@]}" 95 | do 96 | echo "INSERT INTO wikipedia_article_full 97 | SELECT '${LANG}', 98 | pl_title, 99 | langcount, 100 | othercount, 101 | langcount + othercount 102 | FROM ${LANG}pagelinks 103 | ;" | psqlcmd 104 | done 105 | 106 | 107 | echo "done" 108 | 109 | 110 | -------------------------------------------------------------------------------- /steps/latest_available_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Prints the date (YYYYMMDD) of the latest available dump on 5 | # https://wikidata.aerotechnet.com/enwiki/ 6 | # We also run some additional checks that the dumps are complete 7 | # 8 | 9 | debug() { 10 | # Comment out the following line to print debug information 11 | # echo "$@" 1>&2 12 | echo -n '' 13 | } 14 | 15 | DATE='' 16 | 17 | # Sets $DATE to the first of the month (YYYYMMDD). If given a parameter, 18 | # it subtracts that number of months 19 | set_date_to_first_of_month() { 20 | MINUS_NUM_MONTHS=${1:-0} 21 | 22 | if [[ "$(uname)" == "Darwin" ]]; then 23 | DATE=$(date -v -${MINUS_NUM_MONTHS}m +%Y%m01) 24 | else 25 | DATE=$(date --date="-$MINUS_NUM_MONTHS month" +%Y%m01) 26 | fi 27 | } 28 | 29 | check_all_files_ready() { 30 | CHECK_DATE=$1 31 | debug "check_all_files_ready for $CHECK_DATE" 32 | 33 | # A complete wikidata dump, for example, can take several weeks (the metahistory7zdump 34 | # file is only ready after 15 days). 35 | # 36 | # The dumpruninfo.json files have this format: 37 | # { 38 | # "jobs": { 39 | # "imagetable": { 40 | # "status": "done", 41 | # "updated": "2023-02-01 08:27:30" 42 | # }, 43 | # "imagelinkstable": { 44 | # "status": "done", 45 | # "updated": "2023-02-01 09:18:03" 46 | # }, 47 | # "geotagstable": { 48 | # "status": "done", 49 | # "updated": "2023-02-01 10:01:50" 50 | # }, 51 | # [...] 52 | # 53 | 54 | ANY_FILE_MISSING=0 55 | 56 | ## 57 | ## 1.
Chinese (ZH) Wikipedia 58 | ## usually the last to be dumped 59 | ## 60 | # from wikipedia_download.sh 61 | WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks linktarget redirect" 62 | DUMP_RUN_INFO_URL="https://wikidata.aerotechnet.com/zhwiki/$CHECK_DATE/dumpruninfo.json" 63 | debug $DUMP_RUN_INFO_URL 64 | DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL") 65 | 66 | if [[ $? != 0 ]]; then 67 | debug "fetching from URL $DUMP_RUN_INFO_URL failed" 68 | return 1 69 | fi 70 | 71 | for FN in $WIKIPEDIA_REQUIRED_FILES; do 72 | TABLENAME=${FN//_/}table # redirect => redirecttable 73 | debug "checking status for table $TABLENAME" 74 | 75 | STATUS=$(echo "$DUMP_RUN_INFO" | TABLE=$TABLENAME jq -r '.jobs[env.TABLE].status') 76 | debug " status: $STATUS" 77 | 78 | if [ "$STATUS" != "done" ]; then 79 | debug "$TABLENAME not ready yet" 80 | ANY_FILE_MISSING=1 81 | fi 82 | done 83 | 84 | ## 85 | ## 2. Wikidata 86 | ## 87 | # from wikidata_download.sh 88 | WIKIDATA_REQUIRED_FILES="geo_tags page wb_items_per_site" 89 | 90 | DUMP_RUN_INFO_URL="https://wikidata.aerotechnet.com/wikidatawiki/$CHECK_DATE/dumpruninfo.json" 91 | debug $DUMP_RUN_INFO_URL 92 | DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL") 93 | 94 | if [[ $? != 0 ]]; then 95 | debug "fetching from URL $DUMP_RUN_INFO_URL failed" 96 | return 1 97 | fi 98 | 99 | for FN in $WIKIDATA_REQUIRED_FILES; do 100 | TABLENAME=${FN//_/}table # wb_items_per_site => wbitemspersitetable 101 | debug "checking status for table $TABLENAME" 102 | 103 | STATUS=$(echo "$DUMP_RUN_INFO" | TABLE=$TABLENAME jq -r '.jobs[env.TABLE].status') 104 | debug " status: $STATUS" 105 | 106 | if [ "$STATUS" != "done" ]; then 107 | debug "$TABLENAME not ready yet" 108 | ANY_FILE_MISSING=1 109 | fi 110 | done 111 | 112 | return $ANY_FILE_MISSING 113 | } 114 | 115 | # Find dates in directory names. We need to parse HTML. 116 | # 117 | CONTENT=$(curl -s -S --fail 'https://wikidata.aerotechnet.com/enwiki/') 118 | for DATE in $(echo $CONTENT | grep -oE '20[0-9]{6}' | sort -nr); do 119 | check_all_files_ready $DATE 120 | 121 | if [ $? == 0 ]; then 122 | echo "$DATE" 123 | exit 0 124 | fi 125 | done 126 | 127 | exit 1 128 | -------------------------------------------------------------------------------- /bin/mysqldump_to_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import fileinput 3 | import csv 4 | import sys 5 | import io 6 | 7 | # This prevents prematurely closed pipes from raising 8 | # an exception in Python 9 | from signal import signal, SIGPIPE, SIG_DFL 10 | signal(SIGPIPE, SIG_DFL) 11 | 12 | # allow large content in the dump 13 | csv.field_size_limit(sys.maxsize) 14 | 15 | def is_insert(line): 16 | """ 17 | Returns true if the line begins a SQL insert statement. 18 | """ 19 | return line.startswith('INSERT INTO') or False 20 | 21 | 22 | def get_values(line): 23 | """ 24 | Returns the portion of an INSERT statement containing values 25 | """ 26 | return line.partition('` VALUES ')[2] 27 | 28 | 29 | def values_sanity_check(values): 30 | """ 31 | Ensures that values from the INSERT statement meet basic checks. 
32 | """ 33 | assert values 34 | assert values[0] == '(' 35 | # Assertions have not been raised 36 | return True 37 | 38 | 39 | def parse_values(values, outfile): 40 | """ 41 | Given a file handle and the raw values from a MySQL INSERT 42 | statement, write the equivalent CSV to the file 43 | """ 44 | latest_row = [] 45 | 46 | reader = csv.reader([values], delimiter=',', 47 | doublequote=False, 48 | escapechar='\\', 49 | quotechar="'", 50 | strict=True 51 | ) 52 | 53 | writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, escapechar='\\') 54 | for reader_row in reader: 55 | for column in reader_row: 56 | # If our current string is empty... 57 | if len(column) == 0 or column == 'NULL': 58 | # latest_row.append(chr(0)) 59 | latest_row.append('') 60 | continue 61 | # If our string starts with an open paren 62 | if column[0] == "(": 63 | # Assume that this column does not begin 64 | # a new row. 65 | new_row = False 66 | # If we've been filling out a row 67 | if len(latest_row) > 0: 68 | # Check if the previous entry ended in 69 | # a close paren. If so, the row we've 70 | # been filling out has been COMPLETED 71 | # as: 72 | # 1) the previous entry ended in a ) 73 | # 2) the current entry starts with a ( 74 | if (latest_row[-1] and latest_row[-1][-1] == ")"): 75 | # Remove the close paren. 76 | latest_row[-1] = latest_row[-1][:-1] 77 | new_row = True 78 | # If we've found a new row, write it out 79 | # and begin our new one 80 | if new_row: 81 | writer.writerow(latest_row) 82 | latest_row = [] 83 | # If we're beginning a new row, eliminate the 84 | # opening parentheses. 85 | if len(latest_row) == 0: 86 | column = column[1:] 87 | # Add our column to the row we're working on. 88 | latest_row.append(column) 89 | # At the end of an INSERT statement, we'll 90 | # have the semicolon. 91 | # Make sure to remove the semicolon and 92 | # the close paren. 93 | if latest_row[-1][-2:] == ");": 94 | latest_row[-1] = latest_row[-1][:-2] 95 | writer.writerow(latest_row) 96 | 97 | 98 | def main(): 99 | """ 100 | Parse arguments and start the program 101 | """ 102 | # Iterate over all lines in all files 103 | # listed in sys.argv[1:] 104 | # or stdin if no args given. 105 | try: 106 | # UPDATE: fileinput starts supporting 'errors' in Python 3.10. Until then 107 | # call io.open() directly. 108 | # for line in fileinput.input(): 109 | with io.open(sys.stdin.fileno(), 'r', encoding="utf-8", errors="ignore") as file: 110 | for line in file: 111 | # Look for an INSERT statement and parse it.
112 | if is_insert(line): 113 | values = get_values(line) 114 | if values_sanity_check(values): 115 | parse_values(values, sys.stdout) 116 | except KeyboardInterrupt: 117 | sys.exit(0) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /steps/output.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | 7 | OUTPUT_PATH="$BUILDID/output" 8 | mkdir -p "$OUTPUT_PATH" 9 | 10 | psqlcmd() { 11 | psql --quiet $DATABASE_NAME |& 12 | grep -v 'does not exist, skipping' 13 | } 14 | 15 | echo "=====================================================================" 16 | echo "Create output" 17 | echo "=====================================================================" 18 | 19 | # "=====================================================================" 20 | echo "Create tables" 21 | # "=====================================================================" 22 | 23 | echo "* wikipedia_article (Fewer rows and columns than wikipedia_article_full)" 24 | # Remove rows that don't have a title. For redirect only row 25 | 26 | echo "DROP TABLE IF EXISTS wikipedia_article;" | psqlcmd 27 | echo "CREATE TABLE wikipedia_article 28 | AS 29 | SELECT language, title, importance, wd_page_title FROM wikipedia_article_full 30 | WHERE wd_page_title IS NOT NULL 31 | AND importance != 0 32 | ;" | psqlcmd 33 | 34 | # 5 minutes 35 | # 9.2m rows 36 | 37 | echo "* wikipedia_redirect (Fewer rows than wikipedia_redirect_full)" 38 | # Remove rows that don't point to titles in wikipedia_article 39 | 40 | echo "DROP TABLE IF EXISTS wikipedia_redirect;" | psqlcmd 41 | echo "CREATE TABLE wikipedia_redirect 42 | AS 43 | SELECT wikipedia_redirect_full.* 44 | FROM wikipedia_redirect_full 45 | RIGHT OUTER JOIN wikipedia_article 46 | ON (wikipedia_redirect_full.language = wikipedia_article.language 47 | AND 48 | wikipedia_redirect_full.to_title = wikipedia_article.title) 49 | ;" | psqlcmd 50 | 51 | # 13m rows 52 | 53 | echo "* wikimedia_importance" 54 | 55 | echo "DROP TABLE IF EXISTS wikimedia_importance;" | psqlcmd 56 | echo "CREATE TABLE wikimedia_importance AS 57 | SELECT language, 'a' as type, title, importance, wd_page_title as wikidata_id 58 | FROM wikipedia_article 59 | ;" | psqlcmd 60 | 61 | # Now add the same from redirects, unless (language + title) already exists in wikimedia_importance 62 | echo "WITH from_redirects AS ( 63 | SELECT r.language, 'r' as type, r.from_title as title, a.importance, a.wd_page_title AS wikidata_id 64 | FROM wikipedia_article a, wikipedia_redirect r 65 | WHERE a.language = r.language AND a.title = r.to_title 66 | ) 67 | INSERT INTO wikimedia_importance 68 | SELECT from_redirects.* FROM from_redirects 69 | LEFT JOIN wikimedia_importance USING (language, title) 70 | WHERE wikimedia_importance IS NULL 71 | ;" | psqlcmd 72 | 73 | # Are all language+title unique? 74 | # WITH duplicates AS ( 75 | # SELECT language, title, count(*) 76 | # FROM wikimedia_importance 77 | # GROUP BY language, title 78 | # HAVING count(*) > 1 79 | # ) 80 | # SELECT count(*) FROM duplicates; 81 | # 0 82 | 83 | # 17m rows 84 | 85 | # "=====================================================================" 86 | echo "Dump table" 87 | # "=====================================================================" 88 | 89 | # Temporary table for sorting the output by most popular language.
Nominatim assigns 90 | # the wikipedia extra tag to the first language it finds during import and English (en) 91 | # makes debugging easier than Arabic (ar). 92 | # Not a temporary table actually because with each psqlcmd call we start a new 93 | # session. 94 | # 95 | # language | size 96 | # ----------+--------- 97 | # en | 3360898 98 | # de | 989366 99 | # fr | 955523 100 | # uk | 920531 101 | # sv | 918185 102 | 103 | echo "DROP TABLE IF EXISTS top_languages;" | psqlcmd 104 | echo "CREATE TABLE top_languages AS 105 | SELECT language, COUNT(*) AS size 106 | FROM wikimedia_importance 107 | GROUP BY language 108 | ORDER BY size DESC 109 | ;" | psqlcmd 110 | 111 | echo "* wikimedia_importance.tsv.gz" 112 | 113 | { 114 | # Prints the CSV header row 115 | # language type title importance wikidata_id 116 | echo "COPY (SELECT * FROM wikimedia_importance LIMIT 0) TO STDOUT WITH DELIMITER E'\t' CSV HEADER" | 117 | psqlcmd 118 | echo "COPY ( 119 | SELECT w.* 120 | FROM wikimedia_importance w 121 | JOIN top_languages tl ON w.language = tl.language 122 | ORDER BY tl.size DESC, w.type, w.title 123 | ) TO STDOUT" | 124 | psqlcmd 125 | } | pigz -9 >"$OUTPUT_PATH/wikimedia_importance.tsv.gz" 126 | 127 | # default is 600 128 | chmod 644 "$OUTPUT_PATH/wikimedia_importance.tsv.gz" 129 | 130 | du -h $OUTPUT_PATH/* 131 | # 265M wikimedia_importance.tsv.gz 132 | -------------------------------------------------------------------------------- /steps/wikidata_sql2csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | # Languages as comma-separated string, e.g. 'en,fr,de' 6 | : ${LANGUAGES:=bar,cy} 7 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 8 | 9 | 10 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 11 | CONVERTED_PATH="$BUILDID/converted/wikidata" 12 | mkdir -p $CONVERTED_PATH 13 | 14 | 15 | ############################################################################### 16 | ## GEO_TAGS 17 | ## 18 | echo "wikidata_sql2csv geo_tags" 19 | 20 | # MySQL schema inside the sql.gz file: 21 | # 22 | # CREATE TABLE `geo_tags` ( 23 | # `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 24 | # `gt_page_id` int(10) unsigned NOT NULL, 25 | # `gt_globe` varbinary(32) NOT NULL, 26 | # `gt_primary` tinyint(1) NOT NULL, 27 | # `gt_lat` decimal(11,8) DEFAULT NULL, 28 | # `gt_lon` decimal(11,8) DEFAULT NULL, 29 | # `gt_dim` int(11) DEFAULT NULL, 30 | # `gt_type` varbinary(32) DEFAULT NULL, 31 | # `gt_name` varbinary(255) DEFAULT NULL, 32 | # `gt_country` binary(2) DEFAULT NULL, 33 | # `gt_region` varbinary(3) DEFAULT NULL, 34 | 35 | # Remove anything globe!=earth, primary!=1 36 | # Round the coordinates 37 | unpigz -c $DOWNLOADED_PATH/geo_tags.sql.gz | \ 38 | ./bin/mysqldump_to_csv.py | \ 39 | bin/filter_wikidata_geo_tags.py | \ 40 | pigz -9 \ 41 | > $CONVERTED_PATH/geo_tags.csv.gz 42 | 43 | # Input 44 | # 134 MB (690 MB uncompressed) 45 | # Output 46 | # 89 MB (240 MB uncompressed) 47 | # 8.4m entries 48 | # columns: gt_page_id, gt_lat, gt_lon 49 | # 4175,43.1924,-81.3158 50 | # 4180,-26.0,121.0 51 | # 4181,43.08333333,2.41666667 52 | # 4187,51.76055556,14.33416667 53 | 54 | 55 | 56 | ############################################################################### 57 | ## PAGE 58 | ## 59 | 60 | echo "wikidata_sql2csv page" 61 | 62 | # MySQL schema inside the sql.gz file: 63 | # 64 | # CREATE TABLE `page` ( 65 | # `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 66 | # `page_namespace` int(11) NOT NULL, 67 | # 
`page_title` varbinary(255) NOT NULL, 68 | # `page_restrictions` tinyblob DEFAULT NULL, 69 | # `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0, 70 | # `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0, 71 | # `page_random` double unsigned NOT NULL, 72 | # `page_touched` binary(14) NOT NULL, 73 | # `page_links_updated` varbinary(14) DEFAULT NULL, 74 | # `page_latest` int(10) unsigned NOT NULL, 75 | # `page_len` int(10) unsigned NOT NULL, 76 | # `page_content_model` varbinary(32) DEFAULT NULL, 77 | # `page_lang` varbinary(35) DEFAULT NULL, 78 | 79 | # We remove all namespace != 0 (0=articles, 99% of the lines) 80 | # page_lang isn't interesting, 'NULL' 99.999% of the time 81 | # Remove all page_title that don't start with 'Q' 82 | 83 | unpigz -c $DOWNLOADED_PATH/page.sql.gz | \ 84 | ./bin/mysqldump_to_csv.py | \ 85 | bin/filter_wikidata_page.py | \ 86 | pigz -9 \ 87 | > $CONVERTED_PATH/page.csv.gz 88 | 89 | # 34min 90 | # Input 91 | # 2.8GB, (3.1GB uncompresseed) 92 | # Output 93 | # 480MB, (1.8GB uncompressed) 94 | # 3m lines 95 | # columns: page_id, page_title 96 | # 97 | # 12991,Q11474 98 | # 12992,Q11475 99 | # 12993,Q11476 100 | # 12995,Q11477 101 | # 12996,Q11478 102 | # 12997,Q11479 103 | 104 | 105 | 106 | 107 | 108 | ############################################################################### 109 | ## WB_ITEMS_PER_SITE 110 | ## 111 | 112 | echo "wikidata_sql2csv wb_items_per_site" 113 | 114 | # MySQL schema inside the sql.gz file: 115 | # 116 | # CREATE TABLE `wb_items_per_site` ( 117 | # `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 118 | # `ips_item_id` int(10) unsigned NOT NULL, 119 | # `ips_site_id` varbinary(32) NOT NULL, 120 | # `ips_site_page` varbinary(310) NOT NULL, 121 | 122 | # Only considering languages we need, cuts down 80m lines to 52m 123 | # LISTLANG=${LANGUAGES_ARRAY[@]} 124 | # ar bg ca cs da de en es 125 | # LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki," 126 | # ,\(ar\|bg\|ca\|cs\|da\|de\|en...\)wiki, 127 | 128 | unpigz -c $DOWNLOADED_PATH/wb_items_per_site.sql.gz | \ 129 | ./bin/mysqldump_to_csv.py | \ 130 | bin/filter_wikidata_wb_items_per_site.py | \ 131 | pigz -9 \ 132 | > $CONVERTED_PATH/wb_items_per_site.csv.gz 133 | 134 | # Input 135 | # 1.4GB compressed, (4.7GB uncompressed) 136 | # Output 137 | # 750MB compressed, (2.2GB uncompressed) 138 | # 52m lines 139 | # columns: item_id, site_id, page (title) 140 | # 576947,cawiki,Bryaninops amplus 141 | # 2739322,cawiki,Bryneich 142 | # 2927288,cawiki,Bréjaude 143 | # 2912549,cawiki,Brúixola Brunton 144 | 145 | 146 | du -h $CONVERTED_PATH/* 147 | # 88M geo_tags.csv.gz 148 | # 480M page.csv.gz 149 | # 744M wb_items_per_site.csv.gz 150 | -------------------------------------------------------------------------------- /steps/wikidata_api_fetch_placetypes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | 6 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 7 | TEMP_PATH=$DOWNLOADED_PATH/tmp 8 | 9 | if [[ -e $DOWNLOADED_PATH/wikidata_place_dump.csv.gz ]]; then 10 | echo "Output file $DOWNLOADED_PATH/wikidata_place_dump.csv.gz already exists. Won't fetch again." 
11 | exit 0 12 | fi 13 | 14 | echo "=====================================================================" 15 | echo "Get wikidata places from wikidata query API" 16 | echo "=====================================================================" 17 | 18 | # We create a mapping of QID->place type QID 19 | # for example 'Q6922586;Q130003' (Mount Olympus Ski Area -> ski resort) 20 | # 21 | # Takes about 30 minutes for 300 place types. 22 | # 23 | # The input wikidata_place_types.txt has the format 24 | # Q1303167;barn 25 | # Q130003;ski resort 26 | # Q12518;tower 27 | # The second column is optional. 28 | # 29 | # We tried to come up with a list of geographic related types but wikidata hierarchy 30 | # is complex. You'd need to know what a Raion is (administrative unit of post-Soviet 31 | # states) or a Bight. Many place types will be too broad, too narrow or even missing. 32 | # It's best effort. 33 | # 34 | # wdtaxonomy (https://github.com/nichtich/wikidata-taxonomy) runs SPARQL queries 35 | # against wikidata servers. Add --sparql to see the query. Example SPARQL query: 36 | # 37 | # SELECT ?item ?broader ?sites WITH { 38 | # SELECT DISTINCT ?item { ?item wdt:P279* wd:Q82794 } 39 | # } AS %items WHERE { 40 | # INCLUDE %items . 41 | # OPTIONAL { ?item wdt:P279 ?broader } . 42 | # { 43 | # SELECT ?item (count(distinct ?site) as ?sites) { 44 | # INCLUDE %items. 45 | # OPTIONAL { ?site schema:about ?item } 46 | # } GROUP BY ?item 47 | # } 48 | # } 49 | # 50 | # The queries can time out (60 second limit). If that's the case we need to further 51 | # subdivide the place type. For example Q486972 (human settlement) has too many 52 | # instances. We run "wdtaxonomy Q486972 | grep '^├─'" which prints a long list 53 | # ├──municipality (Q15284) •106 ×4208 ↑↑↑↑ 54 | # ├──trading post (Q39463) •14 ×97 ↑ 55 | # ├──monastery (Q44613) •100 ×13536 ↑↑↑↑↑ 56 | # ├──barangay (Q61878) •39 ×3524 ↑ 57 | # ├──county seat (Q62049) •34 ×1694 ↑ 58 | # 59 | # Some instances don't have titles, e.g. https://www.wikidata.org/wiki/Q17218407 60 | # but can still be assigned to wikipedia articles, in this case 61 | # https://ja.wikipedia.org/wiki/%E3%82%81%E3%81%8C%E3%81%B2%E3%82%89%E3%82%B9%E3%82%AD%E3%83%BC%E5%A0%B4 62 | # so we leave them in. 63 | 64 | mkdir -p $DOWNLOADED_PATH 65 | mkdir -p $TEMP_PATH 66 | 67 | echo "Number of place types:" 68 | wc -l config/wikidata_place_types.txt 69 | echo -n > $DOWNLOADED_PATH/wikidata_place_dump.csv 70 | 71 | while read PT_LINE ; do 72 | QID=$(echo $PT_LINE | sed 's/;.*//' ) 73 | NAME=$(echo $PT_LINE | sed 's/^.*;//' ) 74 | 75 | # Querying for place type Q205495 (petrol station)... 76 | echo "Querying for place type $QID ($NAME)..." 77 | 78 | # Example response from wdtaxonomy in CSV format for readability: 79 | # level,id,label,sites,instances,parents 80 | # [...] 81 | # -,Q110941628,Tegatayama Ski Area,0,0, 82 | # -,Q111016306,Ski resort Říčky,0,0, 83 | # -,Q111016347,Ski resort Deštné v Orlických horách,0,0, 84 | # -,Q111818006,Lively Ski Hill,0,0, 85 | # -,Q111983623,Falls Creek Alpine Resort,0,0, 86 | # -,Q1535041,summer skiing area,3,0,^^ 87 | # -,Q2292158,,1,0, 88 | # -,Q5136446,Club skifield,1,0, 89 | # --,Q6922586,Mount Olympus Ski Area,0,0, 90 | # -,Q30752692,,1,0, 91 | # 92 | # For faster queries we use --no-instancecount and --no-labels 93 | # Now the columns are actually 'level,id,label,sites,parents' with 'label' always empty. 
94 | # Unclear why for TSV the header is still commas, likely a bug in wdtaxonomy 95 | # 96 | # We don't care about parents ('^^', so called broader subcategories) in the last column. 97 | # We filter subcategoies, e.g. 'Club skifield', we're only interested in the children 98 | # (instances). Subcategories have 'sites' value > 0 99 | # 100 | 101 | wdtaxonomy $QID --instances --no-instancecount --no-labels --format tsv | \ 102 | cut -f1-4 | \ 103 | grep -e "[[:space:]]0$" | \ 104 | cut -f2 | \ 105 | sort | \ 106 | awk -v qid=$QID '{print $0 ","qid}' > $TEMP_PATH/$QID.csv 107 | wc -l $TEMP_PATH/$QID.csv 108 | 109 | # output example: 110 | # Q97774986,Q130003 111 | # Q980500,Q130003 112 | # Q988298,Q130003 113 | # Q991719,Q130003 114 | # Q992902,Q130003 115 | # Q995986,Q130003 116 | 117 | cat $TEMP_PATH/$QID.csv >> $DOWNLOADED_PATH/wikidata_place_dump.csv 118 | rm $TEMP_PATH/$QID.csv 119 | done < config/wikidata_place_types.txt 120 | 121 | # Non-Q is less than 20, not sure what they mean 122 | # L673595,Q4830453 123 | # P750,Q4830453 124 | # L162425-S2,Q40357 125 | # uniq saves 4% lines 126 | # 470MB compressed 72MB 127 | grep '^Q' $DOWNLOADED_PATH/wikidata_place_dump.csv | \ 128 | uniq | \ 129 | pigz -f -9 > $DOWNLOADED_PATH/wikidata_place_dump.csv.gz 130 | 131 | cp config/wikidata_place_type_levels.csv $DOWNLOADED_PATH 132 | # temp should be empty but if not then that should be fine, too 133 | rmdir $TEMP_PATH 134 | 135 | du -h $DOWNLOADED_PATH 136 | -------------------------------------------------------------------------------- /steps/wikidata_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | # Languages as comma-separated string, e.g. 
'en,fr,de' 7 | : ${LANGUAGES:=bar,cy} 8 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 9 | 10 | psqlcmd() { 11 | psql --quiet $DATABASE_NAME |& \ 12 | grep -v 'does not exist, skipping' 13 | } 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | echo "=====================================================================" 22 | echo "Create derived tables" 23 | echo "=====================================================================" 24 | 25 | 26 | echo "DROP TABLE IF EXISTS geo_earth_wikidata;" | psqlcmd 27 | echo "CREATE TABLE geo_earth_wikidata AS 28 | SELECT DISTINCT geo_tags.gt_page_id, 29 | geo_tags.gt_lat, 30 | geo_tags.gt_lon, 31 | page.page_title 32 | FROM geo_tags 33 | LEFT OUTER JOIN page 34 | ON (geo_tags.gt_page_id = page.page_id) 35 | ORDER BY geo_tags.gt_page_id 36 | ;" | psqlcmd 37 | 38 | echo "ALTER TABLE wikidata_place_dump 39 | ADD COLUMN ont_level integer, 40 | ADD COLUMN lat numeric(11,8), 41 | ADD COLUMN lon numeric(11,8) 42 | ;" | psqlcmd 43 | 44 | echo "UPDATE wikidata_place_dump 45 | SET ont_level = wikidata_place_type_levels.level 46 | FROM wikidata_place_type_levels 47 | WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type 48 | ;" | psqlcmd 49 | 50 | 51 | echo "DROP TABLE IF EXISTS wikidata_places;" | psqlcmd 52 | echo "CREATE TABLE wikidata_places 53 | AS 54 | SELECT DISTINCT ON (item) item, 55 | instance_of, 56 | MAX(ont_level) AS ont_level, 57 | lat, 58 | lon 59 | FROM wikidata_place_dump 60 | GROUP BY item, 61 | instance_of, 62 | ont_level, 63 | lat, 64 | lon 65 | ORDER BY item 66 | ;" | psqlcmd 67 | 68 | echo "UPDATE wikidata_places 69 | SET lat = geo_earth_wikidata.gt_lat, 70 | lon = geo_earth_wikidata.gt_lon 71 | FROM geo_earth_wikidata 72 | WHERE wikidata_places.item = geo_earth_wikidata.page_title 73 | ;" | psqlcmd 74 | 75 | 76 | 77 | 78 | echo "=====================================================================" 79 | echo "Process language pages" 80 | echo "=====================================================================" 81 | 82 | 83 | echo "DROP TABLE IF EXISTS wikidata_pages;" | psqlcmd 84 | echo "CREATE TABLE wikidata_pages ( 85 | item text, 86 | instance_of text, 87 | lat numeric(11,8), 88 | lon numeric(11,8), 89 | wp_page_title text, 90 | language text 91 | );" | psqlcmd 92 | 93 | for LANG in "${LANGUAGES_ARRAY[@]}" 94 | do 95 | echo "DROP TABLE IF EXISTS wikidata_${LANG}_pages;" | psqlcmd 96 | echo "CREATE TABLE wikidata_${LANG}_pages AS 97 | SELECT wikidata_places.item, 98 | wikidata_places.instance_of, 99 | wikidata_places.lat, 100 | wikidata_places.lon, 101 | wb_items_per_site.ips_site_page 102 | FROM wikidata_places 103 | LEFT JOIN wb_items_per_site 104 | ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) 105 | WHERE ips_site_id = '${LANG}wiki' 106 | ORDER BY wikidata_places.item 107 | ;" | psqlcmd 108 | 109 | echo "INSERT INTO wikidata_pages 110 | SELECT item, 111 | instance_of, 112 | lat, 113 | lon, 114 | REPLACE(ips_site_page, ' ', '_') as wp_page_title, 115 | '${LANG}' 116 | FROM wikidata_${LANG}_pages 117 | ;" | psqlcmd 118 | done 119 | 120 | 121 | 122 | 123 | echo "=====================================================================" 124 | echo "Add wikidata to wikipedia_article_full table" 125 | echo "=====================================================================" 126 | 127 | echo "UPDATE wikipedia_article_full 128 | SET lat = wikidata_pages.lat, 129 | lon = wikidata_pages.lon, 130 | wd_page_title = wikidata_pages.item, 131 | instance_of = 
wikidata_pages.instance_of 132 | FROM wikidata_pages 133 | WHERE wikipedia_article_full.language = wikidata_pages.language 134 | AND wikipedia_article_full.title = wikidata_pages.wp_page_title 135 | ;" | psqlcmd 136 | 137 | # 35 minutes 138 | # 166m rows 139 | 140 | 141 | echo "=====================================================================" 142 | echo "Calculate importance score for each wikipedia page" 143 | echo "=====================================================================" 144 | 145 | # takes 3 minutes 146 | # 'greatest' because log(1)/ is always 0 147 | echo "UPDATE wikipedia_article_full 148 | SET importance = GREATEST( 149 | LOG(totalcount) 150 | / 151 | LOG(( 152 | SELECT MAX(totalcount) 153 | FROM wikipedia_article_full 154 | WHERE wd_page_title IS NOT NULL 155 | )), 156 | 0.0000000001 157 | ) 158 | ;" | psqlcmd 159 | -------------------------------------------------------------------------------- /steps/wikipedia_sql2csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${LANGUAGES:=bar,cy} 6 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 7 | 8 | DOWNLOADED_PATH="$BUILDID/downloaded/wikipedia" 9 | CONVERTED_PATH="$BUILDID/converted/wikipedia" 10 | 11 | echo "=====================================================================" 12 | echo "Convert Wikipedia language tables" 13 | echo "=====================================================================" 14 | 15 | for LANG in "${LANGUAGES_ARRAY[@]}" 16 | do 17 | mkdir -p "$CONVERTED_PATH/$LANG/" 18 | 19 | echo "[language $LANG] Page table SQL => CSV" 20 | # https://www.mediawiki.org/wiki/Manual:Page_table 21 | # 22 | # CREATE TABLE `page` ( 23 | # `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 24 | # `page_namespace` int(11) NOT NULL DEFAULT 0, 25 | # `page_title` varbinary(255) NOT NULL DEFAULT '', 26 | # `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0, 27 | # `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0, 28 | # `page_random` double unsigned NOT NULL DEFAULT 0, 29 | # `page_touched` varbinary(14) NOT NULL DEFAULT '', 30 | # `page_links_updated` varbinary(14) DEFAULT NULL, 31 | # `page_latest` int(8) unsigned NOT NULL DEFAULT 0, 32 | # `page_len` int(8) unsigned NOT NULL DEFAULT 0, 33 | # `page_content_model` varbinary(32) DEFAULT NULL, 34 | # `page_lang` varbinary(35) DEFAULT NULL, 35 | # 36 | # Only interested in page_namespace == 0 (articles) 37 | # English wikipedia: 38 | # input 1.9GB compressed 39 | # output 200MB compressed 40 | # Output columns: page_id, page_title 41 | 42 | unpigz -c $DOWNLOADED_PATH/$LANG/page.sql.gz | \ 43 | bin/mysqldump_to_csv.py | \ 44 | bin/filter_page.py | \ 45 | pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz 46 | 47 | 48 | echo "[language $LANG] linktarget table SQL => CSV" 49 | # https://www.mediawiki.org/wiki/Manual:Linktarget_table 50 | # 51 | # CREATE TABLE `linktarget` ( 52 | # `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 53 | # `lt_namespace` int(11) NOT NULL, 54 | # `lt_title` varbinary(255) NOT NULL, 55 | # 56 | # Only interested in lt_namespace == 0 (articles) 57 | # English wikipedia: 58 | # input 964MB compressed (100m rows) 59 | # output 322MB compressed (30m rows) 60 | # Output columns: lt_id, lt_title 61 | 62 | unpigz -c $DOWNLOADED_PATH/${LANG}/linktarget.sql.gz | \ 63 | bin/mysqldump_to_csv.py | \ 64 | bin/filter_redirect.py | \ 65 | pigz -9 > $CONVERTED_PATH/$LANG/linktarget.csv.gz 66 | 67 | 68 | 69 | echo "[language $LANG] 
Pagelinks table SQL => CSV" 70 | # https://www.mediawiki.org/wiki/Manual:Pagelinks_table 71 | # 72 | # CREATE TABLE `pagelinks` ( 73 | # `pl_from` int(8) unsigned NOT NULL DEFAULT 0, 74 | # `pl_namespace` int(11) NOT NULL DEFAULT 0, 75 | # `pl_target_id` bigint(20) unsigned NOT NULL, 76 | # 77 | # Only interested in target_ids that point to == 0 (articles) 78 | # English wikipedia: 79 | # input 6.8GB compressed 80 | # output 200MB compressed 81 | # Output columns: lt_title (from linktarget file), count (unique pl_from) 82 | 83 | unpigz -c $DOWNLOADED_PATH/$LANG/pagelinks.sql.gz | \ 84 | bin/mysqldump_to_csv.py | \ 85 | bin/filter_pagelinks.py $CONVERTED_PATH/$LANG/linktarget.csv.gz | \ 86 | pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz 87 | 88 | 89 | echo "[language $LANG] langlinks table SQL => CSV" 90 | # https://www.mediawiki.org/wiki/Manual:Langlinks_table 91 | # 92 | # CREATE TABLE `langlinks` ( 93 | # `ll_from` int(8) unsigned NOT NULL DEFAULT 0, 94 | # `ll_lang` varbinary(35) NOT NULL DEFAULT '', 95 | # `ll_title` varbinary(255) NOT NULL DEFAULT '', 96 | # 97 | # Output columns: ll_title, ll_from_page_id, ll_lang 98 | # Output is sorted by lang 99 | # English wikipedia: 100 | # input 400MB compressed (1.5GB uncompressed) 101 | # output 310MB compressed (1.3GB uncompressed) 102 | 103 | unpigz -c $DOWNLOADED_PATH/${LANG}/langlinks.sql.gz | \ 104 | bin/mysqldump_to_csv.py | \ 105 | bin/filter_langlinks.py | \ 106 | pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz 107 | 108 | 109 | 110 | 111 | echo "[language $LANG] redirect table SQL => CSV" 112 | # https://www.mediawiki.org/wiki/Manual:Redirect_table 113 | # 114 | # CREATE TABLE `redirect` ( 115 | # `rd_from` int(8) unsigned NOT NULL DEFAULT 0, 116 | # `rd_namespace` int(11) NOT NULL DEFAULT 0, 117 | # `rd_title` varbinary(255) NOT NULL DEFAULT '', 118 | # `rd_interwiki` varbinary(32) DEFAULT NULL, 119 | # `rd_fragment` varbinary(255) DEFAULT NULL, 120 | # 121 | # Only interested in rd_namespace = 0 (articles) 122 | # Output columns: rd_from_page_id, rd_title 123 | # English wikipedia: 124 | # input 140MB compressed (530MB uncompressed) 125 | # output 120MB compressed (300MB uncompressed) 126 | 127 | unpigz -c $DOWNLOADED_PATH/$LANG/redirect.sql.gz | \ 128 | bin/mysqldump_to_csv.py | \ 129 | bin/filter_redirect.py | \ 130 | pigz -9 > $CONVERTED_PATH/$LANG/redirect.csv.gz 131 | 132 | du -h $CONVERTED_PATH/$LANG/* 133 | done 134 | -------------------------------------------------------------------------------- /config/wikidata_place_types.txt: -------------------------------------------------------------------------------- 1 | Q9842;primary school 2 | Q149566;middle school 3 | Q9430;ocean 4 | Q928830;metro station 5 | Q9259;UNESCO World Heritage Site 6 | Q91028;administrative arrondissement of Belgium 7 | Q8514;desert 8 | Q8502;mountain 9 | Q15324;body of water 10 | Q28575;county 11 | Q39816;valley 12 | Q46831;mountain range 13 | Q50337;prefecture of Japan 14 | Q175185;rural area 15 | Q191086;jungle 16 | Q205895;landmass 17 | Q207520;region of Japan 18 | Q369639;region of Norway 19 | Q15284;municipality 20 | Q123705;neighborhood 21 | Q161387;kibbutz 22 | Q188509;suburb 23 | Q200250;metropolis 24 | Q245016;military base 25 | Q253019;Ortsteil 26 | Q269528;unincorporated area 27 | Q329245;basic unit of settlement in the Czech Republic 28 | Q484170;commune of France 29 | Q498162;census-designated place 30 | Q509028;ranch 31 | Q582706;Israeli settlement 32 | Q587144;census town 33 | Q618299;settlement 34 | Q622499;refugee camp 35 | 
Q627236;company town 36 | Q674950;residential area 37 | Q702492;urban area 38 | Q790344;district of Barcelona 39 | Q815324;town municipality of Turkey 40 | Q820254;mining community 41 | Q1074523;planned community 42 | Q1175522;County city (council system) 43 | Q1198413;military camp 44 | Q1288520;local council in Israel 45 | Q1294703;shanty town 46 | Q1326028;camp 47 | Q1348006;city block 48 | Q1372205;dispersed settlement 49 | Q1394476;civil township 50 | Q1426493;trailer park 51 | Q1501046;community settlement 52 | Q1529096;village in Turkey 53 | Q1907114;metropolitan area 54 | Q2755753;area of London 55 | Q2989398;commune of Algeria 56 | Q3172900;colony 57 | Q3257686;locality 58 | Q3477348;urban area 59 | Q3559019;urban village 60 | Q4313794;populated place in Georgia 61 | Q4373615;colony of the Russian empire 62 | Q4632675;dwelling place 63 | Q4668360;Aboriginal community in Western Australia 64 | Q4845841;settlement (Croatia) 65 | Q5148433;Colonias of Mexico City 66 | Q5195043;borough 67 | Q7930989;city/town 68 | Q10354598;rural settlement 69 | Q10840661;urban area of Vietnam 70 | Q12051488;populated place in Ukraine 71 | Q12063697;neighborhood of Washington, D.C. 72 | Q16480895;hamlet 73 | Q24258416;railway station 74 | Q26714626;large village 75 | Q27062006;station 76 | Q27554677;former capital 77 | Q108775530;proposed human settlement 78 | Q855697;subcontinent 79 | Q1029013;Megaregions of the United States 80 | Q1200957;tourist destination 81 | Q1666245;region of China 82 | Q1970725;natural region 83 | Q2140699;wine-producing region 84 | Q3744088;tourism region 85 | Q30059;arrondissement 86 | Q52105;habitat 87 | Q107425;landscape 88 | Q171809;county of England 89 | Q356936;exclusion zone 90 | Q453909;built-up area 91 | Q518261;cultural area 92 | Q1062177;capital region 93 | Q1081138;historic site 94 | Q1092661;moorland 95 | Q1133961;commercial district 96 | Q1185892;geological massif 97 | Q1248049;Land 98 | Q1286517;natural landscape 99 | Q1389310;waterbody 100 | Q1662024;industrial district 101 | Q1742059;lake area 102 | Q1852859;populated place in the Netherlands 103 | Q2063507;recreation area 104 | Q3241565;woodland 105 | Q10594991;nature area 106 | Q16363669;sports park 107 | Q17350442;venue 108 | Q27995042;wilderness area 109 | Q55726155;financial district 110 | Q99323582;quarter or sector of Monaco 111 | Q820477;mine 112 | Q12323;dam 113 | Q12493;dome 114 | Q27686;hotel 115 | Q39614;cemetery 116 | Q54831;amphitheatre 117 | Q62447;aerodrome 118 | Q62832;observatory 119 | Q83405;factory 120 | Q199451;pagoda 121 | Q483110;stadium 122 | Q483453;fountain 123 | Q587682;oil well 124 | Q653208;monolith 125 | Q671224;data center 126 | Q697295;shrine 127 | Q699405;residence 128 | Q1076486;sports venue 129 | Q4260475;medical facility 130 | Q4989906;monument 131 | Q5327174;earth structure 132 | Q6017969;scenic viewpoint 133 | Q6640302;commercial center 134 | Q10373565;waterworks 135 | Q12146012;underground building 136 | Q12292478;estate 137 | Q15090615;arts venue 138 | Q29845814;trolleybus depot 139 | Q47520309;recycling facility 140 | Q50418254;outdoor concert venue 141 | Q55713852;resthouse 142 | Q56240808;oil rig 143 | Q8072;volcano 144 | Q79007;street 145 | Q786014;rest area 146 | Q75848;gated community 147 | Q75520;plateau 148 | Q728937;railway line 149 | Q7275;state 150 | Q719456;station 151 | Q7075;library 152 | Q6852233;military building 153 | Q682943;cricket field 154 | Q665487;diocese 155 | Q655686;commercial building 156 | Q643589;department 157 | Q641226;arena 158 | 
Q631305;rock formation 159 | Q6256;country 160 | Q6023295;funerary structure 161 | Q5773747;historic house 162 | Q55659167;natural watercourse 163 | Q55488;railway station 164 | Q54050;hill 165 | Q532;village 166 | Q53060;gate 167 | Q52177058;civic building 168 | Q515716;prefecture 169 | Q5153984;commune-level subdivision of Vietnam 170 | Q515;city 171 | Q5144960;microregion 172 | Q5119;capital 173 | Q5107;continent 174 | Q5084;hamlet 175 | Q5031071;canal tunnel 176 | Q5003624;memorial 177 | Q4976993;civil parish 178 | Q4830453;business 179 | Q47521;stream 180 | Q473972;protected area 181 | Q46614560;deanery (building) 182 | Q44782;port 183 | Q44613;monastery 184 | Q44539;temple 185 | Q44494;mill 186 | Q44377;tunnel 187 | Q4421;forest 188 | Q43501;zoo 189 | Q4286337;city district 190 | Q42523;atoll 191 | Q40357;prison 192 | Q4022;river 193 | Q40080;beach 194 | Q39715;lighthouse 195 | Q3957;town 196 | Q3947;house 197 | Q38723;higher education institution 198 | Q38720;windmill 199 | Q3623867;arrondissement of Benin 200 | Q35666;glacier 201 | Q355304;watercourse 202 | Q35509;cave 203 | Q35112127;historic building 204 | Q34985575;city district 205 | Q34876;province 206 | Q34763;peninsula 207 | Q34627;synagogue 208 | Q3455524;administrative region 209 | Q34442;road 210 | Q33837;archipelago 211 | Q33506;museum 212 | Q24699794;museum building 213 | Q32815;mosque 214 | Q3240715;crater 215 | Q3191695;regency of Indonesia 216 | Q3153117;intercommunality 217 | Q30198;marsh 218 | Q30139652;health care structure 219 | Q294422;public building 220 | Q2870166;water ride 221 | Q274153;water tower 222 | Q271669;landform 223 | Q2659904;government organization 224 | Q24529780;point 225 | Q24354;theater 226 | Q2354973;road tunnel 227 | Q23442;island 228 | Q23413;castle 229 | Q23397;lake 230 | Q2327515;city district of Baden-Württemberg 231 | Q2311958;canton 232 | Q22927291;sixth-level administrative country subdivision 233 | Q22698;park 234 | Q2175765;tram stop 235 | Q205495;petrol station 236 | Q204832;roller coaster 237 | Q2042028;ravine 238 | Q202216;overseas department and region of France 239 | Q194203;arrondissement of France 240 | Q194195;amusement park 241 | Q185187;watermill 242 | Q185113;cape 243 | Q1799794;administrative territorial entity of a specific level 244 | Q1788454;road junction 245 | Q1785071;fort 246 | Q1777138;race track 247 | Q177380;hot spring 248 | Q174814;electrical substation 249 | Q174782;square 250 | Q17343829;unincorporated community in the United States 251 | Q17018380;bight 252 | Q16970;church building 253 | Q16917;hospital 254 | Q39364723;hospital building 255 | Q16831714;government building 256 | Q165;sea 257 | Q160742;abbey 258 | Q159719;power station 259 | Q159334;secondary school 260 | Q15640612;fifth-level administrative country subdivision 261 | Q15243209;historic district 262 | Q152081;concentration camp 263 | Q15195406;city district in Russia 264 | Q1500350;township of the People's Republic of China 265 | Q149621;district 266 | Q14757767;fourth-level administrative country subdivision 267 | Q14350;radio station 268 | Q1410668;National Wildlife Refuge 269 | Q1377575;wildlife refuge 270 | Q1353183;launch pad 271 | Q134447;nuclear power plant 272 | Q133215;casino 273 | Q133056;mountain pass 274 | Q13221722;third-level administrative country subdivision 275 | Q13220204;second-level administrative country subdivision 276 | Q1311958;railway tunnel 277 | Q1303167;barn 278 | Q130003;ski resort 279 | Q12518;tower 280 | Q489357;farmhouse 281 | Q12516;pyramid 282 | 
Q1248784;airport 283 | Q12284;canal 284 | Q12280;bridge 285 | Q121359;infrastructure 286 | Q1210950;channel 287 | Q11755880;residential building 288 | Q11707;restaurant 289 | Q11315;shopping center 290 | Q11303;skyscraper 291 | Q1115575;civil parish 292 | Q1107656;garden 293 | Q10864048;first-level administrative country subdivision 294 | Q105731;lock 295 | Q105190;levee 296 | Q1048525;golf course 297 | Q102496;parish 298 | Q28872924;designation for an administrative territorial entity of a single country 299 | Q15617994;designation for an administrative territorial entity 300 | Q159313;urban agglomeration 301 | Q24398318;religious building 302 | Q327333;government agency 303 | Q860861;sculpture 304 | Q46395;British overseas territories 305 | Q103910131;part of city or town or population centre 306 | Q103910177;city or town or population centre 307 | Q103910453;village or neigbourhood -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Add Wikipedia and Wikidata to Nominatim 2 | 3 | ## Summary 4 | 5 | This project creates an export of Wikipedia articles (title, wikidata id) and an calculated importance score (0..1) for each. 6 | If Wikipedia has redirects to titles then each redirect is also added. 7 | 8 | The score can be used to approximate how important a place name is relative to another by the same name. 9 | 10 | Examples: 11 | 12 | * "Berlin" (capital of Germany, [Wikipedia](https://en.wikipedia.org/wiki/Berlin), [OpenStreetMap](https://www.openstreetmap.org/relation/62422)) 13 | vs "Berlin" (town in Maryland, USA, [Wikipedia](https://en.wikipedia.org/wiki/en:Berlin,%20Maryland), [OpenStreetMap](https://www.openstreetmap.org/relation/133689)). 14 | * "Eiffel Tower" (Paris, France, [Wikipedia](https://en.wikipedia.org/wiki/Eiffel_Tower), [OpenStreetMap](https://www.openstreetmap.org/way/5013364)) vs "Eiffel Tower" (Paris, Tennessee, United States, [Wikipedia](https://en.wikipedia.org/wiki/Eiffel_Tower_(Paris,_Tennessee)), [OpenStreetMap](https://www.openstreetmap.org/way/1080841041)). 15 | * 50 places called "Springfield" in the United States 16 | * 35 places called "Washington" in the United States 17 | 18 | [Nominatim](https://nominatim.org/) geocoding engine can import the files and improve its ranking of 19 | place search results. During searches Nominatim combines importance score with other ranking factors like place type 20 | (city vs county vs village), proximity (e.g. current map view position), phrase relevance (how many words 21 | in the results match the search terms). 22 | 23 | Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month. 24 | 25 | To run one build you need 150GB of disc space (of which 90GB Postgresql database). The scripts process 26 | 39 languages and output 4 files. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with NVMe 27 | drives. 28 | 29 | ``` 30 | 334M wikimedia_importance.tsv.gz 31 | ``` 32 | 33 | 34 | ## History of this project 35 | 36 | Nominatim 2.2 introduced the first `utils/importWikipedia.php` using [mwdumper](https://github.com/bcollier/mwdumper/), 37 | then parsing HTML pages to find geo coordindates in articles. It was a single script without documentation on runtime 38 | and ran irregular (less than once per year). Output was binary SQL database dumps. 
39 | 40 | During several months of [Google Summer of Code](https://en.wikipedia.org/wiki/Google_Summer_of_Code) 2019, [tchaddad](https://www.openstreetmap.org/user/tchaddad) rewrote the script, added wikidata processing, documentation and merged files into a new `wikimedia-importance.sql.gz` export. You can read her reports on [her diary posts](https://www.openstreetmap.org/user/tchaddad/diary). 41 | 42 | Nominatim 3.5 switched to using the new `wikimedia-importance.sql.gz` file and improved its ranking algorithm. 43 | 44 | Later the project was moved into its own git repository. In small steps the process was split into steps for downloading, 45 | converting, processing, creating output. `mysql2pgsql` was replaced with `mysqldump`, which allowed filtering in scripts. 46 | Performance was improved by loading only required data into the database. Some caching (don't redownload files) and 47 | retries (wikidata API being unreliable) was added. 48 | 49 | 50 | ## Output data 51 | 52 | `wikimedia_importance.tsv.gz` contains about 17 million rows. Number of lines grew 2% between 2022 and 2023. 53 | The file tab delimited, not quoted, is sorted and contains a header row. 54 | 55 | | Column | Type | 56 | | ----------- | ---------------- | 57 | | language | text | 58 | | type | char | 59 | | title | text | 60 | | importance | double precision | 61 | | wikidata_id | text | 62 | 63 | All columns are filled with values. 64 | 65 | Combination of language+title (and language+type+title) are unique. 66 | 67 | Type is either 'a' (article) or 'r' (redirect). 68 | 69 | Maximum title length is 247. 70 | 71 | Importance is between 0.0000000001 (never 0) and 1. 72 | 73 | Currently 39 languages, English has by far the largest share. 74 | 75 | | language | count | 76 | | -------------- | ---------------- | 77 | | en (English) | 3,337,994 (19%) | 78 | | de (German) | 966,820 (6%) | 79 | | fr (French) | 935,817 (5%) | 80 | | sv (Swdish) | 906,813 | 81 | | uk (Ukranian) | 900,548 | 82 | | ... | | 83 | | bg (Bulgarian) | 88,993 | 84 | 85 | Examples of `wikimedia_importance.tsv.gz` rows: 86 | 87 | * Wikipedia contains redirects, so a single wikidata object can have multiple titles even though. Each title has the same importance score. Redirects to non-existing articles are removed. 88 | 89 | ``` 90 | en a Brandenburg_Gate 0.5531125195487524 Q82425 91 | en r Berlin's_Gate 0.5531125195487524 Q82425 92 | en r Brandenberg_Gate 0.5531125195487524 Q82425 93 | en r Brandenburger_gate 0.5531125195487524 Q82425 94 | en r Brandenburger_Gate 0.5531125195487524 Q82425 95 | en r Brandenburger_Tor 0.5531125195487524 Q82425 96 | en r Brandenburg_gate 0.5531125195487524 Q82425 97 | en r BRANDENBURG_GATE 0.5531125195487524 Q82425 98 | en r Brandenburg_Gates 0.5531125195487524 Q82425 99 | en r Brandenburg_Tor 0.5531125195487524 Q82425 100 | ``` 101 | 102 | * Wikipedia titles contain underscores instead of space, e.g. 
[Alford,_Massachusetts](https://en.wikipedia.org/wiki/Alford,_Massachusetts) 103 | 104 | ``` 105 | en a "Alford _Massachusetts" 0.36590368314334637 Q2431901 106 | en r "Alford _ma" 0.36590368314334637 Q2431901 107 | en r "Alford _MA" 0.36590368314334637 Q2431901 108 | en r "Alford _Mass" 0.36590368314334637 Q2431901 109 | ``` 110 | 111 | * The highest score article is the [United States](https://en.wikipedia.org/wiki/United_States) 112 | 113 | ``` 114 | pl a Stany_Zjednoczone 1 Q30 115 | en a United_States 1 Q30 116 | ru a Соединённые_Штаты_Америки 1 Q30 117 | hu a Amerikai_Egyesült_Államok 1 Q30 118 | it a Stati_Uniti_d'America 1 Q30 119 | de a Vereinigte_Staaten 1 Q30 120 | ... 121 | ``` 122 | 123 | ## How importance scores are calculated 124 | 125 | Wikipedia articles with more links to them from other articles ("pagelinks") plus from other languages ("langlinks") receive a higher score. 126 | 127 | 1. The Wikipedia dump file `${language}pagelinks` contains how many links each Wikipedia article 128 | has **from** other Wikipedia articles of the same language. We store that as `langcount` for 129 | each article. 130 | 131 | The dump has the columns 132 | 133 | ```sql 134 | CREATE TABLE `pagelinks` ( 135 | `pl_from` int(8) unsigned NOT NULL DEFAULT 0, 136 | `pl_namespace` int(11) NOT NULL DEFAULT 0, 137 | `pl_title` varbinary(255) NOT NULL DEFAULT '', 138 | `pl_from_namespace` int(11) NOT NULL DEFAULT 0, 139 | ``` 140 | 141 | After filtering namespaces (0 = articles) we only have to look at the `pl_title` column 142 | and count now often each title occurs. For example `Eiffel_Tower` 2862 times (*). 143 | We store that as `langcount` for each article. 144 | 145 | *) `zgrep -c -e'^Eiffel_Tower$' converted/wikipedia/en/pagelinks.csv.gz` 146 | 147 | 2. The dump file `${language}langlinks` contains how many links each Wikipedia article has **to** 148 | other languages. Such a link doesn't count as 1 but as number of `${language}pagelinks`. 149 | 150 | The dump has the columns 151 | 152 | ```sql 153 | CREATE TABLE `langlinks` ( 154 | `ll_from` int(8) unsigned NOT NULL DEFAULT 0, 155 | `ll_lang` varbinary(35) NOT NULL DEFAULT '', 156 | `ll_title` varbinary(255) NOT NULL DEFAULT '', 157 | ``` 158 | 159 | For example the row `"9232,fr,Tour Eiffel"` in `enlanglinks` file means the 160 | [English article](https://en.wikipedia.org/wiki/Eiffel_Tower) has a link to the 161 | [French article](https://fr.wikipedia.org/wiki/Tour_Eiffel) (*). 162 | 163 | When processing the English language we need to inspect and calculate the sum of 164 | the `langlinks` files of all other languages. We store that as `othercount` for 165 | each article. 166 | 167 | For example the French article gets 2862 links from the English article (plus more 168 | from the other languages). 169 | 170 | *) The `langlink` files have no underscores in the title while other files do. 171 | 172 | 3. `langcount` and `othercount` together are `totalcount`. 173 | 174 | 4. We check which article has the highest (maximum) count of links to it. Currently that's 175 | "United States" with a `totalcount` of 5,198,249. All other articles are scored on a 176 | logarithmic scale accordingly. 177 | 178 | For example an article with half (2,599,124) the links to it gets a score of 0.952664935, an 179 | article with 10% (519,825) the links get a score of 0.85109869, an article with 1% a score of 180 | 0.7021967. 
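
   Put differently, each score is `LOG(totalcount) / LOG(max totalcount)`, floored at
   0.0000000001 so it never becomes 0. A minimal shell sketch of the same arithmetic
   (the maximum of 5,198,249 is simply the figure quoted above; the real value depends
   on the dump being processed), mirroring the SQL statement below:

   ```bash
   # MAX_TOTALCOUNT is the highest totalcount of any article in the current build.
   MAX_TOTALCOUNT=5198249
   importance() {
     awk -v c="$1" -v m="$MAX_TOTALCOUNT" \
       'BEGIN { s = log(c) / log(m); if (s < 0.0000000001) s = 0.0000000001; printf "%.10f\n", s }'
   }
   importance 519825   # 10% of the maximum -> ~0.851
   importance 51982    # 1% of the maximum  -> ~0.702
   ```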
181 | 182 | ```sql 183 | SET importance = GREATEST( 184 | LOG(totalcount) 185 | / 186 | LOG(( 187 | SELECT MAX(totalcount) 188 | FROM wikipedia_article_full 189 | WHERE wd_page_title IS NOT NULL 190 | )), 191 | 0.0000000001 192 | ) 193 | ``` 194 | 195 | 196 | 197 | 198 | 199 | ## How Nominatim uses the files 200 | 201 | (As of Nominatim 4.2) 202 | 203 | During [Nominatim installation](https://nominatim.org/release-docs/latest/admin/Import/#wikipediawikidata-rankings 204 | ) 205 | it will check if a wikipedia-importance file is present and automatically import it into the 206 | database tables `wikpedia_article` and `wikipedia_redirect`. There is also a `nominatim refresh` 207 | command to update the tables later. 208 | 209 | OpenStreetMap contributors frequently tag items with links to Wikipedia 210 | ([documentation](https://wiki.openstreetmap.org/wiki/Key:wikipedia)) 211 | and Wikidata ([documentation](https://wiki.openstreetmap.org/wiki/Key:wikidata)). For example 212 | [Newcastle upon Tyne](https://www.openstreetmap.org/relation/142282) has the tags 213 | 214 | | tag | value | 215 | | ------------- | ------------------------------- | 216 | | admin_level | 8 | 217 | | boundary | administrative | 218 | | name | Newcastle upon Tyne | 219 | | type | boundary | 220 | | website | https://www.newcastle.gov.uk/ | 221 | | wikidata | Q1425428 | 222 | | wikipedia | en:Newcastle upon Tyne | 223 | 224 | When Nominatim indexes places it checks if they have an wikipedia or wikidata tag. If they do 225 | they set the `importance` value in the `placex` table for that place. This happens in 226 | `compute_importance` in `lib-sql/functions/importance.sql` (called from methods in 227 | `lib-sql/functions/placex_triggers.sql`. This is also were default values are set 228 | (when a place has neither). 229 | 230 | During a search Nominatim will inspect the `importance` value of a place and use that as 231 | one of the ranking (sorting) factors. 232 | 233 | See also [Nominatim importance documentation](https://nominatim.org/release-docs/latest/customize/Importance/). 234 | 235 | 236 | ## Steps of the build 237 | 238 | Have a look at `complete_run.sh` as entry point to the code. You will require a local Postgresql database. Edit 239 | the `languages.txt` file to only run a small language (e.g. Bulgarian) first. 240 | 241 | 1. latest\_available\_data 242 | 243 | Prints a date. Wikipedia exports take many days, then mirrors are sometimes slow copying them. It's not 244 | uncommon for an export starting Jan/1st to only be full ready Jan/10th or later. 245 | 246 | 2. wikipedia_download (1h) 247 | 248 | Downloads 40GB compressed files. 4 files per language. English is 10GB. 249 | 250 | 3. wikidata\_download (0:15h) 251 | 252 | Another 4 files, 5GB. 253 | 254 | 4. wikidata_api\_fetch\_placetypes (0:15h) 255 | 256 | Runs 300 SPARQL queries against wikidata servers. Output is 5GB. 257 | 258 | 5. wikipedia_sql2csv (4:20h) 259 | 260 | The MySQL SQL files get parsed sequentially and we try to exclude as much data (rows, 261 | columns) as possible. Output is 75% smaller than input. Any work done here cuts 262 | down the time (and space) needed in the database (database used to be 1TB before 263 | this step). 264 | 265 | Most time is spend on the Pagelinks table 266 | 267 | ``` 268 | [language en] Page table (0:06h) 269 | [language en] Pagelinks table (0:50h) 270 | [language en] langlinks table (0:02h) 271 | [language en] redirect table (0:01h) 272 | ``` 273 | 274 | 6. 
wikidata_sql2csv (0:15h) 275 | 276 | ``` 277 | geo_tags (0:01h) 278 | page (0:09h) 279 | wb_items_per_site (0:07h) 280 | ``` 281 | 282 | 7. wikipedia\_import, wikidata\_import (0:10h) 283 | 284 | Given the number of rows a pretty efficient loading of data into Postgresql. 285 | 286 | English database tables 287 | 288 | ``` 289 | enlanglinks | 28,365,965 rows | 1762 MB 290 | enpage | 17,211,555 rows | 946 MB 291 | enpagelinkcount | 27,792,966 rows | 2164 MB 292 | enpagelinks | 61,310,384 rows | 3351 MB 293 | enredirect | 10,804,606 rows | 599 MB 294 | ``` 295 | 296 | 8. wikipedia\_process, wikidata\_process (2:30h) 297 | 298 | Postgresql is great joining large datasets together, especially if not all 299 | data fits into RAM. 300 | 301 | ``` 302 | set othercounts (2:20h) 303 | Create and fill wikipedia_article_full (0.03h) 304 | Create derived tables (0.03h) 305 | Process language pages (0.03h) 306 | Add wikidata to wikipedia_article_full table (0.04h) 307 | Calculate importance score for each wikipedia page (0.08h) 308 | ``` 309 | 310 | 9. output (0:15h) 311 | 312 | Uses `pg_dump` tool to create SQL files. Uses SQL `COPY` command to create TSV file. 313 | 314 | 315 | License 316 | ------- 317 | The source code is available under a GPLv2 license. 318 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 
38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. 
For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 
209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /wikidata_places.md: -------------------------------------------------------------------------------- 1 | 2 | ## Wikidata place types and related OSM Tags 3 | 4 | Wikidata does not have any official ontologies, however the [DBpedia project](https://wiki.dbpedia.org/) has created an [ontology](https://wiki.dbpedia.org/services-resources/ontology) that covered [place types](http://mappings.dbpedia.org/server/ontology/classes/#Place). The table below used the DBpedia place ontology as a starting point, and is provided as a cross-reference to the relevant OSM tags. 5 | 6 | The Wikidata place types listed in the table below can be used in conjunction with the [Wikidata Query Service](https://query.wikidata.org/) to retrieve instances of those place types from the Wikidata knowledgebase. 7 | 8 | ``` 9 | SELECT ?item ?lat ?lon 10 | WHERE { 11 | ?item wdt:P31*/wdt:P279*wd:Q9430; wdt:P625 ?pt. 12 | ?item p:P625?loc. 13 | ?loc psv:P625?cnode. 14 | ?cnode wikibase:geoLatitude?lat. 15 | ?cnode wikibase:geoLongitude?lon. 16 | } 17 | ``` 18 | 19 | An example json return for all instances of the Wikidata item "Q9430" (Ocean) can be seen at [json](https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT?item?lat?lon%20WHERE{?item%20wdt:P31*/wdt:P279*wd:Q9430;wdt:P625?pt.?item%20p:P625?loc.?loc%20psv:P625?cnode.?cnode%20wikibase:geoLatitude?lat.?cnode%20wikibase:geoLongitude?lon.}) 20 | 21 | **NOTE** the OSM tags listed are those listed in the wikidata entries, and not all the possible matches for tags within OSM. 
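
For ad-hoc checks, the query above can also be run from the shell against the public query service (a minimal sketch using the same endpoint the JSON link above points at; `jq` only counts the returned bindings and is optional):

```bash
# Fetch coordinates for all instances/subclasses of Q9430 (ocean) as SPARQL JSON results.
curl -s -G 'https://query.wikidata.org/sparql' \
     -H 'Accept: application/sparql-results+json' \
     --data-urlencode query='SELECT ?item ?lat ?lon WHERE {
       ?item wdt:P31*/wdt:P279* wd:Q9430; wdt:P625 ?pt.
       ?item p:P625 ?loc. ?loc psv:P625 ?cnode.
       ?cnode wikibase:geoLatitude ?lat.
       ?cnode wikibase:geoLongitude ?lon.
     }' | jq '.results.bindings | length'
```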
22 | 23 | 24 | title | concept | OSM Tag | 25 | -----------|---------------------------------------|------------------| 26 | [Q17334923](https://www.wikidata.org/entity/Q17334923) | Location | | 27 | [Q811979](https://www.wikidata.org/entity/Q811979) | Architectural Structure | | 28 | [Q194195](https://www.wikidata.org/entity/Q194195) | Amusement park | | 29 | [Q204832](https://www.wikidata.org/entity/Q204832) | Roller coaster | [attraction=roller_coaster](https://wiki.openstreetmap.org/wiki/Tag:attraction=roller_coaster) | 30 | [Q2870166](https://www.wikidata.org/entity/Q2870166) | Water ride | | 31 | [Q641226](https://www.wikidata.org/entity/Q641226) | Arena | [amenity=events_centre](https://wiki.openstreetmap.org/wiki/Tag:amenity=events_centre) | 32 | [Q41176](https://www.wikidata.org/entity/Q41176) | Building | [building=yes](https://wiki.openstreetmap.org/wiki/Key:building) | 33 | [Q1303167](https://www.wikidata.org/entity/Q1303167) | Barn | [building=barn](https://wiki.openstreetmap.org/wiki/Tag:building=barn) | 34 | [Q655686](https://www.wikidata.org/entity/Q655686) | Commercial building | [building=commercial](https://wiki.openstreetmap.org/wiki/Tag:building=commercial) | 35 | [Q4830453](https://www.wikidata.org/entity/Q4830453) | Business | | 36 | [Q7075](https://www.wikidata.org/entity/Q7075) | Library | [amenity=library](https://wiki.openstreetmap.org/wiki/Tag:amenity=library) | 37 | [Q133215](https://www.wikidata.org/entity/Q133215) | Casino | [amenity=casino](https://wiki.openstreetmap.org/wiki/Tag:amenity=casino) | 38 | [Q23413](https://www.wikidata.org/entity/Q23413) | Castle | [historic=castle](https://wiki.openstreetmap.org/wiki/Tag:historic=castle) | 39 | [Q83405](https://www.wikidata.org/entity/Q83405) | Factory | | 40 | [Q53060](https://www.wikidata.org/entity/Q53060) | Gate | [barrier=gate](https://wiki.openstreetmap.org/wiki/Tag:barrier=gate) | 41 | [Q11755880](https://www.wikidata.org/entity/Q11755880) | Residential Building | [building=residential](https://wiki.openstreetmap.org/wiki/Tag:building=residential) | 42 | [Q3947](https://www.wikidata.org/entity/Q3947) | House | [building=house](https://wiki.openstreetmap.org/wiki/Tag:building=house) | 43 | [Q35112127](https://www.wikidata.org/entity/Q35112127) | Historic Building | | 44 | [Q5773747](https://www.wikidata.org/entity/Q5773747) | Historic house | | 45 | [Q38723](https://www.wikidata.org/entity/Q38723) | Higher Education Institution | | 46 | [Q3914](https://www.wikidata.org/entity/Q3914) | School | [amenity=school](https://wiki.openstreetmap.org/wiki/Tag:amenity=school) | 47 | [Q9842](https://www.wikidata.org/entity/Q9842) | Primary school | | 48 | [Q159334](https://www.wikidata.org/entity/Q159334) | Secondary school | | 49 | [Q16917](https://www.wikidata.org/entity/Q16917) | Hospital | [amenity=hospital](https://wiki.openstreetmap.org/wiki/Tag:amenity=hospital), [healthcare=hospital](https://wiki.openstreetmap.org/wiki/Tag:healthcare=hospital), [building=hospital](https://wiki.openstreetmap.org/wiki/Tag:building=hospital) | 50 | [Q27686](https://www.wikidata.org/entity/Q27686) | Hotel | [tourism=hotel](https://wiki.openstreetmap.org/wiki/Tag:tourism=hotel), [building=hotel](https://wiki.openstreetmap.org/wiki/Tag:building=hotel) | 51 | [Q33506](https://www.wikidata.org/entity/Q33506) | Museum | [tourism=museum](https://wiki.openstreetmap.org/wiki/Tag:tourism=museum) | 52 | [Q40357](https://www.wikidata.org/entity/Q40357) | Prison | 
[amenity=prison](https://wiki.openstreetmap.org/wiki/Tag:amenity=prison) | 53 | [Q24398318](https://www.wikidata.org/entity/Q24398318) | Religious Building | | 54 | [Q160742](https://www.wikidata.org/entity/Q160742) | Abbey | | 55 | [Q16970](https://www.wikidata.org/entity/Q16970) | Church (building) | [building=church](https://wiki.openstreetmap.org/wiki/Tag:building=church) | 56 | [Q44613](https://www.wikidata.org/entity/Q44613) | Monastery | [amenity=monastery](https://wiki.openstreetmap.org/wiki/Tag:amenity=monastery) | 57 | [Q32815](https://www.wikidata.org/entity/Q32815) | Mosque | [building=mosque](https://wiki.openstreetmap.org/wiki/Tag:building=mosque) | 58 | [Q697295](https://www.wikidata.org/entity/Q697295) | Shrine | [building=shrine](https://wiki.openstreetmap.org/wiki/Tag:building=shrine) | 59 | [Q34627](https://www.wikidata.org/entity/Q34627) | Synagogue | [building=synagogue](https://wiki.openstreetmap.org/wiki/Tag:building=synagogue) | 60 | [Q44539](https://www.wikidata.org/entity/Q44539) | Temple | [building=temple](https://wiki.openstreetmap.org/wiki/Tag:building=temple) | 61 | [Q11707](https://www.wikidata.org/entity/Q11707) | Restaurant | [amenity=restaurant](https://wiki.openstreetmap.org/wiki/Tag:amenity=restaurant) | 62 | [Q11315](https://www.wikidata.org/entity/Q11315) | Shopping mall | [shop=mall](https://wiki.openstreetmap.org/wiki/Tag:shop=mall), [shop=shopping_centre](https://wiki.openstreetmap.org/wiki/Tag:shop=shopping_centre) | 63 | [Q11303](https://www.wikidata.org/entity/Q11303) | Skyscraper | | 64 | [Q17350442](https://www.wikidata.org/entity/Q17350442) | Venue | | 65 | [Q41253](https://www.wikidata.org/entity/Q41253) | Movie Theater | [amenity=cinema](https://wiki.openstreetmap.org/wiki/Tag:amenity=cinema) | 66 | [Q483110](https://www.wikidata.org/entity/Q483110) | Stadium | [leisure=stadium](https://wiki.openstreetmap.org/wiki/Tag:leisure=stadium), [building=stadium](https://wiki.openstreetmap.org/wiki/Tag:building=stadium) | 67 | [Q24354](https://www.wikidata.org/entity/Q24354) | Theater (structure) | [amenity=theatre](https://wiki.openstreetmap.org/wiki/Tag:amenity=theatre) | 68 | [Q121359](https://www.wikidata.org/entity/Q121359) | Infrastructure | | 69 | [Q1248784](https://www.wikidata.org/entity/Q1248784) | Airport | | 70 | [Q12323](https://www.wikidata.org/entity/Q12323) | Dam | [waterway=dam](https://wiki.openstreetmap.org/wiki/Tag:waterway=dam) | 71 | [Q1353183](https://www.wikidata.org/entity/Q1353183) | Launch pad | | 72 | [Q105190](https://www.wikidata.org/entity/Q105190) | Levee | [man_made=dyke](https://wiki.openstreetmap.org/wiki/Tag:man_made=dyke) | 73 | [Q105731](https://www.wikidata.org/entity/Q105731) | Lock (water navigation) | [lock=yes](https://wiki.openstreetmap.org/wiki/Key:lock) | 74 | [Q44782](https://www.wikidata.org/entity/Q44782) | Port | | 75 | [Q159719](https://www.wikidata.org/entity/Q159719) | Power station | [power=plant](https://wiki.openstreetmap.org/wiki/Tag:power=plant) | 76 | [Q174814](https://www.wikidata.org/entity/Q174814) | Electrical substation | | 77 | [Q134447](https://www.wikidata.org/entity/Q134447) | Nuclear power plant | [plant:source=nuclear](https://wiki.openstreetmap.org/wiki/Tag:plant:source=nuclear) | 78 | [Q786014](https://www.wikidata.org/entity/Q786014) | Rest area | [highway=rest_area](https://wiki.openstreetmap.org/wiki/Tag:highway=rest_area), [highway=services](https://wiki.openstreetmap.org/wiki/Tag:highway=services) | 79 | [Q12280](https://www.wikidata.org/entity/Q12280) | Bridge | [bridge=* 
](https://wiki.openstreetmap.org/wiki/Key:bridge), [man_made=bridge](https://wiki.openstreetmap.org/wiki/Tag:man_made=bridge) | 80 | [Q728937](https://www.wikidata.org/entity/Q728937) | Railroad Line | [railway=rail](https://wiki.openstreetmap.org/wiki/Tag:railway=rail) | 81 | [Q1311958](https://www.wikidata.org/entity/Q1311958) | Railway Tunnel | | 82 | [Q34442](https://www.wikidata.org/entity/Q34442) | Road | [highway=* ](https://wiki.openstreetmap.org/wiki/Key:highway), [route=road](https://wiki.openstreetmap.org/wiki/Tag:route=road) | 83 | [Q1788454](https://www.wikidata.org/entity/Q1788454) | Road junction | | 84 | [Q44377](https://www.wikidata.org/entity/Q44377) | Tunnel | [tunnel=* ](https://wiki.openstreetmap.org/wiki/Key:tunnel) | 85 | [Q5031071](https://www.wikidata.org/entity/Q5031071) | Canal tunnel | | 86 | [Q719456](https://www.wikidata.org/entity/Q719456) | Station | [public_transport=station](https://wiki.openstreetmap.org/wiki/Tag:public_transport=station) | 87 | [Q205495](https://www.wikidata.org/entity/Q205495) | Filling station | [amenity=fuel](https://wiki.openstreetmap.org/wiki/Tag:amenity=fuel) | 88 | [Q928830](https://www.wikidata.org/entity/Q928830) | Metro station | [station=subway](https://wiki.openstreetmap.org/wiki/Tag:station=subway) | 89 | [Q55488](https://www.wikidata.org/entity/Q55488) | Train station | [railway=station](https://wiki.openstreetmap.org/wiki/Tag:railway=station) | 90 | [Q2175765](https://www.wikidata.org/entity/Q2175765) | Tram stop | [railway=tram_stop](https://wiki.openstreetmap.org/wiki/Tag:railway=tram_stop), [public_transport=stop_position](https://wiki.openstreetmap.org/wiki/Tag:public_transport=stop_position) | 91 | [Q6852233](https://www.wikidata.org/entity/Q6852233) | Military building | | 92 | [Q44494](https://www.wikidata.org/entity/Q44494) | Mill (grinding) | | 93 | [Q185187](https://www.wikidata.org/entity/Q185187) | Watermill | [man_made=watermill](https://wiki.openstreetmap.org/wiki/Tag:man_made=watermill) | 94 | [Q38720](https://www.wikidata.org/entity/Q38720) | Windmill | [man_made=windmill](https://wiki.openstreetmap.org/wiki/Tag:man_made=windmill) | 95 | [Q4989906](https://www.wikidata.org/entity/Q4989906) | Monument | [historic=monument](https://wiki.openstreetmap.org/wiki/Tag:historic=monument) | 96 | [Q5003624](https://www.wikidata.org/entity/Q5003624) | Memorial | [historic=memorial](https://wiki.openstreetmap.org/wiki/Tag:historic=memorial) | 97 | [Q271669](https://www.wikidata.org/entity/Q271669) | Landform | | 98 | [Q190429](https://www.wikidata.org/entity/Q190429) | Depression (geology) | | 99 | [Q17018380](https://www.wikidata.org/entity/Q17018380) | Bight (geography) | | 100 | [Q54050](https://www.wikidata.org/entity/Q54050) | Hill | | 101 | [Q1210950](https://www.wikidata.org/entity/Q1210950) | Channel (geography) | | 102 | [Q23442](https://www.wikidata.org/entity/Q23442) | Island | [place=island](https://wiki.openstreetmap.org/wiki/Tag:place=island) | 103 | [Q42523](https://www.wikidata.org/entity/Q42523) | Atoll | | 104 | [Q34763](https://www.wikidata.org/entity/Q34763) | Peninsula | | 105 | [Q355304](https://www.wikidata.org/entity/Q355304) | Watercourse | | 106 | [Q30198](https://www.wikidata.org/entity/Q30198) | Marsh | [wetland=marsh](https://wiki.openstreetmap.org/wiki/Tag:wetland=marsh) | 107 | [Q75520](https://www.wikidata.org/entity/Q75520) | Plateau | | 108 | [Q2042028](https://www.wikidata.org/entity/Q2042028) | Ravine | | 109 | [Q631305](https://www.wikidata.org/entity/Q631305) | Rock formation | | 
110 | [Q12516](https://www.wikidata.org/entity/Q12516) | Pyramid | | 111 | [Q1076486](https://www.wikidata.org/entity/Q1076486) | Sports venue | | 112 | [Q682943](https://www.wikidata.org/entity/Q682943) | Cricket field | [sport=cricket](https://wiki.openstreetmap.org/wiki/Tag:sport=cricket) | 113 | [Q1048525](https://www.wikidata.org/entity/Q1048525) | Golf course | [leisure=golf_course](https://wiki.openstreetmap.org/wiki/Tag:leisure=golf_course) | 114 | [Q1777138](https://www.wikidata.org/entity/Q1777138) | Race track | [highway=raceway](https://wiki.openstreetmap.org/wiki/Tag:highway=raceway) | 115 | [Q130003](https://www.wikidata.org/entity/Q130003) | Ski resort | | 116 | [Q174782](https://www.wikidata.org/entity/Q174782) | Town square | [place=square](https://wiki.openstreetmap.org/wiki/Tag:place=square) | 117 | [Q12518](https://www.wikidata.org/entity/Q12518) | Tower | [building=tower](https://wiki.openstreetmap.org/wiki/Tag:building=tower), [man_made=tower](https://wiki.openstreetmap.org/wiki/Tag:man_made=tower) | 118 | [Q39715](https://www.wikidata.org/entity/Q39715) | Lighthouse | [man_made=lighthouse](https://wiki.openstreetmap.org/wiki/Tag:man_made=lighthouse) | 119 | [Q274153](https://www.wikidata.org/entity/Q274153) | Water tower | [building=water_tower](https://wiki.openstreetmap.org/wiki/Tag:building=water_tower), [man_made=water_tower](https://wiki.openstreetmap.org/wiki/Tag:man_made=water_tower) | 120 | [Q43501](https://www.wikidata.org/entity/Q43501) | Zoo | [tourism=zoo](https://wiki.openstreetmap.org/wiki/Tag:tourism=zoo) | 121 | [Q39614](https://www.wikidata.org/entity/Q39614) | Cemetery | [amenity=grave_yard](https://wiki.openstreetmap.org/wiki/Tag:amenity=grave_yard), [landuse=cemetery](https://wiki.openstreetmap.org/wiki/Tag:landuse=cemetery) | 122 | [Q152081](https://www.wikidata.org/entity/Q152081) | Concentration camp | | 123 | [Q1107656](https://www.wikidata.org/entity/Q1107656) | Garden | [leisure=garden](https://wiki.openstreetmap.org/wiki/Tag:leisure=garden) | 124 | [Q820477](https://www.wikidata.org/entity/Q820477) | Mine | | 125 | [Q33837](https://www.wikidata.org/entity/Q33837) | Archipelago | [place=archipelago](https://wiki.openstreetmap.org/wiki/Tag:place=archipelago) | 126 | [Q40080](https://www.wikidata.org/entity/Q40080) | Beach | [natural=beach](https://wiki.openstreetmap.org/wiki/Tag:natural=beach) | 127 | [Q15324](https://www.wikidata.org/entity/Q15324) | Body of water | [natural=water](https://wiki.openstreetmap.org/wiki/Tag:natural=water) | 128 | [Q23397](https://www.wikidata.org/entity/Q23397) | Lake | [water=lake](https://wiki.openstreetmap.org/wiki/Tag:water=lake) | 129 | [Q9430](https://www.wikidata.org/entity/Q9430) | Ocean | | 130 | [Q165](https://www.wikidata.org/entity/Q165) | Sea | | 131 | [Q47521](https://www.wikidata.org/entity/Q47521) | Stream | | 132 | [Q12284](https://www.wikidata.org/entity/Q12284) | Canal | [waterway=canal](https://wiki.openstreetmap.org/wiki/Tag:waterway=canal) | 133 | [Q4022](https://www.wikidata.org/entity/Q4022) | River | [waterway=river](https://wiki.openstreetmap.org/wiki/Tag:waterway=river), [type=waterway](https://wiki.openstreetmap.org/wiki/Relation:waterway) | 134 | [Q185113](https://www.wikidata.org/entity/Q185113) | Cape | [natural=cape](https://wiki.openstreetmap.org/wiki/Tag:natural=cape) | 135 | [Q35509](https://www.wikidata.org/entity/Q35509) | Cave | [natural=cave_entrance](https://wiki.openstreetmap.org/wiki/Tag:natural=cave_entrance) | 136 | [Q8514](https://www.wikidata.org/entity/Q8514) | 
Desert | | 137 | [Q4421](https://www.wikidata.org/entity/Q4421) | Forest | [natural=wood](https://wiki.openstreetmap.org/wiki/Tag:natural=wood) | 138 | [Q35666](https://www.wikidata.org/entity/Q35666) | Glacier | [natural=glacier](https://wiki.openstreetmap.org/wiki/Tag:natural=glacier) | 139 | [Q177380](https://www.wikidata.org/entity/Q177380) | Hot spring | | 140 | [Q8502](https://www.wikidata.org/entity/Q8502) | Mountain | [natural=peak](https://wiki.openstreetmap.org/wiki/Tag:natural=peak) | 141 | [Q133056](https://www.wikidata.org/entity/Q133056) | Mountain pass | | 142 | [Q46831](https://www.wikidata.org/entity/Q46831) | Mountain range | | 143 | [Q39816](https://www.wikidata.org/entity/Q39816) | Valley | [natural=valley](https://wiki.openstreetmap.org/wiki/Tag:natural=valley) | 144 | [Q8072](https://www.wikidata.org/entity/Q8072) | Volcano | [natural=volcano](https://wiki.openstreetmap.org/wiki/Tag:natural=volcano) | 145 | [Q43229](https://www.wikidata.org/entity/Q43229) | Organization | | 146 | [Q327333](https://www.wikidata.org/entity/Q327333) | Government agency | [office=government](https://wiki.openstreetmap.org/wiki/Tag:office=government)| 147 | [Q22698](https://www.wikidata.org/entity/Q22698) | Park | [leisure=park](https://wiki.openstreetmap.org/wiki/Tag:leisure=park) | 148 | [Q159313](https://www.wikidata.org/entity/Q159313) | Urban agglomeration | | 149 | [Q177634](https://www.wikidata.org/entity/Q177634) | Community | | 150 | [Q5107](https://www.wikidata.org/entity/Q5107) | Continent | [place=continent](https://wiki.openstreetmap.org/wiki/Tag:place=continent) | 151 | [Q6256](https://www.wikidata.org/entity/Q6256) | Country | [place=country](https://wiki.openstreetmap.org/wiki/Tag:place=country) | 152 | [Q75848](https://www.wikidata.org/entity/Q75848) | Gated community | | 153 | [Q3153117](https://www.wikidata.org/entity/Q3153117) | Intercommunality | | 154 | [Q82794](https://www.wikidata.org/entity/Q82794) | Region | | 155 | [Q56061](https://www.wikidata.org/entity/Q56061) | Administrative division | [boundary=administrative](https://wiki.openstreetmap.org/wiki/Tag:boundary=administrative) | 156 | [Q665487](https://www.wikidata.org/entity/Q665487) | Diocese | | 157 | [Q4976993](https://www.wikidata.org/entity/Q4976993) | Parish | [boundary=civil_parish](https://wiki.openstreetmap.org/wiki/Tag:boundary=civil_parish) | 158 | [Q194203](https://www.wikidata.org/entity/Q194203) | Arrondissements of France | | 159 | [Q91028](https://www.wikidata.org/entity/Q91028) | Arrondissements of Belgium | | 160 | [Q3623867](https://www.wikidata.org/entity/Q3623867) | Arrondissements of Benin | | 161 | [Q2311958](https://www.wikidata.org/entity/Q2311958) | Canton (country subdivision) | [political_division=canton](https://wiki.openstreetmap.org/wiki/FR:Cantons_in_France) | 162 | [Q643589](https://www.wikidata.org/entity/Q643589) | Department | | 163 | [Q202216](https://www.wikidata.org/entity/Q202216) | Overseas department and region | | 164 | [Q149621](https://www.wikidata.org/entity/Q149621) | District | [place=district](https://wiki.openstreetmap.org/wiki/Tag:place=district) | 165 | [Q15243209](https://www.wikidata.org/wiki/Q15243209) | Historic district | | 166 | [Q5144960](https://www.wikidata.org/entity/Q5144960) | Microregion | | 167 | [Q15284](https://www.wikidata.org/entity/Q15284) | Municipality | | 168 | [Q515716](https://www.wikidata.org/entity/Q515716) | Prefecture | | 169 | [Q34876](https://www.wikidata.org/entity/Q34876) | Province | | 170 | 
[Q3191695](https://www.wikidata.org/entity/Q3191695) | Regency (Indonesia) | | 171 | [Q1970725](https://www.wikidata.org/entity/Q1970725) | Natural region | | 172 | [Q486972](https://www.wikidata.org/entity/Q486972) | Human settlement | | 173 | [Q515](https://www.wikidata.org/entity/Q515) | City | [place=city](https://wiki.openstreetmap.org/wiki/Tag:place=city) | 174 | [Q5119](https://www.wikidata.org/entity/Q5119) | Capital city | [capital=yes](https://wiki.openstreetmap.org/wiki/Key:capital) | 175 | [Q4286337](https://www.wikidata.org/entity/Q4286337) | City district | | 176 | [Q1394476](https://www.wikidata.org/entity/Q1394476) | Civil township | | 177 | [Q1115575](https://www.wikidata.org/entity/Q1115575) | Civil parish | [designation=civil_parish](https://wiki.openstreetmap.org/wiki/Tag:designation=civil_parish) | 178 | [Q5153984](https://www.wikidata.org/entity/Q5153984) | Commune-level subdivisions | | 179 | [Q123705](https://www.wikidata.org/entity/Q123705) | Neighbourhood | [place=neighbourhood](https://wiki.openstreetmap.org/wiki/Tag:place=neighbourhood) | 180 | [Q1500350](https://www.wikidata.org/entity/Q1500350) | Townships of China | | 181 | [Q17343829](https://www.wikidata.org/entity/Q17343829) | Unincorporated Community | | 182 | [Q3957](https://www.wikidata.org/entity/Q3957) | Town | [place=town](https://wiki.openstreetmap.org/wiki/Tag:place=town) | 183 | [Q532](https://www.wikidata.org/entity/Q532) | Village | [place=village](https://wiki.openstreetmap.org/wiki/Tag:place=village) | 184 | [Q5084](https://www.wikidata.org/entity/Q5084) | Hamlet | [place=hamlet](https://wiki.openstreetmap.org/wiki/Tag:place=hamlet) | 185 | [Q7275](https://www.wikidata.org/entity/Q7275) | State | | 186 | [Q79007](https://www.wikidata.org/entity/Q79007) | Street | | 187 | [Q473972](https://www.wikidata.org/entity/Q473972) | Protected area | [boundary=protected_area](https://wiki.openstreetmap.org/wiki/Tag:boundary=protected_area) | 188 | [Q1377575](https://www.wikidata.org/entity/Q1377575) | Wildlife refuge | | 189 | [Q1410668](https://www.wikidata.org/entity/Q1410668) | National Wildlife Refuge | [protection_title=National Wildlife Refuge](https://wiki.openstreetmap.org/wiki/Key:protection_title), [ownership=national](https://wiki.openstreetmap.org/wiki/Tag:ownership=national) | 190 | [Q9259](https://www.wikidata.org/entity/Q9259) | World Heritage Site | | 191 | 192 | --- 193 | 194 | ### Future Work 195 | 196 | The Wikidata improvements to Nominatim can be further enhanced by: 197 | 198 | - continuing to add new Wikidata links to OSM objects 199 | - increasing the number of place types accounted for in the wikipedia_articles table 200 | - working to use place types in the wikipedia_article matching process (a toy lookup sketch follows below) 201 | --------------------------------------------------------------------------------
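As a rough illustration of the last item above, the hypothetical sketch below shows one way the cross-reference table in wikidata_places.md could be turned into a QID-to-OSM-tag lookup for matching. The `QID_TO_OSM_TAG` dictionary and the `resolve_osm_tag` helper are invented for this example and cover only a handful of rows; they are not part of the repository.

```python
# Hypothetical sketch: resolve a Wikidata "instance of" (P31) value to an OSM
# tag using a few rows taken from the table above. Illustrative only.
QID_TO_OSM_TAG = {
    'Q515': 'place=city',        # City
    'Q3957': 'place=town',       # Town
    'Q532': 'place=village',     # Village
    'Q5084': 'place=hamlet',     # Hamlet
    'Q23397': 'water=lake',      # Lake
    'Q8502': 'natural=peak',     # Mountain
}


def resolve_osm_tag(instance_of_qids):
    """Return the first known OSM tag for a list of P31 values, or None."""
    for qid in instance_of_qids:
        if qid in QID_TO_OSM_TAG:
            return QID_TO_OSM_TAG[qid]
    return None


# An item declared both as "human settlement" (Q486972, no tag in the table)
# and as "city" (Q515) resolves to place=city.
print(resolve_osm_tag(['Q486972', 'Q515']))  # -> place=city
```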