├── .gitignore ├── tests ├── linktargets.txt ├── filter_pagelinks.test1expected.txt ├── filter_wikidata_geo_tags.test1expected.txt ├── filter_pagelinks.test1.txt ├── filter_langlinks.test1.txt ├── filter_langlinks.test1expected.txt ├── filter_wikidata_geo_tags.test1.txt └── run.sh ├── bin ├── mysqldump_to_csv.readme.txt ├── mysqldump_to_csv.LICENSE ├── filter_redirect.py ├── filter_langlinks.py ├── filter_wikidata_wb_items_per_site.py ├── filter_pagelinks.py ├── filter_page.py ├── filter_wikidata_page.py ├── filter_wikidata_geo_tags.py └── mysqldump_to_csv.py ├── config ├── languages.txt ├── wikidata_place_type_levels.csv └── wikidata_place_types.txt ├── lib └── languages.py ├── install_dependencies.sh ├── steps ├── report_database_size.sh ├── cleanup.sh ├── wikidata_download.sh ├── wikipedia_download.sh ├── wikipedia_import.sh ├── wikidata_import.sh ├── wikipedia_process.sh ├── latest_available_data.sh ├── output.sh ├── wikidata_sql2csv.sh ├── wikidata_api_fetch_placetypes.sh ├── wikidata_process.sh └── wikipedia_sql2csv.sh ├── complete_run.sh ├── .github └── workflows │ └── ci.yml ├── README.md ├── LICENSE └── wikidata_places.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /tests/linktargets.txt: -------------------------------------------------------------------------------- 1 | 11,title1 2 | 22,title2 3 | 33,"title3,with,comma" 4 | 44,title4 -------------------------------------------------------------------------------- /tests/filter_pagelinks.test1expected.txt: -------------------------------------------------------------------------------- 1 | title1,5 2 | title2,2 3 | "title3,with,comma",1 4 | title4,1 5 | -------------------------------------------------------------------------------- /tests/filter_wikidata_geo_tags.test1expected.txt: -------------------------------------------------------------------------------- 1 | 5009,25.13333,56.33333 2 | 5010,-34.35806,18.47194 3 | 5018,54.08333,13.38333 4 | 5020,48.76194,8.24083 5 | 5030,54.67639,13.43778 6 | 5034,55.9214,-3.53665 7 | -------------------------------------------------------------------------------- /tests/filter_pagelinks.test1.txt: -------------------------------------------------------------------------------- 1 | enwiki,0,11 2 | enwiki,0,11 3 | enwiki,0,11 4 | enwiki,0,22 5 | enwiki,0,22 6 | enwiki,0,33 7 | enwiki,0,11 8 | enwiki,0,11 9 | enwiki,0,44 10 | enwiki,1,44 11 | enwiki,0,55 -------------------------------------------------------------------------------- /tests/filter_langlinks.test1.txt: -------------------------------------------------------------------------------- 1 | 2074847,tr,Berlin dövlət kitabxanası 2 | 291145,tr,Berlin döyüşü (1945) 3 | 52637892,tr,Berlin hücumu (2016) 4 | 494808,tr,Berlin kafedralı 5 | 438617,tr,Berlin konqresi 6 | 1234,de,"Berlin, Berlin" -------------------------------------------------------------------------------- /tests/filter_langlinks.test1expected.txt: -------------------------------------------------------------------------------- 1 | Berlin_dövlət_kitabxanası,2074847,tr 2 | Berlin_döyüşü_(1945),291145,tr 3 | Berlin_hücumu_(2016),52637892,tr 4 | Berlin_kafedralı,494808,tr 5 | Berlin_konqresi,438617,tr 6 | "Berlin,_Berlin",1234,de 7 | -------------------------------------------------------------------------------- /bin/mysqldump_to_csv.readme.txt: -------------------------------------------------------------------------------- 1 | 
https://github.com/jamesmishra/mysqldump-to-csv 2 | 3 | * Added errors=surrogateescape to open(), otherwise the script threw UnicodeDecodeError for langlinks files 4 | * Use python3 in first line 5 | * Explicitly set escapechar for csv.writer 6 | * Don't print \x0 for NULL values, print '' instead. 7 | -------------------------------------------------------------------------------- /config/languages.txt: -------------------------------------------------------------------------------- 1 | # https://en.wikipedia.org/wiki/List_of_Wikipedias 2 | ar 3 | bg 4 | ca 5 | cs 6 | da 7 | de 8 | en 9 | es 10 | eo 11 | eu 12 | fa 13 | fr 14 | ko 15 | hi 16 | hr 17 | id 18 | it 19 | he 20 | lt 21 | hu 22 | ms 23 | nl 24 | ja 25 | no 26 | pl 27 | pt 28 | kk 29 | ro 30 | ru 31 | sk 32 | sl 33 | sr 34 | fi 35 | sv 36 | tr 37 | uk 38 | vi 39 | war 40 | zh -------------------------------------------------------------------------------- /tests/filter_wikidata_geo_tags.test1.txt: -------------------------------------------------------------------------------- 1 | 158103,15923968,moon,1,29.63771000,111.17787000,,,,,,,NULL 2 | 158108,5009,earth,1,25.13333300,56.33333300,,,,,,,NULL 3 | 158109,5010,earth,1,-34.35805556,18.47194444,,,,,,,NULL 4 | 158112,5018,earth,1,54.08333333,13.38333333,,,,,,,NULL 5 | 158113,5020,earth,1,48.76194444,8.24083333,,,,,,,NULL 6 | 158120,5030,earth,1,54.67638889,13.43777778,,,,,,,NULL 7 | 158124,5034,earth,1,55.92140000,-3.53665000,,,,,,,NULL -------------------------------------------------------------------------------- /lib/languages.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class Languages: 4 | def get_languages(): 5 | if 'LANGUAGES' in os.environ: 6 | return os.environ['LANGUAGES'].split(',') 7 | 8 | with open('config/languages.txt', 'r') as file: 9 | languages = file.readlines() 10 | languages = map(lambda line: line.strip('\n'), languages) 11 | languages = filter(lambda line: not line.startswith('#'), languages) 12 | return list(languages) 13 | 14 | return [] 15 | -------------------------------------------------------------------------------- /install_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Tested on Ubuntu-24 5 | # 6 | 7 | sudo apt-get install -y postgresql-16 8 | sudo -u postgres createuser -s $USER 9 | 10 | # No significant performance increase above 250MB 11 | sudo -u postgres mkdir -p /etc/postgresql/16/main/conf.d/ 12 | echo " 13 | work_mem = 250MB 14 | " | sudo -u postgres tee /etc/postgresql/16/main/conf.d/wikipedia.conf 15 | 16 | sudo systemctl restart postgresql 17 | 18 | sudo apt-get install -y wget coreutils nodejs jq moreutils pigz 19 | sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential 20 | 21 | # https://wdtaxonomy.readthedocs.io/ 22 | sudo apt-get install -y nodejs 23 | node --version 24 | sudo npm install -g wikidata-taxonomy 25 | wdtaxonomy --version 26 | -------------------------------------------------------------------------------- /steps/report_database_size.sh: -------------------------------------------------------------------------------- 1 | cat < tests/linktargets.txt.gz 16 | cat tests/filter_pagelinks.test1.txt | bin/filter_pagelinks.py tests/linktargets.txt.gz > out.txt 17 | diff --brief out.txt tests/filter_pagelinks.test1expected.txt || exit 1 18 | rm -f tests/linktargets.txt.gz 19 | 20 | cat tests/filter_langlinks.test1.txt | bin/filter_langlinks.py >
out.txt 21 | diff --brief out.txt tests/filter_langlinks.test1expected.txt || exit 1 22 | 23 | cat tests/filter_wikidata_geo_tags.test1.txt | bin/filter_wikidata_geo_tags.py > out.txt 24 | diff --brief out.txt tests/filter_wikidata_geo_tags.test1expected.txt || exit 1 25 | 26 | rm -f out.txt -------------------------------------------------------------------------------- /bin/mysqldump_to_csv.LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 James Mishra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /bin/filter_redirect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `redirect` ( 6 | # `rd_from` int(8) unsigned NOT NULL DEFAULT 0, 7 | # `rd_namespace` int(11) NOT NULL DEFAULT 0, 8 | # `rd_title` varbinary(255) NOT NULL DEFAULT '', 9 | # `rd_interwiki` varbinary(32) DEFAULT NULL, 10 | # `rd_fragment` varbinary(255) DEFAULT NULL, 11 | 12 | Output to STDOUT: rd_from_page_id, rd_title 13 | 14 | Same for linktarget table 15 | # CREATE TABLE `linktarget` ( 16 | # `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 17 | # `lt_namespace` int(11) NOT NULL, 18 | # `lt_title` varbinary(255) NOT NULL, 19 | ''' 20 | 21 | import sys 22 | import csv 23 | 24 | reader = csv.reader(sys.stdin) 25 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 26 | 27 | for row in reader: 28 | # namespace: 0 are articles 29 | if (row[1] != '0'): 30 | continue 31 | 32 | title = row[2].replace('\r', '') 33 | if len(title) == 0: 34 | continue 35 | 36 | writer.writerow([row[0], title]) 37 | -------------------------------------------------------------------------------- /bin/filter_langlinks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `langlinks` ( 6 | # `ll_from` int(8) unsigned NOT NULL DEFAULT 0, 7 | # `ll_lang` varbinary(35) NOT NULL DEFAULT '', 8 | # `ll_title` varbinary(255) NOT NULL DEFAULT '', 9 | 10 | Output to STDOUT: ll_title, ll_from_page_id, ll_lang 11 | ''' 12 | 13 | import os 14 | import sys 15 | 16 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | sys.path.append(parent_dir) 18 | 19 | from lib.languages import 
Languages 20 | 21 | languages_set = set(Languages.get_languages()) 22 | 23 | 24 | # We don't need CSV parsing here because the first two columns never 25 | # contain commas. 26 | for line in sys.stdin: 27 | line = line.rstrip().replace('\r', '') 28 | 29 | columns = line.split(',', 2) 30 | 31 | # ll_lang, e.g. 'en' 32 | language = columns[1] 33 | if language not in languages_set: 34 | continue 35 | 36 | # The langlinks table contains titles with spaces, e.g. 'one (two)', while the pages and 37 | # pagelinkcount tables contain titles with underscores, e.g. 'one_(two)' 38 | title = columns[2].replace(' ', '_') 39 | 40 | print(','.join([title, columns[0], language])) 41 | -------------------------------------------------------------------------------- /complete_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Single script to do all processing from scratch. Run it, or 5 | # use it as a guide for running the individual steps. 6 | # 7 | # Example to add timestamps and create a logfile: 8 | # time ./complete_run.sh 2>&1 | ts -s "[%H:%M:%S]" | tee "$(date +"%Y%m%d").$$.log" 9 | 10 | 11 | ./install_dependencies.sh 12 | 13 | # checks https://mirror.clarkson.edu/wikimedia/enwiki/ 14 | # and https://mirror.clarkson.edu/wikimedia/wikidatawiki/ 15 | LATEST_DATE=$(./steps/latest_available_data.sh) # yyyymmdd 16 | 17 | export WIKIPEDIA_DATE=$LATEST_DATE 18 | export WIKIDATA_DATE=$LATEST_DATE 19 | export BUILDID=wikimedia_build_$(date +"%Y%m%d") 20 | export LANGUAGES=$(grep -v '^#' config/languages.txt | tr "\n" ",") 21 | # export LANGUAGES=de,nl 22 | export DATABASE_NAME=$BUILDID 23 | 24 | ./steps/wikipedia_download.sh 25 | ./steps/wikidata_download.sh 26 | ./steps/wikidata_api_fetch_placetypes.sh 27 | 28 | ./steps/wikipedia_sql2csv.sh 29 | ./steps/wikidata_sql2csv.sh 30 | 31 | # dropdb --if-exists $DATABASE_NAME 32 | createdb $DATABASE_NAME 33 | ./steps/wikipedia_import.sh 34 | ./steps/wikidata_import.sh 35 | 36 | ./steps/wikipedia_process.sh 37 | ./steps/wikidata_process.sh 38 | 39 | ./steps/report_database_size.sh 40 | ./steps/output.sh 41 | # ./steps/cleanup.sh 42 | 43 | echo "Finished."
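The snippet below is a minimal usage sketch, not a file from this repository: it shows how the exported variables above can drive a reduced run that only processes the wikipedia side for two small languages, the same subset of steps the CI workflow below exercises. The dump date 20240601, the language list de,nl and the build id are illustrative assumptions, not values taken from the project.

    # hypothetical reduced run (assumed date, languages and build id)
    export WIKIPEDIA_DATE=20240601
    export WIKIDATA_DATE=20240601
    export LANGUAGES=de,nl
    export BUILDID=wikipedia_only_test
    export DATABASE_NAME=$BUILDID

    ./steps/wikipedia_download.sh
    ./steps/wikipedia_sql2csv.sh
    createdb $DATABASE_NAME
    ./steps/wikipedia_import.sh
    ./steps/wikipedia_process.sh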
-------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: "Continuous Integration" 2 | 3 | on: [ push, pull_request ] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Install PostgreSQL 11 | run: | 12 | sudo apt-get update -qq 13 | sudo apt-get install -y -qq postgresql postgresql-client 14 | sudo systemctl restart postgresql 15 | sudo -u postgres createuser -s runner 16 | - name: Install dependencies 17 | run: ./install_dependencies.sh 18 | - name: Create database 19 | run: createdb wikiprocessingdb 20 | - name: Build for languages Limburgish (li), Bavarian (bar) 21 | run: | 22 | LATEST_DATE=$(./steps/latest_available_data.sh) 23 | export WIKIPEDIA_DATE=$LATEST_DATE 24 | export WIKIDATA_DATE=$LATEST_DATE 25 | 26 | ./steps/wikipedia_download.sh 27 | ./steps/wikipedia_sql2csv.sh 28 | ./steps/wikipedia_import.sh 29 | ./steps/wikipedia_process.sh 30 | 31 | grep county config/wikidata_place_types.txt > new.txt 32 | mv new.txt config/wikidata_place_types.txt 33 | ./steps/wikidata_api_fetch_placetypes.sh 34 | env: 35 | BUILDID: ci_test_build 36 | LANGUAGES: li,bar 37 | 38 | - name: Test output 39 | run: tests/run.sh 40 | -------------------------------------------------------------------------------- /bin/filter_wikidata_wb_items_per_site.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # MySQL schema inside the sql.gz file: 6 | # 7 | # CREATE TABLE `wb_items_per_site` ( 8 | # `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 9 | # `ips_item_id` int(10) unsigned NOT NULL, 10 | # `ips_site_id` varbinary(32) NOT NULL, 11 | # `ips_site_page` varbinary(310) NOT NULL, 12 | 13 | Output to STDOUT: item_id, site_id, site_page (title) 14 | ''' 15 | 16 | import os 17 | import sys 18 | import csv 19 | 20 | # Add the parent directory to sys.path 21 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 22 | sys.path.append(parent_dir) 23 | 24 | from lib.languages import Languages; 25 | 26 | languages_set = set(Languages.get_languages()) 27 | # print(languages_set, file=sys.stderr) 28 | 29 | 30 | reader = csv.reader(sys.stdin) 31 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 32 | 33 | for row in reader: 34 | # ips_site_page is the title 35 | title = row[3].replace('\r', '') 36 | if len(title) == 0: 37 | continue 38 | 39 | # ips_site_id, e.g. 'enwiki' 40 | language = row[2].replace('wiki', '') 41 | if language not in languages_set: 42 | continue 43 | 44 | writer.writerow([row[1], row[2], title]) 45 | -------------------------------------------------------------------------------- /steps/cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | 7 | # Languages as comma-separated string, e.g. 
'en,fr,de' 8 | : ${LANGUAGES:=bar,cy} 9 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 10 | 11 | psqlcmd() { 12 | psql --quiet $DATABASE_NAME 13 | } 14 | 15 | 16 | 17 | echo "=====================================================================" 18 | echo "Dropping intermediate wikipedia tables to conserve space" 19 | echo "=====================================================================" 20 | 21 | for LANG in "${LANGUAGES_ARRAY[@]}" 22 | do 23 | echo "DROP TABLE ${LANG}pagelinks;" | psqlcmd 24 | echo "DROP TABLE ${LANG}page;" | psqlcmd 25 | echo "DROP TABLE ${LANG}langlinks;" | psqlcmd 26 | echo "DROP TABLE ${LANG}redirect;" | psqlcmd 27 | echo "DROP TABLE ${LANG}pagelinkcount;" | psqlcmd 28 | done 29 | 30 | 31 | echo "=====================================================================" 32 | echo "Dropping intermediate wikidata tables" 33 | echo "=====================================================================" 34 | 35 | echo "DROP TABLE wikidata_place_dump;" | psqlcmd 36 | echo "DROP TABLE geo_earth_primary;" | psqlcmd 37 | for LANG in "${LANGUAGES_ARRAY[@]}" 38 | do 39 | echo "DROP TABLE wikidata_${LANG}_pages;" | psqlcmd 40 | done 41 | -------------------------------------------------------------------------------- /bin/filter_pagelinks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `pagelinks` ( 6 | # `pl_from` int(8) unsigned NOT NULL DEFAULT 0, 7 | # `pl_namespace` int(11) NOT NULL DEFAULT 0, 8 | # `pl_target_id` bigint(20) unsigned NOT NULL, 9 | 10 | Output to STDOUT: pl_title, count 11 | ''' 12 | 13 | import sys 14 | import csv 15 | import gzip 16 | 17 | if len(sys.argv) < 2: 18 | print("Usage: filter_pagelinks.py linktarget.csv.gz") 19 | exit(1) 20 | 21 | linktarget_filename = sys.argv[1] 22 | linktarget_id_to_title = dict() 23 | 24 | with gzip.open(linktarget_filename, 'rt') as gzfile: 25 | reader = csv.reader(gzfile) 26 | for row in reader: 27 | linktarget_id_to_title[row[0]] = row[1] 28 | 29 | reader = csv.reader(sys.stdin) 30 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 31 | 32 | counts = {} 33 | for row in reader: 34 | # pl_namespace: 0 are articles 35 | if (row[1] != '0'): 36 | continue 37 | 38 | title = linktarget_id_to_title.get(row[2]) 39 | if title is None: 40 | continue 41 | 42 | if title not in counts: 43 | counts[title] = 1 44 | else: 45 | counts[title] += 1 46 | 47 | # for title in sorted(counts.keys()): 48 | for title in counts.keys(): 49 | writer.writerow([title, counts[title]]) 50 | -------------------------------------------------------------------------------- /bin/filter_page.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # CREATE TABLE `page` ( 6 | # `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 7 | # `page_namespace` int(11) NOT NULL DEFAULT 0, 8 | # `page_title` varbinary(255) NOT NULL DEFAULT '', 9 | # `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0, 10 | # `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0, 11 | # `page_random` double unsigned NOT NULL DEFAULT 0, 12 | # `page_touched` varbinary(14) NOT NULL DEFAULT '', 13 | # `page_links_updated` varbinary(14) DEFAULT NULL, 14 | # `page_latest` int(8) unsigned NOT NULL DEFAULT 0, 15 | # `page_len` int(8) unsigned NOT NULL DEFAULT 0, 16 | # `page_content_model` varbinary(32) DEFAULT NULL, 17 | # `page_lang` varbinary(35) 
DEFAULT NULL, 18 | 19 | Output to STDOUT: page_id, page_title 20 | ''' 21 | 22 | import sys 23 | import csv 24 | 25 | reader = csv.reader(sys.stdin) 26 | writer = csv.writer(sys.stdout, dialect='unix', quoting=csv.QUOTE_MINIMAL) 27 | 28 | for row in reader: 29 | # 0 are articles 30 | if (row[1] != '0'): 31 | continue 32 | 33 | title = row[2].replace('\r', '') 34 | if len(title) == 0: 35 | continue 36 | 37 | writer.writerow([row[0], title]) 38 | -------------------------------------------------------------------------------- /bin/filter_wikidata_page.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # MySQL schema inside the sql.gz file: 6 | # 7 | # CREATE TABLE `page` ( 8 | # `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 9 | # `page_namespace` int(11) NOT NULL, 10 | # `page_title` varbinary(255) NOT NULL, 11 | # `page_restrictions` tinyblob DEFAULT NULL, 12 | # `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0, 13 | # `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0, 14 | # `page_random` double unsigned NOT NULL, 15 | # `page_touched` binary(14) NOT NULL, 16 | # `page_links_updated` varbinary(14) DEFAULT NULL, 17 | # `page_latest` int(10) unsigned NOT NULL, 18 | # `page_len` int(10) unsigned NOT NULL, 19 | # `page_content_model` varbinary(32) DEFAULT NULL, 20 | # `page_lang` varbinary(35) DEFAULT NULL, 21 | 22 | # page_lang isn't interesting, 'NULL' 99.999% of the time 23 | 24 | Output to STDOUT: page_id, page_title 25 | ''' 26 | 27 | import sys 28 | import csv 29 | 30 | reader = csv.reader(sys.stdin) 31 | 32 | for row in reader: 33 | # page_namespace: 0 are articles (99% of the input lines) 34 | if (row[1] != '0'): 35 | continue 36 | 37 | # page_title are actually ids. Some are special pages, not articles 38 | if (row[2][0] != 'Q'): 39 | continue 40 | 41 | print(row[0] + ',' + row[2]) 42 | -------------------------------------------------------------------------------- /bin/filter_wikidata_geo_tags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Input from STDIN 5 | # MySQL schema inside the sql.gz file: 6 | # 7 | # CREATE TABLE `geo_tags` ( 8 | # `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 9 | # `gt_page_id` int(10) unsigned NOT NULL, 10 | # `gt_globe` varbinary(32) NOT NULL, 11 | # `gt_primary` tinyint(1) NOT NULL, 12 | # `gt_lat` decimal(11,8) DEFAULT NULL, 13 | # `gt_lon` decimal(11,8) DEFAULT NULL, 14 | # `gt_dim` int(11) DEFAULT NULL, 15 | # `gt_type` varbinary(32) DEFAULT NULL, 16 | # `gt_name` varbinary(255) DEFAULT NULL, 17 | # `gt_country` binary(2) DEFAULT NULL, 18 | # `gt_region` varbinary(3) DEFAULT NULL, 19 | 20 | Output to STDOUT: gt_page_id, gt_lat, gt_lon 21 | ''' 22 | 23 | import sys 24 | import csv 25 | 26 | reader = csv.reader(sys.stdin) 27 | 28 | for row in reader: 29 | # gt_globe: There are places e.g. 
on the moon with coordinates 30 | if (row[2] != 'earth'): 31 | continue 32 | 33 | # gt_primary 34 | if (row[3] != '1'): 35 | continue 36 | 37 | lat = float(row[4]) 38 | lon = float(row[5]) 39 | 40 | if (lat == 0 and lon == 0): 41 | # print('skipping 0,0', file=sys.stderr) 42 | continue 43 | 44 | if (lat < -90 or lat > 90 or lon < -180 or lon > 180): 45 | # print('skipping out of bounds', file=sys.stderr) 46 | # print(lat, file=sys.stderr) 47 | # print(lon, file=sys.stderr) 48 | continue 49 | 50 | lat = round(lat, 5) 51 | lon = round(lon, 5) 52 | 53 | print(row[1] + ',' + str(lat) + ',' + str(lon)) 54 | -------------------------------------------------------------------------------- /steps/wikidata_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "=====================================================================" 4 | echo "Download wikidata dump tables" 5 | echo "=====================================================================" 6 | 7 | # set defaults 8 | : ${BUILDID:=latest} 9 | # List of mirrors https://dumps.wikimedia.org/mirrors.html 10 | # Download using main dumps.wikimedia.org: 60 minutes, mirror: 20 minutes 11 | : ${WIKIMEDIA_HOST:=wikidata.aerotechnet.com} 12 | # See list on https://wikidata.aerotechnet.com/wikidatawiki/ 13 | : ${WIKIDATA_DATE:=20220701} 14 | 15 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 16 | mkdir -p $DOWNLOADED_PATH 17 | 18 | download() { 19 | echo "Downloading $1 > $2" 20 | if [ -e "$2" ]; then 21 | echo "file $2 already exists, skipping" 22 | return 23 | fi 24 | header='--header=User-Agent:Osm-search-Bot/1(https://github.com/osm-search/wikipedia-wikidata)' 25 | wget -O "$2" --quiet $header --no-clobber --tries=3 "$1" 26 | if [ ! -s "$2" ]; then 27 | echo "downloaded file $2 is empty, please retry later" 28 | rm -f "$2" 29 | exit 1 30 | fi 31 | } 32 | 33 | for FN in geo_tags.sql.gz page.sql.gz wb_items_per_site.sql.gz; do 34 | 35 | # https://wikidata.aerotechnet.com/wikidatawiki/20250501/wikidatawiki-20250501-geo_tags.sql.gz 36 | # https://wikidata.aerotechnet.com/wikidatawiki/20250501/md5sums-wikidatawiki-20250501-geo_tags.sql.gz.txt 37 | download https://$WIKIMEDIA_HOST/wikidatawiki/$WIKIDATA_DATE/wikidatawiki-$WIKIDATA_DATE-$FN "$DOWNLOADED_PATH/$FN" 38 | download https://$WIKIMEDIA_HOST/wikidatawiki/$WIKIDATA_DATE/md5sums-wikidatawiki-$WIKIDATA_DATE-$FN.txt "$DOWNLOADED_PATH/$FN.md5" 39 | 40 | EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$FN.md5" | cut -d\ -f1) 41 | CALCULATED_MD5=$(md5sum "$DOWNLOADED_PATH/$FN" | cut -d\ -f1) 42 | 43 | if [[ "$EXPECTED_MD5" != "$CALCULATED_MD5" ]]; then 44 | echo "$FN - md5 checksum doesn't match, download broken" 45 | exit 1 46 | fi 47 | 48 | done 49 | du -h $DOWNLOADED_PATH/* 50 | 51 | # 114M downloaded/geo_tags.sql.gz 52 | # 1.7G downloaded/page.sql.gz 53 | # 1.2G downloaded/wb_items_per_site.sql.gz 54 | -------------------------------------------------------------------------------- /steps/wikipedia_download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "=====================================================================" 4 | echo "Download individual wikipedia language tables dumps" 5 | echo "=====================================================================" 6 | 7 | # set defaults 8 | : ${BUILDID:=latest} 9 | # Languages as comma-separated string, e.g. 
'en,fr,de' 10 | : ${LANGUAGES:=bar,cy} 11 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 12 | # List of mirrors https://dumps.wikimedia.org/mirrors.html 13 | # Download using main dumps.wikimedia.org: 150 minutes, mirror: 40 minutes 14 | : ${WIKIMEDIA_HOST:=wikidata.aerotechnet.com} 15 | # See list on https://wikidata.aerotechnet.com/enwiki/ 16 | : ${WIKIPEDIA_DATE:=20220620} 17 | 18 | DOWNLOADED_PATH="$BUILDID/downloaded/wikipedia" 19 | 20 | download() { 21 | echo "Downloading $1 > $2" 22 | if [ -e "$2" ]; then 23 | echo "file $2 already exists, skipping" 24 | return 25 | fi 26 | header='--header=User-Agent:Osm-search-Bot/1(https://github.com/osm-search/wikipedia-wikidata)' 27 | wget -O "$2" --quiet $header --no-clobber --tries=3 "$1" 28 | if [ ! -s "$2" ]; then 29 | echo "downloaded file $2 is empty, please retry later" 30 | rm -f "$2" 31 | exit 1 32 | fi 33 | du -h "$2" | cut -f1 34 | } 35 | 36 | for LANG in "${LANGUAGES_ARRAY[@]}"; do 37 | echo "Language: $LANG" 38 | 39 | mkdir -p "$DOWNLOADED_PATH/$LANG" 40 | 41 | # English is the largest 42 | # 2.1G downloaded/en/page.sql.gz 43 | # 6.4G downloaded/en/pagelinks.sql.gz 44 | # 492M downloaded/en/langlinks.sql.gz 45 | # 992M downloaded/en/linktarget.sql.gz 46 | # 160M downloaded/en/redirect.sql.gz 47 | 48 | # Smaller language Turkish 49 | # 90M downloaded/tr/page.sql.gz 50 | # 255M downloaded/tr/pagelinks.sql.gz 51 | # 166M downloaded/tr/langlinks.sql.gz 52 | # 62M downloaded/tr/linktarget.sql.gz 53 | # 4.2M downloaded/tr/redirect.sql.gz 54 | 55 | for FN in page.sql.gz pagelinks.sql.gz langlinks.sql.gz linktarget.sql.gz redirect.sql.gz; do 56 | 57 | download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/${LANG}wiki-$WIKIPEDIA_DATE-$FN "$DOWNLOADED_PATH/$LANG/$FN" 58 | download https://$WIKIMEDIA_HOST/${LANG}wiki/$WIKIPEDIA_DATE/md5sums-${LANG}wiki-$WIKIPEDIA_DATE-$FN.txt "$DOWNLOADED_PATH/$LANG/$FN.md5" 59 | 60 | EXPECTED_MD5=$(cat "$DOWNLOADED_PATH/$LANG/$FN.md5" | cut -d\ -f1) 61 | CALCULATED_MD5=$(md5sum "$DOWNLOADED_PATH/$LANG/$FN" | cut -d\ -f1) 62 | 63 | if [[ "$EXPECTED_MD5" != "$CALCULATED_MD5" ]]; then 64 | echo "$FN for language $LANG - md5 checksum doesn't match, download broken" 65 | exit 1 66 | fi 67 | done 68 | done 69 | -------------------------------------------------------------------------------- /steps/wikipedia_import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | : ${LANGUAGES:=bar,cy} 7 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 8 | 9 | CONVERTED_PATH="$BUILDID/converted/wikipedia" 10 | # postgresql's COPY requires full path 11 | CONVERTED_PATH_ABS=$(realpath "$CONVERTED_PATH") 12 | 13 | psqlcmd() { 14 | psql --quiet $DATABASE_NAME |& \ 15 | grep -v 'does not exist, skipping' 16 | } 17 | 18 | echo "=====================================================================" 19 | echo "Import wikipedia CSV tables" 20 | echo "=====================================================================" 21 | 22 | for LANG in "${LANGUAGES_ARRAY[@]}" 23 | do 24 | echo "$LANG" 25 | 26 | # ----------------------------------------------------------- 27 | echo "* ${LANG}page from $CONVERTED_PATH_ABS/$LANG/pages.csv.gz"; 28 | 29 | echo "DROP TABLE IF EXISTS ${LANG}page;" | psqlcmd 30 | echo "CREATE TABLE ${LANG}page ( 31 | page_id integer, 32 | page_title text 33 | );" | psqlcmd 34 | 35 | 36 | echo "COPY ${LANG}page (page_id, page_title) 37 | FROM PROGRAM 'unpigz -c 
$CONVERTED_PATH_ABS/$LANG/pages.csv.gz' 38 | CSV 39 | ;" | psqlcmd 40 | 41 | 42 | 43 | # ----------------------------------------------------------- 44 | echo "* ${LANG}pagelinks from $CONVERTED_PATH_ABS/$LANG/pagelinks.csv.gz"; 45 | 46 | echo "DROP TABLE IF EXISTS ${LANG}pagelinks;" | psqlcmd 47 | echo "CREATE TABLE ${LANG}pagelinks ( 48 | pl_title text, 49 | langcount integer, 50 | othercount integer DEFAULT 0 51 | );" | psqlcmd 52 | 53 | echo "COPY ${LANG}pagelinks (pl_title, langcount) 54 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/$LANG/pagelinks.csv.gz' 55 | CSV 56 | ;" | psqlcmd 57 | 58 | 59 | # ----------------------------------------------------------- 60 | echo "* ${LANG}langlinks from $CONVERTED_PATH_ABS/$LANG/langlinks.csv.gz"; 61 | 62 | echo "DROP TABLE IF EXISTS ${LANG}langlinks;" | psqlcmd 63 | echo "CREATE TABLE ${LANG}langlinks ( 64 | ll_from integer, 65 | ll_lang text, 66 | ll_title text 67 | );" | psqlcmd 68 | 69 | echo "COPY ${LANG}langlinks (ll_title, ll_from, ll_lang) 70 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/$LANG/langlinks.csv.gz' 71 | CSV 72 | ;" | psqlcmd 73 | 74 | 75 | # ----------------------------------------------------------- 76 | echo "* ${LANG}redirect from $CONVERTED_PATH_ABS/$LANG/redirects.csv.gz"; 77 | 78 | echo "DROP TABLE IF EXISTS ${LANG}redirect;" | psqlcmd 79 | echo "CREATE TABLE ${LANG}redirect ( 80 | rd_from integer, 81 | rd_title text 82 | );" | psqlcmd 83 | 84 | echo "COPY ${LANG}redirect (rd_from, rd_title) 85 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/$LANG/redirect.csv.gz' 86 | CSV 87 | ;" | psqlcmd 88 | 89 | done -------------------------------------------------------------------------------- /config/wikidata_place_type_levels.csv: -------------------------------------------------------------------------------- 1 | place_type,level 2 | Q9842,4 3 | Q9430,3 4 | Q928830,4 5 | Q9259,1 6 | Q91028,5 7 | Q8514,2 8 | Q8502,2 9 | Q83405,3 10 | Q82794,2 11 | Q820477,1 12 | Q811979,1 13 | Q8072,2 14 | Q79007,2 15 | Q786014,3 16 | Q75848,2 17 | Q75520,2 18 | Q728937,4 19 | Q7275,2 20 | Q719456,3 21 | Q7075,3 22 | Q697295,4 23 | Q6852233,2 24 | Q682943,3 25 | Q665487,5 26 | Q655686,3 27 | Q643589,5 28 | Q641226,2 29 | Q631305,2 30 | Q6256,2 31 | Q6023295,2 32 | Q5773747,5 33 | Q56061,1 34 | Q55659167,4 35 | Q55488,4 36 | Q55465477,3 37 | Q54050,2 38 | Q532,3 39 | Q53060,2 40 | Q52177058,4 41 | Q515716,5 42 | Q5153984,4 43 | Q515,3 44 | Q5144960,5 45 | Q5119,4 46 | Q5119,4 47 | Q5107,2 48 | Q5084,4 49 | Q5031071,4 50 | Q5003624,2 51 | Q4989906,1 52 | Q4976993,3 53 | Q486972,1 54 | Q486972,2 55 | Q483110,3 56 | Q4830453,4 57 | Q47521,3 58 | Q473972,1 59 | Q46831,2 60 | Q46614560,5 61 | Q44782,3 62 | Q44613,4 63 | Q44539,4 64 | Q44494,2 65 | Q44377,2 66 | Q4421,2 67 | Q43501,2 68 | Q4286337,3 69 | Q42523,3 70 | Q41176,2 71 | Q40357,3 72 | Q4022,4 73 | Q40080,2 74 | Q39816,2 75 | Q39715,3 76 | Q39614,1 77 | Q3957,3 78 | Q3947,4 79 | Q3914,3 80 | Q38723,2 81 | Q38720,3 82 | Q3623867,5 83 | Q35666,2 84 | Q355304,3 85 | Q35509,2 86 | Q35112127,3 87 | Q34985575,4 88 | Q34876,5 89 | Q34763,2 90 | Q34627,4 91 | Q3455524,3 92 | Q34442,4 93 | Q33837,2 94 | Q33506,3 95 | Q32815,4 96 | Q3257686,2 97 | Q3240715,2 98 | Q3191695,5 99 | Q3153117,2 100 | Q30198,2 101 | Q30139652,3 102 | Q294422,3 103 | Q2870166,3 104 | Q27686,3 105 | Q274153,3 106 | Q271669,1 107 | Q2659904,2 108 | Q24529780,2 109 | Q24354,3 110 | Q2354973,4 111 | Q23442,2 112 | Q23413,3 113 | Q23397,3 114 | Q2327515,4 115 | Q2311958,5 116 | Q22927291,6 117 | Q22698,1 118 | Q2175765,4 119 | 
Q205495,4 120 | Q204832,3 121 | Q2042028,2 122 | Q202216,6 123 | Q1970725,3 124 | Q194203,5 125 | Q194195,2 126 | Q190429,2 127 | Q185187,3 128 | Q185113,2 129 | Q183366,2 130 | Q1799794,1 131 | Q1788454,4 132 | Q1785071,3 133 | Q1777138,3 134 | Q177634,2 135 | Q177380,2 136 | Q174814,4 137 | Q174782,2 138 | Q17350442,2 139 | Q17343829,3 140 | Q17334923,0 141 | Q17018380,3 142 | Q16970,4 143 | Q16917,3 144 | Q16831714,4 145 | Q165,3 146 | Q160742,4 147 | Q159719,3 148 | Q159334,4 149 | Q15640612,5 150 | Q15324,2 151 | Q15284,5 152 | Q15243209,6 153 | Q152081,1 154 | Q15195406,4 155 | Q1500350,5 156 | Q149621,5 157 | Q14757767,4 158 | Q14350,3 159 | Q1410668,3 160 | Q1394476,3 161 | Q1377575,2 162 | Q1353183,3 163 | Q134447,4 164 | Q133215,3 165 | Q133056,2 166 | Q13221722,3 167 | Q13220204,2 168 | Q1311958,4 169 | Q1303167,3 170 | Q130003,3 171 | Q12518,2 172 | Q12516,3 173 | Q1248784,3 174 | Q123705,3 175 | Q12323,3 176 | Q12284,4 177 | Q12280,4 178 | Q121359,2 179 | Q1210950,2 180 | Q11755880,3 181 | Q11707,3 182 | Q11315,3 183 | Q11303,3 184 | Q1115575,4 185 | Q1107656,1 186 | Q10864048,1 187 | Q1076486,2 188 | Q105731,3 189 | Q105190,3 190 | Q1048525,3 191 | Q102496,5 192 | Q28872924,1 193 | Q15617994,1 194 | Q159313,2 195 | Q24398318,3 196 | Q327333,2 197 | Q43229,1 198 | Q860861,1 199 | Q4989906,1 200 | -------------------------------------------------------------------------------- /steps/wikidata_import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | 7 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 8 | CONVERTED_PATH="$BUILDID/converted/wikidata" 9 | # postgresql's COPY requires full path 10 | DOWNLOADED_PATH_ABS=$(realpath "$DOWNLOADED_PATH") 11 | CONVERTED_PATH_ABS=$(realpath "$CONVERTED_PATH") 12 | 13 | psqlcmd() { 14 | psql --quiet $DATABASE_NAME |& \ 15 | grep -v 'does not exist, skipping' 16 | } 17 | 18 | 19 | echo "=====================================================================" 20 | echo "Import wikidata tables" 21 | echo "=====================================================================" 22 | 23 | 24 | # ----------------------------------------------------------- 25 | echo "Importing geotags from $CONVERTED_PATH_ABS/geo_tags.csv.gz"; 26 | 27 | echo "DROP TABLE IF EXISTS geo_tags;" | psqlcmd 28 | echo "CREATE TABLE geo_tags ( 29 | gt_page_id bigint, 30 | gt_lat numeric(11,8), 31 | gt_lon numeric(11,8) 32 | );" | psqlcmd 33 | 34 | 35 | echo "COPY geo_tags (gt_page_id, gt_lat, gt_lon) 36 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/geo_tags.csv.gz' 37 | CSV 38 | ;" | psqlcmd 39 | 40 | 41 | 42 | # ----------------------------------------------------------- 43 | echo "Importing page from $CONVERTED_PATH_ABS/page.csv.gz"; 44 | 45 | echo "DROP TABLE IF EXISTS page;" | psqlcmd 46 | echo "CREATE TABLE page ( 47 | page_id bigint, 48 | page_title text 49 | );" | psqlcmd 50 | 51 | 52 | echo "COPY page (page_id, page_title) 53 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/page.csv.gz' 54 | CSV 55 | ;" | psqlcmd 56 | 57 | 58 | 59 | # ----------------------------------------------------------- 60 | echo "Importing wb_items_per_site from $CONVERTED_PATH_ABS/wb_items_per_site.csv.gz"; 61 | 62 | echo "DROP TABLE IF EXISTS wb_items_per_site;" | psqlcmd 63 | echo "CREATE TABLE wb_items_per_site ( 64 | ips_item_id integer, 65 | ips_site_id text, 66 | ips_site_page text 67 | );" | psqlcmd 68 | 69 | echo "COPY wb_items_per_site 
(ips_item_id, ips_site_id, ips_site_page) 70 | FROM PROGRAM 'unpigz -c $CONVERTED_PATH_ABS/wb_items_per_site.csv.gz' 71 | CSV 72 | ;" | psqlcmd 73 | 74 | 75 | 76 | # ----------------------------------------------------------- 77 | echo "Importing wikidata_place_dump from $DOWNLOADED_PATH_ABS/wikidata_place_dump.csv.gz"; 78 | 79 | echo "DROP TABLE IF EXISTS wikidata_place_dump;" | psqlcmd 80 | echo "CREATE TABLE wikidata_place_dump ( 81 | item text, 82 | instance_of text 83 | );" | psqlcmd 84 | 85 | echo "COPY wikidata_place_dump (item, instance_of) 86 | FROM PROGRAM 'unpigz -c $DOWNLOADED_PATH_ABS/wikidata_place_dump.csv.gz' 87 | CSV 88 | ;" | psqlcmd 89 | 90 | 91 | 92 | # ----------------------------------------------------------- 93 | echo "Importing wikidata_place_type_levels from $DOWNLOADED_PATH_ABS/wikidata_place_type_levels.csv"; 94 | 95 | echo "DROP TABLE IF EXISTS wikidata_place_type_levels;" | psqlcmd 96 | echo "CREATE TABLE wikidata_place_type_levels ( 97 | place_type text, 98 | level integer 99 | );" | psqlcmd 100 | 101 | echo "COPY wikidata_place_type_levels (place_type, level) 102 | FROM '$DOWNLOADED_PATH_ABS/wikidata_place_type_levels.csv' 103 | CSV 104 | HEADER 105 | ;" | psqlcmd 106 | 107 | -------------------------------------------------------------------------------- /steps/wikipedia_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | : ${LANGUAGES:=bar,cy} 7 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 8 | 9 | 10 | psqlcmd() { 11 | psql --quiet $DATABASE_NAME |& \ 12 | grep -v 'does not exist, skipping' 13 | } 14 | 15 | 16 | echo "=====================================================================" 17 | echo "Create and fill wikipedia_redirect_full" 18 | echo "=====================================================================" 19 | echo "DROP TABLE IF EXISTS wikipedia_redirect_full;" | psqlcmd 20 | echo "CREATE TABLE wikipedia_redirect_full ( 21 | language text, 22 | from_title text, 23 | to_title text 24 | );" | psqlcmd 25 | 26 | for LANG in "${LANGUAGES_ARRAY[@]}" 27 | do 28 | echo "INSERT INTO wikipedia_redirect_full 29 | SELECT '${LANG}', 30 | page_title, 31 | rd_title 32 | FROM ${LANG}redirect 33 | JOIN ${LANG}page ON (rd_from = page_id) 34 | ;" | psqlcmd 35 | done 36 | 37 | 38 | 39 | 40 | 41 | echo "=====================================================================" 42 | echo "Process language tables and associated pagelink counts" 43 | echo "=====================================================================" 44 | 45 | echo "set othercounts" 46 | # Creating indexes on title, ll_title didn't have any positive effect on 47 | # query performance and added another 1 hour and 35GB of data. 
48 | # echo "CREATE INDEX idx_${LANG}langlinks ON ${LANG}langlinks (ll_lang, ll_title);" | psqlcmd 49 | # echo "CREATE INDEX idx_${LANG}langlinks2 ON ${LANG}langlinks (ll_title);" | psqlcmd 50 | # echo "CREATE INDEX idx_${LANG}page ON ${LANG}page (page_id);" | psqlcmd 51 | # echo "CREATE INDEX idx_${LANG}page2 ON ${LANG}page (page_title);" | psqlcmd 52 | for LANG in "${LANGUAGES_ARRAY[@]}" 53 | do 54 | echo "Language: $LANG" 55 | 56 | for OTHERLANG in "${LANGUAGES_ARRAY[@]}" 57 | do 58 | echo "UPDATE ${LANG}pagelinks 59 | SET othercount = othercount + x.count 60 | FROM ( 61 | SELECT ${LANG}page.page_title AS title, 62 | ${OTHERLANG}pagelinks.langcount AS count 63 | FROM ${LANG}langlinks 64 | JOIN ${LANG}page ON (ll_from = page_id) 65 | JOIN ${OTHERLANG}pagelinks ON (ll_lang = '${OTHERLANG}' AND ll_title = pl_title) 66 | ) AS x 67 | WHERE x.title = ${LANG}pagelinks.pl_title 68 | ;" | psqlcmd 69 | done 70 | 71 | done 72 | 73 | 74 | 75 | echo "=====================================================================" 76 | echo "Create and fill wikipedia_article_full" 77 | echo "=====================================================================" 78 | 79 | echo "DROP TABLE IF EXISTS wikipedia_article_full;" | psqlcmd 80 | echo "CREATE TABLE wikipedia_article_full ( 81 | language text NOT NULL, 82 | title text NOT NULL, 83 | langcount integer, 84 | othercount integer, 85 | totalcount integer, 86 | lat double precision, 87 | lon double precision, 88 | importance double precision, 89 | title_en text, 90 | wd_page_title text, 91 | instance_of text 92 | );" | psqlcmd 93 | 94 | for LANG in "${LANGUAGES_ARRAY[@]}" 95 | do 96 | echo "INSERT INTO wikipedia_article_full 97 | SELECT '${LANG}', 98 | pl_title, 99 | langcount, 100 | othercount, 101 | langcount + othercount 102 | FROM ${LANG}pagelinks 103 | ;" | psqlcmd 104 | done 105 | 106 | 107 | echo "done" 108 | 109 | 110 | -------------------------------------------------------------------------------- /steps/latest_available_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Prints the date (YYYYMMDD) of the latest available dump on 5 | # https://wikidata.aerotechnet.com/enwiki/ 6 | # We also run some additional checks that the dumps are complete 7 | # 8 | 9 | debug() { 10 | # Comment out the following line to print debug information 11 | # echo "$@" 1>&2 12 | echo -n '' 13 | } 14 | 15 | DATE='' 16 | 17 | # Sets $DATE to the first of the month (YYYYMMDD). If given a parameter, 18 | # it subtracts that number of months 19 | set_date_to_first_of_month() { 20 | MINUS_NUM_MONTHS=${1:-0} 21 | 22 | if [[ "$(uname)" == "Darwin" ]]; then 23 | DATE=$(date -v -${MINUS_NUM_MONTHS}m +%Y%m01) 24 | else 25 | DATE=$(date --date="-$MINUS_NUM_MONTHS month" +%Y%m01) 26 | fi 27 | } 28 | 29 | check_all_files_ready() { 30 | CHECK_DATE=$1 31 | debug "check_all_files_ready for $CHECK_DATE" 32 | 33 | # A complete wikidata dump, for example, can take several weeks (the metahistory7zdump 34 | # file is only ready after 15 days). 35 | # 36 | # The dumpruninfo.json files have this format: 37 | # { 38 | # "jobs": { 39 | # "imagetable": { 40 | # "status": "done", 41 | # "updated": "2023-02-01 08:27:30" 42 | # }, 43 | # "imagelinkstable": { 44 | # "status": "done", 45 | # "updated": "2023-02-01 09:18:03" 46 | # }, 47 | # "geotagstable": { 48 | # "status": "done", 49 | # "updated": "2023-02-01 10:01:50" 50 | # }, 51 | # [...] 52 | # 53 | 54 | ANY_FILE_MISSING=0 55 | 56 | ## 57 | ## 1.
Chinese (ZH) Wikipedia 58 | ## usually the last to be dumped 59 | ## 60 | # from wikipedia_download.sh 61 | WIKIPEDIA_REQUIRED_FILES="page pagelinks langlinks linktarget redirect" 62 | DUMP_RUN_INFO_URL="https://wikidata.aerotechnet.com/zhwiki/$CHECK_DATE/dumpruninfo.json" 63 | debug $DUMP_RUN_INFO_URL 64 | DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL") 65 | 66 | if [[ $? != 0 ]]; then 67 | debug "fetching from URL $DUMP_RUN_INFO_URL failed" 68 | return 1 69 | fi 70 | 71 | for FN in $WIKIPEDIA_REQUIRED_FILES; do 72 | TABLENAME=${FN//_/}table # redirect => redirecttable 73 | debug "checking status for table $TABLENAME" 74 | 75 | STATUS=$(echo "$DUMP_RUN_INFO" | TABLE=$TABLENAME jq -r '.jobs[env.TABLE].status') 76 | debug " status: $STATUS" 77 | 78 | if [ "$STATUS" != "done" ]; then 79 | debug "$TABLENAME not ready yet" 80 | ANY_FILE_MISSING=1 81 | fi 82 | done 83 | 84 | ## 85 | ## 2. Wikidata 86 | ## 87 | # from wikidata_download.sh 88 | WIKIDATA_REQUIRED_FILES="geo_tags page wb_items_per_site" 89 | 90 | DUMP_RUN_INFO_URL="https://wikidata.aerotechnet.com/wikidatawiki/$CHECK_DATE/dumpruninfo.json" 91 | debug $DUMP_RUN_INFO_URL 92 | DUMP_RUN_INFO=$(curl -s --fail "$DUMP_RUN_INFO_URL") 93 | 94 | if [[ $? != 0 ]]; then 95 | debug "fetching from URL $DUMP_RUN_INFO_URL failed" 96 | return 1 97 | fi 98 | 99 | for FN in $WIKIDATA_REQUIRED_FILES; do 100 | TABLENAME=${FN//_/}table # wb_items_per_site => wbitemspersitetable 101 | debug "checking status for table $TABLENAME" 102 | 103 | STATUS=$(echo "$DUMP_RUN_INFO" | TABLE=$TABLENAME jq -r '.jobs[env.TABLE].status') 104 | debug " status: $STATUS" 105 | 106 | if [ "$STATUS" != "done" ]; then 107 | debug "$TABLENAME not ready yet" 108 | ANY_FILE_MISSING=1 109 | fi 110 | done 111 | 112 | return $ANY_FILE_MISSING 113 | } 114 | 115 | # Find dates in directory names. We need to parse HTML. 116 | # 117 | CONTENT=$(curl -s -S --fail 'https://wikidata.aerotechnet.com/enwiki/') 118 | for DATE in $(echo $CONTENT | grep -oE '20[0-9]{6}' | sort -nr); do 119 | check_all_files_ready $DATE 120 | 121 | if [ $? == 0 ]; then 122 | echo "$DATE" 123 | exit 0 124 | fi 125 | done 126 | 127 | exit 1 128 | -------------------------------------------------------------------------------- /bin/mysqldump_to_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import fileinput 3 | import csv 4 | import sys 5 | import io 6 | 7 | # This prevents prematurely closed pipes from raising 8 | # an exception in Python 9 | from signal import signal, SIGPIPE, SIG_DFL 10 | signal(SIGPIPE, SIG_DFL) 11 | 12 | # allow large content in the dump 13 | csv.field_size_limit(sys.maxsize) 14 | 15 | def is_insert(line): 16 | """ 17 | Returns true if the line begins a SQL insert statement. 18 | """ 19 | return line.startswith('INSERT INTO') or False 20 | 21 | 22 | def get_values(line): 23 | """ 24 | Returns the portion of an INSERT statement containing values 25 | """ 26 | return line.partition('` VALUES ')[2] 27 | 28 | 29 | def values_sanity_check(values): 30 | """ 31 | Ensures that values from the INSERT statement meet basic checks. 
32 | """ 33 | assert values 34 | assert values[0] == '(' 35 | # Assertions have not been raised 36 | return True 37 | 38 | 39 | def parse_values(values, outfile): 40 | """ 41 | Given a file handle and the raw values from a MySQL INSERT 42 | statement, write the equivalent CSV to the file 43 | """ 44 | latest_row = [] 45 | 46 | reader = csv.reader([values], delimiter=',', 47 | doublequote=False, 48 | escapechar='\\', 49 | quotechar="'", 50 | strict=True 51 | ) 52 | 53 | writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, escapechar='\\') 54 | for reader_row in reader: 55 | for column in reader_row: 56 | # If our current string is empty... 57 | if len(column) == 0 or column == 'NULL': 58 | # latest_row.append(chr(0)) 59 | latest_row.append('') 60 | continue 61 | # If our string starts with an open paren 62 | if column[0] == "(": 63 | # Assume that this column does not begin 64 | # a new row. 65 | new_row = False 66 | # If we've been filling out a row 67 | if len(latest_row) > 0: 68 | # Check if the previous entry ended in 69 | # a close paren. If so, the row we've 70 | # been filling out has been COMPLETED 71 | # as: 72 | # 1) the previous entry ended in a ) 73 | # 2) the current entry starts with a ( 74 | if (latest_row[-1] and latest_row[-1][-1] == ")"): 75 | # Remove the close paren. 76 | latest_row[-1] = latest_row[-1][:-1] 77 | new_row = True 78 | # If we've found a new row, write it out 79 | # and begin our new one 80 | if new_row: 81 | writer.writerow(latest_row) 82 | latest_row = [] 83 | # If we're beginning a new row, eliminate the 84 | # opening parentheses. 85 | if len(latest_row) == 0: 86 | column = column[1:] 87 | # Add our column to the row we're working on. 88 | latest_row.append(column) 89 | # At the end of an INSERT statement, we'll 90 | # have the semicolon. 91 | # Make sure to remove the semicolon and 92 | # the close paren. 93 | if latest_row[-1][-2:] == ");": 94 | latest_row[-1] = latest_row[-1][:-2] 95 | writer.writerow(latest_row) 96 | 97 | 98 | def main(): 99 | """ 100 | Parse arguments and start the program 101 | """ 102 | # Iterate over all lines in all files 103 | # listed in sys.argv[1:] 104 | # or stdin if no args given. 105 | try: 106 | # UPDATE: fileinput starts supporting 'errors' in Python 3.10. Until then 107 | # call io.open() directly. 108 | # for line in fileinput.input(): 109 | with io.open(sys.stdin.fileno(), 'r', encoding="utf-8", errors="ignore") as file: 110 | for line in file: 111 | # Look for an INSERT statement and parse it.
112 | if is_insert(line): 113 | values = get_values(line) 114 | if values_sanity_check(values): 115 | parse_values(values, sys.stdout) 116 | except KeyboardInterrupt: 117 | sys.exit(0) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /steps/output.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | 7 | OUTPUT_PATH="$BUILDID/output" 8 | mkdir -p "$OUTPUT_PATH" 9 | 10 | psqlcmd() { 11 | psql --quiet $DATABASE_NAME |& 12 | grep -v 'does not exist, skipping' 13 | } 14 | 15 | echo "=====================================================================" 16 | echo "Create output" 17 | echo "=====================================================================" 18 | 19 | # "=====================================================================" 20 | echo "Create tables" 21 | # "=====================================================================" 22 | 23 | echo "* wikipedia_article (Fewer rows and columns than wikipedia_article_full)" 24 | # Remove rows that don't have a title. For redirect only row 25 | 26 | echo "DROP TABLE IF EXISTS wikipedia_article;" | psqlcmd 27 | echo "CREATE TABLE wikipedia_article 28 | AS 29 | SELECT language, title, importance, wd_page_title FROM wikipedia_article_full 30 | WHERE wd_page_title IS NOT NULL 31 | AND importance != 0 32 | ;" | psqlcmd 33 | 34 | # 5 minutes 35 | # 9.2m rows 36 | 37 | echo "* wikipedia_redirect (Fewer rows than wikipedia_redirect_full)" 38 | # Remove rows that don't point to titles in wikipedia_article 39 | 40 | echo "DROP TABLE IF EXISTS wikipedia_redirect;" | psqlcmd 41 | echo "CREATE TABLE wikipedia_redirect 42 | AS 43 | SELECT wikipedia_redirect_full.* 44 | FROM wikipedia_redirect_full 45 | RIGHT OUTER JOIN wikipedia_article 46 | ON (wikipedia_redirect_full.language = wikipedia_article.language 47 | AND 48 | wikipedia_redirect_full.to_title = wikipedia_article.title) 49 | ;" | psqlcmd 50 | 51 | # 13m rows 52 | 53 | echo "* wikimedia_importance" 54 | 55 | echo "DROP TABLE IF EXISTS wikimedia_importance;" | psqlcmd 56 | echo "CREATE TABLE wikimedia_importance AS 57 | SELECT language, 'a' as type, title, importance, wd_page_title as wikidata_id 58 | FROM wikipedia_article 59 | ;" | psqlcmd 60 | 61 | # Now add the same from redirects, unless (language + title) already exists in wikimedia_importance 62 | echo "WITH from_redirects AS ( 63 | SELECT r.language, 'r' as type, r.from_title as title, a.importance, a.wd_page_title AS wikidata_id 64 | FROM wikipedia_article a, wikipedia_redirect r 65 | WHERE a.language = r.language AND a.title = r.to_title 66 | ) 67 | INSERT INTO wikimedia_importance 68 | SELECT from_redirects.* FROM from_redirects 69 | LEFT JOIN wikimedia_importance USING (language, title) 70 | WHERE wikimedia_importance IS NULL 71 | ;" | psqlcmd 72 | 73 | # Are all language+title unique? 74 | # WITH duplicates AS ( 75 | # SELECT language, title, count(*) 76 | # FROM wikimedia_importance 77 | # GROUP BY language, title 78 | # HAVING count(*) > 1 79 | # ) 80 | # SELECT count(*) FROM duplicates; 81 | # 0 82 | 83 | # 17m rows 84 | 85 | # "=====================================================================" 86 | echo "Dump table" 87 | # "=====================================================================" 88 | 89 | # Temporary table for sorting the output by most popular language.
Nominatim assigns 90 | # the wikipedia extra tag to the first language it finds during import and English (en) 91 | # makes debugging easier than Arabic (ar). 92 | # Not a temporary table actually because with each psqlcmd call we start a new 93 | # session. 94 | # 95 | # language | size 96 | # ----------+--------- 97 | # en | 3360898 98 | # de | 989366 99 | # fr | 955523 100 | # uk | 920531 101 | # sv | 918185 102 | 103 | echo "DROP TABLE IF EXISTS top_languages;" | psqlcmd 104 | echo "CREATE TABLE top_languages AS 105 | SELECT language, COUNT(*) AS size 106 | FROM wikimedia_importance 107 | GROUP BY language 108 | ORDER BY size DESC 109 | ;" | psqlcmd 110 | 111 | echo "* wikimedia_importance.tsv.gz" 112 | 113 | { 114 | # Prints the CSV header row 115 | # language type title importance wikidata_id 116 | echo "COPY (SELECT * FROM wikimedia_importance LIMIT 0) TO STDOUT WITH DELIMITER E'\t' CSV HEADER" | 117 | psqlcmd 118 | echo "COPY ( 119 | SELECT w.* 120 | FROM wikimedia_importance w 121 | JOIN top_languages tl ON w.language = tl.language 122 | ORDER BY tl.size DESC, w.type, w.title 123 | ) TO STDOUT" | 124 | psqlcmd 125 | } | pigz -9 >"$OUTPUT_PATH/wikimedia_importance.tsv.gz" 126 | 127 | # default is 600 128 | chmod 644 "$OUTPUT_PATH/wikimedia_importance.tsv.gz" 129 | 130 | du -h $OUTPUT_PATH/* 131 | # 265M wikimedia_importance.tsv.gz 132 | -------------------------------------------------------------------------------- /steps/wikidata_sql2csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | # Languages as comma-separated string, e.g. 'en,fr,de' 6 | : ${LANGUAGES:=bar,cy} 7 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 8 | 9 | 10 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 11 | CONVERTED_PATH="$BUILDID/converted/wikidata" 12 | mkdir -p $CONVERTED_PATH 13 | 14 | 15 | ############################################################################### 16 | ## GEO_TAGS 17 | ## 18 | echo "wikidata_sql2csv geo_tags" 19 | 20 | # MySQL schema inside the sql.gz file: 21 | # 22 | # CREATE TABLE `geo_tags` ( 23 | # `gt_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 24 | # `gt_page_id` int(10) unsigned NOT NULL, 25 | # `gt_globe` varbinary(32) NOT NULL, 26 | # `gt_primary` tinyint(1) NOT NULL, 27 | # `gt_lat` decimal(11,8) DEFAULT NULL, 28 | # `gt_lon` decimal(11,8) DEFAULT NULL, 29 | # `gt_dim` int(11) DEFAULT NULL, 30 | # `gt_type` varbinary(32) DEFAULT NULL, 31 | # `gt_name` varbinary(255) DEFAULT NULL, 32 | # `gt_country` binary(2) DEFAULT NULL, 33 | # `gt_region` varbinary(3) DEFAULT NULL, 34 | 35 | # Remove anything globe!=earth, primary!=1 36 | # Round the coordinates 37 | unpigz -c $DOWNLOADED_PATH/geo_tags.sql.gz | \ 38 | ./bin/mysqldump_to_csv.py | \ 39 | bin/filter_wikidata_geo_tags.py | \ 40 | pigz -9 \ 41 | > $CONVERTED_PATH/geo_tags.csv.gz 42 | 43 | # Input 44 | # 134 MB (690 MB uncompressed) 45 | # Output 46 | # 89 MB (240 MB uncompressed) 47 | # 8.4m entries 48 | # columns: gt_page_id, gt_lat, gt_lon 49 | # 4175,43.1924,-81.3158 50 | # 4180,-26.0,121.0 51 | # 4181,43.08333333,2.41666667 52 | # 4187,51.76055556,14.33416667 53 | 54 | 55 | 56 | ############################################################################### 57 | ## PAGE 58 | ## 59 | 60 | echo "wikidata_sql2csv page" 61 | 62 | # MySQL schema inside the sql.gz file: 63 | # 64 | # CREATE TABLE `page` ( 65 | # `page_id` int(10) unsigned NOT NULL AUTO_INCREMENT, 66 | # `page_namespace` int(11) NOT NULL, 67 | # 
`page_title` varbinary(255) NOT NULL, 68 | # `page_restrictions` tinyblob DEFAULT NULL, 69 | # `page_is_redirect` tinyint(3) unsigned NOT NULL DEFAULT 0, 70 | # `page_is_new` tinyint(3) unsigned NOT NULL DEFAULT 0, 71 | # `page_random` double unsigned NOT NULL, 72 | # `page_touched` binary(14) NOT NULL, 73 | # `page_links_updated` varbinary(14) DEFAULT NULL, 74 | # `page_latest` int(10) unsigned NOT NULL, 75 | # `page_len` int(10) unsigned NOT NULL, 76 | # `page_content_model` varbinary(32) DEFAULT NULL, 77 | # `page_lang` varbinary(35) DEFAULT NULL, 78 | 79 | # We remove all namespace != 0 (0=articles, 99% of the lines) 80 | # page_lang isn't interesting, 'NULL' 99.999% of the time 81 | # Remove all page_title that don't start with 'Q' 82 | 83 | unpigz -c $DOWNLOADED_PATH/page.sql.gz | \ 84 | ./bin/mysqldump_to_csv.py | \ 85 | bin/filter_wikidata_page.py | \ 86 | pigz -9 \ 87 | > $CONVERTED_PATH/page.csv.gz 88 | 89 | # 34min 90 | # Input 91 | # 2.8GB, (3.1GB uncompresseed) 92 | # Output 93 | # 480MB, (1.8GB uncompressed) 94 | # 3m lines 95 | # columns: page_id, page_title 96 | # 97 | # 12991,Q11474 98 | # 12992,Q11475 99 | # 12993,Q11476 100 | # 12995,Q11477 101 | # 12996,Q11478 102 | # 12997,Q11479 103 | 104 | 105 | 106 | 107 | 108 | ############################################################################### 109 | ## WB_ITEMS_PER_SITE 110 | ## 111 | 112 | echo "wikidata_sql2csv wb_items_per_site" 113 | 114 | # MySQL schema inside the sql.gz file: 115 | # 116 | # CREATE TABLE `wb_items_per_site` ( 117 | # `ips_row_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 118 | # `ips_item_id` int(10) unsigned NOT NULL, 119 | # `ips_site_id` varbinary(32) NOT NULL, 120 | # `ips_site_page` varbinary(310) NOT NULL, 121 | 122 | # Only considering languages we need, cuts down 80m lines to 52m 123 | # LISTLANG=${LANGUAGES_ARRAY[@]} 124 | # ar bg ca cs da de en es 125 | # LANG_E_REGEX=",\(${LISTLANG// /\\|}\)wiki," 126 | # ,\(ar\|bg\|ca\|cs\|da\|de\|en...\)wiki, 127 | 128 | unpigz -c $DOWNLOADED_PATH/wb_items_per_site.sql.gz | \ 129 | ./bin/mysqldump_to_csv.py | \ 130 | bin/filter_wikidata_wb_items_per_site.py | \ 131 | pigz -9 \ 132 | > $CONVERTED_PATH/wb_items_per_site.csv.gz 133 | 134 | # Input 135 | # 1.4GB compressed, (4.7GB uncompressed) 136 | # Output 137 | # 750MB compressed, (2.2GB uncompressed) 138 | # 52m lines 139 | # columns: item_id, site_id, page (title) 140 | # 576947,cawiki,Bryaninops amplus 141 | # 2739322,cawiki,Bryneich 142 | # 2927288,cawiki,Bréjaude 143 | # 2912549,cawiki,Brúixola Brunton 144 | 145 | 146 | du -h $CONVERTED_PATH/* 147 | # 88M geo_tags.csv.gz 148 | # 480M page.csv.gz 149 | # 744M wb_items_per_site.csv.gz 150 | -------------------------------------------------------------------------------- /steps/wikidata_api_fetch_placetypes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | 6 | DOWNLOADED_PATH="$BUILDID/downloaded/wikidata" 7 | TEMP_PATH=$DOWNLOADED_PATH/tmp 8 | 9 | if [[ -e $DOWNLOADED_PATH/wikidata_place_dump.csv.gz ]]; then 10 | echo "Output file $DOWNLOADED_PATH/wikidata_place_dump.csv.gz already exists. Won't fetch again." 
11 | exit 0 12 | fi 13 | 14 | echo "=====================================================================" 15 | echo "Get wikidata places from wikidata query API" 16 | echo "=====================================================================" 17 | 18 | # We create a mapping of QID->place type QID 19 | # for example 'Q6922586;Q130003' (Mount Olympus Ski Area -> ski resort) 20 | # 21 | # Takes about 30 minutes for 300 place types. 22 | # 23 | # The input wikidata_place_types.txt has the format 24 | # Q1303167;barn 25 | # Q130003;ski resort 26 | # Q12518;tower 27 | # The second column is optional. 28 | # 29 | # We tried to come up with a list of geographic related types but wikidata hierarchy 30 | # is complex. You'd need to know what a Raion is (administrative unit of post-Soviet 31 | # states) or a Bight. Many place types will be too broad, too narrow or even missing. 32 | # It's best effort. 33 | # 34 | # wdtaxonomy (https://github.com/nichtich/wikidata-taxonomy) runs SPARQL queries 35 | # against wikidata servers. Add --sparql to see the query. Example SPARQL query: 36 | # 37 | # SELECT ?item ?broader ?sites WITH { 38 | # SELECT DISTINCT ?item { ?item wdt:P279* wd:Q82794 } 39 | # } AS %items WHERE { 40 | # INCLUDE %items . 41 | # OPTIONAL { ?item wdt:P279 ?broader } . 42 | # { 43 | # SELECT ?item (count(distinct ?site) as ?sites) { 44 | # INCLUDE %items. 45 | # OPTIONAL { ?site schema:about ?item } 46 | # } GROUP BY ?item 47 | # } 48 | # } 49 | # 50 | # The queries can time out (60 second limit). If that's the case we need to further 51 | # subdivide the place type. For example Q486972 (human settlement) has too many 52 | # instances. We run "wdtaxonomy Q486972 | grep '^├─'" which prints a long list 53 | # ├──municipality (Q15284) •106 ×4208 ↑↑↑↑ 54 | # ├──trading post (Q39463) •14 ×97 ↑ 55 | # ├──monastery (Q44613) •100 ×13536 ↑↑↑↑↑ 56 | # ├──barangay (Q61878) •39 ×3524 ↑ 57 | # ├──county seat (Q62049) •34 ×1694 ↑ 58 | # 59 | # Some instances don't have titles, e.g. https://www.wikidata.org/wiki/Q17218407 60 | # but can still be assigned to wikipedia articles, in this case 61 | # https://ja.wikipedia.org/wiki/%E3%82%81%E3%81%8C%E3%81%B2%E3%82%89%E3%82%B9%E3%82%AD%E3%83%BC%E5%A0%B4 62 | # so we leave them in. 63 | 64 | mkdir -p $DOWNLOADED_PATH 65 | mkdir -p $TEMP_PATH 66 | 67 | echo "Number of place types:" 68 | wc -l config/wikidata_place_types.txt 69 | echo -n > $DOWNLOADED_PATH/wikidata_place_dump.csv 70 | 71 | while read PT_LINE ; do 72 | QID=$(echo $PT_LINE | sed 's/;.*//' ) 73 | NAME=$(echo $PT_LINE | sed 's/^.*;//' ) 74 | 75 | # Querying for place type Q205495 (petrol station)... 76 | echo "Querying for place type $QID ($NAME)..." 77 | 78 | # Example response from wdtaxonomy in CSV format for readability: 79 | # level,id,label,sites,instances,parents 80 | # [...] 81 | # -,Q110941628,Tegatayama Ski Area,0,0, 82 | # -,Q111016306,Ski resort Říčky,0,0, 83 | # -,Q111016347,Ski resort Deštné v Orlických horách,0,0, 84 | # -,Q111818006,Lively Ski Hill,0,0, 85 | # -,Q111983623,Falls Creek Alpine Resort,0,0, 86 | # -,Q1535041,summer skiing area,3,0,^^ 87 | # -,Q2292158,,1,0, 88 | # -,Q5136446,Club skifield,1,0, 89 | # --,Q6922586,Mount Olympus Ski Area,0,0, 90 | # -,Q30752692,,1,0, 91 | # 92 | # For faster queries we use --no-instancecount and --no-labels 93 | # Now the columns are actually 'level,id,label,sites,parents' with 'label' always empty. 
94 | # Unclear why for TSV the header is still commas, likely a bug in wdtaxonomy 95 | # 96 | # We don't care about parents ('^^', so called broader subcategories) in the last column. 97 | # We filter subcategoies, e.g. 'Club skifield', we're only interested in the children 98 | # (instances). Subcategories have 'sites' value > 0 99 | # 100 | 101 | wdtaxonomy $QID --instances --no-instancecount --no-labels --format tsv | \ 102 | cut -f1-4 | \ 103 | grep -e "[[:space:]]0$" | \ 104 | cut -f2 | \ 105 | sort | \ 106 | awk -v qid=$QID '{print $0 ","qid}' > $TEMP_PATH/$QID.csv 107 | wc -l $TEMP_PATH/$QID.csv 108 | 109 | # output example: 110 | # Q97774986,Q130003 111 | # Q980500,Q130003 112 | # Q988298,Q130003 113 | # Q991719,Q130003 114 | # Q992902,Q130003 115 | # Q995986,Q130003 116 | 117 | cat $TEMP_PATH/$QID.csv >> $DOWNLOADED_PATH/wikidata_place_dump.csv 118 | rm $TEMP_PATH/$QID.csv 119 | done < config/wikidata_place_types.txt 120 | 121 | # Non-Q is less than 20, not sure what they mean 122 | # L673595,Q4830453 123 | # P750,Q4830453 124 | # L162425-S2,Q40357 125 | # uniq saves 4% lines 126 | # 470MB compressed 72MB 127 | grep '^Q' $DOWNLOADED_PATH/wikidata_place_dump.csv | \ 128 | uniq | \ 129 | pigz -f -9 > $DOWNLOADED_PATH/wikidata_place_dump.csv.gz 130 | 131 | cp config/wikidata_place_type_levels.csv $DOWNLOADED_PATH 132 | # temp should be empty but if not then that should be fine, too 133 | rmdir $TEMP_PATH 134 | 135 | du -h $DOWNLOADED_PATH 136 | -------------------------------------------------------------------------------- /steps/wikidata_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${DATABASE_NAME:=wikiprocessingdb} 6 | # Languages as comma-separated string, e.g. 
'en,fr,de' 7 | : ${LANGUAGES:=bar,cy} 8 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 9 | 10 | psqlcmd() { 11 | psql --quiet $DATABASE_NAME |& \ 12 | grep -v 'does not exist, skipping' 13 | } 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | echo "=====================================================================" 22 | echo "Create derived tables" 23 | echo "=====================================================================" 24 | 25 | 26 | echo "DROP TABLE IF EXISTS geo_earth_wikidata;" | psqlcmd 27 | echo "CREATE TABLE geo_earth_wikidata AS 28 | SELECT DISTINCT geo_tags.gt_page_id, 29 | geo_tags.gt_lat, 30 | geo_tags.gt_lon, 31 | page.page_title 32 | FROM geo_tags 33 | LEFT OUTER JOIN page 34 | ON (geo_tags.gt_page_id = page.page_id) 35 | ORDER BY geo_tags.gt_page_id 36 | ;" | psqlcmd 37 | 38 | echo "ALTER TABLE wikidata_place_dump 39 | ADD COLUMN ont_level integer, 40 | ADD COLUMN lat numeric(11,8), 41 | ADD COLUMN lon numeric(11,8) 42 | ;" | psqlcmd 43 | 44 | echo "UPDATE wikidata_place_dump 45 | SET ont_level = wikidata_place_type_levels.level 46 | FROM wikidata_place_type_levels 47 | WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type 48 | ;" | psqlcmd 49 | 50 | 51 | echo "DROP TABLE IF EXISTS wikidata_places;" | psqlcmd 52 | echo "CREATE TABLE wikidata_places 53 | AS 54 | SELECT DISTINCT ON (item) item, 55 | instance_of, 56 | MAX(ont_level) AS ont_level, 57 | lat, 58 | lon 59 | FROM wikidata_place_dump 60 | GROUP BY item, 61 | instance_of, 62 | ont_level, 63 | lat, 64 | lon 65 | ORDER BY item 66 | ;" | psqlcmd 67 | 68 | echo "UPDATE wikidata_places 69 | SET lat = geo_earth_wikidata.gt_lat, 70 | lon = geo_earth_wikidata.gt_lon 71 | FROM geo_earth_wikidata 72 | WHERE wikidata_places.item = geo_earth_wikidata.page_title 73 | ;" | psqlcmd 74 | 75 | 76 | 77 | 78 | echo "=====================================================================" 79 | echo "Process language pages" 80 | echo "=====================================================================" 81 | 82 | 83 | echo "DROP TABLE IF EXISTS wikidata_pages;" | psqlcmd 84 | echo "CREATE TABLE wikidata_pages ( 85 | item text, 86 | instance_of text, 87 | lat numeric(11,8), 88 | lon numeric(11,8), 89 | wp_page_title text, 90 | language text 91 | );" | psqlcmd 92 | 93 | for LANG in "${LANGUAGES_ARRAY[@]}" 94 | do 95 | echo "DROP TABLE IF EXISTS wikidata_${LANG}_pages;" | psqlcmd 96 | echo "CREATE TABLE wikidata_${LANG}_pages AS 97 | SELECT wikidata_places.item, 98 | wikidata_places.instance_of, 99 | wikidata_places.lat, 100 | wikidata_places.lon, 101 | wb_items_per_site.ips_site_page 102 | FROM wikidata_places 103 | LEFT JOIN wb_items_per_site 104 | ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) 105 | WHERE ips_site_id = '${LANG}wiki' 106 | ORDER BY wikidata_places.item 107 | ;" | psqlcmd 108 | 109 | echo "INSERT INTO wikidata_pages 110 | SELECT item, 111 | instance_of, 112 | lat, 113 | lon, 114 | REPLACE(ips_site_page, ' ', '_') as wp_page_title, 115 | '${LANG}' 116 | FROM wikidata_${LANG}_pages 117 | ;" | psqlcmd 118 | done 119 | 120 | 121 | 122 | 123 | echo "=====================================================================" 124 | echo "Add wikidata to wikipedia_article_full table" 125 | echo "=====================================================================" 126 | 127 | echo "UPDATE wikipedia_article_full 128 | SET lat = wikidata_pages.lat, 129 | lon = wikidata_pages.lon, 130 | wd_page_title = wikidata_pages.item, 131 | instance_of = 
wikidata_pages.instance_of 132 | FROM wikidata_pages 133 | WHERE wikipedia_article_full.language = wikidata_pages.language 134 | AND wikipedia_article_full.title = wikidata_pages.wp_page_title 135 | ;" | psqlcmd 136 | 137 | # 35 minutes 138 | # 166m rows 139 | 140 | 141 | echo "=====================================================================" 142 | echo "Calculate importance score for each wikipedia page" 143 | echo "=====================================================================" 144 | 145 | # takes 3 minutes 146 | # 'greatest' because log(1)/ is always 0 147 | echo "UPDATE wikipedia_article_full 148 | SET importance = GREATEST( 149 | LOG(totalcount) 150 | / 151 | LOG(( 152 | SELECT MAX(totalcount) 153 | FROM wikipedia_article_full 154 | WHERE wd_page_title IS NOT NULL 155 | )), 156 | 0.0000000001 157 | ) 158 | ;" | psqlcmd 159 | -------------------------------------------------------------------------------- /steps/wikipedia_sql2csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set defaults 4 | : ${BUILDID:=latest} 5 | : ${LANGUAGES:=bar,cy} 6 | LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' ')) 7 | 8 | DOWNLOADED_PATH="$BUILDID/downloaded/wikipedia" 9 | CONVERTED_PATH="$BUILDID/converted/wikipedia" 10 | 11 | echo "=====================================================================" 12 | echo "Convert Wikipedia language tables" 13 | echo "=====================================================================" 14 | 15 | for LANG in "${LANGUAGES_ARRAY[@]}" 16 | do 17 | mkdir -p "$CONVERTED_PATH/$LANG/" 18 | 19 | echo "[language $LANG] Page table SQL => CSV" 20 | # https://www.mediawiki.org/wiki/Manual:Page_table 21 | # 22 | # CREATE TABLE `page` ( 23 | # `page_id` int(8) unsigned NOT NULL AUTO_INCREMENT, 24 | # `page_namespace` int(11) NOT NULL DEFAULT 0, 25 | # `page_title` varbinary(255) NOT NULL DEFAULT '', 26 | # `page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT 0, 27 | # `page_is_new` tinyint(1) unsigned NOT NULL DEFAULT 0, 28 | # `page_random` double unsigned NOT NULL DEFAULT 0, 29 | # `page_touched` varbinary(14) NOT NULL DEFAULT '', 30 | # `page_links_updated` varbinary(14) DEFAULT NULL, 31 | # `page_latest` int(8) unsigned NOT NULL DEFAULT 0, 32 | # `page_len` int(8) unsigned NOT NULL DEFAULT 0, 33 | # `page_content_model` varbinary(32) DEFAULT NULL, 34 | # `page_lang` varbinary(35) DEFAULT NULL, 35 | # 36 | # Only interested in page_namespace == 0 (articles) 37 | # English wikipedia: 38 | # input 1.9GB compressed 39 | # output 200MB compressed 40 | # Output columns: page_id, page_title 41 | 42 | unpigz -c $DOWNLOADED_PATH/$LANG/page.sql.gz | \ 43 | bin/mysqldump_to_csv.py | \ 44 | bin/filter_page.py | \ 45 | pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz 46 | 47 | 48 | echo "[language $LANG] linktarget table SQL => CSV" 49 | # https://www.mediawiki.org/wiki/Manual:Linktarget_table 50 | # 51 | # CREATE TABLE `linktarget` ( 52 | # `lt_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 53 | # `lt_namespace` int(11) NOT NULL, 54 | # `lt_title` varbinary(255) NOT NULL, 55 | # 56 | # Only interested in lt_namespace == 0 (articles) 57 | # English wikipedia: 58 | # input 964MB compressed (100m rows) 59 | # output 322MB compressed (30m rows) 60 | # Output columns: lt_id, lt_title 61 | 62 | unpigz -c $DOWNLOADED_PATH/${LANG}/linktarget.sql.gz | \ 63 | bin/mysqldump_to_csv.py | \ 64 | bin/filter_redirect.py | \ 65 | pigz -9 > $CONVERTED_PATH/$LANG/linktarget.csv.gz 66 | 67 | 68 | 69 | echo "[language $LANG] 
Pagelinks table SQL => CSV" 70 | # https://www.mediawiki.org/wiki/Manual:Pagelinks_table 71 | # 72 | # CREATE TABLE `pagelinks` ( 73 | # `pl_from` int(8) unsigned NOT NULL DEFAULT 0, 74 | # `pl_namespace` int(11) NOT NULL DEFAULT 0, 75 | # `pl_target_id` bigint(20) unsigned NOT NULL, 76 | # 77 | # Only interested in target_ids that point to == 0 (articles) 78 | # English wikipedia: 79 | # input 6.8GB compressed 80 | # output 200MB compressed 81 | # Output columns: lt_title (from linktarget file), count (unique pl_from) 82 | 83 | unpigz -c $DOWNLOADED_PATH/$LANG/pagelinks.sql.gz | \ 84 | bin/mysqldump_to_csv.py | \ 85 | bin/filter_pagelinks.py $CONVERTED_PATH/$LANG/linktarget.csv.gz | \ 86 | pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz 87 | 88 | 89 | echo "[language $LANG] langlinks table SQL => CSV" 90 | # https://www.mediawiki.org/wiki/Manual:Langlinks_table 91 | # 92 | # CREATE TABLE `langlinks` ( 93 | # `ll_from` int(8) unsigned NOT NULL DEFAULT 0, 94 | # `ll_lang` varbinary(35) NOT NULL DEFAULT '', 95 | # `ll_title` varbinary(255) NOT NULL DEFAULT '', 96 | # 97 | # Output columns: ll_title, ll_from_page_id, ll_lang 98 | # Output is sorted by lang 99 | # English wikipedia: 100 | # input 400MB compressed (1.5GB uncompressed) 101 | # output 310MB compressed (1.3GB uncompressed) 102 | 103 | unpigz -c $DOWNLOADED_PATH/${LANG}/langlinks.sql.gz | \ 104 | bin/mysqldump_to_csv.py | \ 105 | bin/filter_langlinks.py | \ 106 | pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz 107 | 108 | 109 | 110 | 111 | echo "[language $LANG] redirect table SQL => CSV" 112 | # https://www.mediawiki.org/wiki/Manual:Redirect_table 113 | # 114 | # CREATE TABLE `redirect` ( 115 | # `rd_from` int(8) unsigned NOT NULL DEFAULT 0, 116 | # `rd_namespace` int(11) NOT NULL DEFAULT 0, 117 | # `rd_title` varbinary(255) NOT NULL DEFAULT '', 118 | # `rd_interwiki` varbinary(32) DEFAULT NULL, 119 | # `rd_fragment` varbinary(255) DEFAULT NULL, 120 | # 121 | # Only interested in rd_namespace = 0 (articles) 122 | # Output columns: rd_from_page_id, rd_title 123 | # English wikipedia: 124 | # input 140MB compressed (530MB uncompressed) 125 | # output 120MB compressed (300MB uncompressed) 126 | 127 | unpigz -c $DOWNLOADED_PATH/$LANG/redirect.sql.gz | \ 128 | bin/mysqldump_to_csv.py | \ 129 | bin/filter_redirect.py | \ 130 | pigz -9 > $CONVERTED_PATH/$LANG/redirect.csv.gz 131 | 132 | du -h $CONVERTED_PATH/$LANG/* 133 | done 134 | -------------------------------------------------------------------------------- /config/wikidata_place_types.txt: -------------------------------------------------------------------------------- 1 | Q9842;primary school 2 | Q149566;middle school 3 | Q9430;ocean 4 | Q928830;metro station 5 | Q9259;UNESCO World Heritage Site 6 | Q91028;administrative arrondissement of Belgium 7 | Q8514;desert 8 | Q8502;mountain 9 | Q15324;body of water 10 | Q28575;county 11 | Q39816;valley 12 | Q46831;mountain range 13 | Q50337;prefecture of Japan 14 | Q175185;rural area 15 | Q191086;jungle 16 | Q205895;landmass 17 | Q207520;region of Japan 18 | Q369639;region of Norway 19 | Q15284;municipality 20 | Q123705;neighborhood 21 | Q161387;kibbutz 22 | Q188509;suburb 23 | Q200250;metropolis 24 | Q245016;military base 25 | Q253019;Ortsteil 26 | Q269528;unincorporated area 27 | Q329245;basic unit of settlement in the Czech Republic 28 | Q484170;commune of France 29 | Q498162;census-designated place 30 | Q509028;ranch 31 | Q582706;Israeli settlement 32 | Q587144;census town 33 | Q618299;settlement 34 | Q622499;refugee camp 35 | 
Q627236;company town 36 | Q674950;residential area 37 | Q702492;urban area 38 | Q790344;district of Barcelona 39 | Q815324;town municipality of Turkey 40 | Q820254;mining community 41 | Q1074523;planned community 42 | Q1175522;County city (council system) 43 | Q1198413;military camp 44 | Q1288520;local council in Israel 45 | Q1294703;shanty town 46 | Q1326028;camp 47 | Q1348006;city block 48 | Q1372205;dispersed settlement 49 | Q1394476;civil township 50 | Q1426493;trailer park 51 | Q1501046;community settlement 52 | Q1529096;village in Turkey 53 | Q1907114;metropolitan area 54 | Q2755753;area of London 55 | Q2989398;commune of Algeria 56 | Q3172900;colony 57 | Q3257686;locality 58 | Q3477348;urban area 59 | Q3559019;urban village 60 | Q4313794;populated place in Georgia 61 | Q4373615;colony of the Russian empire 62 | Q4632675;dwelling place 63 | Q4668360;Aboriginal community in Western Australia 64 | Q4845841;settlement (Croatia) 65 | Q5148433;Colonias of Mexico City 66 | Q5195043;borough 67 | Q7930989;city/town 68 | Q10354598;rural settlement 69 | Q10840661;urban area of Vietnam 70 | Q12051488;populated place in Ukraine 71 | Q12063697;neighborhood of Washington, D.C. 72 | Q16480895;hamlet 73 | Q24258416;railway station 74 | Q26714626;large village 75 | Q27062006;station 76 | Q27554677;former capital 77 | Q108775530;proposed human settlement 78 | Q855697;subcontinent 79 | Q1029013;Megaregions of the United States 80 | Q1200957;tourist destination 81 | Q1666245;region of China 82 | Q1970725;natural region 83 | Q2140699;wine-producing region 84 | Q3744088;tourism region 85 | Q30059;arrondissement 86 | Q52105;habitat 87 | Q107425;landscape 88 | Q171809;county of England 89 | Q356936;exclusion zone 90 | Q453909;built-up area 91 | Q518261;cultural area 92 | Q1062177;capital region 93 | Q1081138;historic site 94 | Q1092661;moorland 95 | Q1133961;commercial district 96 | Q1185892;geological massif 97 | Q1248049;Land 98 | Q1286517;natural landscape 99 | Q1389310;waterbody 100 | Q1662024;industrial district 101 | Q1742059;lake area 102 | Q1852859;populated place in the Netherlands 103 | Q2063507;recreation area 104 | Q3241565;woodland 105 | Q10594991;nature area 106 | Q16363669;sports park 107 | Q17350442;venue 108 | Q27995042;wilderness area 109 | Q55726155;financial district 110 | Q99323582;quarter or sector of Monaco 111 | Q820477;mine 112 | Q12323;dam 113 | Q12493;dome 114 | Q27686;hotel 115 | Q39614;cemetery 116 | Q54831;amphitheatre 117 | Q62447;aerodrome 118 | Q62832;observatory 119 | Q83405;factory 120 | Q199451;pagoda 121 | Q483110;stadium 122 | Q483453;fountain 123 | Q587682;oil well 124 | Q653208;monolith 125 | Q671224;data center 126 | Q697295;shrine 127 | Q699405;residence 128 | Q1076486;sports venue 129 | Q4260475;medical facility 130 | Q4989906;monument 131 | Q5327174;earth structure 132 | Q6017969;scenic viewpoint 133 | Q6640302;commercial center 134 | Q10373565;waterworks 135 | Q12146012;underground building 136 | Q12292478;estate 137 | Q15090615;arts venue 138 | Q29845814;trolleybus depot 139 | Q47520309;recycling facility 140 | Q50418254;outdoor concert venue 141 | Q55713852;resthouse 142 | Q56240808;oil rig 143 | Q8072;volcano 144 | Q79007;street 145 | Q786014;rest area 146 | Q75848;gated community 147 | Q75520;plateau 148 | Q728937;railway line 149 | Q7275;state 150 | Q719456;station 151 | Q7075;library 152 | Q6852233;military building 153 | Q682943;cricket field 154 | Q665487;diocese 155 | Q655686;commercial building 156 | Q643589;department 157 | Q641226;arena 158 | 
Q631305;rock formation 159 | Q6256;country 160 | Q6023295;funerary structure 161 | Q5773747;historic house 162 | Q55659167;natural watercourse 163 | Q55488;railway station 164 | Q54050;hill 165 | Q532;village 166 | Q53060;gate 167 | Q52177058;civic building 168 | Q515716;prefecture 169 | Q5153984;commune-level subdivision of Vietnam 170 | Q515;city 171 | Q5144960;microregion 172 | Q5119;capital 173 | Q5107;continent 174 | Q5084;hamlet 175 | Q5031071;canal tunnel 176 | Q5003624;memorial 177 | Q4976993;civil parish 178 | Q4830453;business 179 | Q47521;stream 180 | Q473972;protected area 181 | Q46614560;deanery (building) 182 | Q44782;port 183 | Q44613;monastery 184 | Q44539;temple 185 | Q44494;mill 186 | Q44377;tunnel 187 | Q4421;forest 188 | Q43501;zoo 189 | Q4286337;city district 190 | Q42523;atoll 191 | Q40357;prison 192 | Q4022;river 193 | Q40080;beach 194 | Q39715;lighthouse 195 | Q3957;town 196 | Q3947;house 197 | Q38723;higher education institution 198 | Q38720;windmill 199 | Q3623867;arrondissement of Benin 200 | Q35666;glacier 201 | Q355304;watercourse 202 | Q35509;cave 203 | Q35112127;historic building 204 | Q34985575;city district 205 | Q34876;province 206 | Q34763;peninsula 207 | Q34627;synagogue 208 | Q3455524;administrative region 209 | Q34442;road 210 | Q33837;archipelago 211 | Q33506;museum 212 | Q24699794;museum building 213 | Q32815;mosque 214 | Q3240715;crater 215 | Q3191695;regency of Indonesia 216 | Q3153117;intercommunality 217 | Q30198;marsh 218 | Q30139652;health care structure 219 | Q294422;public building 220 | Q2870166;water ride 221 | Q274153;water tower 222 | Q271669;landform 223 | Q2659904;government organization 224 | Q24529780;point 225 | Q24354;theater 226 | Q2354973;road tunnel 227 | Q23442;island 228 | Q23413;castle 229 | Q23397;lake 230 | Q2327515;city district of Baden-Württemberg 231 | Q2311958;canton 232 | Q22927291;sixth-level administrative country subdivision 233 | Q22698;park 234 | Q2175765;tram stop 235 | Q205495;petrol station 236 | Q204832;roller coaster 237 | Q2042028;ravine 238 | Q202216;overseas department and region of France 239 | Q194203;arrondissement of France 240 | Q194195;amusement park 241 | Q185187;watermill 242 | Q185113;cape 243 | Q1799794;administrative territorial entity of a specific level 244 | Q1788454;road junction 245 | Q1785071;fort 246 | Q1777138;race track 247 | Q177380;hot spring 248 | Q174814;electrical substation 249 | Q174782;square 250 | Q17343829;unincorporated community in the United States 251 | Q17018380;bight 252 | Q16970;church building 253 | Q16917;hospital 254 | Q39364723;hospital building 255 | Q16831714;government building 256 | Q165;sea 257 | Q160742;abbey 258 | Q159719;power station 259 | Q159334;secondary school 260 | Q15640612;fifth-level administrative country subdivision 261 | Q15243209;historic district 262 | Q152081;concentration camp 263 | Q15195406;city district in Russia 264 | Q1500350;township of the People's Republic of China 265 | Q149621;district 266 | Q14757767;fourth-level administrative country subdivision 267 | Q14350;radio station 268 | Q1410668;National Wildlife Refuge 269 | Q1377575;wildlife refuge 270 | Q1353183;launch pad 271 | Q134447;nuclear power plant 272 | Q133215;casino 273 | Q133056;mountain pass 274 | Q13221722;third-level administrative country subdivision 275 | Q13220204;second-level administrative country subdivision 276 | Q1311958;railway tunnel 277 | Q1303167;barn 278 | Q130003;ski resort 279 | Q12518;tower 280 | Q489357;farmhouse 281 | Q12516;pyramid 282 | 
Q1248784;airport 283 | Q12284;canal 284 | Q12280;bridge 285 | Q121359;infrastructure 286 | Q1210950;channel 287 | Q11755880;residential building 288 | Q11707;restaurant 289 | Q11315;shopping center 290 | Q11303;skyscraper 291 | Q1115575;civil parish 292 | Q1107656;garden 293 | Q10864048;first-level administrative country subdivision 294 | Q105731;lock 295 | Q105190;levee 296 | Q1048525;golf course 297 | Q102496;parish 298 | Q28872924;designation for an administrative territorial entity of a single country 299 | Q15617994;designation for an administrative territorial entity 300 | Q159313;urban agglomeration 301 | Q24398318;religious building 302 | Q327333;government agency 303 | Q860861;sculpture 304 | Q46395;British overseas territories 305 | Q103910131;part of city or town or population centre 306 | Q103910177;city or town or population centre 307 | Q103910453;village or neigbourhood -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Add Wikipedia and Wikidata to Nominatim 2 | 3 | ## Summary 4 | 5 | This project creates an export of Wikipedia articles (title, wikidata id) and an calculated importance score (0..1) for each. 6 | If Wikipedia has redirects to titles then each redirect is also added. 7 | 8 | The score can be used to approximate how important a place name is relative to another by the same name. 9 | 10 | Examples: 11 | 12 | * "Berlin" (capital of Germany, [Wikipedia](https://en.wikipedia.org/wiki/Berlin), [OpenStreetMap](https://www.openstreetmap.org/relation/62422)) 13 | vs "Berlin" (town in Maryland, USA, [Wikipedia](https://en.wikipedia.org/wiki/en:Berlin,%20Maryland), [OpenStreetMap](https://www.openstreetmap.org/relation/133689)). 14 | * "Eiffel Tower" (Paris, France, [Wikipedia](https://en.wikipedia.org/wiki/Eiffel_Tower), [OpenStreetMap](https://www.openstreetmap.org/way/5013364)) vs "Eiffel Tower" (Paris, Tennessee, United States, [Wikipedia](https://en.wikipedia.org/wiki/Eiffel_Tower_(Paris,_Tennessee)), [OpenStreetMap](https://www.openstreetmap.org/way/1080841041)). 15 | * 50 places called "Springfield" in the United States 16 | * 35 places called "Washington" in the United States 17 | 18 | [Nominatim](https://nominatim.org/) geocoding engine can import the files and improve its ranking of 19 | place search results. During searches Nominatim combines importance score with other ranking factors like place type 20 | (city vs county vs village), proximity (e.g. current map view position), phrase relevance (how many words 21 | in the results match the search terms). 22 | 23 | Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month. 24 | 25 | To run one build you need 150GB of disc space (of which 90GB Postgresql database). The scripts process 26 | 39 languages and output 4 files. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with NVMe 27 | drives. 28 | 29 | ``` 30 | 334M wikimedia_importance.tsv.gz 31 | ``` 32 | 33 | 34 | ## History of this project 35 | 36 | Nominatim 2.2 introduced the first `utils/importWikipedia.php` using [mwdumper](https://github.com/bcollier/mwdumper/), 37 | then parsing HTML pages to find geo coordindates in articles. It was a single script without documentation on runtime 38 | and ran irregular (less than once per year). Output was binary SQL database dumps. 
39 | 40 | During several months of [Google Summer of Code](https://en.wikipedia.org/wiki/Google_Summer_of_Code) 2019, [tchaddad](https://www.openstreetmap.org/user/tchaddad) rewrote the script, added wikidata processing, documentation and merged files into a new `wikimedia-importance.sql.gz` export. You can read her reports on [her diary posts](https://www.openstreetmap.org/user/tchaddad/diary). 41 | 42 | Nominatim 3.5 switched to using the new `wikimedia-importance.sql.gz` file and improved its ranking algorithm. 43 | 44 | Later the project was moved into its own git repository. In small steps the process was split into steps for downloading, 45 | converting, processing, creating output. `mysql2pgsql` was replaced with `mysqldump`, which allowed filtering in scripts. 46 | Performance was improved by loading only required data into the database. Some caching (don't redownload files) and 47 | retries (wikidata API being unreliable) was added. 48 | 49 | 50 | ## Output data 51 | 52 | `wikimedia_importance.tsv.gz` contains about 17 million rows. Number of lines grew 2% between 2022 and 2023. 53 | The file tab delimited, not quoted, is sorted and contains a header row. 54 | 55 | | Column | Type | 56 | | ----------- | ---------------- | 57 | | language | text | 58 | | type | char | 59 | | title | text | 60 | | importance | double precision | 61 | | wikidata_id | text | 62 | 63 | All columns are filled with values. 64 | 65 | Combination of language+title (and language+type+title) are unique. 66 | 67 | Type is either 'a' (article) or 'r' (redirect). 68 | 69 | Maximum title length is 247. 70 | 71 | Importance is between 0.0000000001 (never 0) and 1. 72 | 73 | Currently 39 languages, English has by far the largest share. 74 | 75 | | language | count | 76 | | -------------- | ---------------- | 77 | | en (English) | 3,337,994 (19%) | 78 | | de (German) | 966,820 (6%) | 79 | | fr (French) | 935,817 (5%) | 80 | | sv (Swdish) | 906,813 | 81 | | uk (Ukranian) | 900,548 | 82 | | ... | | 83 | | bg (Bulgarian) | 88,993 | 84 | 85 | Examples of `wikimedia_importance.tsv.gz` rows: 86 | 87 | * Wikipedia contains redirects, so a single wikidata object can have multiple titles even though. Each title has the same importance score. Redirects to non-existing articles are removed. 88 | 89 | ``` 90 | en a Brandenburg_Gate 0.5531125195487524 Q82425 91 | en r Berlin's_Gate 0.5531125195487524 Q82425 92 | en r Brandenberg_Gate 0.5531125195487524 Q82425 93 | en r Brandenburger_gate 0.5531125195487524 Q82425 94 | en r Brandenburger_Gate 0.5531125195487524 Q82425 95 | en r Brandenburger_Tor 0.5531125195487524 Q82425 96 | en r Brandenburg_gate 0.5531125195487524 Q82425 97 | en r BRANDENBURG_GATE 0.5531125195487524 Q82425 98 | en r Brandenburg_Gates 0.5531125195487524 Q82425 99 | en r Brandenburg_Tor 0.5531125195487524 Q82425 100 | ``` 101 | 102 | * Wikipedia titles contain underscores instead of space, e.g. 
[Alford,_Massachusetts](https://en.wikipedia.org/wiki/Alford,_Massachusetts) 103 | 104 | ``` 105 | en a "Alford _Massachusetts" 0.36590368314334637 Q2431901 106 | en r "Alford _ma" 0.36590368314334637 Q2431901 107 | en r "Alford _MA" 0.36590368314334637 Q2431901 108 | en r "Alford _Mass" 0.36590368314334637 Q2431901 109 | ``` 110 | 111 | * The highest score article is the [United States](https://en.wikipedia.org/wiki/United_States) 112 | 113 | ``` 114 | pl a Stany_Zjednoczone 1 Q30 115 | en a United_States 1 Q30 116 | ru a Соединённые_Штаты_Америки 1 Q30 117 | hu a Amerikai_Egyesült_Államok 1 Q30 118 | it a Stati_Uniti_d'America 1 Q30 119 | de a Vereinigte_Staaten 1 Q30 120 | ... 121 | ``` 122 | 123 | ## How importance scores are calculated 124 | 125 | Wikipedia articles with more links to them from other articles ("pagelinks") plus from other languages ("langlinks") receive a higher score. 126 | 127 | 1. The Wikipedia dump file `${language}pagelinks` contains how many links each Wikipedia article 128 | has **from** other Wikipedia articles of the same language. We store that as `langcount` for 129 | each article. 130 | 131 | The dump has the columns 132 | 133 | ```sql 134 | CREATE TABLE `pagelinks` ( 135 | `pl_from` int(8) unsigned NOT NULL DEFAULT 0, 136 | `pl_namespace` int(11) NOT NULL DEFAULT 0, 137 | `pl_title` varbinary(255) NOT NULL DEFAULT '', 138 | `pl_from_namespace` int(11) NOT NULL DEFAULT 0, 139 | ``` 140 | 141 | After filtering namespaces (0 = articles) we only have to look at the `pl_title` column 142 | and count now often each title occurs. For example `Eiffel_Tower` 2862 times (*). 143 | We store that as `langcount` for each article. 144 | 145 | *) `zgrep -c -e'^Eiffel_Tower$' converted/wikipedia/en/pagelinks.csv.gz` 146 | 147 | 2. The dump file `${language}langlinks` contains how many links each Wikipedia article has **to** 148 | other languages. Such a link doesn't count as 1 but as number of `${language}pagelinks`. 149 | 150 | The dump has the columns 151 | 152 | ```sql 153 | CREATE TABLE `langlinks` ( 154 | `ll_from` int(8) unsigned NOT NULL DEFAULT 0, 155 | `ll_lang` varbinary(35) NOT NULL DEFAULT '', 156 | `ll_title` varbinary(255) NOT NULL DEFAULT '', 157 | ``` 158 | 159 | For example the row `"9232,fr,Tour Eiffel"` in `enlanglinks` file means the 160 | [English article](https://en.wikipedia.org/wiki/Eiffel_Tower) has a link to the 161 | [French article](https://fr.wikipedia.org/wiki/Tour_Eiffel) (*). 162 | 163 | When processing the English language we need to inspect and calculate the sum of 164 | the `langlinks` files of all other languages. We store that as `othercount` for 165 | each article. 166 | 167 | For example the French article gets 2862 links from the English article (plus more 168 | from the other languages). 169 | 170 | *) The `langlink` files have no underscores in the title while other files do. 171 | 172 | 3. `langcount` and `othercount` together are `totalcount`. 173 | 174 | 4. We check which article has the highest (maximum) count of links to it. Currently that's 175 | "United States" with a `totalcount` of 5,198,249. All other articles are scored on a 176 | logarithmic scale accordingly. 177 | 178 | For example an article with half (2,599,124) the links to it gets a score of 0.952664935, an 179 | article with 10% (519,825) the links get a score of 0.85109869, an article with 1% a score of 180 | 0.7021967. 
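
   Put differently, each score is `LOG(totalcount) / LOG(max totalcount)`, floored at
   0.0000000001 so it never becomes 0. A minimal shell sketch of the same arithmetic
   (the maximum of 5,198,249 is simply the figure quoted above; the real value depends
   on the dump being processed), mirroring the SQL statement below:

   ```bash
   # MAX_TOTALCOUNT is the highest totalcount of any article in the current build.
   MAX_TOTALCOUNT=5198249
   importance() {
     awk -v c="$1" -v m="$MAX_TOTALCOUNT" \
       'BEGIN { s = log(c) / log(m); if (s < 0.0000000001) s = 0.0000000001; printf "%.10f\n", s }'
   }
   importance 519825   # 10% of the maximum -> ~0.851
   importance 51982    # 1% of the maximum  -> ~0.702
   ```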
181 | 182 | ```sql 183 | SET importance = GREATEST( 184 | LOG(totalcount) 185 | / 186 | LOG(( 187 | SELECT MAX(totalcount) 188 | FROM wikipedia_article_full 189 | WHERE wd_page_title IS NOT NULL 190 | )), 191 | 0.0000000001 192 | ) 193 | ``` 194 | 195 | 196 | 197 | 198 | 199 | ## How Nominatim uses the files 200 | 201 | (As of Nominatim 4.2) 202 | 203 | During [Nominatim installation](https://nominatim.org/release-docs/latest/admin/Import/#wikipediawikidata-rankings 204 | ) 205 | it will check if a wikipedia-importance file is present and automatically import it into the 206 | database tables `wikpedia_article` and `wikipedia_redirect`. There is also a `nominatim refresh` 207 | command to update the tables later. 208 | 209 | OpenStreetMap contributors frequently tag items with links to Wikipedia 210 | ([documentation](https://wiki.openstreetmap.org/wiki/Key:wikipedia)) 211 | and Wikidata ([documentation](https://wiki.openstreetmap.org/wiki/Key:wikidata)). For example 212 | [Newcastle upon Tyne](https://www.openstreetmap.org/relation/142282) has the tags 213 | 214 | | tag | value | 215 | | ------------- | ------------------------------- | 216 | | admin_level | 8 | 217 | | boundary | administrative | 218 | | name | Newcastle upon Tyne | 219 | | type | boundary | 220 | | website | https://www.newcastle.gov.uk/ | 221 | | wikidata | Q1425428 | 222 | | wikipedia | en:Newcastle upon Tyne | 223 | 224 | When Nominatim indexes places it checks if they have an wikipedia or wikidata tag. If they do 225 | they set the `importance` value in the `placex` table for that place. This happens in 226 | `compute_importance` in `lib-sql/functions/importance.sql` (called from methods in 227 | `lib-sql/functions/placex_triggers.sql`. This is also were default values are set 228 | (when a place has neither). 229 | 230 | During a search Nominatim will inspect the `importance` value of a place and use that as 231 | one of the ranking (sorting) factors. 232 | 233 | See also [Nominatim importance documentation](https://nominatim.org/release-docs/latest/customize/Importance/). 234 | 235 | 236 | ## Steps of the build 237 | 238 | Have a look at `complete_run.sh` as entry point to the code. You will require a local Postgresql database. Edit 239 | the `languages.txt` file to only run a small language (e.g. Bulgarian) first. 240 | 241 | 1. latest\_available\_data 242 | 243 | Prints a date. Wikipedia exports take many days, then mirrors are sometimes slow copying them. It's not 244 | uncommon for an export starting Jan/1st to only be full ready Jan/10th or later. 245 | 246 | 2. wikipedia_download (1h) 247 | 248 | Downloads 40GB compressed files. 4 files per language. English is 10GB. 249 | 250 | 3. wikidata\_download (0:15h) 251 | 252 | Another 4 files, 5GB. 253 | 254 | 4. wikidata_api\_fetch\_placetypes (0:15h) 255 | 256 | Runs 300 SPARQL queries against wikidata servers. Output is 5GB. 257 | 258 | 5. wikipedia_sql2csv (4:20h) 259 | 260 | The MySQL SQL files get parsed sequentially and we try to exclude as much data (rows, 261 | columns) as possible. Output is 75% smaller than input. Any work done here cuts 262 | down the time (and space) needed in the database (database used to be 1TB before 263 | this step). 264 | 265 | Most time is spend on the Pagelinks table 266 | 267 | ``` 268 | [language en] Page table (0:06h) 269 | [language en] Pagelinks table (0:50h) 270 | [language en] langlinks table (0:02h) 271 | [language en] redirect table (0:01h) 272 | ``` 273 | 274 | 6. 
wikidata_sql2csv (0:15h) 275 | 276 | ``` 277 | geo_tags (0:01h) 278 | page (0:09h) 279 | wb_items_per_site (0:07h) 280 | ``` 281 | 282 | 7. wikipedia\_import, wikidata\_import (0:10h) 283 | 284 | Given the number of rows a pretty efficient loading of data into Postgresql. 285 | 286 | English database tables 287 | 288 | ``` 289 | enlanglinks | 28,365,965 rows | 1762 MB 290 | enpage | 17,211,555 rows | 946 MB 291 | enpagelinkcount | 27,792,966 rows | 2164 MB 292 | enpagelinks | 61,310,384 rows | 3351 MB 293 | enredirect | 10,804,606 rows | 599 MB 294 | ``` 295 | 296 | 8. wikipedia\_process, wikidata\_process (2:30h) 297 | 298 | Postgresql is great joining large datasets together, especially if not all 299 | data fits into RAM. 300 | 301 | ``` 302 | set othercounts (2:20h) 303 | Create and fill wikipedia_article_full (0.03h) 304 | Create derived tables (0.03h) 305 | Process language pages (0.03h) 306 | Add wikidata to wikipedia_article_full table (0.04h) 307 | Calculate importance score for each wikipedia page (0.08h) 308 | ``` 309 | 310 | 9. output (0:15h) 311 | 312 | Uses `pg_dump` tool to create SQL files. Uses SQL `COPY` command to create TSV file. 313 | 314 | 315 | License 316 | ------- 317 | The source code is available under a GPLv2 license. 318 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 
38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. 
For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 
209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /wikidata_places.md: -------------------------------------------------------------------------------- 1 | 2 | ## Wikidata place types and related OSM Tags 3 | 4 | Wikidata does not have any official ontologies, however the [DBpedia project](https://wiki.dbpedia.org/) has created an [ontology](https://wiki.dbpedia.org/services-resources/ontology) that covered [place types](http://mappings.dbpedia.org/server/ontology/classes/#Place). The table below used the DBpedia place ontology as a starting point, and is provided as a cross-reference to the relevant OSM tags. 5 | 6 | The Wikidata place types listed in the table below can be used in conjunction with the [Wikidata Query Service](https://query.wikidata.org/) to retrieve instances of those place types from the Wikidata knowledgebase. 7 | 8 | ``` 9 | SELECT ?item ?lat ?lon 10 | WHERE { 11 | ?item wdt:P31*/wdt:P279*wd:Q9430; wdt:P625 ?pt. 12 | ?item p:P625?loc. 13 | ?loc psv:P625?cnode. 14 | ?cnode wikibase:geoLatitude?lat. 15 | ?cnode wikibase:geoLongitude?lon. 16 | } 17 | ``` 18 | 19 | An example json return for all instances of the Wikidata item "Q9430" (Ocean) can be seen at [json](https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT?item?lat?lon%20WHERE{?item%20wdt:P31*/wdt:P279*wd:Q9430;wdt:P625?pt.?item%20p:P625?loc.?loc%20psv:P625?cnode.?cnode%20wikibase:geoLatitude?lat.?cnode%20wikibase:geoLongitude?lon.}) 20 | 21 | **NOTE** the OSM tags listed are those listed in the wikidata entries, and not all the possible matches for tags within OSM. 
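
For ad-hoc checks, the query above can also be run from the shell against the public query service (a minimal sketch using the same endpoint the JSON link above points at; `jq` only counts the returned bindings and is optional):

```bash
# Fetch coordinates for all instances/subclasses of Q9430 (ocean) as SPARQL JSON results.
curl -s -G 'https://query.wikidata.org/sparql' \
     -H 'Accept: application/sparql-results+json' \
     --data-urlencode query='SELECT ?item ?lat ?lon WHERE {
       ?item wdt:P31*/wdt:P279* wd:Q9430; wdt:P625 ?pt.
       ?item p:P625 ?loc. ?loc psv:P625 ?cnode.
       ?cnode wikibase:geoLatitude ?lat.
       ?cnode wikibase:geoLongitude ?lon.
     }' | jq '.results.bindings | length'
```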
22 | 23 | 24 | title | concept | OSM Tag | 25 | -----------|---------------------------------------|------------------| 26 | [Q17334923](https://www.wikidata.org/entity/Q17334923) | Location | | 27 | [Q811979](https://www.wikidata.org/entity/Q811979) | Architectural Structure | | 28 | [Q194195](https://www.wikidata.org/entity/Q194195) | Amusement park | | 29 | [Q204832](https://www.wikidata.org/entity/Q204832) | Roller coaster | [attraction=roller_coaster](https://wiki.openstreetmap.org/wiki/Tag:attraction=roller_coaster) | 30 | [Q2870166](https://www.wikidata.org/entity/Q2870166) | Water ride | | 31 | [Q641226](https://www.wikidata.org/entity/Q641226) | Arena | [amenity=events_centre](https://wiki.openstreetmap.org/wiki/Tag:amenity=events_centre) | 32 | [Q41176](https://www.wikidata.org/entity/Q41176) | Building | [building=yes](https://wiki.openstreetmap.org/wiki/Key:building) | 33 | [Q1303167](https://www.wikidata.org/entity/Q1303167) | Barn | [building=barn](https://wiki.openstreetmap.org/wiki/Tag:building=barn) | 34 | [Q655686](https://www.wikidata.org/entity/Q655686) | Commercial building | [building=commercial](https://wiki.openstreetmap.org/wiki/Tag:building=commercial) | 35 | [Q4830453](https://www.wikidata.org/entity/Q4830453) | Business | | 36 | [Q7075](https://www.wikidata.org/entity/Q7075) | Library | [amenity=library](https://wiki.openstreetmap.org/wiki/Tag:amenity=library) | 37 | [Q133215](https://www.wikidata.org/entity/Q133215) | Casino | [amenity=casino](https://wiki.openstreetmap.org/wiki/Tag:amenity=casino) | 38 | [Q23413](https://www.wikidata.org/entity/Q23413) | Castle | [historic=castle](https://wiki.openstreetmap.org/wiki/Tag:historic=castle) | 39 | [Q83405](https://www.wikidata.org/entity/Q83405) | Factory | | 40 | [Q53060](https://www.wikidata.org/entity/Q53060) | Gate | [barrier=gate](https://wiki.openstreetmap.org/wiki/Tag:barrier=gate) | 41 | [Q11755880](https://www.wikidata.org/entity/Q11755880) | Residential Building | [building=residential](https://wiki.openstreetmap.org/wiki/Tag:building=residential) | 42 | [Q3947](https://www.wikidata.org/entity/Q3947) | House | [building=house](https://wiki.openstreetmap.org/wiki/Tag:building=house) | 43 | [Q35112127](https://www.wikidata.org/entity/Q35112127) | Historic Building | | 44 | [Q5773747](https://www.wikidata.org/entity/Q5773747) | Historic house | | 45 | [Q38723](https://www.wikidata.org/entity/Q38723) | Higher Education Institution | | 46 | [Q3914](https://www.wikidata.org/entity/Q3914) | School | [amenity=school](https://wiki.openstreetmap.org/wiki/Tag:amenity=school) | 47 | [Q9842](https://www.wikidata.org/entity/Q9842) | Primary school | | 48 | [Q159334](https://www.wikidata.org/entity/Q159334) | Secondary school | | 49 | [Q16917](https://www.wikidata.org/entity/Q16917) | Hospital | [amenity=hospital](https://wiki.openstreetmap.org/wiki/Tag:amenity=hospital), [healthcare=hospital](https://wiki.openstreetmap.org/wiki/Tag:healthcare=hospital), [building=hospital](https://wiki.openstreetmap.org/wiki/Tag:building=hospital) | 50 | [Q27686](https://www.wikidata.org/entity/Q27686) | Hotel | [tourism=hotel](https://wiki.openstreetmap.org/wiki/Tag:tourism=hotel), [building=hotel](https://wiki.openstreetmap.org/wiki/Tag:building=hotel) | 51 | [Q33506](https://www.wikidata.org/entity/Q33506) | Museum | [tourism=museum](https://wiki.openstreetmap.org/wiki/Tag:tourism=museum) | 52 | [Q40357](https://www.wikidata.org/entity/Q40357) | Prison | 
[amenity=prison](https://wiki.openstreetmap.org/wiki/Tag:amenity=prison) | 53 | [Q24398318](https://www.wikidata.org/entity/Q24398318) | Religious Building | | 54 | [Q160742](https://www.wikidata.org/entity/Q160742) | Abbey | | 55 | [Q16970](https://www.wikidata.org/entity/Q16970) | Church (building) | [building=church](https://wiki.openstreetmap.org/wiki/Tag:building=church) | 56 | [Q44613](https://www.wikidata.org/entity/Q44613) | Monastery | [amenity=monastery](https://wiki.openstreetmap.org/wiki/Tag:amenity=monastery) | 57 | [Q32815](https://www.wikidata.org/entity/Q32815) | Mosque | [building=mosque](https://wiki.openstreetmap.org/wiki/Tag:building=mosque) | 58 | [Q697295](https://www.wikidata.org/entity/Q697295) | Shrine | [building=shrine](https://wiki.openstreetmap.org/wiki/Tag:building=shrine) | 59 | [Q34627](https://www.wikidata.org/entity/Q34627) | Synagogue | [building=synagogue](https://wiki.openstreetmap.org/wiki/Tag:building=synagogue) | 60 | [Q44539](https://www.wikidata.org/entity/Q44539) | Temple | [building=temple](https://wiki.openstreetmap.org/wiki/Tag:building=temple) | 61 | [Q11707](https://www.wikidata.org/entity/Q11707) | Restaurant | [amenity=restaurant](https://wiki.openstreetmap.org/wiki/Tag:amenity=restaurant) | 62 | [Q11315](https://www.wikidata.org/entity/Q11315) | Shopping mall | [shop=mall](https://wiki.openstreetmap.org/wiki/Tag:shop=mall), [shop=shopping_centre](https://wiki.openstreetmap.org/wiki/Tag:shop=shopping_centre) | 63 | [Q11303](https://www.wikidata.org/entity/Q11303) | Skyscraper | | 64 | [Q17350442](https://www.wikidata.org/entity/Q17350442) | Venue | | 65 | [Q41253](https://www.wikidata.org/entity/Q41253) | Movie Theater | [amenity=cinema](https://wiki.openstreetmap.org/wiki/Tag:amenity=cinema) | 66 | [Q483110](https://www.wikidata.org/entity/Q483110) | Stadium | [leisure=stadium](https://wiki.openstreetmap.org/wiki/Tag:leisure=stadium), [building=stadium](https://wiki.openstreetmap.org/wiki/Tag:building=stadium) | 67 | [Q24354](https://www.wikidata.org/entity/Q24354) | Theater (structure) | [amenity=theatre](https://wiki.openstreetmap.org/wiki/Tag:amenity=theatre) | 68 | [Q121359](https://www.wikidata.org/entity/Q121359) | Infrastructure | | 69 | [Q1248784](https://www.wikidata.org/entity/Q1248784) | Airport | | 70 | [Q12323](https://www.wikidata.org/entity/Q12323) | Dam | [waterway=dam](https://wiki.openstreetmap.org/wiki/Tag:waterway=dam) | 71 | [Q1353183](https://www.wikidata.org/entity/Q1353183) | Launch pad | | 72 | [Q105190](https://www.wikidata.org/entity/Q105190) | Levee | [man_made=dyke](https://wiki.openstreetmap.org/wiki/Tag:man_made=dyke) | 73 | [Q105731](https://www.wikidata.org/entity/Q105731) | Lock (water navigation) | [lock=yes](https://wiki.openstreetmap.org/wiki/Key:lock) | 74 | [Q44782](https://www.wikidata.org/entity/Q44782) | Port | | 75 | [Q159719](https://www.wikidata.org/entity/Q159719) | Power station | [power=plant](https://wiki.openstreetmap.org/wiki/Tag:power=plant) | 76 | [Q174814](https://www.wikidata.org/entity/Q174814) | Electrical substation | | 77 | [Q134447](https://www.wikidata.org/entity/Q134447) | Nuclear power plant | [plant:source=nuclear](https://wiki.openstreetmap.org/wiki/Tag:plant:source=nuclear) | 78 | [Q786014](https://www.wikidata.org/entity/Q786014) | Rest area | [highway=rest_area](https://wiki.openstreetmap.org/wiki/Tag:highway=rest_area), [highway=services](https://wiki.openstreetmap.org/wiki/Tag:highway=services) | 79 | [Q12280](https://www.wikidata.org/entity/Q12280) | Bridge | [bridge=* 
](https://wiki.openstreetmap.org/wiki/Key:bridge), [man_made=bridge](https://wiki.openstreetmap.org/wiki/Tag:man_made=bridge) | 80 | [Q728937](https://www.wikidata.org/entity/Q728937) | Railroad Line | [railway=rail](https://wiki.openstreetmap.org/wiki/Tag:railway=rail) | 81 | [Q1311958](https://www.wikidata.org/entity/Q1311958) | Railway Tunnel | | 82 | [Q34442](https://www.wikidata.org/entity/Q34442) | Road | [highway=* ](https://wiki.openstreetmap.org/wiki/Key:highway), [route=road](https://wiki.openstreetmap.org/wiki/Tag:route=road) | 83 | [Q1788454](https://www.wikidata.org/entity/Q1788454) | Road junction | | 84 | [Q44377](https://www.wikidata.org/entity/Q44377) | Tunnel | [tunnel=* ](https://wiki.openstreetmap.org/wiki/Key:tunnel) | 85 | [Q5031071](https://www.wikidata.org/entity/Q5031071) | Canal tunnel | | 86 | [Q719456](https://www.wikidata.org/entity/Q719456) | Station | [public_transport=station](https://wiki.openstreetmap.org/wiki/Tag:public_transport=station) | 87 | [Q205495](https://www.wikidata.org/entity/Q205495) | Filling station | [amenity=fuel](https://wiki.openstreetmap.org/wiki/Tag:amenity=fuel) | 88 | [Q928830](https://www.wikidata.org/entity/Q928830) | Metro station | [station=subway](https://wiki.openstreetmap.org/wiki/Tag:station=subway) | 89 | [Q55488](https://www.wikidata.org/entity/Q55488) | Train station | [railway=station](https://wiki.openstreetmap.org/wiki/Tag:railway=station) | 90 | [Q2175765](https://www.wikidata.org/entity/Q2175765) | Tram stop | [railway=tram_stop](https://wiki.openstreetmap.org/wiki/Tag:railway=tram_stop), [public_transport=stop_position](https://wiki.openstreetmap.org/wiki/Tag:public_transport=stop_position) | 91 | [Q6852233](https://www.wikidata.org/entity/Q6852233) | Military building | | 92 | [Q44494](https://www.wikidata.org/entity/Q44494) | Mill (grinding) | | 93 | [Q185187](https://www.wikidata.org/entity/Q185187) | Watermill | [man_made=watermill](https://wiki.openstreetmap.org/wiki/Tag:man_made=watermill) | 94 | [Q38720](https://www.wikidata.org/entity/Q38720) | Windmill | [man_made=windmill](https://wiki.openstreetmap.org/wiki/Tag:man_made=windmill) | 95 | [Q4989906](https://www.wikidata.org/entity/Q4989906) | Monument | [historic=monument](https://wiki.openstreetmap.org/wiki/Tag:historic=monument) | 96 | [Q5003624](https://www.wikidata.org/entity/Q5003624) | Memorial | [historic=memorial](https://wiki.openstreetmap.org/wiki/Tag:historic=memorial) | 97 | [Q271669](https://www.wikidata.org/entity/Q271669) | Landform | | 98 | [Q190429](https://www.wikidata.org/entity/Q190429) | Depression (geology) | | 99 | [Q17018380](https://www.wikidata.org/entity/Q17018380) | Bight (geography) | | 100 | [Q54050](https://www.wikidata.org/entity/Q54050) | Hill | | 101 | [Q1210950](https://www.wikidata.org/entity/Q1210950) | Channel (geography) | | 102 | [Q23442](https://www.wikidata.org/entity/Q23442) | Island | [place=island](https://wiki.openstreetmap.org/wiki/Tag:place=island) | 103 | [Q42523](https://www.wikidata.org/entity/Q42523) | Atoll | | 104 | [Q34763](https://www.wikidata.org/entity/Q34763) | Peninsula | | 105 | [Q355304](https://www.wikidata.org/entity/Q355304) | Watercourse | | 106 | [Q30198](https://www.wikidata.org/entity/Q30198) | Marsh | [wetland=marsh](https://wiki.openstreetmap.org/wiki/Tag:wetland=marsh) | 107 | [Q75520](https://www.wikidata.org/entity/Q75520) | Plateau | | 108 | [Q2042028](https://www.wikidata.org/entity/Q2042028) | Ravine | | 109 | [Q631305](https://www.wikidata.org/entity/Q631305) | Rock formation | | 
110 | [Q12516](https://www.wikidata.org/entity/Q12516) | Pyramid | | 111 | [Q1076486](https://www.wikidata.org/entity/Q1076486) | Sports venue | | 112 | [Q682943](https://www.wikidata.org/entity/Q682943) | Cricket field | [sport=cricket](https://wiki.openstreetmap.org/wiki/Tag:sport=cricket) | 113 | [Q1048525](https://www.wikidata.org/entity/Q1048525) | Golf course | [leisure=golf_course](https://wiki.openstreetmap.org/wiki/Tag:leisure=golf_course) | 114 | [Q1777138](https://www.wikidata.org/entity/Q1777138) | Race track | [highway=raceway](https://wiki.openstreetmap.org/wiki/Tag:highway=raceway) | 115 | [Q130003](https://www.wikidata.org/entity/Q130003) | Ski resort | | 116 | [Q174782](https://www.wikidata.org/entity/Q174782) | Town square | [place=square](https://wiki.openstreetmap.org/wiki/Tag:place=square) | 117 | [Q12518](https://www.wikidata.org/entity/Q12518) | Tower | [building=tower](https://wiki.openstreetmap.org/wiki/Tag:building=tower), [man_made=tower](https://wiki.openstreetmap.org/wiki/Tag:man_made=tower) | 118 | [Q39715](https://www.wikidata.org/entity/Q39715) | Lighthouse | [man_made=lighthouse](https://wiki.openstreetmap.org/wiki/Tag:man_made=lighthouse) | 119 | [Q274153](https://www.wikidata.org/entity/Q274153) | Water tower | [building=water_tower](https://wiki.openstreetmap.org/wiki/Tag:building=water_tower), [man_made=water_tower](https://wiki.openstreetmap.org/wiki/Tag:man_made=water_tower) | 120 | [Q43501](https://www.wikidata.org/entity/Q43501) | Zoo | [tourism=zoo](https://wiki.openstreetmap.org/wiki/Tag:tourism=zoo) | 121 | [Q39614](https://www.wikidata.org/entity/Q39614) | Cemetery | [amenity=grave_yard](https://wiki.openstreetmap.org/wiki/Tag:amenity=grave_yard), [landuse=cemetery](https://wiki.openstreetmap.org/wiki/Tag:landuse=cemetery) | 122 | [Q152081](https://www.wikidata.org/entity/Q152081) | Concentration camp | | 123 | [Q1107656](https://www.wikidata.org/entity/Q1107656) | Garden | [leisure=garden](https://wiki.openstreetmap.org/wiki/Tag:leisure=garden) | 124 | [Q820477](https://www.wikidata.org/entity/Q820477) | Mine | | 125 | [Q33837](https://www.wikidata.org/entity/Q33837) | Archipelago | [place=archipelago](https://wiki.openstreetmap.org/wiki/Tag:place=archipelago) | 126 | [Q40080](https://www.wikidata.org/entity/Q40080) | Beach | [natural=beach](https://wiki.openstreetmap.org/wiki/Tag:natural=beach) | 127 | [Q15324](https://www.wikidata.org/entity/Q15324) | Body of water | [natural=water](https://wiki.openstreetmap.org/wiki/Tag:natural=water) | 128 | [Q23397](https://www.wikidata.org/entity/Q23397) | Lake | [water=lake](https://wiki.openstreetmap.org/wiki/Tag:water=lake) | 129 | [Q9430](https://www.wikidata.org/entity/Q9430) | Ocean | | 130 | [Q165](https://www.wikidata.org/entity/Q165) | Sea | | 131 | [Q47521](https://www.wikidata.org/entity/Q47521) | Stream | | 132 | [Q12284](https://www.wikidata.org/entity/Q12284) | Canal | [waterway=canal](https://wiki.openstreetmap.org/wiki/Tag:waterway=canal) | 133 | [Q4022](https://www.wikidata.org/entity/Q4022) | River | [waterway=river](https://wiki.openstreetmap.org/wiki/Tag:waterway=river), [type=waterway](https://wiki.openstreetmap.org/wiki/Relation:waterway) | 134 | [Q185113](https://www.wikidata.org/entity/Q185113) | Cape | [natural=cape](https://wiki.openstreetmap.org/wiki/Tag:natural=cape) | 135 | [Q35509](https://www.wikidata.org/entity/Q35509) | Cave | [natural=cave_entrance](https://wiki.openstreetmap.org/wiki/Tag:natural=cave_entrance) | 136 | [Q8514](https://www.wikidata.org/entity/Q8514) | 
Desert | | 137 | [Q4421](https://www.wikidata.org/entity/Q4421) | Forest | [natural=wood](https://wiki.openstreetmap.org/wiki/Tag:natural=wood) | 138 | [Q35666](https://www.wikidata.org/entity/Q35666) | Glacier | [natural=glacier](https://wiki.openstreetmap.org/wiki/Tag:natural=glacier) | 139 | [Q177380](https://www.wikidata.org/entity/Q177380) | Hot spring | | 140 | [Q8502](https://www.wikidata.org/entity/Q8502) | Mountain | [natural=peak](https://wiki.openstreetmap.org/wiki/Tag:natural=peak) | 141 | [Q133056](https://www.wikidata.org/entity/Q133056) | Mountain pass | | 142 | [Q46831](https://www.wikidata.org/entity/Q46831) | Mountain range | | 143 | [Q39816](https://www.wikidata.org/entity/Q39816) | Valley | [natural=valley](https://wiki.openstreetmap.org/wiki/Tag:natural=valley) | 144 | [Q8072](https://www.wikidata.org/entity/Q8072) | Volcano | [natural=volcano](https://wiki.openstreetmap.org/wiki/Tag:natural=volcano) | 145 | [Q43229](https://www.wikidata.org/entity/Q43229) | Organization | | 146 | [Q327333](https://www.wikidata.org/entity/Q327333) | Government agency | [office=government](https://wiki.openstreetmap.org/wiki/Tag:office=government)| 147 | [Q22698](https://www.wikidata.org/entity/Q22698) | Park | [leisure=park](https://wiki.openstreetmap.org/wiki/Tag:leisure=park) | 148 | [Q159313](https://www.wikidata.org/entity/Q159313) | Urban agglomeration | | 149 | [Q177634](https://www.wikidata.org/entity/Q177634) | Community | | 150 | [Q5107](https://www.wikidata.org/entity/Q5107) | Continent | [place=continent](https://wiki.openstreetmap.org/wiki/Tag:place=continent) | 151 | [Q6256](https://www.wikidata.org/entity/Q6256) | Country | [place=country](https://wiki.openstreetmap.org/wiki/Tag:place=country) | 152 | [Q75848](https://www.wikidata.org/entity/Q75848) | Gated community | | 153 | [Q3153117](https://www.wikidata.org/entity/Q3153117) | Intercommunality | | 154 | [Q82794](https://www.wikidata.org/entity/Q82794) | Region | | 155 | [Q56061](https://www.wikidata.org/entity/Q56061) | Administrative division | [boundary=administrative](https://wiki.openstreetmap.org/wiki/Tag:boundary=administrative) | 156 | [Q665487](https://www.wikidata.org/entity/Q665487) | Diocese | | 157 | [Q4976993](https://www.wikidata.org/entity/Q4976993) | Parish | [boundary=civil_parish](https://wiki.openstreetmap.org/wiki/Tag:boundary=civil_parish) | 158 | [Q194203](https://www.wikidata.org/entity/Q194203) | Arrondissements of France | | 159 | [Q91028](https://www.wikidata.org/entity/Q91028) | Arrondissements of Belgium | | 160 | [Q3623867](https://www.wikidata.org/entity/Q3623867) | Arrondissements of Benin | | 161 | [Q2311958](https://www.wikidata.org/entity/Q2311958) | Canton (country subdivision) | [political_division=canton](https://wiki.openstreetmap.org/wiki/FR:Cantons_in_France) | 162 | [Q643589](https://www.wikidata.org/entity/Q643589) | Department | | 163 | [Q202216](https://www.wikidata.org/entity/Q202216) | Overseas department and region | | 164 | [Q149621](https://www.wikidata.org/entity/Q149621) | District | [place=district](https://wiki.openstreetmap.org/wiki/Tag:place=district) | 165 | [Q15243209](https://www.wikidata.org/wiki/Q15243209) | Historic district | | 166 | [Q5144960](https://www.wikidata.org/entity/Q5144960) | Microregion | | 167 | [Q15284](https://www.wikidata.org/entity/Q15284) | Municipality | | 168 | [Q515716](https://www.wikidata.org/entity/Q515716) | Prefecture | | 169 | [Q34876](https://www.wikidata.org/entity/Q34876) | Province | | 170 | 
[Q3191695](https://www.wikidata.org/entity/Q3191695) | Regency (Indonesia) | | 171 | [Q1970725](https://www.wikidata.org/entity/Q1970725) | Natural region | | 172 | [Q486972](https://www.wikidata.org/entity/Q486972) | Human settlement | | 173 | [Q515](https://www.wikidata.org/entity/Q515) | City | [place=city](https://wiki.openstreetmap.org/wiki/Tag:place=city) | 174 | [Q5119](https://www.wikidata.org/entity/Q5119) | Capital city | [capital=yes](https://wiki.openstreetmap.org/wiki/Key:capital) | 175 | [Q4286337](https://www.wikidata.org/entity/Q4286337) | City district | | 176 | [Q1394476](https://www.wikidata.org/entity/Q1394476) | Civil township | | 177 | [Q1115575](https://www.wikidata.org/entity/Q1115575) | Civil parish | [designation=civil_parish](https://wiki.openstreetmap.org/wiki/Tag:designation=civil_parish) | 178 | [Q5153984](https://www.wikidata.org/entity/Q5153984) | Commune-level subdivisions | | 179 | [Q123705](https://www.wikidata.org/entity/Q123705) | Neighbourhood | [place=neighbourhood](https://wiki.openstreetmap.org/wiki/Tag:place=neighbourhood) | 180 | [Q1500350](https://www.wikidata.org/entity/Q1500350) | Townships of China | | 181 | [Q17343829](https://www.wikidata.org/entity/Q17343829) | Unincorporated Community | | 182 | [Q3957](https://www.wikidata.org/entity/Q3957) | Town | [place=town](https://wiki.openstreetmap.org/wiki/Tag:place=town) | 183 | [Q532](https://www.wikidata.org/entity/Q532) | Village | [place=village](https://wiki.openstreetmap.org/wiki/Tag:place=village) | 184 | [Q5084](https://www.wikidata.org/entity/Q5084) | Hamlet | [place=hamlet](https://wiki.openstreetmap.org/wiki/Tag:place=hamlet) | 185 | [Q7275](https://www.wikidata.org/entity/Q7275) | State | | 186 | [Q79007](https://www.wikidata.org/entity/Q79007) | Street | | 187 | [Q473972](https://www.wikidata.org/entity/Q473972) | Protected area | [boundary=protected_area](https://wiki.openstreetmap.org/wiki/Tag:boundary=protected_area) | 188 | [Q1377575](https://www.wikidata.org/entity/Q1377575) | Wildlife refuge | | 189 | [Q1410668](https://www.wikidata.org/entity/Q1410668) | National Wildlife Refuge | [protection_title=National Wildlife Refuge](https://wiki.openstreetmap.org/wiki/Key:protection_title), [ownership=national](https://wiki.openstreetmap.org/wiki/Tag:ownership=national) | 190 | [Q9259](https://www.wikidata.org/entity/Q9259) | World Heritage Site | | 191 | 192 | --- 193 | 194 | ### Future Work 195 | 196 | The Wikidata improvements to Nominatim can be further enhanced by: 197 | 198 | - continuing to add new Wikidata links to OSM objects 199 | - increasing the number of place types accounted for in the wikipedia_articles table 200 | - working to use place types in the wikipedia_article matching process (a toy lookup sketch follows below) 201 | --------------------------------------------------------------------------------
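As a rough illustration of the last item above, the hypothetical sketch below shows one way the cross-reference table in wikidata_places.md could be turned into a QID-to-OSM-tag lookup for matching. The `QID_TO_OSM_TAG` dictionary and the `resolve_osm_tag` helper are invented for this example and cover only a handful of rows; they are not part of the repository.

```python
# Hypothetical sketch: resolve a Wikidata "instance of" (P31) value to an OSM
# tag using a few rows taken from the table above. Illustrative only.
QID_TO_OSM_TAG = {
    'Q515': 'place=city',        # City
    'Q3957': 'place=town',       # Town
    'Q532': 'place=village',     # Village
    'Q5084': 'place=hamlet',     # Hamlet
    'Q23397': 'water=lake',      # Lake
    'Q8502': 'natural=peak',     # Mountain
}


def resolve_osm_tag(instance_of_qids):
    """Return the first known OSM tag for a list of P31 values, or None."""
    for qid in instance_of_qids:
        if qid in QID_TO_OSM_TAG:
            return QID_TO_OSM_TAG[qid]
    return None


# An item declared both as "human settlement" (Q486972, no tag in the table)
# and as "city" (Q515) resolves to place=city.
print(resolve_osm_tag(['Q486972', 'Q515']))  # -> place=city
```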