├── .gitignore ├── .travis.yml ├── README.md ├── requirements.txt ├── scripts └── wd ├── setup.py ├── test.py └── wikidata_suggest.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | wikidata_suggest.egg-info/* 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | script: python setup.py test 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wikidata_suggest 2 | 3 | [![Build Status](https://secure.travis-ci.org/edsu/wikidata_suggest.png)](http://travis-ci.org/edsu/wikidata_suggest) 4 | 5 | `wikidata_suggest` is a simple command line tool for interactively reconciling 6 | your data against [Wikidata](https://wikidata.org). First you'll want to 7 | install: 8 | 9 | % pip install wikidata_suggest 10 | 11 | Once you've installed it you will get a command line tool `wd`: 12 | 13 | ![](http://edsu.github.io/wikidata_suggest/images/screenshot1.png) 14 | 15 | Most likely you will want to use wikidata_suggest as a little data 16 | cleansing/augmentation library. For example if you have a CSV spreadsheet 17 | that has an *author* column that you'd like to link up to Wikidata, you 18 | can do something like this: 19 | 20 | ```python 21 | 22 | import csv 23 | 24 | from wikidata_suggest import suggest 25 | 26 | reader = csv.reader(open("data.csv")) 27 | writer = csv.writer(open("new_data.csv", "wb")) 28 | 29 | # read the csv 30 | for row in reader: 31 | 32 | # column 2 has author names 33 | author = row[1] 34 | 35 | # drop into interactive session 36 | wikidata = suggest(author) 37 | 38 | if wikidata: 39 | row.append(wikidata["id"]) 40 | else: 41 | row.append(None) 42 | 43 | # write our new row 44 | writer.writerow(row) 45 | 46 | reader.close() 47 | writer.close() 48 | ``` 49 | 50 | 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | colorama 3 | requests 4 | -------------------------------------------------------------------------------- /scripts/wd: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import json 5 | import wikidata_suggest 6 | 7 | if __name__ == "__main__": 8 | if len(sys.argv) > 1: 9 | name = ' '.join(sys.argv[1:]) 10 | else: 11 | name = raw_input("search for: ") 12 | try: 13 | result = wikidata_suggest.suggest(name) 14 | if result != None: 15 | print json.dumps(result, indent=2) 16 | except wikidata_suggest.Quit: 17 | pass 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Command 2 | 3 | class PyTest(Command): 4 | """ 5 | A command to convince setuptools to run pytests. 6 | """ 7 | user_options = [] 8 | def initialize_options(self): 9 | pass 10 | def finalize_options(self): 11 | pass 12 | def run(self): 13 | import pytest 14 | pytest.main("test.py") 15 | 16 | setup( 17 | name = 'wikidata_suggest', 18 | version = '0.0.7', 19 | url = 'http://github.com/edsu/wikidata_suggest', 20 | author = 'Ed Summers', 21 | author_email = 'ehs@pobox.com', 22 | py_modules = ['wikidata_suggest'], 23 | install_requires = ['requests', 'colorama'], 24 | tests_require=['pytest'], 25 | scripts = ['scripts/wd'], 26 | cmdclass = {'test': PyTest}, 27 | description = 'Interactively look up Wikidata entities from the command line', 28 | license='MIT License' 29 | ) 30 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import pytest 5 | from wikidata_suggest import suggest, _wikidata, _wikipedia 6 | 7 | def test_wikidata(): 8 | results = _wikidata('tolstoi') 9 | assert len(results['search']) > 0 10 | 11 | def test_wikidata_missing(): 12 | results = _wikidata('Catherine Breshkovksy') 13 | assert len(results['search']) == 0 14 | 15 | def test_wikipedia(): 16 | assert _wikipedia('Catherine Breshkovksy') == "Catherine Breshkovsky" 17 | 18 | def test_wikipedia_another(): 19 | results = _wikipedia('leo nikolaievich tolstoy') 20 | assert "Leo Tolstoy" in results 21 | -------------------------------------------------------------------------------- /wikidata_suggest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re 4 | import sys 5 | import json 6 | import time 7 | import requests 8 | 9 | from colorama import Fore, Style, init 10 | 11 | cache = {} 12 | 13 | def suggest(orig_name): 14 | 15 | name = orig_name.strip() 16 | 17 | if name == None or name == "": 18 | return None 19 | 20 | if name in cache: 21 | return cache[name] 22 | 23 | init() 24 | 25 | while True: 26 | 27 | print 28 | print(Fore.RED + 'Search: ' + name + Style.RESET_ALL) 29 | print 30 | 31 | # print wikidata suggestions 32 | wd_sug = _wikidata(name) 33 | 34 | count = 0 35 | for s in wd_sug['search']: 36 | count += 1 37 | label = s.get('label', '') 38 | description = s.get('description', 'https:' + s['url']) 39 | 40 | print(Fore.BLUE + str(count) + ')' + ' ' + Style.BRIGHT + label), 41 | if description: 42 | print(' - ' + description), 43 | print(Fore.RESET) 44 | 45 | # print wikipedia suggestions 46 | wp_sug = _wikipedia(name) 47 | if wp_sug: 48 | print(Fore.MAGENTA + 'W) Wikipedia suggests %s' % wp_sug + Fore.RESET) 49 | 50 | print(Fore.GREEN + 'N) none') 51 | print(Fore.YELLOW + 'O) other') 52 | print(Fore.RED + 'Q) quit') 53 | print(Style.RESET_ALL) 54 | 55 | try: 56 | choice = raw_input("Choice: ") 57 | except EOFError: 58 | print 59 | return None 60 | 61 | choice = choice.upper() 62 | if re.match('^\d+$', choice): 63 | r = wd_sug['search'][int(choice)-1] 64 | cache[orig_name] = r 65 | cache[name] = r 66 | return r 67 | elif wp_sug and choice[0] == "W": 68 | name = wp_sug 69 | elif choice[0] == "O": 70 | name = raw_input("Lookup: ") 71 | elif choice[0] == "N": 72 | cache[orig_name] = None 73 | cache[name] = None 74 | return None 75 | elif choice[0] == "Q": 76 | raise Quit() 77 | 78 | 79 | def _wikidata(name): 80 | url = "http://www.wikidata.org/w/api.php" 81 | params = { 82 | "search": name, 83 | "action": "wbsearchentities", 84 | "format": "json", 85 | "language": "en", 86 | "type": "item", 87 | "continue": "0", 88 | "limit": "10" 89 | } 90 | return requests.get(url, params=params).json() 91 | 92 | 93 | def _wikipedia(name, lang='en'): 94 | url = "https://%s.wikipedia.org/w/api.php" % lang 95 | params = { 96 | "action": "query", 97 | "list": "search", 98 | "format": "json", 99 | "srnamespace": "0", 100 | "srsearch": name 101 | } 102 | sug = None 103 | results = requests.get(url, params=params).json() 104 | if len(results['query']['search']) > 0: 105 | sug = results['query']['search'][0]['title'] 106 | elif 'suggestion' in results['query']['searchinfo'] and \ 107 | name != results['query']['searchinfo']['suggestion']: 108 | sug = _wikipedia(results['query']['searchinfo']['suggestion'], lang) 109 | return sug 110 | 111 | class Quit(Exception): 112 | pass 113 | --------------------------------------------------------------------------------