├── LICENSE ├── MANIFEST.in ├── README.md ├── geograpy2 ├── __init__.py ├── extraction.py ├── extraction.pyc ├── place.py └── placecontext.py ├── setup.py └── tests ├── __init__.py ├── test_extractor.py ├── test_package.py ├── test_place.py └── test_placecontext.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Corollarium 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Geograpy2 2 | ========= 3 | 4 | Extract place names from a URL or text, and add context to those names -- for 5 | example distinguishing between a country, region or city. 6 | 7 | 8 | ## Install & Setup 9 | 10 | Grab the package using `pip` (this will take a few minutes) 11 | 12 | pip install geograpy2 13 | 14 | Geograpy2 uses [NLTK](http://www.nltk.org/) for entity recognition, so you'll also need 15 | to download the models we're using. Fortunately there's a command that'll take 16 | care of this for you. 17 | 18 | geograpy-nltk 19 | 20 | ## Basic Usage 21 | 22 | Import the module, give some text or a URL, and presto. 23 | 24 | import geograpy2 25 | url = 'http://www.bbc.com/news/world-europe-26919928' 26 | places = geograpy2.get_place_context(url=url) 27 | 28 | 29 | ## Credits 30 | 31 | Geograpy2 is a fork of [geograpy](https://github.com/ushahidi/geograpy) and inherits 32 | most of it, but solves several problems (such as support for utf8, places names 33 | with multiple words, confusion over homonyms etc). 34 | 35 | Geograpy2 uses the following excellent libraries: 36 | 37 | * [NLTK](http://www.nltk.org/) for entity recognition 38 | * [newspaper](https://github.com/codelucas/newspaper) for text extraction from HTML 39 | * [jellyfish](https://github.com/sunlightlabs/jellyfish) for fuzzy text match 40 | * [pycountry](https://pypi.python.org/pypi/pycountry) for country/region lookups 41 | 42 | Geograpy uses the following data sources: 43 | 44 | * [GeoLite2](http://dev.maxmind.com/geoip/geoip2/geolite2/) for city lookups 45 | * [ISO3166ErrorDictionary](https://github.com/bodacea/countryname/blob/master/countryname/databases/ISO3166ErrorDictionary.csv) for common country mispellings _via [Sara-Jayne Terp](https://github.com/bodacea)_ 46 | 47 | Hat tip to [Chris Albon](https://github.com/chrisalbon) for the name. 48 | 49 | Released under the MIT license. -------------------------------------------------------------------------------- /geograpy2/__init__.py: -------------------------------------------------------------------------------- 1 | from extraction import Extractor 2 | from placecontext import PlaceContext 3 | 4 | def get_place_context(url=None, text=None): 5 | e = Extractor(url=url, text=text) 6 | e.find_entities() 7 | 8 | pc = PlaceContext(e.places) 9 | # pc.set_countries() 10 | # pc.set_regions() 11 | # pc.set_cities() 12 | # pc.set_other() 13 | return pc -------------------------------------------------------------------------------- /geograpy2/extraction.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from newspaper import Article 3 | from nltk.tag.stanford import POSTagger 4 | 5 | class Extractor(object): 6 | """Extracts possible place names from text. 7 | 8 | Attributes: 9 | text (str or unicode): The text to parse. Unicode is accepted. 10 | url (list of str): The url to parse, if there is one. 11 | places (list): The list of possible place names found. 12 | """ 13 | 14 | def __init__(self, text=None, url=None): 15 | """Inits the parser. 16 | 17 | Args: 18 | text (str or unicode): The text to parse. Unicode is accepted. 19 | url (str): Alternatively pass a url, which will be downloaded and 20 | stripped of HTML. 21 | """ 22 | if not text and not url: 23 | raise Exception('text or url is required') 24 | 25 | self.text = text 26 | self.url = url 27 | self.places = [] 28 | 29 | if self.url is not None: 30 | self.download_text() 31 | 32 | def download_text(self): 33 | """Downloads text from self.url and strip HTML tags. 34 | """ 35 | if not self.text and self.url: 36 | a = Article(self.url) 37 | a.download() 38 | a.parse() 39 | self.text = a.text 40 | 41 | def named_entities(self): 42 | # word_tokenize should work well for most non-CJK languages 43 | text = nltk.word_tokenize(self.text) 44 | 45 | # TODO: this works only for english. Stanford's pos tagger supports 46 | # more languages 47 | # http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford 48 | # http://stackoverflow.com/questions/1639855/pos-tagging-in-german 49 | # PT corpus http://aelius.sourceforge.net/manual.html 50 | # 51 | pos_tag = nltk.pos_tag(text) 52 | 53 | nes = nltk.ne_chunk(pos_tag) 54 | return nes 55 | 56 | 57 | def find_entities(self): 58 | """Parse text and tokenize it. 59 | """ 60 | nes = self.named_entities() 61 | for ne in nes: 62 | if type(ne) is nltk.tree.Tree: 63 | if ne.label() in ['GPE', 'PERSON', 'ORGANIZATION']: 64 | self.places.append(u' '.join([i[0] for i in ne.leaves()])) 65 | -------------------------------------------------------------------------------- /geograpy2/extraction.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Corollarium/geograpy2/aa2c8f9602d01059a2f2c134d3d0b66b32f53940/geograpy2/extraction.pyc -------------------------------------------------------------------------------- /geograpy2/place.py: -------------------------------------------------------------------------------- 1 | 2 | class Place: 3 | def __init__(self, city = None, region = None, country = None): 4 | self.city = city 5 | self.region = region 6 | self.country = country -------------------------------------------------------------------------------- /geograpy2/placecontext.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import pycountry 5 | import sqlite3 6 | from collections import Counter 7 | import place 8 | 9 | """ 10 | Takes a list of place names and works place designation (country, region, etc) 11 | and relationships between places (city is inside region is inside country, etc) 12 | """ 13 | class PlaceContext(object): 14 | """ 15 | Attributes: 16 | places (list of place): The list of possible place names found. 17 | names (list of unicode): The list of possible place names found. 18 | conn (object): 19 | 20 | Raises: 21 | IOError: if cannot write to DB 22 | 23 | """ 24 | 25 | def __init__(self, place_names, db_file=None): 26 | db_file = db_file or os.path.dirname(os.path.realpath(__file__)) + "/locs.db" 27 | open(db_file, 'w') # just checks if writing is allowed 28 | 29 | self.conn = sqlite3.connect(db_file) 30 | self.conn.text_factory = lambda x: unicode(x, 'utf-8', 'ignore') 31 | self.names = place_names 32 | self.places = [] 33 | 34 | def populate_db(self): 35 | cur = self.conn.cursor() 36 | cur.execute("DROP TABLE IF EXISTS cities") 37 | 38 | cur.execute("CREATE TABLE cities(geoname_id INTEGER, continent_code TEXT, continent_name TEXT, country_iso_code TEXT, country_name TEXT, subdivision_iso_code TEXT, subdivision_name TEXT, city_name TEXT, metro_code TEXT, time_zone TEXT)") 39 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 40 | with open(cur_dir+"/data/GeoLite2-City-Locations.csv", "rb") as info: 41 | reader = csv.reader(info) 42 | for row in reader: 43 | cur.execute("INSERT INTO cities VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", row) 44 | self.conn.commit() 45 | 46 | 47 | def db_has_data(self): 48 | cur = self.conn.cursor() 49 | 50 | cur.execute("SELECT Count(*) FROM sqlite_master WHERE name='cities';") 51 | data = cur.fetchone()[0] 52 | 53 | if data > 0: 54 | cur.execute("SELECT Count(*) FROM cities") 55 | data = cur.fetchone()[0] 56 | return data > 0 57 | 58 | return False 59 | 60 | 61 | def correct_country_mispelling(self, s): 62 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 63 | with open(cur_dir+"/data/ISO3166ErrorDictionary.csv", "rb") as info: 64 | reader = csv.reader(info) 65 | for row in reader: 66 | if s in remove_non_ascii(row[0]): 67 | return row[2] 68 | 69 | return s 70 | 71 | 72 | def is_a_country(self, s): 73 | s = self.correct_country_mispelling(s) 74 | try: 75 | pycountry.countries.get(name=s) 76 | return True 77 | except KeyError, e: 78 | return False 79 | 80 | 81 | 82 | def places_by_name(self, place_name, column_name): 83 | if not self.db_has_data(): 84 | self.populate_db() 85 | 86 | cur = self.conn.cursor() 87 | cur.execute('SELECT * FROM cities WHERE ' + column_name + ' = "' + place_name + '"') 88 | rows = cur.fetchall() 89 | 90 | if len(rows) > 0: 91 | return rows 92 | 93 | return None 94 | 95 | 96 | def cities_for_name(self, city_name): 97 | return self.places_by_name(city_name, 'city_name') 98 | 99 | 100 | def regions_for_name(self, region_name): 101 | return self.places_by_name(region_name, 'subdivision_name') 102 | 103 | 104 | def get_region_names(self, country_name): 105 | country_name = self.correct_country_mispelling(country_name) 106 | try: 107 | obj = pycountry.countries.get(name=country_name) 108 | regions = pycountry.subdivisions.get(country_code=obj.alpha2) 109 | except: 110 | regions = [] 111 | 112 | return [r.name for r in regions] 113 | 114 | 115 | def set_countries(self): 116 | countries = [self.correct_country_mispelling(place) 117 | for place in self.names if self.is_a_country(place)] 118 | 119 | self.country_mentions = Counter(countries).most_common() 120 | self.countries = list(set(countries)) 121 | 122 | 123 | def set_regions(self): 124 | regions = [] 125 | self.country_regions = {} 126 | region_names = {} 127 | 128 | if not self.countries: 129 | self.set_countries() 130 | 131 | def region_match(place_name, region_name): 132 | return fuzzy_match(remove_non_ascii(place_name), 133 | remove_non_ascii(region_name)) 134 | 135 | def is_region(place_name, region_names): 136 | return filter(lambda rn: region_match(place_name, rn), region_names) 137 | 138 | for country in self.countries: 139 | region_names = self.get_region_names(country) 140 | matched_regions = [p for p in self.names if is_region(p, region_names)] 141 | 142 | regions += matched_regions 143 | self.country_regions[country] = list(set(matched_regions)) 144 | 145 | self.region_mentions = Counter(regions).most_common() 146 | self.regions = list(set(regions)) 147 | 148 | 149 | def set_cities(self): 150 | self.cities = [] 151 | self.country_cities = {} 152 | self.address_strings = [] 153 | 154 | if not self.countries: 155 | self.set_countries() 156 | 157 | if not self.regions: 158 | self.set_regions() 159 | 160 | if not self.db_has_data(): 161 | self.populate_db() 162 | 163 | cur = self.conn.cursor() 164 | cur.execute("SELECT * FROM cities WHERE city_name IN (" + ",".join("?"*len(self.names)) + ")", self.names) 165 | rows = cur.fetchall() 166 | 167 | for row in rows: 168 | country = None 169 | 170 | try: 171 | country = pycountry.countries.get(alpha2=row[3]) 172 | country_name = country.name 173 | except KeyError, e: 174 | country_name = row[4] 175 | 176 | city_name = row[7] 177 | region_name = row[6] 178 | 179 | if city_name not in self.cities: 180 | self.cities.append(city_name) 181 | 182 | if country_name not in self.countries: 183 | self.countries.append(country_name) 184 | self.country_mentions.append((country_name,1)) 185 | 186 | if country_name not in self.country_cities: 187 | self.country_cities[country.name] = [] 188 | 189 | if city_name not in self.country_cities[country_name]: 190 | self.country_cities[country_name].append(city_name) 191 | 192 | if country_name in self.country_regions and region_name in self.country_regions[country_name]: 193 | self.address_strings.append(city_name + ", " + region_name + ", " + country_name) 194 | 195 | 196 | all_cities = [p for p in self.names if p in self.cities] 197 | self.city_mentions = Counter(all_cities).most_common() 198 | 199 | 200 | def set_other(self): 201 | if not self.cities: 202 | self.set_cities() 203 | 204 | def unused(place_name): 205 | places = [self.countries, self.cities, self.regions] 206 | return all(self.correct_country_mispelling(place_name) not in l for l in places) 207 | 208 | self.other = [p for p in self.names if unused(p)] 209 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import os 3 | 4 | try: 5 | import pypandoc 6 | long_description = pypandoc.convert('README.md', 'rst') 7 | except (IOError, ImportError): 8 | long_description = open('README.md').read() 9 | 10 | setup(name='geograpy2', 11 | version='0.1.0', 12 | description='Extract countries, regions and cities from a URL or text', 13 | long_description=long_description, 14 | url='https://github.com/Corollarium/geograpy2', 15 | download_url ='https://github.com/Corollarium/geograpy2', 16 | author='Corollarium', 17 | author_email='email@corollarium.com', 18 | license='MIT', 19 | packages=['geograpy2'], 20 | install_requires=[ 21 | 'numpy', 22 | 'nltk', 23 | 'newspaper', 24 | 'jellyfish', 25 | 'pycountry' 26 | ], 27 | scripts=['geograpy/bin/geograpy-nltk'], 28 | package_data = { 29 | 'geograpy': ['data/*.csv'], 30 | }, 31 | zip_safe=False) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Corollarium/geograpy2/aa2c8f9602d01059a2f2c134d3d0b66b32f53940/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from geograpy2.extraction import Extractor 5 | 6 | class TestExtractor(unittest.TestCase): 7 | def testBBCNews(self): 8 | e = Extractor(url='http://www.bbc.com/news/world-europe-26919928') 9 | e.find_entities() 10 | 11 | self.assertLess(0, len(e.places)) 12 | self.assertIn('Russia', e.places) 13 | self.assertIn('Kiev', e.places) 14 | 15 | def testNairobi(self): 16 | text = """ Perfect just Perfect! It's a perfect storm for Nairobi on a 17 | Friday evening! horrible traffic here is your cue to become worse @Ma3Route """ 18 | 19 | e = Extractor(text=text) 20 | e.find_entities() 21 | 22 | self.assertLess(0, len(e.places)) 23 | assert 'Nairobi' in e.places 24 | 25 | def testNairobi2(self): 26 | text3 = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """ 27 | e = Extractor(text=text3) 28 | e.find_entities() 29 | 30 | self.assertLess(0, len(e.places)) 31 | self.assertIn('Nairobi', e.places) 32 | 33 | def testNairobi3(self): 34 | text4 = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """ 35 | e = Extractor(text=text4) 36 | e.find_entities() 37 | 38 | self.assertLess(0, len(e.places)) 39 | self.assertIn('Nairobi', e.places) 40 | self.assertIn('Ngong', e.places) 41 | 42 | def testNewYork(self): 43 | # unicode 44 | text5 = u""" There is a city called New York in the United States.""" 45 | e = Extractor(text=text5) 46 | e.find_entities() 47 | 48 | self.assertEqual(2, len(e.places)) 49 | assert u'New York' in e.places 50 | assert u'United States' in e.places 51 | 52 | def testSaoPaulo(self): 53 | # unicode and two words 54 | text6 = u""" There is a city called São Paulo in Brazil.""" 55 | e = Extractor(text=text6) 56 | e.find_entities() 57 | 58 | self.assertEqual(2, len(e.places)) 59 | self.assertIn(u'São Paulo', e.places) 60 | self.assertIn(u'Brazil', e.places) 61 | 62 | text6 = u""" There is a city called Sao Paulo in Brazil.""" 63 | e = Extractor(text=text6) 64 | e.find_entities() 65 | 66 | self.assertEqual(2, len(e.places)) 67 | self.assertIn(u'Sao Paulo', e.places) 68 | self.assertIn(u'Brazil', e.places) 69 | 70 | 71 | 72 | # def testSaoPauloPT(self): 73 | # # Portuguese, unicode and two words 74 | # text6 = u"""Há uma cidade chamada São Paulo no Brasil.""" 75 | # e = Extractor(text=text6) 76 | # e.find_entities() 77 | # 78 | # self.assertEqual(2, len(e.places)) 79 | # self.assertIn(u'São Paulo', e.places) 80 | # self.assertIn(u'Brasil', e.places) 81 | 82 | def main(): 83 | unittest.main() 84 | 85 | if __name__ == '__main__': 86 | main() -------------------------------------------------------------------------------- /tests/test_package.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | import geograpy2 5 | 6 | class TestExtractor(unittest.TestCase): 7 | def test_italy(self): 8 | text = u"""Qualsiasi cosa qui. Via Alberto da Padova, 232 - Padova Italy""" 9 | places = geograpy2.get_place_context(text=text) 10 | # TODO self.assert 11 | 12 | def main(): 13 | unittest.main() 14 | 15 | if __name__ == '__main__': 16 | main() -------------------------------------------------------------------------------- /tests/test_place.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from geograpy2.place import Place 5 | 6 | class TestPlace(unittest.TestCase): 7 | def test_basic(self): 8 | p = Place(city='city', region='region', country='country') 9 | self.assertEqual('city', p.city) 10 | self.assertEqual('region', p.region) 11 | self.assertEqual('country', p.country) 12 | -------------------------------------------------------------------------------- /tests/test_placecontext.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from geograpy2.placecontext import PlaceContext 5 | 6 | class TestPlaceContext(unittest.TestCase): 7 | 8 | def test_sao_paulo_brazil(self): 9 | pc = PlaceContext([u'São Paulo', 'Brazil']) 10 | 11 | def test_Kenya(self): 12 | pc = PlaceContext(['Ngong', 'Nairobi', 'Kenya']) 13 | 14 | # assert len(pc.countries) == 1 15 | # assert len(pc.cities) == 1 16 | # assert len(pc.other) == 1 17 | # assert 'Ngong' in pc.other 18 | # 19 | # assert pc.cities_for_name('Nairobi')[0][4] == 'Kenya' 20 | # assert pc.regions_for_name('Ohio')[0][4] == 'United States' 21 | 22 | def test_aleppo(self): 23 | pc = PlaceContext(['Aleppo', 'Syria']) 24 | 25 | # assert 'Aleppo' in pc.cities 26 | 27 | --------------------------------------------------------------------------------