├── LICENSE
├── MANIFEST.in
├── README.md
├── geograpy2
    ├── __init__.py
    ├── extraction.py
    ├── extraction.pyc
    ├── place.py
    └── placecontext.py
├── setup.py
└── tests
    ├── __init__.py
    ├── test_extractor.py
    ├── test_package.py
    ├── test_place.py
    └── test_placecontext.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Corollarium
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Geograpy2
 2 | =========
 3 | 
 4 | Extract place names from a URL or text, and add context to those names -- for 
 5 | example distinguishing between a country, region or city. 
 6 | 
 7 | 
 8 | ## Install & Setup
 9 | 
10 | Grab the package using `pip` (this will take a few minutes)
11 | 
12 |     pip install geograpy2
13 | 
14 | Geograpy2 uses [NLTK](http://www.nltk.org/) for entity recognition, so you'll also need 
15 | to download the models we're using. Fortunately there's a command that'll take 
16 | care of this for you. 
17 | 
18 |     geograpy-nltk
19 | 
20 | ## Basic Usage
21 | 
22 | Import the module, give some text or a URL, and presto.
23 | 
24 |     import geograpy2
25 |     url = 'http://www.bbc.com/news/world-europe-26919928'
26 |     places = geograpy2.get_place_context(url=url)
27 | 
28 | 
29 | ## Credits
30 | 
31 | Geograpy2 is a fork of [geograpy](https://github.com/ushahidi/geograpy) and inherits
32 | most of it, but solves several problems (such as support for UTF-8, place names
33 | with multiple words, confusion over homonyms, etc.).
34 | 
35 | Geograpy2 uses the following excellent libraries:
36 | 
37 | * [NLTK](http://www.nltk.org/) for entity recognition
38 | * [newspaper](https://github.com/codelucas/newspaper) for text extraction from HTML
39 | * [jellyfish](https://github.com/sunlightlabs/jellyfish) for fuzzy text match
40 | * [pycountry](https://pypi.python.org/pypi/pycountry) for country/region lookups
41 | 
42 | Geograpy2 uses the following data sources:
43 | 
44 | * [GeoLite2](http://dev.maxmind.com/geoip/geoip2/geolite2/) for city lookups
45 | * [ISO3166ErrorDictionary](https://github.com/bodacea/countryname/blob/master/countryname/databases/ISO3166ErrorDictionary.csv) for common country misspellings _via [Sara-Jayne Terp](https://github.com/bodacea)_
46 | 
47 | Hat tip to [Chris Albon](https://github.com/chrisalbon) for the name.
48 | 
49 | Released under the MIT license.


--------------------------------------------------------------------------------
/geograpy2/__init__.py:
--------------------------------------------------------------------------------
 1 | from extraction import Extractor
 2 | from placecontext import PlaceContext
 3 | 
 4 | def get_place_context(url=None, text=None):
 5 |     e = Extractor(url=url, text=text)
 6 |     e.find_entities()
 7 | 
 8 |     pc = PlaceContext(e.places)
 9 | #     pc.set_countries()
10 | #     pc.set_regions()
11 | #     pc.set_cities()
12 | #     pc.set_other()
13 |     return pc


--------------------------------------------------------------------------------
/geograpy2/extraction.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | from newspaper import Article
 3 | from nltk.tag.stanford import POSTagger
 4 | 
 5 | class Extractor(object):
 6 |     """Extracts possible place names from text.
 7 | 
 8 |     Attributes:
 9 |       text (str or unicode): The text to parse. Unicode is accepted.
10 |       url (list of str): The url to parse, if there is one.
11 |       places (list): The list of possible place names found. 
12 |     """
13 | 
14 |     def __init__(self, text=None, url=None):
15 |         """Inits the parser.
16 |         
17 |         Args: 
18 |             text (str or unicode): The text to parse. Unicode is accepted.
19 |             url (str): Alternatively pass a url, which will be downloaded and
20 |                 stripped of HTML.
21 |         """
22 |         if not text and not url:
23 |             raise Exception('text or url is required')
24 | 
25 |         self.text = text
26 |         self.url = url
27 |         self.places = []
28 |         
29 |         if self.url is not None:
30 |             self.download_text()
31 |     
32 |     def download_text(self):
33 |         """Downloads text from self.url and strip HTML tags.
34 |         """
35 |         if not self.text and self.url:
36 |             a = Article(self.url)
37 |             a.download()
38 |             a.parse()
39 |             self.text = a.text
40 | 
41 |     def named_entities(self):
42 |         # word_tokenize should work well for most non-CJK languages
43 |         text = nltk.word_tokenize(self.text)
44 |         
45 |         # TODO: this works only for english. Stanford's pos tagger supports
46 |         # more languages
47 |         # http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
48 |         # http://stackoverflow.com/questions/1639855/pos-tagging-in-german
49 |         # PT corpus http://aelius.sourceforge.net/manual.html
50 |         # 
51 |         pos_tag = nltk.pos_tag(text)
52 |         
53 |         nes = nltk.ne_chunk(pos_tag)
54 |         return nes
55 |         
56 | 
57 |     def find_entities(self):
58 |         """Parse text and tokenize it.
59 |         """
60 |         nes = self.named_entities()
61 |         for ne in nes:
62 |             if type(ne) is nltk.tree.Tree:
63 |                 if ne.label() in ['GPE', 'PERSON', 'ORGANIZATION']:
64 |                     self.places.append(u' '.join([i[0] for i in ne.leaves()]))
65 | 


--------------------------------------------------------------------------------
/geograpy2/extraction.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Corollarium/geograpy2/aa2c8f9602d01059a2f2c134d3d0b66b32f53940/geograpy2/extraction.pyc


--------------------------------------------------------------------------------
/geograpy2/place.py:
--------------------------------------------------------------------------------
1 | 
class Place(object):
    """Simple record holding a city, a region and a country.

    Derives from ``object`` so it is a new-style class on Python 2, like the
    other classes in this package.

    Attributes:
      city: The city value passed to the constructor (default None).
      region: The region value passed to the constructor (default None).
      country: The country value passed to the constructor (default None).
    """

    def __init__(self, city=None, region=None, country=None):
        self.city = city
        self.region = region
        self.country = country

    def __repr__(self):
        return 'Place(city=%r, region=%r, country=%r)' % (
            self.city, self.region, self.country)


--------------------------------------------------------------------------------
/geograpy2/placecontext.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import csv
  4 | import pycountry
  5 | import sqlite3
  6 | from collections import Counter
  7 | import place
  8 | 
  9 | """
 10 | Takes a list of place names and works out each place's designation (country,
 11 | region, etc) and the relationships between places (a city is inside a region,
 11 | which is inside a country, etc).
 12 | """
 13 | class PlaceContext(object):
 14 |     """
 15 |     Attributes:
 16 |       places (list of place): The list of possible place names found. 
 17 |       names (list of unicode): The list of possible place names found.
 18 |       conn (object):
 19 |        
 20 |     Raises:
 21 |         IOError: if cannot write to DB
 22 | 
 23 |     """
 24 | 
 25 |     def __init__(self, place_names, db_file=None):
 26 |         db_file = db_file or os.path.dirname(os.path.realpath(__file__)) + "/locs.db"
 27 |         open(db_file, 'w') # just checks if writing is allowed
 28 | 
 29 |         self.conn = sqlite3.connect(db_file)
 30 |         self.conn.text_factory = lambda x: unicode(x, 'utf-8', 'ignore')
 31 |         self.names = place_names
 32 |         self.places = []
 33 | 
 34 |     def populate_db(self):
 35 |         cur = self.conn.cursor()
 36 |         cur.execute("DROP TABLE IF EXISTS cities")
 37 | 
 38 |         cur.execute("CREATE TABLE cities(geoname_id INTEGER, continent_code TEXT, continent_name TEXT, country_iso_code TEXT, country_name TEXT, subdivision_iso_code TEXT, subdivision_name TEXT, city_name TEXT, metro_code TEXT, time_zone TEXT)")
 39 |         cur_dir = os.path.dirname(os.path.realpath(__file__))
 40 |         with open(cur_dir+"/data/GeoLite2-City-Locations.csv", "rb") as info:
 41 |             reader = csv.reader(info)
 42 |             for row in reader:
 43 |                 cur.execute("INSERT INTO cities VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", row)
 44 |             self.conn.commit()
 45 | 
 46 | 
 47 |     def db_has_data(self):
 48 |         cur = self.conn.cursor()
 49 | 
 50 |         cur.execute("SELECT Count(*) FROM sqlite_master WHERE name='cities';")
 51 |         data = cur.fetchone()[0]
 52 | 
 53 |         if data > 0:
 54 |             cur.execute("SELECT Count(*) FROM cities")
 55 |             data = cur.fetchone()[0]
 56 |             return data > 0
 57 | 
 58 |         return False
 59 | 
 60 | 
 61 |     def correct_country_mispelling(self, s):
 62 |         cur_dir = os.path.dirname(os.path.realpath(__file__))
 63 |         with open(cur_dir+"/data/ISO3166ErrorDictionary.csv", "rb") as info:
 64 |             reader = csv.reader(info)
 65 |             for row in reader:
 66 |                 if s in remove_non_ascii(row[0]):
 67 |                     return row[2]
 68 | 
 69 |         return s
 70 | 
 71 |     
 72 |     def is_a_country(self, s): 
 73 |         s = self.correct_country_mispelling(s)
 74 |         try:
 75 |             pycountry.countries.get(name=s)
 76 |             return True
 77 |         except KeyError, e:
 78 |             return False
 79 | 
 80 |     
 81 |     
 82 |     def places_by_name(self, place_name, column_name):
 83 |         if not self.db_has_data():
 84 |             self.populate_db()
 85 | 
 86 |         cur = self.conn.cursor()
 87 |         cur.execute('SELECT * FROM cities WHERE ' + column_name + ' = "' + place_name + '"')
 88 |         rows = cur.fetchall()
 89 | 
 90 |         if len(rows) > 0:
 91 |             return rows
 92 | 
 93 |         return None
 94 | 
 95 | 
 96 |     def cities_for_name(self, city_name):
 97 |         return self.places_by_name(city_name, 'city_name')
 98 | 
 99 | 
100 |     def regions_for_name(self, region_name):
101 |         return self.places_by_name(region_name, 'subdivision_name')
102 | 
103 |     
104 |     def get_region_names(self, country_name):
105 |         country_name = self.correct_country_mispelling(country_name)
106 |         try:
107 |             obj = pycountry.countries.get(name=country_name)
108 |             regions = pycountry.subdivisions.get(country_code=obj.alpha2)
109 |         except:
110 |             regions = []
111 | 
112 |         return [r.name for r in regions]
113 | 
114 | 
115 |     def set_countries(self):
116 |         countries = [self.correct_country_mispelling(place) 
117 |             for place in self.names if self.is_a_country(place)]
118 | 
119 |         self.country_mentions = Counter(countries).most_common()
120 |         self.countries = list(set(countries))
121 | 
122 | 
123 |     def set_regions(self):
124 |         regions = []
125 |         self.country_regions = {}
126 |         region_names = {}
127 |         
128 |         if not self.countries:
129 |             self.set_countries()
130 | 
131 |         def region_match(place_name, region_name):
132 |             return fuzzy_match(remove_non_ascii(place_name), 
133 |                 remove_non_ascii(region_name))
134 | 
135 |         def is_region(place_name, region_names):
136 |             return filter(lambda rn: region_match(place_name, rn), region_names)
137 | 
138 |         for country in self.countries:
139 |             region_names = self.get_region_names(country)
140 |             matched_regions = [p for p in self.names if is_region(p, region_names)]
141 | 
142 |             regions += matched_regions
143 |             self.country_regions[country] = list(set(matched_regions))
144 | 
145 |         self.region_mentions = Counter(regions).most_common()
146 |         self.regions = list(set(regions))
147 | 
148 | 
149 |     def set_cities(self):
150 |         self.cities = []
151 |         self.country_cities = {}
152 |         self.address_strings = []
153 | 
154 |         if not self.countries:
155 |             self.set_countries()
156 | 
157 |         if not self.regions:
158 |             self.set_regions()
159 | 
160 |         if not self.db_has_data():
161 |             self.populate_db()
162 | 
163 |         cur = self.conn.cursor()
164 |         cur.execute("SELECT * FROM cities WHERE city_name IN (" + ",".join("?"*len(self.names)) + ")", self.names)
165 |         rows = cur.fetchall()
166 | 
167 |         for row in rows:
168 |             country = None
169 |             
170 |             try:
171 |                 country = pycountry.countries.get(alpha2=row[3])
172 |                 country_name = country.name
173 |             except KeyError, e:
174 |                 country_name = row[4]
175 | 
176 |             city_name = row[7]
177 |             region_name = row[6]
178 | 
179 |             if city_name not in self.cities:
180 |                 self.cities.append(city_name)
181 | 
182 |             if country_name not in self.countries:
183 |                 self.countries.append(country_name)
184 |                 self.country_mentions.append((country_name,1))
185 | 
186 |             if country_name not in self.country_cities:
187 |                 self.country_cities[country.name] = []
188 |             
189 |             if city_name not in self.country_cities[country_name]:
190 |                 self.country_cities[country_name].append(city_name)
191 | 
192 |                 if country_name in self.country_regions and region_name in self.country_regions[country_name]:
193 |                     self.address_strings.append(city_name + ", " + region_name + ", " + country_name)
194 | 
195 | 
196 |         all_cities = [p for p in self.names if p in self.cities]
197 |         self.city_mentions = Counter(all_cities).most_common()
198 | 
199 | 
200 |     def set_other(self):
201 |         if not self.cities:
202 |             self.set_cities()
203 | 
204 |         def unused(place_name):
205 |             places = [self.countries, self.cities, self.regions]
206 |             return all(self.correct_country_mispelling(place_name) not in l for l in places)
207 | 
208 |         self.other = [p for p in self.names if unused(p)]
209 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import os
 3 | 
 4 | try:
 5 |    import pypandoc
 6 |    long_description = pypandoc.convert('README.md', 'rst')
 7 | except (IOError, ImportError):
 8 |    long_description = open('README.md').read()
 9 | 
10 | setup(name='geograpy2',
11 |       version='0.1.0',
12 |       description='Extract countries, regions and cities from a URL or text',
13 |       long_description=long_description,
14 |       url='https://github.com/Corollarium/geograpy2',
15 |       download_url ='https://github.com/Corollarium/geograpy2',
16 |       author='Corollarium',
17 |       author_email='email@corollarium.com',
18 |       license='MIT',
19 |       packages=['geograpy2'],
20 |       install_requires=[
21 |             'numpy',
22 |             'nltk',
23 |             'newspaper',
24 |             'jellyfish',
25 |             'pycountry'
26 |       ],
27 |       scripts=['geograpy/bin/geograpy-nltk'],
28 |       package_data = {
29 |             'geograpy': ['data/*.csv'],
30 |       },
31 |       zip_safe=False)


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Corollarium/geograpy2/aa2c8f9602d01059a2f2c134d3d0b66b32f53940/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_extractor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import unittest
 4 | from geograpy2.extraction import Extractor
 5 | 
 6 | class TestExtractor(unittest.TestCase):
 7 |     def testBBCNews(self):
 8 |         e = Extractor(url='http://www.bbc.com/news/world-europe-26919928')
 9 |         e.find_entities()
10 |      
11 |         self.assertLess(0, len(e.places))
12 |         self.assertIn('Russia', e.places)
13 |         self.assertIn('Kiev', e.places)
14 |      
15 |     def testNairobi(self):
16 |         text = """ Perfect just Perfect! It's a perfect storm for Nairobi on a 
17 |         Friday evening! horrible traffic here is your cue to become worse @Ma3Route """
18 |      
19 |         e = Extractor(text=text)
20 |         e.find_entities()
21 |      
22 |         self.assertLess(0, len(e.places))
23 |         assert 'Nairobi' in e.places
24 |  
25 |     def testNairobi2(self):
26 |         text3 = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """
27 |         e = Extractor(text=text3)
28 |         e.find_entities()
29 |      
30 |         self.assertLess(0, len(e.places))
31 |         self.assertIn('Nairobi', e.places)
32 |  
33 |     def testNairobi3(self):
34 |         text4 = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """
35 |         e = Extractor(text=text4)
36 |         e.find_entities()
37 |  
38 |         self.assertLess(0, len(e.places))
39 |         self.assertIn('Nairobi', e.places)
40 |         self.assertIn('Ngong', e.places)
41 |  
42 |     def testNewYork(self):
43 |         # unicode
44 |         text5 = u""" There is a city called New York in the United States."""
45 |         e = Extractor(text=text5)
46 |         e.find_entities()
47 |  
48 |         self.assertEqual(2, len(e.places))
49 |         assert u'New York' in e.places
50 |         assert u'United States' in e.places
51 |  
52 |     def testSaoPaulo(self):
53 |         # unicode and two words
54 |         text6 = u""" There is a city called São Paulo in Brazil."""
55 |         e = Extractor(text=text6)
56 |         e.find_entities()
57 |  
58 |         self.assertEqual(2, len(e.places))
59 |         self.assertIn(u'São Paulo', e.places)
60 |         self.assertIn(u'Brazil', e.places)
61 | 
62 |         text6 = u""" There is a city called Sao Paulo in Brazil."""
63 |         e = Extractor(text=text6)
64 |         e.find_entities()
65 |  
66 |         self.assertEqual(2, len(e.places))
67 |         self.assertIn(u'Sao Paulo', e.places)
68 |         self.assertIn(u'Brazil', e.places)
69 |          
70 | 
71 |          
72 | #     def testSaoPauloPT(self):
73 | #         # Portuguese, unicode and two words
74 | #         text6 = u"""Há uma cidade chamada São Paulo no Brasil."""
75 | #         e = Extractor(text=text6)
76 | #         e.find_entities()
77 | #  
78 | #         self.assertEqual(2, len(e.places))
79 | #         self.assertIn(u'São Paulo', e.places)
80 | #         self.assertIn(u'Brasil', e.places)
81 | 
def main():
    """Run this module's test cases through the unittest command-line runner."""
    unittest.main()

# Allow running the test module directly as a script.
if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/tests/test_package.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import unittest
 4 | import geograpy2
 5 | 
 6 | class TestExtractor(unittest.TestCase):
 7 |     def test_italy(self):
 8 |         text = u"""Qualsiasi cosa qui. Via Alberto da Padova, 232 - Padova Italy"""
 9 |         places = geograpy2.get_place_context(text=text)
10 |         # TODO self.assert
11 | 
def main():
    """Run this module's test cases through the unittest command-line runner."""
    unittest.main()

# Allow running the test module directly as a script.
if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/tests/test_place.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import unittest
 4 | from geograpy2.place import Place
 5 | 
 6 | class TestPlace(unittest.TestCase):
 7 |     def test_basic(self):
 8 |         p = Place(city='city', region='region',  country='country')
 9 |         self.assertEqual('city', p.city)
10 |         self.assertEqual('region', p.region)
11 |         self.assertEqual('country', p.country)
12 |         


--------------------------------------------------------------------------------
/tests/test_placecontext.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import unittest
 4 | from geograpy2.placecontext import PlaceContext 
 5 | 
 6 | class TestPlaceContext(unittest.TestCase):
 7 |     
 8 |     def test_sao_paulo_brazil(self):
 9 |         pc = PlaceContext([u'São Paulo', 'Brazil'])
10 |     
11 |     def test_Kenya(self):
12 |         pc = PlaceContext(['Ngong', 'Nairobi', 'Kenya'])
13 |         
14 | #         assert len(pc.countries) == 1
15 | #         assert len(pc.cities) == 1
16 | #         assert len(pc.other) == 1
17 | #         assert 'Ngong' in pc.other
18 | #     
19 | #         assert pc.cities_for_name('Nairobi')[0][4] == 'Kenya'
20 | #         assert pc.regions_for_name('Ohio')[0][4] == 'United States'
21 |     
22 |     def test_aleppo(self):
23 |         pc = PlaceContext(['Aleppo', 'Syria'])
24 |     
25 | #         assert 'Aleppo' in pc.cities
26 |         
27 | 


--------------------------------------------------------------------------------