├── locationtagger
    ├── tests
    │   ├── __init__.py
    │   ├── test_LocationExtractor.py
    │   └── test_NamedEntityExtractor.py
    ├── data
    │   ├── words_to_ignore.csv
    │   └── diagram.jpg
    ├── bin
    │   └── locationtagger-nltk-spacy
    ├── __init__.py
    ├── utils.py
    └── locationextractor.py
├── .gitattributes
├── MANIFEST.in
├── LICENSE
├── setup.py
└── README.md


/locationtagger/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/locationtagger/data/words_to_ignore.csv:
--------------------------------------------------------------------------------
1 | VALUE
2 | NaN
3 | None
4 | NONE
5 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE README.md
2 | include locationtagger/data/*.csv
3 | include locationtagger/data/*.jpg
4 | 


--------------------------------------------------------------------------------
/locationtagger/data/diagram.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaushiksoni10/locationtagger/HEAD/locationtagger/data/diagram.jpg


--------------------------------------------------------------------------------
/locationtagger/bin/locationtagger-nltk-spacy:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import nltk
 4 | import spacy
 5 | 
 6 | nltk.downloader.download('maxent_ne_chunker')
 7 | nltk.downloader.download('words')
 8 | nltk.downloader.download('treebank')
 9 | nltk.downloader.download('maxent_treebank_pos_tagger')
10 | nltk.downloader.download('punkt')
11 | !python -m spacy download en_core_web_sm
12 | 


--------------------------------------------------------------------------------
/locationtagger/__init__.py:
--------------------------------------------------------------------------------
 1 | from .locationextractor import NamedEntityExtractor, LocationExtractor
 2 | 
 3 | def find_locations(url=None, text=None):
 4 |     e = NamedEntityExtractor(url=url, text=text)
 5 |     e.find_named_entities()
 6 | 
 7 |     locs = LocationExtractor(e.named_entities)
 8 |     locs.set_countries()
 9 |     locs.set_regions()
10 |     locs.set_cities()
11 |     locs.set_other()
12 | 
13 |     return locs
14 | 


--------------------------------------------------------------------------------
/locationtagger/utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | def clean(sentences):
 4 |     #removing non_ascii characters
 5 |     sentences = "".join(i for i in sentences if ord(i)<128)
 6 |     #changing non string to string.
 7 |     sentences = str(sentences)
 8 |     #1. Removing numbers.
 9 |     sentences_1 = re.sub("[^a-zA-Z&',.;-]"," ", sentences)
10 |     #2. Removing multiple spaces with single space
11 |     sentences_2 = re.sub(r'\s+', ' ',sentences_1, flags=re.I)
12 |     words = sentences_2.split()
13 |     newsentence = ' '.join(words)
14 |     return(newsentence)
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 kaushiksoni10
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/locationtagger/tests/test_LocationExtractor.py:
--------------------------------------------------------------------------------
 1 | from locationtagger.locationextractor import LocationExtractor
 2 | 
 3 | def test():
 4 |     try:
 5 |         l = LocationExtractor(['LGhhk', 'Vancouver', 'Pakistan', 'Texas'])
 6 |         l.set_countries()
 7 |         l.set_regions()
 8 |         l.set_cities()
 9 |         l.set_other()
10 | 
11 |         assert len(l.countries) == 1
12 |         assert len(l.regions) == 1
13 |         assert len(l.cities) == 1
14 |         assert len(l.other) == 1
15 |         assert len(l.other_countries) == 2
16 |         assert len(l.other_regions) == 3
17 |         assert 'LGhhk' in l.other
18 | 
19 |         l2 = LocationExtractor(['INDIA', 'karnataka', 'kuwait','kanpur'])
20 |         l2.set_countries()
21 |         l2.set_regions()
22 |         l2.set_cities()
23 |         l2.set_other()
24 | 
25 |         assert len(l2.countries) == 2
26 |         assert len(l2.regions) == 1
27 |         assert len(l2.cities) == 1
28 |         assert len(l2.other) == 0
29 |         assert l2.region_cities == {'Uttar Pradesh': ['Kanpur']}
30 |         assert l2.country_regions == {'India': ['Karnataka']} 
31 |         assert l2.country_cities == {'India': ['Kanpur']}
32 |         print('passed test')
33 |         
34 |     except Exception:
35 |         print('failed test')
36 | 
37 | test()
38 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="locationtagger",
 8 |     version="0.0.1",
 9 |     author="Kaushik Soni",
10 |     author_email="kaushiksoni10@gmail.com",
11 |     description="Detect & Extract locations from text or URL and find relationships among locations",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/kaushiksoni10/locationtagger",
15 |     packages= ["locationtagger"],
16 |     include_package_data=True,
17 |     scripts=[
18 |         'locationtagger/bin/locationtagger-nltk-spacy',
19 |         'locationtagger/tests/__init__.py',
20 |         'locationtagger/tests/test_LocationExtractor.py',
21 |         'locationtagger/tests/test_NamedEntityExtractor.py'
22 |     ],
23 |     package_data = {
24 |         'locationtagger': ['data/*.csv', 'data/*.jpg']
25 |     },
26 |     license='MIT',
27 |     install_requires=[
28 |         'nltk',
29 |         'spacy',
30 |         'newspaper3k',
31 |         'pycountry'
32 |     ],
33 |     classifiers=[
34 |         "Programming Language :: Python :: 3",
35 |         "License :: OSI Approved :: MIT License",
36 |         "Natural Language :: English",
37 |         "Topic :: Text Processing",
38 |         "Operating System :: OS Independent"
39 |     ],
40 |     python_requires='>=3.5'
41 | )


--------------------------------------------------------------------------------
/locationtagger/tests/test_NamedEntityExtractor.py:
--------------------------------------------------------------------------------
 1 | from locationtagger.locationextractor import NamedEntityExtractor
 2 | 
 3 | def test():
 4 |     try:
 5 |         ne = NamedEntityExtractor(url='https://edition.cnn.com/2020/01/14/americas/\
 6 | staggering-number-of-human-rights-defenders-killed-in-colombia-the-un-says/index.html')
 7 |         ne.find_named_entities()
 8 | 
 9 |         assert len(ne.named_entities) > 0
10 |         assert 'Colombia' in ne.named_entities
11 |         assert 'Geneva' in ne.named_entities
12 |         assert 'Switzerland' in ne.named_entities
13 | 
14 |         text = """Adult day programs in two locations Forestpark and Hawkesbury. caregiver \
15 | support including memory clinic & dementia friends, training & public education in the \
16 | five eastern counties of Ontario STORMONT, DUNDAS, GLENGARRY, PRESCOTT, RUSSELL, as well \
17 | as the CITY OF CORNWALL AND AKWESASNE"""
18 | 
19 |         ne2 = NamedEntityExtractor(text=text)
20 |         ne2.find_named_entities()
21 | 
22 |         assert len(ne2.named_entities) > 0
23 |         assert 'Ontario' in ne2.named_entities
24 |         assert 'Hawkesbury' in ne2.named_entities
25 |         assert 'DUNDAS' in ne2.named_entities
26 |         assert 'PRESCOTT' in ne2.named_entities
27 |         assert 'CORNWALL' in ne2.named_entities
28 | 
29 |         text2 = """my friend works in Bangalore which is sometimes called 'Silicon Valley of INDIA' \
30 | it is the capital of Karnataka state; also he told me he's from kanpur"""
31 |         ne3 = NamedEntityExtractor(text=text2)
32 |         ne3.find_named_entities()
33 | 
34 |         assert len(ne3.named_entities) > 0
35 |         assert 'Bangalore' in ne3.named_entities
36 |         assert 'Karnataka' in ne3.named_entities
37 |         assert 'INDIA' in ne3.named_entities
38 |         
39 |         print('passed test')
40 |     
41 |     except Exception:
42 |         print('failed test')
43 | 
44 | test()
45 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # locationtagger
  2 | **version 0.0.1**
  3 | 
  4 | Detect and extract locations (Countries, Regions/States & Cities) from text or URL. Also, find relationships among countries, regions & cities.
  5 | 
  6 | ---
  7 | ## About Project
  8 | In the field of [Natural Lauguage Processing](https://en.wikipedia.org/wiki/Natural_language_processing), many algorithms have been derived for different types of syntactic & semantic analysis of the textual data. NER ([Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition)) is one of the best & frequently needed tasks in real-world problems of text mining that follows some grammer-based rules & statistical modelling approaches. An entity extracted from NER can be a name of person, place, organization or product. [locationtagger](https://github.com/kaushiksoni10/locationtagger) is a further process of tagging & filter out place names (locations) amongst all the entities found with NER.
  9 | 
 10 | Approach followed is given below in the picture;
 11 | 
 12 | https://github.com/kaushiksoni10/locationtagger/blob/master/locationtagger/data/diagram.jpg?raw=true
 13 | ![Approach](locationtagger/data/diagram.jpg)
 14 | 
 15 | ---
 16 | ## Install and Setup
 17 | **(Environment: python >= 3.5)**
 18 | 
 19 | Install the package using pip -
 20 | 
 21 | `pip install locationtagger`
 22 |  
 23 | But before we install the package, we need to install some useful libraries given below,
 24 | 
 25 | `nltk`
 26 | 
 27 | `spacy`
 28 | 
 29 | `newspaper3k`
 30 | 
 31 | `pycountry`
 32 | 
 33 | After installing these packages, there are some important nltk & spacy modules that need to be downloaded using commands given in `/locationtagger/bin/locationtagger-nltk-spacy` on IPython shell or Jupyter notebook.
 34 | 
 35 | ---
 36 | ## Usage
 37 | After proper installation of the package, import the module and give some text/URL as input;
 38 | 
 39 | ### Text as input
 40 | 
 41 | 
 42 | ```python
 43 | import locationtagger
 44 | 
 45 | text = "Unlike India and Japan, A winter weather advisory remains in effect through 5 PM along and east of a line from Blue Earth, to Red Wing line in Minnesota and continuing to along an Ellsworth, to Menomonie, and Chippewa Falls line in Wisconsin."
 46 | 
 47 | entities = locationtagger.find_locations(text = text)
 48 | ```
 49 | \
 50 | Now we can grab all the place names present in above text,
 51 | 
 52 | ```python
 53 | entities.countries
 54 | ```
 55 | `['India', 'Japan']`
 56 | 
 57 | ```python
 58 | entities.regions
 59 | ```
 60 | `['Minnesota', 'Wisconsin']`
 61 | 
 62 | ```python
 63 | entities.cities
 64 | ```
 65 | `['Ellsworth', 'Red Wing', 'Blue Earth', 'Chippewa Falls', 'Menomonie']`
 66 | 
 67 | \
 68 | Apart from above places extracted from the text, we can also find the countries where these extracted `cities`, `regions` belong to,
 69 | 
 70 | ```python
 71 | entities.country_regions
 72 | ```
 73 | `{'United States': ['Minnesota', 'Wisconsin']}`
 74 | 
 75 | ```python
 76 | entities.country_cities
 77 | ```
 78 | `{'United States': ['Ellsworth',
 79 |   'Red Wing',
 80 |   'Blue Earth',
 81 |   'Chippewa Falls',
 82 |   'Menomonie']}`
 83 |   
 84 |  \
 85 |   Since "United States" is a country but not present in the text still came from the relations to the `cities` & `regions` present in the text, we can find it in `other_countries`,
 86 |   
 87 |   ```python
 88 |   entities.other_countries
 89 |   ```
 90 |   `['United States']`
 91 |   
 92 |  \
 93 |   If we are really serious about the `cities` we got in the text we can find which regions in the world it may fall in, 
 94 |   
 95 |   ```python
 96 |   entities.region_cities
 97 |   ```
 98 |   `{'Maine': ['Ellsworth'],
 99 |  'Minnesota': ['Red Wing', 'Blue Earth'],
100 |  'Wisconsin': ['Ellsworth', 'Chippewa Falls', 'Menomonie'],
101 |  'Pennsylvania': ['Ellsworth'],
102 |  'Michigan': ['Ellsworth'],
103 |  'Illinois': ['Ellsworth'],
104 |  'Kansas': ['Ellsworth'],
105 |  'Iowa': ['Ellsworth']}`
106 | 
107 | \
108 | And obviously, we'll put these regions in `other_regions` since they are not present in original text,
109 | 
110 | ```python
111 | entities.other_regions
112 | ```
113 | `['Maine',
114 |  'Minnesota',
115 |  'Wisconsin',
116 |  'Pennsylvania',
117 |  'Michigan',
118 |  'Illinois',
119 |  'Kansas',
120 |  'Iowa']`
121 |  
122 | \
123 |  Whatever words nltk & spacy both grabbed from the original text as [named entity](https://en.wikipedia.org/wiki/Named_entity) , most of them are stored in `cities`, `regions` & `countries`. But the remaining words (not recognized as place name) will be stored in `other`.
124 |  
125 |  ```python
126 |  entities.other
127 |  ```
128 |  `['winter', 'PM', 'Chippewa']` 
129 | 
130 | ### URL as Input 
131 | Similarly, It can grab places from urls too, 
132 | 
133 | ```python
134 | URL = 'https://edition.cnn.com/2020/01/14/americas/staggering-number-of-human-rights-defenders-killed-in-colombia-the-un-says/index.html'
135 | entities2 = locationtagger.find_locations(url = URL)
136 | ```
137 | \
138 | outputs we get:
139 | countries;
140 | 
141 | ```python
142 | entities2.countries
143 | ```
144 | `['Switzerland', 'Colombia']`
145 | 
146 | \
147 | regions;
148 | 
149 | ```python
150 | entities2.regions
151 | ```
152 | `['Geneva']`
153 | 
154 | \
155 | cities;
156 | 
157 | ```pyhton
158 | entities2.cities
159 | ```
160 | `['Geneva', 'Colombia']`
161 | 
162 | \
163 | Now, if we want to check how many times a place has been mentioned or most common places which have been mentioned in the whole page of the URL, we can have an idea about what location that page is talking about;
164 | 
165 | hence, most commonly mentioned countries;
166 | 
167 | ```python
168 | entities2.country_mentions
169 | ```
170 | `[('Colombia', 3), ('Switzerland', 1), ('United States', 1), ('Mexico', 1)]`
171 | 
172 | \
173 | and most commonly mentioned cities;
174 | 
175 | ```python
176 | entities2.city_mentions
177 | ```
178 | `[('Colombia', 3), ('Geneva', 1)]`
179 | 
180 | ---
181 | 
182 | ## Credits
183 | [locationtagger](https://github.com/kaushiksoni10/locationtagger) uses data from following source for country, region & city lookups,
184 | 
185 | [GEOLITE2 free downloadable database](https://dev.maxmind.com/geoip/geoip2/geolite2/)
186 | 
187 | Apart from famous nlp libraries [NLTK](http://www.nltk.org/) & [spacy](https://spacy.io/), [locationtagger](https://github.com/kaushiksoni10/locationtagger) uses following very useful libraries;
188 | 
189 | [pycountry](https://github.com/flyingcircusio/pycountry)
190 | 
191 | [newspaper3k](https://github.com/codelucas/newspaper)
192 | 


--------------------------------------------------------------------------------
/locationtagger/locationextractor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import csv
  3 | import pycountry
  4 | import sqlite3
  5 | from .utils import clean
  6 | from collections import Counter
  7 | import nltk
  8 | import spacy
  9 | from newspaper import Article
 10 | nlp = spacy.load('en_core_web_sm')
 11 | cur_dir = os.path.dirname(os.path.realpath(__file__))
 12 | with open(cur_dir + "/data/words_to_ignore.csv") as file:
 13 |     words_to_ignore = file.read().splitlines()
 14 | words_to_ignore = [a.lower() for a in words_to_ignore]
 15 | 
 16 | class NamedEntityExtractor(object):
 17 |     """
 18 |     This class takes a text or url input and lists out all the named entities 
 19 |     (PERSON, PLACE, ORGANIZATION, PRODUCTS) mentioned in the text body 
 20 |     with the help of nltk & spacy's named entity recognizer.
 21 |     """
 22 | 
 23 |     def __init__(self, text=None, url=None):
 24 |         if not text and not url:
 25 |             raise Exception('Please input any text or url')
 26 | 
 27 |         self.text = text
 28 |         self.url = url
 29 |         self.named_entities = []
 30 | 
 31 |     
 32 |     def set_text(self):
 33 |         if not self.text and self.url:
 34 |             a = Article(self.url)
 35 |             a.download()
 36 |             a.parse()
 37 |             self.text = a.text
 38 | 
 39 |     def find_named_entities(self):
 40 |         self.set_text()
 41 | 
 42 |         text = nltk.word_tokenize(clean(self.text))
 43 |         nes = nltk.ne_chunk(nltk.pos_tag(text))
 44 | 
 45 |         doc = nlp(clean(self.text))
 46 |         for ent in list(doc.ents):
 47 |             if not (str(ent).lower() in words_to_ignore) :
 48 |                 self.named_entities.append(str(ent))
 49 |         
 50 |         for ne in nes:
 51 |             if type(ne) is nltk.tree.Tree:
 52 |                 if (ne.label() == 'GPE' or ne.label() == 'PERSON' or ne.label() == 'ORGANIZATION'):
 53 |                     l = []
 54 |                     for i in ne.leaves():
 55 |                         l.append(i[0])
 56 |                     s = u' '.join(l)
 57 |                     if not (s in self.named_entities):
 58 |                         if not (s.lower() in words_to_ignore):
 59 |                             self.named_entities.append(s)
 60 | 
 61 | 
 62 | 
 63 | class LocationExtractor(object):
 64 |     """
 65 |     This class takes a list of named entities and finds out the entitites which are place names
 66 |     (country, region, city etc) and relationships of countries with regions and cities
 67 |     """
 68 |     
 69 |     def __init__(self, named_entity_words, db_file=None):
 70 |         db_file = db_file or os.path.dirname(os.path.realpath(__file__)) + "/locationdata.db"
 71 |         self.conn = sqlite3.connect(db_file)
 72 |         self.named_entities = named_entity_words
 73 | 
 74 |     def populate_db(self):
 75 |         cur = self.conn.cursor()
 76 |         cur.execute("DROP TABLE IF EXISTS locations")    
 77 | 
 78 |         cur.execute("CREATE TABLE locations(geoname_id INTEGER, continent_code TEXT, continent_name TEXT, country_iso_code TEXT, country_name TEXT, subdivision_iso_code TEXT, subdivision_name TEXT, city_name TEXT, metro_code TEXT, time_zone TEXT)")
 79 | 
 80 |         with open(cur_dir+"/data/City-Region-Locations.csv",encoding = 'UTF') as info:
 81 |             reader = csv.reader(info)
 82 |             for row in reader:
 83 |                 cur.execute("INSERT INTO locations VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", row)
 84 | 
 85 |             self.conn.commit()
 86 | 
 87 | 
 88 |     def db_has_data(self):
 89 |         cur = self.conn.cursor()
 90 | 
 91 |         cur.execute("SELECT Count(*) FROM sqlite_master WHERE name='locations';")
 92 |         data = cur.fetchone()[0]
 93 | 
 94 |         if data > 0:
 95 |             cur.execute("SELECT Count(*) FROM locations")
 96 |             data = cur.fetchone()[0]
 97 |             return data > 0
 98 | 
 99 |         return False
100 | 
101 |     
102 |     def is_a_country(self, s): 
103 |         ss = ' '.join([(i[0].upper()+i[1:].lower()) for i in s.split()])
104 |         try:
105 |             pycountry.countries.get(name=ss).alpha_3
106 |             return True
107 |         except AttributeError:
108 |             try:
109 |                 pycountry.countries.get(official_name=ss).alpha_3
110 |                 return True
111 |             except AttributeError:
112 |                 return False
113 | 
114 | 
115 |     def set_countries(self):
116 |         countries = [place 
117 |             for place in self.named_entities if self.is_a_country(place)]
118 | 
119 |         self.country_mentions = Counter(countries).most_common()
120 |         self.countries = list(set(countries))
121 | 
122 | 
123 |     def set_regions(self):
124 |         self.regions = []
125 |         self.country_regions = {}
126 |         self.other_countries = []
127 | 
128 |         if not self.countries:
129 |             self.set_countries()
130 | 
131 |         if not self.db_has_data():
132 |             self.populate_db()
133 | 
134 |         if len(self.named_entities) > 0:
135 |             cur = self.conn.cursor()
136 |             cur.execute("SELECT * FROM locations WHERE LOWER(subdivision_name) IN (" + ",".join("?"*len(self.named_entities)) + ")", [p.lower() for p in self.named_entities])
137 |             rows = cur.fetchall()
138 | 
139 |             for row in rows:
140 |                 country = None
141 | 
142 |                 try:
143 |                     country = pycountry.countries.get(alpha_2=row[3])
144 |                     country_name = country.name
145 |                 except Exception:
146 |                     country_name = row[4]
147 | 
148 |                 region_name = row[6]
149 | 
150 |                 if region_name not in self.regions:
151 |                     self.regions.append(region_name)
152 | 
153 |                 if country_name not in self.other_countries:
154 |                     self.other_countries.append(country_name)
155 | 
156 |                 if country_name not in self.country_regions:
157 |                     self.country_regions[country_name] = []
158 | 
159 |                 if region_name not in self.country_regions[country_name]:
160 |                     self.country_regions[country_name].append(region_name)
161 | 
162 | 
163 |     def set_cities(self):
164 |         self.cities = []
165 |         self.country_cities = {}
166 |         self.region_cities = {}
167 |         self.other_regions = []
168 |         self.address_strings = []
169 | 
170 |         if not self.countries:
171 |             self.set_countries()
172 | 
173 |         if not self.regions:
174 |             self.set_regions()
175 | 
176 |         if not self.db_has_data():
177 |             self.populate_db()
178 |             
179 |         if len(self.named_entities) > 0:
180 |             cur = self.conn.cursor()
181 |             cur.execute("SELECT * FROM locations WHERE LOWER(city_name) IN (" + ",".join("?"*len(self.named_entities)) + ")", [p.lower() for p in self.named_entities])
182 |             rows = cur.fetchall()
183 | 
184 |             for row in rows:
185 |                 country = None
186 | 
187 |                 try:
188 |                     country = pycountry.countries.get(alpha_2=row[3])
189 |                     country_name = country.name
190 |                 except Exception:
191 |                     country_name = row[4]
192 | 
193 |                 city_name = row[7]
194 |                 region_name = row[6]
195 | 
196 |                 if city_name not in self.cities:
197 |                     self.cities.append(city_name)
198 | 
199 |                 if region_name not in self.other_regions:
200 |                     self.other_regions.append(region_name)
201 | 
202 |                 if country_name not in self.other_countries:
203 |                     self.other_countries.append(country_name)
204 |                     self.country_mentions.append((country_name,1))
205 | 
206 |                 if region_name not in self.region_cities:
207 |                     self.region_cities[region_name] = []
208 | 
209 |                 if city_name not in self.region_cities[region_name]:
210 |                     self.region_cities[region_name].append(city_name)
211 | 
212 |                 if country_name not in self.country_cities:
213 |                     self.country_cities[country_name] = []
214 | 
215 |                 if city_name not in self.country_cities[country_name]:
216 |                     self.country_cities[country_name].append(city_name)
217 | 
218 |                     if country_name in self.country_regions and region_name in self.country_regions[country_name]:
219 |                         self.address_strings.append(city_name + ", " + region_name + ", " + country_name)
220 | 
221 | 
222 |         all_cities = [p for p in self.named_entities if p in self.cities]
223 |         self.city_mentions = Counter(all_cities).most_common()
224 | 
225 | 
226 |     def set_other(self):
227 |         if not self.cities:
228 |             self.set_cities()
229 | 
230 |         def unused(place_name):
231 |             places = self.countries + self.cities + self.regions
232 |             return (place_name.lower() not in [a.lower() for a in places])
233 | 
234 |         self.other = [p for p in self.named_entities if unused(p)]
235 | 


--------------------------------------------------------------------------------