├── .gitignore
├── README.md
├── parse.py
├── requirements.txt
└── russian-cities.json

/.gitignore:
--------------------------------------------------------------------------------
russian-cities-new.json
.venv
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![Cities count](https://img.shields.io/badge/cities-1117-green)

# List of Russian cities

A list of Russian cities in JSON format. Fields in the JSON:

Field|Description
-----|-----------
name|City name
subject|Federal subject (region)
district|Federal district
population|Population
coords|Coordinates ('lat' - latitude, 'lon' - longitude)

The data is taken from [Wikipedia](https://ru.wikipedia.org/wiki/Список_городов_России).
--------------------------------------------------------------------------------
/parse.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Russian cities parser

# Fetches the list of Russian cities from Wikipedia along with basic
# information about each city: name, federal subject, federal district,
# population and coordinates.
# Copyright Andrey Zhidenkov, 2018 (c)

import os
import sys
import json

from lxml.html import fromstring

from parselab.parsing import BasicParser
from parselab.network import NetworkManager
from parselab.cache import FileCache


class App(BasicParser):

    def __init__(self):
        self.cache = FileCache(namespace='russian-cities', path=os.environ.get('CACHE_PATH'))
        self.net = NetworkManager()
        self.data = []

    def get_coords(self, url):
        # Read latitude/longitude from the map link in the city's article.
        page = self.get_page(url)
        html = fromstring(page)

        try:
            span = html.xpath('//span[contains(@class, "coordinates")]//a[@class="mw-kartographer-maplink"]')[0]
        except IndexError:
            return {'lat': '', 'lon': ''}

        return {'lat': span.get('data-lat'), 'lon': span.get('data-lon')}

    def run(self):
        page = self.get_page('https://ru.wikipedia.org/wiki/Список_городов_России')
        html = fromstring(page)

        for tr in html.xpath('//table/tbody/tr'):
            columns = tr.xpath('.//td')
            # Only rows of the main city table have exactly nine columns.
            if len(columns) != 9:
                continue
            name = columns[2].xpath('./a')[0].text_content().strip()
            url = columns[2].xpath('./a')[0].get('href')
            subject = columns[3].text_content().strip()
            district = columns[4].text_content().strip()
            population = int(columns[5].get('data-sort-value'))

            city = {'name': name, 'subject': subject, 'district': district, 'population': population}
            city.update({'coords': self.get_coords('https://ru.wikipedia.org%s' % url)})
            self.data.append(city)

            # Progress output goes to stderr so stdout stays a clean JSON document.
            print(name, file=sys.stderr)

        output = sorted(self.data, key=lambda k: '%s|%s|%s' % (k['name'], k['subject'], k['district']))
        print(json.dumps(output, ensure_ascii=False, sort_keys=True))


if __name__ == '__main__':
    app = App()
    sys.exit(app.run())
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
parselab
lxml
--------------------------------------------------------------------------------
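
For consumers of the generated file, a minimal usage sketch follows. It assumes russian-cities.json sits in the working directory and follows the field layout described in README.md; the one-million population threshold is only an illustrative query value, not part of the project.

#!/usr/bin/env python3

# Minimal usage sketch (not part of the repository): load russian-cities.json
# and print the largest cities. Field names follow the README table; the
# population threshold below is an arbitrary example value.

import json

with open('russian-cities.json', encoding='utf-8') as f:
    cities = json.load(f)

# Print every city with more than one million inhabitants, with its coordinates
# ('lat' and 'lon' are stored as strings, or empty strings when unknown).
for city in cities:
    if city['population'] > 1000000:
        print(city['name'], city['subject'], city['coords']['lat'], city['coords']['lon'])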