├── gis ├── gis │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── dashboard_spider.py │ │ ├── cat_spider.py │ │ ├── org_spider.py │ │ └── organizations_spider.py │ ├── items.py │ ├── pipelines.py │ ├── my_pipelines.py │ ├── settings.py │ └── middlewares.py ├── scrapy.cfg ├── json_to_xlsx.py └── scraper.py ├── categories ├── categories │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── cat_spider.py │ ├── items.py │ ├── utils.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py ├── show_categories.py └── scrapy.cfg ├── required.txt ├── readme.txt └── .gitignore /gis/gis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /categories/categories/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /required.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | xlsxwriter 3 | pymongo 4 | -------------------------------------------------------------------------------- /gis/gis/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /categories/categories/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | 
--------------------------------------------------------------------------------
/categories/show_categories.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import pymongo
4 | 
5 | if __name__ == '__main__':
6 |     client = pymongo.MongoClient("localhost", 27017)
7 |     db = client.gis_db
8 |     for item in db.scrapy_items.find():
9 |         print(item)
10 | 
--------------------------------------------------------------------------------
/gis/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = gis.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = gis
12 | 
--------------------------------------------------------------------------------
/categories/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = categories.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = categories
12 | 
--------------------------------------------------------------------------------
/categories/categories/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class CatItem(scrapy.Item):
12 |     id = scrapy.Field(serializer=int)
13 |     name = scrapy.Field()
14 |     is_metarubric = scrapy.Field(serializer=bool)
15 |     region = scrapy.Field(serializer=int)
16 | 
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
1 | # 2gis API grabber
2 | 
3 | To fetch the list of organizations:
4 | > scrapy crawl organizations --nolog -a region_id=5 -o result_of_region_5.json
5 | 
6 | To convert the result to Excel:
7 | > json_to_xlsx.py result_of_region_5.json
8 | 
9 | To scrape several regions one after another (within a single process):
10 | > scraper.py -r 4,5,6,7
11 | 
12 | region_id:
13 | 1 - Novosibirsk
14 | 2 - Omsk
15 | 3 - Tomsk
16 | 4 - Barnaul
17 | 5 - Kemerovo
18 | 6 - Novokuznetsk
19 | 7 - Krasnoyarsk
20 | 
--------------------------------------------------------------------------------
/categories/categories/utils.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | 
3 | 
4 | def safe_func(func):
5 | 
6 |     def wrapped_func(*arg, **kwargs):
7 |         try:
8 |             return func(*arg, **kwargs)
9 |         except Exception:
10 |             print('- ' * 50)
11 |             print('- ' * 50)
12 |             print('- ' * 50)
13 |             traceback.print_exc()
14 |             print('- ' * 50)
15 |             print('- ' * 50)
16 |             print('- ' * 50)
17 | 
18 |     return wrapped_func
19 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/dashboard_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | import scrapy
4 | 
5 | 
6 | class DashboardSpider(scrapy.Spider):
7 |     name = 'dashboard'
8 |     start_urls = ['https://catalog.api.2gis.ru/3.0/rubricator/dashboard?locale=ru_RU&region_id=5&key=rutnpt3272']
9 | 
10 |     def parse(self, response):
11 |         data = json.loads(response.text)
12 |         items = [i.get('search_query') for i in data['result']['items'] if i.get('search_query')]
13 |         return {num: item for num, item in enumerate(items)}
14 | 
--------------------------------------------------------------------------------
/gis/gis/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class CatItem(scrapy.Item):
12 |     type = scrapy.Field()
13 |     id = scrapy.Field(serializer=int)
14 |     name = scrapy.Field()
15 |     is_metarubric = scrapy.Field(serializer=bool)
16 |     region = scrapy.Field(serializer=int)
17 | 
18 | 
19 | class OrgItem(scrapy.Item):
20 |     type = scrapy.Field()
21 |     name = scrapy.Field()
22 |     address = scrapy.Field()
23 |     lat = scrapy.Field()
24 |     lon = scrapy.Field()
25 |     email = scrapy.Field()
26 |     rubrics = scrapy.Field()
27 |     region = scrapy.Field(serializer=int)
28 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/cat_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | from scrapy import Spider, Request
4 | from gis.items import CatItem
5 | 
6 | 
7 | # noinspection SpellCheckingInspection
8 | URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
9 |                '?parent_id={parent_id}&locale=ru_RU&region_id=5&key=rutnpt3272'
10 | 
11 | 
12 | class CatSpider(Spider):
13 |     name = 'cat'
14 |     start_urls = [
15 |         'https://catalog.api.2gis.ru/3.0/rubricator/list?locale=ru_RU&region_id=5&key=rutnpt3272',
16 |     ]
17 | 
18 |     def parse(self, response):
19 |         data = json.loads(response.text)
20 |         if data['meta']['code'] == 404:
21 |             return
22 |         for item in data['result']['items']:
23 |             uid = item.get('id')
24 |             name = item.get('name')
25 |             if uid is not None and name is not None:
26 |                 yield CatItem(id=uid, name=name)
27 |                 yield Request(url=URL_TEMPLATE.format(parent_id=uid))
28 | 
--------------------------------------------------------------------------------
/gis/gis/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | 
9 | 
10 | class GisPipeline(object):
11 |     def process_item(self, item, spider):
12 |         del spider
13 |         return item
14 | 
15 | 
16 | class MongoPipeline(object):
17 | 
18 |     collection_name = 'scrapy_items'
19 | 
20 |     def __init__(self, mongo_uri, mongo_db):
21 |         self.mongo_uri = mongo_uri
22 |         self.mongo_db = mongo_db
23 |         self.client = None
24 |         self.db = None
25 | 
26 |     @classmethod
27 |     def from_crawler(cls, crawler):
28 |         return cls(
29 |             mongo_uri=crawler.settings.get('MONGO_URI'),
30 |             mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
31 |         )
32 | 
33 |     def open_spider(self, spider):
34 |         del spider
35 |         self.client = pymongo.MongoClient(self.mongo_uri)
36 |         self.db = self.client[self.mongo_db]
37 | 
38 |     def close_spider(self, spider):
39 |         del spider
40 |         self.client.close()
41 | 
42 |     def process_item(self, item, spider):
43 |         del spider
44 |         self.db[self.collection_name].insert_one(dict(item))
45 |         return item
46 | 
--------------------------------------------------------------------------------
/categories/categories/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | import pymongo
9 | 
10 | 
11 | class CategoriesPipeline(object):
12 |     def process_item(self, item, spider):
13 |         del spider
14 |         return item
15 | 
16 | 
17 | class MongoPipeline(object):
18 | 
19 |     collection_name = 'scrapy_items'
20 | 
21 |     def __init__(self, mongo_uri, mongo_db):
22 |         self.mongo_uri = mongo_uri
23 |         self.mongo_db = mongo_db
24 |         self.client = None
25 |         self.db = None
26 | 
27 |     @classmethod
28 |     def from_crawler(cls, crawler):
29 |         return cls(
30 |             mongo_uri=crawler.settings.get('MONGO_URI'),
31 |             mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
32 |         )
33 | 
34 |     def open_spider(self, spider):
35 |         del spider
36 |         self.client = pymongo.MongoClient(self.mongo_uri)
37 |         self.db = self.client[self.mongo_db]
38 | 
39 |     def close_spider(self, spider):
40 |         del spider
41 |         self.client.close()
42 | 
43 |     def process_item(self, item, spider):
44 |         del spider
45 |         self.db[self.collection_name].insert_one(dict(item))
46 |         return item
47 | 
--------------------------------------------------------------------------------
/gis/json_to_xlsx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import json
3 | from optparse import OptionParser
4 | 
5 | import xlsxwriter
6 | 
7 | 
8 | def convert_json_to_xlsx(filename):
9 |     with open(filename) as file:
10 |         data = json.loads(file.read())
11 | 
12 |     # erase categories
13 |     data = filter(lambda x: 'address' in x, data)
14 | 
15 |     # Create a new Excel file and add a worksheet.
16 |     workbook = xlsxwriter.Workbook('Excel.xlsx')
17 |     worksheet = workbook.add_worksheet()
18 | 
19 |     column_widths = [(0, 60), (1, 25), (2, 25), (3, 80)]  # (2, 10), (3, 10)
20 |     for col, width in column_widths:
21 |         worksheet.set_column(col, col, width)  # positional args work across xlsxwriter versions
22 | 
23 |     for row, item in enumerate(data):
24 |         for col, field in enumerate(['name', 'address', 'email', 'rubrics']):  # 'lat', 'lon'
25 |             worksheet.write(row, col, item.get(field))
26 | 
27 |     workbook.close()
28 | 
29 | 
30 | if __name__ == '__main__':
31 | 
32 |     parser = OptionParser()
33 |     parser.add_option('-f', '--file', dest='filename', help='input/output json file', metavar='FILE')
34 | 
35 |     (options, args) = parser.parse_args()
36 | 
37 |     if len(args) > 0:
38 |         convert_json_to_xlsx(args[0])
39 |     elif options.filename:
40 |         convert_json_to_xlsx(options.filename)
41 |     else:
42 |         print('''Usage: json_to_xlsx.py [options]
43 | Options:
44 |   -h, --help            show this help message and exit
45 |   -f FILE, --file=FILE  input/output file
46 | ''')
47 | 
--------------------------------------------------------------------------------
/categories/categories/spiders/cat_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | from scrapy import Spider, Request
4 | 
5 | from categories.items import CatItem
6 | from categories.settings import START_URL_TEMPLATE, CAT_URL_TEMPLATE
7 | from categories.utils import safe_func
8 | 
9 | 
10 | class CatSpider(Spider):
11 |     name = 'cat'
12 | 
13 |     def __init__(self, region_id=None, *arg, **kwargs):
14 |         super().__init__(*arg, **kwargs)
15 |         self.fingerprints = set()
16 | 
17 |         # setup region_id
18 |         if region_id is None:
19 |             print('-' * 45)
20 |             print('| CRITICAL: argument region_id is required!
|') 21 | print('-' * 45) 22 | raise Exception('argument region_id is required!') 23 | self.region_id = region_id 24 | self.start_urls = [START_URL_TEMPLATE.format(region_id=region_id)] 25 | 26 | @safe_func 27 | def parse(self, response): 28 | data = json.loads(response.text) 29 | if data['meta']['code'] in [404, 400]: 30 | return 31 | for item in data['result']['items']: 32 | cat = CatItem( 33 | id=int(item.get('id')), 34 | name=item.get('name'), 35 | is_metarubric=item.get('type') == 'metarubric', 36 | region=self.region_id 37 | ) 38 | 39 | if cat['is_metarubric']: 40 | for mod in range(-1, 2): 41 | uid_mod = cat['id'] + mod 42 | yield Request(url=CAT_URL_TEMPLATE.format(parent_id=uid_mod, region_id=self.region_id)) 43 | 44 | yield cat 45 | -------------------------------------------------------------------------------- /gis/scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import traceback 5 | from optparse import OptionParser 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | 9 | from gis.spiders.organizations_spider import OrganizationsSpider 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option('-r', '--region', dest='region', help='scraping data for region', metavar='INT') 15 | parser.add_option('-c', '--categories', 16 | action='store_true', dest='is_cat_grabber', default=False, 17 | help='scraping only categories') 18 | 19 | (options, args) = parser.parse_args() 20 | 21 | try: 22 | # getter 23 | assert options.region 24 | if options.region == 'all': 25 | regions = list(range(1, 8)) 26 | else: 27 | regions = json.loads(f'[{options.region}]') 28 | 29 | # validation 30 | assert isinstance(regions, list) 31 | for i in regions: 32 | assert isinstance(i, int) 33 | 34 | except Exception: 35 | traceback.print_exc() 36 | print('''Usage: scraper.py [options] 37 | Options: 38 | -h, --help show this help message and exit 39 | -r INT, --region=INT scraping data for region 40 | ''') 41 | 42 | else: 43 | config = { 44 | 'FEED_URI': f'results.json', 45 | 'FEED_FORMAT': 'json', 46 | 'FEED_EXPORT_ENCODING': 'utf-8', 47 | 'ROBOTSTXT_OBEY': False, 48 | } 49 | process = CrawlerProcess(config) 50 | for i in regions: 51 | process.crawl(OrganizationsSpider, region_id=i, is_cat_grabber=options.is_cat_grabber) 52 | process.start() 53 | -------------------------------------------------------------------------------- /gis/gis/my_pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | from scrapy.exceptions import DropItem 10 | 11 | 12 | class GisPipeline(object): 13 | def process_item(self, item, spider): 14 | del spider 15 | return item 16 | 17 | 18 | # noinspection PyUnusedLocal 19 | class MongoPipeline(object): 20 | 21 | collection_name = 'scrapy_items' 22 | 23 | def __init__(self, mongo_uri, mongo_db): 24 | self.mongo_uri = mongo_uri 25 | self.mongo_db = mongo_db 26 | 27 | @classmethod 28 | def from_crawler(cls, crawler): 29 | return cls( 30 | mongo_uri=crawler.settings.get('MONGO_URI'), 31 | mongo_db=crawler.settings.get('MONGO_DATABASE', 'items') 32 | ) 33 | 34 | # noinspection PyAttributeOutsideInit 35 | def open_spider(self, spider): 36 | self.client = pymongo.MongoClient(self.mongo_uri) 37 
| self.db = self.client[self.mongo_db]
38 | 
39 |     def close_spider(self, spider):
40 |         self.client.close()
41 | 
42 |     def process_item(self, item, spider):
43 |         self.db[self.collection_name].insert_one(dict(item))
44 |         return item
45 | 
46 | 
47 | class DuplicatesPipeline(object):
48 | 
49 |     def __init__(self):
50 |         self.fingerprints = set()
51 | 
52 |     def process_item(self, item, spider):
53 |         del spider
54 |         name = item.get('name')
55 |         address = item.get('address_name')
56 |         fingerprint = f'{name}#{address}'
57 |         if fingerprint in self.fingerprints:
58 |             raise DropItem("Duplicate item found: %s" % item)
59 |         else:
60 |             self.fingerprints.add(fingerprint)
61 |             return item
62 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/org_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | from scrapy import Spider, Request
4 | from gis.items import OrgItem
5 | 
6 | # noinspection SpellCheckingInspection
7 | URL_TEMPLATE = 'https://catalog.api.2gis.ru/2.0/catalog/branch/list' \
8 |                '?page=1' \
9 |                '&page_size=50' \
10 |                '&rubric_id={rubric_id}' \
11 |                '&region_id=5' \
12 |                '&locale=ru_RU' \
13 |                '&fields=items.contact_groups%2Citems.point' \
14 |                '&key=rutnpt3272'
15 | 
16 | 
17 | class OrgSpider(Spider):
18 |     name = 'org'
19 | 
20 |     def __init__(self, *arg, **kwargs):
21 |         super().__init__(*arg, **kwargs)
22 |         self.fingerprints = set()
23 | 
24 |     def start_requests(self):
25 |         cat_file = getattr(self, 'cat', None)
26 |         if cat_file is None:
27 |             return
28 |         with open(cat_file) as categories:
29 |             for item in json.loads(categories.read()):
30 |                 yield Request(URL_TEMPLATE.format(rubric_id=item['id']), self.parse)
31 | 
32 |     def parse(self, response):
33 |         data = json.loads(response.text)
34 |         if data['meta']['code'] == 404:
35 |             return
36 |         for item in data['result']['items']:
37 |             emails = list()
38 |             for contact in item.get('contact_groups', [{}])[0].get('contacts', []):
39 |                 if contact.get('type') == 'email':
40 |                     emails.append(contact.get('value'))
41 |             if emails:
42 |                 name = item.get('name')
43 |                 address = item.get('address_name')
44 |                 fingerprint = f'{name}#{address}'
45 |                 if fingerprint not in self.fingerprints:
46 |                     self.fingerprints.add(fingerprint)
47 |                     # OrgItem defines a single email field, so only the first address is kept
48 |                     yield OrgItem(
49 |                         name=name,
50 |                         address=address,
51 |                         lat=item.get('point', {}).get('lat'),
52 |                         lon=item.get('point', {}).get('lon'),
53 |                         email=emails[0],
54 |                     )
55 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### JetBrains template
3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
5 | 
6 | # User-specific stuff:
7 | .idea/**/tasks.xml
8 | .idea/dictionaries
9 | 
10 | # Sensitive or high-churn files:
11 | .idea/**/dataSources/
12 | .idea/**/dataSources.ids
13 | .idea/**/dataSources.xml
14 | .idea/**/dataSources.local.xml
15 | .idea/**/sqlDataSources.xml
16 | .idea/**/dynamic.xml
17 | .idea/**/uiDesigner.xml
18 | 
19 | # Gradle:
20 | .idea/**/gradle.xml
21 | .idea/**/libraries
22 | 
23 | # CMake
24 | cmake-build-debug/
25 | cmake-build-release/
26 | 
27 | # Mongo Explorer plugin:
28 | .idea/**/mongoSettings.xml 29 | 30 | ## File-based project format: 31 | *.iws 32 | 33 | ## Plugin-specific files: 34 | 35 | # IntelliJ 36 | out/ 37 | 38 | # mpeltonen/sbt-idea plugin 39 | .idea_modules/ 40 | 41 | # JIRA plugin 42 | atlassian-ide-plugin.xml 43 | 44 | # Cursive Clojure plugin 45 | .idea/replstate.xml 46 | 47 | # Crashlytics plugin (for Android Studio and IntelliJ) 48 | com_crashlytics_export_strings.xml 49 | crashlytics.properties 50 | crashlytics-build.properties 51 | fabric.properties 52 | ### Python template 53 | # Byte-compiled / optimized / DLL files 54 | *$py.class 55 | 56 | # C extensions 57 | *.so 58 | 59 | # Distribution / packaging 60 | .Python 61 | build/ 62 | develop-eggs/ 63 | dist/ 64 | downloads/ 65 | eggs/ 66 | .eggs/ 67 | lib64/ 68 | parts/ 69 | sdist/ 70 | var/ 71 | wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .coverage 91 | .coverage.* 92 | .cache 93 | nosetests.xml 94 | coverage.xml 95 | *.cover 96 | .hypothesis/ 97 | 98 | # Translations 99 | *.mo 100 | *.pot 101 | 102 | # Django stuff: 103 | *.log 104 | .static_storage/ 105 | .media/ 106 | local_settings.py 107 | 108 | # Flask stuff: 109 | instance/ 110 | .webassets-cache 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # Jupyter Notebook 122 | .ipynb_checkpoints 123 | 124 | # pyenv 125 | .python-version 126 | 127 | # celery beat schedule file 128 | celerybeat-schedule 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | 155 | .idea 156 | gis/*.json 157 | gis/*.xlsx 158 | -------------------------------------------------------------------------------- /gis/gis/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for gis project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'gis' 13 | 14 | SPIDER_MODULES = ['gis.spiders'] 15 | NEWSPIDER_MODULE = 'gis.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'gis (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'gis.middlewares.GisSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'gis.middlewares.GisDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'gis.pipelines.GisPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | FEED_EXPORT_ENCODING = 'utf-8' 93 | -------------------------------------------------------------------------------- /gis/gis/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for 
your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class GisSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class GisDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /categories/categories/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CategoriesSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CategoriesDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /categories/categories/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for categories project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'categories' 13 | 14 | SPIDER_MODULES = ['categories.spiders'] 15 | NEWSPIDER_MODULE = 'categories.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'categories (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'categories.middlewares.CategoriesSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'categories.middlewares.CategoriesDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'categories.pipelines.CategoriesPipeline': 300, 69 | 'categories.pipelines.MongoPipeline': 900, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 | 
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 | 
93 | FEED_EXPORT_ENCODING = 'utf-8'
94 | 
95 | MONGO_URI = 'mongodb://localhost'
96 | MONGO_DATABASE = 'gis_db'
97 | 
98 | # noinspection SpellCheckingInspection
99 | START_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
100 |                      '?locale=ru_RU&region_id={region_id}&key=rutnpt3272'
101 | 
102 | # noinspection SpellCheckingInspection
103 | CAT_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
104 |                    '?parent_id={parent_id}&locale=ru_RU&region_id={region_id}&key=rutnpt3272'
105 | 
106 | # noinspection SpellCheckingInspection
107 | ORG_URL_TEMPLATE = 'https://catalog.api.2gis.ru/2.0/catalog/branch/list' \
108 |                    '?page={page}' \
109 |                    '&page_size=50' \
110 |                    '&rubric_id={rubric_id}' \
111 |                    '&region_id={region_id}' \
112 |                    '&locale=ru_RU' \
113 |                    '&fields=items.contact_groups%2Citems.rubrics%2Citems.point' \
114 |                    '&key=rutnpt3272'
115 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/organizations_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import traceback
4 | 
5 | from scrapy import Spider, Request
6 | from gis.items import OrgItem, CatItem
7 | 
8 | # noinspection SpellCheckingInspection
9 | START_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
10 |                      '?locale=ru_RU&region_id={region_id}&key=rutnpt3272'
11 | 
12 | # noinspection SpellCheckingInspection
13 | CAT_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
14 |                    '?parent_id={parent_id}&locale=ru_RU&region_id={region_id}&key=rutnpt3272'
15 | 
16 | # noinspection SpellCheckingInspection
17 | ORG_URL_TEMPLATE = 'https://catalog.api.2gis.ru/2.0/catalog/branch/list' \
18 |                    '?page={page}' \
19 |                    '&page_size=50' \
20 |                    '&rubric_id={rubric_id}' \
21 |                    '&region_id={region_id}' \
22 |                    '&locale=ru_RU' \
23 |                    '&fields=items.contact_groups%2Citems.rubrics%2Citems.point' \
24 |                    '&key=rutnpt3272'
25 | 
26 | page_regex = r"page=([\d]+)"
27 | rubric_regex = r"rubric_id=([\d]+)"
28 | 
29 | 
30 | def safe_func(func):
31 | 
32 |     def wrapped_func(*arg, **kwargs):
33 |         try:
34 |             return func(*arg, **kwargs)
35 |         except Exception:
36 |             print('- ' * 50)
37 |             print('- ' * 50)
38 |             print('- ' * 50)
39 |             traceback.print_exc()
40 |             print('- ' * 50)
41 |             print('- ' * 50)
42 |             print('- ' * 50)
43 | 
44 |     return wrapped_func
45 | 
46 | 
47 | class OrganizationsSpider(Spider):
48 |     name = 'organizations'
49 | 
50 |     def __init__(self, region_id=None, is_cat_grabber=None, *arg, **kwargs):
51 |         super().__init__(*arg, **kwargs)
52 |         self.fingerprints = set()
53 | 
54 |         # setup region_id
55 |         if region_id is None:
56 |             print('-' * 45)
57 |             print('| CRITICAL: argument region_id is required! |')
58 |             print('-' * 45)
59 |             raise Exception('argument region_id is required!')
60 |         self.region_id = region_id
61 |         self.start_urls = [START_URL_TEMPLATE.format(region_id=region_id)]
62 |         self.is_cat_grabber = is_cat_grabber is True
63 | 
64 |     @safe_func
65 |     def parse(self, response):
66 |         data = json.loads(response.text)
67 |         if data['meta']['code'] in [404, 400]:
68 |             return
69 |         for item in data['result']['items']:
70 |             cat = CatItem(
71 |                 type='category',
72 |                 id=int(item.get('id')),
73 |                 name=item.get('name'),
74 |                 is_metarubric=item.get('type') == 'metarubric',
75 |                 region=self.region_id
76 |             )
77 | 
78 |             if cat['is_metarubric']:
79 |                 for mod in range(-1, 2):
80 |                     uid_mod = cat['id'] + mod
81 |                     yield Request(url=CAT_URL_TEMPLATE.format(parent_id=uid_mod, region_id=self.region_id))
82 | 
83 |             else:
84 |                 if not self.is_cat_grabber:
85 |                     yield Request(url=ORG_URL_TEMPLATE.format(rubric_id=cat['id'], region_id=self.region_id, page=1),
86 |                                   callback=self.parse_category)
87 | 
88 |             yield cat
89 | 
90 |     @safe_func
91 |     def parse_category(self, response):
92 |         data = json.loads(response.text)
93 |         if data['meta']['code'] in [404, 400]:
94 |             return
95 | 
96 |         # get next page
97 |         url = response.request.url
98 |         page = int(re.findall(page_regex, url)[0])
99 |         rubric_id = int(re.findall(rubric_regex, url)[0])
100 | 
101 |         if len(data['result']['items']) > 0:
102 |             yield Request(url=ORG_URL_TEMPLATE.format(rubric_id=rubric_id, region_id=self.region_id, page=page + 1),
103 |                           callback=self.parse_category)
104 | 
105 |         for item in data['result']['items']:
106 |             email = None
107 |             if len(item.get('contact_groups', [])) == 0:
108 |                 continue  # skip this organization instead of aborting the rest of the page
109 |             for contact in item.get('contact_groups', [{}])[0].get('contacts', []):
110 |                 if contact.get('type') == 'email':
111 |                     email = contact.get('value')
112 |             # skip if no email is specified
113 |             if email is None:
114 |                 continue
115 |             rubrics = [r.get('name') for r in item.get('rubrics', [])]
116 |             name = item.get('name')
117 |             address = item.get('address_name')
118 |             fingerprint = f'{name}#{address}'
119 |             if fingerprint not in self.fingerprints:
120 |                 self.fingerprints.add(fingerprint)
121 |                 point = item.get('point', {})
122 |                 point = point if point is not None else {}
123 |                 yield OrgItem(
124 |                     type='item',
125 |                     name=name,
126 |                     address=address,
127 |                     lat=point.get('lat'),
128 |                     lon=point.get('lon'),
129 |                     email=email,
130 |                     rubrics=json.dumps(rubrics, ensure_ascii=False),
131 |                     region=self.region_id
132 |                 )
133 | 
--------------------------------------------------------------------------------
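
A quick way to sanity-check the JSON-to-Excel path without running a crawl is the sketch below. It is not a file in this repository: the script name, the sample_result.json file name, and all sample values are made up for illustration, and it assumes it is run from the gis/ directory with xlsxwriter installed. It writes a tiny result list in the same shape the organizations spider exports (a mix of category and organization dicts) and feeds it to convert_json_to_xlsx().

#!/usr/bin/env python
# Hypothetical smoke test (not part of the repository). Builds a minimal result
# file in the export schema used by organizations_spider and converts it with
# the repo's own converter.
import json

from json_to_xlsx import convert_json_to_xlsx

sample = [
    # category entry: no 'address' key, so the converter should drop it
    {'type': 'category', 'id': 241, 'name': 'Example rubric', 'is_metarubric': False, 'region': 5},
    # organization entry: should become one row in Excel.xlsx
    {
        'type': 'item',
        'name': 'Example Org',
        'address': 'Example street, 1',
        'lat': 55.35,
        'lon': 86.08,
        'email': 'info@example.org',
        'rubrics': json.dumps(['Example rubric'], ensure_ascii=False),
        'region': 5,
    },
]

with open('sample_result.json', 'w', encoding='utf-8') as f:
    json.dump(sample, f, ensure_ascii=False)

convert_json_to_xlsx('sample_result.json')  # writes Excel.xlsx in the current directory

Because convert_json_to_xlsx() filters on the presence of an 'address' key, only the organization entry should end up in the resulting Excel.xlsx.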