├── gis ├── gis │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ ├── dashboard_spider.py │ │ ├── cat_spider.py │ │ ├── org_spider.py │ │ └── organizations_spider.py │ ├── items.py │ ├── pipelines.py │ ├── my_pipelines.py │ ├── settings.py │ └── middlewares.py ├── scrapy.cfg ├── json_to_xlsx.py └── scraper.py ├── categories ├── categories │ ├── __init__.py │ ├── spiders │ │ ├── __init__.py │ │ └── cat_spider.py │ ├── items.py │ ├── utils.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py ├── show_categories.py └── scrapy.cfg ├── required.txt ├── readme.txt └── .gitignore /gis/gis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /categories/categories/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /required.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | xlsxwriter 3 | pymongo 4 | -------------------------------------------------------------------------------- /gis/gis/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /categories/categories/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | 
--------------------------------------------------------------------------------
/categories/show_categories.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import pymongo
4 | 
5 | if __name__ == '__main__':
6 |     client = pymongo.MongoClient("localhost", 27017)
7 |     db = client.gis_db
8 |     for item in db.scrapy_items.find():
9 |         print(item)
10 | 
--------------------------------------------------------------------------------
/gis/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = gis.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = gis
12 | 
--------------------------------------------------------------------------------
/categories/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = categories.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = categories
12 | 
--------------------------------------------------------------------------------
/categories/categories/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class CatItem(scrapy.Item):
12 |     id = scrapy.Field(serializer=int)
13 |     name = scrapy.Field()
14 |     is_metarubric = scrapy.Field(serializer=bool)
15 |     region = scrapy.Field(serializer=int)
16 | 
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
1 | # 2gis API grabber
2 | 
3 | To fetch the list of organizations:
4 | > scrapy crawl organizations --nolog -a region_id=5 -o result_of_region_5.json
5 | 
6 | To convert the result to Excel:
7 | > json_to_xlsx.py result_of_region_5.json
8 | 
9 | To scrape several regions one after another (within a single process):
10 | > scraper.py -r 4,5,6,7
11 | 
12 | region_id:
13 | 1 - Novosibirsk
14 | 2 - Omsk
15 | 3 - Tomsk
16 | 4 - Barnaul
17 | 5 - Kemerovo
18 | 6 - Novokuznetsk
19 | 7 - Krasnoyarsk
20 | 
--------------------------------------------------------------------------------
/categories/categories/utils.py:
--------------------------------------------------------------------------------
1 | import traceback
2 | 
3 | 
4 | def safe_func(func):
5 | 
6 |     def wrapped_func(*arg, **kwargs):
7 |         try:
8 |             return func(*arg, **kwargs)
9 |         except Exception:
10 |             print('- ' * 50)
11 |             print('- ' * 50)
12 |             print('- ' * 50)
13 |             traceback.print_exc()
14 |             print('- ' * 50)
15 |             print('- ' * 50)
16 |             print('- ' * 50)
17 | 
18 |     return wrapped_func
19 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/dashboard_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | import scrapy
4 | 
5 | 
6 | class DashboardSpider(scrapy.Spider):
7 |     name = 'dashboard'
8 |     start_urls = ['https://catalog.api.2gis.ru/3.0/rubricator/dashboard?locale=ru_RU&region_id=5&key=rutnpt3272']
9 | 
10 |     def parse(self, response):
11 |         data = json.loads(response.text)
12 |         items = [i.get('search_query') for i in data['result']['items'] if i.get('search_query')]
13 |         return {num: item for num, item in enumerate(items)}
14 | 
--------------------------------------------------------------------------------
/gis/gis/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class CatItem(scrapy.Item):
12 |     type = scrapy.Field()
13 |     id = scrapy.Field(serializer=int)
14 |     name = scrapy.Field()
15 |     is_metarubric = scrapy.Field(serializer=bool)
16 |     region = scrapy.Field(serializer=int)
17 | 
18 | 
19 | class OrgItem(scrapy.Item):
20 |     type = scrapy.Field()
21 |     name = scrapy.Field()
22 |     address = scrapy.Field()
23 |     lat = scrapy.Field()
24 |     lon = scrapy.Field()
25 |     email = scrapy.Field()
26 |     rubrics = scrapy.Field()
27 |     region = scrapy.Field(serializer=int)
28 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/cat_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | from scrapy import Spider, Request
4 | from gis.items import CatItem
5 | 
6 | 
7 | # noinspection SpellCheckingInspection
8 | URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
9 |                '?parent_id={parent_id}&locale=ru_RU&region_id=5&key=rutnpt3272'
10 | 
11 | 
12 | class CatSpider(Spider):
13 |     name = 'cat'
14 |     start_urls = [
15 |         'https://catalog.api.2gis.ru/3.0/rubricator/list?locale=ru_RU&region_id=5&key=rutnpt3272',
16 |     ]
17 | 
18 |     def parse(self, response):
19 |         data = json.loads(response.text)
20 |         if data['meta']['code'] == 404:
21 |             return
22 |         for item in data['result']['items']:
23 |             uid = item.get('id')
24 |             name = item.get('name')
25 |             if uid is not None and name is not None:
26 |                 yield CatItem(id=uid, name=name)
27 |                 yield Request(url=URL_TEMPLATE.format(parent_id=uid))
28 | 
--------------------------------------------------------------------------------
/gis/gis/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | 
9 | 
10 | class GisPipeline(object):
11 |     def process_item(self, item, spider):
12 |         del spider
13 |         return item
14 | 
15 | 
16 | class MongoPipeline(object):
17 | 
18 |     collection_name = 'scrapy_items'
19 | 
20 |     def __init__(self, mongo_uri, mongo_db):
21 |         self.mongo_uri = mongo_uri
22 |         self.mongo_db = mongo_db
23 |         self.client = None
24 |         self.db = None
25 | 
26 |     @classmethod
27 |     def from_crawler(cls, crawler):
28 |         return cls(
29 |             mongo_uri=crawler.settings.get('MONGO_URI'),
30 |             mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
31 |         )
32 | 
33 |     def open_spider(self, spider):
34 |         del spider
35 |         self.client = pymongo.MongoClient(self.mongo_uri)
36 |         self.db = self.client[self.mongo_db]
37 | 
38 |     def close_spider(self, spider):
39 |         del spider
40 |         self.client.close()
41 | 
42 |     def process_item(self, item, spider):
43 |         del spider
44 |         self.db[self.collection_name].insert_one(dict(item))
45 |         return item
46 | 
--------------------------------------------------------------------------------
/categories/categories/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | import pymongo
9 | 
10 | 
11 | class CategoriesPipeline(object):
12 |     def process_item(self, item, spider):
13 |         del spider
14 |         return item
15 | 
16 | 
17 | class MongoPipeline(object):
18 | 
19 |     collection_name = 'scrapy_items'
20 | 
21 |     def __init__(self, mongo_uri, mongo_db):
22 |         self.mongo_uri = mongo_uri
23 |         self.mongo_db = mongo_db
24 |         self.client = None
25 |         self.db = None
26 | 
27 |     @classmethod
28 |     def from_crawler(cls, crawler):
29 |         return cls(
30 |             mongo_uri=crawler.settings.get('MONGO_URI'),
31 |             mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
32 |         )
33 | 
34 |     def open_spider(self, spider):
35 |         del spider
36 |         self.client = pymongo.MongoClient(self.mongo_uri)
37 |         self.db = self.client[self.mongo_db]
38 | 
39 |     def close_spider(self, spider):
40 |         del spider
41 |         self.client.close()
42 | 
43 |     def process_item(self, item, spider):
44 |         del spider
45 |         self.db[self.collection_name].insert_one(dict(item))
46 |         return item
47 | 
--------------------------------------------------------------------------------
/gis/json_to_xlsx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import json
3 | from optparse import OptionParser
4 | 
5 | import xlsxwriter
6 | 
7 | 
8 | def convert_json_to_xlsx(filename):
9 |     with open(filename) as file:
10 |         data = json.loads(file.read())
11 | 
12 |     # erase categories
13 |     data = filter(lambda x: 'address' in x, data)
14 | 
15 |     # Create a new Excel file and add a worksheet.
16 |     workbook = xlsxwriter.Workbook('Excel.xlsx')
17 |     worksheet = workbook.add_worksheet()
18 | 
19 |     column_widths = [(0, 60), (1, 25), (2, 25), (3, 80)]  # (2, 10), (3, 10)
20 |     for col, width in column_widths:
21 |         worksheet.set_column(col, col, width)  # positional args work across xlsxwriter versions
22 | 
23 |     for row, item in enumerate(data):
24 |         for col, field in enumerate(['name', 'address', 'email', 'rubrics']):  # 'lat', 'lon'
25 |             worksheet.write(row, col, item.get(field))
26 | 
27 |     workbook.close()
28 | 
29 | 
30 | if __name__ == '__main__':
31 | 
32 |     parser = OptionParser()
33 |     parser.add_option('-f', '--file', dest='filename', help='input/output json file', metavar='FILE')
34 | 
35 |     (options, args) = parser.parse_args()
36 | 
37 |     if len(args) > 0:
38 |         convert_json_to_xlsx(args[0])
39 |     elif options.filename:
40 |         convert_json_to_xlsx(options.filename)
41 |     else:
42 |         print('''Usage: json_to_xlsx.py [options]
43 | Options:
44 |   -h, --help            show this help message and exit
45 |   -f FILE, --file=FILE  input/output file
46 | ''')
47 | 
--------------------------------------------------------------------------------
/categories/categories/spiders/cat_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | from scrapy import Spider, Request
4 | 
5 | from categories.items import CatItem
6 | from categories.settings import START_URL_TEMPLATE, CAT_URL_TEMPLATE
7 | from categories.utils import safe_func
8 | 
9 | 
10 | class CatSpider(Spider):
11 |     name = 'cat'
12 | 
13 |     def __init__(self, region_id=None, *arg, **kwargs):
14 |         super().__init__(*arg, **kwargs)
15 |         self.fingerprints = set()
16 | 
17 |         # setup region_id
18 |         if region_id is None:
19 |             print('-' * 45)
20 |             print('| CRITICAL: argument region_id is required!
|') 21 | print('-' * 45) 22 | raise Exception('argument region_id is required!') 23 | self.region_id = region_id 24 | self.start_urls = [START_URL_TEMPLATE.format(region_id=region_id)] 25 | 26 | @safe_func 27 | def parse(self, response): 28 | data = json.loads(response.text) 29 | if data['meta']['code'] in [404, 400]: 30 | return 31 | for item in data['result']['items']: 32 | cat = CatItem( 33 | id=int(item.get('id')), 34 | name=item.get('name'), 35 | is_metarubric=item.get('type') == 'metarubric', 36 | region=self.region_id 37 | ) 38 | 39 | if cat['is_metarubric']: 40 | for mod in range(-1, 2): 41 | uid_mod = cat['id'] + mod 42 | yield Request(url=CAT_URL_TEMPLATE.format(parent_id=uid_mod, region_id=self.region_id)) 43 | 44 | yield cat 45 | -------------------------------------------------------------------------------- /gis/scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import traceback 5 | from optparse import OptionParser 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | 9 | from gis.spiders.organizations_spider import OrganizationsSpider 10 | 11 | if __name__ == '__main__': 12 | 13 | parser = OptionParser() 14 | parser.add_option('-r', '--region', dest='region', help='scraping data for region', metavar='INT') 15 | parser.add_option('-c', '--categories', 16 | action='store_true', dest='is_cat_grabber', default=False, 17 | help='scraping only categories') 18 | 19 | (options, args) = parser.parse_args() 20 | 21 | try: 22 | # getter 23 | assert options.region 24 | if options.region == 'all': 25 | regions = list(range(1, 8)) 26 | else: 27 | regions = json.loads(f'[{options.region}]') 28 | 29 | # validation 30 | assert isinstance(regions, list) 31 | for i in regions: 32 | assert isinstance(i, int) 33 | 34 | except Exception: 35 | traceback.print_exc() 36 | print('''Usage: scraper.py [options] 37 | Options: 38 | -h, --help show this help message and exit 39 | -r INT, --region=INT scraping data for region 40 | ''') 41 | 42 | else: 43 | config = { 44 | 'FEED_URI': f'results.json', 45 | 'FEED_FORMAT': 'json', 46 | 'FEED_EXPORT_ENCODING': 'utf-8', 47 | 'ROBOTSTXT_OBEY': False, 48 | } 49 | process = CrawlerProcess(config) 50 | for i in regions: 51 | process.crawl(OrganizationsSpider, region_id=i, is_cat_grabber=options.is_cat_grabber) 52 | process.start() 53 | -------------------------------------------------------------------------------- /gis/gis/my_pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | from scrapy.exceptions import DropItem 10 | 11 | 12 | class GisPipeline(object): 13 | def process_item(self, item, spider): 14 | del spider 15 | return item 16 | 17 | 18 | # noinspection PyUnusedLocal 19 | class MongoPipeline(object): 20 | 21 | collection_name = 'scrapy_items' 22 | 23 | def __init__(self, mongo_uri, mongo_db): 24 | self.mongo_uri = mongo_uri 25 | self.mongo_db = mongo_db 26 | 27 | @classmethod 28 | def from_crawler(cls, crawler): 29 | return cls( 30 | mongo_uri=crawler.settings.get('MONGO_URI'), 31 | mongo_db=crawler.settings.get('MONGO_DATABASE', 'items') 32 | ) 33 | 34 | # noinspection PyAttributeOutsideInit 35 | def open_spider(self, spider): 36 | self.client = pymongo.MongoClient(self.mongo_uri) 37 
| self.db = self.client[self.mongo_db]
38 | 
39 |     def close_spider(self, spider):
40 |         self.client.close()
41 | 
42 |     def process_item(self, item, spider):
43 |         self.db[self.collection_name].insert_one(dict(item))
44 |         return item
45 | 
46 | 
47 | class DuplicatesPipeline(object):
48 | 
49 |     def __init__(self):
50 |         self.fingerprints = set()
51 | 
52 |     def process_item(self, item, spider):
53 |         del spider
54 |         name = item.get('name')
55 |         address = item.get('address_name')
56 |         fingerprint = f'{name}#{address}'
57 |         if fingerprint in self.fingerprints:
58 |             raise DropItem("Duplicate item found: %s" % item)
59 |         else:
60 |             self.fingerprints.add(fingerprint)
61 |             return item
62 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/org_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | 
3 | from scrapy import Spider, Request
4 | from gis.items import OrgItem
5 | 
6 | # noinspection SpellCheckingInspection
7 | URL_TEMPLATE = 'https://catalog.api.2gis.ru/2.0/catalog/branch/list' \
8 |                '?page=1' \
9 |                '&page_size=50' \
10 |                '&rubric_id={rubric_id}' \
11 |                '&region_id=5' \
12 |                '&locale=ru_RU' \
13 |                '&fields=items.contact_groups%2Citems.point' \
14 |                '&key=rutnpt3272'
15 | 
16 | 
17 | class OrgSpider(Spider):
18 |     name = 'org'
19 | 
20 |     def __init__(self, *arg, **kwargs):
21 |         super().__init__(*arg, **kwargs)
22 |         self.fingerprints = set()
23 | 
24 |     def start_requests(self):
25 |         cat_file = getattr(self, 'cat', None)
26 |         if cat_file is None:
27 |             return
28 |         with open(cat_file) as categories:
29 |             for item in json.loads(categories.read()):
30 |                 yield Request(URL_TEMPLATE.format(rubric_id=item['id']), self.parse)
31 | 
32 |     def parse(self, response):
33 |         data = json.loads(response.text)
34 |         if data['meta']['code'] == 404:
35 |             return
36 |         for item in data['result']['items']:
37 |             emails = list()
38 |             for contact in item.get('contact_groups', [{}])[0].get('contacts', []):
39 |                 if contact.get('type') == 'email':
40 |                     emails.append(contact.get('value'))
41 |             if emails:
42 |                 name = item.get('name')
43 |                 address = item.get('address_name')
44 |                 fingerprint = f'{name}#{address}'
45 |                 if fingerprint not in self.fingerprints:
46 |                     self.fingerprints.add(fingerprint)
47 |                     # OrgItem defines a single email field, so only the first address is kept
48 |                     yield OrgItem(
49 |                         name=name,
50 |                         address=address,
51 |                         lat=item.get('point', {}).get('lat'),
52 |                         lon=item.get('point', {}).get('lon'),
53 |                         email=emails[0],
54 |                     )
55 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### JetBrains template
3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
5 | 
6 | # User-specific stuff:
7 | .idea/**/tasks.xml
8 | .idea/dictionaries
9 | 
10 | # Sensitive or high-churn files:
11 | .idea/**/dataSources/
12 | .idea/**/dataSources.ids
13 | .idea/**/dataSources.xml
14 | .idea/**/dataSources.local.xml
15 | .idea/**/sqlDataSources.xml
16 | .idea/**/dynamic.xml
17 | .idea/**/uiDesigner.xml
18 | 
19 | # Gradle:
20 | .idea/**/gradle.xml
21 | .idea/**/libraries
22 | 
23 | # CMake
24 | cmake-build-debug/
25 | cmake-build-release/
26 | 
27 | # Mongo Explorer plugin:
28 | .idea/**/mongoSettings.xml 29 | 30 | ## File-based project format: 31 | *.iws 32 | 33 | ## Plugin-specific files: 34 | 35 | # IntelliJ 36 | out/ 37 | 38 | # mpeltonen/sbt-idea plugin 39 | .idea_modules/ 40 | 41 | # JIRA plugin 42 | atlassian-ide-plugin.xml 43 | 44 | # Cursive Clojure plugin 45 | .idea/replstate.xml 46 | 47 | # Crashlytics plugin (for Android Studio and IntelliJ) 48 | com_crashlytics_export_strings.xml 49 | crashlytics.properties 50 | crashlytics-build.properties 51 | fabric.properties 52 | ### Python template 53 | # Byte-compiled / optimized / DLL files 54 | *$py.class 55 | 56 | # C extensions 57 | *.so 58 | 59 | # Distribution / packaging 60 | .Python 61 | build/ 62 | develop-eggs/ 63 | dist/ 64 | downloads/ 65 | eggs/ 66 | .eggs/ 67 | lib64/ 68 | parts/ 69 | sdist/ 70 | var/ 71 | wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .coverage 91 | .coverage.* 92 | .cache 93 | nosetests.xml 94 | coverage.xml 95 | *.cover 96 | .hypothesis/ 97 | 98 | # Translations 99 | *.mo 100 | *.pot 101 | 102 | # Django stuff: 103 | *.log 104 | .static_storage/ 105 | .media/ 106 | local_settings.py 107 | 108 | # Flask stuff: 109 | instance/ 110 | .webassets-cache 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # Jupyter Notebook 122 | .ipynb_checkpoints 123 | 124 | # pyenv 125 | .python-version 126 | 127 | # celery beat schedule file 128 | celerybeat-schedule 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | 155 | .idea 156 | gis/*.json 157 | gis/*.xlsx 158 | -------------------------------------------------------------------------------- /gis/gis/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for gis project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'gis' 13 | 14 | SPIDER_MODULES = ['gis.spiders'] 15 | NEWSPIDER_MODULE = 'gis.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'gis (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'gis.middlewares.GisSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'gis.middlewares.GisDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'gis.pipelines.GisPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | FEED_EXPORT_ENCODING = 'utf-8' 93 | -------------------------------------------------------------------------------- /gis/gis/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for 
your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class GisSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class GisDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /categories/categories/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class CategoriesSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CategoriesDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /categories/categories/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for categories project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'categories' 13 | 14 | SPIDER_MODULES = ['categories.spiders'] 15 | NEWSPIDER_MODULE = 'categories.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'categories (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'categories.middlewares.CategoriesSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'categories.middlewares.CategoriesDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'categories.pipelines.CategoriesPipeline': 300, 69 | 'categories.pipelines.MongoPipeline': 900, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See 
https://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 | 
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 | 
93 | FEED_EXPORT_ENCODING = 'utf-8'
94 | 
95 | MONGO_URI = 'mongodb://localhost'
96 | MONGO_DATABASE = 'gis_db'
97 | 
98 | # noinspection SpellCheckingInspection
99 | START_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
100 |                      '?locale=ru_RU&region_id={region_id}&key=rutnpt3272'
101 | 
102 | # noinspection SpellCheckingInspection
103 | CAT_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
104 |                    '?parent_id={parent_id}&locale=ru_RU&region_id={region_id}&key=rutnpt3272'
105 | 
106 | # noinspection SpellCheckingInspection
107 | ORG_URL_TEMPLATE = 'https://catalog.api.2gis.ru/2.0/catalog/branch/list' \
108 |                    '?page={page}' \
109 |                    '&page_size=50' \
110 |                    '&rubric_id={rubric_id}' \
111 |                    '&region_id={region_id}' \
112 |                    '&locale=ru_RU' \
113 |                    '&fields=items.contact_groups%2Citems.rubrics%2Citems.point' \
114 |                    '&key=rutnpt3272'
115 | 
--------------------------------------------------------------------------------
/gis/gis/spiders/organizations_spider.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import traceback
4 | 
5 | from scrapy import Spider, Request
6 | from gis.items import OrgItem, CatItem
7 | 
8 | # noinspection SpellCheckingInspection
9 | START_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
10 |                      '?locale=ru_RU&region_id={region_id}&key=rutnpt3272'
11 | 
12 | # noinspection SpellCheckingInspection
13 | CAT_URL_TEMPLATE = 'https://catalog.api.2gis.ru/3.0/rubricator/list' \
14 |                    '?parent_id={parent_id}&locale=ru_RU&region_id={region_id}&key=rutnpt3272'
15 | 
16 | # noinspection SpellCheckingInspection
17 | ORG_URL_TEMPLATE = 'https://catalog.api.2gis.ru/2.0/catalog/branch/list' \
18 |                    '?page={page}' \
19 |                    '&page_size=50' \
20 |                    '&rubric_id={rubric_id}' \
21 |                    '&region_id={region_id}' \
22 |                    '&locale=ru_RU' \
23 |                    '&fields=items.contact_groups%2Citems.rubrics%2Citems.point' \
24 |                    '&key=rutnpt3272'
25 | 
26 | page_regex = r"page=([\d]+)"
27 | rubric_regex = r"rubric_id=([\d]+)"
28 | 
29 | 
30 | def safe_func(func):
31 | 
32 |     def wrapped_func(*arg, **kwargs):
33 |         try:
34 |             return func(*arg, **kwargs)
35 |         except Exception:
36 |             print('- ' * 50)
37 |             print('- ' * 50)
38 |             print('- ' * 50)
39 |             traceback.print_exc()
40 |             print('- ' * 50)
41 |             print('- ' * 50)
42 |             print('- ' * 50)
43 | 
44 |     return wrapped_func
45 | 
46 | 
47 | class OrganizationsSpider(Spider):
48 |     name = 'organizations'
49 | 
50 |     def __init__(self, region_id=None, is_cat_grabber=None, *arg, **kwargs):
51 |         super().__init__(*arg, **kwargs)
52 |         self.fingerprints = set()
53 | 
54 |         # setup region_id
55 |         if region_id is None:
56 |             print('-' * 45)
57 |             print('| CRITICAL: argument region_id is required! |')
58 |             print('-' * 45)
59 |             raise Exception('argument region_id is required!')
60 |         self.region_id = region_id
61 |         self.start_urls = [START_URL_TEMPLATE.format(region_id=region_id)]
62 |         self.is_cat_grabber = is_cat_grabber is True
63 | 
64 |     @safe_func
65 |     def parse(self, response):
66 |         data = json.loads(response.text)
67 |         if data['meta']['code'] in [404, 400]:
68 |             return
69 |         for item in data['result']['items']:
70 |             cat = CatItem(
71 |                 type='category',
72 |                 id=int(item.get('id')),
73 |                 name=item.get('name'),
74 |                 is_metarubric=item.get('type') == 'metarubric',
75 |                 region=self.region_id
76 |             )
77 | 
78 |             if cat['is_metarubric']:
79 |                 for mod in range(-1, 2):
80 |                     uid_mod = cat['id'] + mod
81 |                     yield Request(url=CAT_URL_TEMPLATE.format(parent_id=uid_mod, region_id=self.region_id))
82 | 
83 |             else:
84 |                 if not self.is_cat_grabber:
85 |                     yield Request(url=ORG_URL_TEMPLATE.format(rubric_id=cat['id'], region_id=self.region_id, page=1),
86 |                                   callback=self.parse_category)
87 | 
88 |             yield cat
89 | 
90 |     @safe_func
91 |     def parse_category(self, response):
92 |         data = json.loads(response.text)
93 |         if data['meta']['code'] in [404, 400]:
94 |             return
95 | 
96 |         # get next page
97 |         url = response.request.url
98 |         page = int(re.findall(page_regex, url)[0])
99 |         rubric_id = int(re.findall(rubric_regex, url)[0])
100 | 
101 |         if len(data['result']['items']) > 0:
102 |             yield Request(url=ORG_URL_TEMPLATE.format(rubric_id=rubric_id, region_id=self.region_id, page=page + 1),
103 |                           callback=self.parse_category)
104 | 
105 |         for item in data['result']['items']:
106 |             email = None
107 |             if len(item.get('contact_groups', [])) == 0:
108 |                 continue  # skip this organization instead of aborting the rest of the page
109 |             for contact in item.get('contact_groups', [{}])[0].get('contacts', []):
110 |                 if contact.get('type') == 'email':
111 |                     email = contact.get('value')
112 |             # skip if no email is specified
113 |             if email is None:
114 |                 continue
115 |             rubrics = [r.get('name') for r in item.get('rubrics', [])]
116 |             name = item.get('name')
117 |             address = item.get('address_name')
118 |             fingerprint = f'{name}#{address}'
119 |             if fingerprint not in self.fingerprints:
120 |                 self.fingerprints.add(fingerprint)
121 |                 point = item.get('point', {})
122 |                 point = point if point is not None else {}
123 |                 yield OrgItem(
124 |                     type='item',
125 |                     name=name,
126 |                     address=address,
127 |                     lat=point.get('lat'),
128 |                     lon=point.get('lon'),
129 |                     email=email,
130 |                     rubrics=json.dumps(rubrics, ensure_ascii=False),
131 |                     region=self.region_id
132 |                 )
133 | 
--------------------------------------------------------------------------------
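
A quick way to sanity-check the JSON-to-Excel path without running a crawl is the sketch below. It is not a file in this repository: the script name, the sample_result.json file name, and all sample values are made up for illustration, and it assumes it is run from the gis/ directory with xlsxwriter installed. It writes a tiny result list in the same shape the organizations spider exports (a mix of category and organization dicts) and feeds it to convert_json_to_xlsx().

#!/usr/bin/env python
# Hypothetical smoke test (not part of the repository). Builds a minimal result
# file in the export schema used by organizations_spider and converts it with
# the repo's own converter.
import json

from json_to_xlsx import convert_json_to_xlsx

sample = [
    # category entry: no 'address' key, so the converter should drop it
    {'type': 'category', 'id': 241, 'name': 'Example rubric', 'is_metarubric': False, 'region': 5},
    # organization entry: should become one row in Excel.xlsx
    {
        'type': 'item',
        'name': 'Example Org',
        'address': 'Example street, 1',
        'lat': 55.35,
        'lon': 86.08,
        'email': 'info@example.org',
        'rubrics': json.dumps(['Example rubric'], ensure_ascii=False),
        'region': 5,
    },
]

with open('sample_result.json', 'w', encoding='utf-8') as f:
    json.dump(sample, f, ensure_ascii=False)

convert_json_to_xlsx('sample_result.json')  # writes Excel.xlsx in the current directory

Because convert_json_to_xlsx() filters on the presence of an 'address' key, only the organization entry should end up in the resulting Excel.xlsx.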