├── parser_2gis ├── py.typed ├── cli │ ├── __init__.py │ └── app.py ├── gui │ ├── __init__.py │ ├── widgets │ │ ├── sg │ │ │ ├── __init__.py │ │ │ └── rubrics_tree.py │ │ └── tk │ │ │ ├── __init__.py │ │ │ ├── custom_entry.py │ │ │ ├── custom_text.py │ │ │ └── line_numbered_text.py │ ├── error_popup.py │ ├── urls_editor.py │ ├── rubric_selector.py │ ├── utils.py │ ├── urls_generator.py │ ├── settings.py │ └── app.py ├── version.py ├── data │ └── images │ │ ├── icon.icns │ │ ├── icon.ico │ │ ├── icon.png │ │ ├── logo.png │ │ ├── loading.gif │ │ ├── settings.png │ │ ├── rubric_item.png │ │ ├── rubric_folder.png │ │ └── settings_inverted.png ├── writer │ ├── models │ │ ├── __init__.py │ │ ├── point.py │ │ ├── org.py │ │ ├── reviews.py │ │ ├── address.py │ │ ├── rubric.py │ │ ├── name_ex.py │ │ ├── contact_group.py │ │ ├── catalog_item.py │ │ ├── schedule.py │ │ └── adm_div_item.py │ ├── exceptions.py │ ├── writers │ │ ├── __init__.py │ │ ├── xlsx_writer.py │ │ ├── json_writer.py │ │ ├── file_writer.py │ │ └── csv_writer.py │ ├── __init__.py │ ├── factory.py │ └── options.py ├── parser │ ├── exceptions.py │ ├── parsers │ │ ├── __init__.py │ │ ├── firm.py │ │ ├── in_building.py │ │ └── main.py │ ├── __init__.py │ ├── factory.py │ ├── options.py │ └── utils.py ├── runner │ ├── __init__.py │ ├── runner.py │ ├── cli.py │ └── gui.py ├── __init__.py ├── chrome │ ├── patches │ │ ├── __init__.py │ │ └── pychrome.py │ ├── __init__.py │ ├── exceptions.py │ ├── options.py │ ├── dom.py │ ├── browser.py │ └── utils.py ├── logger │ ├── __init__.py │ ├── options.py │ └── logger.py ├── exceptions.py ├── paths.py ├── config.py ├── common.py └── main.py ├── MANIFEST.in ├── .gitignore ├── parser-2gis.py ├── setup.cfg ├── tox.ini ├── .pre-commit-config.yaml ├── .github └── workflows │ ├── deploy.yml │ ├── dev_build.yml │ ├── tests.yml │ └── release.yml ├── scripts ├── update_rubrics_list.py └── update_cities_list.py ├── tests └── test_parser.py ├── README.md ├── CHANGELOG.md ├── setup.py 
└── LICENSE /parser_2gis/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft parser_2gis 2 | recursive-exclude * *.pyc *.pyo *.swo *.swp -------------------------------------------------------------------------------- /parser_2gis/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .app import cli_app 2 | 3 | __all__ = [ 4 | 'cli_app', 5 | ] 6 | -------------------------------------------------------------------------------- /parser_2gis/gui/__init__.py: -------------------------------------------------------------------------------- 1 | from .app import gui_app 2 | 3 | __all__ = [ 4 | 'gui_app', 5 | ] 6 | -------------------------------------------------------------------------------- /parser_2gis/version.py: -------------------------------------------------------------------------------- 1 | """Version info.""" 2 | 3 | version = '1.2.1' 4 | config_version = '0.1' 5 | -------------------------------------------------------------------------------- /parser_2gis/data/images/icon.icns: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/icon.icns -------------------------------------------------------------------------------- /parser_2gis/data/images/icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/icon.ico -------------------------------------------------------------------------------- /parser_2gis/data/images/icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/icon.png -------------------------------------------------------------------------------- /parser_2gis/data/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/logo.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | __pycache__/ 3 | dist/ 4 | .vscode/ 5 | *.egg-info/ 6 | .pytest_cache/ 7 | build/ 8 | .tox/ 9 | .mypy_cache/ -------------------------------------------------------------------------------- /parser_2gis/data/images/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/loading.gif -------------------------------------------------------------------------------- /parser-2gis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from parser_2gis import main 4 | 5 | if __name__ == '__main__': 6 | main() 7 | -------------------------------------------------------------------------------- /parser_2gis/data/images/settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/settings.png -------------------------------------------------------------------------------- /parser_2gis/gui/widgets/sg/__init__.py: -------------------------------------------------------------------------------- 1 | from .rubrics_tree import RubricsTree 2 | 3 | __all__ = [ 4 | 'RubricsTree', 5 | ] 6 | -------------------------------------------------------------------------------- /parser_2gis/writer/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .catalog_item import CatalogItem 2 | 3 | __all__ = [ 4 | 'CatalogItem', 5 | ] 6 | -------------------------------------------------------------------------------- /parser_2gis/data/images/rubric_item.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/rubric_item.png -------------------------------------------------------------------------------- /parser_2gis/data/images/rubric_folder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/rubric_folder.png -------------------------------------------------------------------------------- /parser_2gis/parser/exceptions.py: -------------------------------------------------------------------------------- 1 | class ParserException(Exception): 2 | pass 3 | 4 | 5 | __all__ = [ 6 | 'ParserException', 7 | ] 8 | -------------------------------------------------------------------------------- /parser_2gis/data/images/settings_inverted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interlark/parser-2gis/HEAD/parser_2gis/data/images/settings_inverted.png -------------------------------------------------------------------------------- /parser_2gis/parser/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .firm import FirmParser 2 | from .in_building import InBuildingParser 3 | from .main import MainParser 4 | -------------------------------------------------------------------------------- /parser_2gis/runner/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import CLIRunner 2 | from .gui import GUIRunner 3 | 4 | __all__ = [ 
5 | 'CLIRunner', 6 | 'GUIRunner', 7 | ] 8 | -------------------------------------------------------------------------------- /parser_2gis/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import main 2 | from .version import version as __version__ 3 | 4 | __all__ = [ 5 | 'main', 6 | '__version__', 7 | ] 8 | -------------------------------------------------------------------------------- /parser_2gis/chrome/patches/__init__.py: -------------------------------------------------------------------------------- 1 | from .pychrome import patch_pychrome 2 | 3 | 4 | def patch_all(): 5 | """Apply all custom patches.""" 6 | patch_pychrome() 7 | -------------------------------------------------------------------------------- /parser_2gis/chrome/__init__.py: -------------------------------------------------------------------------------- 1 | from .remote import ChromeRemote 2 | from .options import ChromeOptions 3 | 4 | __all__ = [ 5 | 'ChromeRemote', 6 | 'ChromeOptions', 7 | ] 8 | -------------------------------------------------------------------------------- /parser_2gis/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import get_parser 2 | from .options import ParserOptions 3 | 4 | __all__ = [ 5 | 'get_parser', 6 | 'ParserOptions', 7 | ] 8 | -------------------------------------------------------------------------------- /parser_2gis/writer/exceptions.py: -------------------------------------------------------------------------------- 1 | class WriterUnknownFileFormat(Exception): 2 | """Raises when user specified an unknown output file format.""" 3 | pass 4 | 5 | 6 | __all__ = [ 7 | 'WriterUnknownFileFormat', 8 | ] 9 | -------------------------------------------------------------------------------- /parser_2gis/writer/models/point.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
annotations 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Point(BaseModel): 7 | # Широта 8 | lat: float 9 | 10 | # Долгота 11 | lon: float 12 | -------------------------------------------------------------------------------- /parser_2gis/logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import logger, setup_cli_logger, setup_gui_logger 2 | from .options import LogOptions 3 | 4 | __all__ = [ 5 | 'logger', 6 | 'setup_cli_logger', 7 | 'setup_gui_logger', 8 | 'LogOptions', 9 | ] 10 | -------------------------------------------------------------------------------- /parser_2gis/gui/widgets/tk/__init__.py: -------------------------------------------------------------------------------- 1 | from .line_numbered_text import LineNumberedText 2 | from .custom_entry import CustomEntry 3 | from .custom_text import CustomText 4 | 5 | __all__ = [ 6 | 'LineNumberedText', 7 | 'CustomEntry', 8 | 'CustomText', 9 | ] 10 | -------------------------------------------------------------------------------- /parser_2gis/writer/writers/__init__.py: -------------------------------------------------------------------------------- 1 | from .file_writer import FileWriter 2 | from .csv_writer import CSVWriter 3 | from .json_writer import JSONWriter 4 | from .xlsx_writer import XLSXWriter 5 | 6 | __all__ = [ 7 | 'FileWriter', 8 | 'CSVWriter', 9 | 'XLSXWriter', 10 | 'JSONWriter', 11 | ] 12 | -------------------------------------------------------------------------------- /parser_2gis/writer/models/org.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Org(BaseModel): 7 | # Идентификатор 8 | id: str 9 | 10 | # Собственное имя организации 11 | name: str 12 | 13 | # Количество филиалов данной организации 14 | branch_count: int 15 | 
-------------------------------------------------------------------------------- /parser_2gis/writer/models/reviews.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | 8 | class Reviews(BaseModel): 9 | # Общий рейтинг 10 | general_rating: Optional[float] = None 11 | 12 | # Общее кол-во отзывов 13 | general_review_count: Optional[int] = None 14 | -------------------------------------------------------------------------------- /parser_2gis/writer/__init__.py: -------------------------------------------------------------------------------- 1 | from .options import WriterOptions, CSVOptions 2 | from .writers import CSVWriter, JSONWriter, FileWriter, XLSXWriter 3 | from .factory import get_writer 4 | 5 | __all__ = [ 6 | 'WriterOptions', 7 | 'CSVOptions', 8 | 'CSVWriter', 9 | 'XLSXWriter', 10 | 'JSONWriter', 11 | 'FileWriter', 12 | 'get_writer', 13 | ] 14 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 130 3 | max-complexity = 12 4 | ignore = C901,W503,E722,E731 5 | per-file-ignores = 6 | # module imported but unused 7 | __init__.py:F401, 8 | exceptions.py:F401, 9 | # one line parser's argument definition 10 | parser_2gis/main.py: E501 11 | 12 | [tool:pytest] 13 | pythonpath = . 
14 | testpaths = tests 15 | addopts = --capture=no --color=yes 16 | 17 | [mypy] 18 | ignore_missing_imports = true 19 | files = parser_2gis 20 | -------------------------------------------------------------------------------- /parser_2gis/cli/app.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from ..logger import setup_cli_logger 6 | from ..runner import CLIRunner 7 | 8 | if TYPE_CHECKING: 9 | from ..config import Configuration 10 | 11 | 12 | def cli_app(urls: list[str], output_path: str, format: str, config: Configuration) -> None: 13 | setup_cli_logger(config.log) 14 | 15 | runner = CLIRunner(urls, output_path, format, config) 16 | runner.start() 17 | -------------------------------------------------------------------------------- /parser_2gis/exceptions.py: -------------------------------------------------------------------------------- 1 | from .chrome.exceptions import (ChromeException, ChromePathNotFound, 2 | ChromeRuntimeException, 3 | ChromeUserAbortException) 4 | from .parser.exceptions import ParserException 5 | from .writer.exceptions import WriterUnknownFileFormat 6 | 7 | __all__ = [ 8 | 'ChromeException', 9 | 'ChromePathNotFound', 10 | 'ChromeRuntimeException', 11 | 'ChromeUserAbortException', 12 | 'ParserException', 13 | 'WriterUnknownFileFormat', 14 | ] 15 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = flake8, mypy, py37, py38, py39, py310 3 | toxworkdir = {env:TOX_WORK_DIR:.tox} 4 | 5 | [testenv] 6 | deps = 7 | -e .[dev] 8 | setenv = HOME = {envtmpdir} 9 | commands = 10 | pytest -v 11 | 12 | [testenv:flake8] 13 | deps = 14 | flake8>=3.8.4,<4.1 15 | commands = 16 | flake8 parser_2gis tests 17 | 18 | [gh-actions] 19 | python = 20 | 3.8: py38 21 | 3.9: py39 22 | 3.10: py310 23 | 
3.11: py311 24 | 25 | [testenv:mypy] 26 | deps = 27 | mypy==0.950 28 | types-requests==2.27.25 29 | 30 | commands = 31 | mypy {posargs} 32 | -------------------------------------------------------------------------------- /parser_2gis/chrome/exceptions.py: -------------------------------------------------------------------------------- 1 | from pychrome.exceptions import UserAbortException as ChromeUserAbortException 2 | from pychrome.exceptions import RuntimeException as ChromeRuntimeException 3 | 4 | 5 | class ChromeException(Exception): 6 | pass 7 | 8 | 9 | class ChromePathNotFound(ChromeException): 10 | def __init__(self, msg: str = 'Chrome браузер не найден', *args, **kwargs) -> None: 11 | super().__init__(msg, *args, **kwargs) 12 | 13 | 14 | __all__ = [ 15 | 'ChromeUserAbortException', 16 | 'ChromeRuntimeException', 17 | 'ChromeException', 18 | 'ChromePathNotFound', 19 | ] 20 | -------------------------------------------------------------------------------- /parser_2gis/runner/runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING 5 | 6 | if TYPE_CHECKING: 7 | from ..config import Configuration 8 | 9 | 10 | class AbstractRunner(ABC): 11 | def __init__(self, urls: list[str], output_path: str, format: str, config: Configuration): 12 | self._urls = urls 13 | self._output_path = output_path 14 | self._format = format 15 | self._config = config 16 | 17 | @abstractmethod 18 | def start(self): 19 | pass 20 | 21 | @abstractmethod 22 | def stop(self): 23 | pass 24 | -------------------------------------------------------------------------------- /parser_2gis/writer/models/address.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | 8 | class Address(BaseModel): 9 
| # Уникальный идентификатор дома, к которому относится данный адрес 10 | building_id: Optional[str] = None 11 | 12 | # Название здания (в адресе для филиалов) 13 | building_name: Optional[str] = None 14 | 15 | # Уникальный почтовый код здания 16 | building_code: Optional[str] = None 17 | 18 | # Почтовый индекс 19 | postcode: Optional[str] = None 20 | 21 | # Makani адрес объекта 22 | makani: Optional[str] = None 23 | -------------------------------------------------------------------------------- /parser_2gis/writer/models/rubric.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | 8 | class Rubric(BaseModel): 9 | # Уникальный идентификатор рубрики 10 | id: str 11 | 12 | # Тип рубрики. 13 | # Возможные значения: 14 | # * `primary` — основная 15 | # * `additional` — дополнительная 16 | kind: str 17 | 18 | # Собственное имя рубрики 19 | name: str 20 | 21 | # Короткий идентификатор рубрики 22 | short_id: int 23 | 24 | # Транслированное название страницы в web 25 | alias: Optional[str] = None 26 | 27 | # Идентификатор объединяющей рубрики 28 | parent_id: Optional[str] = None 29 | -------------------------------------------------------------------------------- /parser_2gis/parser/factory.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .parsers import FirmParser, InBuildingParser, MainParser 4 | 5 | 6 | def get_parser(url, chrome_options, parser_options): 7 | """Parser factory function. 8 | 9 | Args: 10 | url: 2GIS URLs with items to be collected. 11 | chrome_options: Chrome options. 12 | parser_options: Parser options. 13 | 14 | Returns: 15 | Parser instance. 
16 | """ 17 | for parser in (FirmParser, InBuildingParser, MainParser): 18 | if re.match(parser.url_pattern(), url): 19 | return parser(url, chrome_options, parser_options) 20 | 21 | # Default fallback 22 | return MainParser(url, chrome_options, parser_options) 23 | -------------------------------------------------------------------------------- /parser_2gis/writer/models/name_ex.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | 8 | class NameEx(BaseModel): 9 | # Собственное имя филиала 10 | primary: str 11 | 12 | # Расширение имени филиала (например "кафе") 13 | extension: Optional[str] = None 14 | 15 | # Юридическое название филиала (например "ООО Солнышко") 16 | legal_name: Optional[str] = None 17 | 18 | # Описание филиала (например "Склад") 19 | description: Optional[str] = None 20 | 21 | # Короткое имя на карте 22 | short_name: Optional[str] = None 23 | 24 | # Дополнительная информация к названию филиала, 25 | # которая должна быть показана в развёрнутой карточке 26 | addition: Optional[str] = None 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.2.0 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: check-merge-conflict 8 | - id: check-case-conflict 9 | - id: check-json 10 | - id: check-yaml 11 | - id: pretty-format-json 12 | args: [--autofix, --no-ensure-ascii, --no-sort-keys] 13 | - id: check-ast 14 | - id: debug-statements 15 | 16 | - repo: https://github.com/pycqa/flake8 17 | rev: 4.0.1 18 | hooks: 19 | - id: flake8 20 | language_version: python3 21 | 22 | - repo: https://github.com/pre-commit/mirrors-mypy 23 | rev: v0.950 24 | hooks: 25 | - id: mypy 26 | 
additional_dependencies: [types-requests==2.27.25] 27 | files: parser_2gis 28 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | deploy: 10 | name: Deploy 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Set up Python 3.10 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.10' 21 | 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | python -m pip install setuptools twine 26 | python -m pip install -e .[dev] 27 | 28 | - name: Build and publish 29 | env: 30 | TWINE_USERNAME: __token__ 31 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 32 | run: | 33 | python setup.py sdist bdist_wheel 34 | twine upload dist/* 35 | -------------------------------------------------------------------------------- /parser_2gis/writer/factory.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from .writers import CSVWriter, XLSXWriter, FileWriter, JSONWriter 6 | 7 | from .exceptions import WriterUnknownFileFormat 8 | 9 | if TYPE_CHECKING: 10 | from .options import WriterOptions 11 | 12 | 13 | def get_writer(file_path: str, file_format: str, writer_options: WriterOptions) -> FileWriter: 14 | """Writer factory function. 15 | 16 | Args: 17 | file_path: Path to the result file. 18 | file_format: `csv`, `xlsx` or `json` format. 19 | writer_options: Writer options. 20 | 21 | Returns: 22 | File Writer instance. 
23 | """ 24 | 25 | if file_format == 'json': 26 | return JSONWriter(file_path, writer_options) 27 | elif file_format == 'csv': 28 | return CSVWriter(file_path, writer_options) 29 | elif file_format == 'xlsx': 30 | return XLSXWriter(file_path, writer_options) 31 | 32 | raise WriterUnknownFileFormat('Неизвестный формат файла: %s', file_format) 33 | -------------------------------------------------------------------------------- /parser_2gis/gui/widgets/tk/custom_entry.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import tkinter as tk 4 | from typing import Any 5 | 6 | 7 | class CustomEntry(tk.Entry): 8 | """Custom entry widget that report on internal widget commands.""" 9 | def __init__(self, *args, **kwargs) -> None: 10 | super().__init__(*args, **kwargs) 11 | 12 | # Create a proxy for the underlying widget 13 | widget_name = self._w # type: ignore[attr-defined] 14 | self._orig = widget_name + '_orig' 15 | self.tk.call('rename', widget_name, self._orig) 16 | self.tk.createcommand(widget_name, self._proxy) 17 | 18 | def _proxy(self, command: Any, *args) -> Any: 19 | # Let the actual widget perform the requested action 20 | cmd = (self._orig, command) + args 21 | 22 | try: 23 | result = self.tk.call(cmd) 24 | except tk.TclError: 25 | result = '' 26 | 27 | # Generate an event if something was added or deleted 28 | if command in ('insert', 'delete', 'replace'): 29 | self.event_generate('<>', when='tail') 30 | 31 | return result 32 | -------------------------------------------------------------------------------- /parser_2gis/parser/options.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pydantic import BaseModel, NonNegativeInt, PositiveInt 4 | 5 | from ..chrome.options import default_memory_limit 6 | from ..common import floor_to_hundreds 7 | 8 | 9 | def default_max_records() -> int: 10 | """Try 
linear approximation for optimal max records.""" 11 | max_records = floor_to_hundreds((550 * default_memory_limit() / 1024 - 400)) 12 | return max_records if max_records > 0 else 1 13 | 14 | 15 | class ParserOptions(BaseModel): 16 | """Represent all possible options for Parser. 17 | 18 | Attributes: 19 | skip_404_response: Whether to skip 404 document response or not. 20 | delay_between_clicks: Delay between each item's click in milliseconds. 21 | max_records: Max number of records to parse from one URL. 22 | use_gc: Use Garbage Collector. 23 | gc_pages_interval: Run Garbage Collector every N pages (if `use_gc` enabled). 24 | """ 25 | skip_404_response: bool = True 26 | delay_between_clicks: NonNegativeInt = 0 27 | max_records: PositiveInt = default_max_records() 28 | use_gc: bool = False 29 | gc_pages_interval: PositiveInt = 10 30 | -------------------------------------------------------------------------------- /parser_2gis/logger/options.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | 5 | from pydantic import BaseModel, validator 6 | 7 | 8 | class LogOptions(BaseModel): 9 | # Format string (percent style) 10 | gui_format: str = '%(asctime)s.%(msecs)03d | %(message)s' 11 | cli_format: str = '%(asctime)s.%(msecs)03d | %(levelname)-8s | %(message)s' 12 | 13 | # Date format 14 | gui_datefmt: str = '%H:%M:%S' 15 | cli_datefmt: str = '%d/%m/%Y %H:%M:%S' 16 | 17 | # Level 18 | level: str = 'INFO' 19 | 20 | @validator('level') 21 | def level_validation(cls, v: str) -> str: 22 | v = v.upper() 23 | if v not in ('ERROR', 'WARNING', 'WARN', 'INFO', 24 | 'DEBUG', 'FATAL', 'CRITICAL', 'NOTSET'): 25 | raise ValueError('Level name not found') 26 | 27 | return v 28 | 29 | @validator('gui_format', 'cli_format') 30 | def format_validation(cls, v: str) -> str: 31 | """Validate percent style format string.""" 32 | fmt_match = re.match(r'%\(\w+\)[#0+ 
-]*(\*|\d+)?(\.(\*|\d+))?[diouxefgcrsa%]', v, re.I) 33 | if not fmt_match: 34 | raise ValueError('Format string is invalid') 35 | 36 | return v 37 | -------------------------------------------------------------------------------- /parser_2gis/chrome/options.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | from typing import Optional 5 | 6 | import psutil 7 | from pydantic import BaseModel, PositiveInt 8 | 9 | from ..common import floor_to_hundreds 10 | 11 | 12 | def default_memory_limit() -> int: 13 | """Default memory limit for V8, 0.75 of total physical memory.""" 14 | memory_total = psutil.virtual_memory().total / 1024 ** 2 # MB 15 | return floor_to_hundreds(round(0.75 * memory_total)) 16 | 17 | 18 | class ChromeOptions(BaseModel): 19 | """Represents all possible options for Chrome. 20 | 21 | Attributes: 22 | binary_path: Chrome binary path. If not set, tries to find automatically. 23 | start_maximized: Start browser maximized. 24 | headless: Start browser hidden, without GUI. 25 | disable_images: Disable images. 26 | silent_browser: Do not show Chrome's output in `stdout`. 27 | memory_limit: Max V8's memory size. 
28 | """ 29 | binary_path: Optional[pathlib.Path] = None 30 | start_maximized: bool = False 31 | headless: bool = False 32 | disable_images: bool = True 33 | silent_browser: bool = True 34 | memory_limit: PositiveInt = default_memory_limit() 35 | -------------------------------------------------------------------------------- /parser_2gis/writer/writers/xlsx_writer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import csv 4 | import os 5 | import shutil 6 | 7 | from xlsxwriter.workbook import Workbook 8 | 9 | from .csv_writer import CSVWriter 10 | 11 | 12 | class XLSXWriter(CSVWriter): 13 | """Writer (post-process converter) to XLSX table.""" 14 | 15 | def __exit__(self, *exc_info) -> None: 16 | super().__exit__(*exc_info) 17 | 18 | # Convert csv to xlsx table 19 | tmp_xlx_name = os.path.splitext(self._file_path)[0] + '.converted.xlsx' 20 | with Workbook(tmp_xlx_name) as workbook: 21 | bold = workbook.add_format({'bold': True}) # Add header format 22 | 23 | worksheet = workbook.add_worksheet() 24 | with self._open_file(self._file_path, 'r') as f_csv: 25 | csv_reader = csv.reader(f_csv) 26 | for r, row in enumerate(csv_reader): 27 | for c, col in enumerate(row): 28 | if r == 0: 29 | worksheet.write(r, c, col, bold) # Write header 30 | else: 31 | worksheet.write(r, c, col) 32 | 33 | # Replace original table with new one 34 | shutil.move(tmp_xlx_name, self._file_path) 35 | -------------------------------------------------------------------------------- /.github/workflows/dev_build.yml: -------------------------------------------------------------------------------- 1 | name: Development Build 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | build: 8 | name: Development Build 9 | 10 | strategy: 11 | matrix: 12 | include: 13 | - os: ubuntu-20.04 14 | dist: linux 15 | ext: 16 | 17 | - os: macos-12 18 | dist: macos 19 | ext: 20 | 21 | - os: macos-14 22 | dist: macos-arm 23 | ext: 24 | 25 | - os: 
def patch_pychrome():
    """Install a `_recv_loop` replacement on `pychrome.tab.Tab`.

    The replacement receive loop skips empty websocket messages instead of
    passing them to `json.loads`.
    """
    def _recv_loop_patched(self):
        # Keep reading until the tab is marked as stopped
        while not self._stopped.is_set():
            try:
                # Short timeout so the stop flag is re-checked regularly
                self._ws.settimeout(1)
                message_json = self._ws.recv()
                if not message_json:
                    # Empty message, nothing to decode
                    continue
                message = json.loads(message_json)
            except websocket.WebSocketTimeoutException:
                continue
            except (websocket.WebSocketException, OSError):
                # Report the failure only if the tab was not stopped already
                if not self._stopped.is_set():
                    pychrome_logger.error('websocket exception', exc_info=True)
                    self._stopped.set()
                return

            if self.debug:  # pragma: no cover
                print('< RECV %s' % message_json)

            # Messages with 'method' go to the event queue
            if 'method' in message:
                self.event_queue.put(message)

            # Messages with 'id' go to the matching result queue, if one is registered
            elif 'id' in message:
                if message['id'] in self.method_results:
                    self.method_results[message['id']].put(message)
            else:  # pragma: no cover
                warnings.warn('unknown message: %s' % message)

    pychrome.tab.Tab._recv_loop = _recv_loop_patched
class CustomText(tk.Text):
    """Custom text widget that reports on internal widget commands.

    The underlying Tcl widget command is renamed and replaced with a proxy,
    so every widget command passes through `_proxy`. When the content, the
    insertion cursor or the view changes, a `<<Change>>` virtual event
    is generated.
    """
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # Create a proxy for the underlying widget
        widget_name = self._w  # type: ignore[attr-defined]
        self._orig = widget_name + '_orig'
        self.tk.call('rename', widget_name, self._orig)
        self.tk.createcommand(widget_name, self._proxy)

    def _proxy(self, *args) -> Any:
        """Forward a widget command to the renamed widget and report changes.

        Args:
            *args: Widget command and its arguments.

        Returns:
            Whatever the actual widget returned, or an empty string
            if the command failed with `tk.TclError`.
        """
        # Let the actual widget perform the requested action
        cmd = (self._orig,) + args

        try:
            result = self.tk.call(cmd)
        except tk.TclError:
            result = ''

        # Generate an event if something was added or deleted,
        # or the cursor position changed.
        if (
            args[0] in ('insert', 'replace', 'delete')
            or args[0:3] == ('mark', 'set', 'insert')
            or args[0:2] == ('xview', 'moveto')
            or args[0:2] == ('xview', 'scroll')
            or args[0:2] == ('yview', 'moveto')
            or args[0:2] == ('yview', 'scroll')
        ):
            # Virtual events need the double angle bracket form;
            # an empty '<>' sequence is rejected by Tk.
            self.event_generate('<<Change>>', when='tail')

        # Return what the actual widget returned
        return result
class WriterOptions(BaseModel):
    """Represent all possible options for File Writer.

    Attributes:
        encoding: Encoding of output document.
        verbose: Echo to stdout parsing item's name.
        csv: Options for CSV Writer.
    """
    encoding: str = 'utf-8-sig'
    verbose: bool = True
    csv: CSVOptions = CSVOptions()

    @validator('encoding')
    def encoding_exists(cls, v: str) -> str:
        """Determine if `encoding` exists.

        Args:
            v: Encoding name to check.

        Returns:
            The same encoding name if it is known to `codecs`.

        Raises:
            ValueError: If the encoding is unknown.
        """
        try:
            codecs.lookup(v)
        except LookupError as e:
            # Give the validation error a message instead of an empty one
            raise ValueError(f'unknown encoding: {v!r}') from e
        return v
@ensure_gui_enabled
def gui_error_popup(error_msg: str) -> None:
    """Show a modal error window and block until it is closed.

    Args:
        error_msg: Error message.
    """
    # App color theme and icon
    sg.theme('Green')
    sg.set_global_icon(image_data('icon', 'png'))

    # Wrap the message to a fixed width
    wrapped_lines = textwrap.wrap(error_msg, width=60, replace_whitespace=False,
                                  break_on_hyphens=False)
    error_msg = '\n'.join(wrapped_lines)

    # Window layout: message row, then a centered close button row
    message_row = [sg.Text(error_msg)]
    close_button = sg.Button('Закрыть', key='-BTN_CLOSE-', size=(8, 1), button_color='firebrick3',
                             focus=True, bind_return_key=True, pad=((0, 0), 3))
    button_row = [sg.Column([[close_button]], expand_x=True, element_justification='center')]
    layout = [message_row, button_row]

    if running_linux():
        window_title = 'Error'
    else:
        window_title = 'Ошибка'

    window = sg.Window(window_title, layout, auto_size_text=True, finalize=True,
                       font='Any 12', modal=True, keep_on_top=True)

    # Read events until the window is closed or the button is pressed
    event, _ = window.Read()
    while event not in (None, '-BTN_CLOSE-'):
        event, _ = window.Read()

    window.close()
    del window
18 | """ 19 | def start(self): 20 | logger.info('Парсинг запущен.') 21 | try: 22 | with get_writer(self._output_path, self._format, self._config.writer) as writer: 23 | for url in self._urls: 24 | logger.info(f'Парсинг ссылки {url}') 25 | with get_parser(url, 26 | chrome_options=self._config.chrome, 27 | parser_options=self._config.parser) as parser: 28 | try: 29 | parser.parse(writer) 30 | finally: 31 | logger.info('Парсинг ссылки завершён.') 32 | except (KeyboardInterrupt, ChromeUserAbortException): 33 | logger.error('Работа парсера прервана пользователем.') 34 | except Exception as e: 35 | if isinstance(e, ChromeRuntimeException) and str(e) == 'Tab has been stopped': 36 | logger.error('Вкладка браузера была закрыта.') 37 | else: 38 | logger.error('Ошибка во время работы парсера.', exc_info=True) 39 | finally: 40 | logger.info('Парсинг завершён.') 41 | 42 | def stop(self): 43 | pass 44 | -------------------------------------------------------------------------------- /scripts/update_rubrics_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Download rubrics. 
import json
import os
import sys

# Import the package; on failure add the repository root
# to `sys.path` and try once more.
for _ in range(2):
    try:
        import parser_2gis.paths
        from parser_2gis.chrome import (ChromeOptions,
                                        ChromeRemote)
        break
    except ImportError:
        here = os.path.dirname(os.path.abspath(__file__))
        parent_dir = os.path.abspath(os.path.join(here, os.pardir))
        if parent_dir not in sys.path:
            sys.path.insert(1, parent_dir)
else:
    # Both attempts failed, stop here instead of hitting a NameError below
    sys.exit('Unable to import parser_2gis package!')

# Get available rubrics from https://data.2gis.com and save them to data/rubrics.json

_REGIONS_LIST_RESPONSE = r'https://hermes.2gis.ru/api/data/availableParameters'

chrome_options = ChromeOptions(headless=True)
with ChromeRemote(chrome_options, [_REGIONS_LIST_RESPONSE]) as chrome_remote:
    chrome_remote.navigate('https://data.2gis.com')
    response = chrome_remote.wait_response(_REGIONS_LIST_RESPONSE)
    data = chrome_remote.get_response_body(response)

try:
    doc = json.loads(data)
except json.JSONDecodeError:
    print('Returned invalid JSON document!', file=sys.stderr)
    sys.exit(1)

if not doc:
    print('No response, bail!', file=sys.stderr)
    sys.exit(1)

# Cherry-pick
rubrics = doc['rubrics']
for v in rubrics.values():
    del v['totalCount']
    del v['groupId']

# Check for special None rubric.
# Explicit check instead of `assert`, which is stripped under `python -O`.
if not any(x['label'] == 'Без рубрики' for x in rubrics.values()):
    print('Special "Без рубрики" rubric is missing!', file=sys.stderr)
    sys.exit(1)

# Save rubrics list
rubrics_path = parser_2gis.paths.data_path() / 'rubrics.json'
with open(rubrics_path, 'w', encoding='utf-8') as f:
    json.dump(rubrics, f, ensure_ascii=False, indent=4)
class Contact(BaseModel):
    """Single contact record."""
    # Contact type.
    # Possible values:
    # * `email` — e-mail
    # * `website` — website, http protocol
    # * `phone` — phone
    # * `fax` — fax
    # * `icq` — ICQ account
    # * `jabber` — Jabber
    # * `skype` — Skype
    # * `vkontakte` — VKontakte
    # * `twitter` — Twitter
    # * `instagram` — Instagram
    # * `facebook` — Facebook
    # * `pobox` — P.O.Box
    # * `youtube` — Youtube
    # * `odnoklassniki` — ok.ru
    # * `googleplus` — Google +
    # * `linkedin` — Linkedin
    # * `pinterest` — Pinterest
    # * `whatsapp` — Whatsapp
    # * `telegram` — Telegram
    # * `viber` — Viber
    type: str

    # Technical value of the contact (e.g. "Phone in international format")
    value: str

    # Contact value for on-screen display (e.g. "Ivanov's e-mail")
    text: Optional[str] = None

    # Link to a website or a social network
    url: Optional[str] = None

    # Contact value for printing (e.g. "Ivanov's e-mail")
    print_text: Optional[str] = None

    # Clarifying information about the contact (e.g. "for business correspondence")
    comment: Optional[str] = None


class ContactGroup(BaseModel):
    """Group of contacts with optional schedule, comment and name."""
    # List of contacts
    contacts: List[Contact]

    # Schedule of the contact group
    schedule: Optional[Schedule] = None

    # Comment on the contact group (e.g. "Multichannel phone")
    comment: Optional[str] = None

    # Name of the contact group (e.g. "Service center")
    name: Optional[str] = None
7 | 8 | During the parsing we don't need requests that could slow 9 | down the speed or increase memory consumption or 10 | send any logs of automatic bot activity. 11 | 12 | The lists are separated: basic and extended that includes 13 | images, styles, map tiles, fonts and other visual-related 14 | resources. 15 | 16 | Args: 17 | extended: Whether to return extended list or basic. 18 | 19 | Returns: 20 | List of blocking url patterns. 21 | """ 22 | # Metrics, logging, analytics, counters, ads, etc. 23 | blocked_requests: list[str] = [ 24 | 'https://favorites.api.2gis.*/*', 25 | 'https://2gis.*/_/log', 26 | 'https://2gis.*/_/metrics', 27 | 'https://google-analytics.com/*', 28 | 'https://www.google-analytics.com/*', 29 | 'https://counter.yadro.ru/*', 30 | 'https://www.tns-counter.ru/*', 31 | 'https://mc.yandex.ru/*', 32 | 'https://catalog.api.2gis.ru/3.0/ads/*', 33 | 'https://d-assets.2gis.*/privacyPolicyBanner*.js', 34 | 'https://vk.com/*', 35 | ] 36 | 37 | # Styles, map tiles, images, etc. 
class FirmParser(MainParser):
    """Parser for the firms provided by 2GIS.

    URL pattern for such cases: https://2gis.<domain>/firm/<firm_id>
    """
    @staticmethod
    def url_pattern():
        """URL pattern for the parser."""
        return r'https?://2gis\.[^/]+(/[^/]+)?/firm/.*'

    def parse(self, writer: FileWriter) -> None:
        """Parse URL with an organization.

        Navigates to the firm URL, reads `window.initialState` of the page
        and writes the first profile entry with `writer`.

        Args:
            writer: Target file writer.
        """
        # Go URL
        self._chrome_remote.navigate(self._url, referer='https://google.com', timeout=120)

        # Document loaded, get its response
        responses = self._chrome_remote.get_responses(timeout=5)
        if not responses:
            logger.error('Ошибка получения ответа сервера.')
            return
        document_response = responses[0]

        # Handle 404
        assert document_response['mimeType'] == 'text/html'
        if document_response['status'] == 404:
            # `Logger.warn` is a deprecated alias, use `warning`
            logger.warning('Сервер вернул сообщение "Организация не найдена".')

            if self._options.skip_404_response:
                return

        # Wait all 2GIS requests get finished
        self._wait_requests_finished()

        # Gather response and collect useful payload.
        initial_state = self._chrome_remote.execute_script('window.initialState')
        data = list(initial_state['data']['entity']['profile'].values())
        if not data:
            logger.warning('Данные организации не найдены.')
            return
        doc = data[0]

        # Write API document into a file
        writer.write({
            'result': {
                'items': [doc['data']]
            },
            'meta': doc['meta']
        })
25 | 26 | Args: 27 | file_path: Path to JSON file. 28 | num_records: Expected number of records. 29 | """ 30 | with open(result_path, 'r', encoding='utf-8-sig', errors='replace') as f: 31 | doc = json.load(f) 32 | assert len(doc) == num_records 33 | 34 | 35 | testdata = [ 36 | ['csv', check_csv_result], 37 | ['json', check_json_result], 38 | ] 39 | 40 | 41 | @pytest.mark.parametrize('format, result_checker', testdata) 42 | def test_parser(monkeypatch, format, result_checker, num_records=5): 43 | """Parse TOP `num_records` entries and check result file. 44 | 45 | Args: 46 | format: Result format (`csv` or `json`). 47 | result_checker: Function that checks parsed result. 48 | num_records: Number of records to be parsed. 49 | """ 50 | with monkeypatch.context() as m, TemporaryDirectory() as tmpdir: 51 | result_path = os.path.join(tmpdir, f'output.{format}') 52 | 53 | m.setattr(sys, 'argv', [ 54 | os.path.abspath(__file__), 55 | '-i', 'https://2gis.ru/moscow/search/Аптеки', 56 | '-o', result_path, 57 | '-f', format, 58 | '--parser.max-records', f'{num_records}', 59 | '--chrome.headless', 'yes', 60 | ]) 61 | 62 | # Run parser on a popular query 63 | # that gotta have at least `num_records` records. 
class TextLineNumbers(tk.Canvas):
    """Numbered Line Widget."""
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.textwidget: tk.Text | None = None

    def attach(self, textwidget: tk.Text) -> None:
        """Set the text widget whose lines are numbered."""
        self.textwidget = textwidget

    def redraw(self) -> None:
        """Redraw line numbers for the lines the text widget displays."""
        self.delete('all')

        assert self.textwidget, 'Attach textwidget first'
        # Start from the line at the top-left corner of the text widget
        i = self.textwidget.index('@0,0')
        while True:
            bbox = self.textwidget.dlineinfo(i)
            if bbox is None:
                # No display info for this line, stop drawing
                break

            y = bbox[1]
            line_n = f'{i}'.split('.')[0]

            self.create_text(2, y, anchor='nw', text=line_n, font=('TkDefaultFont', 12))
            i = self.textwidget.index(f'{i}+1line')


class LineNumberedText(tk.Frame):
    """Combined Numbered Line and Customized Text Widgets."""
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.text = CustomText(self)

        self.vsb = tk.Scrollbar(self, orient='vertical', command=self.text.yview)
        self.hsb = tk.Scrollbar(self, orient='horizontal', command=self.text.xview)
        self.text.configure(yscrollcommand=self.vsb.set)
        self.text.configure(xscrollcommand=self.hsb.set)
        self.text.configure(wrap='none', undo=True)

        self.linenumbers = TextLineNumbers(self, width=34)
        self.linenumbers.attach(self.text)

        self.vsb.pack(side='right', fill='y')
        self.hsb.pack(side='bottom', fill='x')
        self.linenumbers.pack(side='left', fill='y')
        self.text.pack(side='right', fill='both', expand=True)

        # Redraw numbers when the text reports a change or the widget is resized.
        # Empty '<>' / '' sequences are not valid Tk event patterns.
        self.text.bind('<<Change>>', self._on_change)
        self.text.bind('<Configure>', self._on_change)

    def _on_change(self, event: tk.Event) -> None:
        """Redraw line numbers on any bound event."""
        self.linenumbers.redraw()
53 | """ 54 | setup_logger( 55 | options.level, 56 | options.cli_format, 57 | options.cli_datefmt, 58 | ) 59 | 60 | 61 | def setup_logger(level: str, fmt: str, datefmt: str) -> None: 62 | """Setup logger. 63 | 64 | Args: 65 | level: logger level. 66 | fmt: format string in percent style. 67 | datefmt: date format string. 68 | """ 69 | if not logger.handlers: 70 | handler = logging.StreamHandler() 71 | formatter = logging.Formatter(fmt, datefmt) 72 | handler.setFormatter(formatter) 73 | 74 | logger.addHandler(handler) 75 | logger.setLevel(level) 76 | 77 | 78 | logger = logging.getLogger(_LOGGER_NAME) 79 | -------------------------------------------------------------------------------- /scripts/update_cities_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Download cities info for following countries: 4 | # ae, az, bh, by, cl, cy, cz, eg, it, kg, kw, kz, om, qa, ru, sa, uz 5 | 6 | import json 7 | import os 8 | import sys 9 | 10 | for _ in range(2): 11 | try: 12 | import parser_2gis.paths 13 | from parser_2gis.chrome import (ChromeOptions, 14 | ChromeRemote) 15 | break 16 | except ImportError: 17 | here = os.path.dirname(os.path.abspath(__file__)) 18 | parent_dir = os.path.abspath(os.path.join(here, os.pardir)) 19 | if parent_dir not in sys.path: 20 | sys.path.insert(1, parent_dir) 21 | 22 | # Get available cities from https://data.2gis.com and save it to data/cities.json 23 | 24 | _REGIONS_LIST_RESPONSE = r'https://catalog\.api\.2gis.[^/]+/.*/region/list' 25 | 26 | # NOTE: 27 | # There are also cities list in 'https://hermes.2gis.ru/api/data/availableParameters' 28 | # It has less entries than in '/region/list', but more structured (tree vs flat list). 29 | # Better use '/region/list' for parsing purpose. 
class FileWriter(ABC):
    """Base writer.

    Used as a context manager: the output file is opened on enter
    and closed on exit.
    """
    def __init__(self, file_path: str, writer_options: WriterOptions) -> None:
        self._file_path = file_path
        self._options = writer_options

    @abstractmethod
    def write(self, catalog_doc: Any) -> None:
        """Write Catalog Item API JSON document retrieved by parser."""
        pass

    def _open_file(self, file_path: str, mode: str = 'r') -> IO[Any]:
        """Open `file_path` in text mode with the configured encoding."""
        return open(file_path, mode, encoding=self._options.encoding,
                    newline='', errors='replace')

    def _check_catalog_doc(self, catalog_doc: Any, verbose: bool = True) -> bool:
        """Check Catalog Item API JSON document for errors.

        Args:
            catalog_doc: Catalog Item API JSON document.
            verbose: Whether to report about found errors.

        Returns:
            `True` if document passed all checks.
            `False` if errors found in document.
        """
        is_valid = False
        items: Any = None

        # Explicit checks instead of `assert`, which is stripped under `python -O`
        if isinstance(catalog_doc, dict):
            try:
                meta = catalog_doc['meta']
                if 'error' in meta:  # An error is found
                    if verbose:
                        error_msg = meta['error'].get('message', None)
                        if error_msg:
                            logger.error('Сервер ответил ошибкой: %s', error_msg)
                        else:
                            logger.error('Сервер ответил неизвестной ошибкой.')

                    return False

                items = catalog_doc['result']['items']
                is_valid = (
                    meta['code'] == 200
                    and isinstance(items, list)
                    and len(items) > 0
                    and isinstance(items[0], dict)
                )
            except (KeyError, TypeError, AttributeError):
                # Malformed document (missing keys, `None` instead of a mapping, etc.)
                is_valid = False

        if not is_valid:
            if verbose:
                logger.error('Сервер ответил неизвестным документом.')
            return False

        if len(items) > 1 and verbose:
            logger.warning('Сервер вернул больше одного ответа.')

        return True

    def __enter__(self) -> FileWriter:
        self._file = self._open_file(self._file_path, 'w')
        return self

    def __exit__(self, *exc_info) -> None:
        self._file.close()
def data_path() -> pathlib.Path:
    """Get package's data path.

    The base directory is taken from the `_MEIPASS2` environment variable
    when it is set, otherwise from the directory of this module.
    """
    base_dir = os.environ.get('_MEIPASS2')
    if base_dir is None:
        base_dir = os.path.dirname(os.path.abspath(__file__))

    return pathlib.Path(os.path.join(base_dir, 'data'))


def user_path(is_config: bool = True) -> pathlib.Path:
    """Get user path depending on running OS.

    Note:
        Possible path location depending on running OS:
        * Unix: ~/.config/parser-2gis or ~/.local/share/parser-2gis (depends on `is_config` flag)
        * Mac: ~/Library/Application Support/parser-2gis/
        * Win: C:\\Users\\%USERPROFILE%\\AppData\\Local\\parser-2gis
    """
    if running_windows():
        import ctypes

        CSIDL_LOCAL_APPDATA = 28
        buf = ctypes.create_unicode_buffer(1024)
        ctypes.windll.shell32.SHGetFolderPathW(None, CSIDL_LOCAL_APPDATA, None, 0, buf)  # type: ignore
        base_dir = buf.value
    elif running_mac():
        base_dir = os.path.expanduser('~/Library/Application Support')
    elif is_config:
        base_dir = os.getenv('XDG_CONFIG_HOME', os.path.expanduser('~/.config'))
    else:
        base_dir = os.getenv('XDG_DATA_HOME', os.path.expanduser('~/.local/share'))

    return pathlib.Path(os.path.join(base_dir, 'parser-2gis'))


@functools.lru_cache()
def image_path(basename: str, ext: str | None = None) -> str:
    """Get path of image `basename`.`ext`.
    Extension is ignored if `ext` set to `None`.

    Args:
        basename: Image basename.
        ext: Image extension.

    Returns:
        Image path.

    Raises:
        FileNotFoundError: If no matching image is found.
    """
    images_dir = data_path() / 'images'
    wanted_ext = None if ext is None else f'.{ext}'

    for file_name in os.listdir(images_dir):
        stem, suffix = os.path.splitext(file_name)
        if stem != basename:
            continue
        if wanted_ext is None or suffix == wanted_ext:
            return os.path.abspath(images_dir / file_name)

    raise FileNotFoundError(f'Изображение {basename} не найдено')


@functools.lru_cache()
def image_data(basename: str, ext: str | None = None) -> bytes:
    """Get base64-encoded content of image `basename`.`ext`.
    Extension is ignored if `ext` set to `None`.

    Args:
        basename: Image basename.
        ext: Image extension.

    Returns:
        Image data.
    """
    file_path = image_path(basename, ext)
    with open(file_path, 'rb') as f_img:
        raw = f_img.read()
    return base64.b64encode(raw)
List[AdmDivItem] = [] 33 | 34 | # Алиас города, в котором находится объект (например "perm") 35 | city_alias: Optional[str] = None 36 | 37 | # Контакты филиала 38 | contact_groups: List[ContactGroup] = [] 39 | 40 | # Текущая локаль для региона (например "ru_RU") 41 | locale: str 42 | 43 | # Полное собственное название филиала или название организации (например "Солнышко, кафе") 44 | name: Optional[str] = None 45 | 46 | # Расширеное название филиала 47 | name_ex: Optional[NameEx] = None 48 | 49 | # Отзывы 50 | reviews: Optional[Reviews] = None 51 | 52 | # Организация 53 | org: Optional[Org] = None 54 | 55 | # Координаты точки поиска, заданные в системе координат WGS84 в формате lon, lat 56 | point: Optional[Point] = None 57 | 58 | # Уникальный идентификатор проекта 59 | region_id: Optional[str] = None 60 | 61 | # Уникальный идентификатор сегмента 62 | segment_id: Optional[str] = None 63 | 64 | # Рубрики филиала 65 | rubrics: List[Rubric] = [] 66 | 67 | # Время работы 68 | schedule: Optional[Schedule] = None 69 | 70 | # Смещение таймзоны в минутах относительно UTC0 (например "420") 71 | timezone_offset: Optional[int] = None 72 | 73 | # Тип объекта 74 | type: str 75 | 76 | # Признак удаленного объекта 77 | is_deleted: Optional[bool] = None 78 | 79 | @property 80 | def url(self) -> str: 81 | return 'https://2gis.com/firm/%s' % self.id.split('_')[0] 82 | 83 | @property 84 | def timezone(self) -> str | None: 85 | if self.timezone_offset is None: 86 | return None 87 | sign = '-' if self.timezone_offset < 0 else '+' 88 | minutes = abs(self.timezone_offset) 89 | h = minutes // 60 90 | m = minutes % 60 91 | return '{}{:02d}:{:02d}'.format(sign, h, m) 92 | -------------------------------------------------------------------------------- /parser_2gis/writer/models/schedule.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List, Optional 4 | 5 | from pydantic import BaseModel, 
class WorkingHour(BaseModel):
    """Single working interval of a day."""
    # Value in hh:mm format
    from_: str = Field(..., alias='from')

    # Value in hh:mm format
    to: str


class ScheduleDay(BaseModel):
    """Working hours of one day of the week."""
    # Working hours
    working_hours: List[WorkingHour]


class Schedule(BaseModel):
    """Weekly working schedule."""
    # Monday
    Mon: Optional[ScheduleDay] = None

    # Tuesday
    Tue: Optional[ScheduleDay] = None

    # Wednesday
    Wed: Optional[ScheduleDay] = None

    # Thursday
    Thu: Optional[ScheduleDay] = None

    # Friday
    Fri: Optional[ScheduleDay] = None

    # Saturday
    Sat: Optional[ScheduleDay] = None

    # Sunday
    Sun: Optional[ScheduleDay] = None

    # Flag telling that the organization works around the clock, 7 days a week.
    # If the field is missing, the organization is not considered to work 24x7.
    is_24x7: Optional[bool] = None

    # Localized description of possible changes in working time.
    # Used for holidays, temporary restrictions, etc.
    description: Optional[str] = None

    # Comment (e.g. "Open around the clock on holidays")
    comment: Optional[str] = None

    # Start date of the schedule changes. Format: "YYYY-MM-DD"
    date_from: Optional[str] = None

    # End date of the schedule changes. Format: "YYYY-MM-DD"
    date_to: Optional[str] = None

    def to_str(self, join_char: str, add_comment: bool = False) -> str:
        """Render the schedule as a single string.

        Each day that has a value becomes ``<short day name>: <from>-<to>[, ...]``;
        days without a value are skipped.

        Args:
            join_char: Separator placed between the per-day parts.
            add_comment: Whether to append `comment` in parentheses at the end.

        Returns:
            Schedule as a string.
        """
        short_names = {'Mon': 'Пн', 'Tue': 'Вт', 'Wed': 'Ср', 'Thu': 'Чт',
                       'Fri': 'Пт', 'Sat': 'Сб', 'Sun': 'Вс'}

        # Day fields are discovered from the model itself, in declaration order
        day_fields = [field.name for field in self.__fields__.values()
                      if field.type_ == ScheduleDay]

        parts = []
        for field_name in day_fields:
            day = getattr(self, field_name)
            if not day:
                continue

            hours = ', '.join(f'{slot.from_}-{slot.to}' for slot in day.working_hours)
            parts.append(f'{short_names[field_name]}: {hours}')

        text = join_char.join(parts)
        if add_comment and self.comment:
            text += ' (%s)' % self.comment

        return text
build_standalone 55 | 56 | - name: Archive Windows distributive 57 | if: matrix.dist == 'windows' 58 | run: pushd dist && 7z a ../Parser2GIS-${{steps.get_version.outputs.version}}-${{matrix.dist}}.zip Parser2GIS.exe && popd 59 | shell: bash 60 | 61 | - name: Archive Linux/MacOS distributive 62 | if: matrix.dist != 'windows' 63 | run: tar -C dist/ -czvf Parser2GIS-${{steps.get_version.outputs.version}}-${{matrix.dist}}.tar.gz Parser2GIS 64 | shell: bash 65 | 66 | - name: Upload distributive 67 | uses: actions/upload-artifact@v3 68 | with: 69 | name: dist-${{ matrix.dist }} 70 | path: Parser2GIS-* 71 | retention-days: 5 72 | 73 | release: 74 | name: Release 75 | 76 | needs: [build] 77 | 78 | runs-on: ubuntu-latest 79 | 80 | steps: 81 | - name: Get the version 82 | id: get_version 83 | run: | 84 | echo ::set-output name=version::${GITHUB_REF#refs/tags/v} 85 | shell: bash 86 | 87 | - uses: actions/checkout@v3 88 | 89 | - name: Get Changelog Entry 90 | id: changelog_reader 91 | uses: mindsers/changelog-reader-action@v2 92 | with: 93 | version: ${{ steps.get_version.outputs.version }} 94 | path: ./CHANGELOG.md 95 | 96 | - uses: actions/download-artifact@v3 97 | 98 | - uses: ncipollo/release-action@v1.10.0 99 | with: 100 | tag: v${{ steps.changelog_reader.outputs.version }} 101 | name: Parser2GIS ${{ steps.changelog_reader.outputs.version }} 102 | body: ${{ steps.changelog_reader.outputs.changes }} 103 | prerelease: ${{ steps.changelog_reader.outputs.status == 'prereleased' }} 104 | draft: true 105 | artifacts: "dist-*/*" 106 | allowUpdates: true 107 | token: ${{ secrets.GITHUB_TOKEN }} 108 | -------------------------------------------------------------------------------- /parser_2gis/runner/gui.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import threading 4 | from typing import TYPE_CHECKING 5 | 6 | from ..exceptions import ChromeRuntimeException, ChromeUserAbortException 7 | from 
..logger import logger 8 | from ..parser import get_parser 9 | from ..writer import get_writer 10 | from .runner import AbstractRunner 11 | 12 | if TYPE_CHECKING: 13 | from ..config import Configuration 14 | 15 | 16 | class GUIRunner(AbstractRunner, threading.Thread): 17 | """GUI thread runner. 18 | 19 | Args: 20 | urls: 2GIS URLs with items to be collected. 21 | output_path: Path to the result file. 22 | format: `csv`, `xlsx` or `json` format. 23 | config: Configuration. 24 | """ 25 | def __init__(self, urls: list[str], output_path: str, format: str, 26 | config: Configuration) -> None: 27 | AbstractRunner.__init__(self, urls, output_path, format, config) 28 | threading.Thread.__init__(self) 29 | 30 | self._parser = None 31 | self._lock = threading.Lock() 32 | 33 | def start(self) -> None: 34 | """Start thread.""" 35 | self._cancelled = False 36 | logger.info('Парсинг запущен.') 37 | threading.Thread.start(self) 38 | 39 | def stop(self) -> None: 40 | """Stop thread.""" 41 | if not self._started.is_set(): # type: ignore 42 | raise RuntimeError('start() is not called') 43 | 44 | if self._cancelled: 45 | return # We can stop the thread only once 46 | 47 | self._cancelled = True 48 | self._stop_parser() 49 | 50 | def _stop_parser(self) -> None: 51 | """Close parser if it's been opened.""" 52 | with self._lock: 53 | if self._parser: 54 | self._parser.close() 55 | self._parser = None 56 | 57 | def run(self) -> None: 58 | """Thread's activity.""" 59 | with get_writer(self._output_path, self._format, self._config.writer) as writer: 60 | for url in self._urls: 61 | try: 62 | logger.info(f'Парсинг ссылки {url}') 63 | self._parser = get_parser(url, 64 | chrome_options=self._config.chrome, 65 | parser_options=self._config.parser) 66 | assert self._parser 67 | 68 | if not self._cancelled: 69 | self._parser.parse(writer) 70 | except Exception as e: 71 | if not self._cancelled: # Don't catch intended exceptions caused by stopping parser 72 | if isinstance(e, 
class ChromeBrowser():
    """Chrome Browser with temporary profile.

    A fresh profile directory is created for every instance and removed
    again by `close()`.

    Args:
        chrome_options: Chrome options.

    Raises:
        ChromePathNotFound: Chrome binary is neither configured nor located.
    """

    # Seconds to wait for Chrome to exit after terminate() before killing it
    _TERMINATE_TIMEOUT = 10

    def __init__(self, chrome_options: ChromeOptions) -> None:
        binary_path = (chrome_options.binary_path
                       if chrome_options.binary_path else locate_chrome_path())

        if not binary_path:
            raise ChromePathNotFound

        logger.debug('Запуск Chrome Браузера.')

        self._profile_path = tempfile.mkdtemp()
        try:
            self._remote_port = free_port()
            self._chrome_cmd = [
                binary_path,
                f'--remote-debugging-port={self._remote_port}',
                f'--user-data-dir={self._profile_path}', '--no-default-browser-check',
                '--no-first-run', '--no-sandbox', '--disable-fre',
                '--remote-allow-origins=*',
                f'--js-flags=--expose-gc --max-old-space-size={chrome_options.memory_limit}',
            ]

            if chrome_options.start_maximized:
                self._chrome_cmd.append('--start-maximized')

            if chrome_options.headless:
                logger.debug('В Chrome установлен в скрытый режим.')
                self._chrome_cmd.append('--headless')
                self._chrome_cmd.append('--disable-gpu')

            if chrome_options.disable_images:
                logger.debug('В Chrome отключены изображения.')
                self._chrome_cmd.append('--blink-settings=imagesEnabled=false')

            # `None` keeps the child's output attached to ours (Popen default)
            output = None
            if chrome_options.silent_browser:
                logger.debug('В Chrome отключен вывод отладочной информации.')
                output = subprocess.DEVNULL

            self._proc = subprocess.Popen(self._chrome_cmd, shell=False,
                                          stderr=output, stdout=output)
        except BaseException:
            # Nobody will call close() on a half-built instance,
            # so remove the temporary profile here and re-raise.
            shutil.rmtree(self._profile_path, ignore_errors=True)
            raise

    @property
    def remote_port(self) -> int:
        """Remote debugging port."""
        return self._remote_port

    @wait_until_finished(timeout=5, throw_exception=False)
    def _delete_profile(self) -> bool:
        """Delete profile.

        Returns:
            `True` on successful deletion, `False` on failure.
        """
        shutil.rmtree(self._profile_path, ignore_errors=True)
        profile_deleted = not os.path.isdir(self._profile_path)
        return profile_deleted

    def close(self) -> None:
        """Close browser and delete temporary profile."""
        logger.debug('Завершение работы Chrome Браузера.')

        # Close the browser; fall back to kill() so that a process
        # ignoring the terminate request cannot block the caller forever
        self._proc.terminate()
        try:
            self._proc.wait(timeout=self._TERMINATE_TIMEOUT)
        except subprocess.TimeoutExpired:
            self._proc.kill()
            self._proc.wait()

        # Delete temporary profile
        self._delete_profile()

    def __repr__(self) -> str:
        classname = self.__class__.__name__
        return f'{classname}(arguments={self._chrome_cmd!r})'
class AdmDivItem(BaseModel):
    """Administrative unit entry (city, district, region, ...)."""
    # Identifier of the administrative unit object
    id: Optional[str] = None

    # Object name
    name: str

    # Territory name (used by the "share" feature,
    # for route endpoints, etc.).
    caption: Optional[str] = None

    # Type of the administrative unit object.
    # Possible values:
    # * `city` — city
    # * `settlement` — settlement
    # * `division` — okrug (division)
    # * `district` — district
    # * `living_area` — residential area, microdistrict
    # * `place` — place
    # * `district_area` — district of a region
    # * `region` — region (oblast/krai/republic, etc.)
    # * `country` - country
    type: str

    # Alias of the city the object is located in
    city_alias: Optional[str] = None

    # Additional flags
    flags: Optional[Flags] = None

    # Filled only for type=city and takes the single value
    # true if the city is the main city of the current project (e.g. "Novosibirsk")
    is_default: Optional[bool] = None

    # Detailed type of the administrative-territorial unit.
    # Possible values:
    # * `city` — city
    # * `microdistrict` — microdistrict
    # * `residential_district` — residential area
    # * `residential_quarter` — quarter
    # * `poselok` — settlement (posyolok)
    # * `residential_complex` — residential complex
    # * `selo` — village (selo)
    # * `derevnja` — village (derevnya)
    # * `cottage_estate` — cottage settlement
    # * `urban_settlement` — urban-type settlement
    # * `workers_settlement` — workers' settlement
    # * `dacha_settlement` — dacha settlement
    # * `resort_settlement` — resort settlement
    # * `stanitsa` — stanitsa
    # * `sloboda` — sloboda
    # * `khutor` — khutor
    # * `aul` — aul
    # * `aal` — aal
    # * `town` — (military) town
    # * `farmstead` — farmstead (zaimka)
    # * `vyselok` — vyselok
    # * `municipality` — municipality
    # * `station` — station
    # * `townhouse_settlement` — townhouse settlement
    # * `territory` — territory
    # * `cooperative` — cooperative
    # * `partnership` — partnership
    detailed_subtype: Optional[str] = None
LineNumberedText(column_element.TKColFrame) 23 | urls_widget.pack(side='top', fill='both', expand=True) 24 | urls_widget.text.configure(background=sg.theme_input_background_color(), 25 | font=('TkDefaultFont', 12), 26 | highlightthickness=0) 27 | 28 | setup_text_widget(urls_widget.text, toplevel_form.TKroot) 29 | return urls_widget 30 | 31 | 32 | @ensure_gui_enabled 33 | def gui_urls_editor(urls: list[str]) -> list[str] | None: 34 | """Run URLs editor. 35 | 36 | Args: 37 | urls: Currently set urls. 38 | 39 | Returns: 40 | List of URLs or `None` on cancel. 41 | """ 42 | # Window layout 43 | layout = [ 44 | [ 45 | sg.Text('Ссылки'), 46 | ], 47 | [ 48 | sg.Column([[]], key='-COL_URLS-', size=(0, 0,), expand_x=True, expand_y=True), 49 | ], 50 | [ 51 | sg.Button('OK', size=(6, 1), pad=((5, 7), (7, 7)), key='-BTN_OK-'), 52 | sg.Button('Сгенерировать', size=(15, 1), pad=((7, 7), (7, 7)), key='-BTN_BUILD-'), 53 | sg.Column([ 54 | [ 55 | sg.Button('Отмена', size=(8, 1), pad=(0, (7, 7)), key='-BTN_CANCEL-'), 56 | ], 57 | ], expand_x=True, element_justification='right'), 58 | ], 59 | ] 60 | 61 | with invoke_widget_hook(sg.PySimpleGUI, '-COL_URLS-', create_text_widget) as get_widget: 62 | window = sg.Window('URLs', layout=layout, finalize=True, auto_size_text=True, 63 | font='Any 12', modal=True, keep_on_top=True) 64 | 65 | # Get `LineNumberedText` widget 66 | urls_widget = get_widget() 67 | assert urls_widget 68 | 69 | # Insert existing links 70 | urls_widget.text.insert('insert', '\n'.join(urls)) 71 | 72 | # Focus on custom widget 73 | urls_widget.text.focus_set() 74 | 75 | # Result urls 76 | ret_urls = None 77 | 78 | # Main loop 79 | while True: 80 | event, _ = window.read() 81 | if event in (None, '-BTN_CANCEL-'): 82 | break 83 | 84 | elif event == '-BTN_BUILD-': 85 | urls = gui_urls_generator() 86 | if urls: 87 | urls_content = urls_widget.text.get('1.0', 'end')[:-1] 88 | join_character = '\n' if urls_content and urls_content[-1:] != '\n' else '' 89 | 
urls_widget.text.insert('end', join_character + '\n'.join(urls)) 90 | 91 | elif event == '-BTN_OK-': 92 | urls_content = urls_widget.text.get('1.0', 'end')[:-1] 93 | ret_urls = [x for x in urls_content.splitlines() if x.strip()] 94 | break 95 | 96 | window.close() 97 | del window 98 | 99 | return ret_urls 100 | -------------------------------------------------------------------------------- /parser_2gis/gui/widgets/sg/rubrics_tree.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any 4 | 5 | import PySimpleGUI as sg 6 | 7 | 8 | class RubricsTree(sg.Tree): # type: ignore 9 | """Rubrics tree. 10 | 11 | Args: 12 | rubrics: Rubrics dictionary. 13 | image_parent: Image for parent rubric. 14 | image_item: Image for rubric. 15 | """ 16 | def __init__(self, rubrics: dict[str, Any], image_parent: bytes | None = None, 17 | image_item: bytes | None = None, *args, **kwargs) -> None: 18 | self._rubrics = rubrics 19 | self._image_parent = image_parent 20 | self._image_item = image_item 21 | self.ShowExpanded = False 22 | super().__init__(*args, **kwargs, data=self._build_tree()) 23 | 24 | def _build_tree(self, root_code: str = '0', 25 | tree: sg.TreeData | None = None) -> sg.TreeData: 26 | """Get tree data out of `_rubrics`. 27 | 28 | Args: 29 | root_code: Root key (always '0'). 30 | tree: Tree data (always None). 31 | 32 | Returns: 33 | Generated tree data. 
34 | """ 35 | node = self._rubrics[root_code] 36 | parent_code = node['parentCode'] 37 | is_leaf = not bool(node['children']) 38 | 39 | visible = node.get('visible', True) 40 | if not visible: 41 | return 42 | 43 | if root_code == '0': 44 | tree = sg.TreeData() 45 | 46 | if root_code != '0': 47 | # Change root to sg's '', instead of default '0' 48 | assert tree 49 | tree.Insert('' if parent_code == '0' else parent_code, 50 | root_code, node['label'], values=[], 51 | icon=self._image_item if is_leaf else self._image_parent) 52 | 53 | for child_code in node['children']: 54 | self._build_tree(child_code, tree) 55 | 56 | return tree 57 | 58 | def expand(self, expand: bool = True) -> None: 59 | """Expand tree. 60 | 61 | Args: 62 | expand: Whether to expand or collapse the tree. 63 | """ 64 | def recursive_expand(parent: str = '') -> None: 65 | self.widget.item(parent, open=expand) 66 | for child in self.widget.get_children(parent): 67 | recursive_expand(child) 68 | 69 | recursive_expand() 70 | 71 | def clear(self) -> None: 72 | """Clear tree.""" 73 | self.widget.delete(*self.widget.get_children()) 74 | 75 | def filter(self, query: str) -> None: 76 | """Filter tree by user search query. 77 | 78 | Args: 79 | query: User search query. 80 | """ 81 | def mark_visible_nodes(root_code: str = '0') -> bool: 82 | """Tree traversal with marking nodes 83 | matches specified user query. 84 | 85 | Args: 86 | root_code: Root key (always '0'). 87 | 88 | Returns: 89 | Root node visibility. 
@functools.lru_cache()
def locate_chrome_path() -> str | None:
    """Locate Chrome's executable path.

    Well-known installation directories are probed first; after that the
    Windows registry (Windows) or the `PATH` (other non-Mac systems) is
    consulted. The result is cached for the lifetime of the process.

    Returns:
        Path to the Chrome executable or `None` if it could not be found.
    """
    if running_windows():
        app_dirs = []

        # Win paths from WinAPI
        import ctypes

        csidl = dict(
            CSIDL_PROGRAM_FILES=38,  # Program Files
            CSIDL_PROGRAM_FILESX86=42,  # Program Files (x86)
            CSIDL_LOCAL_APPDATA=28,  # Per-user local application data
            CSIDL_COMMON_APPDATA=35,  # Application data shared by all users
            CSIDL_APPDATA=26,  # Per-user application data
        )

        for folder_id in csidl.values():
            buf = ctypes.create_unicode_buffer(1024)
            ctypes.windll.shell32.SHGetFolderPathW(None, folder_id, None, 0, buf)  # type: ignore
            app_dirs.append(buf.value)

        env_dirs = [
            'PROGRAMFILES',
            'PROGRAMFILES(X86)',
            'PROGRAMW6432',
            'LOCALAPPDATA',
        ]

        # Win paths from the environment
        for d in env_dirs:
            if d in os.environ and os.environ[d] not in app_dirs:
                app_dirs.append(os.environ[d])

        # Chrome's possible installation locations
        for path in app_dirs:
            binary_path = os.path.join(path, 'Google', 'Chrome', 'Application', 'chrome.exe')
            if os.path.isfile(binary_path):
                return binary_path

        # We also could try to use Windows registry to find out Chrome's path
        import winreg

        reg_path = r'SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe'
        for install_type in winreg.HKEY_CURRENT_USER, winreg.HKEY_LOCAL_MACHINE:  # type: ignore
            try:
                with winreg.OpenKey(install_type, reg_path, 0, winreg.KEY_READ) as reg_key:  # type: ignore
                    binary_path = winreg.QueryValue(reg_key, None)  # type: ignore
                    if os.path.isfile(binary_path):
                        return binary_path
            except OSError:  # `WindowsError` is an alias of `OSError`
                continue

    elif running_mac():
        candidates = (
            '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
            os.path.expanduser('~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'),
        )
        for binary_path in candidates:
            if os.path.isfile(binary_path):
                return binary_path

    else:
        app_dirs = ['/usr/bin', '/usr/sbin', '/usr/local/bin', '/usr/local/sbin', '/sbin', '/opt/google/chrome']
        browser_executables = ['google-chrome', 'chrome', 'chrome-browser', 'google-chrome-stable']
        for d in app_dirs:
            for f in browser_executables:
                binary_path = os.path.join(d, f)
                if os.path.isfile(binary_path):
                    return binary_path

        # Fall back to a PATH lookup. `shutil.which` needs no external
        # `which` binary, spawns no process and prints nothing to stderr.
        import shutil

        for f in browser_executables:
            found_path = shutil.which(f)
            if found_path and os.path.isfile(found_path):
                return found_path

    return None


def free_port() -> int:
    """Get free port using sockets.

    The socket is bound to port 0 on 127.0.0.1 so that the OS picks an
    unused port, and it is closed before returning.

    Returns:
        Free port.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as free_socket:
        free_socket.bind(('127.0.0.1', 0))
        free_socket.listen(5)
        return free_socket.getsockname()[1]

2 | 3 | Logo 4 | 5 |

6 |

Parser2GIS

7 | 8 |

9 | Tests 10 | PyPi version 11 | Supported Python versions 12 | Downloads 13 |

14 | 15 | **Parser2GIS** - парсер сайта [2GIS](https://2gis.ru/) с помощью браузера [Google Chrome](https://google.com/chrome). 16 | 17 | Screenshot 18 | 19 | ## ℹ️ Описание 20 | 21 | Парсер для автоматического сбора базы адресов и контактов предприятий, которые работают на территории 22 | России , Казахстана , Беларуси , 23 | Азербайджана , Киргизии , Узбекистана , Чехии , Египта , Италии , Саудовской Аравии , Кипра , Объединенных Арабских Эмиратов , Чили , Катара , Омана , Бахрейна , Кувейта . 24 | 25 | ## ✨ Особенности 26 | - 💰 Абсолютно бесплатный 27 | - 🤖 Успешно обходит анти-бот блокировки на территории РФ 28 | - 🖥️ Работает под Windows, Linux и MacOS 29 | - 📄 Три выходных формата: CSV таблица, XLSX таблица и JSON список 30 | - 🔗 Наличие генератора ссылок по городам и рубрикам 31 | 32 | ## 🚀 Установка 33 | > Для работы парсера необходимо установить браузер [Google Chrome](https://google.com/chrome). 34 | 35 | ### Установка одним файлом 36 | 37 | Скачать [релиз](https://github.com/interlark/parser-2gis/releases/latest). 38 | 39 | ### Установка из PyPI 40 | ```bash 41 | # CLI 42 | pip install parser-2gis 43 | # CLI + GUI 44 | pip install parser-2gis[gui] 45 | ``` 46 | 47 | ## 📖 Документация 48 | Описание работы доступно на [вики](https://github.com/interlark/parser-2gis/wiki). 49 | 50 | ## 👍 Поддержать проект 51 | 52 | Yoomoney Donate 53 | 54 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # История изменений 2 | 3 | ## [Невошедшее] 4 | 5 | ## [1.2.1] - 14-03-2024 6 | ### Добавлено 7 | - Добавлена поддержка парсинга остановок. Fix [issue](https://github.com/interlark/parser-2gis/issues/52). 8 | - Генератор ссылок добавляет в URL сортировку по алфавиту для исключения повторений поисковой выдачи при навигации по страницам. 9 | - Обновлён список рубрик. 
10 | 11 | ## [1.2.0] - 08-02-2024 12 | ### Добавлено 13 | - Небольшой багфикс схемы ответов сервера. 14 | - Поддержка ссылок организаций `https://2gis.ru//firm/`. 15 | - Обновлён список рубрик и городов. 16 | 17 | ## [1.1.2] - 08-03-2023 18 | ### Добавлено 19 | - Поддержка Chrome v111. 20 | - Новый город Басра (Ирак). 21 | - Обновлён список рубрик и городов. 22 | 23 | ## [1.1.1] - 03-02-2023 24 | ### Добавлено 25 | - Обновлён список рубрик и городов. 26 | - Добавлены поля контактов "Telegram", "Viber" и "WhatsApp". 27 | 28 | ## [1.1.0] - 05-01-2023 29 | ### Добавлено 30 | - Обновлён список рубрик и городов. 31 | - Добавлены поля "Рейтинг" и "Количество отзывов". 32 | - Добавлена возможность записи результата в Excel таблицу. 33 | - Добавлена автоматическая навигация к странице, если в URL есть параметр страницы `/page/<номер_страницы>`. 34 | 35 | ## [0.1.10] - 25-10-2022 36 | ### Добавлено 37 | - Обновлён список рубрик и городов. 38 | 39 | ### Исправлено 40 | - Отключен скрытый режим парсинга по умолчанию. 41 | 42 | ## [0.1.9] - 18-08-2022 43 | ### Добавлено 44 | - Новые рубрики: *Клубы настольного тенниса, Атрибутика для болельщиков, Полицейские станции*. 45 | - Поддержка парсинга ссылок "В здании". Fix [issue](https://github.com/interlark/parser-2gis/issues/13), см. [wiki](https://github.com/interlark/parser-2gis/wiki/URLs). 46 | 47 | ## [0.1.8] - 10-08-2022 48 | ### Добавлено 49 | - Совместимость с Windows 7, Windows 8. 50 | 51 | ## [0.1.7] - 19-07-2022 52 | ### Исправлено 53 | - Возможная [ошибка](https://github.com/interlark/parser-2gis/issues/9) во время получения нового ключа авторизации. 54 | - [Баг](https://github.com/interlark/parser-2gis/issues/7), связанный с остановкой парсера и отсутствием перехода к следующей ссылке при возникновении ошибки. 55 | 56 | ### Добавлено 57 | - Новые рубрики: *Прокат компьютеров / ноутбуков, Буккроссинг, Пляжные принадлежности, Администрация города / посёлка / села*.
58 | 59 | ## [0.1.6] - 03-07-2022 60 | ### Исправлено 61 | - Исправлен релиз под Linux. 62 | - Пропуск [некорректных ответов](https://github.com/interlark/parser-2gis/issues/4#issuecomment-1172172691) сервера (JSON expected). 63 | 64 | ### Добавлено 65 | - Новые страны: *Кувейт*. 66 | - Новые рубрики: *Купальники, Мебель для салонов красоты, Дневные детские лагеря*. 67 | 68 | ## [0.1.5] - 25-05-2022 69 | ### Исправлено 70 | - Исправлен баг с редкой ошибкой чтения ответа сервера при парсинге CSV. 71 | 72 | ### Добавлено 73 | - Колонка "Часовой пояс" в CSV. 74 | 75 | ## [0.1.4] - 24-05-2022 76 | ### Исправлено 77 | - Исправлен баг с неполным удалением временного профиля браузера. 78 | 79 | ## [0.1.3] - 23-05-2022 80 | ### Исправлено 81 | - CSV: Исправлено название колонки `Веб сайт` -> `Веб-сайт`. 82 | - Usage: Убрана ошибочно влезшая версия конфигурации. 83 | 84 | ## [0.1.2] - 22-05-2022 85 | ### Добавлено 86 | - Предупреждение при неудачной попытке загрузки GUI. 87 | 88 | ## [0.1.1] - 22-05-2022 89 | ### Исправлено 90 | - Ссылка на репозиторий внутри модуля и в манифесте. 91 | 92 | ## [0.1.0] - 22-05-2022 93 | ### Добавлено 94 | - Первый релиз. 
95 | 96 | 97 | [Невошедшее]: https://github.com/interlark/parser-2gis/compare/v1.2.1...HEAD 98 | [1.2.1]: https://github.com/interlark/parser-2gis/compare/v1.2.0...v1.2.1 99 | [1.2.0]: https://github.com/interlark/parser-2gis/compare/v1.1.2...v1.2.0 100 | [1.1.2]: https://github.com/interlark/parser-2gis/compare/v1.1.1...v1.1.2 101 | [1.1.1]: https://github.com/interlark/parser-2gis/compare/v1.1.0...v1.1.1 102 | [1.1.0]: https://github.com/interlark/parser-2gis/compare/v0.1.10...v1.1.0 103 | [0.1.10]: https://github.com/interlark/parser-2gis/compare/v0.1.9...v0.1.10 104 | [0.1.9]: https://github.com/interlark/parser-2gis/compare/v0.1.8...v0.1.9 105 | [0.1.8]: https://github.com/interlark/parser-2gis/compare/v0.1.7...v0.1.8 106 | [0.1.7]: https://github.com/interlark/parser-2gis/compare/v0.1.6...v0.1.7 107 | [0.1.6]: https://github.com/interlark/parser-2gis/compare/v0.1.5...v0.1.6 108 | [0.1.5]: https://github.com/interlark/parser-2gis/compare/v0.1.4...v0.1.5 109 | [0.1.4]: https://github.com/interlark/parser-2gis/compare/v0.1.3...v0.1.4 110 | [0.1.3]: https://github.com/interlark/parser-2gis/compare/v0.1.2...v0.1.3 111 | [0.1.2]: https://github.com/interlark/parser-2gis/compare/v0.1.1...v0.1.2 112 | [0.1.1]: https://github.com/interlark/parser-2gis/compare/v0.1.0...v0.1.1 113 | [0.1.0]: https://github.com/interlark/parser-2gis/releases/tag/v0.1.0 114 | -------------------------------------------------------------------------------- /parser_2gis/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | from json import JSONDecodeError 5 | from typing import Optional 6 | 7 | from pydantic import BaseModel, ValidationError 8 | 9 | from .chrome import ChromeOptions 10 | from .common import report_from_validation_error 11 | from .logger import LogOptions, logger 12 | from .parser import ParserOptions 13 | from .paths import user_path 14 | from .version import 
class Configuration(BaseModel):
    """Root configuration model grouping every option section of the application.

    The model can be loaded from and saved to a JSON file (see `load_config`
    and `save_config`) and merged with another configuration (`merge_with`).
    """
    log: LogOptions = LogOptions()  # Logger options section.
    writer: WriterOptions = WriterOptions()  # Writer options section.
    chrome: ChromeOptions = ChromeOptions()  # Chrome options section.
    parser: ParserOptions = ParserOptions()  # Parser options section.
    # File the configuration is bound to; never written into the saved JSON.
    path: Optional[pathlib.Path] = None
    version: str = config_version  # Configuration version marker.

    def __init__(self, *args, **kwargs) -> None:
        """Create the model and enable assignment validation on it and its sub-models."""
        def setup_config(model: BaseModel) -> None:
            """Recursively setup config."""
            # Configure the model currently being visited. Touching
            # `self.Config` here (as before) made the recursion pointless:
            # nested option models were walked but never configured.
            model.Config.validate_assignment = True
            for field in model.__fields__:
                attr = getattr(model, field)
                if isinstance(attr, BaseModel):
                    setup_config(attr)

        super().__init__(*args, **kwargs)
        setup_config(self)

    def merge_with(self, other_config: Configuration) -> None:
        """Merge configuration with another one.

        Only fields that were explicitly set on `other_config`
        (its `__fields_set__`) are copied; everything else is left untouched.

        Args:
            other_config: Configuration whose explicitly set values win.
        """
        def assign_attributes(model_source: BaseModel,
                              model_target: BaseModel) -> None:
            """Recursively assign new attributes to existing config."""
            for field in model_source.__fields_set__:
                source_attr = getattr(model_source, field)
                if not isinstance(source_attr, BaseModel):
                    # Plain value: overwrite the target field.
                    setattr(model_target, field, source_attr)
                else:
                    # Nested model: descend instead of replacing it wholesale,
                    # so unset fields of the target sub-model survive.
                    target_attr = getattr(model_target, field)
                    assert isinstance(target_attr, BaseModel)
                    assign_attributes(source_attr, target_attr)

        assign_attributes(other_config, self)

    def save_config(self) -> None:
        """Save config if it's been loaded from a path previously.

        Does nothing when `path` is not set. Parent directories are created
        on demand and the `path` field itself is excluded from the output.
        """
        if self.path:
            self.path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.path, 'w', encoding='utf-8') as f:
                f.write(self.json(exclude={'path'}, ensure_ascii=False, indent=4))

    @classmethod
    def load_config(cls, config_path: pathlib.Path | None = None,
                    auto_create: bool = True) -> Configuration:
        """Load configuration from path. If path is not specified,
        configuration gets loaded from user's configuration path.
        If errors occurred during loading, method would fallback to
        default configuration.

        Note:
            User configuration path depending on running OS:
            * Unix: ~/.config/parser-2gis/parser-2gis.config
            * Mac: ~/Library/Application Support/parser-2gis/parser-2gis.config
            * Win: C:\\Users\\%USERPROFILE%\\AppData\\Local\\parser-2gis\\parser-2gis.config

        Args:
            config_path: Path to the config file. If not specified, user config gets loaded.
            auto_create: Create config if it does not exist.

        Returns:
            Configuration.
        """
        if not config_path:
            config_path = user_path() / 'parser-2gis.config'

        try:
            if not config_path.is_file():
                if auto_create:
                    # Bind the fresh default config to the path and persist it.
                    config = cls(path=config_path)
                    config.save_config()
                    logger.debug('Создан файл конфигурации: %s', config_path)
                else:
                    # Defaults only; not bound to any file.
                    config = cls()
            else:
                config = cls.parse_file(config_path, content_type='json', encoding='utf-8')
                config.path = config_path
        except (JSONDecodeError, ValidationError) as e:
            # Broken or invalid file: report why and fall back to defaults.
            warning_msg = 'Не удалось загрузить конфигурацию: '
            if isinstance(e, ValidationError):
                errors = []
                errors_report = report_from_validation_error(e)
                for attr_path, error in errors_report.items():
                    error_msg = error['error_message']
                    errors.append(f'атрибут {attr_path} ({error_msg})')

                warning_msg += ', '.join(errors)
            else:
                warning_msg += str(e)

            logger.warning(warning_msg)
            config = cls()

        return config
ROOT_DIR = pathlib.Path(__file__).parent
VERSION_PATH = ROOT_DIR / PACKAGE_NAME / 'version.py'
README_PATH = ROOT_DIR / 'README.md'

long_description = README_PATH.read_text(encoding='utf-8')
long_description_content_type = 'text/markdown'

# Read `version = '...'` out of version.py without importing the package.
# The capture must be the *named* group `version`, because it is read back
# with `match.group('version')`; the former `(?P.+?)` spelling is not valid
# regex syntax and makes `re.search` raise `re.error`.
match = re.search(r'^version\s*=\s*[\'"](?P<version>.+?)[\'"]',
                  VERSION_PATH.read_text(encoding='utf-8'), re.M)
if match is None:
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    raise RuntimeError(f'Unable to find version string in {VERSION_PATH}')
version = match.group('version')


class BuildStandaloneCommand(distutils.cmd.Command):
    """A custom command to build standalone app."""
    description = 'Build standalone app with PyInstaller'
    user_options = []

    def initialize_options(self):
        """No options to initialize (hook required by the command interface)."""
        pass

    def finalize_options(self):
        """No options to finalize (hook required by the command interface)."""
        pass

    def run(self):
        """Run PyInstaller for `parser-2gis.py`, then remove build leftovers.

        Raises:
            subprocess.CalledProcessError: If PyInstaller exits with a non-zero code.
        """
        import os
        import shutil
        import subprocess
        import sys

        # Target filename. Bound before `try` so the cleanup in `finally`
        # can always reference it.
        dist_filename = 'Parser2GIS'

        try:
            # Dist
            build_cmd = [
                'pyinstaller',
                '--clean',
                '--onefile',
                '--windowed',
                '-n', dist_filename,
            ]

            # Icon
            if sys.platform.startswith('win'):
                build_cmd += [
                    '--icon', 'parser_2gis/data/images/icon.ico',
                ]
            elif sys.platform.startswith('darwin'):
                build_cmd += [
                    '--icon', 'parser_2gis/data/images/icon.icns',
                ]

            # Add data (source and destination are joined with `os.pathsep`)
            build_cmd += [
                '--add-data', f'parser_2gis/data{os.pathsep}parser_2gis/data',
                'parser-2gis.py',
            ]

            print('Running command: %s' % ' '.join(build_cmd), file=sys.stderr)
            subprocess.check_call(build_cmd)
        finally:
            # Cleanup: drop the temporary build directory and the generated spec file
            shutil.rmtree(ROOT_DIR / 'build', ignore_errors=True)
            try:
                os.remove(ROOT_DIR / f'{dist_filename}.spec')
            except FileNotFoundError:
                pass


if __name__ == '__main__':
    setup(
        name='parser-2gis',
        version=version,
        description='Парсер сайта 2GIS',
        long_description=long_description,
        long_description_content_type=long_description_content_type,
        author='Andy Trofimov',
        author_email='interlark@gmail.com',
        packages=[PACKAGE_NAME],
        include_package_data=True,
        python_requires='>=3.8',
        keywords='parser scraper 2gis',
        url='https://github.com/interlark/parser-2gis',
        project_urls={
            'Documentation': 'https://github.com/interlark/parser-2gis/wiki',
            'GitHub': 'https://github.com/interlark/parser-2gis',
            'Changelog': 'https://github.com/interlark/parser-2gis/blob/main/CHANGELOG.md',
        },
        install_requires=[
            'pychrome==0.2.4',
            'pydantic>=1.9.0,<2.0',
            'psutil>=5.4.8',
            'requests>=2.13.0',
            'xlsxwriter>=3.0.5',
        ],
        extras_require={
            'gui': [
                'PySimpleGUI==4.59.0',
            ],
            'dev': (
                (
                    ["pyinstaller>=5.0,<5.7.0"]
                    if sys.platform.startswith("win")
                    else ["pyinstaller>=6.6.0"]
                ) + [
                    "pytest>=6.2,<8",
                    "tox>=3.5,<4",
                    "pre-commit>=2.6",
                    "wheel>=0.36.2,<0.38",
                ]
            ),
        },
        classifiers=[
            "Topic :: Internet",
            "Topic :: Utilities",
            "Operating System :: OS Independent",
            "Programming Language :: Python :: 3 :: Only",
            "Programming Language :: Python :: 3.8",
            "Programming Language :: Python :: 3.9",
            "Programming Language :: Python :: 3.10",
            "Programming Language :: Python :: 3.11",
            "Natural Language :: Russian",
            "Intended Audience :: End Users/Desktop",
            "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)",
        ],
        license='LGPLv3+',
        entry_points={'console_scripts': ['parser-2gis = parser_2gis:main']},
        cmdclass={'build_standalone': BuildStandaloneCommand}
    )
class InBuildingParser(MainParser):
    """Parser for the list of organizations provided by 2GIS with the tab "In building".

    Handles URLs matched by `url_pattern`, i.e. of the shape
    `https://2gis.<domain>/<segment>/inside/<...>`.
    """

    @staticmethod
    def url_pattern():
        """URL pattern for the parser."""
        return r'https?://2gis\.[^/]+/[^/]+/inside/.*'

    @wait_until_finished(timeout=5, throw_exception=False)
    def _get_links(self) -> list[DOMNode]:
        """Extracts specific DOM node links from current DOM snapshot.

        A node qualifies when it is an `<a>` element whose `href` matches
        `/<segment>/firm/<id>`. The call is polled for up to 5 seconds until
        a non-empty list is found; on timeout the last result is returned.
        """
        def valid_link(node: DOMNode) -> bool:
            if node.local_name == 'a' and 'href' in node.attributes:
                link_match = re.match(r'/[^/]+/firm/[^/]+$', node.attributes['href'])
                return bool(link_match)

            return False

        dom_tree = self._chrome_remote.get_document()
        return dom_tree.search(valid_link)

    def parse(self, writer: FileWriter) -> None:
        """Parse URL with organizations.

        Navigates to the URL, then repeatedly collects not yet visited firm
        links from the lazily loaded list, clicks each of them and writes the
        JSON document of the provoked response to `writer`. Stops when no new
        links appear or `max_records` documents have been written.

        Args:
            writer: Target file writer.
        """
        # Go URL
        self._chrome_remote.navigate(self._url, referer='https://google.com', timeout=120)

        # Document loaded, get its response
        responses = self._chrome_remote.get_responses(timeout=5)
        if not responses:
            logger.error('Ошибка получения ответа сервера.')
            return
        document_response = responses[0]

        # Handle 404
        assert document_response['mimeType'] == 'text/html'
        if document_response['status'] == 404:
            # `warning` is the supported spelling; `warn` is a deprecated alias.
            logger.warning('Сервер вернул сообщение "Точных совпадений нет / Не найдено".')

            if self._options.skip_404_response:
                return

        # Parsed records
        collected_records = 0

        # Already visited links
        visited_links: set[str] = set()

        # Get new links (polled up to 5 seconds until something new shows up)
        @wait_until_finished(timeout=5, throw_exception=False)
        def get_unique_links() -> list[DOMNode]:
            links = self._get_links()
            link_addresses = {x.attributes['href'] for x in links} - visited_links
            visited_links.update(link_addresses)
            return [x for x in links if x.attributes['href'] in link_addresses]

        # Loop down through lazy load organizations list
        while True:
            # Wait all 2GIS requests get finished
            self._wait_requests_finished()

            # Gather links to be clicked
            links = get_unique_links()
            if not links:
                break

            # Iterate through gathered links
            for link in links:
                resp = None
                for _ in range(3):  # 3 attempts to get response
                    # Click the link to provoke request
                    # with a auth key and secret arguments
                    self._chrome_remote.perform_click(link)

                    # Delay between clicks, could be useful if
                    # 2GIS's anti-bot service become more strict.
                    # The option is divided by 1000 before being passed on.
                    if self._options.delay_between_clicks:
                        self._chrome_remote.wait(self._options.delay_between_clicks / 1000)

                    # Gather response and collect useful payload.
                    resp = self._chrome_remote.wait_response(self._item_response_pattern)

                    # If request is failed - repeat, otherwise go further.
                    if resp and resp['status'] >= 0:
                        break

                # Get response body data
                doc = None
                if resp and resp['status'] >= 0:
                    data = self._chrome_remote.get_response_body(resp, timeout=10)

                    try:
                        doc = json.loads(data)
                    except (json.JSONDecodeError, TypeError):
                        # TypeError covers a body that is not str/bytes (e.g. None),
                        # which would otherwise abort the whole parsing run.
                        logger.error('Сервер вернул некорректный JSON документ: "%s", пропуск позиции.', data)
                        doc = None

                if doc:
                    # Write API document into a file
                    writer.write(doc)
                    collected_records += 1
                else:
                    logger.error('Данные не получены, пропуск позиции.')

                # We've reached our limit, bail
                if collected_records >= self._options.max_records:
                    logger.info('Спарсено максимально разрешенное количество записей с данного URL.')
                    return
def running_linux() -> bool:
    """Determine if current OS is Linux-based."""
    return sys.platform.startswith('linux')


def running_windows() -> bool:
    """Determine if current OS is Windows."""
    return sys.platform.startswith('win')


def running_mac() -> bool:
    """Determine if current OS is MacOS."""
    return sys.platform.startswith('darwin')


def wait_until_finished(timeout: int | None = None,
                        finished: Callable[[Any], bool] = lambda x: bool(x),
                        throw_exception: bool = True,
                        poll_interval: float = 0.1) -> Callable[..., Callable[..., Any]]:
    """Decorator that polls wrapped function until time is out or `finished`
    predicate returns `True`.

    Every decorator argument can also be overridden per call by passing the
    keyword of the same name to the decorated function; these four keywords
    are consumed by the wrapper and not forwarded to the wrapped function.

    Args:
        timeout: Max time to wait in seconds, `None` to poll forever.
        finished: Predicate for succeeded result of decorated function.
        throw_exception: Whether to throw `TimeoutError` on timeout; when
            `False` the last obtained result is returned instead.
        poll_interval: Poll interval for result of decorated function, in seconds.

    Returns:
        Decorator producing the polling wrapper.
    """
    def outer(func: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(func)
        def inner(*args, timeout=timeout, finished=finished,
                  throw_exception=throw_exception,
                  poll_interval=poll_interval, **kwargs):
            # Monotonic clock: a wall-clock adjustment during polling must
            # neither cut the wait short nor prolong it.
            call_time = time.monotonic()
            while True:
                ret = func(*args, **kwargs)
                if finished(ret):
                    return ret

                if timeout is not None and time.monotonic() - call_time > timeout:
                    if throw_exception:
                        raise TimeoutError(func)
                    return ret

                time.sleep(poll_interval)
        return inner
    return outer


def report_from_validation_error(ex: ValidationError,
                                 d: dict | None = None) -> dict:
    """Generate validation error report
    for `BaseModel` out of `ValidationError`.

    Note:
        It's convenient to use when you try to instantiate
        model with predefined dictionary. For example:

        class TestModel(BaseModel):
            test_int: int

        try:
            d = {'test_int': '_12'}
            m = TestModel(**d)
        except ValidationError as ex:
            print(report_from_validation_error(ex, d))

    Args:
        ex: Thrown Pydantic ValidationError.
        d: Arguments dictionary. When omitted (or empty), the report
            carries no `invalid_value` entries.

    Returns:
        Dictionary with validation error information.
        {
            'full_path_of_invalid_attribute': {
                'invalid_value': ...,
                'error_message': ...,
            },
            ...
        }
    """
    values = {}
    for error in ex.errors():
        msg = error['msg']
        loc = error['loc']
        attribute_path = '.'.join(str(location) for location in loc)

        if d:
            # Walk `d` along the error location to find the offending value.
            value: Any = d
            for field in loc:
                if field == '__root__':
                    # Model-level error: report the value reached so far.
                    break
                if isinstance(value, dict) and field in value:
                    value = value[field]
                elif (isinstance(value, (list, tuple)) and isinstance(field, int)
                        and 0 <= field < len(value)):
                    # Locations address sequence items by index; a membership
                    # test (as used before) never resolved these.
                    value = value[field]
                else:
                    # Path cannot be followed (missing key, scalar reached).
                    value = ''
                    break

            values[attribute_path] = {
                'invalid_value': value,
                'error_message': msg,
            }
        else:
            values[attribute_path] = {
                'error_message': msg,
            }

    return values
def filtered_rubrics(rubrics: dict[str, Any],
                     is_russian: bool = True) -> dict[str, Any]:
    """Filter rubrics on russian/non russian nodes.

    A node is kept unless its `isRussian` (or `isNonRussian`, when
    `is_russian` is `False`) flag is present and falsy. References to
    removed nodes are dropped from every kept node's `children` list.

    The input is left untouched: kept nodes are returned as shallow copies
    with a fresh `children` list, so the same loaded dictionary can safely
    be filtered more than once.

    Args:
        rubrics: Loaded rubric dictionary.
        is_russian: Filter criteria.

    Returns:
        Filtered rubric dictionary.
    """
    # Filter nodes
    flag = 'isRussian' if is_russian else 'isNonRussian'
    kept = {code: node for code, node in rubrics.items() if node.get(flag, True)}

    # Fix references on copies instead of mutating the caller's nodes
    return {
        code: {**node, 'children': [child for child in node['children'] if child in kept]}
        for code, node in kept.items()
    }


def create_search_widget(column_element: sg.Element, containing_frame: tk.Frame,
                         toplevel_form: sg.Window) -> tk.Widget:
    """Callback for `custom_widget_hook` that creates and
    returns Search Widget.

    Args:
        column_element: Column element whose `TKColFrame` hosts the widget.
        containing_frame: Unused here; part of the callback signature.
        toplevel_form: Window whose `TKroot` is handed to `setup_text_widget`.

    Returns:
        The created entry widget.
    """
    search_widget = CustomEntry(column_element.TKColFrame, width=60)
    search_widget.pack(side='top', fill='both', expand=True)
    # Text-widget setup with the "clear" menu entry switched off.
    setup_text_widget(search_widget, toplevel_form.TKroot, menu_clear=False)

    # Visual settings: themed input background, no focus highlight border.
    search_widget.configure(background=sg.theme_input_background_color(),
                            font=('TkDefaultFont', 12),
                            highlightthickness=0)

    return search_widget
72 | """ 73 | # Locate and load rubrics list 74 | rubric_path = data_path() / 'rubrics.json' 75 | if not rubric_path.is_file(): 76 | raise FileNotFoundError(f'Файл {rubric_path} не найден') 77 | 78 | try: 79 | with open(rubric_path, 'r', encoding='utf-8') as f: 80 | rubrics = filtered_rubrics(json.load(f), is_russian=is_russian) 81 | except json.JSONDecodeError as e: 82 | gui_error_popup(f'Файл {rubric_path.name} повреждён:\n{e}') 83 | return None 84 | 85 | # Window layout 86 | layout = [ 87 | [ 88 | sg.Text('Поиск рубрики', size=(14, 1)), 89 | sg.Column([[]], pad=((0, 5), 0), key='-COL_SEARCH-', expand_x=True), 90 | ], 91 | [ 92 | RubricsTree(rubrics=rubrics, 93 | image_parent=image_data('rubric_folder'), 94 | image_item=image_data('rubric_item'), 95 | headings=[], auto_size_columns=True, 96 | select_mode=sg.TABLE_SELECT_MODE_BROWSE, 97 | num_rows=30, col0_width=80, 98 | key='-TREE-', 99 | enable_events=True, 100 | expand_x=True, expand_y=True), 101 | ], 102 | [ 103 | sg.StatusBar('', size=(0, 1), key='-STATUS-'), 104 | ], 105 | [ 106 | sg.Button('OK', size=(6, 1), pad=((6, 7), (7, 7)), key='-BTN_OK-'), 107 | sg.Button('Отмена', size=(8, 1), pad=((7, 7), (7, 7)), key='-BTN_CANCEL-'), 108 | sg.Column([ 109 | [ 110 | sg.Button('Развернуть всё', size=(16, 1), pad=((0, 7), (7, 7)), key='-BTN_EXPAND_ALL-'), 111 | sg.Button('Свернуть всё', size=(14, 1), pad=((7, 0), (7, 7)), key='-BTN_COLLAPSE_ALL-'), 112 | ], 113 | ], expand_x=True, element_justification='right'), 114 | ], 115 | ] 116 | 117 | with invoke_widget_hook(sg.PySimpleGUI, '-COL_SEARCH-', create_search_widget) as get_widget: 118 | window_title = 'Select rubric' if running_linux() else 'Выбор рубрики' 119 | window = sg.Window(window_title, layout=layout, finalize=True, auto_size_text=True, 120 | font='Any 12', modal=True, keep_on_top=True) 121 | 122 | # Get search widget 123 | search_widget = get_widget() 124 | assert search_widget 125 | 126 | # On Linux\MacOS created window could be behind its parent 127 | 
window.bring_to_front() 128 | 129 | # Focus on custom widget 130 | search_widget.focus_set() 131 | 132 | # Hide tree header 133 | window['-TREE-'].widget.configure(show='tree') 134 | 135 | # Perform rubrics search on text changed 136 | def perform_rubric_search() -> None: 137 | query = search_widget.get() 138 | window['-TREE-'].filter(query) # noqa: F821 139 | 140 | search_widget.bind('<>', generate_event_handler(perform_rubric_search)) 141 | 142 | # Return rubric 143 | ret_rubric = None 144 | 145 | # Main loop 146 | while True: 147 | event, values = window.read() 148 | 149 | if event in (None, '-BTN_CANCEL-'): 150 | ret_rubric = None 151 | break 152 | 153 | elif event == '-BTN_OK-': 154 | if not ret_rubric: 155 | gui_error_popup('Рубрика не выбрана!') 156 | continue 157 | break 158 | 159 | # Update status bar 160 | elif event == '-TREE-': 161 | tree_values = values['-TREE-'] 162 | if tree_values: 163 | node = rubrics[tree_values[0]] 164 | is_leaf = not bool(node['children']) 165 | if is_leaf: 166 | ret_rubric = rubrics[tree_values[0]] 167 | window['-STATUS-'].update(ret_rubric['label']) 168 | else: 169 | ret_rubric = None 170 | window['-STATUS-'].update('') 171 | 172 | elif event == '-BTN_EXPAND_ALL-': 173 | window['-TREE-'].expand() 174 | 175 | elif event == '-BTN_COLLAPSE_ALL-': 176 | window['-TREE-'].expand(expand=False) 177 | 178 | window.close() 179 | del window 180 | 181 | return ret_rubric 182 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 
7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
def generate_event_handler(func: Callable, with_break: bool = False) -> Callable:
    """Wrap a zero-argument callable into an event handler.

    The returned handler accepts the event object, ignores it and simply
    calls `func`.

    Args:
        func: Callable invoked without arguments each time the handler runs.
        with_break: Whether to stop event propagation; when true the handler
            returns ``'break'``, otherwise it returns ``None``.

    Returns:
        Event handler.
    """
    # The outcome depends only on `with_break`, so it is decided once here
    # instead of on every invocation.
    outcome = 'break' if with_break else None

    def handler(event: tk.Event) -> str | None:
        func()
        return outcome

    return handler
48 | menu_cut: Whether text of the `widget` could be cut with context menu. 49 | menu_clear: Whether text of the `widget` could be cleared with context menu. 50 | set_focus: Whether to set focus on the `widget`. 51 | """ 52 | # def get_text() -> str: 53 | # if isinstance(widget, tk.Entry): 54 | # return widget.get() 55 | # elif isinstance(widget, tk.Text): 56 | # return widget.get('1.0','end') 57 | # return '' 58 | 59 | def get_clipboard() -> str | None: 60 | try: 61 | return widget.clipboard_get() 62 | except tk.TclError: 63 | # Nothing in clipboard 64 | return None 65 | 66 | def get_selection() -> str | None: 67 | if isinstance(widget, tk.Entry): 68 | if widget.selection_present(): 69 | return widget.selection_get() 70 | else: 71 | return None 72 | elif isinstance(widget, tk.Text): 73 | try: 74 | return widget.get('sel.first', 'sel.last') 75 | except tk.TclError: 76 | # Nothing was selected 77 | return None 78 | 79 | def delete_selection() -> None: 80 | try: 81 | widget.delete('sel.first', 'sel.last') # Works for tk.Entry and tk.Text 82 | except tk.TclError: 83 | # Nothing was selected 84 | pass 85 | 86 | def copy_text() -> None: 87 | selection = get_selection() 88 | if selection: 89 | widget.clipboard_clear() 90 | widget.clipboard_append(selection) 91 | widget.update() 92 | 93 | def paste_text() -> None: 94 | delete_selection() 95 | 96 | clipboard = get_clipboard() 97 | if clipboard: 98 | widget.insert('insert', clipboard) 99 | 100 | def cut_text() -> None: 101 | copy_text() 102 | delete_selection() 103 | 104 | def clear_text() -> None: 105 | widget.delete('1.0', 'end') 106 | 107 | def select_all() -> None: 108 | if isinstance(widget, tk.Entry): 109 | widget.select_range(0, 'end') 110 | widget.icursor('end') 111 | elif isinstance(widget, tk.Text): 112 | widget.tag_add('sel', '1.0', 'end') 113 | 114 | def ctrl_key_press(event) -> None: 115 | """Generate CTRL + X, V, C, A events for non-english layouts.""" 116 | if event.keycode == 88 and event.keysym.lower() != 
'x': 117 | event.widget.event_generate('<>') 118 | elif event.keycode == 86 and event.keysym.lower() != 'v': 119 | event.widget.event_generate('<>') 120 | elif event.keycode == 67 and event.keysym.lower() != 'c': 121 | event.widget.event_generate('<>') 122 | elif event.keycode == 65 and event.keysym.lower() != 'a': 123 | event.widget.event_generate('<>') 124 | 125 | # Generate extra events for non-english layouts 126 | widget.bind('', ctrl_key_press) 127 | 128 | # Create menu 129 | menu = tk.Menu(root, tearoff=False) 130 | 131 | if menu_cut: 132 | menu.add_command(label='Вырезать', command=cut_text) 133 | if menu_copy: 134 | menu.add_command(label='Скопировать', command=copy_text) 135 | if menu_paste: 136 | menu.add_command(label='Вставить', command=paste_text) 137 | # Fix paste bahaviour 138 | widget.bind('<>', generate_event_handler(paste_text, with_break=True)) 139 | 140 | # Select all bahaviour 141 | widget.bind('', generate_event_handler(select_all, with_break=True)) 142 | menu.add_command(label='Выделить всё', command=select_all) 143 | 144 | if menu_clear: 145 | menu.add_command(label='Очистить', command=clear_text) 146 | 147 | # Show menu 148 | def show_menu_handler(event: tk.Event) -> None: 149 | """Config menu.""" 150 | is_readonly = widget.cget('state') == 'readonly' 151 | 152 | if menu_copy: 153 | copy_state = 'normal' if get_selection() else 'disabled' 154 | menu.entryconfig('Скопировать', state=copy_state) 155 | if menu_paste: 156 | paste_state = 'normal' if not is_readonly and get_clipboard() else 'disabled' 157 | menu.entryconfig('Вставить', state=paste_state) 158 | if menu_cut: 159 | cut_state = 'normal' if not is_readonly and get_selection() else 'disabled' 160 | menu.entryconfig('Вырезать', state=cut_state) 161 | if menu_clear: 162 | clear_state = 'normal' if not is_readonly else 'disabled' 163 | menu.entryconfig('Очистить', state=clear_state) 164 | 165 | menu.post(event.x_root, event.y_root) 166 | menu.focus_set() 167 | 168 | rclick_event_name = 
def url_query_encode(url: str) -> str:
    """Percent-encode text for a query, keeping Russian letters and spaces.

    Each character is encoded on its own with
    ``urllib.parse.quote(char, safe='')``. The exceptions are the letters
    ``а-я`` and ``ё`` (in either case) and the space character, which are
    copied to the output unchanged.

    Args:
        url: Text to be encoded.

    Returns:
        Encoded text.
    """
    encoded_characters = []
    for char in url:
        lowered = char.lower()

        # str.lower() may expand one character into several code points
        # (e.g. U+0130 'İ'); ord() would raise TypeError on such a string.
        # None of those characters belong to the keep-list, so they are
        # simply percent-encoded.
        keep = len(lowered) == 1 and (
            1072 <= ord(lowered) <= 1103      # [а-я]
            or ord(lowered) in (1105, 32)     # 'ё' and space
        )

        if keep:
            encoded_characters.append(char)
        else:
            encoded_characters.append(urllib.parse.quote(char, safe=''))

    return ''.join(encoded_characters)
23 | """ 24 | # Locate and load cities list 25 | cities_path = data_path() / 'cities.json' 26 | if not cities_path.is_file(): 27 | raise FileNotFoundError(f'Файл {cities_path} не найден') 28 | 29 | try: 30 | with open(cities_path, 'r', encoding='utf-8') as f: 31 | cities = json.load(f) 32 | except json.JSONDecodeError as e: 33 | gui_error_popup(f'Файл {cities_path.name} повреждён:\n{e}') 34 | return [] 35 | 36 | # Countries available 37 | default_city_code = 'ru' 38 | country_code_to_name = dict( 39 | ae='Объединенные Арабские Эмираты', iq='Ирак', 40 | az='Азербайджан', bh='Бахрейн', by='Беларусь', cl='Чили', cy='Кипр', cz='Чехия', 41 | eg='Египт', it='Италия', kg='Киргизия', kw='Кувейт', kz='Казахстан', om='Оман', 42 | qa='Катар', ru='Россия', sa='Саудовская Аравия', uz='Узбекистан') 43 | 44 | country_name_to_code = {v: k for k, v in country_code_to_name.items()} 45 | 46 | # Checkbox layouts 47 | checkbox_layouts = {} 48 | for country_code in country_code_to_name.keys(): 49 | layout = [] 50 | for city in cities: 51 | if city['country_code'] == country_code: 52 | layout.append([ 53 | sg.Checkbox( 54 | city['name'], metadata=city, 55 | checkbox_color=sg.theme_input_background_color()) 56 | ]) 57 | checkbox_layouts[country_code] = sg.Column( 58 | layout, scrollable=True, vertical_scroll_only=True, 59 | expand_x=True, expand_y=True, visible=False) 60 | 61 | # Obtain screen dimensions 62 | _, screen_height = sg.Window.get_screen_size() 63 | 64 | # Window layout 65 | layout = [ 66 | [ 67 | sg.Column([ 68 | [ 69 | sg.Text('Запрос', size=(7, 1)), 70 | sg.Input(key='-IN_QUERY-'), 71 | ], 72 | ]), 73 | ], 74 | [ 75 | sg.Column([ 76 | [ 77 | sg.Text('Страна', size=(7, 1)), 78 | sg.Combo(key='-COUNTRY-', default_value=country_code_to_name[default_city_code], 79 | values=sorted(country_code_to_name.values()), readonly=True, enable_events=True), 80 | ], 81 | ]), 82 | ], 83 | [ 84 | sg.Column([ 85 | [ 86 | sg.Text('Рубрика', size=(7, 1)), 87 | sg.Input(key='-IN_RUBRIC-', 
disabled=True, 88 | size=(35, 1), expand_x=True), 89 | sg.Column([ 90 | [ 91 | sg.Button('...', size=(4, 1), key='-BTN_RUBRIC-'), 92 | ], 93 | ], element_justification='right', pad=0), 94 | ], 95 | ], expand_x=True), 96 | ], 97 | [ 98 | sg.Frame('Города', [ 99 | list(checkbox_layouts.values()), 100 | ], size=(None, int(screen_height / 2)), expand_x=True, expand_y=True), 101 | ], 102 | [ 103 | sg.Button('OK', size=(6, 1), pad=((6, 0), (7, 7)), key='-BTN_OK-'), 104 | sg.Column([ 105 | [ 106 | sg.Button('Выделить всё', size=(14, 1), pad=((0, 7), (7, 7)), key='-BTN_SELECT_ALL-'), 107 | sg.Button('Снять выделение', size=(17, 1), pad=((7, 0), (7, 7)), key='-BTN_DESELECT_ALL-'), 108 | ], 109 | ], expand_x=True, element_justification='right'), 110 | ], 111 | ] 112 | 113 | window_title = 'Generate links' if running_linux() else 'Сгенерировать ссылки' 114 | window = sg.Window(window_title, layout=layout, auto_size_text=True, 115 | finalize=True, font='Any 12', modal=True, keep_on_top=True) 116 | 117 | setup_text_widget(window['-IN_QUERY-'].widget, window.TKroot, 118 | menu_clear=False, set_focus=True) 119 | 120 | setup_text_widget(window['-IN_RUBRIC-'].widget, window.TKroot, 121 | menu_clear=False, menu_paste=False, menu_cut=False) 122 | 123 | def update_checkbox_layouts(country_name: str) -> None: 124 | """Bring frame with checkboxes visible that 125 | belong to `country_name`. 126 | 127 | Args: 128 | country_name: Name of a country. 129 | """ 130 | for country_code, column_element in checkbox_layouts.items(): 131 | if country_code_to_name[country_code] == country_name: 132 | column_element.update(visible=True) 133 | else: 134 | column_element.update(visible=False) 135 | 136 | # Reset rubrics 137 | rubric_input = window['-IN_RUBRIC-'] # noqa: F821 138 | rubric_input.metadata = None 139 | rubric_input.update(value='Без рубрики') 140 | 141 | def select_checkboxes(country_name: str, state: bool = True) -> None: 142 | """Select all checkboxes that belong to `country_name`. 
143 | 144 | Args: 145 | country_name: Name of a country. 146 | state: Desired checkboxes' state. 147 | """ 148 | country_code = country_name_to_code[country_name] 149 | for element in sum(checkbox_layouts[country_code].Rows, []): 150 | element.update(state) 151 | 152 | def get_checkboxes(state: bool | None) -> list[sg.Checkbox]: 153 | """Return all checkboxes. 154 | 155 | Args: 156 | state: Checkbox state requirement. 157 | 158 | Returns: 159 | Checkboxes with specified `state`. 160 | """ 161 | all_checkboxes: list[sg.Checkbox] = sum(sum([x.Rows for x in checkbox_layouts.values()], []), []) 162 | if isinstance(state, bool): 163 | all_checkboxes = [x for x in all_checkboxes if x.get() == state] 164 | 165 | return all_checkboxes 166 | 167 | def get_selected_urls(query: str) -> list[str]: 168 | """Get all checked checkboxes among all frames and generate URLs. 169 | 170 | Args: 171 | query: User's query. 172 | 173 | Returns: 174 | List of urls. 175 | """ 176 | urls = [] 177 | rubric = window['-IN_RUBRIC-'].metadata # noqa: F821 178 | for checkbox in get_checkboxes(state=True): 179 | metadata = checkbox.metadata 180 | base_url = f'https://2gis.{metadata["domain"]}/{metadata["code"]}' 181 | rest_url = f'/search/{url_query_encode(query)}' 182 | if rubric: 183 | rest_url += f'/rubricId/{rubric["code"]}' 184 | 185 | rest_url += '/filters/sort=name' 186 | 187 | url = base_url + rest_url 188 | urls.append(url) 189 | 190 | return urls 191 | 192 | # Set default layout 193 | update_checkbox_layouts(country_code_to_name[default_city_code]) 194 | 195 | # Result urls 196 | ret_urls = [] 197 | 198 | # Main loop 199 | while True: 200 | event, values = window.read() 201 | 202 | if event in (None, ): 203 | break 204 | 205 | elif event == '-COUNTRY-': 206 | update_checkbox_layouts(values['-COUNTRY-']) 207 | 208 | elif event == '-BTN_SELECT_ALL-': 209 | select_checkboxes(values['-COUNTRY-'], True) 210 | 211 | elif event == '-BTN_DESELECT_ALL-': 212 | 
class ArgumentHelpFormatter(argparse.HelpFormatter):
    """Help message formatter which adds default values to argument help.

    Default values are taken from a default-constructed `Configuration`
    converted to a dictionary; an argument's `dest` is interpreted as a
    dot-separated path into that dictionary.
    """
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self._default_config = Configuration().dict()

    def _get_default_value(self, dest: str) -> Any:
        """Look up the default value stored under the dotted path `dest`.

        Args:
            dest: Argument destination, e.g. ``'chrome.headless'``.

        Returns:
            The default value, or `argparse.SUPPRESS` when `dest` is
            ``'version'`` or does not resolve to a value in the defaults.
        """
        if dest == 'version':
            return argparse.SUPPRESS

        value = self._default_config
        try:
            for field in dest.split('.'):
                value = value[field]
        except (KeyError, TypeError):
            # KeyError: no such key at this level.
            # TypeError: the path goes deeper than the configuration does
            # (the intermediate value is not subscriptable by a string).
            return argparse.SUPPRESS

        return value

    def _get_help_string(self, action: argparse.Action) -> str | None:
        """Return the action's help text with its default value appended.

        Args:
            action: Argument action being documented.

        Returns:
            Help text (unchanged when empty or when no default is known),
            with boolean defaults rendered as ``yes``/``no``.
        """
        help_string = action.help
        if not help_string:
            return help_string

        default_value = self._get_default_value(action.dest)
        if default_value != argparse.SUPPRESS:
            if isinstance(default_value, bool):
                default_value = 'yes' if default_value else 'no'

            # argparse applies %-formatting to the returned help string,
            # so a literal '%' inside the default must be doubled.
            rendered_default = f'{default_value}'.replace('%', '%%')
            help_string += f' (по умолчанию: {rendered_default})'

        return help_string
def parse_arguments() -> tuple[argparse.Namespace, Configuration]:
    """Parse the command line and build a configuration out of it.

    The three main arguments (URL, output path, format) are mandatory only
    when GUI support is not available. Every other option is registered
    without a default, so that only explicitly passed options reach
    `Configuration`.

    Returns:
        Tuple of Command line arguments and Configuration.
    """
    # Patch Russian translations
    patch_argparse_translations()

    parser = argparse.ArgumentParser(
        'Parser2GIS', description='Парсер данных сайта 2GIS', add_help=False,
        formatter_class=ArgumentHelpFormatter, argument_default=argparse.SUPPRESS)

    # Main arguments: optional with GUI, required without it.
    if GUI_ENABLED:
        main_title, main_required = 'Основные аргументы', False
    else:
        main_title, main_required = 'Обязательные аргументы', True

    main_group = parser.add_argument_group(main_title)
    main_group.add_argument('-i', '--url', nargs='+', default=None,
                            required=main_required, help='URL с выдачей')
    main_group.add_argument('-o', '--output-path', metavar='PATH', default=None,
                            required=main_required, help='Путь до результирующего файла')
    main_group.add_argument('-f', '--format', metavar='{csv,xlsx,json}',
                            choices=['csv', 'xlsx', 'json'], default=None,
                            required=main_required, help='Формат результирующего файла')

    # Remaining options share one shape: (flag, metavar, help).
    # Groups and options are registered in exactly this order.
    yes_no = '{yes,no}'
    option_groups = (
        ('Аргументы браузера', (
            ('--chrome.binary_path', 'PATH',
             'Путь до исполняемого файла браузера. Если не указан, то определяется автоматически'),
            ('--chrome.disable-images', yes_no, 'Отключить изображения в браузере'),
            ('--chrome.headless', yes_no, 'Скрыть браузер'),
            ('--chrome.silent-browser', yes_no, 'Отключить отладочную информацию браузера'),
            ('--chrome.start-maximized', yes_no, 'Запустить окно браузера развёрнутым'),
            ('--chrome.memory-limit', '{4096,5120,...}',
             'Лимит оперативной памяти браузера (мегабайт)'),
        )),
        ('Аргументы CSV/XLSX', (
            ('--writer.csv.add-rubrics', yes_no, 'Добавить колонку "Рубрики"'),
            ('--writer.csv.add-comments', yes_no,
             'Добавлять комментарии к ячейкам Телефон, E-Mail, и т.д.'),
            ('--writer.csv.columns-per-entity', '{1,2,3,...}',
             'Количество колонок для результата с несколькими возможными значениями: '
             'Телефон_1, Телефон_2, и т.д.'),
            ('--writer.csv.remove-empty-columns', yes_no,
             'Удалить пустые колонки по завершению работы парсера'),
            ('--writer.csv.remove-duplicates', yes_no,
             'Удалить повторяющиеся записи по завершению работы парсера'),
            ('--writer.csv.join_char', '{; ,% ,...}',
             'Разделитель для комплексных значений ячеек Рубрики, Часы работы'),
        )),
        ('Аргументы парсера', (
            ('--parser.use-gc', yes_no,
             'Включить сборщик мусора - сдерживает быстрое заполнение RAM, уменьшает скорость парсинга'),
            ('--parser.gc-pages-interval', '{5,10,...}',
             'Запуск сборщика мусора каждую N-ую страницу результатов (если сборщик включен)'),
            ('--parser.max-records', '{1000,2000,...}',
             'Максимальное количество спарсенных записей с одного URL'),
            ('--parser.skip-404-response', yes_no,
             'Пропускать ссылки вернувшие сообщение "Точных совпадений нет / Не найдено"'),
            ('--parser.delay_between_clicks', '{0,100,...}',
             'Задержка между кликами по записям (миллисекунд)'),
        )),
        ('Прочие аргументы', (
            ('--writer.verbose', yes_no, 'Отображать наименования позиций во время парсинга'),
            ('--writer.encoding', '{utf8,1251,...}', 'Кодировка результирующего файла'),
        )),
    )
    for group_title, options in option_groups:
        group = parser.add_argument_group(group_title)
        for flag, metavar, help_text in options:
            group.add_argument(flag, metavar=metavar, help=help_text)

    service_group = parser.add_argument_group('Служебные аргументы')
    service_group.add_argument('-v', '--version', action='version',
                               version=f'%(prog)s {version}',
                               help='Показать версию программы и выйти')
    service_group.add_argument('-h', '--help', action='help',
                               help='Показать эту справку и выйти')

    args = parser.parse_args()

    # Dotted option names are unwrapped before being passed to Configuration.
    config_args = unwrap_dot_dict(vars(args))

    try:
        # Initialize config with command line arguments
        config = Configuration(**config_args)
    except pydantic.ValidationError as e:
        report = report_from_validation_error(e, config_args)
        errors = []
        for path, description in report.items():
            invalid_value = description['invalid_value']
            message = description['error_message']
            errors.append(f'aргумент --{path} {invalid_value} ({message})')

        # Validation problems are reported through the parser's own error method.
        parser.error(', '.join(errors))

    return args, config
159 | if args.url is None or args.output_path is None or args.format is None: 160 | # Load user config and merge it with one created by command line arguments. 161 | user_config = Configuration.load_config(auto_create=True) 162 | user_config.merge_with(command_line_config) 163 | config = user_config 164 | app = gui_app 165 | else: 166 | config = command_line_config 167 | app = cli_app 168 | 169 | app(args.url, args.output_path, args.format, config) 170 | -------------------------------------------------------------------------------- /parser_2gis/gui/settings.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pydantic 4 | 5 | from ..common import (GUI_ENABLED, report_from_validation_error, 6 | running_linux, unwrap_dot_dict) 7 | from ..config import Configuration 8 | from ..logger import logger 9 | from .error_popup import gui_error_popup 10 | from .utils import ensure_gui_enabled 11 | 12 | if GUI_ENABLED: 13 | import PySimpleGUI as sg 14 | 15 | 16 | @ensure_gui_enabled 17 | def gui_settings(config: Configuration) -> None: 18 | """Run settings. 19 | 20 | Args: 21 | config: Configuration to be changed. 
22 | """ 23 | # Window layout 24 | layout = [ 25 | [ 26 | sg.Frame('Браузер', expand_x=True, pad=((5, 5), (5, 10)), layout=[ 27 | [ 28 | sg.Checkbox('Отключить изображения', pad=((0, 10), (5, 0)), key='-CHROME.DISABLE_IMAGES-', 29 | tooltip='Отключить изображения для увеличения скорости работы', 30 | default=config.chrome.disable_images, 31 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 32 | ], 33 | [ 34 | sg.Checkbox('Запускать развёрнутым', pad=((0, 10), (0, 0)), key='-CHROME.START_MAXIMIZED-', 35 | tooltip='Запускать браузер развёрнутым во весь экран', 36 | default=config.chrome.start_maximized, 37 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 38 | ], 39 | [ 40 | sg.Checkbox('Скрытый режим', pad=((0, 10), (0, 0)), key='-CHROME.HEADLESS-', 41 | tooltip='Запускать браузер в скрытом виде', 42 | default=config.chrome.headless, 43 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 44 | ], 45 | [ 46 | sg.Column([ 47 | [ 48 | sg.Column([ 49 | [ 50 | sg.Text('Лимит RAM'), 51 | ], 52 | ], expand_x=True, pad=0), 53 | sg.Column([ 54 | [ 55 | sg.Spin([x for x in range(1, 100)], size=(6, 1), key='-CHROME.MEMORY_LIMIT-', 56 | initial_value=config.chrome.memory_limit, 57 | tooltip=('Лимит оперативной памяти браузера (мегабайт)')), 58 | ], 59 | ], element_justification='right', pad=0), 60 | ], 61 | ], expand_x=True, pad=((3, 3), (3, 5))), 62 | ], 63 | ]), 64 | ], 65 | [ 66 | sg.Frame('Парсер', expand_x=True, pad=((5, 5), (5, 10)), layout=[ 67 | [ 68 | sg.Checkbox('Показывать города', pad=((0, 10), (5, 0)), key='-WRITER.VERBOSE-', 69 | tooltip='Показывать города с логе', 70 | default=config.writer.verbose, 71 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 72 | ], 73 | # [ 74 | # sg.Checkbox('Сборщик мусора', pad=((0, 10), (0, 0)), key='-PARSER.USE_GC-', 75 | # tooltip='Сборщик мусора - сдерживает быстрое заполнение RAM, уменьшает скорость парсинга.', 76 | # 
default=config.parser.use_gc, 77 | # checkbox_color=sg.theme_input_background_color(), enable_events=True), 78 | # ], 79 | [ 80 | sg.Checkbox('Точные совпадения', pad=((0, 10), (0, 0)), key='-PARSER.SKIP_404_RESPONSE-', 81 | tooltip='Пропускать ссылки вернувшие сообщение "Точных совпадений нет / Не найдено"', 82 | default=config.parser.skip_404_response, 83 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 84 | ], 85 | [ 86 | sg.Column([ 87 | [ 88 | sg.Column([ 89 | [ 90 | sg.Text('Задержка кликов'), 91 | ], 92 | ], expand_x=True, pad=0), 93 | sg.Column([ 94 | [ 95 | sg.Spin([x for x in range(1, 100000)], size=(5, 1), key='-PARSER.DELAY_BETWEEN_CLICKS-', 96 | initial_value=config.parser.delay_between_clicks, 97 | tooltip='Задержка между кликами по записям (миллисекунд)'), 98 | ], 99 | ], element_justification='right', pad=0), 100 | ], 101 | ], expand_x=True, pad=((3, 3), (3, 0))), 102 | ], 103 | [ 104 | sg.Column([ 105 | [ 106 | sg.Column([ 107 | [ 108 | sg.Text('Лимит записей'), 109 | ], 110 | ], expand_x=True, pad=0), 111 | sg.Column([ 112 | [ 113 | sg.Spin([x for x in range(1, 100000)], size=(5, 1), key='-PARSER.MAX_RECORDS-', 114 | initial_value=config.parser.max_records, 115 | tooltip='Максимальное количество спарсенных записей с одного URL'), 116 | ], 117 | ], element_justification='right', pad=0), 118 | ], 119 | ], expand_x=True, pad=((3, 3), (3, 5))), 120 | ], 121 | ]), 122 | ], 123 | [ 124 | sg.Frame('CSV/XLSX', expand_x=True, pad=((5, 5), (5, 10)), layout=[ 125 | [ 126 | sg.Checkbox('Добавить "Рубрики"', pad=((0, 10), (5, 0)), key='-WRITER.CSV.ADD_RUBRICS-', 127 | tooltip='Добавить колонку "Рубрики"', 128 | default=config.writer.csv.add_rubrics, 129 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 130 | ], 131 | [ 132 | sg.Checkbox('Добавлять комментарии', pad=((0, 10), (0, 0)), key='-WRITER.CSV.ADD_COMMENTS-', 133 | tooltip='Добавлять комментарии к ячейкам Телефон, E-Mail, и т.д.', 134 | 
default=config.writer.csv.add_comments, 135 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 136 | ], 137 | [ 138 | sg.Checkbox('Удалить пустые колонки', pad=((0, 10), (0, 0)), key='-WRITER.CSV.REMOVE_EMPTY_COLUMNS-', 139 | tooltip='Удалить пустые колонки по завершению работы парсера', 140 | default=config.writer.csv.remove_empty_columns, 141 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 142 | ], 143 | [ 144 | sg.Checkbox('Удалить дубликаты', pad=((0, 10), (0, 0)), key='-WRITER.CSV.REMOVE_DUPLICATES-', 145 | tooltip='Удалить повторяющиеся записи по завершению работы парсера', 146 | default=config.writer.csv.remove_duplicates, 147 | checkbox_color=sg.theme_input_background_color(), enable_events=True), 148 | ], 149 | [ 150 | sg.Column([ 151 | [ 152 | sg.Column([ 153 | [ 154 | sg.Text('Сложные колонки'), 155 | ], 156 | ], expand_x=True, pad=0), 157 | sg.Column([ 158 | [ 159 | sg.Spin([x for x in range(1, 100)], size=(5, 1), key='-WRITER.CSV.COLUMNS_PER_ENTITY-', 160 | initial_value=config.writer.csv.columns_per_entity, 161 | tooltip=('Количество колонок для результата с несколькими возможными значениями: ' 162 | 'Телефон_1, Телефон_2, и т.д.')), 163 | ], 164 | ], element_justification='right', pad=0), 165 | ], 166 | ], expand_x=True, pad=((3, 3), (3, 5))), 167 | ], 168 | ]), 169 | ], 170 | [ 171 | sg.Button('Сохранить', size=(11, 1), pad=((4, 0), (7, 7)), key='-BTN_SAVE-'), 172 | sg.Column([ 173 | [ 174 | sg.Button('Отмена', size=(8, 1), pad=((7, 0), (7, 7)), 175 | key='-BTN_CANCEL-'), 176 | ], 177 | ], expand_x=True, element_justification='right'), 178 | ], 179 | ] 180 | 181 | window_title = 'Settings' if running_linux() else 'Настройки' 182 | window = sg.Window(window_title, layout, auto_size_text=True, finalize=True, 183 | font='Any 12', modal=True, keep_on_top=True) 184 | 185 | # Main loop 186 | while True: 187 | event, values = window.Read() 188 | 189 | # Close window 190 | if event in (None, '-BTN_CANCEL-'): 
class MainParser:
    """Main parser that extracts useful payload
    from search result pages using Chrome browser
    and saves it into a `csv`, `xlsx` or `json` files.

    Args:
        url: 2GIS URLs with items to be collected.
        chrome_options: Chrome options.
        parser_options: Parser options.
    """
    def __init__(self, url: str,
                 chrome_options: ChromeOptions,
                 parser_options: ParserOptions) -> None:
        self._options = parser_options
        self._url = url

        # "Catalog Item Document" response pattern.
        self._item_response_pattern = r'https://catalog\.api\.2gis.[^/]+/.*/items/byid'

        # Open browser, start remote
        response_patterns = [self._item_response_pattern]
        self._chrome_remote = ChromeRemote(chrome_options=chrome_options,
                                           response_patterns=response_patterns)
        self._chrome_remote.start()

        # Add counter for 2GIS requests
        self._add_xhr_counter()

        # Disable specific requests
        blocked_urls = blocked_requests(extended=chrome_options.disable_images)
        self._chrome_remote.add_blocked_requests(blocked_urls)

    @staticmethod
    def url_pattern() -> str:
        """URL pattern for the parser."""
        return r'https?://2gis\.[^/]+/[^/]+/search/.*'

    @wait_until_finished(timeout=5, throw_exception=False)
    def _get_links(self) -> list[DOMNode]:
        """Extract item links (`firm` / `station`) from the current DOM snapshot.

        A link is accepted only when its `stat` query argument is a
        decodable (URL-quoted) base64 payload.
        """
        def valid_link(node: DOMNode) -> bool:
            if node.local_name == 'a' and 'href' in node.attributes:
                link_match = re.match(r'.*/(firm|station)/.*\?stat=(?P<data>[a-zA-Z0-9%]+)',
                                      node.attributes['href'])
                if link_match:
                    try:
                        base64.b64decode(urllib.parse.unquote(link_match.group('data')))
                        return True
                    except ValueError:
                        # `binascii.Error` (malformed base64) is a ValueError subclass;
                        # anything else should not be silently swallowed.
                        pass

            return False

        dom_tree = self._chrome_remote.get_document()
        return dom_tree.search(valid_link)

    def _add_xhr_counter(self) -> None:
        """Inject old-school wrapper around XMLHttpRequest,
        to keep track of all pending requests to 2GIS website."""
        xhr_script = r'''
            (function() {
                var oldOpen = XMLHttpRequest.prototype.open;
                XMLHttpRequest.prototype.open = function(method, url, async, user, pass) {
                    if (url.match(/^https?\:\/\/[^\/]*2gis\.[a-z]+/i)) {
                        if (window.openHTTPs == undefined) {
                            window.openHTTPs = 1;
                        } else {
                            window.openHTTPs++;
                        }
                        this.addEventListener("readystatechange", function() {
                            if (this.readyState == 4) {
                                window.openHTTPs--;
                            }
                        }, false);
                    }
                    oldOpen.call(this, method, url, async, user, pass);
                }
            })();
        '''
        self._chrome_remote.add_start_script(xhr_script)

    @wait_until_finished(timeout=120)
    def _wait_requests_finished(self) -> bool:
        """Wait for all pending requests."""
        return self._chrome_remote.execute_script('window.openHTTPs == 0')

    def _get_available_pages(self) -> dict[int, DOMNode]:
        """Get available pages to navigate.

        Returns:
            Mapping of page number to the DOM link that leads to it.
        """
        dom_tree = self._chrome_remote.get_document()
        dom_links = dom_tree.search(lambda x: x.local_name == 'a' and 'href' in x.attributes)

        available_pages = {}
        for link in dom_links:
            link_match = re.match(r'.*/search/.*/page/(?P<page_number>\d+)', link.attributes['href'])
            if link_match:
                available_pages[int(link_match.group('page_number'))] = link

        return available_pages

    def _go_page(self, n_page: int) -> Optional[int]:
        """Go to the page with number `n_page`.

        Note:
            `n_page` has to exist in the current DOM.
            Otherwise 2GIS anti-bot will redirect you to the first page.

        Args:
            n_page: Page number.

        Returns:
            Navigated page number, or `None` if there is no link to that page.
        """
        available_pages = self._get_available_pages()
        if n_page in available_pages:
            self._chrome_remote.perform_click(available_pages[n_page])
            return n_page

        return None

    def parse(self, writer: FileWriter) -> None:
        """Parse URL with result items.

        Args:
            writer: Target file writer.
        """
        # Starting from page 6 and further
        # 2GIS redirects user to the beginning automatically (anti-bot protection).
        # If a page argument found in the URL, we should manually walk to it first.

        current_page_number = 1

        # NB: flags must be passed by keyword, the 4th positional argument of `re.sub` is `count`.
        url = re.sub(r'/page/\d+', '', self._url, flags=re.I)

        page_match = re.search(r'/page/(?P<page_number>\d+)', self._url, re.I)
        walk_page_number: Optional[int] = int(page_match.group('page_number')) if page_match else None

        # Go URL
        self._chrome_remote.navigate(url, referer='https://google.com', timeout=120)

        # Document loaded, get its response
        responses = self._chrome_remote.get_responses(timeout=5)
        if not responses:
            logger.error('Ошибка получения ответа сервера.')
            return
        document_response = responses[0]

        # Handle 404
        assert document_response['mimeType'] == 'text/html'
        if document_response['status'] == 404:
            logger.warning('Сервер вернул сообщение "Точных совпадений нет / Не найдено".')

            if self._options.skip_404_response:
                return

        # Parsed records
        collected_records = 0

        # Already visited links
        visited_links: set[str] = set()

        # This wrapper is not necessary, but I'd like to be sure
        # we haven't gathered links from old DOM somehow.
        @wait_until_finished(timeout=10, throw_exception=False)
        def get_unique_links() -> list[DOMNode]:
            links = self._get_links()
            link_addresses = set(x.attributes['href'] for x in links)
            if link_addresses & visited_links:
                return []

            visited_links.update(link_addresses)
            return links

        while True:
            # Wait all 2GIS requests get finished
            self._wait_requests_finished()

            # Gather links to be clicked
            links = get_unique_links()

            # We should parse the page if we are not walking
            if not walk_page_number:
                # Iterate through gathered links
                for link in links:
                    resp = None
                    for _ in range(3):  # 3 attempts to get response
                        # Click the link to provoke request
                        # with an auth key and secret arguments
                        self._chrome_remote.perform_click(link)

                        # Delay between clicks, could be useful if
                        # 2GIS's anti-bot service becomes more strict.
                        if self._options.delay_between_clicks:
                            self._chrome_remote.wait(self._options.delay_between_clicks / 1000)

                        # Gather response and collect useful payload.
                        resp = self._chrome_remote.wait_response(self._item_response_pattern)

                        # If request is failed - repeat, otherwise go further.
                        if resp and resp['status'] >= 0:
                            break

                    # Get response body data
                    doc = None
                    if resp and resp['status'] >= 0:
                        data = self._chrome_remote.get_response_body(resp, timeout=10)

                        try:
                            doc = json.loads(data)
                        except (json.JSONDecodeError, TypeError):
                            # TypeError covers a missing body (e.g. `None`),
                            # which must skip the item rather than abort the whole run.
                            logger.error('Сервер вернул некорректный JSON документ: "%s", пропуск позиции.', data)
                            doc = None

                    if doc:
                        # Write API document into a file
                        writer.write(doc)
                        collected_records += 1
                    else:
                        logger.error('Данные не получены, пропуск позиции.')

                    # We've reached our limit, bail
                    if collected_records >= self._options.max_records:
                        logger.info('Спарсено максимально разрешенное количество записей с данного URL.')
                        return

            # Evaluate Garbage Collection if it's been exposed and enabled
            if self._options.use_gc and current_page_number % self._options.gc_pages_interval == 0:
                logger.debug('Запуск сборщика мусора.')
                self._chrome_remote.execute_script('"gc" in window && window.gc()')

            # Free memory allocated for collected requests
            self._chrome_remote.clear_requests()

            # Calculate next page number and navigate it
            if walk_page_number:
                # While walking, jump to the reachable page that is closest to the target one
                available_pages = self._get_available_pages()
                available_pages_ahead = {k: v for k, v in available_pages.items()
                                         if k > current_page_number}
                next_page_number = min(available_pages_ahead, key=lambda n: abs(n - walk_page_number),  # type: ignore
                                       default=current_page_number + 1)
            else:
                next_page_number = current_page_number + 1

            current_page_number = self._go_page(next_page_number)  # type: ignore
            if not current_page_number:
                break  # Reached the end of the search results

            # Unset walking page if we've done walking to the desired page
            if walk_page_number and walk_page_number <= current_page_number:
                walk_page_number = None

    def close(self) -> None:
        """Stop the remote Chrome browser."""
        self._chrome_remote.stop()

    def __enter__(self) -> MainParser:
        return self

    def __exit__(self, *exc_info) -> None:
        self.close()

    def __repr__(self) -> str:
        classname = self.__class__.__name__
        return (f'{classname}(parser_options={self._options!r}, '
                f'chrome_remote={self._chrome_remote!r}, '
                f'url={self._url!r})')
class CSVWriter(FileWriter):
    """Writer to CSV table."""
    @property
    def _type_names(self) -> dict[str, str]:
        """Fallback display names for items without their own name, keyed by item type."""
        return {
            'parking': 'Парковка',
            'street': 'Улица',
            'road': 'Дорога',
            'crossroad': 'Перекрёсток',
            'station': 'Остановка',
        }

    @property
    def _complex_mapping(self) -> dict[str, Any]:
        """Column title bases for "complex" (multi-value) columns."""
        # Complex mapping means its content could contain several entities bound by user settings.
        # For example: phone -> phone_1, phone_2, ..., phone_n
        return {
            'phone': 'Телефон', 'email': 'E-mail', 'website': 'Веб-сайт', 'instagram': 'Instagram',
            'twitter': 'Twitter', 'facebook': 'Facebook', 'vkontakte': 'ВКонтакте', 'whatsapp': 'WhatsApp',
            'viber': 'Viber', 'telegram': 'Telegram', 'youtube': 'YouTube', 'skype': 'Skype'
        }

    @property
    def _data_mapping(self) -> dict[str, Any]:
        """Ordered mapping of CSV field names to their header titles."""
        data_mapping = {
            'name': 'Наименование', 'description': 'Описание', 'rubrics': 'Рубрики',
            'address': 'Адрес', 'address_comment': 'Комментарий к адресу',
            'postcode': 'Почтовый индекс', 'living_area': 'Микрорайон', 'district': 'Район', 'city': 'Город',
            'district_area': 'Округ', 'region': 'Регион', 'country': 'Страна', 'schedule': 'Часы работы',
            'timezone': 'Часовой пояс', 'general_rating': 'Рейтинг', 'general_review_count': 'Количество отзывов'
        }

        # Expand complex mapping
        for k, v in self._complex_mapping.items():
            for n in range(1, self._options.csv.columns_per_entity + 1):
                data_mapping[f'{k}_{n}'] = f'{v} {n}'

        if not self._options.csv.add_rubrics:
            data_mapping.pop('rubrics', None)

        return {
            **data_mapping,
            **{
                'point_lat': 'Широта',
                'point_lon': 'Долгота',
                'url': '2GIS URL',
                'type': 'Тип',
            }
        }

    def _writerow(self, row: dict[str, Any]) -> None:
        """Write a `row` into CSV."""
        if self._options.verbose:
            logger.info('Парсинг [%d] > %s', self._wrote_count + 1, row['name'])

        try:
            self._writer.writerow(row)
        except Exception as e:
            logger.error('Ошибка во время записи: %s', e)

    def __enter__(self) -> CSVWriter:
        super().__enter__()
        self._writer = csv.DictWriter(self._file, self._data_mapping.keys())
        self._writer.writerow(self._data_mapping)  # Write header
        self._wrote_count = 0
        return self

    def __exit__(self, *exc_info) -> None:
        super().__exit__(*exc_info)
        if self._options.csv.remove_empty_columns:
            logger.info('Удаление пустых колонок CSV.')
            self._remove_empty_columns()
        if self._options.csv.remove_duplicates:
            logger.info('Удаление повторяющихся записей CSV.')
            self._remove_duplicates()

    def _remove_empty_columns(self) -> None:
        """Post-process: Remove empty columns.

        Only "complex" columns (`phone_1`, `phone_2`, ...) are candidates for removal;
        a complex column is dropped when no data row has a value in it.
        """
        complex_columns = self._complex_mapping.keys()
        data_mapping = self._data_mapping

        # Compile the matcher once instead of rebuilding it for every column
        complex_column_re = re.compile('|'.join(fr'^{x}_\d+$' for x in complex_columns))
        complex_columns_count = {c: 0 for c in data_mapping if complex_column_re.match(c)}

        # Looking for empty columns
        with self._open_file(self._file_path, 'r') as f_csv:
            csv_reader = csv.DictReader(f_csv, data_mapping.keys())  # type: ignore
            next(csv_reader, None)  # Skip header
            for row in csv_reader:
                for column_name in complex_columns_count:
                    if row[column_name] != '':
                        complex_columns_count[column_name] += 1

        # Generate new data mapping: keep every plain column and non-empty complex columns
        new_data_mapping: dict[str, Any] = {
            k: v for k, v in data_mapping.items()
            if k not in complex_columns_count or complex_columns_count[k] > 0
        }

        # Rename single complex column - remove postfix numbers
        for column in complex_columns:
            if f'{column}_1' in new_data_mapping and f'{column}_2' not in new_data_mapping:
                new_data_mapping[f'{column}_1'] = re.sub(r'\s+\d+$', '', new_data_mapping[f'{column}_1'])

        # Populate new csv
        tmp_csv_name = os.path.splitext(self._file_path)[0] + '.removed-columns.csv'

        with self._open_file(tmp_csv_name, 'w') as f_tmp_csv, \
                self._open_file(self._file_path, 'r') as f_csv:
            csv_writer = csv.DictWriter(f_tmp_csv, new_data_mapping.keys())  # type: ignore
            csv_reader = csv.DictReader(f_csv, data_mapping.keys())  # type: ignore
            csv_writer.writerow(new_data_mapping)  # Write new header
            next(csv_reader, None)  # Skip header

            for row in csv_reader:
                new_row = {k: v for k, v in row.items() if k in new_data_mapping}
                csv_writer.writerow(new_row)

        # Replace original table with new one
        shutil.move(tmp_csv_name, self._file_path)

    def _remove_duplicates(self) -> None:
        """Post-process: Remove duplicates.

        Records are compared as parsed CSV rows rather than physical lines,
        so a record with a line break inside a quoted field is handled as one unit.
        """
        tmp_csv_name = os.path.splitext(self._file_path)[0] + '.deduplicated.csv'
        with self._open_file(tmp_csv_name, 'w') as f_tmp_csv, \
                self._open_file(self._file_path, 'r') as f_csv:
            csv_writer = csv.writer(f_tmp_csv)
            seen_records: set[tuple[str, ...]] = set()
            for record in csv.reader(f_csv):
                record_key = tuple(record)
                if record_key in seen_records:
                    continue

                seen_records.add(record_key)
                csv_writer.writerow(record)

        # Replace original table with new one
        shutil.move(tmp_csv_name, self._file_path)

    def write(self, catalog_doc: Any) -> None:
        """Write Catalog Item API JSON document down to CSV table.

        Args:
            catalog_doc: Catalog Item API JSON document.
        """
        if not self._check_catalog_doc(catalog_doc):
            return

        row = self._extract_raw(catalog_doc)
        if row:
            self._writerow(row)
            self._wrote_count += 1

    def _extract_raw(self, catalog_doc: Any) -> dict[str, Any]:
        """Extract data from Catalog Item API JSON document.

        Args:
            catalog_doc: Catalog Item API JSON document.

        Returns:
            Dictionary for CSV row, or an empty dictionary
            if the document failed model validation.
        """
        data: dict[str, Any] = {k: None for k in self._data_mapping.keys()}

        item = catalog_doc['result']['items'][0]

        try:
            catalog_item = CatalogItem(**item)
        except ValidationError as e:
            errors = []
            errors_report = report_from_validation_error(e, item)
            for path, description in errors_report.items():
                arg = description['invalid_value']
                error_msg = description['error_message']
                errors.append(f'[*] Поле: {path}, значение: {arg}, ошибка: {error_msg}')

            error_str = 'Ошибка парсинга:\n' + '\n'.join(errors)
            error_str += '\nДокумент каталога: ' + str(catalog_doc)
            logger.error(error_str)

            return {}

        # Name, description
        if catalog_item.name_ex:
            data['name'] = catalog_item.name_ex.primary
            data['description'] = catalog_item.name_ex.extension
        elif catalog_item.name:
            data['name'] = catalog_item.name
        elif catalog_item.type in self._type_names:
            data['name'] = self._type_names[catalog_item.type]

        # Type
        data['type'] = catalog_item.type

        # Address
        data['address'] = catalog_item.address_name

        # Reviews
        if catalog_item.reviews:
            data['general_rating'] = catalog_item.reviews.general_rating
            data['general_review_count'] = catalog_item.reviews.general_review_count

        # Point location
        if catalog_item.point:
            data['point_lat'] = catalog_item.point.lat  # Latitude
            data['point_lon'] = catalog_item.point.lon  # Longitude

        # Address comment
        data['address_comment'] = catalog_item.address_comment

        # Post code
        if catalog_item.address:
            data['postcode'] = catalog_item.address.postcode

        # Timezone
        if catalog_item.timezone is not None:
            data['timezone'] = catalog_item.timezone

        # Administrative location details
        adm_div_types = ('country', 'region', 'district_area', 'city', 'district', 'living_area')
        for div in catalog_item.adm_div:
            if div.type in adm_div_types:
                data[div.type] = div.name

        # Item URL
        data['url'] = catalog_item.url

        def append_contact(contact_group: Any, contact_type: str, priority_fields: list[str],
                           formatter: Callable[[str], str] | None = None) -> None:
            """Add contacts of `contact_group` to `data`.

            Args:
                contact_group: Contact group to take contacts from
                contact_type: Contact type (see `Contact` in `catalog_item.py`)
                priority_fields: Field of contact to be added, sorted by priority
                formatter: Field value formatter
            """
            contacts = [x for x in contact_group.contacts if x.type == contact_type]
            for i, contact in enumerate(contacts, 1):
                # Take the first non-empty field by priority, so a declared-but-empty
                # field (e.g. `text`) falls back to the next one (e.g. `value`).
                contact_value = None
                for field in priority_fields:
                    contact_value = getattr(contact, field, None)
                    if contact_value:
                        break

                # Empty contact value, bail
                if not contact_value:
                    return

                data_name = f'{contact_type}_{i}'
                if data_name in data:
                    data[data_name] = formatter(contact_value) if formatter else contact_value

                    # Add comment on demand
                    if self._options.csv.add_comments and contact.comment:
                        data[data_name] += ' (%s)' % contact.comment

        # Contacts
        # NOTE(review): column numbering restarts for every contact group, so a later group
        # overwrites `<type>_1..n` of an earlier one - confirm whether that is intended.
        for contact_group in catalog_item.contact_groups:
            # URLs
            for t in ['website', 'vkontakte', 'whatsapp', 'viber', 'telegram',
                      'instagram', 'facebook', 'twitter', 'youtube', 'skype']:
                append_contact(contact_group, t, ['url'])

            # Remove arguments from WhatsApp URL
            for field in data:
                if field.startswith('whatsapp') and data[field]:
                    data[field] = data[field].split('?')[0]

            # Values
            for t in ['email', 'skype']:
                append_contact(contact_group, t, ['value'])

            # Phone (`value` sometimes has strange content inside, so we better parse `text`.
            # If `text` is missing or empty - use `value` attribute)
            append_contact(contact_group, 'phone', ['text', 'value'],
                           formatter=lambda x: re.sub(r'^\+7', '8', re.sub(r'[^0-9+]', '', x)))

        # Schedule
        if catalog_item.schedule:
            data['schedule'] = catalog_item.schedule.to_str(self._options.csv.join_char,
                                                            self._options.csv.add_comments)

        # Rubrics
        if self._options.csv.add_rubrics:
            data['rubrics'] = self._options.csv.join_char.join(x.name for x in catalog_item.rubrics)

        return data
37 | """ 38 | # App color theme 39 | sg.theme('Green') 40 | 41 | # Set icon 42 | sg.set_global_icon(image_data('icon', 'png')) 43 | 44 | # Setup main CLI logger 45 | setup_cli_logger(config.log) 46 | 47 | # Result format 48 | default_result_format = format if format else 'csv' 49 | result_filetype = {'csv': [('CSV Table', '*.csv')], 50 | 'xlsx': [('Microsoft Excel Spreadsheet', '*.xlsx')], 51 | 'json': [('JSON', '*.json')]} 52 | 53 | # If urls wasn't passed then let it be an empty list 54 | if urls is None: 55 | urls = [] 56 | 57 | # Window layout 58 | layout = [ 59 | [ 60 | sg.Text('URL', size=(4, 1)), 61 | sg.Input(key='-IN_URL-', use_readonly_for_disable=True, expand_x=True), 62 | sg.Button('...', size=(4, 1), key='-BTN_URLS-'), 63 | sg.Button('', image_data=image_data('settings'), key='-BTN_SETTINGS-', tooltip=str(config.path)), 64 | ], 65 | [ 66 | sg.Frame('Результат', expand_x=True, expand_y=True, layout=[ 67 | [ 68 | sg.Column([ 69 | [ 70 | sg.Text('Тип'), 71 | sg.Combo(key='-FILE_FORMAT-', default_value=default_result_format, 72 | values=['csv', 'xlsx', 'json'], readonly=True, enable_events=True), 73 | sg.Text('Путь'), 74 | sg.Input(key='-OUTPUT_PATH-', expand_x=True, 75 | default_text='' if output_path is None else output_path), 76 | sg.FileSaveAs(key='-OUTPUT_PATH_BROWSE-', button_text='Обзор', size=(7, 1), 77 | default_extension=f'.{default_result_format}', 78 | file_types=result_filetype[default_result_format]), 79 | ], 80 | ], expand_x=True), 81 | ], 82 | ]), 83 | ], 84 | [ 85 | sg.Frame('Лог', expand_x=True, expand_y=True, layout=[ 86 | [ 87 | sg.Multiline(key='-LOG-', size=(80, 20), expand_x=True, autoscroll=True, 88 | reroute_stdout=True, reroute_stderr=True, echo_stdout_stderr=True), 89 | ], 90 | ]), 91 | ], 92 | [ 93 | sg.Image(data=image_data('logo'), key='-IMG_LOGO-', 94 | enable_events=True, background_color=sg.theme_background_color()), 95 | sg.Text(f'v{version}'), 96 | sg.Column([ 97 | [ 98 | sg.Image(key='-IMG_LOADING-', visible=False, 
background_color=sg.theme_background_color()), 99 | ], 100 | ], expand_x=True, element_justification='right'), 101 | sg.Column([ 102 | [ 103 | sg.Button('Запуск', key='-BTN_START-', size=(8, 1)), 104 | sg.Button('Стоп', key='-BTN_STOP-', size=(6, 1), button_color=('white', 'orange3'), visible=False), 105 | ], 106 | ], element_justification='right'), 107 | sg.Button('Выход', size=(7, 1), button_color=('white', 'firebrick3'), key='-BTN_EXIT-'), 108 | ], 109 | ] 110 | 111 | # tkinter could encounter encoding problem with cyrillics characters on linux systems (toolbar, topbar), 112 | # so let the window titles be in English. No big deal, actually. 113 | window_title = 'Parser 2GIS' if running_linux() else 'Парсер 2GIS' 114 | 115 | # Main window 116 | window = sg.Window(window_title, layout, auto_size_text=True, finalize=True, font='Any 12') 117 | 118 | # Setup text widgets 119 | setup_text_widget(window['-IN_URL-'].widget, window.TKroot, menu_clear=False, set_focus=True) 120 | setup_text_widget(window['-OUTPUT_PATH-'].widget, window.TKroot, menu_clear=False) 121 | setup_text_widget(window['-LOG-'].widget, window.TKroot, menu_paste=False, menu_cut=False) 122 | 123 | # Forbid user to edit output console, 124 | # block any keys except ctl+c, ←, ↑, →, ↓ 125 | def log_key_handler(e: tk.Event) -> str | None: 126 | if e.char == '\x03' or e.keysym in ('Left', 'Up', 'Right', 'Down'): 127 | return None 128 | 129 | return 'break' 130 | 131 | window['-LOG-'].widget.bind('', log_key_handler) 132 | window['-LOG-'].widget.bind('<>', lambda e: 'break') 133 | window['-LOG-'].widget.bind('<>', lambda e: 'break') 134 | 135 | # Enable logging queue to be able to handle log in the mainloop 136 | log_queue: queue.Queue[tuple[str, str]] = queue.Queue() # Queue of log messages (log_level, log_message) 137 | setup_gui_logger(log_queue, config.log) 138 | 139 | # Hand cursor for logo 140 | window['-IMG_LOGO-'].widget.config(cursor='hand2') 141 | 142 | # Set config settings button hover/click 
image 143 | def change_settings_image(image_name: str) -> None: 144 | window['-BTN_SETTINGS-'].update(image_data=image_data(image_name)) # noqa: F821 145 | 146 | window['-BTN_SETTINGS-'].TKButton.bind( 147 | '