├── .gitignore
├── .isort.cfg
├── .style.yapf
├── README.md
├── dialogs_data_parsers
│   ├── __init__.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   └── log_config.py
│   ├── flibusta
│   │   ├── __init__.py
│   │   ├── author_words_annotation_generator.py
│   │   ├── dialogs_iterator.py
│   │   └── dialogs_parser.py
│   ├── pikabu
│   │   ├── __init__.py
│   │   ├── dialogs_iterator.py
│   │   ├── story_crawler.py
│   │   └── story_links_crawler.py
│   └── utils.py
├── requirements.txt
├── scripts
│   ├── annotate_flibusta_raw_dialogs.py
│   ├── crawl_pikabu_stories.py
│   ├── crawl_pikabu_story_links.py
│   └── prepare_flibusta_raw_dialogs.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # idea
107 | /.idea/
108 | 
109 | # custom
110 | /data/
111 | .vscode
112 | /logs/
113 | 
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | line_length=120
3 | multi_line_output=2
4 | balanced_wrapping=True
--------------------------------------------------------------------------------
/.style.yapf:
--------------------------------------------------------------------------------
1 | [style]
2 | BASED_ON_STYLE = google
3 | 
4 | COLUMN_LIMIT = 120
5 | SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = true
6 | SPACES_AROUND_POWER_OPERATOR = true
7 | INDENT_DICTIONARY_VALUE = true
8 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dialogs Data Parsers
2 | A repository for parsing dialog data.
3 | **For research purposes only!**
4 | 
5 | ```shell script
6 | pip install -U -e .
7 | ```
8 | 
9 | ## Pikabu
10 | #### Parsing
11 | To collect and parse data from [pikabu](https://pikabu.ru), first gather the links to the stories:
12 | ```shell script
13 | python scripts/crawl_pikabu_story_links.py --root_dir path/to/output/dir
14 | ```
15 | - *--root_dir* - Path to the root directory where the parsing results will be stored.
16 | 
17 | The remaining arguments can be found in the script (they all have default values).
18 | 
19 | Once the story links have been collected, the story parser can be launched:
20 | ```shell script
21 | python scripts/crawl_pikabu_stories.py --root_dir path/to/output/dir
22 | ```
23 | - *--root_dir* - Path to the root directory that contains the story links (the same path that was passed to the
24 | previous script).
25 | 
26 | Parsing all of pikabu takes roughly a week. The script is resilient to interruptions: if parsing stops for any
27 | reason, it can be restarted with the same *--root_dir*, and it will continue from
28 | the point where it was interrupted.
29 | 
30 | #### Data format
31 | The result of pikabu parsing is a jsonl file. Each line is a separate JSON object with the following structure
32 | (the example is shown with indentation, but in the actual file each JSON object is written on a single line):
33 | ```json
34 | {
35 |   "url": "story url",
36 |   "story": {
37 |     "title": "story title",
38 |     "text": "story text",
39 |     "user_nick": "story author",
40 |     "tags": ["tag1", "tag2", "tag3"],
41 |     "comments_count": 42,
42 |     "shares": 10,
43 |     "saves": 228,
44 |     "timestamp": "2017-04-21T11:38:56+03:00",
45 |     "rating": 5
46 |   },
47 |   "comments": {
48 |     "8811": {
49 |       "text": "comment text",
50 |       "parent_id": 0,
51 |       "children": [8812, 8813]
52 |     },
53 |     "8812": {
54 |       "text": "comment text",
55 |       "parent_id": 8811,
56 |       "children": [8814, 8815, 8816]
57 |     }
58 |   }
59 | }
60 | ```
61 | A few important points:
62 | - The keys of the `comments` dictionary are comment ids. As keys they are strings, but the ids stored in the
63 | `parent_id` and `children` fields are integers. Keep this in mind when parsing the file.
64 | - If a comment has `parent_id` equal to 0, the comment has no parent (in this case the story itself can be
65 | treated as the parent).
66 | - Comments are stored as a tree. This format can be unfolded into dialogs. An example parser lives in
67 | `dialogs_data_parsers/pikabu/dialogs_iterator.py` (see the snippet below).
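For instance, a minimal sketch of reading the crawled `stories.jsonl` with that iterator (the file path and the `max_n_words_per_utterance` threshold here are placeholder values):

```python
from dialogs_data_parsers.pikabu.dialogs_iterator import PikabuDialogsIterator

# stories.jsonl is the file produced by scripts/crawl_pikabu_stories.py (placeholder path).
dialogs = PikabuDialogsIterator('path/to/output/dir/stories.jsonl', max_n_words_per_utterance=100)

for dialog in dialogs:
    # Each dialog is a list of utterance strings; every sub-dialog of two or more
    # utterances found in a comment tree is yielded as a separate sample.
    print(dialog)
    break
```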
68 | 
69 | ## Flibusta
70 | #### Parsing
71 | To parse dialogs from flibusta books you need a dump of the books. Naturally, I will not be attaching a link to
72 | the dump here :)
73 | 
74 | The dump is a directory with a large number of .zip archives with names like `f.fb2-188228-190111.zip`.
75 | Each archive in turn contains many files in the `fb2` format.
76 | 
77 | Dialogs are parsed with the following script:
78 | ```shell script
79 | python scripts/prepare_flibusta_raw_dialogs.py --flibusta_archives_dir path/to/flibusta/dir/with/archives --out_file_path path/to/dialogs/results/file --logs_dir path/to/logs/dir
80 | ```
81 | - *--flibusta_archives_dir* - Path to the root directory with all the archives;
82 | - *--out_file_path* - Path to the output jsonl file with the dialogs;
83 | - *--logs_dir* - Path to the directory where the logs will be written.
84 | 
85 | Parsing 130 archives takes about 13 hours and yields roughly 40-50 million dialogs. The parser already spreads the
86 | archives across a `multiprocessing.Pool`, so the wall-clock time depends on the number of available cores.
87 | 
88 | #### Data format
89 | The result of flibusta parsing is a jsonl file. Each line of the file is simply a list of messages:
90 | ```json
91 | ["Привет, как дела?", "Нормально", "Ясно, понятно"]
92 | ```
93 | 
94 | In theory, the author's words should already be filtered out of this data, but they may still slip through occasionally.
95 | Other anomalies are possible as well, although a quick manual inspection of a couple hundred dialogs did not reveal anything strange.
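The raw dialogs file can be expanded into training samples with the iterator shipped in this repository,
`dialogs_data_parsers/flibusta/dialogs_iterator.py`. A minimal sketch (the file path and the `logging_period` value are placeholders):

```python
from dialogs_data_parsers.flibusta.dialogs_iterator import FlibustaDialogsIterator

# The jsonl file written by scripts/prepare_flibusta_raw_dialogs.py (placeholder path).
dialogs = FlibustaDialogsIterator('path/to/dialogs/results/file', logging_period=100000)

for dialog in dialogs:
    # Each dialog is a list of utterances; every prefix of length >= 2 is yielded as a separate sample.
    print(dialog)
    break
```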
--------------------------------------------------------------------------------
/dialogs_data_parsers/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | from dialogs_data_parsers.common import log_config
4 | 
5 | sys.excepthook = log_config.handle_unhandled_exception
6 | 
7 | __version__ = '0.0.1'
--------------------------------------------------------------------------------
/dialogs_data_parsers/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexeykarnachev/dialogs_data_parsers/64c86e27bb9af0ab9c9734d275eef7fdb9965b71/dialogs_data_parsers/common/__init__.py
--------------------------------------------------------------------------------
/dialogs_data_parsers/common/crawler.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import logging
3 | from functools import partial
4 | from typing import Optional
5 | 
6 | import aiohttp
7 | 
8 | _logger = logging.getLogger(__name__)
9 | 
10 | 
11 | class Crawler:
12 |     def __init__(self, concurrency, timeout, retries):
13 |         self._timeout = timeout
14 |         self._retries = retries
15 | 
16 |         self._semaphore = asyncio.BoundedSemaphore(concurrency)
17 | 
18 |     async def perform_request(self, url, headers=None, data=None, params=None, method='get') -> Optional[str]:
19 |         """Requests a page and returns content."""
20 |         _logger.debug(f'Requesting page: {url}')
21 |         i_retry = 0
22 |         async with self._get_session(headers=headers) as session:
23 |             while i_retry < self._retries:
24 |                 try:
25 |                     request_method = session.get if method == 'get' else partial(session.post, data=data)
26 |                     async with self._semaphore, request_method(url, allow_redirects=False, params=params) as response:
27 |                         text = await response.text()
28 |                         _logger.debug(f'Page source obtained: {url}')
29 |                         return text
30 |                 except asyncio.TimeoutError:
31 |                     i_retry += 1
32 |                     _logger.warning(f'Timeout for page [{i_retry}/{self._retries}]: {url}')
33 |             else:
34 |                 _logger.warning(f'Max number of retries exceeded for page: {url}')
35 |                 return None
36 | 
37 |     def _get_session(self, headers):
38 |         connector = aiohttp.TCPConnector()
39 |         timeout = aiohttp.ClientTimeout(total=self._timeout)
40 |         session = aiohttp.ClientSession(connector=connector, timeout=timeout, headers=headers)
41 | 
42 |         return session
--------------------------------------------------------------------------------
/dialogs_data_parsers/common/log_config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.config
3 | import pathlib
4 | import sys
5 | from typing import Dict
6 | 
7 | _LOGGER = logging.getLogger(__name__)
8 | _FORMATTER = '[%(asctime)s %(module)s %(funcName)s %(levelname)s] %(message)s'
9 | 
10 | 
11 | def prepare_logging(logs_dir, log_files_prefix=''):
12 |     """Configures logging."""
13 |     log_config = _get_log_config(logs_dir, log_files_prefix)
14 |     logging.config.dictConfig(log_config)
15 | 
16 | 
17 | def handle_unhandled_exception(exc_type, exc_value, exc_traceback):
18 |     """Handler for unhandled exceptions that will
write to the logs""" 19 | if issubclass(exc_type, KeyboardInterrupt): 20 | # call the default excepthook saved at __excepthook__ 21 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 22 | return 23 | _LOGGER.critical("Unhandled exception", exc_info=(exc_type, exc_value, exc_traceback)) 24 | 25 | 26 | def _get_rotating_file_handler(log_file: str, level: str, max_bytes: int = 10485760, backup_count: int = 5) -> Dict: 27 | handler_dict = { 28 | 'class': 'logging.handlers.RotatingFileHandler', 29 | 'level': level, 30 | 'formatter': 'default', 31 | 'filename': log_file, 32 | 'mode': 'a', 33 | 'maxBytes': max_bytes, 34 | 'backupCount': backup_count, 35 | } 36 | 37 | return handler_dict 38 | 39 | 40 | def _get_console_output_handler(level) -> Dict: 41 | handler_dict = { 42 | 'class': 'logging.StreamHandler', 43 | 'level': level, 44 | 'formatter': 'default', 45 | } 46 | 47 | return handler_dict 48 | 49 | 50 | def _get_log_config(log_dir, log_files_prefix) -> dict: 51 | log_dir = pathlib.Path(log_dir) 52 | 53 | log_dir.mkdir(exist_ok=True, parents=True) 54 | info_file = str(log_dir / f'{log_files_prefix}info.log') 55 | errors_file = str(log_dir / f'{log_files_prefix}errors.log') 56 | debug_file = str(log_dir / f'{log_files_prefix}debug.log') 57 | 58 | handlers = { 59 | 'info_file': _get_rotating_file_handler(info_file, 'INFO'), 60 | 'debug_file': _get_rotating_file_handler(debug_file, 'DEBUG'), 61 | 'errors_file': _get_rotating_file_handler(errors_file, 'ERROR'), 62 | 'console': _get_console_output_handler('INFO') 63 | } 64 | 65 | log_config = { 66 | 'disable_existing_loggers': False, 67 | 'version': 1, 68 | 'formatters': { 69 | 'default': { 70 | 'format': _FORMATTER 71 | } 72 | }, 73 | 'handlers': handlers, 74 | 'loggers': { 75 | '': { 76 | 'handlers': list(handlers.keys()), 77 | 'level': 'DEBUG' 78 | } 79 | } 80 | } 81 | 82 | return log_config 83 | -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeykarnachev/dialogs_data_parsers/64c86e27bb9af0ab9c9734d275eef7fdb9965b71/dialogs_data_parsers/flibusta/__init__.py -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/author_words_annotation_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import re 4 | from itertools import chain, cycle 5 | from pathlib import Path 6 | 7 | from dialogs_data_parsers.flibusta.dialogs_parser import DIALOG_SEPARATORS 8 | 9 | _PERSON_FLAG = 0 10 | _AUTHOR_FLAG = 1 11 | _PUNCT_FLAG = 2 12 | _DASH_FLAG = 3 13 | _SPLIT_FLAGS = (_PERSON_FLAG, _PUNCT_FLAG, _DASH_FLAG, _AUTHOR_FLAG, _PUNCT_FLAG, _DASH_FLAG) 14 | 15 | _DIALOGS_SEPARATORS_SET = set(DIALOG_SEPARATORS) 16 | _AUTHOR_WORDS_SEPARATOR_PATTERN = re.compile(f'([.,!?:;]+)(\s?[{DIALOG_SEPARATORS}])') 17 | _AUGMENT_PUNCT_CHOICES = list(set(chain(*[[symbol * i for i in range(0, 4)] for symbol in '.,!?:; ']))) 18 | _AUGMENT_DASH_CHOICES = list( 19 | set(chain(*[[' ' * i + symbol for i in range(0, 4)] for symbol in list(_DIALOGS_SEPARATORS_SET) + [' ']]))) 20 | 21 | 22 | class FlibustaAuthorWordsAnnotationGenerator: 23 | def __init__(self, raw_dialogs_file_path, out_file_path, n_samples, augment_p): 24 | self._raw_dialogs_file_path = raw_dialogs_file_path 25 | self._augment_p = augment_p 26 | self._out_file_path = Path(out_file_path) 27 | 
self._n_samples = int(n_samples) 28 | 29 | def run(self): 30 | utterances = self._iterate_on_utterances() 31 | n_samples_done = 0 32 | 33 | self._out_file_path.parent.mkdir(exist_ok=True, parents=True) 34 | 35 | with open(self._out_file_path, 'w') as out_file: 36 | for utterance in utterances: 37 | augmented_split_utterance_and_flags = self._generate_augmented_split_utterance_and_flags(utterance) 38 | if len(augmented_split_utterance_and_flags) == 0: 39 | continue 40 | payload = json.dumps(augmented_split_utterance_and_flags, ensure_ascii=False) 41 | out_file.write(payload) 42 | out_file.write('\n') 43 | n_samples_done += 1 44 | 45 | if n_samples_done == self._n_samples: 46 | break 47 | 48 | if n_samples_done % 10000 == 0: 49 | print(f'Samples: {n_samples_done}/{self._n_samples}') 50 | 51 | def _iterate_on_utterances(self): 52 | with open(self._raw_dialogs_file_path) as file: 53 | for line in file: 54 | dialog = json.loads(line) 55 | for utterance in dialog: 56 | yield utterance 57 | 58 | def _generate_augmented_split_utterance_and_flags(self, utterance): 59 | split_utterance = _AUTHOR_WORDS_SEPARATOR_PATTERN.split(utterance) 60 | split_utterance = [utterance for utterance in split_utterance if len(utterance)] 61 | 62 | augmented_split_utterance = [] 63 | augmented_split_utterance_flags = [] 64 | 65 | for sub_utterance, flag in zip(split_utterance, cycle(_SPLIT_FLAGS)): 66 | if flag != _PUNCT_FLAG and flag != _DASH_FLAG: 67 | augmented_split_utterance.append(sub_utterance) 68 | augmented_split_utterance_flags.append(flag) 69 | else: 70 | if random.random() <= self._augment_p: 71 | choices = _AUGMENT_PUNCT_CHOICES if flag == _PUNCT_FLAG else _AUGMENT_DASH_CHOICES 72 | sub_utterance = random.choice(choices) 73 | 74 | augmented_split_utterance[-1] += sub_utterance 75 | 76 | augmented_split_utterance_and_flags = list(zip(augmented_split_utterance, augmented_split_utterance_flags)) 77 | 78 | return augmented_split_utterance_and_flags 79 | -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/dialogs_iterator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | _logger = logging.getLogger(__name__) 5 | 6 | 7 | class FlibustaDialogsIterator: 8 | def __init__(self, file_path, logging_period): 9 | self._file_path = file_path 10 | self._logging_period = logging_period 11 | 12 | def __iter__(self): 13 | with open(self._file_path) as file: 14 | n_samples_done = 0 15 | for n_lines_done, raw_line in enumerate(file, start=1): 16 | if self._logging_period and n_lines_done % self._logging_period == 0: 17 | _logger.info(f'Flibusta lines: {n_lines_done}, samples: {n_samples_done}') 18 | 19 | dialog = json.loads(raw_line) 20 | 21 | for n_utterances in range(2, len(dialog) + 1): 22 | subdialog = dialog[:n_utterances] 23 | n_samples_done += 1 24 | yield subdialog 25 | -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/dialogs_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import multiprocessing 4 | import re 5 | import unicodedata 6 | from pathlib import Path 7 | from zipfile import BadZipFile, ZipFile 8 | 9 | import bs4 10 | from more_itertools import chunked 11 | 12 | _logger = logging.getLogger(__name__) 13 | logging.getLogger("filelock").setLevel(logging.WARNING) 14 | 15 | DIALOG_SEPARATORS = '-‐‑‒–—―₋−⸺⸻﹘﹣-' 16 | 17 | 18 | class 
FlibustaDialogsParser: 19 | _MIN_N_UTTERANCES = 2 20 | _ARCHIVE_PATTERN = re.compile(r'.*fb2-.+\.zip$') 21 | 22 | _DIALOGS_CHUNK_WRITE_SIZE = 1000 23 | 24 | def __init__(self, flibusta_archives_dir, out_file_path): 25 | self._flibusta_archives_dir = flibusta_archives_dir 26 | self._out_file_path = Path(out_file_path) 27 | self._out_file_path.parent.mkdir(exist_ok=True, parents=True) 28 | if self._out_file_path.is_file(): 29 | self._out_file_path.unlink() 30 | 31 | manager = multiprocessing.Manager() 32 | self._archives_counter = manager.Value('i', 0) 33 | self._dialogs_counter = manager.Value('i', 0) 34 | self._out_file_lock = manager.Lock() 35 | self._archive_paths = list(self._iterate_on_archive_paths()) 36 | 37 | def run(self): 38 | with multiprocessing.Pool() as pool: 39 | pool.map(self._parse_archive, self._archive_paths) 40 | 41 | def _iterate_on_archive_paths(self): 42 | for path in Path(self._flibusta_archives_dir).iterdir(): 43 | if self._ARCHIVE_PATTERN.match(path.name): 44 | yield path 45 | 46 | def _parse_archive(self, archive_path): 47 | dialogs = self._iterate_on_dialogs(archive_path) 48 | 49 | for dialogs_chunk in chunked(dialogs, n=self._DIALOGS_CHUNK_WRITE_SIZE): 50 | payloads = [] 51 | for dialog in dialogs_chunk: 52 | payload = json.dumps(dialog, ensure_ascii=False) 53 | payloads.append(payload) 54 | 55 | chunk_payload = '\n'.join(payloads) 56 | 57 | self._out_file_lock.acquire() 58 | with open(self._out_file_path, 'a') as out_file: 59 | out_file.write(chunk_payload) 60 | out_file.write('\n') 61 | out_file.flush() 62 | self._out_file_lock.release() 63 | 64 | self._dialogs_counter.value += len(dialogs_chunk) 65 | _logger.info(f'Archives: {self._archives_counter.value}/{len(self._archive_paths)}, ' 66 | f'Dialogs: {self._dialogs_counter.value}') 67 | 68 | self._archives_counter.value += 1 69 | 70 | def _iterate_on_dialogs(self, archive_path): 71 | book_texts = self._iterate_on_book_texts(archive_path) 72 | dialog_separators_set = set(DIALOG_SEPARATORS) 73 | 74 | for book_text in book_texts: 75 | book_text_lines = re.split('\n+', book_text) 76 | dialog = [] 77 | 78 | for line in book_text_lines: 79 | line = line.strip() 80 | 81 | if len(line) > 2 and line[0] in dialog_separators_set: 82 | line = unicodedata.normalize("NFKC", line) 83 | line = re.sub(r'^\W+', '', line) 84 | dialog.append(line) 85 | else: 86 | if len(dialog) >= self._MIN_N_UTTERANCES: 87 | yield dialog 88 | 89 | dialog = [] 90 | 91 | if len(dialog) >= self._MIN_N_UTTERANCES: 92 | yield dialog 93 | 94 | def _iterate_on_book_texts(self, archive_path): 95 | try: 96 | with ZipFile(archive_path, 'r') as zip_file: 97 | for file_name in zip_file.namelist(): 98 | raw_fb2_text = zip_file.read(file_name) 99 | book_soup = bs4.BeautifulSoup(raw_fb2_text, features="html.parser") 100 | lang_tag = book_soup.find('lang') 101 | 102 | if lang_tag and lang_tag.text.lower().strip() == 'ru': 103 | book_text = book_soup.text 104 | yield book_text 105 | except BadZipFile: 106 | _logger.warning(f'Bad zip file: {archive_path}') 107 | -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeykarnachev/dialogs_data_parsers/64c86e27bb9af0ab9c9734d275eef7fdb9965b71/dialogs_data_parsers/pikabu/__init__.py -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/dialogs_iterator.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import re 4 | from typing import Optional 5 | 6 | from treelib import Tree 7 | 8 | from dialogs_data_parsers.utils import iterate_on_parts_by_condition 9 | 10 | _logger = logging.getLogger(__name__) 11 | 12 | 13 | class PikabuDialogsWithMetaIterator: 14 | def __init__(self, file_path, max_n_words_per_utterance, logging_period=10000): 15 | self._file_path = file_path 16 | self._logging_period = logging_period 17 | self._max_n_words_per_utterance = max_n_words_per_utterance 18 | 19 | def __iter__(self): 20 | with open(self._file_path) as file: 21 | n_samples_done = 0 22 | for n_lines_done, raw_line in enumerate(file, start=1): 23 | if self._logging_period and n_lines_done % self._logging_period == 0: 24 | _logger.info(f'Pikabu lines: {n_lines_done}, samples: {n_samples_done}') 25 | 26 | line_data = json.loads(raw_line) 27 | dialog_tree = self._get_dialog_tree(line_data) 28 | dialogs = _iterate_on_dialogs_from_tree(dialog_tree) 29 | dialogs = set(dialogs) 30 | 31 | subdialogs = set() 32 | 33 | for dialog in dialogs: 34 | dialog = tuple(dialog) 35 | for n_utterances in range(2, len(dialog) + 1): 36 | subdialog = tuple(dialog[:n_utterances]) 37 | subdialogs.add(_Dialog(subdialog)) 38 | 39 | n_samples_done += len(subdialogs) 40 | for subdialog in subdialogs: 41 | yield tuple(subdialog) 42 | 43 | def _get_dialog_tree(self, line_data): 44 | tree = Tree() 45 | tree.create_node(identifier=0) 46 | comments = line_data['comments'] 47 | if comments: 48 | ids_and_comments = ((int(id_), comment) for id_, comment in comments.items()) 49 | ids_and_comments = sorted(ids_and_comments, key=lambda x: x[0]) 50 | 51 | for id_, comment_json in ids_and_comments: 52 | parent_id = int(comment_json['parent_id']) 53 | comment = comment_json['text'] 54 | comment = comment.replace('\n', ' ') 55 | comment_text = self._process_comment(comment) 56 | if comment_text: 57 | meta = comment_json.get('meta') 58 | data = {'text': comment_text, 'meta': meta} 59 | else: 60 | data = None 61 | 62 | tree.create_node(identifier=id_, parent=parent_id, data=data) 63 | 64 | return tree 65 | 66 | def _process_comment(self, text) -> Optional[str]: 67 | if not text: 68 | return None 69 | 70 | if '@' in text or 'http' in text: 71 | return None 72 | 73 | n_words = len(re.findall(r'\w+', text)) 74 | if n_words > self._max_n_words_per_utterance: 75 | return None 76 | 77 | text = text.strip() 78 | if text.startswith('Комментарий удален.'): 79 | return None 80 | 81 | return text 82 | 83 | 84 | class PikabuDialogsIterator(PikabuDialogsWithMetaIterator): 85 | def __init__(self, file_path, max_n_words_per_utterance, logging_period=10000): 86 | super().__init__(file_path, max_n_words_per_utterance=max_n_words_per_utterance, logging_period=logging_period) 87 | 88 | def __iter__(self): 89 | for subdialog in super().__iter__(): 90 | utterances = [utterance['text'] for utterance in subdialog] 91 | yield utterances 92 | 93 | 94 | _UPVOTE_DOWNVOTE_REGEX = re.compile(r'av=(\d+):(\d+)') 95 | _MIN_N_VOTES = 30 96 | _LABEL_THRESHOLD = 0.8 97 | UNK_RATING_LABEL = 0 98 | BALANCED_RATING_LABEL = 1 99 | HIGH_RATING_LABEL = 2 100 | LOW_RATING_LABEL = 3 101 | 102 | 103 | class PikabuDialogsWithResponseRatingIterator(PikabuDialogsWithMetaIterator): 104 | def __init__(self, file_path, max_n_words_per_utterance, logging_period=10000): 105 | super().__init__(file_path, max_n_words_per_utterance=max_n_words_per_utterance, logging_period=logging_period) 
106 | 107 | def __iter__(self): 108 | for subdialog in super().__iter__(): 109 | utterances = [utterance['text'] for utterance in subdialog] 110 | response_meta = subdialog[-1]['meta'] 111 | response_rating_label = _get_rating_from_meta(response_meta) 112 | if response_rating_label is not None: 113 | yield {'dialog': utterances, 'label': response_rating_label} 114 | 115 | 116 | def _get_rating_from_meta(response_meta): 117 | upvote_downvote = _UPVOTE_DOWNVOTE_REGEX.findall(response_meta) 118 | label = UNK_RATING_LABEL 119 | if len(upvote_downvote) == 1: 120 | upvote_downvote = upvote_downvote.pop() 121 | upvote, downvote = map(int, upvote_downvote) 122 | n_votes = upvote + downvote 123 | if n_votes >= _MIN_N_VOTES: 124 | upvote_ratio = upvote / n_votes 125 | if upvote_ratio > _LABEL_THRESHOLD: 126 | label = HIGH_RATING_LABEL 127 | elif (1 - upvote_ratio) > _LABEL_THRESHOLD: 128 | label = LOW_RATING_LABEL 129 | else: 130 | label = BALANCED_RATING_LABEL 131 | 132 | return label 133 | 134 | 135 | class _Dialog: 136 | def __init__(self, utterance_dicts): 137 | self._utterance_dicts = utterance_dicts 138 | 139 | def __iter__(self): 140 | yield from self._utterance_dicts 141 | 142 | def __hash__(self): 143 | return hash(tuple(d['text'] for d in self)) 144 | 145 | def __eq__(self, other): 146 | return hash(self) == hash(other) 147 | 148 | 149 | def _iterate_on_dialogs_from_tree(dialog_tree: Tree): 150 | for path in dialog_tree.paths_to_leaves(): 151 | path = path[1:] # Skip dummy root node 152 | dialog = [dialog_tree[p].data for p in path] 153 | 154 | # Split dialog on parts by empty utterance: 155 | dialogs = iterate_on_parts_by_condition(dialog, lambda utterance: not utterance) 156 | 157 | for dialog in dialogs: 158 | yield _Dialog(dialog) 159 | -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/story_crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | import json 4 | import logging 5 | import re 6 | from pathlib import Path 7 | 8 | import aiofiles 9 | import bs4 10 | from more_itertools import chunked 11 | 12 | from dialogs_data_parsers.common.crawler import Crawler 13 | 14 | _logger = logging.getLogger(__name__) 15 | _GET_COMMENTS_URL = 'https://pikabu.ru/ajax/comments_actions.php' 16 | _URLS_CHUNK_SIZE = 1000 17 | 18 | 19 | def iterate_on_urls(story_links_dir): 20 | for story_links_file in Path(story_links_dir).iterdir(): 21 | with open(story_links_file) as file: 22 | for url in file: 23 | yield url.strip() 24 | 25 | 26 | class PikabuStoryCrawler(Crawler): 27 | def __init__(self, concurrency, timeout, retries, story_links, out_file_path): 28 | super().__init__(concurrency=concurrency, timeout=timeout, retries=retries) 29 | 30 | self._out_file_path = out_file_path 31 | self._parsed_urls = self._get_parsed_urls() 32 | self._all_urls = set(story_links) 33 | self._urls_to_parse = self._all_urls.difference(self._parsed_urls) 34 | self._n_urls_to_parse = len(self._urls_to_parse) 35 | 36 | @classmethod 37 | def from_story_links_dir(cls, concurrency, timeout, retries, story_links_dir, out_file_path): 38 | story_links = iterate_on_urls(story_links_dir) 39 | return cls( 40 | concurrency=concurrency, 41 | timeout=timeout, 42 | retries=retries, 43 | story_links=story_links, 44 | out_file_path=out_file_path) 45 | 46 | async def run(self): 47 | for urls_chunk in chunked(self._urls_to_parse, n=_URLS_CHUNK_SIZE): 48 | coroutines = [self._crawl(url) for url in urls_chunk] 
49 | await asyncio.gather(*coroutines) 50 | 51 | def _get_parsed_urls(self): 52 | urls = set() 53 | if Path(self._out_file_path).is_file(): 54 | with open(self._out_file_path) as file: 55 | for line in file: 56 | url = line.split(',', maxsplit=1)[0].split('"')[-2] 57 | assert url.startswith('https') 58 | urls.add(url) 59 | 60 | return urls 61 | 62 | async def _crawl(self, url): 63 | _logger.debug(f'Crawling story: {url}') 64 | try: 65 | result = await self._get_story_and_comments(url=url) 66 | except Exception: 67 | _logger.exception(f'Failed to crawl story: {url}') 68 | result = None 69 | 70 | if result is None: 71 | _logger.debug(f'Result is None for story: {url}') 72 | return 73 | 74 | async with aiofiles.open(self._out_file_path, "a") as f: 75 | result_str = json.dumps(result, ensure_ascii=False) 76 | await f.write(result_str + '\n') 77 | await f.flush() 78 | _logger.debug(f'Story crawled and saved: {url}') 79 | 80 | async def _get_story_and_comments(self, url): 81 | story_id = url.split('_')[-1] 82 | story_html = await self.perform_request(url, headers=_get_headers(), method='get') 83 | 84 | if not story_html: 85 | return None 86 | 87 | story_soup = bs4.BeautifulSoup(story_html, features="html.parser") 88 | 89 | # Page not exists (deleted) 90 | if story_soup.find('div', {'class': 'app-404'}): 91 | _logger.debug(f'404 for story: {url}') 92 | return {'url': url, 'story': None, 'comments': []} 93 | 94 | story = _parse_story_soup(story_soup) 95 | 96 | parser = _CommentsParser() 97 | start_comment_id = 0 98 | prev_n_comments_parsed = None 99 | while prev_n_comments_parsed != parser.n_comments_parsed: 100 | data = _get_payload_data(story_id, start_comment_id) 101 | headers = _get_headers(url) 102 | result = await self.perform_request(_GET_COMMENTS_URL, headers=headers, data=data, method='post') 103 | 104 | _logger.debug(f'Parsing result for story: {url}') 105 | 106 | result_data = json.loads(result)['data'] 107 | start_comment_id = result_data['last_id'] 108 | prev_n_comments_parsed = parser.n_comments_parsed 109 | 110 | for comment_data in result_data['comments']: 111 | comment_soup = bs4.BeautifulSoup(comment_data['html'], features="html.parser") 112 | parser.parse_comment_and_children(comment_soup) 113 | 114 | _logger.debug(f'{parser.n_comments_parsed} comments parsed: {url}') 115 | 116 | self._n_urls_to_parse -= 1 117 | _logger.info(f'{url} Comments: {parser.n_comments_parsed}, Left: {self._n_urls_to_parse}') 118 | 119 | result = {'url': url, 'story': story, 'comments': parser.id_to_comment} 120 | return result 121 | 122 | 123 | class _CommentsParser: 124 | def __init__(self): 125 | self._id_to_comment = {} 126 | 127 | @property 128 | def n_comments_parsed(self): 129 | return len(self._id_to_comment) 130 | 131 | @property 132 | def id_to_comment(self): 133 | id_to_comment = copy.deepcopy(self._id_to_comment) 134 | for comment in id_to_comment.values(): 135 | comment['children'] = sorted(list(comment['children'])) 136 | 137 | return id_to_comment 138 | 139 | def parse_comment_and_children(self, comment_soup): 140 | comment_soups = comment_soup.find_all('div', {'class': 'comment'}) 141 | 142 | for comment_soup in comment_soups: 143 | comment = _parse_comment_soup(comment_soup) 144 | self._id_to_comment[comment['id']] = comment 145 | parent_id = comment['parent_id'] 146 | if parent_id != 0: 147 | self._id_to_comment[parent_id]['children'].add(comment['id']) 148 | 149 | 150 | def _parse_comment_soup(soup): 151 | body = soup.find('div', {'class': 'comment__body'}) 152 | meta = 
soup['data-meta'] 153 | 154 | id_ = int(_get_meta_tag(meta, r'^id=(\d+),', raise_if_not_found=True)) 155 | pid = int(_get_meta_tag(meta, r',pid=(\d+),', raise_if_not_found=True)) 156 | date = _get_meta_tag(meta, r',d=(.+),de=') 157 | rating = int(_get_meta_tag(meta, r'r=(\d+),', default=0)) 158 | 159 | comment = { 160 | 'user_nick': body.find('span', { 161 | 'class': 'user__nick' 162 | }).get_text(' '), 163 | 'text': body.find('div', { 164 | 'class': 'comment__content' 165 | }).get_text('\n'), 166 | 'id': id_, 167 | 'parent_id': pid, 168 | 'date': date, 169 | 'rating': rating, 170 | 'children': set() 171 | } 172 | return comment 173 | 174 | 175 | def _get_meta_tag(meta, regex, default=None, raise_if_not_found=False): 176 | match = re.search(regex, meta) 177 | if not match and raise_if_not_found: 178 | raise ValueError(f"Can't find regex: {regex} in meta: {meta}") 179 | elif not match: 180 | return default 181 | else: 182 | return match.group(1) 183 | 184 | 185 | def _get_headers(referer=None): 186 | headers = { 187 | 'authority': 'pikabu.ru', 188 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 189 | 'Chrome/86.0.4240.198 Safari/537.36', 190 | 'origin': 'https://pikabu.ru' 191 | } 192 | if referer: 193 | headers['referer'] = referer 194 | return headers 195 | 196 | 197 | def _get_payload_data(story_id, start_comment_id): 198 | return {'action': 'get_story_comments', 'story_id': story_id, 'start_comment_id': start_comment_id} 199 | 200 | 201 | def _get_element_text(elem, separator='', default=None): 202 | return elem.get_text(separator) if elem else default 203 | 204 | 205 | def _parse_story_soup(soup): 206 | story_main = soup.find('div', {'class': 'story__main'}) 207 | 208 | title = _get_element_text(story_main.find('span', {'class': 'story__title-link'}), ' ') 209 | text = _get_element_text(story_main.find('div', {'class': 'story-block story-block_type_text'}), '\n') 210 | user_nick = _get_element_text(story_main.find('a', {'class': 'user__nick'}), ' ') 211 | tags = sorted(set(_get_element_text(tag, ' ') for tag in story_main.find_all('a', {'class': 'tags__tag'}))) 212 | shares = int(_get_element_text(story_main.find('span', {'class': 'story__share-count'}), default=0)) 213 | saves = int(_get_element_text(story_main.find('span', {'class': 'story__save-count'}), default=0)) 214 | rating = int(_get_element_text(story_main.find('span', {'class': 'story__rating-count'}), default=0)) 215 | 216 | comments_count = _get_element_text(story_main.find('span', {'class': 'story__comments-link-count'}), default='0') 217 | comments_count = int(re.findall(r'\d+', comments_count)[0]) 218 | 219 | time_ = story_main.find('time') or dict() 220 | timestamp = time_.get('datetime') 221 | 222 | res = { 223 | "title": title, 224 | "text": text, 225 | "user_nick": user_nick, 226 | "tags": tags, 227 | "comments_count": comments_count, 228 | "shares": shares, 229 | "saves": saves, 230 | "timestamp": timestamp, 231 | "rating": rating 232 | } 233 | 234 | return res 235 | -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/story_links_crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import datetime 3 | import json 4 | import logging 5 | import os 6 | from pathlib import Path 7 | 8 | import aiofiles 9 | import bs4 10 | from more_itertools import chunked 11 | 12 | from dialogs_data_parsers.common.crawler import Crawler 13 | 14 | _logger = 
logging.getLogger(__name__) 15 | _DAYS_CHUNK_SIZE = 30 16 | 17 | 18 | def _get_days_range(start_day, end_day): 19 | start = datetime.datetime.strptime(start_day, "%d-%m-%Y") 20 | end = datetime.datetime.strptime(end_day, "%d-%m-%Y") 21 | dates_generated = [start + datetime.timedelta(days=x) for x in range(0, (end - start).days + 1)] 22 | days = [date.strftime("%d-%m-%Y") for date in dates_generated] 23 | return days 24 | 25 | 26 | class PikabuStoryLinksCrawler(Crawler): 27 | def __init__(self, concurrency, timeout, retries, out_dir, start_day, end_day, pikabu_section): 28 | super().__init__(concurrency=concurrency, timeout=timeout, retries=retries) 29 | 30 | self._out_dir = out_dir 31 | self._start_day = start_day 32 | self._end_day = end_day 33 | self._pikabu_section = pikabu_section 34 | self._n_total_links = 0 35 | 36 | async def run(self): 37 | Path(self._out_dir).mkdir(exist_ok=True, parents=True) 38 | days_range = _get_days_range(self._start_day, self._end_day) 39 | parsed_days = [path.name for path in Path(self._out_dir).iterdir()] 40 | days_range = set(days_range) - set(parsed_days) 41 | 42 | for days_chunk in chunked(days_range, n=_DAYS_CHUNK_SIZE): 43 | coroutines = [self._crawl(day) for day in days_chunk] 44 | await asyncio.gather(*coroutines) 45 | 46 | async def _crawl(self, day): 47 | links = await self._get_story_links(day=day) 48 | out_file_path = os.path.join(self._out_dir, day) 49 | async with aiofiles.open(out_file_path, "w") as f: 50 | await f.write('\n'.join(links)) 51 | await f.flush() 52 | 53 | async def _get_story_links(self, day): 54 | current_page_id = 1 55 | links = set() 56 | headers = _get_headers(day=day, pikabu_section=self._pikabu_section) 57 | url = _get_url(day=day, pikabu_section=self._pikabu_section) 58 | 59 | while True: 60 | old_number_of_links = len(links) 61 | params = _get_params(page_number=current_page_id) 62 | response_text = await self.perform_request(url=url, headers=headers, params=params, method='get') 63 | stories = json.loads(response_text)['data']['stories'] 64 | story_soups = [bs4.BeautifulSoup(s['html'], features="html.parser") for s in stories] 65 | 66 | for story_soup in story_soups: 67 | link_element = story_soup.find('a', {'class': 'story__title-link'}) 68 | if link_element is not None: 69 | href = link_element.get('href') 70 | if href: 71 | links.add(href) 72 | 73 | new_number_of_links = len(links) 74 | 75 | if old_number_of_links < new_number_of_links: 76 | _logger.debug(f'Day: {day}, links obtained: {new_number_of_links}, pages scrolled: {current_page_id}') 77 | current_page_id += 1 78 | else: 79 | break 80 | 81 | self._n_total_links += len(links) 82 | _logger.info(f'Day: {day} done, total number of links: {self._n_total_links}') 83 | 84 | return links 85 | 86 | 87 | def _get_headers(day, pikabu_section): 88 | headers = {'referer': f'https://pikabu.ru/{pikabu_section}/{day}'} 89 | return headers 90 | 91 | 92 | def _get_params(page_number): 93 | params = (('twitmode', '1'), ('of', 'v2'), ('page', f'{page_number}'), ('_', '1574097199724')) 94 | return params 95 | 96 | 97 | def _get_url(day, pikabu_section): 98 | url = f'https://pikabu.ru/{pikabu_section}/{day}' 99 | return url 100 | -------------------------------------------------------------------------------- /dialogs_data_parsers/utils.py: -------------------------------------------------------------------------------- 1 | def iterate_on_parts_by_condition(iterable, condition): 2 | cur_chunk = [] 3 | for elem in iterable: 4 | if not condition(elem): 5 | cur_chunk.append(elem) 6 | 
else: 7 | yield cur_chunk 8 | cur_chunk = [] 9 | 10 | if cur_chunk: 11 | yield cur_chunk 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | more_itertools==8.8.0 2 | beautifulsoup4==4.9.3 3 | aiohttp==3.7.4 4 | treelib==1.6.1 5 | aiofiles==0.7.0 6 | tqdm==4.62.1 7 | -------------------------------------------------------------------------------- /scripts/annotate_flibusta_raw_dialogs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from dialogs_data_parsers.flibusta.author_words_annotation_generator import FlibustaAuthorWordsAnnotationGenerator 4 | 5 | 6 | def _parse_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--raw_dialogs_file_path', type=str, required=True) 9 | parser.add_argument('--out_file_path', type=str, required=True) 10 | parser.add_argument('--n_samples', type=int, required=True) 11 | parser.add_argument('--augment_p', type=float, required=False, default=0.3) 12 | 13 | args = parser.parse_args() 14 | return args 15 | 16 | 17 | def main(): 18 | args = _parse_args() 19 | samples_generator = FlibustaAuthorWordsAnnotationGenerator( 20 | raw_dialogs_file_path=args.raw_dialogs_file_path, 21 | out_file_path=args.out_file_path, 22 | n_samples=args.n_samples, 23 | augment_p=args.augment_p) 24 | 25 | samples_generator.run() 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /scripts/crawl_pikabu_stories.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import os 4 | 5 | from dialogs_data_parsers.common.log_config import prepare_logging 6 | from dialogs_data_parsers.pikabu.story_crawler import PikabuStoryCrawler 7 | 8 | 9 | def _parse_args(): 10 | parser = argparse.ArgumentParser(description='Crawls pikabu stories and stores them in one jsonl file.') 11 | parser.add_argument( 12 | '--root_dir', 13 | type=str, 14 | required=True, 15 | help='Path to the root pikabu results directory. 
Sub-directory with links will be created there.') 16 | parser.add_argument('--concurrency', type=int, required=False, default=12, help='Number of concurrent requests.') 17 | parser.add_argument('--timeout', type=int, required=False, default=10, help='Timeout in seconds.') 18 | parser.add_argument('--retries', type=int, required=False, default=5, help='Number of request retries.') 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | 24 | def main(): 25 | args = _parse_args() 26 | 27 | out_file_path = os.path.join(args.root_dir, 'stories.jsonl') 28 | story_links_dir = os.path.join(args.root_dir, 'story_links') 29 | logs_dir = os.path.join(args.root_dir, 'logs') 30 | prepare_logging(logs_dir, log_files_prefix='stories_') 31 | 32 | crawler = PikabuStoryCrawler.from_story_links_dir( 33 | concurrency=args.concurrency, 34 | timeout=args.timeout, 35 | retries=args.retries, 36 | story_links_dir=story_links_dir, 37 | out_file_path=out_file_path) 38 | 39 | loop = asyncio.get_event_loop() 40 | loop.run_until_complete(crawler.run()) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /scripts/crawl_pikabu_story_links.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import datetime 4 | import os 5 | 6 | from dialogs_data_parsers.common.log_config import prepare_logging 7 | from dialogs_data_parsers.pikabu.story_links_crawler import PikabuStoryLinksCrawler 8 | 9 | 10 | def _parse_args(): 11 | parser = argparse.ArgumentParser(description='Crawls pikabu story links and stores them in day-separated files.') 12 | parser.add_argument( 13 | '--root_dir', 14 | type=str, 15 | required=True, 16 | help='Path to the root pikabu results directory. 
Sub-directory with links will be created there.')
17 |     parser.add_argument(
18 |         '--start_day', type=str, required=False, default='01-09-2010', help='Stories to crawl start day (%%d-%%m-%%Y).')
19 |     parser.add_argument(
20 |         '--end_day',
21 |         type=str,
22 |         required=False,
23 |         default=_get_default_end_day(),
24 |         help='Stories to crawl end day (%%d-%%m-%%Y).')
25 |     parser.add_argument('--concurrency', type=int, required=False, default=12, help='Number of concurrent requests.')
26 |     parser.add_argument('--timeout', type=int, required=False, default=10, help='Timeout in seconds.')
27 |     parser.add_argument('--retries', type=int, required=False, default=5, help='Number of request retries.')
28 |     parser.add_argument('--pikabu_section', type=str, required=False, default='best', help='Pikabu section to crawl.')
29 | 
30 |     args = parser.parse_args()
31 |     return args
32 | 
33 | 
34 | def _get_default_end_day():
35 |     date = datetime.datetime.now().date() - datetime.timedelta(days=1)
36 |     return date.strftime("%d-%m-%Y")
37 | 
38 | 
39 | def main():
40 |     args = _parse_args()
41 | 
42 |     out_dir = os.path.join(args.root_dir, 'story_links')
43 |     logs_dir = os.path.join(args.root_dir, 'logs')
44 |     prepare_logging(logs_dir, log_files_prefix='story_links_')
45 |     crawler = PikabuStoryLinksCrawler(
46 |         concurrency=args.concurrency,
47 |         timeout=args.timeout,
48 |         retries=args.retries,
49 |         out_dir=out_dir,
50 |         start_day=args.start_day,
51 |         end_day=args.end_day,
52 |         pikabu_section=args.pikabu_section)
53 | 
54 |     loop = asyncio.get_event_loop()
55 |     loop.run_until_complete(crawler.run())
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     main()
60 | 
--------------------------------------------------------------------------------
/scripts/prepare_flibusta_raw_dialogs.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from dialogs_data_parsers.common.log_config import prepare_logging
4 | from dialogs_data_parsers.flibusta.dialogs_parser import FlibustaDialogsParser
5 | 
6 | 
7 | def _parse_args():
8 |     parser = argparse.ArgumentParser(description='Parses flibusta dialogs from flibusta fb2 archives.')
9 |     parser.add_argument(
10 |         '--flibusta_archives_dir',
11 |         type=str,
12 |         required=True,
13 |         help='Path to the dir with flibusta zip archives.
Each archive contains fb2 files.') 14 | parser.add_argument('--out_file_path', type=str, required=True, help='Path to the output dialogs file.') 15 | parser.add_argument('--logs_dir', type=str, required=True, help='Path to the logs directory.') 16 | 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def main(): 22 | args = _parse_args() 23 | prepare_logging(args.logs_dir) 24 | parser = FlibustaDialogsParser(args.flibusta_archives_dir, args.out_file_path) 25 | parser.run() 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from setuptools import find_packages, setup 4 | 5 | _THIS_DIR = pathlib.Path(__file__).parent 6 | 7 | 8 | def _get_requirements(): 9 | with (_THIS_DIR / 'requirements.txt').open() as fp: 10 | return fp.read() 11 | 12 | 13 | setup( 14 | name='dialogs_data_parsers', 15 | version='0.0.1', 16 | install_requires=_get_requirements(), 17 | package_dir={'dialogs_data_parsers': 'dialogs_data_parsers'}, 18 | packages=find_packages(exclude=['tests', 'tests.*'])) 19 | --------------------------------------------------------------------------------