├── .gitignore
├── .isort.cfg
├── .style.yapf
├── README.md
├── dialogs_data_parsers
│   ├── __init__.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   └── log_config.py
│   ├── flibusta
│   │   ├── __init__.py
│   │   ├── author_words_annotation_generator.py
│   │   ├── dialogs_iterator.py
│   │   └── dialogs_parser.py
│   ├── pikabu
│   │   ├── __init__.py
│   │   ├── dialogs_iterator.py
│   │   ├── story_crawler.py
│   │   └── story_links_crawler.py
│   └── utils.py
├── requirements.txt
├── scripts
│   ├── annotate_flibusta_raw_dialogs.py
│   ├── crawl_pikabu_stories.py
│   ├── crawl_pikabu_story_links.py
│   └── prepare_flibusta_raw_dialogs.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # idea
107 | /.idea/
108 | 
109 | # custom
110 | /data/
111 | .vscode
112 | /logs/
113 | 
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | line_length=120
3 | multi_line_output=2
4 | balanced_wrapping=True
--------------------------------------------------------------------------------
/.style.yapf:
--------------------------------------------------------------------------------
1 | [style]
2 | BASED_ON_STYLE = google
3 | 
4 | COLUMN_LIMIT = 120
5 | SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = true
6 | SPACES_AROUND_POWER_OPERATOR = true
7 | INDENT_DICTIONARY_VALUE = true
8 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dialogs Data Parsers
2 | A repository for parsing dialog data.
3 | **For research purposes only!**
4 | 
5 | ```shell script
6 | pip install -U -e .
7 | ```
8 | 
9 | ## Pikabu
10 | #### Parsing
11 | To collect and parse data from [pikabu](https://pikabu.ru), first gather the links to the stories:
12 | ```shell script
13 | python scripts/crawl_pikabu_story_links.py --root_dir path/to/output/dir
14 | ```
15 | - *--root_dir* - Path to the root directory where the parsing results will be stored.
16 | 
17 | The remaining arguments can be found in the script (they all have default values).
18 | 
19 | Once the story links have been collected, the story parser can be launched:
20 | ```shell script
21 | python scripts/crawl_pikabu_stories.py --root_dir path/to/output/dir
22 | ```
23 | - *--root_dir* - Path to the root directory that contains the story links (the same path that was passed to the
24 | previous script).
25 | 
26 | Parsing all of pikabu takes roughly a week. The script is resilient to interruptions: if parsing stops for any
27 | reason, it can be restarted with the same *--root_dir*, and it will continue from
28 | the point where it was interrupted.
29 | 
30 | #### Data format
31 | The result of pikabu parsing is a jsonl file. Each line is a separate JSON object with the following structure
32 | (the example is shown with indentation, but in the actual file each JSON object is written on a single line):
33 | ```json
34 | {
35 |   "url": "story url",
36 |   "story": {
37 |     "title": "story title",
38 |     "text": "story text",
39 |     "user_nick": "story author",
40 |     "tags": ["tag1", "tag2", "tag3"],
41 |     "comments_count": 42,
42 |     "shares": 10,
43 |     "saves": 228,
44 |     "timestamp": "2017-04-21T11:38:56+03:00",
45 |     "rating": 5
46 |   },
47 |   "comments": {
48 |     "8811": {
49 |       "text": "comment text",
50 |       "parent_id": 0,
51 |       "children": [8812, 8813]
52 |     },
53 |     "8812": {
54 |       "text": "comment text",
55 |       "parent_id": 8811,
56 |       "children": [8814, 8815, 8816]
57 |     }
58 |   }
59 | }
60 | ```
61 | A few important points:
62 | - The keys of the `comments` dictionary are comment ids. As keys they are strings, but the ids stored in the
63 | `parent_id` and `children` fields are integers. Keep this in mind when parsing the file.
64 | - If a comment has `parent_id` equal to 0, the comment has no parent (in this case the story itself can be
65 | treated as the parent).
66 | - Comments are stored as a tree. This format can be unfolded into dialogs. An example parser lives in
67 | `dialogs_data_parsers/pikabu/dialogs_iterator.py` (see the snippet below).
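For instance, a minimal sketch of reading the crawled `stories.jsonl` with that iterator (the file path and the `max_n_words_per_utterance` threshold here are placeholder values):

```python
from dialogs_data_parsers.pikabu.dialogs_iterator import PikabuDialogsIterator

# stories.jsonl is the file produced by scripts/crawl_pikabu_stories.py (placeholder path).
dialogs = PikabuDialogsIterator('path/to/output/dir/stories.jsonl', max_n_words_per_utterance=100)

for dialog in dialogs:
    # Each dialog is a list of utterance strings; every sub-dialog of two or more
    # utterances found in a comment tree is yielded as a separate sample.
    print(dialog)
    break
```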
68 | 
69 | ## Flibusta
70 | #### Parsing
71 | To parse dialogs from flibusta books you need a dump of the books. Naturally, I will not be attaching a link to
72 | the dump here :)
73 | 
74 | The dump is a directory with a large number of .zip archives with names like `f.fb2-188228-190111.zip`.
75 | Each archive in turn contains many files in the `fb2` format.
76 | 
77 | Dialogs are parsed with the following script:
78 | ```shell script
79 | python scripts/prepare_flibusta_raw_dialogs.py --flibusta_archives_dir path/to/flibusta/dir/with/archives --out_file_path path/to/dialogs/results/file --logs_dir path/to/logs/dir
80 | ```
81 | - *--flibusta_archives_dir* - Path to the root directory with all the archives;
82 | - *--out_file_path* - Path to the output jsonl file with the dialogs;
83 | - *--logs_dir* - Path to the directory where the logs will be written.
84 | 
85 | Parsing 130 archives takes about 13 hours and yields roughly 40-50 million dialogs. The parser already spreads the
86 | archives across a `multiprocessing.Pool`, so the wall-clock time depends on the number of available cores.
87 | 
88 | #### Data format
89 | The result of flibusta parsing is a jsonl file. Each line of the file is simply a list of messages:
90 | ```json
91 | ["Привет, как дела?", "Нормально", "Ясно, понятно"]
92 | ```
93 | 
94 | In theory, the author's words should already be filtered out of this data, but they may still slip through occasionally.
95 | Other anomalies are possible as well, although a quick manual inspection of a couple hundred dialogs did not reveal anything strange.
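The raw dialogs file can be expanded into training samples with the iterator shipped in this repository,
`dialogs_data_parsers/flibusta/dialogs_iterator.py`. A minimal sketch (the file path and the `logging_period` value are placeholders):

```python
from dialogs_data_parsers.flibusta.dialogs_iterator import FlibustaDialogsIterator

# The jsonl file written by scripts/prepare_flibusta_raw_dialogs.py (placeholder path).
dialogs = FlibustaDialogsIterator('path/to/dialogs/results/file', logging_period=100000)

for dialog in dialogs:
    # Each dialog is a list of utterances; every prefix of length >= 2 is yielded as a separate sample.
    print(dialog)
    break
```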
--------------------------------------------------------------------------------
/dialogs_data_parsers/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | from dialogs_data_parsers.common import log_config
4 | 
5 | sys.excepthook = log_config.handle_unhandled_exception
6 | 
7 | __version__ = '0.0.1'
--------------------------------------------------------------------------------
/dialogs_data_parsers/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexeykarnachev/dialogs_data_parsers/64c86e27bb9af0ab9c9734d275eef7fdb9965b71/dialogs_data_parsers/common/__init__.py
--------------------------------------------------------------------------------
/dialogs_data_parsers/common/crawler.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import logging
3 | from functools import partial
4 | from typing import Optional
5 | 
6 | import aiohttp
7 | 
8 | _logger = logging.getLogger(__name__)
9 | 
10 | 
11 | class Crawler:
12 |     def __init__(self, concurrency, timeout, retries):
13 |         self._timeout = timeout
14 |         self._retries = retries
15 | 
16 |         self._semaphore = asyncio.BoundedSemaphore(concurrency)
17 | 
18 |     async def perform_request(self, url, headers=None, data=None, params=None, method='get') -> Optional[str]:
19 |         """Requests a page and returns content."""
20 |         _logger.debug(f'Requesting page: {url}')
21 |         i_retry = 0
22 |         async with self._get_session(headers=headers) as session:
23 |             while i_retry < self._retries:
24 |                 try:
25 |                     request_method = session.get if method == 'get' else partial(session.post, data=data)
26 |                     async with self._semaphore, request_method(url, allow_redirects=False, params=params) as response:
27 |                         text = await response.text()
28 |                         _logger.debug(f'Page source obtained: {url}')
29 |                         return text
30 |                 except asyncio.TimeoutError:
31 |                     i_retry += 1
32 |                     _logger.warning(f'Timeout for page [{i_retry}/{self._retries}]: {url}')
33 |             else:
34 |                 _logger.warning(f'Max number of retries exceeded for page: {url}')
35 |                 return None
36 | 
37 |     def _get_session(self, headers):
38 |         connector = aiohttp.TCPConnector()
39 |         timeout = aiohttp.ClientTimeout(total=self._timeout)
40 |         session = aiohttp.ClientSession(connector=connector, timeout=timeout, headers=headers)
41 | 
42 |         return session
--------------------------------------------------------------------------------
/dialogs_data_parsers/common/log_config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.config
3 | import pathlib
4 | import sys
5 | from typing import Dict
6 | 
7 | _LOGGER = logging.getLogger(__name__)
8 | _FORMATTER = '[%(asctime)s %(module)s %(funcName)s %(levelname)s] %(message)s'
9 | 
10 | 
11 | def prepare_logging(logs_dir, log_files_prefix=''):
12 |     """Configures logging."""
13 |     log_config = _get_log_config(logs_dir, log_files_prefix)
14 |     logging.config.dictConfig(log_config)
15 | 
16 | 
17 | def handle_unhandled_exception(exc_type, exc_value, exc_traceback):
18 |     """Handler for unhandled exceptions that will
write to the logs""" 19 | if issubclass(exc_type, KeyboardInterrupt): 20 | # call the default excepthook saved at __excepthook__ 21 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 22 | return 23 | _LOGGER.critical("Unhandled exception", exc_info=(exc_type, exc_value, exc_traceback)) 24 | 25 | 26 | def _get_rotating_file_handler(log_file: str, level: str, max_bytes: int = 10485760, backup_count: int = 5) -> Dict: 27 | handler_dict = { 28 | 'class': 'logging.handlers.RotatingFileHandler', 29 | 'level': level, 30 | 'formatter': 'default', 31 | 'filename': log_file, 32 | 'mode': 'a', 33 | 'maxBytes': max_bytes, 34 | 'backupCount': backup_count, 35 | } 36 | 37 | return handler_dict 38 | 39 | 40 | def _get_console_output_handler(level) -> Dict: 41 | handler_dict = { 42 | 'class': 'logging.StreamHandler', 43 | 'level': level, 44 | 'formatter': 'default', 45 | } 46 | 47 | return handler_dict 48 | 49 | 50 | def _get_log_config(log_dir, log_files_prefix) -> dict: 51 | log_dir = pathlib.Path(log_dir) 52 | 53 | log_dir.mkdir(exist_ok=True, parents=True) 54 | info_file = str(log_dir / f'{log_files_prefix}info.log') 55 | errors_file = str(log_dir / f'{log_files_prefix}errors.log') 56 | debug_file = str(log_dir / f'{log_files_prefix}debug.log') 57 | 58 | handlers = { 59 | 'info_file': _get_rotating_file_handler(info_file, 'INFO'), 60 | 'debug_file': _get_rotating_file_handler(debug_file, 'DEBUG'), 61 | 'errors_file': _get_rotating_file_handler(errors_file, 'ERROR'), 62 | 'console': _get_console_output_handler('INFO') 63 | } 64 | 65 | log_config = { 66 | 'disable_existing_loggers': False, 67 | 'version': 1, 68 | 'formatters': { 69 | 'default': { 70 | 'format': _FORMATTER 71 | } 72 | }, 73 | 'handlers': handlers, 74 | 'loggers': { 75 | '': { 76 | 'handlers': list(handlers.keys()), 77 | 'level': 'DEBUG' 78 | } 79 | } 80 | } 81 | 82 | return log_config 83 | -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeykarnachev/dialogs_data_parsers/64c86e27bb9af0ab9c9734d275eef7fdb9965b71/dialogs_data_parsers/flibusta/__init__.py -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/author_words_annotation_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import re 4 | from itertools import chain, cycle 5 | from pathlib import Path 6 | 7 | from dialogs_data_parsers.flibusta.dialogs_parser import DIALOG_SEPARATORS 8 | 9 | _PERSON_FLAG = 0 10 | _AUTHOR_FLAG = 1 11 | _PUNCT_FLAG = 2 12 | _DASH_FLAG = 3 13 | _SPLIT_FLAGS = (_PERSON_FLAG, _PUNCT_FLAG, _DASH_FLAG, _AUTHOR_FLAG, _PUNCT_FLAG, _DASH_FLAG) 14 | 15 | _DIALOGS_SEPARATORS_SET = set(DIALOG_SEPARATORS) 16 | _AUTHOR_WORDS_SEPARATOR_PATTERN = re.compile(f'([.,!?:;]+)(\s?[{DIALOG_SEPARATORS}])') 17 | _AUGMENT_PUNCT_CHOICES = list(set(chain(*[[symbol * i for i in range(0, 4)] for symbol in '.,!?:; ']))) 18 | _AUGMENT_DASH_CHOICES = list( 19 | set(chain(*[[' ' * i + symbol for i in range(0, 4)] for symbol in list(_DIALOGS_SEPARATORS_SET) + [' ']]))) 20 | 21 | 22 | class FlibustaAuthorWordsAnnotationGenerator: 23 | def __init__(self, raw_dialogs_file_path, out_file_path, n_samples, augment_p): 24 | self._raw_dialogs_file_path = raw_dialogs_file_path 25 | self._augment_p = augment_p 26 | self._out_file_path = Path(out_file_path) 27 | 
self._n_samples = int(n_samples) 28 | 29 | def run(self): 30 | utterances = self._iterate_on_utterances() 31 | n_samples_done = 0 32 | 33 | self._out_file_path.parent.mkdir(exist_ok=True, parents=True) 34 | 35 | with open(self._out_file_path, 'w') as out_file: 36 | for utterance in utterances: 37 | augmented_split_utterance_and_flags = self._generate_augmented_split_utterance_and_flags(utterance) 38 | if len(augmented_split_utterance_and_flags) == 0: 39 | continue 40 | payload = json.dumps(augmented_split_utterance_and_flags, ensure_ascii=False) 41 | out_file.write(payload) 42 | out_file.write('\n') 43 | n_samples_done += 1 44 | 45 | if n_samples_done == self._n_samples: 46 | break 47 | 48 | if n_samples_done % 10000 == 0: 49 | print(f'Samples: {n_samples_done}/{self._n_samples}') 50 | 51 | def _iterate_on_utterances(self): 52 | with open(self._raw_dialogs_file_path) as file: 53 | for line in file: 54 | dialog = json.loads(line) 55 | for utterance in dialog: 56 | yield utterance 57 | 58 | def _generate_augmented_split_utterance_and_flags(self, utterance): 59 | split_utterance = _AUTHOR_WORDS_SEPARATOR_PATTERN.split(utterance) 60 | split_utterance = [utterance for utterance in split_utterance if len(utterance)] 61 | 62 | augmented_split_utterance = [] 63 | augmented_split_utterance_flags = [] 64 | 65 | for sub_utterance, flag in zip(split_utterance, cycle(_SPLIT_FLAGS)): 66 | if flag != _PUNCT_FLAG and flag != _DASH_FLAG: 67 | augmented_split_utterance.append(sub_utterance) 68 | augmented_split_utterance_flags.append(flag) 69 | else: 70 | if random.random() <= self._augment_p: 71 | choices = _AUGMENT_PUNCT_CHOICES if flag == _PUNCT_FLAG else _AUGMENT_DASH_CHOICES 72 | sub_utterance = random.choice(choices) 73 | 74 | augmented_split_utterance[-1] += sub_utterance 75 | 76 | augmented_split_utterance_and_flags = list(zip(augmented_split_utterance, augmented_split_utterance_flags)) 77 | 78 | return augmented_split_utterance_and_flags 79 | -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/dialogs_iterator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | _logger = logging.getLogger(__name__) 5 | 6 | 7 | class FlibustaDialogsIterator: 8 | def __init__(self, file_path, logging_period): 9 | self._file_path = file_path 10 | self._logging_period = logging_period 11 | 12 | def __iter__(self): 13 | with open(self._file_path) as file: 14 | n_samples_done = 0 15 | for n_lines_done, raw_line in enumerate(file, start=1): 16 | if self._logging_period and n_lines_done % self._logging_period == 0: 17 | _logger.info(f'Flibusta lines: {n_lines_done}, samples: {n_samples_done}') 18 | 19 | dialog = json.loads(raw_line) 20 | 21 | for n_utterances in range(2, len(dialog) + 1): 22 | subdialog = dialog[:n_utterances] 23 | n_samples_done += 1 24 | yield subdialog 25 | -------------------------------------------------------------------------------- /dialogs_data_parsers/flibusta/dialogs_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import multiprocessing 4 | import re 5 | import unicodedata 6 | from pathlib import Path 7 | from zipfile import BadZipFile, ZipFile 8 | 9 | import bs4 10 | from more_itertools import chunked 11 | 12 | _logger = logging.getLogger(__name__) 13 | logging.getLogger("filelock").setLevel(logging.WARNING) 14 | 15 | DIALOG_SEPARATORS = '-‐‑‒–—―₋−⸺⸻﹘﹣-' 16 | 17 | 18 | class 
FlibustaDialogsParser: 19 | _MIN_N_UTTERANCES = 2 20 | _ARCHIVE_PATTERN = re.compile(r'.*fb2-.+\.zip$') 21 | 22 | _DIALOGS_CHUNK_WRITE_SIZE = 1000 23 | 24 | def __init__(self, flibusta_archives_dir, out_file_path): 25 | self._flibusta_archives_dir = flibusta_archives_dir 26 | self._out_file_path = Path(out_file_path) 27 | self._out_file_path.parent.mkdir(exist_ok=True, parents=True) 28 | if self._out_file_path.is_file(): 29 | self._out_file_path.unlink() 30 | 31 | manager = multiprocessing.Manager() 32 | self._archives_counter = manager.Value('i', 0) 33 | self._dialogs_counter = manager.Value('i', 0) 34 | self._out_file_lock = manager.Lock() 35 | self._archive_paths = list(self._iterate_on_archive_paths()) 36 | 37 | def run(self): 38 | with multiprocessing.Pool() as pool: 39 | pool.map(self._parse_archive, self._archive_paths) 40 | 41 | def _iterate_on_archive_paths(self): 42 | for path in Path(self._flibusta_archives_dir).iterdir(): 43 | if self._ARCHIVE_PATTERN.match(path.name): 44 | yield path 45 | 46 | def _parse_archive(self, archive_path): 47 | dialogs = self._iterate_on_dialogs(archive_path) 48 | 49 | for dialogs_chunk in chunked(dialogs, n=self._DIALOGS_CHUNK_WRITE_SIZE): 50 | payloads = [] 51 | for dialog in dialogs_chunk: 52 | payload = json.dumps(dialog, ensure_ascii=False) 53 | payloads.append(payload) 54 | 55 | chunk_payload = '\n'.join(payloads) 56 | 57 | self._out_file_lock.acquire() 58 | with open(self._out_file_path, 'a') as out_file: 59 | out_file.write(chunk_payload) 60 | out_file.write('\n') 61 | out_file.flush() 62 | self._out_file_lock.release() 63 | 64 | self._dialogs_counter.value += len(dialogs_chunk) 65 | _logger.info(f'Archives: {self._archives_counter.value}/{len(self._archive_paths)}, ' 66 | f'Dialogs: {self._dialogs_counter.value}') 67 | 68 | self._archives_counter.value += 1 69 | 70 | def _iterate_on_dialogs(self, archive_path): 71 | book_texts = self._iterate_on_book_texts(archive_path) 72 | dialog_separators_set = set(DIALOG_SEPARATORS) 73 | 74 | for book_text in book_texts: 75 | book_text_lines = re.split('\n+', book_text) 76 | dialog = [] 77 | 78 | for line in book_text_lines: 79 | line = line.strip() 80 | 81 | if len(line) > 2 and line[0] in dialog_separators_set: 82 | line = unicodedata.normalize("NFKC", line) 83 | line = re.sub(r'^\W+', '', line) 84 | dialog.append(line) 85 | else: 86 | if len(dialog) >= self._MIN_N_UTTERANCES: 87 | yield dialog 88 | 89 | dialog = [] 90 | 91 | if len(dialog) >= self._MIN_N_UTTERANCES: 92 | yield dialog 93 | 94 | def _iterate_on_book_texts(self, archive_path): 95 | try: 96 | with ZipFile(archive_path, 'r') as zip_file: 97 | for file_name in zip_file.namelist(): 98 | raw_fb2_text = zip_file.read(file_name) 99 | book_soup = bs4.BeautifulSoup(raw_fb2_text, features="html.parser") 100 | lang_tag = book_soup.find('lang') 101 | 102 | if lang_tag and lang_tag.text.lower().strip() == 'ru': 103 | book_text = book_soup.text 104 | yield book_text 105 | except BadZipFile: 106 | _logger.warning(f'Bad zip file: {archive_path}') 107 | -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexeykarnachev/dialogs_data_parsers/64c86e27bb9af0ab9c9734d275eef7fdb9965b71/dialogs_data_parsers/pikabu/__init__.py -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/dialogs_iterator.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import re 4 | from typing import Optional 5 | 6 | from treelib import Tree 7 | 8 | from dialogs_data_parsers.utils import iterate_on_parts_by_condition 9 | 10 | _logger = logging.getLogger(__name__) 11 | 12 | 13 | class PikabuDialogsWithMetaIterator: 14 | def __init__(self, file_path, max_n_words_per_utterance, logging_period=10000): 15 | self._file_path = file_path 16 | self._logging_period = logging_period 17 | self._max_n_words_per_utterance = max_n_words_per_utterance 18 | 19 | def __iter__(self): 20 | with open(self._file_path) as file: 21 | n_samples_done = 0 22 | for n_lines_done, raw_line in enumerate(file, start=1): 23 | if self._logging_period and n_lines_done % self._logging_period == 0: 24 | _logger.info(f'Pikabu lines: {n_lines_done}, samples: {n_samples_done}') 25 | 26 | line_data = json.loads(raw_line) 27 | dialog_tree = self._get_dialog_tree(line_data) 28 | dialogs = _iterate_on_dialogs_from_tree(dialog_tree) 29 | dialogs = set(dialogs) 30 | 31 | subdialogs = set() 32 | 33 | for dialog in dialogs: 34 | dialog = tuple(dialog) 35 | for n_utterances in range(2, len(dialog) + 1): 36 | subdialog = tuple(dialog[:n_utterances]) 37 | subdialogs.add(_Dialog(subdialog)) 38 | 39 | n_samples_done += len(subdialogs) 40 | for subdialog in subdialogs: 41 | yield tuple(subdialog) 42 | 43 | def _get_dialog_tree(self, line_data): 44 | tree = Tree() 45 | tree.create_node(identifier=0) 46 | comments = line_data['comments'] 47 | if comments: 48 | ids_and_comments = ((int(id_), comment) for id_, comment in comments.items()) 49 | ids_and_comments = sorted(ids_and_comments, key=lambda x: x[0]) 50 | 51 | for id_, comment_json in ids_and_comments: 52 | parent_id = int(comment_json['parent_id']) 53 | comment = comment_json['text'] 54 | comment = comment.replace('\n', ' ') 55 | comment_text = self._process_comment(comment) 56 | if comment_text: 57 | meta = comment_json.get('meta') 58 | data = {'text': comment_text, 'meta': meta} 59 | else: 60 | data = None 61 | 62 | tree.create_node(identifier=id_, parent=parent_id, data=data) 63 | 64 | return tree 65 | 66 | def _process_comment(self, text) -> Optional[str]: 67 | if not text: 68 | return None 69 | 70 | if '@' in text or 'http' in text: 71 | return None 72 | 73 | n_words = len(re.findall(r'\w+', text)) 74 | if n_words > self._max_n_words_per_utterance: 75 | return None 76 | 77 | text = text.strip() 78 | if text.startswith('Комментарий удален.'): 79 | return None 80 | 81 | return text 82 | 83 | 84 | class PikabuDialogsIterator(PikabuDialogsWithMetaIterator): 85 | def __init__(self, file_path, max_n_words_per_utterance, logging_period=10000): 86 | super().__init__(file_path, max_n_words_per_utterance=max_n_words_per_utterance, logging_period=logging_period) 87 | 88 | def __iter__(self): 89 | for subdialog in super().__iter__(): 90 | utterances = [utterance['text'] for utterance in subdialog] 91 | yield utterances 92 | 93 | 94 | _UPVOTE_DOWNVOTE_REGEX = re.compile(r'av=(\d+):(\d+)') 95 | _MIN_N_VOTES = 30 96 | _LABEL_THRESHOLD = 0.8 97 | UNK_RATING_LABEL = 0 98 | BALANCED_RATING_LABEL = 1 99 | HIGH_RATING_LABEL = 2 100 | LOW_RATING_LABEL = 3 101 | 102 | 103 | class PikabuDialogsWithResponseRatingIterator(PikabuDialogsWithMetaIterator): 104 | def __init__(self, file_path, max_n_words_per_utterance, logging_period=10000): 105 | super().__init__(file_path, max_n_words_per_utterance=max_n_words_per_utterance, logging_period=logging_period) 
106 | 107 | def __iter__(self): 108 | for subdialog in super().__iter__(): 109 | utterances = [utterance['text'] for utterance in subdialog] 110 | response_meta = subdialog[-1]['meta'] 111 | response_rating_label = _get_rating_from_meta(response_meta) 112 | if response_rating_label is not None: 113 | yield {'dialog': utterances, 'label': response_rating_label} 114 | 115 | 116 | def _get_rating_from_meta(response_meta): 117 | upvote_downvote = _UPVOTE_DOWNVOTE_REGEX.findall(response_meta) 118 | label = UNK_RATING_LABEL 119 | if len(upvote_downvote) == 1: 120 | upvote_downvote = upvote_downvote.pop() 121 | upvote, downvote = map(int, upvote_downvote) 122 | n_votes = upvote + downvote 123 | if n_votes >= _MIN_N_VOTES: 124 | upvote_ratio = upvote / n_votes 125 | if upvote_ratio > _LABEL_THRESHOLD: 126 | label = HIGH_RATING_LABEL 127 | elif (1 - upvote_ratio) > _LABEL_THRESHOLD: 128 | label = LOW_RATING_LABEL 129 | else: 130 | label = BALANCED_RATING_LABEL 131 | 132 | return label 133 | 134 | 135 | class _Dialog: 136 | def __init__(self, utterance_dicts): 137 | self._utterance_dicts = utterance_dicts 138 | 139 | def __iter__(self): 140 | yield from self._utterance_dicts 141 | 142 | def __hash__(self): 143 | return hash(tuple(d['text'] for d in self)) 144 | 145 | def __eq__(self, other): 146 | return hash(self) == hash(other) 147 | 148 | 149 | def _iterate_on_dialogs_from_tree(dialog_tree: Tree): 150 | for path in dialog_tree.paths_to_leaves(): 151 | path = path[1:] # Skip dummy root node 152 | dialog = [dialog_tree[p].data for p in path] 153 | 154 | # Split dialog on parts by empty utterance: 155 | dialogs = iterate_on_parts_by_condition(dialog, lambda utterance: not utterance) 156 | 157 | for dialog in dialogs: 158 | yield _Dialog(dialog) 159 | -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/story_crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import copy 3 | import json 4 | import logging 5 | import re 6 | from pathlib import Path 7 | 8 | import aiofiles 9 | import bs4 10 | from more_itertools import chunked 11 | 12 | from dialogs_data_parsers.common.crawler import Crawler 13 | 14 | _logger = logging.getLogger(__name__) 15 | _GET_COMMENTS_URL = 'https://pikabu.ru/ajax/comments_actions.php' 16 | _URLS_CHUNK_SIZE = 1000 17 | 18 | 19 | def iterate_on_urls(story_links_dir): 20 | for story_links_file in Path(story_links_dir).iterdir(): 21 | with open(story_links_file) as file: 22 | for url in file: 23 | yield url.strip() 24 | 25 | 26 | class PikabuStoryCrawler(Crawler): 27 | def __init__(self, concurrency, timeout, retries, story_links, out_file_path): 28 | super().__init__(concurrency=concurrency, timeout=timeout, retries=retries) 29 | 30 | self._out_file_path = out_file_path 31 | self._parsed_urls = self._get_parsed_urls() 32 | self._all_urls = set(story_links) 33 | self._urls_to_parse = self._all_urls.difference(self._parsed_urls) 34 | self._n_urls_to_parse = len(self._urls_to_parse) 35 | 36 | @classmethod 37 | def from_story_links_dir(cls, concurrency, timeout, retries, story_links_dir, out_file_path): 38 | story_links = iterate_on_urls(story_links_dir) 39 | return cls( 40 | concurrency=concurrency, 41 | timeout=timeout, 42 | retries=retries, 43 | story_links=story_links, 44 | out_file_path=out_file_path) 45 | 46 | async def run(self): 47 | for urls_chunk in chunked(self._urls_to_parse, n=_URLS_CHUNK_SIZE): 48 | coroutines = [self._crawl(url) for url in urls_chunk] 
49 | await asyncio.gather(*coroutines) 50 | 51 | def _get_parsed_urls(self): 52 | urls = set() 53 | if Path(self._out_file_path).is_file(): 54 | with open(self._out_file_path) as file: 55 | for line in file: 56 | url = line.split(',', maxsplit=1)[0].split('"')[-2] 57 | assert url.startswith('https') 58 | urls.add(url) 59 | 60 | return urls 61 | 62 | async def _crawl(self, url): 63 | _logger.debug(f'Crawling story: {url}') 64 | try: 65 | result = await self._get_story_and_comments(url=url) 66 | except Exception: 67 | _logger.exception(f'Failed to crawl story: {url}') 68 | result = None 69 | 70 | if result is None: 71 | _logger.debug(f'Result is None for story: {url}') 72 | return 73 | 74 | async with aiofiles.open(self._out_file_path, "a") as f: 75 | result_str = json.dumps(result, ensure_ascii=False) 76 | await f.write(result_str + '\n') 77 | await f.flush() 78 | _logger.debug(f'Story crawled and saved: {url}') 79 | 80 | async def _get_story_and_comments(self, url): 81 | story_id = url.split('_')[-1] 82 | story_html = await self.perform_request(url, headers=_get_headers(), method='get') 83 | 84 | if not story_html: 85 | return None 86 | 87 | story_soup = bs4.BeautifulSoup(story_html, features="html.parser") 88 | 89 | # Page not exists (deleted) 90 | if story_soup.find('div', {'class': 'app-404'}): 91 | _logger.debug(f'404 for story: {url}') 92 | return {'url': url, 'story': None, 'comments': []} 93 | 94 | story = _parse_story_soup(story_soup) 95 | 96 | parser = _CommentsParser() 97 | start_comment_id = 0 98 | prev_n_comments_parsed = None 99 | while prev_n_comments_parsed != parser.n_comments_parsed: 100 | data = _get_payload_data(story_id, start_comment_id) 101 | headers = _get_headers(url) 102 | result = await self.perform_request(_GET_COMMENTS_URL, headers=headers, data=data, method='post') 103 | 104 | _logger.debug(f'Parsing result for story: {url}') 105 | 106 | result_data = json.loads(result)['data'] 107 | start_comment_id = result_data['last_id'] 108 | prev_n_comments_parsed = parser.n_comments_parsed 109 | 110 | for comment_data in result_data['comments']: 111 | comment_soup = bs4.BeautifulSoup(comment_data['html'], features="html.parser") 112 | parser.parse_comment_and_children(comment_soup) 113 | 114 | _logger.debug(f'{parser.n_comments_parsed} comments parsed: {url}') 115 | 116 | self._n_urls_to_parse -= 1 117 | _logger.info(f'{url} Comments: {parser.n_comments_parsed}, Left: {self._n_urls_to_parse}') 118 | 119 | result = {'url': url, 'story': story, 'comments': parser.id_to_comment} 120 | return result 121 | 122 | 123 | class _CommentsParser: 124 | def __init__(self): 125 | self._id_to_comment = {} 126 | 127 | @property 128 | def n_comments_parsed(self): 129 | return len(self._id_to_comment) 130 | 131 | @property 132 | def id_to_comment(self): 133 | id_to_comment = copy.deepcopy(self._id_to_comment) 134 | for comment in id_to_comment.values(): 135 | comment['children'] = sorted(list(comment['children'])) 136 | 137 | return id_to_comment 138 | 139 | def parse_comment_and_children(self, comment_soup): 140 | comment_soups = comment_soup.find_all('div', {'class': 'comment'}) 141 | 142 | for comment_soup in comment_soups: 143 | comment = _parse_comment_soup(comment_soup) 144 | self._id_to_comment[comment['id']] = comment 145 | parent_id = comment['parent_id'] 146 | if parent_id != 0: 147 | self._id_to_comment[parent_id]['children'].add(comment['id']) 148 | 149 | 150 | def _parse_comment_soup(soup): 151 | body = soup.find('div', {'class': 'comment__body'}) 152 | meta = 
soup['data-meta'] 153 | 154 | id_ = int(_get_meta_tag(meta, r'^id=(\d+),', raise_if_not_found=True)) 155 | pid = int(_get_meta_tag(meta, r',pid=(\d+),', raise_if_not_found=True)) 156 | date = _get_meta_tag(meta, r',d=(.+),de=') 157 | rating = int(_get_meta_tag(meta, r'r=(\d+),', default=0)) 158 | 159 | comment = { 160 | 'user_nick': body.find('span', { 161 | 'class': 'user__nick' 162 | }).get_text(' '), 163 | 'text': body.find('div', { 164 | 'class': 'comment__content' 165 | }).get_text('\n'), 166 | 'id': id_, 167 | 'parent_id': pid, 168 | 'date': date, 169 | 'rating': rating, 170 | 'children': set() 171 | } 172 | return comment 173 | 174 | 175 | def _get_meta_tag(meta, regex, default=None, raise_if_not_found=False): 176 | match = re.search(regex, meta) 177 | if not match and raise_if_not_found: 178 | raise ValueError(f"Can't find regex: {regex} in meta: {meta}") 179 | elif not match: 180 | return default 181 | else: 182 | return match.group(1) 183 | 184 | 185 | def _get_headers(referer=None): 186 | headers = { 187 | 'authority': 'pikabu.ru', 188 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)' \ 189 | 'Chrome/86.0.4240.198 Safari/537.36', 190 | 'origin': 'https://pikabu.ru' 191 | } 192 | if referer: 193 | headers['referer'] = referer 194 | return headers 195 | 196 | 197 | def _get_payload_data(story_id, start_comment_id): 198 | return {'action': 'get_story_comments', 'story_id': story_id, 'start_comment_id': start_comment_id} 199 | 200 | 201 | def _get_element_text(elem, separator='', default=None): 202 | return elem.get_text(separator) if elem else default 203 | 204 | 205 | def _parse_story_soup(soup): 206 | story_main = soup.find('div', {'class': 'story__main'}) 207 | 208 | title = _get_element_text(story_main.find('span', {'class': 'story__title-link'}), ' ') 209 | text = _get_element_text(story_main.find('div', {'class': 'story-block story-block_type_text'}), '\n') 210 | user_nick = _get_element_text(story_main.find('a', {'class': 'user__nick'}), ' ') 211 | tags = sorted(set(_get_element_text(tag, ' ') for tag in story_main.find_all('a', {'class': 'tags__tag'}))) 212 | shares = int(_get_element_text(story_main.find('span', {'class': 'story__share-count'}), default=0)) 213 | saves = int(_get_element_text(story_main.find('span', {'class': 'story__save-count'}), default=0)) 214 | rating = int(_get_element_text(story_main.find('span', {'class': 'story__rating-count'}), default=0)) 215 | 216 | comments_count = _get_element_text(story_main.find('span', {'class': 'story__comments-link-count'}), default='0') 217 | comments_count = int(re.findall(r'\d+', comments_count)[0]) 218 | 219 | time_ = story_main.find('time') or dict() 220 | timestamp = time_.get('datetime') 221 | 222 | res = { 223 | "title": title, 224 | "text": text, 225 | "user_nick": user_nick, 226 | "tags": tags, 227 | "comments_count": comments_count, 228 | "shares": shares, 229 | "saves": saves, 230 | "timestamp": timestamp, 231 | "rating": rating 232 | } 233 | 234 | return res 235 | -------------------------------------------------------------------------------- /dialogs_data_parsers/pikabu/story_links_crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import datetime 3 | import json 4 | import logging 5 | import os 6 | from pathlib import Path 7 | 8 | import aiofiles 9 | import bs4 10 | from more_itertools import chunked 11 | 12 | from dialogs_data_parsers.common.crawler import Crawler 13 | 14 | _logger = 
logging.getLogger(__name__) 15 | _DAYS_CHUNK_SIZE = 30 16 | 17 | 18 | def _get_days_range(start_day, end_day): 19 | start = datetime.datetime.strptime(start_day, "%d-%m-%Y") 20 | end = datetime.datetime.strptime(end_day, "%d-%m-%Y") 21 | dates_generated = [start + datetime.timedelta(days=x) for x in range(0, (end - start).days + 1)] 22 | days = [date.strftime("%d-%m-%Y") for date in dates_generated] 23 | return days 24 | 25 | 26 | class PikabuStoryLinksCrawler(Crawler): 27 | def __init__(self, concurrency, timeout, retries, out_dir, start_day, end_day, pikabu_section): 28 | super().__init__(concurrency=concurrency, timeout=timeout, retries=retries) 29 | 30 | self._out_dir = out_dir 31 | self._start_day = start_day 32 | self._end_day = end_day 33 | self._pikabu_section = pikabu_section 34 | self._n_total_links = 0 35 | 36 | async def run(self): 37 | Path(self._out_dir).mkdir(exist_ok=True, parents=True) 38 | days_range = _get_days_range(self._start_day, self._end_day) 39 | parsed_days = [path.name for path in Path(self._out_dir).iterdir()] 40 | days_range = set(days_range) - set(parsed_days) 41 | 42 | for days_chunk in chunked(days_range, n=_DAYS_CHUNK_SIZE): 43 | coroutines = [self._crawl(day) for day in days_chunk] 44 | await asyncio.gather(*coroutines) 45 | 46 | async def _crawl(self, day): 47 | links = await self._get_story_links(day=day) 48 | out_file_path = os.path.join(self._out_dir, day) 49 | async with aiofiles.open(out_file_path, "w") as f: 50 | await f.write('\n'.join(links)) 51 | await f.flush() 52 | 53 | async def _get_story_links(self, day): 54 | current_page_id = 1 55 | links = set() 56 | headers = _get_headers(day=day, pikabu_section=self._pikabu_section) 57 | url = _get_url(day=day, pikabu_section=self._pikabu_section) 58 | 59 | while True: 60 | old_number_of_links = len(links) 61 | params = _get_params(page_number=current_page_id) 62 | response_text = await self.perform_request(url=url, headers=headers, params=params, method='get') 63 | stories = json.loads(response_text)['data']['stories'] 64 | story_soups = [bs4.BeautifulSoup(s['html'], features="html.parser") for s in stories] 65 | 66 | for story_soup in story_soups: 67 | link_element = story_soup.find('a', {'class': 'story__title-link'}) 68 | if link_element is not None: 69 | href = link_element.get('href') 70 | if href: 71 | links.add(href) 72 | 73 | new_number_of_links = len(links) 74 | 75 | if old_number_of_links < new_number_of_links: 76 | _logger.debug(f'Day: {day}, links obtained: {new_number_of_links}, pages scrolled: {current_page_id}') 77 | current_page_id += 1 78 | else: 79 | break 80 | 81 | self._n_total_links += len(links) 82 | _logger.info(f'Day: {day} done, total number of links: {self._n_total_links}') 83 | 84 | return links 85 | 86 | 87 | def _get_headers(day, pikabu_section): 88 | headers = {'referer': f'https://pikabu.ru/{pikabu_section}/{day}'} 89 | return headers 90 | 91 | 92 | def _get_params(page_number): 93 | params = (('twitmode', '1'), ('of', 'v2'), ('page', f'{page_number}'), ('_', '1574097199724')) 94 | return params 95 | 96 | 97 | def _get_url(day, pikabu_section): 98 | url = f'https://pikabu.ru/{pikabu_section}/{day}' 99 | return url 100 | -------------------------------------------------------------------------------- /dialogs_data_parsers/utils.py: -------------------------------------------------------------------------------- 1 | def iterate_on_parts_by_condition(iterable, condition): 2 | cur_chunk = [] 3 | for elem in iterable: 4 | if not condition(elem): 5 | cur_chunk.append(elem) 6 | 
else: 7 | yield cur_chunk 8 | cur_chunk = [] 9 | 10 | if cur_chunk: 11 | yield cur_chunk 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | more_itertools==8.8.0 2 | beautifulsoup4==4.9.3 3 | aiohttp==3.7.4 4 | treelib==1.6.1 5 | aiofiles==0.7.0 6 | tqdm==4.62.1 7 | -------------------------------------------------------------------------------- /scripts/annotate_flibusta_raw_dialogs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from dialogs_data_parsers.flibusta.author_words_annotation_generator import FlibustaAuthorWordsAnnotationGenerator 4 | 5 | 6 | def _parse_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--raw_dialogs_file_path', type=str, required=True) 9 | parser.add_argument('--out_file_path', type=str, required=True) 10 | parser.add_argument('--n_samples', type=int, required=True) 11 | parser.add_argument('--augment_p', type=float, required=False, default=0.3) 12 | 13 | args = parser.parse_args() 14 | return args 15 | 16 | 17 | def main(): 18 | args = _parse_args() 19 | samples_generator = FlibustaAuthorWordsAnnotationGenerator( 20 | raw_dialogs_file_path=args.raw_dialogs_file_path, 21 | out_file_path=args.out_file_path, 22 | n_samples=args.n_samples, 23 | augment_p=args.augment_p) 24 | 25 | samples_generator.run() 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /scripts/crawl_pikabu_stories.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import os 4 | 5 | from dialogs_data_parsers.common.log_config import prepare_logging 6 | from dialogs_data_parsers.pikabu.story_crawler import PikabuStoryCrawler 7 | 8 | 9 | def _parse_args(): 10 | parser = argparse.ArgumentParser(description='Crawls pikabu stories and stores them in one jsonl file.') 11 | parser.add_argument( 12 | '--root_dir', 13 | type=str, 14 | required=True, 15 | help='Path to the root pikabu results directory. 
Sub-directory with links will be created there.') 16 | parser.add_argument('--concurrency', type=int, required=False, default=12, help='Number of concurrent requests.') 17 | parser.add_argument('--timeout', type=int, required=False, default=10, help='Timeout in seconds.') 18 | parser.add_argument('--retries', type=int, required=False, default=5, help='Number of request retries.') 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | 24 | def main(): 25 | args = _parse_args() 26 | 27 | out_file_path = os.path.join(args.root_dir, 'stories.jsonl') 28 | story_links_dir = os.path.join(args.root_dir, 'story_links') 29 | logs_dir = os.path.join(args.root_dir, 'logs') 30 | prepare_logging(logs_dir, log_files_prefix='stories_') 31 | 32 | crawler = PikabuStoryCrawler.from_story_links_dir( 33 | concurrency=args.concurrency, 34 | timeout=args.timeout, 35 | retries=args.retries, 36 | story_links_dir=story_links_dir, 37 | out_file_path=out_file_path) 38 | 39 | loop = asyncio.get_event_loop() 40 | loop.run_until_complete(crawler.run()) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /scripts/crawl_pikabu_story_links.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import datetime 4 | import os 5 | 6 | from dialogs_data_parsers.common.log_config import prepare_logging 7 | from dialogs_data_parsers.pikabu.story_links_crawler import PikabuStoryLinksCrawler 8 | 9 | 10 | def _parse_args(): 11 | parser = argparse.ArgumentParser(description='Crawls pikabu story links and stores them in day-separated files.') 12 | parser.add_argument( 13 | '--root_dir', 14 | type=str, 15 | required=True, 16 | help='Path to the root pikabu results directory. 
Sub-directory with links will be created there.')
17 |     parser.add_argument(
18 |         '--start_day', type=str, required=False, default='01-09-2010', help='Stories to crawl start day (%%d-%%m-%%Y).')
19 |     parser.add_argument(
20 |         '--end_day',
21 |         type=str,
22 |         required=False,
23 |         default=_get_default_end_day(),
24 |         help='Stories to crawl end day (%%d-%%m-%%Y).')
25 |     parser.add_argument('--concurrency', type=int, required=False, default=12, help='Number of concurrent requests.')
26 |     parser.add_argument('--timeout', type=int, required=False, default=10, help='Timeout in seconds.')
27 |     parser.add_argument('--retries', type=int, required=False, default=5, help='Number of request retries.')
28 |     parser.add_argument('--pikabu_section', type=str, required=False, default='best', help='Pikabu section to crawl.')
29 | 
30 |     args = parser.parse_args()
31 |     return args
32 | 
33 | 
34 | def _get_default_end_day():
35 |     date = datetime.datetime.now().date() - datetime.timedelta(days=1)
36 |     return date.strftime("%d-%m-%Y")
37 | 
38 | 
39 | def main():
40 |     args = _parse_args()
41 | 
42 |     out_dir = os.path.join(args.root_dir, 'story_links')
43 |     logs_dir = os.path.join(args.root_dir, 'logs')
44 |     prepare_logging(logs_dir, log_files_prefix='story_links_')
45 |     crawler = PikabuStoryLinksCrawler(
46 |         concurrency=args.concurrency,
47 |         timeout=args.timeout,
48 |         retries=args.retries,
49 |         out_dir=out_dir,
50 |         start_day=args.start_day,
51 |         end_day=args.end_day,
52 |         pikabu_section=args.pikabu_section)
53 | 
54 |     loop = asyncio.get_event_loop()
55 |     loop.run_until_complete(crawler.run())
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     main()
60 | 
--------------------------------------------------------------------------------
/scripts/prepare_flibusta_raw_dialogs.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from dialogs_data_parsers.common.log_config import prepare_logging
4 | from dialogs_data_parsers.flibusta.dialogs_parser import FlibustaDialogsParser
5 | 
6 | 
7 | def _parse_args():
8 |     parser = argparse.ArgumentParser(description='Parses flibusta dialogs from flibusta fb2 archives.')
9 |     parser.add_argument(
10 |         '--flibusta_archives_dir',
11 |         type=str,
12 |         required=True,
13 |         help='Path to the dir with flibusta zip archives.
Each archive contains fb2 files.') 14 | parser.add_argument('--out_file_path', type=str, required=True, help='Path to the output dialogs file.') 15 | parser.add_argument('--logs_dir', type=str, required=True, help='Path to the logs directory.') 16 | 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | def main(): 22 | args = _parse_args() 23 | prepare_logging(args.logs_dir) 24 | parser = FlibustaDialogsParser(args.flibusta_archives_dir, args.out_file_path) 25 | parser.run() 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from setuptools import find_packages, setup 4 | 5 | _THIS_DIR = pathlib.Path(__file__).parent 6 | 7 | 8 | def _get_requirements(): 9 | with (_THIS_DIR / 'requirements.txt').open() as fp: 10 | return fp.read() 11 | 12 | 13 | setup( 14 | name='dialogs_data_parsers', 15 | version='0.0.1', 16 | install_requires=_get_requirements(), 17 | package_dir={'dialogs_data_parsers': 'dialogs_data_parsers'}, 18 | packages=find_packages(exclude=['tests', 'tests.*'])) 19 | --------------------------------------------------------------------------------