├── .gitignore
├── src
    └── webinardump
    │   ├── __init__.py
    │   ├── dumpers
    │       ├── __init__.py
    │       ├── webinarru.py
    │       ├── yadisk.py
    │       └── base.py
    │   ├── utils.py
    │   └── cli.py
├── tests
    ├── datafixtures
    │   ├── manifest_webinarru.json
    │   ├── vid.m3u
    │   ├── empty.ts
    │   └── manifest_yadisk.html
    ├── conftest.py
    ├── test_utils.py
    └── test_basic.py
├── CHANGELOG.md
├── AUTHORS
├── tools
    └── debug.py
├── .github
    └── workflows
    │   └── python-package.yml
├── ruff.toml
├── pyproject.toml
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | dump/
2 | 


--------------------------------------------------------------------------------
/src/webinardump/__init__.py:
--------------------------------------------------------------------------------
# Package version string; pyproject's [tool.hatch.version] section points at this file.
VERSION = '0.1.1'


--------------------------------------------------------------------------------
/tests/datafixtures/manifest_webinarru.json:
--------------------------------------------------------------------------------
1 | {"name": "yatst"}


--------------------------------------------------------------------------------
/tests/datafixtures/vid.m3u:
--------------------------------------------------------------------------------
1 | 1.ts?some=other1
2 | 2.ts?some=other2
3 | 


--------------------------------------------------------------------------------
/tests/datafixtures/empty.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idlesign/webinardump/HEAD/tests/datafixtures/empty.ts


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # webinardump changelog
2 | 
3 | ## v0.1.1 [2025-11-15]
4 | * ** Minor fixes.
5 | 
6 | ## v0.1.0 [2020-07-30]
7 | * ++ Basic functionality.


--------------------------------------------------------------------------------
/tests/datafixtures/manifest_yadisk.html:
--------------------------------------------------------------------------------
1 | <script id="store-prefetch">{"resources": {"a": {"name": "yatst", "videoStreams": {"videos": [{"dimension": "720p", "url": "https://here/there.m3u8"}]}}}}</script>


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
 1 | webinardump Authors
 2 | ===================
 3 | 
 4 | Created by Igor `idle sign` Starikov.
 5 | 
 6 | 
 7 | Contributors
 8 | ------------
 9 | 
10 | KarenKing <https://github.com/KARENKING112>
11 | 


--------------------------------------------------------------------------------
/src/webinardump/dumpers/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base import Dumper
 2 | from .webinarru import WebinarRu
 3 | from .yadisk import YandexDisk
 4 | 
# Public names of the package: the base dumper and the service-specific dumpers.
__all__ = [
    'Dumper',
    'WebinarRu',
    'YandexDisk',
]
10 | 


--------------------------------------------------------------------------------
/tools/debug.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from pathlib import Path
 3 | 
 4 | from webinardump.dumpers import YandexDisk
 5 | 
 6 | logging.basicConfig(level=logging.INFO, format='%(levelname)-8s: %(message)s')
 7 | 
 8 | 
 9 | dumper = YandexDisk(target_dir=Path('../tools/dumped/'))
10 | 
11 | dumper.run({
12 |     'url_video': 'https://disk.yandex.ru/i/xxx',
13 | })
14 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | import pytest
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def mock_call(monkeypatch):
 8 |     calls = []
 9 | 
10 |     def mock_call(cmd, **kwargs):
11 |         if 'ffmpeg' in cmd:
12 |             Path('yatst/all_chunks.mp4').write_bytes(b'')
13 | 
14 |         calls.append(cmd)
15 | 
16 |     monkeypatch.setattr("webinardump.utils.check_call", mock_call)
17 | 
18 |     return calls
19 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | from webinardump.utils import get_files_sorted
 2 | 
 3 | 
 4 | def test_get_files_sorted(tmp_path):
 5 | 
 6 |     (tmp_path / '1.a').touch()
 7 |     (tmp_path / '01.a').touch()
 8 |     (tmp_path / '02.a').touch()
 9 |     (tmp_path / '9.a').touch()
10 |     (tmp_path / '1.b').touch()
11 |     (tmp_path / '10.a').touch()
12 |     (tmp_path / '11.a').touch()
13 | 
14 |     fnames = get_files_sorted(tmp_path, suffixes={'.a'})
15 |     assert fnames == ['01.a', '1.a', '02.a', '9.a', '10.a', '11.a']


--------------------------------------------------------------------------------
/src/webinardump/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import re
 3 | from pathlib import Path
 4 | from subprocess import check_call
 5 | 
# Logger named after the package.
LOGGER = logging.getLogger('webinardump')
# Matches runs of digits; the capturing group keeps the runs in `split()` results.
RE_DIGITS = re.compile(r'(\d+)')
 8 | 
 9 | 
10 | 
def call(cmd: str, *, path: Path):
    """Run a command line through the shell in the given working directory.

    :param cmd: Command line; it is interpreted by the shell.
    :param path: Directory to run the command in.
    :return: Whatever `check_call` returns (0 on success).
    :raises subprocess.CalledProcessError: If the command exits with a non-zero code.
    """
    exit_code = check_call(cmd, shell=True, cwd=path)
    return exit_code
13 | 
14 | 
15 | def get_files_sorted(path: Path, *, suffixes: set[str]) -> list[str]:
16 |     def natural(text):
17 |         return [int(char) if char.isdigit() else char for char in RE_DIGITS.split(text)]
18 | 
19 |     files = [file.name for file in path.iterdir() if file.is_file() and file.suffix in suffixes]
20 |     files.sort(key=natural)
21 | 
22 |     return files
23 | 


--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
 1 | name: Python package
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 |   workflow_dispatch:
 9 | 
10 | jobs:
11 |   build:
12 | 
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       fail-fast: false
16 |       matrix:
17 |         python-version: [3.11, 3.12, 3.13]
18 | 
19 |     steps:
20 |     - uses: actions/checkout@v4
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v5
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Setup uv
26 |       uses: astral-sh/setup-uv@v6
27 |     - name: Install deps
28 |       run: |
29 |         uv sync --only-group tests
30 |         uv pip install coveralls
31 |     - uses: astral-sh/ruff-action@v3
32 |       with:
33 |         args: check
34 |     - name: Run tests
35 |       env:
36 |         GITHUB_TOKEN: ${{ secrets.github_token }}
37 |       run: |
38 |         uv run coverage run -m pytest
39 |         uv run coveralls --service=github
40 | 


--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
 1 | target-version = "py311"
 2 | line-length = 120
 3 | exclude = [
 4 | ]
 5 | 
 6 | [format]
 7 | quote-style = "single"
 8 | exclude = []
 9 | 
10 | [lint]
11 | select = [
12 |     "B",  # possible bugs
13 |     "BLE",  # broad exception
14 |     "C4",  # comprehensions
15 |     "DTZ",  # work with datetimes
16 |     "E",  # code style
17 |     "ERA",  # commented code
18 |     "EXE",  # check executables
19 |     "F",  # misc
20 |     "FA",  # future annotations
21 |     "FBT",  # booleans
22 |     "FURB",  # modernizing
23 |     "G",  # logging format
24 |     "I",  # imports
25 |     "ICN",  # import conventions
26 |     "INT",  # i18n
27 |     "ISC",  # stringc concat
28 |     "PERF",  # perfomance
29 |     "PIE",  # misc
30 |     "PLC",  # misc
31 |     "PLE",  # misc err
32 |     "PT",  # pytest
33 |     "PTH",  # pathlib
34 |     "PYI",  # typing
35 |     "RSE", # exc raise
36 |     "RUF",  # misc
37 |     "SLOT",  # slots related
38 |     "TC",  # typing
39 |     "UP",  # py upgrade
40 | ]
41 | 
42 | ignore = []
43 | 
44 | 
45 | [lint.extend-per-file-ignores]
46 | "tests/*" = []
 47 | "src/webinardump/utils.py" = [
48 |     "G004",
49 | ]
 50 | "src/webinardump/cli.py" = [
51 |     "RUF001",
52 | ]
53 | 


--------------------------------------------------------------------------------
/src/webinardump/cli.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import logging
 3 | from pathlib import Path
 4 | 
 5 | from .dumpers import Dumper
 6 | 
 7 | 
 8 | def get_user_input(param: str, hint: str, *, choices: list[str] | None = None) -> str:
 9 | 
10 |     choices = set(choices or [])
11 | 
12 |     while True:
13 |         data = input(f'{hint}: ')
14 |         data = data.strip()
15 |         if not data or (choices and data not in choices):
16 |             continue
17 | 
18 |         return data
19 | 
20 | 
def main():
    """Command line entry point.

    Parses arguments, sets up logging, lets a user choose one of the registered dumpers
    and runs it, asking for the dumper parameters interactively.
    """
    parser = argparse.ArgumentParser(prog='webinardump')
    parser.add_argument('-t', '--target', type=Path, default=Path(), help='Directory to dump to')
    parser.add_argument('--timeout', type=int, default=3, help='Request timeout')
    parser.add_argument('--rmax', type=int, default=10, help='Max concurrent requests number')
    parser.add_argument('--debug', help='Show debug information', action='store_true')
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level, format='%(levelname)-8s: %(message)s')

    print('Available dumpers:')

    dumper_choices = []
    for number, dumper_cls in enumerate(Dumper.registry, start=1):
        print(f'{number} — {dumper_cls.title}')
        dumper_choices.append(str(number))

    chosen = get_user_input('', 'Select dumper number', choices=dumper_choices)

    # Shown numbers are 1-based.
    dumper_cls = Dumper.registry[int(chosen) - 1]
    dumper = dumper_cls(
        target_dir=args.target,
        timeout=args.timeout,
        concurrent=args.rmax,
    )
    dumper.run(get_user_input)
47 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "webinardump"
 3 | dynamic = ["version"]
 4 | description = "Make local backup copies of webinars"
 5 | authors = [
 6 |     { name = "Igor Starikov", email = "idlesign@yandex.ru" }
 7 | ]
 8 | readme = "README.md"
 9 | license = "BSD-3-Clause"
10 | license-files = ["LICENSE"]
11 | requires-python = ">=3.11"
12 | keywords = ["backup", "webinars"]
13 | dependencies = [
14 |     "requests>=2.31.0",
15 | ]
16 | 
17 | [project.urls]
18 | Homepage = "https://github.com/idlesign/webinardump"
19 | 
20 | [project.scripts]
21 | webinardump = "webinardump.cli:main"
22 | 
23 | [dependency-groups]
24 | dev = [
25 |     {include-group = "linters"},
26 |     {include-group = "tests"},
27 | ]
28 | linters = [
29 | #    "ruff",
30 | ]
31 | tests = [
32 |     "pytest",
33 |     "pytest-responsemock",
34 |     "pytest-datafixtures",
35 | ]
36 | 
37 | [build-system]
38 | requires = ["hatchling"]
39 | build-backend = "hatchling.build"
40 | 
41 | [tool.hatch.version]
42 | path = "src/webinardump/__init__.py"
43 | 
44 | [tool.hatch.build.targets.wheel]
45 | packages = ["src/webinardump"]
46 | 
47 | [tool.hatch.build.targets.sdist]
48 | packages = ["src/"]
49 | 
50 | [tool.pytest.ini_options]
51 | testpaths = [
52 |     "tests",
53 | ]
54 | 
55 | [tool.coverage.run]
56 | source = [
57 |     "src/",
58 | ]
59 | omit = [
60 |     "*/cli.py",
61 | ]
62 | 
63 | [tool.coverage.report]
64 | fail_under = 99.00
65 | exclude_also = [
66 |     "raise NotImplementedError",
67 |     "if TYPE_CHECKING:",
68 | ]
69 | 
70 | [tool.tox]
71 | skip_missing_interpreters = true
72 | env_list = [
73 |   "py311",
74 |   "py312",
75 |   "py313",
76 | ]
77 | 
78 | [tool.tox.env_run_base]
79 | dependency_groups = ["tests"]
80 | commands = [
81 |   ["pytest", { replace = "posargs", default = ["tests"], extend = true }],
82 | ]
83 | 


--------------------------------------------------------------------------------
/tests/test_basic.py:
--------------------------------------------------------------------------------
 1 | from webinardump.dumpers import WebinarRu, YandexDisk
 2 | 
# Commands the tests below expect the `mock_call` fixture to have recorded.
CALLS = [
    'ffmpeg -y -f concat -i all_chunks.txt -c copy -bsf:a aac_adtstoasc all_chunks.mp4'
]
 6 | 
 7 | 
 8 | def test_yadisk(response_mock, tmp_path, datafix_read, datafix_readbin, mock_call):
 9 |     data_manifest = datafix_read('manifest_yadisk.html')
10 |     data_m3u = datafix_read('vid.m3u')
11 |     data_ts = datafix_readbin('empty.ts')
12 | 
13 |     with response_mock([
14 |         f'GET https://disk.yandex.ru/i/xxx -> 200:{data_manifest}',
15 |         f'GET https://here/there.m3u8 -> 200:{data_m3u}',
16 |         b'GET https://here/1.ts?some=other1 -> 200:' + data_ts,
17 |         b'GET https://here/2.ts?some=other2 -> 200:' + data_ts,
18 |     ]):
19 |         fpath = YandexDisk(target_dir=tmp_path).run({
20 |             'url_video': 'https://disk.yandex.ru/i/xxx',
21 |         })
22 |         assert fpath
23 |         assert mock_call == CALLS
24 | 
25 | 
def test_webinarru(response_mock, tmp_path, datafix_read, datafix_readbin, mock_call):
    """webinar.ru dump: the manifest, the given playlist and both chunks are requested."""
    data_manifest = datafix_read('manifest_webinarru.json')
    data_m3u = datafix_read('vid.m3u')
    data_ts = datafix_readbin('empty.ts')

    url_manifest = 'https://events.webinar.ru/api/eventsessions/aaa/record/isviewable?recordAccessToken=bbb'
    url_playlist = 'https://here/there.m3u8'

    rules = [
        f'GET {url_manifest} -> 200:{data_manifest}',
        f'GET {url_playlist} -> 200:{data_m3u}',
        b'GET https://here/1.ts?some=other1 -> 200:' + data_ts,
        b'GET https://here/2.ts?some=other2 -> 200:' + data_ts,
    ]

    with response_mock(rules):
        dumper = WebinarRu(target_dir=tmp_path)
        fpath = dumper.run({
            # The leading space is intentional.
            'url_video': ' https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb',
            'url_playlist': url_playlist,
        })

        assert fpath
        assert mock_call == CALLS
45 | 


--------------------------------------------------------------------------------
/src/webinardump/dumpers/webinarru.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | from typing import ClassVar
 3 | 
 4 | from ..utils import LOGGER
 5 | from .base import Dumper
 6 | 
 7 | 
 8 | class WebinarRu(Dumper):
 9 | 
10 |     title = 'webinar.ru'
11 | 
12 |     _user_input_map: ClassVar[dict[str, str]] = {
13 |         'url_video': 'Video URL (with `record-new/`)',
14 |         'url_playlist': 'Video chunk list URL (with `chunklist.m3u8`)',
15 |     }
16 | 
17 |     _headers: ClassVar[dict[str, str]] = {
18 |         **Dumper._headers,
19 |         'Origin': 'https://events.webinar.ru',
20 |     }
21 | 
22 |     def _gather(self, *, url_video: str, start_chunk: str = '', url_playlist: str = '', **params) -> Path:
23 |         """Runs video dump.
24 | 
25 |         :param url_video: Video URL. Hint: has record-new/
26 |         :param url_playlist: Video chunk list URL. Hint: ends with chunklist.m3u8
27 |         :param start_chunk: Optional chunk name to continue download from.
28 |         """
29 |         assert url_playlist, 'Playlist URL must be specified'
30 | 
31 |         assert 'record-new/' in url_video, (
32 |             'Unexpected video URL format\n'
33 |             f'Given:    {url_video}.\n'
34 |             f'Expected: https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb')
35 | 
36 |         _, _, tail = url_video.partition('record-new/')
37 |         session_id, _, video_id = tail.partition('/')
38 | 
39 |         LOGGER.info('Getting manifest ...')
40 | 
41 |         manifest = self._get_response_simple(
42 |             f'https://events.webinar.ru/api/eventsessions/{session_id}/record/isviewable?recordAccessToken={video_id}',
43 |             json=True
44 |         )
45 | 
46 |         return self._video_dump(
47 |             title=manifest['name'],
48 |             url_playlist=url_playlist,
49 |             url_referer=url_video,
50 |             start_chunk=start_chunk,
51 |         )
52 | 


--------------------------------------------------------------------------------
/src/webinardump/dumpers/yadisk.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import re
 3 | from pathlib import Path
 4 | from typing import ClassVar
 5 | 
 6 | from ..utils import LOGGER
 7 | from .base import Dumper
 8 | 
 9 | 
class YandexDisk(Dumper):
    """Dumper for videos published on Yandex.Disk."""

    title = 'Яндекс.Диск'

    _user_input_map: ClassVar[dict[str, str]] = {
        'url_video': 'Video URL (https://disk.yandex.ru/i/xxx)',
    }

    def _get_manifest(self, url: str) -> dict:
        """Extract the JSON manifest embedded into the video page.

        :param url: Video page URL.
        :return: Parsed contents of the `store-prefetch` script element.
        :raises AssertionError: If the page has no such element.
        """
        LOGGER.debug(f'Getting manifest from {url} ...')

        page = self._get_response_simple(url)
        found = re.findall(r'id="store-prefetch">([^<]+)</script', page)
        assert found, f'Manifest not found for {url}'

        return json.loads(found[0])

    def _get_playlist_and_title(self, manifest: dict) -> tuple[str, str]:
        """Pick the playlist of the highest numeric dimension from the first resource.

        :param manifest: Manifest as returned by `_get_manifest()`.
        :return: Playlist URL (`<none>` if no suitable stream is found) and the resource name.
        """
        resource = [*manifest['resources'].values()][0]

        best_dimension = 0
        best_url = '<none>'

        for stream in resource['videoStreams']['videos']:
            dimension = stream['dimension'].partition('p')[0]

            if dimension.isnumeric():  # skip e.g. 'adaptive'
                dimension = int(dimension)

                if dimension > best_dimension:
                    best_dimension, best_url = dimension, stream['url']

        return best_url, resource['name']

    def _gather(self, *, url_video: str, start_chunk: str = '', **params) -> Path:
        """Runs video dump.

        :param url_video: Video page URL.
        :param start_chunk: Optional chunk name to continue download from.
        :return: Path to the resulting video file.
        """
        url_playlist, title = self._get_playlist_and_title(self._get_manifest(url_video))

        return self._video_dump(
            title=title,
            url_playlist=url_playlist,
            url_referer=url_video,
            start_chunk=start_chunk,
        )
58 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # webinardump
  2 | 
  3 | <https://github.com/idlesign/webinardump>
  4 | 
  5 | [![PyPI - Version](https://img.shields.io/pypi/v/webinardump)](https://pypi.python.org/pypi/webinardump)
  6 | [![License](https://img.shields.io/pypi/l/webinardump)](https://pypi.python.org/pypi/webinardump)
  7 | [![Coverage](https://img.shields.io/coverallsCoverage/github/idlesign/webinardump)](https://coveralls.io/r/idlesign/webinardump)
  8 | 
  9 | ## Описание
 10 | 
 11 | *Приложение позволяет скачать запись вебинара и сохранить в виде .mp4 файла.*
 12 | 
 13 | 
 14 | ## Откуда качает
 15 | 
 16 | * Яндекс.Диск (записи стримов)
 17 | * webinar.ru
 18 | 
 19 | 
 20 | ## Зависимости
 21 | 
 22 | Что нужно иметь для запуска приложения и работы с ним.
 23 | 
 24 | * Linux (Unix)
 25 | * Python 3.11+
 26 | * ffmpeg (для Ubuntu: `sudo apt install ffmpeg`)
 27 | * uv (для установки и обновления приложения)
 28 | * Базовые знания о работе в браузере с отладочной консолью.
 29 | 
 30 | 
 31 | ## Установка и обновление
 32 | 
 33 | Производится при помощи приложения [uv](https://docs.astral.sh/uv/getting-started/installation/):
 34 | 
 35 | ```shell
 36 | $ uv tool install webinardump
 37 | ```
 38 | 
 39 | После этого запускать приложение можно командой
 40 | 
 41 | ```shell
 42 | $ webinardump
 43 | ```
 44 | 
 45 | Для обновления выполните
 46 | 
 47 | ```shell
 48 | $ uv tool upgrade webinardump
 49 | ```
 50 | 
 51 | ## Как использовать
 52 | 
 53 | Переместитесь в желаемый каталог и выполните следующую команду. 
 54 | 
 55 | ```shell
 56 | 
 57 | ; Указываем путь для скачивания - my_webinar_dir/
 58 | ; Указываем таймаут запросов - 10 секунд
 59 | ; Указываем максимальное количество одновременных запросов - 20
 60 | $ webinardump --target my_webinar_dir/ --timeout 10 --rmax 20
 61 | ```
 62 | Приложение скачает фрагменты вебинара, а потом соберёт из них единый файл.
 63 | 
 64 | 
 65 | ### disk.yandex.ru
 66 | 
 67 | 1. Взять ссылку на вебинар (запись стрима). Вида https://disk.yandex.ru/i/xxx
 68 | 2. Запустить скачиватель и скормить ему ссылку из предыдущего пункта.
 69 | 
 70 | 
 71 | ### webinar.ru
 72 | 
 73 | Процесс скачивания автоматизирован не полностью, потребуется искать
 74 | некоторые ссылки при помощи браузера.
 75 | 
 76 | 1. Взять ссылку на вебинар. Вида https://events.webinar.ru/event/xxx/yyy/zzz
 77 | 2. Открыть в браузере.
 78 | 3. Включить отладочную консоль (F12).
 79 | 4. Запустить воспроизведение.
 80 | 5. Отыскать ссылку с `record-new/` и запомнить её.
 81 | 6. Отыскать ссылку, оканчивающуюся на `chunklist.m3u8` и запомнить её.
 82 | 7. Запустить скачиватель и скормить ему ссылки из двух предыдущих пунктов.
 83 | 
 84 | ## Для разработки
 85 | 
 86 | При разработке используется [makeapp](https://pypi.org/project/makeapp/). Ставим:
 87 | 
 88 | ```shell
 89 | $ uv tool install makeapp
 90 | ```
 91 | 
 92 | После клонирования репозитория webinardump, в его директории выполняем:
 93 | 
 94 | ```shell
 95 | # ставим утилиты
 96 | $ ma tools
 97 | 
 98 | # инициализируем виртуальное окружение
 99 | $ ma up --tool
100 | 
101 | # теперь в окружении доступны зависимости и команда webinardump
102 | ```
103 | 
104 | Проверь стиль перед отправкой кода на обзор:
105 | 
106 | ```shell
107 | # проверяем стиль
108 | $ ma style
109 | ```
110 | 


--------------------------------------------------------------------------------
/src/webinardump/dumpers/base.py:
--------------------------------------------------------------------------------
  1 | import shutil
  2 | from collections.abc import Callable
  3 | from concurrent.futures import ThreadPoolExecutor, as_completed
  4 | from contextlib import chdir
  5 | from pathlib import Path
  6 | from random import choice
  7 | from threading import Lock
  8 | from time import sleep
  9 | from typing import ClassVar
 10 | 
 11 | import requests
 12 | from requests import Session
 13 | from requests.adapters import HTTPAdapter, Retry
 14 | 
 15 | from ..utils import LOGGER, call, get_files_sorted
 16 | 
 17 | 
 18 | class Dumper:
 19 | 
 20 |     title: str = ''
 21 | 
 22 |     _user_input_map: ClassVar[dict[str, str]]
 23 | 
 24 |     _headers: ClassVar[dict[str, str]] = {
 25 |         'Connection': 'keep-alive',
 26 |         'Accept': '*/*',
 27 |         'User-Agent': (
 28 |             'Mozilla/5.0 (X11; Linux x86_64) '
 29 |             'AppleWebKit/537.36 (KHTML, like Gecko) '
 30 |             'Chrome/79.0.3945.136 YaBrowser/20.2.3.320 (beta) Yowser/2.5 Safari/537.36'
 31 |         ),
 32 |         'Sec-Fetch-Site': 'same-site',
 33 |         'Sec-Fetch-Mode': 'cors',
 34 |         'Accept-Language': 'ru,en;q=0.9',
 35 |         'Accept-Encoding': 'gzip, deflate, sdch, br',
 36 |     }
 37 | 
 38 |     _media_ext: ClassVar[set[str]] = {'.ts', '.m4s'}
 39 | 
 40 |     registry: ClassVar[list[type['Dumper']]] = []
 41 | 
 42 |     def __init_subclass__(cls):
 43 |         super().__init_subclass__()
 44 |         cls.registry.append(cls)
 45 | 
 46 |     def __init__(self, *, target_dir: Path, timeout: int = 3, concurrent: int = 10, sleepy: bool = False) -> None:
 47 |         self._target_dir = target_dir
 48 |         self._timeout = timeout
 49 |         self._concurrent = concurrent
 50 |         self._user_input_map = self._user_input_map or {}
 51 |         self._session = self._get_session()
 52 |         self._sleepy = sleepy
 53 | 
 54 |     def __str__(self):
 55 |         return self.title
 56 | 
 57 |     def _get_session(self) -> Session:
 58 |         # todo при ошибках сессия в нитях блокируется. можно попробовать несколько сессий
 59 |         session = requests.Session()
 60 |         session.headers = self._headers
 61 |         retries = Retry(total=3, backoff_factor=0.1, status_forcelist=[500])
 62 |         session.mount('http://', HTTPAdapter(max_retries=retries))
 63 |         session.mount('https://', HTTPAdapter(max_retries=retries))
 64 |         return session
 65 | 
 66 |     def _get_args(self, *, get_param_hook: Callable[[str, str], str]) -> dict:
 67 |         input_data = {}
 68 | 
 69 |         for param, hint in self._user_input_map.items():
 70 |             input_data[param] = get_param_hook(param, hint)
 71 | 
 72 |         return input_data
 73 | 
 74 |     def _chunks_get_list(self, url: str, *, url_prefix: str = '') -> list[str]:
 75 |         """Get video chunks names from playlist file at URL.
 76 | 
 77 |         * Links to other playlists:
 78 |             #EXTM3U
 79 |             #EXT-X-VERSION:7
 80 |             #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",AUTOSELECT=YES,NAME="xxx",URI="a1/index.m3u8"
 81 |             #EXT-X-STREAM-INF:BANDWIDTH=5205192,RESOLUTION=1920x1080,CODECS="avc1.42C02A,mp4a.40.2",AUDIO="audio"
 82 |             v1/index.m3u8
 83 | 
 84 |         * Links to chunks:
 85 |             #EXTM3U
 86 |             #EXT-X-VERSION:6
 87 |             #EXT-X-MEDIA-SEQUENCE:1
 88 |             #EXT-X-INDEPENDENT-SEGMENTS
 89 |             #EXT-X-TARGETDURATION:12
 90 |             #EXT-X-MAP:URI="init/1.m4s"
 91 |             #EXTINF:2.000,
 92 |             media/1.m4s
 93 |             #EXT-X-DISCONTINUITY
 94 | 
 95 |         :param url: File URL.
 96 |         :param url_prefix: File URL prefix.
 97 | 
 98 |         """
 99 |         LOGGER.info(f'Getting video chunks from playlist {url} ...')
100 | 
101 |         sub_playlist = self._get_response_simple(url)
102 |         playlists = []
103 |         chunk_lists = []
104 |         media_ext = self._media_ext
105 | 
106 |         for line in sub_playlist.splitlines():
107 |             line = line.strip()
108 | 
109 |             if 'EXT-X-MAP:URI' in line:
110 |                 # naive parsing. todo: respect EXT-X-DISCONTINUITY and EXT-X-MEDIA:TYPE=AUDIO,...,AUTOSELECT=YES
111 |                 line = line.rpartition("=")[2].strip('"')
112 | 
113 |             if line.endswith('.m3u8'):
114 |                 playlists.append(line)
115 |                 continue
116 | 
117 |             path = Path(line.partition('?')[0])
118 |             if path.suffix not in media_ext:
119 |                 continue
120 | 
121 |             if url_prefix:
122 |                 line = f'{url_prefix}/{line}'
123 | 
124 |             chunk_lists.append(line)
125 | 
126 |         if playlists:
127 |             LOGGER.info('Sub playlists found. Will use the first one ...')
128 | 
129 |             sub_playlist = playlists[0]
130 |             sub_playlist_url_prefix = sub_playlist.rpartition('/')[0]
131 |             sub_playlist_url = f'{url.rpartition("/")[0]}/{sub_playlist}'
132 | 
133 |             chunk_lists = self._chunks_get_list(sub_playlist_url, url_prefix=sub_playlist_url_prefix)
134 | 
135 |         else:
136 |             assert chunk_lists, 'No video chunks found in playlist file'
137 | 
138 |         return chunk_lists
139 | 
140 |     def _chunks_download(
141 |         self,
142 |         *,
143 |         url_video_root: str,
144 |         dump_dir: Path,
145 |         chunk_names: list[str],
146 |         start_chunk: str,
147 |         headers: dict[str, str] | None = None,
148 |         concurrent: int = 10,
149 |     ) -> None:
150 | 
151 |         chunks_total = len(chunk_names)
152 | 
153 |         progress_file = (dump_dir / 'files.txt')
154 |         progress_file.touch()
155 | 
156 |         files_done = dict.fromkeys(progress_file.read_text().splitlines())
157 |         lock = Lock()
158 | 
159 |         def dump(*, name: str, file_idx: int, url: str, session: Session, sleepy: bool, timeout: int) -> None:
160 | 
161 |             name = name.partition('?')[0]  # drop GET-args
162 | 
163 |             if name in files_done:
164 |                 LOGGER.info(f'File {name} has already been downloaded before. Skipping.')
165 |                 return
166 | 
167 |             filename = name.rpartition('/')[2]  # drop url prefix
168 |             filename = f'{file_idx}_{filename}'
169 | 
170 |             with session.get(url, headers=headers or {}, stream=True, timeout=timeout) as r:
171 |                 r.raise_for_status()
172 |                 with (dump_dir / filename).open('wb') as f:
173 |                     f.writelines(r.iter_content(chunk_size=8192))
174 | 
175 |             files_done[name] = True
176 |             with lock:
177 |                 progress_file.write_text('\n'.join(files_done))
178 | 
179 |             if sleepy:
180 |                 sleep(choice([1, 0.5, 0.7, 0.6]))
181 | 
182 |         with ThreadPoolExecutor(max_workers=concurrent) as executor:
183 | 
184 |             future_url_map = {}
185 | 
186 |             for idx, chunk_name in enumerate(chunk_names, 1):
187 | 
188 |                 if chunk_name == start_chunk:
189 |                     start_chunk = ''  # clear to allow further download
190 | 
191 |                 if start_chunk:
192 |                     continue
193 | 
194 |                 chunk_url = f'{url_video_root.rstrip("/")}/{chunk_name}'
195 |                 submitted = executor.submit(
196 |                     dump,
197 |                     name=chunk_name,
198 |                     file_idx=idx,
199 |                     url=chunk_url,
200 |                     session=self._session,
201 |                     sleepy=self._sleepy,
202 |                     timeout=self._timeout,
203 |                 )
204 | 
205 |                 future_url_map[submitted] = (chunk_name, chunk_url)
206 | 
207 |             if future_url_map:
208 |                 LOGGER.info(f'Downloading up to {concurrent} files concurrently ...')
209 | 
210 |                 counter = 1
211 |                 for future in as_completed(future_url_map):
212 |                     chunk_name, chunk_url = future_url_map[future]
213 |                     future.result()
214 |                     percent = round(counter * 100 / chunks_total, 1)
215 |                     counter += 1
216 |                     LOGGER.info(f'Got {counter}/{chunks_total} ({chunk_name.partition("?")[0]}) [{percent}%] ...')
217 | 
218 |     def _video_concat(self, path: Path) -> Path:
219 | 
220 |         LOGGER.info('Concatenating video ...')
221 | 
222 |         fname_video = 'all_chunks.mp4'
223 |         fname_index = 'all_chunks.txt'
224 | 
225 |         mode_m4s = False
226 | 
227 |         filenames = get_files_sorted(path, suffixes=self._media_ext)
228 | 
229 |         for filename in filenames:
230 |             if filename.endswith('m4s'):
231 |                 mode_m4s = True
232 |                 break
233 | 
234 |         def create_index(line_tpl: str = '%s'):
235 |             with (path / fname_index).open('w') as f:
236 |                 f.writelines([f'{line_tpl % fname}\n' for fname in filenames])
237 | 
238 |         if mode_m4s:
239 |             fname_raw = 'all_chunks.mp4'
240 |             create_index()
241 |             call(f'xargs cat < {fname_index} >> {fname_raw}', path=path)
242 |             call(f'ffmpeg -y -i {fname_raw} -c copy {fname_video}', path=path)
243 | 
244 |         else:
245 |             # presumably ts
246 |             create_index('file %s')
247 |             call(f'ffmpeg -y -f concat -i {fname_index} -c copy -bsf:a aac_adtstoasc {fname_video}', path=path)
248 | 
249 |         return path / fname_video
250 | 
251 |     def _get_response_simple(self, url: str, *, json: bool = False) -> str | dict:
252 |         """Returns a text or a dictionary from a URL.
253 | 
254 |         :param url:
255 |         :param json:
256 | 
257 |         """
258 |         response = self._session.get(url)
259 |         response.raise_for_status()
260 | 
261 |         if json:
262 |             return response.json()
263 | 
264 |         return response.text
265 | 
266 |     def _video_dump(
267 |         self,
268 |         *,
269 |         title: str,
270 |         url_playlist: str,
271 |         url_referer: str,
272 |         start_chunk: str = '',
273 |     ) -> Path:
274 |         assert url_playlist.endswith('m3u8'), f'No playlist in `{url_playlist}`'
275 | 
276 |         LOGGER.info(f'Title: {title}')
277 | 
278 |         chunk_names = self._chunks_get_list(url_playlist)
279 | 
280 |         target_dir = self._target_dir
281 |         LOGGER.info(f'Downloading video into {target_dir} ...')
282 | 
283 |         with chdir(target_dir):
284 |             dump_dir = (target_dir / title).absolute()
285 |             dump_dir.mkdir(parents=True, exist_ok=True)
286 | 
287 |             url_root = url_playlist.rpartition('/')[0]  # strip playlist filename
288 | 
289 |             self._chunks_download(
290 |                 url_video_root=url_root,
291 |                 dump_dir=dump_dir,
292 |                 chunk_names=chunk_names,
293 |                 start_chunk=start_chunk,
294 |                 headers={'Referer': url_referer.strip()},
295 |                 concurrent=self._concurrent,
296 |             )
297 | 
298 |             fpath_video_target = Path(f'{title}.mp4').absolute()
299 |             fpath_video = self._video_concat(dump_dir)
300 | 
301 |             shutil.move(fpath_video, fpath_video_target)
302 |             shutil.rmtree(dump_dir, ignore_errors=True)
303 | 
304 |         LOGGER.info(f'Video is ready: {fpath_video_target}')
305 |         return fpath_video_target
306 | 
307 |     def _gather(self, *, url_video: str, start_chunk: str = '', **params) -> Path:
308 |         raise NotImplementedError
309 | 
310 |     def run(self, params_or_hook: Callable[[str, str], str] | dict[str, str]) -> Path:
311 |         params = params_or_hook if isinstance(params_or_hook, dict) else self._get_args(get_param_hook=params_or_hook)
312 |         return self._gather(**params)
313 | 


--------------------------------------------------------------------------------