├── .gitignore
├── src
│   └── webinardump
│       ├── __init__.py
│       ├── dumpers
│       │   ├── __init__.py
│       │   ├── webinarru.py
│       │   ├── yadisk.py
│       │   └── base.py
│       ├── utils.py
│       └── cli.py
├── tests
│   ├── datafixtures
│   │   ├── manifest_webinarru.json
│   │   ├── vid.m3u
│   │   ├── empty.ts
│   │   └── manifest_yadisk.html
│   ├── conftest.py
│   ├── test_utils.py
│   └── test_basic.py
├── CHANGELOG.md
├── AUTHORS
├── tools
│   └── debug.py
├── .github
│   └── workflows
│       └── python-package.yml
├── ruff.toml
├── pyproject.toml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | dump/
2 |
--------------------------------------------------------------------------------
/src/webinardump/__init__.py:
--------------------------------------------------------------------------------
1 | VERSION = '0.1.1'
--------------------------------------------------------------------------------
/tests/datafixtures/manifest_webinarru.json:
--------------------------------------------------------------------------------
1 | {"name": "yatst"}
--------------------------------------------------------------------------------
/tests/datafixtures/vid.m3u:
--------------------------------------------------------------------------------
1 | 1.ts?some=other1
2 | 2.ts?some=other2
3 |
--------------------------------------------------------------------------------
/tests/datafixtures/empty.ts:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idlesign/webinardump/HEAD/tests/datafixtures/empty.ts
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # webinardump changelog
2 |
3 | ### v0.1.1 [2025-11-15]
4 | * ** Minor fixes.
5 |
6 | ### v0.1.0 [2020-07-30]
7 | * ++ Basic functionality.
--------------------------------------------------------------------------------
/tests/datafixtures/manifest_yadisk.html:
--------------------------------------------------------------------------------
1 | <script id="store-prefetch">{"resources": {"res1": {"name": "yatst", "videoStreams": {"videos": [{"dimension": "720p", "url": "https://here/there.m3u8"}]}}}}</script>
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | webinardump Authors
2 | ===================
3 |
4 | Created by Igor `idle sign` Starikov.
5 |
6 |
7 | Contributors
8 | ------------
9 |
10 | KarenKing
11 |
--------------------------------------------------------------------------------
/src/webinardump/dumpers/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import Dumper
2 | from .webinarru import WebinarRu
3 | from .yadisk import YandexDisk
4 |
5 | __all__ = [
6 | 'Dumper',
7 | 'WebinarRu',
8 | 'YandexDisk',
9 | ]
10 |
--------------------------------------------------------------------------------
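
Note that importing this package is what fills `Dumper.registry`: `Dumper.__init_subclass__` (see `base.py`) appends every subclass as soon as its module is imported above. A quick sketch, assuming the package is installed:

```python
# Not part of the package: just shows that the registry is populated on import.
from webinardump.dumpers import Dumper, WebinarRu, YandexDisk

assert WebinarRu in Dumper.registry
assert YandexDisk in Dumper.registry

for idx, dumper_cls in enumerate(Dumper.registry, 1):
    print(f'{idx}: {dumper_cls.title}')
```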
/tools/debug.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 |
4 | from webinardump.dumpers import YandexDisk
5 |
6 | logging.basicConfig(level=logging.INFO, format='%(levelname)-8s: %(message)s')
7 |
8 |
9 | dumper = YandexDisk(target_dir=Path('../tools/dumped/'))
10 |
11 | dumper.run({
12 | 'url_video': 'https://disk.yandex.ru/i/xxx',
13 | })
14 |
--------------------------------------------------------------------------------
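
A similar throwaway script could exercise the webinar.ru dumper, which additionally needs a chunk-list URL; a hypothetical variant (all URLs below are placeholders):

```python
# Hypothetical debug snippet for the webinar.ru dumper; URLs are placeholders.
import logging
from pathlib import Path

from webinardump.dumpers import WebinarRu

logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s: %(message)s')

dumper = WebinarRu(target_dir=Path('dumped/'))

dumper.run({
    'url_video': 'https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb',
    'url_playlist': 'https://here/there/chunklist.m3u8',
})
```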
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture
7 | def mock_call(monkeypatch):
8 | calls = []
9 |
10 | def mock_call(cmd, **kwargs):
11 | if 'ffmpeg' in cmd:
12 | Path('yatst/all_chunks.mp4').write_bytes(b'')
13 |
14 | calls.append(cmd)
15 |
16 | monkeypatch.setattr("webinardump.utils.check_call", mock_call)
17 |
18 | return calls
19 |
--------------------------------------------------------------------------------
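
The fixture swaps out `check_call`, which `webinardump.utils.call` resolves at call time, so tests only record shell commands and fake the ffmpeg output instead of running anything. A hypothetical standalone test, just to show how the fixture is consumed:

```python
# Hypothetical test (not in the repo) using the mock_call fixture.
from pathlib import Path

from webinardump.utils import call


def test_commands_recorded(mock_call, tmp_path):
    call('echo hi', path=tmp_path)   # goes through the patched check_call
    assert mock_call == ['echo hi']  # the command was recorded, not executed
```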
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from webinardump.utils import get_files_sorted
2 |
3 |
4 | def test_get_files_sorted(tmp_path):
5 |
6 | (tmp_path / '1.a').touch()
7 | (tmp_path / '01.a').touch()
8 | (tmp_path / '02.a').touch()
9 | (tmp_path / '9.a').touch()
10 | (tmp_path / '1.b').touch()
11 | (tmp_path / '10.a').touch()
12 | (tmp_path / '11.a').touch()
13 |
14 | fnames = get_files_sorted(tmp_path, suffixes={'.a'})
15 | assert fnames == ['01.a', '1.a', '02.a', '9.a', '10.a', '11.a']
--------------------------------------------------------------------------------
/src/webinardump/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from pathlib import Path
4 | from subprocess import check_call
5 |
6 | LOGGER = logging.getLogger('webinardump')
7 | RE_DIGITS = re.compile(r'(\d+)')
8 |
9 |
10 |
11 | def call(cmd: str, *, path: Path):
12 | return check_call(cmd, cwd=path, shell=True)
13 |
14 |
15 | def get_files_sorted(path: Path, *, suffixes: set[str]) -> list[str]:
16 | def natural(text):
17 | return [int(char) if char.isdigit() else char for char in RE_DIGITS.split(text)]
18 |
19 | files = [file.name for file in path.iterdir() if file.is_file() and file.suffix in suffixes]
20 | files.sort(key=natural)
21 |
22 | return files
23 |
--------------------------------------------------------------------------------
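
`get_files_sorted` orders file names naturally: the digit runs are compared as integers, so chunk `9` sorts before chunk `10`. A small demo sketch (the file names here are made up):

```python
# Demo of the natural ordering; a plain lexicographic sort would put '10.ts' second.
from pathlib import Path
from tempfile import TemporaryDirectory

from webinardump.utils import get_files_sorted

with TemporaryDirectory() as tmp:
    path = Path(tmp)
    for name in ('10.ts', '2.ts', '9.ts', '1.ts', 'note.txt'):
        (path / name).touch()

    print(get_files_sorted(path, suffixes={'.ts'}))  # ['1.ts', '2.ts', '9.ts', '10.ts']
```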
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | name: Python package
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 | workflow_dispatch:
9 |
10 | jobs:
11 | build:
12 |
13 | runs-on: ubuntu-latest
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | python-version: [3.11, 3.12, 3.13]
18 |
19 | steps:
20 | - uses: actions/checkout@v4
21 | - name: Set up Python ${{ matrix.python-version }}
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: ${{ matrix.python-version }}
25 | - name: Setup uv
26 | uses: astral-sh/setup-uv@v6
27 | - name: Install deps
28 | run: |
29 | uv sync --only-group tests
30 | uv pip install coveralls
31 | - uses: astral-sh/ruff-action@v3
32 | with:
33 | args: check
34 | - name: Run tests
35 | env:
36 | GITHUB_TOKEN: ${{ secrets.github_token }}
37 | run: |
38 | uv run coverage run -m pytest
39 | uv run coveralls --service=github
40 |
--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
1 | target-version = "py311"
2 | line-length = 120
3 | exclude = [
4 | ]
5 |
6 | [format]
7 | quote-style = "single"
8 | exclude = []
9 |
10 | [lint]
11 | select = [
12 | "B", # possible bugs
13 | "BLE", # broad exception
14 | "C4", # comprehensions
15 | "DTZ", # work with datetimes
16 | "E", # code style
17 | "ERA", # commented code
18 | "EXE", # check executables
19 | "F", # misc
20 | "FA", # future annotations
21 | "FBT", # booleans
22 | "FURB", # modernizing
23 | "G", # logging format
24 | "I", # imports
25 | "ICN", # import conventions
26 | "INT", # i18n
27 | "ISC", # stringc concat
28 | "PERF", # perfomance
29 | "PIE", # misc
30 | "PLC", # misc
31 | "PLE", # misc err
32 | "PT", # pytest
33 | "PTH", # pathlib
34 | "PYI", # typing
35 | "RSE", # exc raise
36 | "RUF", # misc
37 | "SLOT", # slots related
38 | "TC", # typing
39 | "UP", # py upgrade
40 | ]
41 |
42 | ignore = []
43 |
44 |
45 | [lint.extend-per-file-ignores]
46 | "tests/*" = []
47 | "src/sponsrdump/utils.py" = [
48 | "G004",
49 | ]
50 | "src/sponsrdump/cli.py" = [
51 | "RUF001",
52 | ]
53 |
--------------------------------------------------------------------------------
/src/webinardump/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from pathlib import Path
4 |
5 | from .dumpers import Dumper
6 |
7 |
8 | def get_user_input(param: str, hint: str, *, choices: list[str] | None = None) -> str:
9 |
10 | choices = set(choices or [])
11 |
12 | while True:
13 | data = input(f'{hint}: ')
14 | data = data.strip()
15 | if not data or (choices and data not in choices):
16 | continue
17 |
18 | return data
19 |
20 |
21 | def main():
22 | parser = argparse.ArgumentParser(prog='webinardump')
23 | parser.add_argument('-t', '--target', type=Path, default=Path(), help='Directory to dump to')
24 | parser.add_argument('--timeout', type=int, default=3, help='Request timeout (seconds)')
25 | parser.add_argument('--rmax', type=int, default=10, help='Maximum number of concurrent requests')
26 | parser.add_argument('--debug', help='Show debug information', action='store_true')
27 |
28 | args = parser.parse_args()
29 |
30 | logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, format='%(levelname)-8s: %(message)s')
31 |
32 | dumper_choices = []
33 | print('Available dumpers:')
34 |
35 | for idx, dumper in enumerate(Dumper.registry, 1):
36 | print(f'{idx} — {dumper.title}')
37 | dumper_choices.append(f'{idx}')
38 |
39 | chosen = get_user_input('', 'Select dumper number', choices=dumper_choices)
40 |
41 | dumper = Dumper.registry[int(chosen)-1](
42 | target_dir=args.target,
43 | timeout=args.timeout,
44 | concurrent=args.rmax,
45 | )
46 | dumper.run(get_user_input)
47 |
--------------------------------------------------------------------------------
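
The CLI passes `get_user_input` into `Dumper.run()` as a hook; the same dumpers can also be driven without prompts by handing `run()` a parameters dict, which mirrors the options above. A sketch with placeholder values:

```python
# Non-interactive use of a dumper, bypassing the CLI prompts; the URL is a placeholder.
from pathlib import Path

from webinardump.dumpers import YandexDisk

dumper = YandexDisk(target_dir=Path('my_webinar_dir/'), timeout=10, concurrent=20)
fpath = dumper.run({'url_video': 'https://disk.yandex.ru/i/xxx'})  # path of the resulting .mp4
print(fpath)
```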
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "webinardump"
3 | dynamic = ["version"]
4 | description = "Make local backup copies of webinars"
5 | authors = [
6 | { name = "Igor Starikov", email = "idlesign@yandex.ru" }
7 | ]
8 | readme = "README.md"
9 | license = "BSD-3-Clause"
10 | license-files = ["LICENSE"]
11 | requires-python = ">=3.11"
12 | keywords = ["backup", "webinars"]
13 | dependencies = [
14 | "requests>=2.31.0",
15 | ]
16 |
17 | [project.urls]
18 | Homepage = "https://github.com/idlesign/webinardump"
19 |
20 | [project.scripts]
21 | webinardump = "webinardump.cli:main"
22 |
23 | [dependency-groups]
24 | dev = [
25 | {include-group = "linters"},
26 | {include-group = "tests"},
27 | ]
28 | linters = [
29 | # "ruff",
30 | ]
31 | tests = [
32 | "pytest",
33 | "pytest-responsemock",
34 | "pytest-datafixtures",
35 | ]
36 |
37 | [build-system]
38 | requires = ["hatchling"]
39 | build-backend = "hatchling.build"
40 |
41 | [tool.hatch.version]
42 | path = "src/webinardump/__init__.py"
43 |
44 | [tool.hatch.build.targets.wheel]
45 | packages = ["src/webinardump"]
46 |
47 | [tool.hatch.build.targets.sdist]
48 | packages = ["src/"]
49 |
50 | [tool.pytest.ini_options]
51 | testpaths = [
52 | "tests",
53 | ]
54 |
55 | [tool.coverage.run]
56 | source = [
57 | "src/",
58 | ]
59 | omit = [
60 | "*/cli.py",
61 | ]
62 |
63 | [tool.coverage.report]
64 | fail_under = 99.00
65 | exclude_also = [
66 | "raise NotImplementedError",
67 | "if TYPE_CHECKING:",
68 | ]
69 |
70 | [tool.tox]
71 | skip_missing_interpreters = true
72 | env_list = [
73 | "py311",
74 | "py312",
75 | "py313",
76 | ]
77 |
78 | [tool.tox.env_run_base]
79 | dependency_groups = ["tests"]
80 | commands = [
81 | ["pytest", { replace = "posargs", default = ["tests"], extend = true }],
82 | ]
83 |
--------------------------------------------------------------------------------
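
The version is declared `dynamic`, and hatch reads it from `src/webinardump/__init__.py`; the same value is importable at runtime. A tiny check, assuming the package is installed:

```python
# The single source of truth for the version is webinardump.VERSION.
from webinardump import VERSION

print(VERSION)  # currently '0.1.1'
```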
/tests/test_basic.py:
--------------------------------------------------------------------------------
1 | from webinardump.dumpers import WebinarRu, YandexDisk
2 |
3 | CALLS = [
4 | 'ffmpeg -y -f concat -i all_chunks.txt -c copy -bsf:a aac_adtstoasc all_chunks.mp4'
5 | ]
6 |
7 |
8 | def test_yadisk(response_mock, tmp_path, datafix_read, datafix_readbin, mock_call):
9 | data_manifest = datafix_read('manifest_yadisk.html')
10 | data_m3u = datafix_read('vid.m3u')
11 | data_ts = datafix_readbin('empty.ts')
12 |
13 | with response_mock([
14 | f'GET https://disk.yandex.ru/i/xxx -> 200:{data_manifest}',
15 | f'GET https://here/there.m3u8 -> 200:{data_m3u}',
16 | b'GET https://here/1.ts?some=other1 -> 200:' + data_ts,
17 | b'GET https://here/2.ts?some=other2 -> 200:' + data_ts,
18 | ]):
19 | fpath = YandexDisk(target_dir=tmp_path).run({
20 | 'url_video': 'https://disk.yandex.ru/i/xxx',
21 | })
22 | assert fpath
23 | assert mock_call == CALLS
24 |
25 |
26 | def test_webinarru(response_mock, tmp_path, datafix_read, datafix_readbin, mock_call):
27 | data_manifest = datafix_read('manifest_webinarru.json')
28 | data_m3u = datafix_read('vid.m3u')
29 | data_ts = datafix_readbin('empty.ts')
30 |
31 | with response_mock([
32 | 'GET https://events.webinar.ru/api/eventsessions/aaa/record/isviewable?'
33 | f'recordAccessToken=bbb -> 200:{data_manifest}',
34 |
35 | f'GET https://here/there.m3u8 -> 200:{data_m3u}',
36 | b'GET https://here/1.ts?some=other1 -> 200:' + data_ts,
37 | b'GET https://here/2.ts?some=other2 -> 200:' + data_ts,
38 | ]):
39 | fpath = WebinarRu(target_dir=tmp_path).run({
40 | 'url_video': ' https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb',
41 | 'url_playlist': 'https://here/there.m3u8',
42 | })
43 | assert fpath
44 | assert mock_call == CALLS
45 |
--------------------------------------------------------------------------------
/src/webinardump/dumpers/webinarru.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import ClassVar
3 |
4 | from ..utils import LOGGER
5 | from .base import Dumper
6 |
7 |
8 | class WebinarRu(Dumper):
9 |
10 | title = 'webinar.ru'
11 |
12 | _user_input_map: ClassVar[dict[str, str]] = {
13 | 'url_video': 'Video URL (with `record-new/`)',
14 | 'url_playlist': 'Video chunk list URL (with `chunklist.m3u8`)',
15 | }
16 |
17 | _headers: ClassVar[dict[str, str]] = {
18 | **Dumper._headers,
19 | 'Origin': 'https://events.webinar.ru',
20 | }
21 |
22 | def _gather(self, *, url_video: str, start_chunk: str = '', url_playlist: str = '', **params) -> Path:
23 | """Runs video dump.
24 |
25 | :param url_video: Video URL. Hint: has record-new/
26 | :param url_playlist: Video chunk list URL. Hint: ends with chunklist.m3u8
27 | :param start_chunk: Optional chunk name to continue download from.
28 | """
29 | assert url_playlist, 'Playlist URL must be specified'
30 |
31 | assert 'record-new/' in url_video, (
32 | 'Unexpected video URL format\n'
33 | f'Given: {url_video}.\n'
34 | f'Expected: https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb')
35 |
36 | _, _, tail = url_video.partition('record-new/')
37 | session_id, _, video_id = tail.partition('/')
38 |
39 | LOGGER.info('Getting manifest ...')
40 |
41 | manifest = self._get_response_simple(
42 | f'https://events.webinar.ru/api/eventsessions/{session_id}/record/isviewable?recordAccessToken={video_id}',
43 | json=True
44 | )
45 |
46 | return self._video_dump(
47 | title=manifest['name'],
48 | url_playlist=url_playlist,
49 | url_referer=url_video,
50 | start_chunk=start_chunk,
51 | )
52 |
--------------------------------------------------------------------------------
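
The manifest URL is derived from the `record-new/` portion of the video URL: the part before the next slash is the session id and the part after it is the record access token. A standalone sketch of that parsing (the URL is a placeholder):

```python
# Standalone sketch of the URL parsing done in WebinarRu._gather.
url_video = 'https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb'

_, _, tail = url_video.partition('record-new/')
session_id, _, video_id = tail.partition('/')
assert (session_id, video_id) == ('aaa', 'bbb')

url_manifest = (
    f'https://events.webinar.ru/api/eventsessions/{session_id}'
    f'/record/isviewable?recordAccessToken={video_id}'
)
print(url_manifest)
```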
/src/webinardump/dumpers/yadisk.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from pathlib import Path
4 | from typing import ClassVar
5 |
6 | from ..utils import LOGGER
7 | from .base import Dumper
8 |
9 |
10 | class YandexDisk(Dumper):
11 |
12 | title = 'Яндекс.Диск'
13 |
14 | _user_input_map: ClassVar[dict[str, str]] = {
15 | 'url_video': 'Video URL (https://disk.yandex.ru/i/xxx)',
16 | }
17 |
18 | def _get_manifest(self, url: str) -> dict:
19 | LOGGER.debug(f'Getting manifest from {url} ...')
20 |
21 | contents = self._get_response_simple(url)
22 | manifest = re.findall(r'id="store-prefetch">([^<]+)</script', contents)[0]
23 |
24 | manifest = json.loads(manifest)
25 |
26 | return manifest
27 |
28 | def _get_playlist_and_title(self, manifest: dict) -> tuple[str, str]:
29 |
30 | resources = list(manifest['resources'].values())
31 | resource = resources[0]
32 |
33 | dimension_max = 0
34 | url_playlist = ''
35 |
36 | for stream_info in resource['videoStreams']['videos']:
37 | dimension, *_ = stream_info['dimension'].partition('p')
38 | if not dimension.isnumeric():
39 | continue # e.g. 'adaptive'
40 | dimension = int(dimension)
41 | if dimension_max < dimension:
42 | dimension_max = dimension
43 | url_playlist = stream_info['url']
44 |
45 | return url_playlist, resource['name']
46 |
47 | def _gather(self, *, url_video: str, start_chunk: str = '', **params) -> Path:
48 |
49 | manifest = self._get_manifest(url_video)
50 | url_playlist, title = self._get_playlist_and_title(manifest)
51 |
52 | return self._video_dump(
53 | title=title,
54 | url_playlist=url_playlist,
55 | url_referer=url_video,
56 | start_chunk=start_chunk,
57 | )
58 |
--------------------------------------------------------------------------------
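
`_get_playlist_and_title` skips non-numeric stream dimensions (such as `adaptive`) and keeps the URL of the largest fixed resolution. A standalone sketch with a made-up manifest fragment:

```python
# Stream selection logic mirrored with a made-up resource dict.
resource = {
    'name': 'my_stream',
    'videoStreams': {
        'videos': [
            {'dimension': 'adaptive', 'url': 'https://example.com/adaptive.m3u8'},
            {'dimension': '360p', 'url': 'https://example.com/360.m3u8'},
            {'dimension': '720p', 'url': 'https://example.com/720.m3u8'},
        ],
    },
}

dimension_max = 0
url_playlist = ''

for stream_info in resource['videoStreams']['videos']:
    dimension, *_ = stream_info['dimension'].partition('p')
    if not dimension.isnumeric():
        continue  # e.g. 'adaptive'
    dimension = int(dimension)
    if dimension_max < dimension:
        dimension_max = dimension
        url_playlist = stream_info['url']

assert url_playlist == 'https://example.com/720.m3u8'
```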
/README.md:
--------------------------------------------------------------------------------
1 | # webinardump
2 |
3 |
4 |
5 | [](https://pypi.python.org/pypi/webinardump)
6 | [](https://pypi.python.org/pypi/webinardump)
7 | [](https://coveralls.io/r/idlesign/webinardump)
8 |
9 | ## Description
10 |
11 | *The application downloads a webinar recording and saves it as an .mp4 file.*
12 |
13 |
14 | ## Supported sources
15 |
16 | * Yandex Disk (stream recordings)
17 | * webinar.ru
18 |
19 |
20 | ## Requirements
21 |
22 | What you need to run the application and work with it.
23 |
24 | * Linux (Unix)
25 | * Python 3.11+
26 | * ffmpeg (on Ubuntu: `sudo apt install ffmpeg`)
27 | * uv (to install and upgrade the application)
28 | * Basic familiarity with the browser debug console.
29 |
30 |
31 | ## Installation and upgrade
32 |
33 | Installation is done with the [uv](https://docs.astral.sh/uv/getting-started/installation/) tool:
34 |
35 | ```shell
36 | $ uv tool install webinardump
37 | ```
38 |
39 | After that the application can be started with
40 |
41 | ```shell
42 | $ webinardump
43 | ```
44 |
45 | To upgrade, run
46 |
47 | ```shell
48 | $ uv tool upgrade webinardump
49 | ```
50 |
51 | ## Usage
52 |
53 | Change into the desired directory and run the following command.
54 |
55 | ```shell
56 |
57 | # set the download path - my_webinar_dir/
58 | # set the request timeout - 10 seconds
59 | # set the maximum number of concurrent requests - 20
60 | $ webinardump --target my_webinar_dir/ --timeout 10 --rmax 20
61 | ```
62 | The application downloads the webinar chunks and then assembles them into a single file.
63 |
64 |
65 | ### disk.yandex.ru
66 |
67 | 1. Take the webinar (stream recording) URL. It looks like https://disk.yandex.ru/i/xxx
68 | 2. Start the downloader and feed it the URL from the previous step.
69 |
70 |
71 | ### webinar.ru
72 |
73 | The download process is not fully automated: some URLs have to be
74 | looked up in the browser.
75 |
76 | 1. Take the webinar URL. It looks like https://events.webinar.ru/event/xxx/yyy/zzz
77 | 2. Open it in the browser.
78 | 3. Open the debug console (F12).
79 | 4. Start the playback.
80 | 5. Find the URL containing `record-new/` and note it down.
81 | 6. Find the URL ending with `chunklist.m3u8` and note it down.
82 | 7. Start the downloader and feed it the URLs from the two previous steps.
83 |
84 | ## Development
85 |
86 | Development relies on [makeapp](https://pypi.org/project/makeapp/). Install it:
87 |
88 | ```shell
89 | $ uv tool install makeapp
90 | ```
91 |
92 | After cloning the webinardump repository, run in its directory:
93 |
94 | ```shell
95 | # install the tooling
96 | $ ma tools
97 |
98 | # initialize the virtual environment
99 | $ ma up --tool
100 |
101 | # the dependencies and the webinardump command are now available in the environment
102 | ```
103 |
104 | Check the code style before submitting code for review:
105 |
106 | ```shell
107 | # check the style
108 | $ ma style
109 | ```
110 |
--------------------------------------------------------------------------------
/src/webinardump/dumpers/base.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections.abc import Callable
3 | from concurrent.futures import ThreadPoolExecutor, as_completed
4 | from contextlib import chdir
5 | from pathlib import Path
6 | from random import choice
7 | from threading import Lock
8 | from time import sleep
9 | from typing import ClassVar
10 |
11 | import requests
12 | from requests import Session
13 | from requests.adapters import HTTPAdapter, Retry
14 |
15 | from ..utils import LOGGER, call, get_files_sorted
16 |
17 |
18 | class Dumper:
19 |
20 | title: str = ''
21 |
22 | _user_input_map: ClassVar[dict[str, str]]
23 |
24 | _headers: ClassVar[dict[str, str]] = {
25 | 'Connection': 'keep-alive',
26 | 'Accept': '*/*',
27 | 'User-Agent': (
28 | 'Mozilla/5.0 (X11; Linux x86_64) '
29 | 'AppleWebKit/537.36 (KHTML, like Gecko) '
30 | 'Chrome/79.0.3945.136 YaBrowser/20.2.3.320 (beta) Yowser/2.5 Safari/537.36'
31 | ),
32 | 'Sec-Fetch-Site': 'same-site',
33 | 'Sec-Fetch-Mode': 'cors',
34 | 'Accept-Language': 'ru,en;q=0.9',
35 | 'Accept-Encoding': 'gzip, deflate, sdch, br',
36 | }
37 |
38 | _media_ext: ClassVar[set[str]] = {'.ts', '.m4s'}
39 |
40 | registry: ClassVar[list[type['Dumper']]] = []
41 |
42 | def __init_subclass__(cls):
43 | super().__init_subclass__()
44 | cls.registry.append(cls)
45 |
46 | def __init__(self, *, target_dir: Path, timeout: int = 3, concurrent: int = 10, sleepy: bool = False) -> None:
47 | self._target_dir = target_dir
48 | self._timeout = timeout
49 | self._concurrent = concurrent
50 | self._user_input_map = getattr(self, '_user_input_map', None) or {}
51 | self._session = self._get_session()
52 | self._sleepy = sleepy
53 |
54 | def __str__(self):
55 | return self.title
56 |
57 | def _get_session(self) -> Session:
58 | # todo: on errors the session gets blocked across threads; could try using several sessions
59 | session = requests.Session()
60 | session.headers = self._headers
61 | retries = Retry(total=3, backoff_factor=0.1, status_forcelist=[500])
62 | session.mount('http://', HTTPAdapter(max_retries=retries))
63 | session.mount('https://', HTTPAdapter(max_retries=retries))
64 | return session
65 |
66 | def _get_args(self, *, get_param_hook: Callable[[str, str], str]) -> dict:
67 | input_data = {}
68 |
69 | for param, hint in self._user_input_map.items():
70 | input_data[param] = get_param_hook(param, hint)
71 |
72 | return input_data
73 |
74 | def _chunks_get_list(self, url: str, *, url_prefix: str = '') -> list[str]:
75 | """Get video chunks names from playlist file at URL.
76 |
77 | * Links to other playlists:
78 | #EXTM3U
79 | #EXT-X-VERSION:7
80 | #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",AUTOSELECT=YES,NAME="xxx",URI="a1/index.m3u8"
81 | #EXT-X-STREAM-INF:BANDWIDTH=5205192,RESOLUTION=1920x1080,CODECS="avc1.42C02A,mp4a.40.2",AUDIO="audio"
82 | v1/index.m3u8
83 |
84 | * Links to chunks:
85 | #EXTM3U
86 | #EXT-X-VERSION:6
87 | #EXT-X-MEDIA-SEQUENCE:1
88 | #EXT-X-INDEPENDENT-SEGMENTS
89 | #EXT-X-TARGETDURATION:12
90 | #EXT-X-MAP:URI="init/1.m4s"
91 | #EXTINF:2.000,
92 | media/1.m4s
93 | #EXT-X-DISCONTINUITY
94 |
95 | :param url: File URL.
96 | :param url_prefix: File URL prefix.
97 |
98 | """
99 | LOGGER.info(f'Getting video chunks from playlist {url} ...')
100 |
101 | sub_playlist = self._get_response_simple(url)
102 | playlists = []
103 | chunk_lists = []
104 | media_ext = self._media_ext
105 |
106 | for line in sub_playlist.splitlines():
107 | line = line.strip()
108 |
109 | if 'EXT-X-MAP:URI' in line:
110 | # naive parsing. todo: respect EXT-X-DISCONTINUITY and EXT-X-MEDIA:TYPE=AUDIO,...,AUTOSELECT=YES
111 | line = line.rpartition("=")[2].strip('"')
112 |
113 | if line.endswith('.m3u8'):
114 | playlists.append(line)
115 | continue
116 |
117 | path = Path(line.partition('?')[0])
118 | if path.suffix not in media_ext:
119 | continue
120 |
121 | if url_prefix:
122 | line = f'{url_prefix}/{line}'
123 |
124 | chunk_lists.append(line)
125 |
126 | if playlists:
127 | LOGGER.info('Sub playlists found. Will use the first one ...')
128 |
129 | sub_playlist = playlists[0]
130 | sub_playlist_url_prefix = sub_playlist.rpartition('/')[0]
131 | sub_playlist_url = f'{url.rpartition("/")[0]}/{sub_playlist}'
132 |
133 | chunk_lists = self._chunks_get_list(sub_playlist_url, url_prefix=sub_playlist_url_prefix)
134 |
135 | else:
136 | assert chunk_lists, 'No video chunks found in playlist file'
137 |
138 | return chunk_lists
139 |
140 | def _chunks_download(
141 | self,
142 | *,
143 | url_video_root: str,
144 | dump_dir: Path,
145 | chunk_names: list[str],
146 | start_chunk: str,
147 | headers: dict[str, str] | None = None,
148 | concurrent: int = 10,
149 | ) -> None:
150 |
151 | chunks_total = len(chunk_names)
152 |
153 | progress_file = (dump_dir / 'files.txt')
154 | progress_file.touch()
155 |
156 | files_done = dict.fromkeys(progress_file.read_text().splitlines())
157 | lock = Lock()
158 |
159 | def dump(*, name: str, file_idx: int, url: str, session: Session, sleepy: bool, timeout: int) -> None:
160 |
161 | name = name.partition('?')[0] # drop GET-args
162 |
163 | if name in files_done:
164 | LOGGER.info(f'File {name} has already been downloaded before. Skipping.')
165 | return
166 |
167 | filename = name.rpartition('/')[2] # drop url prefix
168 | filename = f'{file_idx}_{filename}'
169 |
170 | with session.get(url, headers=headers or {}, stream=True, timeout=timeout) as r:
171 | r.raise_for_status()
172 | with (dump_dir / filename).open('wb') as f:
173 | f.writelines(r.iter_content(chunk_size=8192))
174 |
175 | files_done[name] = True
176 | with lock:
177 | progress_file.write_text('\n'.join(files_done))
178 |
179 | if sleepy:
180 | sleep(choice([1, 0.5, 0.7, 0.6]))
181 |
182 | with ThreadPoolExecutor(max_workers=concurrent) as executor:
183 |
184 | future_url_map = {}
185 |
186 | for idx, chunk_name in enumerate(chunk_names, 1):
187 |
188 | if chunk_name == start_chunk:
189 | start_chunk = '' # clear to allow further download
190 |
191 | if start_chunk:
192 | continue
193 |
194 | chunk_url = f'{url_video_root.rstrip("/")}/{chunk_name}'
195 | submitted = executor.submit(
196 | dump,
197 | name=chunk_name,
198 | file_idx=idx,
199 | url=chunk_url,
200 | session=self._session,
201 | sleepy=self._sleepy,
202 | timeout=self._timeout,
203 | )
204 |
205 | future_url_map[submitted] = (chunk_name, chunk_url)
206 |
207 | if future_url_map:
208 | LOGGER.info(f'Downloading up to {concurrent} files concurrently ...')
209 |
210 | counter = 1
211 | for future in as_completed(future_url_map):
212 | chunk_name, chunk_url = future_url_map[future]
213 | future.result()
214 | percent = round(counter * 100 / chunks_total, 1)
215 | LOGGER.info(f'Got {counter}/{chunks_total} ({chunk_name.partition("?")[0]}) [{percent}%] ...')
216 | counter += 1
217 |
218 | def _video_concat(self, path: Path) -> Path:
219 |
220 | LOGGER.info('Concatenating video ...')
221 |
222 | fname_video = 'all_chunks.mp4'
223 | fname_index = 'all_chunks.txt'
224 |
225 | mode_m4s = False
226 |
227 | filenames = get_files_sorted(path, suffixes=self._media_ext)
228 |
229 | for filename in filenames:
230 | if filename.endswith('m4s'):
231 | mode_m4s = True
232 | break
233 |
234 | def create_index(line_tpl: str = '%s'):
235 | with (path / fname_index).open('w') as f:
236 | f.writelines([f'{line_tpl % fname}\n' for fname in filenames])
237 |
238 | if mode_m4s:
239 | fname_raw = 'all_chunks.m4s'  # intermediate concat output; must differ from fname_video
240 | create_index()
241 | call(f'xargs cat < {fname_index} >> {fname_raw}', path=path)
242 | call(f'ffmpeg -y -i {fname_raw} -c copy {fname_video}', path=path)
243 |
244 | else:
245 | # presumably ts
246 | create_index('file %s')
247 | call(f'ffmpeg -y -f concat -i {fname_index} -c copy -bsf:a aac_adtstoasc {fname_video}', path=path)
248 |
249 | return path / fname_video
250 |
251 | def _get_response_simple(self, url: str, *, json: bool = False) -> str | dict:
252 | """Returns a text or a dictionary from a URL.
253 |
254 | :param url: URL to fetch.
255 | :param json: Whether to parse the response as JSON.
256 |
257 | """
258 | response = self._session.get(url)
259 | response.raise_for_status()
260 |
261 | if json:
262 | return response.json()
263 |
264 | return response.text
265 |
266 | def _video_dump(
267 | self,
268 | *,
269 | title: str,
270 | url_playlist: str,
271 | url_referer: str,
272 | start_chunk: str = '',
273 | ) -> Path:
274 | assert url_playlist.endswith('m3u8'), f'No playlist in `{url_playlist}`'
275 |
276 | LOGGER.info(f'Title: {title}')
277 |
278 | chunk_names = self._chunks_get_list(url_playlist)
279 |
280 | target_dir = self._target_dir
281 | LOGGER.info(f'Downloading video into {target_dir} ...')
282 |
283 | with chdir(target_dir):
284 | dump_dir = (target_dir / title).absolute()
285 | dump_dir.mkdir(parents=True, exist_ok=True)
286 |
287 | url_root = url_playlist.rpartition('/')[0] # strip playlist filename
288 |
289 | self._chunks_download(
290 | url_video_root=url_root,
291 | dump_dir=dump_dir,
292 | chunk_names=chunk_names,
293 | start_chunk=start_chunk,
294 | headers={'Referer': url_referer.strip()},
295 | concurrent=self._concurrent,
296 | )
297 |
298 | fpath_video_target = Path(f'{title}.mp4').absolute()
299 | fpath_video = self._video_concat(dump_dir)
300 |
301 | shutil.move(fpath_video, fpath_video_target)
302 | shutil.rmtree(dump_dir, ignore_errors=True)
303 |
304 | LOGGER.info(f'Video is ready: {fpath_video_target}')
305 | return fpath_video_target
306 |
307 | def _gather(self, *, url_video: str, start_chunk: str = '', **params) -> Path:
308 | raise NotImplementedError
309 |
310 | def run(self, params_or_hook: Callable[[str, str], str] | dict[str, str]) -> Path:
311 | params = params_or_hook if isinstance(params_or_hook, dict) else self._get_args(get_param_hook=params_or_hook)
312 | return self._gather(**params)
313 |
--------------------------------------------------------------------------------
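
Adding a new source comes down to subclassing `Dumper`: define `title`, `_user_input_map` and `_gather`, and the registry makes the CLI offer it automatically. A hypothetical sketch (class name, prompts and field values are illustrative only):

```python
# Hypothetical example of plugging a new dumper into the framework.
from pathlib import Path
from typing import ClassVar

from webinardump.dumpers import Dumper


class MySite(Dumper):

    title = 'mysite.example'

    _user_input_map: ClassVar[dict[str, str]] = {
        'url_video': 'Video page URL',
        'url_playlist': 'Playlist URL (ends with .m3u8)',
    }

    def _gather(self, *, url_video: str, start_chunk: str = '', url_playlist: str = '', **params) -> Path:
        # a real implementation would derive the title from the video page
        return self._video_dump(
            title='mysite_video',
            url_playlist=url_playlist,
            url_referer=url_video,
            start_chunk=start_chunk,
        )


assert MySite in Dumper.registry  # auto-registered alongside the built-ins
```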