├── .gitignore ├── src └── webinardump │ ├── __init__.py │ ├── dumpers │ ├── __init__.py │ ├── webinarru.py │ ├── yadisk.py │ └── base.py │ ├── utils.py │ └── cli.py ├── tests ├── datafixtures │ ├── manifest_webinarru.json │ ├── vid.m3u │ ├── empty.ts │ └── manifest_yadisk.html ├── conftest.py ├── test_utils.py └── test_basic.py ├── CHANGELOG.md ├── AUTHORS ├── tools └── debug.py ├── .github └── workflows │ └── python-package.yml ├── ruff.toml ├── pyproject.toml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | dump/ 2 | -------------------------------------------------------------------------------- /src/webinardump/__init__.py: -------------------------------------------------------------------------------- 1 | VERSION = '0.1.1' -------------------------------------------------------------------------------- /tests/datafixtures/manifest_webinarru.json: -------------------------------------------------------------------------------- 1 | {"name": "yatst"} -------------------------------------------------------------------------------- /tests/datafixtures/vid.m3u: -------------------------------------------------------------------------------- 1 | 1.ts?some=other1 2 | 2.ts?some=other2 3 | -------------------------------------------------------------------------------- /tests/datafixtures/empty.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idlesign/webinardump/HEAD/tests/datafixtures/empty.ts -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # webinardump changelog 2 | 3 | ### v0.1.1 [2025-11-15] 4 | * ** Minor fixes. 5 | 6 | ## v0.1.0 [2020-07-30] 7 | * ++ Basic functionality. -------------------------------------------------------------------------------- /tests/datafixtures/manifest_yadisk.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | webinardump Authors 2 | =================== 3 | 4 | Created by Igor `idle sign` Starikov. 5 | 6 | 7 | Contributors 8 | ------------ 9 | 10 | KarenKing 11 | -------------------------------------------------------------------------------- /src/webinardump/dumpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Dumper 2 | from .webinarru import WebinarRu 3 | from .yadisk import YandexDisk 4 | 5 | __all__ = [ 6 | 'Dumper', 7 | 'WebinarRu', 8 | 'YandexDisk', 9 | ] 10 | -------------------------------------------------------------------------------- /tools/debug.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from webinardump.dumpers import YandexDisk 5 | 6 | logging.basicConfig(level=logging.INFO, format='%(levelname)-8s: %(message)s') 7 | 8 | 9 | dumper = YandexDisk(target_dir=Path('../tools/dumped/')) 10 | 11 | dumper.run({ 12 | 'url_video': 'https://disk.yandex.ru/i/xxx', 13 | }) 14 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def mock_call(monkeypatch): 8 | calls = [] 9 | 10 | def mock_call(cmd, **kwargs): 11 | if 'ffmpeg' in cmd: 12 | Path('yatst/all_chunks.mp4').write_bytes(b'') 13 | 14 | calls.append(cmd) 15 | 16 | monkeypatch.setattr("webinardump.utils.check_call", mock_call) 17 | 18 | return calls 19 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from webinardump.utils import get_files_sorted 2 | 3 | 4 | def test_get_files_sorted(tmp_path): 5 | 6 | (tmp_path / '1.a').touch() 7 | (tmp_path / '01.a').touch() 8 | (tmp_path / '02.a').touch() 9 | (tmp_path / '9.a').touch() 10 | (tmp_path / '1.b').touch() 11 | (tmp_path / '10.a').touch() 12 | (tmp_path / '11.a').touch() 13 | 14 | fnames = get_files_sorted(tmp_path, suffixes={'.a'}) 15 | assert fnames == ['01.a', '1.a', '02.a', '9.a', '10.a', '11.a'] -------------------------------------------------------------------------------- /src/webinardump/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from pathlib import Path 4 | from subprocess import check_call 5 | 6 | LOGGER = logging.getLogger('webinardump') 7 | RE_DIGITS = re.compile(r'(\d+)') 8 | 9 | 10 | 11 | def call(cmd: str, *, path: Path): 12 | return check_call(cmd, cwd=path, shell=True) 13 | 14 | 15 | def get_files_sorted(path: Path, *, suffixes: set[str]) -> list[str]: 16 | def natural(text): 17 | return [int(char) if char.isdigit() else char for char in RE_DIGITS.split(text)] 18 | 19 | files = [file.name for file in path.iterdir() if file.is_file() and file.suffix in suffixes] 20 | files.sort(key=natural) 21 | 22 | return files 23 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | python-version: [3.11, 3.12, 3.13] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Setup uv 26 | uses: astral-sh/setup-uv@v6 27 | - name: Install deps 28 | run: | 29 | uv sync --only-group tests 30 | uv pip install coveralls 31 | - uses: astral-sh/ruff-action@v3 32 | with: 33 | args: check 34 | - name: Run tests 35 | env: 36 | GITHUB_TOKEN: ${{ secrets.github_token }} 37 | run: | 38 | uv run coverage run -m pytest 39 | uv run coveralls --service=github 40 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | target-version = "py311" 2 | line-length = 120 3 | exclude = [ 4 | ] 5 | 6 | [format] 7 | quote-style = "single" 8 | exclude = [] 9 | 10 | [lint] 11 | select = [ 12 | "B", # possible bugs 13 | "BLE", # broad exception 14 | "C4", # comprehensions 15 | "DTZ", # work with datetimes 16 | "E", # code style 17 | "ERA", # commented code 18 | "EXE", # check executables 19 | "F", # misc 20 | "FA", # future annotations 21 | "FBT", # booleans 22 | "FURB", # modernizing 23 | "G", # logging format 24 | "I", # imports 25 | "ICN", # import conventions 26 | "INT", # i18n 27 | "ISC", # stringc concat 28 | "PERF", # perfomance 29 | "PIE", # misc 30 | "PLC", # misc 31 | "PLE", # misc err 32 | "PT", # pytest 33 | "PTH", # pathlib 34 | "PYI", # typing 35 | "RSE", # exc raise 36 | "RUF", # misc 37 | "SLOT", # slots related 38 | "TC", # typing 39 | "UP", # py upgrade 40 | ] 41 | 42 | ignore = [] 43 | 44 | 45 | [lint.extend-per-file-ignores] 46 | "tests/*" = [] 47 | "src/sponsrdump/utils.py" = [ 48 | "G004", 49 | ] 50 | "src/sponsrdump/cli.py" = [ 51 | "RUF001", 52 | ] 53 | -------------------------------------------------------------------------------- /src/webinardump/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from .dumpers import Dumper 6 | 7 | 8 | def get_user_input(param: str, hint: str, *, choices: list[str] | None = None) -> str: 9 | 10 | choices = set(choices or []) 11 | 12 | while True: 13 | data = input(f'{hint}: ') 14 | data = data.strip() 15 | if not data or (choices and data not in choices): 16 | continue 17 | 18 | return data 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser(prog='webinardump') 23 | parser.add_argument('-t', '--target', type=Path, default=Path(), help='Directory to dump to') 24 | parser.add_argument('--timeout', type=int, default=3, help='Request timeout') 25 | parser.add_argument('--rmax', type=int, default=10, help='Max concurrent requests number') 26 | parser.add_argument('--debug', help='Show debug information', action='store_true') 27 | 28 | args = parser.parse_args() 29 | 30 | logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, format='%(levelname)-8s: %(message)s') 31 | 32 | dumper_choices = [] 33 | print('Available dumpers:') 34 | 35 | for idx, dumper in enumerate(Dumper.registry, 1): 36 | print(f'{idx} — {dumper.title}') 37 | dumper_choices.append(f'{idx}') 38 | 39 | chosen = get_user_input('', 'Select dumper number', choices=dumper_choices) 40 | 41 | dumper = Dumper.registry[int(chosen)-1]( 42 | target_dir=args.target, 43 | timeout=args.timeout, 44 | concurrent=args.rmax, 45 | ) 46 | dumper.run(get_user_input) 47 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "webinardump" 3 | dynamic = ["version"] 4 | description = "Make local backup copies of webinars" 5 | authors = [ 6 | { name = "Igor Starikov", email = "idlesign@yandex.ru" } 7 | ] 8 | readme = "README.md" 9 | license = "BSD-3-Clause" 10 | license-files = ["LICENSE"] 11 | requires-python = ">=3.11" 12 | keywords = ["backup", "webinars"] 13 | dependencies = [ 14 | "requests>=2.31.0", 15 | ] 16 | 17 | [project.urls] 18 | Homepage = "https://github.com/idlesign/webinardump" 19 | 20 | [project.scripts] 21 | webinardump = "webinardump.cli:main" 22 | 23 | [dependency-groups] 24 | dev = [ 25 | {include-group = "linters"}, 26 | {include-group = "tests"}, 27 | ] 28 | linters = [ 29 | # "ruff", 30 | ] 31 | tests = [ 32 | "pytest", 33 | "pytest-responsemock", 34 | "pytest-datafixtures", 35 | ] 36 | 37 | [build-system] 38 | requires = ["hatchling"] 39 | build-backend = "hatchling.build" 40 | 41 | [tool.hatch.version] 42 | path = "src/webinardump/__init__.py" 43 | 44 | [tool.hatch.build.targets.wheel] 45 | packages = ["src/webinardump"] 46 | 47 | [tool.hatch.build.targets.sdist] 48 | packages = ["src/"] 49 | 50 | [tool.pytest.ini_options] 51 | testpaths = [ 52 | "tests", 53 | ] 54 | 55 | [tool.coverage.run] 56 | source = [ 57 | "src/", 58 | ] 59 | omit = [ 60 | "*/cli.py", 61 | ] 62 | 63 | [tool.coverage.report] 64 | fail_under = 99.00 65 | exclude_also = [ 66 | "raise NotImplementedError", 67 | "if TYPE_CHECKING:", 68 | ] 69 | 70 | [tool.tox] 71 | skip_missing_interpreters = true 72 | env_list = [ 73 | "py311", 74 | "py312", 75 | "py313", 76 | ] 77 | 78 | [tool.tox.env_run_base] 79 | dependency_groups = ["tests"] 80 | commands = [ 81 | ["pytest", { replace = "posargs", default = ["tests"], extend = true }], 82 | ] 83 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | from webinardump.dumpers import WebinarRu, YandexDisk 2 | 3 | CALLS = [ 4 | 'ffmpeg -y -f concat -i all_chunks.txt -c copy -bsf:a aac_adtstoasc all_chunks.mp4' 5 | ] 6 | 7 | 8 | def test_yadisk(response_mock, tmp_path, datafix_read, datafix_readbin, mock_call): 9 | data_manifest = datafix_read('manifest_yadisk.html') 10 | data_m3u = datafix_read('vid.m3u') 11 | data_ts = datafix_readbin('empty.ts') 12 | 13 | with response_mock([ 14 | f'GET https://disk.yandex.ru/i/xxx -> 200:{data_manifest}', 15 | f'GET https://here/there.m3u8 -> 200:{data_m3u}', 16 | b'GET https://here/1.ts?some=other1 -> 200:' + data_ts, 17 | b'GET https://here/2.ts?some=other2 -> 200:' + data_ts, 18 | ]): 19 | fpath = YandexDisk(target_dir=tmp_path).run({ 20 | 'url_video': 'https://disk.yandex.ru/i/xxx', 21 | }) 22 | assert fpath 23 | assert mock_call == CALLS 24 | 25 | 26 | def test_webinarru(response_mock, tmp_path, datafix_read, datafix_readbin, mock_call): 27 | data_manifest = datafix_read('manifest_webinarru.json') 28 | data_m3u = datafix_read('vid.m3u') 29 | data_ts = datafix_readbin('empty.ts') 30 | 31 | with response_mock([ 32 | 'GET https://events.webinar.ru/api/eventsessions/aaa/record/isviewable?' 33 | f'recordAccessToken=bbb -> 200:{data_manifest}', 34 | 35 | f'GET https://here/there.m3u8 -> 200:{data_m3u}', 36 | b'GET https://here/1.ts?some=other1 -> 200:' + data_ts, 37 | b'GET https://here/2.ts?some=other2 -> 200:' + data_ts, 38 | ]): 39 | fpath = WebinarRu(target_dir=tmp_path).run({ 40 | 'url_video': ' https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb', 41 | 'url_playlist': 'https://here/there.m3u8', 42 | }) 43 | assert fpath 44 | assert mock_call == CALLS 45 | -------------------------------------------------------------------------------- /src/webinardump/dumpers/webinarru.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import ClassVar 3 | 4 | from ..utils import LOGGER 5 | from .base import Dumper 6 | 7 | 8 | class WebinarRu(Dumper): 9 | 10 | title = 'webinar.ru' 11 | 12 | _user_input_map: ClassVar[dict[str, str]] = { 13 | 'url_video': 'Video URL (with `record-new/`)', 14 | 'url_playlist': 'Video chunk list URL (with `chunklist.m3u8`)', 15 | } 16 | 17 | _headers: ClassVar[dict[str, str]] = { 18 | **Dumper._headers, 19 | 'Origin': 'https://events.webinar.ru', 20 | } 21 | 22 | def _gather(self, *, url_video: str, start_chunk: str = '', url_playlist: str = '', **params) -> Path: 23 | """Runs video dump. 24 | 25 | :param url_video: Video URL. Hint: has record-new/ 26 | :param url_playlist: Video chunk list URL. Hint: ends with chunklist.m3u8 27 | :param start_chunk: Optional chunk name to continue download from. 28 | """ 29 | assert url_playlist, 'Playlist URL must be specified' 30 | 31 | assert 'record-new/' in url_video, ( 32 | 'Unexpected video URL format\n' 33 | f'Given: {url_video}.\n' 34 | f'Expected: https://events.webinar.ru/xxx/yyy/record-new/aaa/bbb') 35 | 36 | _, _, tail = url_video.partition('record-new/') 37 | session_id, _, video_id = tail.partition('/') 38 | 39 | LOGGER.info('Getting manifest ...') 40 | 41 | manifest = self._get_response_simple( 42 | f'https://events.webinar.ru/api/eventsessions/{session_id}/record/isviewable?recordAccessToken={video_id}', 43 | json=True 44 | ) 45 | 46 | return self._video_dump( 47 | title=manifest['name'], 48 | url_playlist=url_playlist, 49 | url_referer=url_video, 50 | start_chunk=start_chunk, 51 | ) 52 | -------------------------------------------------------------------------------- /src/webinardump/dumpers/yadisk.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from pathlib import Path 4 | from typing import ClassVar 5 | 6 | from ..utils import LOGGER 7 | from .base import Dumper 8 | 9 | 10 | class YandexDisk(Dumper): 11 | 12 | title = 'Яндекс.Диск' 13 | 14 | _user_input_map: ClassVar[dict[str, str]] = { 15 | 'url_video': 'Video URL (https://disk.yandex.ru/i/xxx)', 16 | } 17 | 18 | def _get_manifest(self, url: str) -> dict: 19 | LOGGER.debug(f'Getting manifest from {url} ...') 20 | 21 | contents = self._get_response_simple(url) 22 | manifest = re.findall(r'id="store-prefetch">([^<]+) tuple[str, str]: 29 | 30 | resources = list(manifest['resources'].values()) 31 | resource = resources[0] 32 | 33 | dimension_max = 0 34 | url_playlist = '' 35 | 36 | for stream_info in resource['videoStreams']['videos']: 37 | dimension, *_ = stream_info['dimension'].partition('p') 38 | if not dimension.isnumeric(): 39 | continue # e.g. 'adaptive' 40 | dimension = int(dimension) 41 | if dimension_max < dimension: 42 | dimension_max = dimension 43 | url_playlist = stream_info['url'] 44 | 45 | return url_playlist, resource['name'] 46 | 47 | def _gather(self, *, url_video: str, start_chunk: str = '', **params) -> Path: 48 | 49 | manifest = self._get_manifest(url_video) 50 | url_playlist, title = self._get_playlist_and_title(manifest) 51 | 52 | return self._video_dump( 53 | title=title, 54 | url_playlist=url_playlist, 55 | url_referer=url_video, 56 | start_chunk=start_chunk, 57 | ) 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # webinardump 2 | 3 | 4 | 5 | [![PyPI - Version](https://img.shields.io/pypi/v/webinardump)](https://pypi.python.org/pypi/webinardump) 6 | [![License](https://img.shields.io/pypi/l/webinardump)](https://pypi.python.org/pypi/webinardump) 7 | [![Coverage](https://img.shields.io/coverallsCoverage/github/idlesign/webinardump)](https://coveralls.io/r/idlesign/webinardump) 8 | 9 | ## Описание 10 | 11 | *Приложение позволяет скачать запись вебинара и сохранить в виде .mp4 файла.* 12 | 13 | 14 | ## Откуда качает 15 | 16 | * Яндекс.Диск (записи стримов) 17 | * webinar.ru 18 | 19 | 20 | ## Зависимости 21 | 22 | Что нужно иметь для запуска приложения и работы с ним. 23 | 24 | * Linux (Unix) 25 | * Python 3.11+ 26 | * ffmpeg (для Ubuntu: `sudo apt install ffmpeg`) 27 | * uv (для установки и обновления приложения) 28 | * Базовые знания о работе в браузере с отладочной консолью. 29 | 30 | 31 | ## Установка и обновление 32 | 33 | Производится при помощи приложения [uv](https://docs.astral.sh/uv/getting-started/installation/): 34 | 35 | ```shell 36 | $ uv tool install webinardump 37 | ``` 38 | 39 | После этого запускать приложение можно командой 40 | 41 | ```shell 42 | $ webinardump 43 | ``` 44 | 45 | Для обновления выполните 46 | 47 | ```shell 48 | $ uv tool upgrade webinardump 49 | ``` 50 | 51 | ## Как использовать 52 | 53 | Переместитесь в желаемый каталог и выполните следующую команду. 54 | 55 | ```shell 56 | 57 | ; Указываем путь для скачивания - my_webinar_dir/ 58 | ; Указываем таймаут запросов - 10 секунд 59 | ; Указываем максимальное количество одновременных запросов - 20 60 | $ webinardump --target my_webinar_dir/ --timeout 10 --rmax 20 61 | ``` 62 | Приложение скачает фрагменты вебинара, а потом соберёт из них единый файл. 63 | 64 | 65 | ### disk.yandex.ru 66 | 67 | 1. Взять ссылку на вебинар (запись стрима). Вида https://disk.yandex.ru/i/xxx 68 | 2. Запустить скачиватель и скормить ему ссылку из предыдущего пункта. 69 | 70 | 71 | ### webinar.ru 72 | 73 | Процесс скачивания автоматизирован не полностью, потребуется искать 74 | некоторые ссылки при помощи браузера. 75 | 76 | 1. Взять ссылку на вебинар. Вида https://events.webinar.ru/event/xxx/yyy/zzz 77 | 2. Открыть в браузере. 78 | 3. Включить отладочную консоль (F12). 79 | 4. Запустить воспроизведение. 80 | 5. Отыскать ссылку с `record-new/` и запомнить её. 81 | 6. Отыскать ссылку, оканчивающуюся на `chunklist.m3u8` и запомнить её. 82 | 7. Запустить скачиватель и скормить ему ссылки и двух предыдущих пунктов. 83 | 84 | ## Для разработки 85 | 86 | При разработке используется [makeapp](https://pypi.org/project/makeapp/). Ставим: 87 | 88 | ```shell 89 | $ uv tool install makeapp 90 | ``` 91 | 92 | После клонирования репозитория sponsrdump, в его директории выполняем: 93 | 94 | ```shell 95 | # ставим утилиты 96 | $ ma tools 97 | 98 | # инициализируем виртуальное окружение 99 | $ ma up --tool 100 | 101 | # теперь в окружении доступны зависимости и команда sponsrdump 102 | ``` 103 | 104 | Проверь стиль перед отправкой кода на обзор: 105 | 106 | ```shell 107 | # проверяем стиль 108 | $ ma style 109 | ``` 110 | -------------------------------------------------------------------------------- /src/webinardump/dumpers/base.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from collections.abc import Callable 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from contextlib import chdir 5 | from pathlib import Path 6 | from random import choice 7 | from threading import Lock 8 | from time import sleep 9 | from typing import ClassVar 10 | 11 | import requests 12 | from requests import Session 13 | from requests.adapters import HTTPAdapter, Retry 14 | 15 | from ..utils import LOGGER, call, get_files_sorted 16 | 17 | 18 | class Dumper: 19 | 20 | title: str = '' 21 | 22 | _user_input_map: ClassVar[dict[str, str]] 23 | 24 | _headers: ClassVar[dict[str, str]] = { 25 | 'Connection': 'keep-alive', 26 | 'Accept': '*/*', 27 | 'User-Agent': ( 28 | 'Mozilla/5.0 (X11; Linux x86_64) ' 29 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 30 | 'Chrome/79.0.3945.136 YaBrowser/20.2.3.320 (beta) Yowser/2.5 Safari/537.36' 31 | ), 32 | 'Sec-Fetch-Site': 'same-site', 33 | 'Sec-Fetch-Mode': 'cors', 34 | 'Accept-Language': 'ru,en;q=0.9', 35 | 'Accept-Encoding': 'gzip, deflate, sdch, br', 36 | } 37 | 38 | _media_ext: ClassVar[set[str]] = {'.ts', '.m4s'} 39 | 40 | registry: ClassVar[list[type['Dumper']]] = [] 41 | 42 | def __init_subclass__(cls): 43 | super().__init_subclass__() 44 | cls.registry.append(cls) 45 | 46 | def __init__(self, *, target_dir: Path, timeout: int = 3, concurrent: int = 10, sleepy: bool = False) -> None: 47 | self._target_dir = target_dir 48 | self._timeout = timeout 49 | self._concurrent = concurrent 50 | self._user_input_map = self._user_input_map or {} 51 | self._session = self._get_session() 52 | self._sleepy = sleepy 53 | 54 | def __str__(self): 55 | return self.title 56 | 57 | def _get_session(self) -> Session: 58 | # todo при ошибках сессия в нитях блокируется. можно попробовать несколько сессий 59 | session = requests.Session() 60 | session.headers = self._headers 61 | retries = Retry(total=3, backoff_factor=0.1, status_forcelist=[500]) 62 | session.mount('http://', HTTPAdapter(max_retries=retries)) 63 | session.mount('https://', HTTPAdapter(max_retries=retries)) 64 | return session 65 | 66 | def _get_args(self, *, get_param_hook: Callable[[str, str], str]) -> dict: 67 | input_data = {} 68 | 69 | for param, hint in self._user_input_map.items(): 70 | input_data[param] = get_param_hook(param, hint) 71 | 72 | return input_data 73 | 74 | def _chunks_get_list(self, url: str, *, url_prefix: str = '') -> list[str]: 75 | """Get video chunks names from playlist file at URL. 76 | 77 | * Links to other playlists: 78 | #EXTM3U 79 | #EXT-X-VERSION:7 80 | #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio",AUTOSELECT=YES,NAME="xxx",URI="a1/index.m3u8" 81 | #EXT-X-STREAM-INF:BANDWIDTH=5205192,RESOLUTION=1920x1080,CODECS="avc1.42C02A,mp4a.40.2",AUDIO="audio" 82 | v1/index.m3u8 83 | 84 | * Links to chunks: 85 | #EXTM3U 86 | #EXT-X-VERSION:6 87 | #EXT-X-MEDIA-SEQUENCE:1 88 | #EXT-X-INDEPENDENT-SEGMENTS 89 | #EXT-X-TARGETDURATION:12 90 | #EXT-X-MAP:URI="init/1.m4s" 91 | #EXTINF:2.000, 92 | media/1.m4s 93 | #EXT-X-DISCONTINUITY 94 | 95 | :param url: File URL. 96 | :param url_prefix: File URL prefix. 97 | 98 | """ 99 | LOGGER.info(f'Getting video chunks from playlist {url} ...') 100 | 101 | sub_playlist = self._get_response_simple(url) 102 | playlists = [] 103 | chunk_lists = [] 104 | media_ext = self._media_ext 105 | 106 | for line in sub_playlist.splitlines(): 107 | line = line.strip() 108 | 109 | if 'EXT-X-MAP:URI' in line: 110 | # naive parsing. todo: respect EXT-X-DISCONTINUITY and EXT-X-MEDIA:TYPE=AUDIO,...,AUTOSELECT=YES 111 | line = line.rpartition("=")[2].strip('"') 112 | 113 | if line.endswith('.m3u8'): 114 | playlists.append(line) 115 | continue 116 | 117 | path = Path(line.partition('?')[0]) 118 | if path.suffix not in media_ext: 119 | continue 120 | 121 | if url_prefix: 122 | line = f'{url_prefix}/{line}' 123 | 124 | chunk_lists.append(line) 125 | 126 | if playlists: 127 | LOGGER.info('Sub playlists found. Will use the first one ...') 128 | 129 | sub_playlist = playlists[0] 130 | sub_playlist_url_prefix = sub_playlist.rpartition('/')[0] 131 | sub_playlist_url = f'{url.rpartition("/")[0]}/{sub_playlist}' 132 | 133 | chunk_lists = self._chunks_get_list(sub_playlist_url, url_prefix=sub_playlist_url_prefix) 134 | 135 | else: 136 | assert chunk_lists, 'No video chunks found in playlist file' 137 | 138 | return chunk_lists 139 | 140 | def _chunks_download( 141 | self, 142 | *, 143 | url_video_root: str, 144 | dump_dir: Path, 145 | chunk_names: list[str], 146 | start_chunk: str, 147 | headers: dict[str, str] | None = None, 148 | concurrent: int = 10, 149 | ) -> None: 150 | 151 | chunks_total = len(chunk_names) 152 | 153 | progress_file = (dump_dir / 'files.txt') 154 | progress_file.touch() 155 | 156 | files_done = dict.fromkeys(progress_file.read_text().splitlines()) 157 | lock = Lock() 158 | 159 | def dump(*, name: str, file_idx: int, url: str, session: Session, sleepy: bool, timeout: int) -> None: 160 | 161 | name = name.partition('?')[0] # drop GET-args 162 | 163 | if name in files_done: 164 | LOGGER.info(f'File {name} has already been downloaded before. Skipping.') 165 | return 166 | 167 | filename = name.rpartition('/')[2] # drop url prefix 168 | filename = f'{file_idx}_{filename}' 169 | 170 | with session.get(url, headers=headers or {}, stream=True, timeout=timeout) as r: 171 | r.raise_for_status() 172 | with (dump_dir / filename).open('wb') as f: 173 | f.writelines(r.iter_content(chunk_size=8192)) 174 | 175 | files_done[name] = True 176 | with lock: 177 | progress_file.write_text('\n'.join(files_done)) 178 | 179 | if sleepy: 180 | sleep(choice([1, 0.5, 0.7, 0.6])) 181 | 182 | with ThreadPoolExecutor(max_workers=concurrent) as executor: 183 | 184 | future_url_map = {} 185 | 186 | for idx, chunk_name in enumerate(chunk_names, 1): 187 | 188 | if chunk_name == start_chunk: 189 | start_chunk = '' # clear to allow further download 190 | 191 | if start_chunk: 192 | continue 193 | 194 | chunk_url = f'{url_video_root.rstrip("/")}/{chunk_name}' 195 | submitted = executor.submit( 196 | dump, 197 | name=chunk_name, 198 | file_idx=idx, 199 | url=chunk_url, 200 | session=self._session, 201 | sleepy=self._sleepy, 202 | timeout=self._timeout, 203 | ) 204 | 205 | future_url_map[submitted] = (chunk_name, chunk_url) 206 | 207 | if future_url_map: 208 | LOGGER.info(f'Downloading up to {concurrent} files concurrently ...') 209 | 210 | counter = 1 211 | for future in as_completed(future_url_map): 212 | chunk_name, chunk_url = future_url_map[future] 213 | future.result() 214 | percent = round(counter * 100 / chunks_total, 1) 215 | counter += 1 216 | LOGGER.info(f'Got {counter}/{chunks_total} ({chunk_name.partition("?")[0]}) [{percent}%] ...') 217 | 218 | def _video_concat(self, path: Path) -> Path: 219 | 220 | LOGGER.info('Concatenating video ...') 221 | 222 | fname_video = 'all_chunks.mp4' 223 | fname_index = 'all_chunks.txt' 224 | 225 | mode_m4s = False 226 | 227 | filenames = get_files_sorted(path, suffixes=self._media_ext) 228 | 229 | for filename in filenames: 230 | if filename.endswith('m4s'): 231 | mode_m4s = True 232 | break 233 | 234 | def create_index(line_tpl: str = '%s'): 235 | with (path / fname_index).open('w') as f: 236 | f.writelines([f'{line_tpl % fname}\n' for fname in filenames]) 237 | 238 | if mode_m4s: 239 | fname_raw = 'all_chunks.mp4' 240 | create_index() 241 | call(f'xargs cat < {fname_index} >> {fname_raw}', path=path) 242 | call(f'ffmpeg -y -i {fname_raw} -c copy {fname_video}', path=path) 243 | 244 | else: 245 | # presumably ts 246 | create_index('file %s') 247 | call(f'ffmpeg -y -f concat -i {fname_index} -c copy -bsf:a aac_adtstoasc {fname_video}', path=path) 248 | 249 | return path / fname_video 250 | 251 | def _get_response_simple(self, url: str, *, json: bool = False) -> str | dict: 252 | """Returns a text or a dictionary from a URL. 253 | 254 | :param url: 255 | :param json: 256 | 257 | """ 258 | response = self._session.get(url) 259 | response.raise_for_status() 260 | 261 | if json: 262 | return response.json() 263 | 264 | return response.text 265 | 266 | def _video_dump( 267 | self, 268 | *, 269 | title: str, 270 | url_playlist: str, 271 | url_referer: str, 272 | start_chunk: str = '', 273 | ) -> Path: 274 | assert url_playlist.endswith('m3u8'), f'No playlist in `{url_playlist}`' 275 | 276 | LOGGER.info(f'Title: {title}') 277 | 278 | chunk_names = self._chunks_get_list(url_playlist) 279 | 280 | target_dir = self._target_dir 281 | LOGGER.info(f'Downloading video into {target_dir} ...') 282 | 283 | with chdir(target_dir): 284 | dump_dir = (target_dir / title).absolute() 285 | dump_dir.mkdir(parents=True, exist_ok=True) 286 | 287 | url_root = url_playlist.rpartition('/')[0] # strip playlist filename 288 | 289 | self._chunks_download( 290 | url_video_root=url_root, 291 | dump_dir=dump_dir, 292 | chunk_names=chunk_names, 293 | start_chunk=start_chunk, 294 | headers={'Referer': url_referer.strip()}, 295 | concurrent=self._concurrent, 296 | ) 297 | 298 | fpath_video_target = Path(f'{title}.mp4').absolute() 299 | fpath_video = self._video_concat(dump_dir) 300 | 301 | shutil.move(fpath_video, fpath_video_target) 302 | shutil.rmtree(dump_dir, ignore_errors=True) 303 | 304 | LOGGER.info(f'Video is ready: {fpath_video_target}') 305 | return fpath_video_target 306 | 307 | def _gather(self, *, url_video: str, start_chunk: str = '', **params) -> Path: 308 | raise NotImplementedError 309 | 310 | def run(self, params_or_hook: Callable[[str, str], str] | dict[str, str]) -> Path: 311 | params = params_or_hook if isinstance(params_or_hook, dict) else self._get_args(get_param_hook=params_or_hook) 312 | return self._gather(**params) 313 | --------------------------------------------------------------------------------