├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat └── requirements.txt ├── requirements.txt ├── scrapetube ├── __init__.py └── scrapetube.py ├── setup.py ├── tests └── test.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build/ 3 | dist/ 4 | *.egg-info/ 5 | .vscode 6 | docs/_build/ 7 | docs/source/_build 8 | .tox 9 | .env 10 | .DS_store -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-20.04 5 | tools: 6 | python: "3.8" 7 | 8 | 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | 13 | python: 14 | install: 15 | - requirements: docs/requirements.txt 16 | - requirements: requirements.txt 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Cheskel Twersky 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapetube 2 | This module will help you scrape youtube without the official youtube api and without selenium. 3 | 4 | With this module you can: 5 | 6 | 7 | * Get all videos from a Youtube channel. 8 | * Get all videos from a playlist. 9 | * Search youtube. 10 | 11 | # Installation 12 | 13 | ```bash 14 | pip3 install scrapetube 15 | ``` 16 | 17 | # Usage 18 | Here's a few short code examples. 19 | 20 | ## Get all videos for a channel 21 | ```python 22 | import scrapetube 23 | 24 | videos = scrapetube.get_channel("UCCezIgC97PvUuR4_gbFUs5g") 25 | 26 | for video in videos: 27 | print(video['videoId']) 28 | ``` 29 | 30 | ## Get all videos for a playlist 31 | ```python 32 | import scrapetube 33 | 34 | videos = scrapetube.get_playlist("PL-osiE80TeTt2d9bfVyTiXJA-UTHn6WwU") 35 | 36 | for video in videos: 37 | print(video['videoId']) 38 | ``` 39 | 40 | ## Make a search 41 | ```python 42 | import scrapetube 43 | 44 | videos = scrapetube.get_search("python") 45 | 46 | for video in videos: 47 | print(video['videoId']) 48 | ``` 49 | 50 | # Full Documentation 51 | 52 | [https://scrapetube.readthedocs.io/en/latest/](https://scrapetube.readthedocs.io/en/latest/) 53 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath("../")) 5 | from scrapetube import __version__ 6 | 7 | 8 | version = __version__ 9 | project = "Scrapetube" 10 | copyright = "2021, Cheskel Twersky" 11 | author = "Cheskel Twersky" 12 | 13 | 14 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.napoleon"] 15 | 16 | templates_path = ["_templates"] 17 | 18 | 19 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 20 | 21 | 22 | html_theme = "sphinx_rtd_theme" 23 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Welcome to Scrapetube's documentation! 3 | ================================================ 4 | This module will help you scrape youtube without the official youtube api and without selenium. 5 | 6 | With this module you can: 7 | 8 | 9 | * Get all videos from a Youtube channel. 10 | * Get all videos from a playlist. 11 | * Search youtube. 12 | 13 | 14 | Reference 15 | ========= 16 | .. currentmodule:: scrapetube 17 | 18 | .. autofunction:: get_channel 19 | 20 | .. autofunction:: get_search 21 | 22 | .. autofunction:: get_playlist 23 | 24 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | typing_extensions 3 | -------------------------------------------------------------------------------- /scrapetube/__init__.py: -------------------------------------------------------------------------------- 1 | from .scrapetube import get_channel, get_search, get_playlist, get_video 2 | 3 | __version__ = "2.5.1" 4 | -------------------------------------------------------------------------------- /scrapetube/scrapetube.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from typing import Generator 4 | 5 | import requests 6 | from typing_extensions import Literal 7 | 8 | type_property_map = { 9 | "videos": "videoRenderer", 10 | "streams": "videoRenderer", 11 | "shorts": "reelWatchEndpoint" 12 | } 13 | 14 | def get_channel( 15 | channel_id: str = None, 16 | channel_url: str = None, 17 | channel_username: str = None, 18 | limit: int = None, 19 | sleep: float = 1, 20 | proxies: dict = None, 21 | sort_by: Literal["newest", "oldest", "popular"] = "newest", 22 | content_type: Literal["videos", "shorts", "streams"] = "videos", 23 | ) -> Generator[dict, None, None]: 24 | 25 | """Get videos for a channel. 26 | 27 | Parameters: 28 | channel_id (``str``, *optional*): 29 | The channel id from the channel you want to get the videos for. 30 | If you prefer to use the channel url instead, see ``channel_url`` below. 31 | 32 | channel_url (``str``, *optional*): 33 | The url to the channel you want to get the videos for. 34 | Since there is a few type's of channel url's, you can use the one you want 35 | by passing it here instead of using ``channel_id``. 36 | 37 | channel_username (``str``, *optional*): 38 | The username from the channel you want to get the videos for. 39 | Ex. ``LinusTechTips`` (without the @). 40 | If you prefer to use the channel url instead, see ``channel_url`` above. 41 | 42 | limit (``int``, *optional*): 43 | Limit the number of videos you want to get. 44 | 45 | sleep (``int``, *optional*): 46 | Seconds to sleep between API calls to youtube, in order to prevent getting blocked. 47 | Defaults to 1. 48 | 49 | proxies (``dict``, *optional*): 50 | A dictionary with the proxies you want to use. Ex: 51 | ``{'https': 'http://username:password@101.102.103.104:3128'}`` 52 | 53 | sort_by (``str``, *optional*): 54 | In what order to retrieve to videos. Pass one of the following values. 55 | ``"newest"``: Get the new videos first. 56 | ``"oldest"``: Get the old videos first. 57 | ``"popular"``: Get the popular videos first. Defaults to "newest". 58 | 59 | content_type (``str``, *optional*): 60 | In order to get content type. Pass one of the following values. 61 | ``"videos"``: Videos 62 | ``"shorts"``: Shorts 63 | ``"streams"``: Streams 64 | """ 65 | 66 | base_url = "" 67 | if channel_url: 68 | base_url = channel_url 69 | elif channel_id: 70 | base_url = f"https://www.youtube.com/channel/{channel_id}" 71 | elif channel_username: 72 | base_url = f"https://www.youtube.com/@{channel_username}" 73 | 74 | url = "{base_url}/{content_type}?view=0&flow=grid".format( 75 | base_url=base_url, 76 | content_type=content_type, 77 | ) 78 | api_endpoint = "https://www.youtube.com/youtubei/v1/browse" 79 | videos = get_videos(url, api_endpoint, "contents", type_property_map[content_type], limit, sleep, proxies, sort_by) 80 | for video in videos: 81 | yield video 82 | 83 | 84 | def get_playlist( 85 | playlist_id: str, limit: int = None, sleep: int = 1, proxies: dict = None 86 | ) -> Generator[dict, None, None]: 87 | 88 | """Get videos for a playlist. 89 | 90 | Parameters: 91 | playlist_id (``str``): 92 | The playlist id from the playlist you want to get the videos for. 93 | 94 | limit (``int``, *optional*): 95 | Limit the number of videos you want to get. 96 | 97 | sleep (``int``, *optional*): 98 | Seconds to sleep between API calls to youtube, in order to prevent getting blocked. 99 | Defaults to 1. 100 | 101 | proxies (``dict``, *optional*): 102 | A dictionary with the proxies you want to use. Ex: 103 | ``{'https': 'http://username:password@101.102.103.104:3128'}`` 104 | """ 105 | 106 | url = f"https://www.youtube.com/playlist?list={playlist_id}" 107 | api_endpoint = "https://www.youtube.com/youtubei/v1/browse" 108 | videos = get_videos(url, api_endpoint, "playlistVideoListRenderer", "playlistVideoRenderer", limit, sleep, proxies) 109 | for video in videos: 110 | yield video 111 | 112 | 113 | def get_search( 114 | query: str, 115 | limit: int = None, 116 | sleep: int = 1, 117 | sort_by: Literal["relevance", "upload_date", "view_count", "rating"] = "relevance", 118 | results_type: Literal["video", "channel", "playlist", "movie"] = "video", 119 | proxies: dict = None, 120 | ) -> Generator[dict, None, None]: 121 | 122 | """Search youtube and get videos. 123 | 124 | Parameters: 125 | query (``str``): 126 | The term you want to search for. 127 | 128 | limit (``int``, *optional*): 129 | Limit the number of videos you want to get. 130 | 131 | sleep (``int``, *optional*): 132 | Seconds to sleep between API calls to youtube, in order to prevent getting blocked. 133 | Defaults to 1. 134 | 135 | sort_by (``str``, *optional*): 136 | In what order to retrieve to videos. Pass one of the following values. 137 | ``"relevance"``: Get the new videos in order of relevance. 138 | ``"upload_date"``: Get the new videos first. 139 | ``"view_count"``: Get the popular videos first. 140 | ``"rating"``: Get videos with more likes first. 141 | Defaults to "relevance". 142 | 143 | results_type (``str``, *optional*): 144 | What type you want to search for. Pass one of the following values: 145 | ``"video"|"channel"|"playlist"|"movie"``. Defaults to "video". 146 | 147 | proxies (``dict``, *optional*): 148 | A dictionary with the proxies you want to use. Ex: 149 | ``{'https': 'http://username:password@101.102.103.104:3128'}`` 150 | 151 | """ 152 | 153 | sort_by_map = { 154 | "relevance": "A", 155 | "upload_date": "I", 156 | "view_count": "M", 157 | "rating": "E", 158 | } 159 | 160 | results_type_map = { 161 | "video": ["B", "videoRenderer"], 162 | "channel": ["C", "channelRenderer"], 163 | "playlist": ["D", "playlistRenderer"], 164 | "movie": ["E", "videoRenderer"], 165 | } 166 | 167 | param_string = f"CA{sort_by_map[sort_by]}SAhA{results_type_map[results_type][0]}" 168 | url = f"https://www.youtube.com/results?search_query={query}&sp={param_string}" 169 | api_endpoint = "https://www.youtube.com/youtubei/v1/search" 170 | videos = get_videos( 171 | url, api_endpoint, "contents", results_type_map[results_type][1], limit, sleep, proxies 172 | ) 173 | for video in videos: 174 | yield video 175 | 176 | 177 | 178 | def get_video( 179 | id: str, 180 | ) -> dict: 181 | 182 | """Get a single video. 183 | 184 | Parameters: 185 | id (``str``): 186 | The video id from the video you want to get. 187 | """ 188 | 189 | session = get_session() 190 | url = f"https://www.youtube.com/watch?v={id}" 191 | html = get_initial_data(session, url) 192 | client = json.loads( 193 | get_json_from_html(html, "INNERTUBE_CONTEXT", 2, '"}},') + '"}}' 194 | )["client"] 195 | session.headers["X-YouTube-Client-Name"] = "1" 196 | session.headers["X-YouTube-Client-Version"] = client["clientVersion"] 197 | data = json.loads( 198 | get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}" 199 | ) 200 | return next(search_dict(data, "videoPrimaryInfoRenderer")) 201 | 202 | 203 | 204 | def get_videos( 205 | url: str, api_endpoint: str, selector_list: str, selector_item: str, limit: int, sleep: float, proxies: dict = None, sort_by: str = None 206 | ) -> Generator[dict, None, None]: 207 | session = get_session(proxies) 208 | is_first = True 209 | quit_it = False 210 | count = 0 211 | while True: 212 | if is_first: 213 | html = get_initial_data(session, url) 214 | client = json.loads( 215 | get_json_from_html(html, "INNERTUBE_CONTEXT", 2, '"}},') + '"}}' 216 | )["client"] 217 | api_key = get_json_from_html(html, "innertubeApiKey", 3) 218 | session.headers["X-YouTube-Client-Name"] = "1" 219 | session.headers["X-YouTube-Client-Version"] = client["clientVersion"] 220 | data = json.loads( 221 | get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}" 222 | ) 223 | data = next(search_dict(data, selector_list), None) 224 | next_data = get_next_data(data, sort_by) 225 | is_first = False 226 | if sort_by and sort_by != "newest": 227 | continue 228 | else: 229 | data = get_ajax_data(session, api_endpoint, api_key, next_data, client) 230 | next_data = get_next_data(data) 231 | for result in get_videos_items(data, selector_item): 232 | try: 233 | count += 1 234 | yield result 235 | if count == limit: 236 | quit_it = True 237 | break 238 | except GeneratorExit: 239 | quit_it = True 240 | break 241 | 242 | if not next_data or quit_it: 243 | break 244 | 245 | time.sleep(sleep) 246 | 247 | session.close() 248 | 249 | 250 | def get_session(proxies: dict = None) -> requests.Session: 251 | session = requests.Session() 252 | if proxies: 253 | session.proxies.update(proxies) 254 | session.headers[ 255 | "User-Agent" 256 | ] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" 257 | session.headers["Accept-Language"] = "en" 258 | return session 259 | 260 | def get_initial_data(session: requests.Session, url: str) -> str: 261 | session.cookies.set("CONSENT", "YES+cb", domain=".youtube.com") 262 | response = session.get(url, params={"ucbcb":1}) 263 | 264 | html = response.text 265 | return html 266 | 267 | 268 | def get_ajax_data( 269 | session: requests.Session, 270 | api_endpoint: str, 271 | api_key: str, 272 | next_data: dict, 273 | client: dict, 274 | ) -> dict: 275 | data = { 276 | "context": {"clickTracking": next_data["click_params"], "client": client}, 277 | "continuation": next_data["token"], 278 | } 279 | response = session.post(api_endpoint, params={"key": api_key}, json=data) 280 | return response.json() 281 | 282 | 283 | def get_json_from_html(html: str, key: str, num_chars: int = 2, stop: str = '"') -> str: 284 | pos_begin = html.find(key) + len(key) + num_chars 285 | pos_end = html.find(stop, pos_begin) 286 | return html[pos_begin:pos_end] 287 | 288 | 289 | def get_next_data(data: dict, sort_by: str = None) -> dict: 290 | # Youtube, please don't change the order of these 291 | sort_by_map = { 292 | "newest": 0, 293 | "popular": 1, 294 | "oldest": 2, 295 | } 296 | if sort_by and sort_by != "newest": 297 | endpoint = next( 298 | search_dict(data, "feedFilterChipBarRenderer"), None)["contents"][sort_by_map[sort_by]]["chipCloudChipRenderer"]["navigationEndpoint"] 299 | else: 300 | endpoint = next(search_dict(data, "continuationEndpoint"), None) 301 | if not endpoint: 302 | return None 303 | next_data = { 304 | "token": endpoint["continuationCommand"]["token"], 305 | "click_params": {"clickTrackingParams": endpoint["clickTrackingParams"]}, 306 | } 307 | 308 | return next_data 309 | 310 | 311 | def search_dict(partial: dict, search_key: str) -> Generator[dict, None, None]: 312 | stack = [partial] 313 | while stack: 314 | current_item = stack.pop(0) 315 | if isinstance(current_item, dict): 316 | for key, value in current_item.items(): 317 | if key == search_key: 318 | yield value 319 | else: 320 | stack.append(value) 321 | elif isinstance(current_item, list): 322 | for value in current_item: 323 | stack.append(value) 324 | 325 | 326 | def get_videos_items(data: dict, selector: str) -> Generator[dict, None, None]: 327 | return search_dict(data, selector) 328 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from setuptools import setup 4 | 5 | with open("scrapetube/__init__.py", encoding="utf-8") as f: 6 | version = re.findall(r"__version__ = \"(.+)\"", f.read())[0] 7 | 8 | with open("README.md", encoding="utf-8") as f: 9 | readme = f.read() 10 | 11 | with open("requirements.txt", encoding="utf-8") as f: 12 | requirements = [r.strip() for r in f] 13 | 14 | setup( 15 | name="scrapetube", 16 | version=version, 17 | packages=["scrapetube"], 18 | include_package_data=True, 19 | url="https://github.com/dermasmid/scrapetube", 20 | license="MIT", 21 | long_description=readme, 22 | long_description_content_type="text/markdown", 23 | author="Cheskel Twersky", 24 | author_email="twerskycheskel@gmail.com", 25 | description="Scrape youtube without the official youtube api and without selenium.", 26 | keywords="youtube python channel videos search playlist list get", 27 | classifiers=[ 28 | "Programming Language :: Python :: 3", 29 | "License :: OSI Approved :: MIT License", 30 | "Operating System :: OS Independent", 31 | ], 32 | project_urls={"Documentation": "https://scrapetube.readthedocs.io/en/latest/"}, 33 | install_requires=requirements, 34 | python_requires=">=3.6", 35 | ) 36 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.insert( 5 | 0, "/".join(os.path.dirname(os.path.realpath(__file__)).split(os.sep)[:-1]) 6 | ) 7 | 8 | import scrapetube 9 | 10 | 11 | videos = scrapetube.get_channel("UC9-y-6csu5WGm29I7JiwpnA", sort_by="popular") 12 | 13 | for video in videos: 14 | print(video["videoId"]) 15 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36,py38 3 | 4 | [testenv] 5 | 6 | deps = -r{toxinidir}/requirements.txt 7 | 8 | commands = 9 | python tests/test.py 10 | --------------------------------------------------------------------------------