├── .gitignore
├── .readthedocs.yaml
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    ├── make.bat
    └── requirements.txt
├── requirements.txt
├── scrapetube
    ├── __init__.py
    └── scrapetube.py
├── setup.py
├── tests
    └── test.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | build/
 3 | dist/
 4 | *.egg-info/
 5 | .vscode
 6 | docs/_build/
 7 | docs/source/_build
 8 | .tox
 9 | .env
10 | .DS_store


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: ubuntu-20.04
 5 |   tools:
 6 |     python: "3.8"
 7 | 
 8 | 
 9 | sphinx:
10 |    configuration: docs/conf.py
11 | 
12 | 
13 | python:
14 |    install:
15 |     - requirements: docs/requirements.txt
16 |     - requirements: requirements.txt
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Cheskel Twersky
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Scrapetube
 2 | This module will help you scrape youtube without the official youtube api and without selenium.
 3 | 
 4 | With this module you can:
 5 | 
 6 | 
 7 | * Get all videos from a Youtube channel.
 8 | * Get all videos from a playlist.
 9 | * Search youtube.
10 | 
11 | # Installation
12 | 
13 | ```bash
14 | pip3 install scrapetube
15 | ```
16 | 
17 | # Usage
18 | Here's a few short code examples.
19 | 
20 | ## Get all videos for a channel
21 | ```python
22 | import scrapetube
23 | 
24 | videos = scrapetube.get_channel("UCCezIgC97PvUuR4_gbFUs5g")
25 | 
26 | for video in videos:
27 |     print(video['videoId'])
28 | ```
29 | 
30 | ## Get all videos for a playlist
31 | ```python
32 | import scrapetube
33 | 
34 | videos = scrapetube.get_playlist("PL-osiE80TeTt2d9bfVyTiXJA-UTHn6WwU")
35 | 
36 | for video in videos:
37 |     print(video['videoId'])
38 | ```
39 | 
40 | ## Make a search
41 | ```python
42 | import scrapetube
43 | 
44 | videos = scrapetube.get_search("python")
45 | 
46 | for video in videos:
47 |     print(video['videoId'])
48 | ```
49 | 
50 | # Full Documentation
51 | 
52 | [https://scrapetube.readthedocs.io/en/latest/](https://scrapetube.readthedocs.io/en/latest/)
53 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | sys.path.insert(0, os.path.abspath("../"))
 5 | from scrapetube import __version__
 6 | 
 7 | 
 8 | version = __version__
 9 | project = "Scrapetube"
10 | copyright = "2021, Cheskel Twersky"
11 | author = "Cheskel Twersky"
12 | 
13 | 
14 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.napoleon"]
15 | 
16 | templates_path = ["_templates"]
17 | 
18 | 
19 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
20 | 
21 | 
22 | html_theme = "sphinx_rtd_theme"
23 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Welcome to Scrapetube's documentation!
 3 | ================================================
 4 | This module will help you scrape youtube without the official youtube api and without selenium.
 5 | 
 6 | With this module you can:
 7 | 
 8 | 
 9 | * Get all videos from a Youtube channel.
10 | * Get all videos from a playlist.
11 | * Search youtube.
12 | 
13 | 
14 | Reference
15 | =========
16 | .. currentmodule:: scrapetube
17 | 
18 | .. autofunction:: get_channel
19 | 
20 | .. autofunction:: get_search
21 | 
22 | .. autofunction:: get_playlist
23 | 
24 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinx_rtd_theme
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | typing_extensions
3 | 


--------------------------------------------------------------------------------
/scrapetube/__init__.py:
--------------------------------------------------------------------------------
1 | from .scrapetube import get_channel, get_search, get_playlist, get_video
2 | 
3 | __version__ = "2.5.1"
4 | 


--------------------------------------------------------------------------------
/scrapetube/scrapetube.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import time
  3 | from typing import Generator
  4 | 
  5 | import requests
  6 | from typing_extensions import Literal
  7 | 
  8 | type_property_map = {
  9 |     "videos": "videoRenderer",
 10 |     "streams": "videoRenderer",
 11 |     "shorts": "reelWatchEndpoint"
 12 | }
 13 | 
 14 | def get_channel(
 15 |     channel_id: str = None,
 16 |     channel_url: str = None,
 17 |     channel_username: str = None,
 18 |     limit: int = None,
 19 |     sleep: float = 1,
 20 |     proxies: dict = None,
 21 |     sort_by: Literal["newest", "oldest", "popular"] = "newest",
 22 |     content_type: Literal["videos", "shorts", "streams"] = "videos",
 23 | ) -> Generator[dict, None, None]:
 24 | 
 25 |     """Get videos for a channel.
 26 | 
 27 |     Parameters:
 28 |         channel_id (``str``, *optional*):
 29 |             The channel id from the channel you want to get the videos for.
 30 |             If you prefer to use the channel url instead, see ``channel_url`` below.
 31 | 
 32 |         channel_url (``str``, *optional*):
 33 |             The url to the channel you want to get the videos for.
 34 |             Since there is a few type's of channel url's, you can use the one you want
 35 |             by passing it here instead of using ``channel_id``.
 36 | 
 37 |         channel_username (``str``, *optional*):
 38 |             The username from the channel you want to get the videos for.
 39 |             Ex. ``LinusTechTips`` (without the @).
 40 |             If you prefer to use the channel url instead, see ``channel_url`` above.
 41 | 
 42 |         limit (``int``, *optional*):
 43 |             Limit the number of videos you want to get.
 44 | 
 45 |         sleep (``int``, *optional*):
 46 |             Seconds to sleep between API calls to youtube, in order to prevent getting blocked.
 47 |             Defaults to 1.
 48 | 
 49 |         proxies (``dict``, *optional*):
 50 |             A dictionary with the proxies you want to use. Ex:
 51 |             ``{'https': 'http://username:password@101.102.103.104:3128'}``
 52 |         
 53 |         sort_by (``str``, *optional*):
 54 |             In what order to retrieve to videos. Pass one of the following values.
 55 |             ``"newest"``: Get the new videos first.
 56 |             ``"oldest"``: Get the old videos first.
 57 |             ``"popular"``: Get the popular videos first. Defaults to "newest".
 58 | 
 59 |         content_type (``str``, *optional*):
 60 |             In order to get content type. Pass one of the following values.
 61 |             ``"videos"``: Videos
 62 |             ``"shorts"``: Shorts
 63 |             ``"streams"``: Streams
 64 |     """
 65 | 
 66 |     base_url = ""
 67 |     if channel_url:
 68 |         base_url = channel_url
 69 |     elif channel_id:
 70 |         base_url = f"https://www.youtube.com/channel/{channel_id}"
 71 |     elif channel_username:
 72 |         base_url = f"https://www.youtube.com/@{channel_username}"
 73 | 
 74 |     url = "{base_url}/{content_type}?view=0&flow=grid".format(
 75 |         base_url=base_url,
 76 |         content_type=content_type,
 77 |     )
 78 |     api_endpoint = "https://www.youtube.com/youtubei/v1/browse"
 79 |     videos = get_videos(url, api_endpoint, "contents", type_property_map[content_type], limit, sleep, proxies, sort_by)
 80 |     for video in videos:
 81 |         yield video
 82 | 
 83 | 
 84 | def get_playlist(
 85 |     playlist_id: str, limit: int = None, sleep: int = 1, proxies: dict = None
 86 | ) -> Generator[dict, None, None]:
 87 | 
 88 |     """Get videos for a playlist.
 89 | 
 90 |     Parameters:
 91 |         playlist_id (``str``):
 92 |             The playlist id from the playlist you want to get the videos for.
 93 | 
 94 |         limit (``int``, *optional*):
 95 |             Limit the number of videos you want to get.
 96 | 
 97 |         sleep (``int``, *optional*):
 98 |             Seconds to sleep between API calls to youtube, in order to prevent getting blocked.
 99 |             Defaults to 1.
100 |         
101 |         proxies (``dict``, *optional*):
102 |             A dictionary with the proxies you want to use. Ex:
103 |             ``{'https': 'http://username:password@101.102.103.104:3128'}``
104 |     """
105 | 
106 |     url = f"https://www.youtube.com/playlist?list={playlist_id}"
107 |     api_endpoint = "https://www.youtube.com/youtubei/v1/browse"
108 |     videos = get_videos(url, api_endpoint, "playlistVideoListRenderer", "playlistVideoRenderer", limit, sleep, proxies)
109 |     for video in videos:
110 |         yield video
111 | 
112 | 
113 | def get_search(
114 |     query: str,
115 |     limit: int = None,
116 |     sleep: int = 1,
117 |     sort_by: Literal["relevance", "upload_date", "view_count", "rating"] = "relevance",
118 |     results_type: Literal["video", "channel", "playlist", "movie"] = "video",
119 |     proxies: dict = None,
120 | ) -> Generator[dict, None, None]:
121 | 
122 |     """Search youtube and get videos.
123 | 
124 |     Parameters:
125 |         query (``str``):
126 |             The term you want to search for.
127 | 
128 |         limit (``int``, *optional*):
129 |             Limit the number of videos you want to get.
130 | 
131 |         sleep (``int``, *optional*):
132 |             Seconds to sleep between API calls to youtube, in order to prevent getting blocked.
133 |             Defaults to 1.
134 | 
135 |         sort_by (``str``, *optional*):
136 |             In what order to retrieve to videos. Pass one of the following values.
137 |             ``"relevance"``: Get the new videos in order of relevance.
138 |             ``"upload_date"``: Get the new videos first.
139 |             ``"view_count"``: Get the popular videos first.
140 |             ``"rating"``: Get videos with more likes first.
141 |             Defaults to "relevance".
142 | 
143 |         results_type (``str``, *optional*):
144 |             What type you want to search for. Pass one of the following values:
145 |             ``"video"|"channel"|"playlist"|"movie"``. Defaults to "video".
146 |         
147 |         proxies (``dict``, *optional*):
148 |             A dictionary with the proxies you want to use. Ex:
149 |             ``{'https': 'http://username:password@101.102.103.104:3128'}``
150 | 
151 |     """
152 | 
153 |     sort_by_map = {
154 |         "relevance": "A",
155 |         "upload_date": "I",
156 |         "view_count": "M",
157 |         "rating": "E",
158 |     }
159 | 
160 |     results_type_map = {
161 |         "video": ["B", "videoRenderer"],
162 |         "channel": ["C", "channelRenderer"],
163 |         "playlist": ["D", "playlistRenderer"],
164 |         "movie": ["E", "videoRenderer"],
165 |     }
166 | 
167 |     param_string = f"CA{sort_by_map[sort_by]}SAhA{results_type_map[results_type][0]}"
168 |     url = f"https://www.youtube.com/results?search_query={query}&sp={param_string}"
169 |     api_endpoint = "https://www.youtube.com/youtubei/v1/search"
170 |     videos = get_videos(
171 |         url, api_endpoint, "contents", results_type_map[results_type][1], limit, sleep, proxies
172 |     )
173 |     for video in videos:
174 |         yield video
175 | 
176 | 
177 | 
178 | def get_video(
179 |     id: str,
180 | ) -> dict:
181 | 
182 |     """Get a single video.
183 | 
184 |     Parameters:
185 |         id (``str``):
186 |             The video id from the video you want to get.
187 |     """
188 | 
189 |     session = get_session()
190 |     url = f"https://www.youtube.com/watch?v={id}"
191 |     html = get_initial_data(session, url)
192 |     client = json.loads(
193 |         get_json_from_html(html, "INNERTUBE_CONTEXT", 2, '"}},') + '"}}'
194 |     )["client"]
195 |     session.headers["X-YouTube-Client-Name"] = "1"
196 |     session.headers["X-YouTube-Client-Version"] = client["clientVersion"]
197 |     data = json.loads(
198 |         get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}"
199 |     )
200 |     return next(search_dict(data, "videoPrimaryInfoRenderer"))
201 | 
202 | 
203 | 
204 | def get_videos(
205 |     url: str, api_endpoint: str, selector_list: str, selector_item: str, limit: int, sleep: float, proxies: dict = None, sort_by: str = None
206 | ) -> Generator[dict, None, None]:
207 |     session = get_session(proxies)
208 |     is_first = True
209 |     quit_it = False
210 |     count = 0
211 |     while True:
212 |         if is_first:
213 |             html = get_initial_data(session, url)
214 |             client = json.loads(
215 |                 get_json_from_html(html, "INNERTUBE_CONTEXT", 2, '"}},') + '"}}'
216 |             )["client"]
217 |             api_key = get_json_from_html(html, "innertubeApiKey", 3)
218 |             session.headers["X-YouTube-Client-Name"] = "1"
219 |             session.headers["X-YouTube-Client-Version"] = client["clientVersion"]
220 |             data = json.loads(
221 |                 get_json_from_html(html, "var ytInitialData = ", 0, "};") + "}"
222 |             )
223 |             data = next(search_dict(data, selector_list), None)
224 |             next_data = get_next_data(data, sort_by)
225 |             is_first = False
226 |             if sort_by and sort_by != "newest": 
227 |                 continue
228 |         else:
229 |             data = get_ajax_data(session, api_endpoint, api_key, next_data, client)
230 |             next_data = get_next_data(data)
231 |         for result in get_videos_items(data, selector_item):
232 |             try:
233 |                 count += 1
234 |                 yield result
235 |                 if count == limit:
236 |                     quit_it = True
237 |                     break
238 |             except GeneratorExit:
239 |                 quit_it = True
240 |                 break
241 | 
242 |         if not next_data or quit_it:
243 |             break
244 | 
245 |         time.sleep(sleep)
246 | 
247 |     session.close()
248 | 
249 | 
250 | def get_session(proxies: dict = None) -> requests.Session:
251 |     session = requests.Session()
252 |     if proxies:
253 |         session.proxies.update(proxies)
254 |     session.headers[
255 |         "User-Agent"
256 |     ] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
257 |     session.headers["Accept-Language"] = "en"
258 |     return session
259 | 
260 | def get_initial_data(session: requests.Session, url: str) -> str:
261 |     session.cookies.set("CONSENT", "YES+cb", domain=".youtube.com")
262 |     response = session.get(url, params={"ucbcb":1})
263 | 
264 |     html = response.text
265 |     return html
266 | 
267 | 
268 | def get_ajax_data(
269 |     session: requests.Session,
270 |     api_endpoint: str,
271 |     api_key: str,
272 |     next_data: dict,
273 |     client: dict,
274 | ) -> dict:
275 |     data = {
276 |         "context": {"clickTracking": next_data["click_params"], "client": client},
277 |         "continuation": next_data["token"],
278 |     }
279 |     response = session.post(api_endpoint, params={"key": api_key}, json=data)
280 |     return response.json()
281 | 
282 | 
283 | def get_json_from_html(html: str, key: str, num_chars: int = 2, stop: str = '"') -> str:
284 |     pos_begin = html.find(key) + len(key) + num_chars
285 |     pos_end = html.find(stop, pos_begin)
286 |     return html[pos_begin:pos_end]
287 | 
288 | 
289 | def get_next_data(data: dict, sort_by: str = None) -> dict:
290 |     # Youtube, please don't change the order of these
291 |     sort_by_map = {
292 |         "newest": 0, 
293 |         "popular": 1,
294 |         "oldest": 2, 
295 |     }
296 |     if sort_by and sort_by != "newest":
297 |         endpoint = next(
298 |             search_dict(data, "feedFilterChipBarRenderer"), None)["contents"][sort_by_map[sort_by]]["chipCloudChipRenderer"]["navigationEndpoint"]
299 |     else:
300 |         endpoint = next(search_dict(data, "continuationEndpoint"), None)
301 |     if not endpoint:
302 |         return None
303 |     next_data = {
304 |         "token": endpoint["continuationCommand"]["token"],
305 |         "click_params": {"clickTrackingParams": endpoint["clickTrackingParams"]},
306 |     }
307 | 
308 |     return next_data
309 | 
310 | 
311 | def search_dict(partial: dict, search_key: str) -> Generator[dict, None, None]:
312 |     stack = [partial]
313 |     while stack:
314 |         current_item = stack.pop(0)
315 |         if isinstance(current_item, dict):
316 |             for key, value in current_item.items():
317 |                 if key == search_key:
318 |                     yield value
319 |                 else:
320 |                     stack.append(value)
321 |         elif isinstance(current_item, list):
322 |             for value in current_item:
323 |                 stack.append(value)
324 | 
325 | 
326 | def get_videos_items(data: dict, selector: str) -> Generator[dict, None, None]:
327 |     return search_dict(data, selector)
328 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from setuptools import setup
 4 | 
 5 | with open("scrapetube/__init__.py", encoding="utf-8") as f:
 6 |     version = re.findall(r"__version__ = \"(.+)\"", f.read())[0]
 7 | 
 8 | with open("README.md", encoding="utf-8") as f:
 9 |     readme = f.read()
10 | 
11 | with open("requirements.txt", encoding="utf-8") as f:
12 |     requirements = [r.strip() for r in f]
13 | 
14 | setup(
15 |     name="scrapetube",
16 |     version=version,
17 |     packages=["scrapetube"],
18 |     include_package_data=True,
19 |     url="https://github.com/dermasmid/scrapetube",
20 |     license="MIT",
21 |     long_description=readme,
22 |     long_description_content_type="text/markdown",
23 |     author="Cheskel Twersky",
24 |     author_email="twerskycheskel@gmail.com",
25 |     description="Scrape youtube without the official youtube api and without selenium.",
26 |     keywords="youtube python channel videos search playlist list get",
27 |     classifiers=[
28 |         "Programming Language :: Python :: 3",
29 |         "License :: OSI Approved :: MIT License",
30 |         "Operating System :: OS Independent",
31 |     ],
32 |     project_urls={"Documentation": "https://scrapetube.readthedocs.io/en/latest/"},
33 |     install_requires=requirements,
34 |     python_requires=">=3.6",
35 | )
36 | 


--------------------------------------------------------------------------------
/tests/test.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | 
 4 | sys.path.insert(
 5 |     0, "/".join(os.path.dirname(os.path.realpath(__file__)).split(os.sep)[:-1])
 6 | )
 7 | 
 8 | import scrapetube
 9 | 
10 | 
11 | videos = scrapetube.get_channel("UC9-y-6csu5WGm29I7JiwpnA", sort_by="popular")
12 | 
13 | for video in videos:
14 |     print(video["videoId"])
15 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py36,py38
 3 | 
 4 | [testenv]
 5 | 
 6 | deps = -r{toxinidir}/requirements.txt
 7 | 
 8 | commands =
 9 |     python tests/test.py
10 | 


--------------------------------------------------------------------------------