├── .gitattributes ├── .gitignore ├── CITATION.cff ├── README.md ├── examples ├── __init__.py ├── check_tool.py ├── comments_example.py ├── hashtag_example.py ├── ms_token_example.py ├── network_info_example.py ├── user_example.py └── video_example.py ├── pytok ├── __init__.py ├── api │ ├── __init__.py │ ├── base.py │ ├── hashtag.py │ ├── search.py │ ├── sound.py │ ├── trending.py │ ├── user.py │ └── video.py ├── captcha_solver.py ├── exceptions.py ├── helpers.py ├── tiktok.py └── utils.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── captcha_examples.json ├── test_captcha.py ├── test_user.py └── test_utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | bmp.log 3 | geckodriver.log 4 | server.log 5 | browsermob-proxy/* 6 | myScripts/* 7 | test.py 8 | debug.log 9 | res.html 10 | tmp/* 11 | dist/* 12 | *.egg-info 13 | tmp/ 14 | tmp 15 | .pytest_cache/* 16 | test.mp4 17 | test.txt 18 | .pytest_cache/* 19 | tests/__pycache__/* 20 | *.pyc 21 | acrawl.js 22 | test2.py 23 | build 24 | MANIFEST 25 | src 26 | .vscode 27 | .env 28 | tests/data -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Steel" 5 | given-names: "Ben" 6 | orcid: "https://orcid.org/0009-0006-3845-1394" 7 | - family-names: "Abrahams" 8 | given-names: "Alexei" 9 | orcid: "https://orcid.org/0000-0002-6547-072X" 10 | title: "PyTok" 11 | version: 0.1.0 12 | doi: 10.5281/zenodo.12802714 13 | date-released: 2024-07-23 14 | url: "https://github.com/networkdynamics/pytok" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![DOI](https://zenodo.org/badge/555492190.svg)](https://zenodo.org/doi/10.5281/zenodo.12802713) 3 | 4 | # pytok 5 | 6 | This is a Playwright-based version of David Teacher's unofficial api wrapper for TikTok.com in python. It re-implements a currently limited set of the features of the original library, with a shifted focus on using browser automation to allow automatic captcha solves with a hopefully minor trade-off in performance. 7 | 8 | ## Installation 9 | 10 | ```bash 11 | pip install git+https://github.com/networkdynamics/pytok.git@master 12 | ``` 13 | 14 | ## Quick Start Guide 15 | 16 | Here's a quick bit of code to get the videos from a particular user on TikTok. There are more examples in the [examples](https://github.com/networkdynamics/pytok/tree/master/examples) directory. 17 | 18 | ```py 19 | import asyncio 20 | 21 | from pytok.tiktok import PyTok 22 | 23 | async def main(): 24 | async with PyTok() as api: 25 | user = api.user(username="therock") 26 | user_data = await user.info() 27 | print(user_data) 28 | 29 | videos = [] 30 | async for video in user.videos(): 31 | video_data = await video.info() 32 | print(video_data) 33 | 34 | if __name__ == "__main__": 35 | asyncio.run(main()) 36 | ``` 37 | 38 | 39 | Please note pulling data from TikTok takes a while! We recommend leaving the scripts running on a server for a while for them to finish downloading everything. 
Feel free to play around with the delay constants to either speed up the process or avoid TikTok rate limiting, like so: `PyTok(request_delay=10)` 40 | 41 | Please do not hesitate to make an issue in this repo to get our help with this! 42 | 43 | ## Citation 44 | 45 | If you use this library in your research, please cite it using the following BibTeX entry: 46 | 47 | ```bibtex 48 | @software{ben_steel_2024_12802714, 49 | author = {Ben Steel and 50 | Alexei Abrahams}, 51 | title = {{networkdynamics/pytok: Initial working version of 52 | library}}, 53 | month = jul, 54 | year = 2024, 55 | publisher = {Zenodo}, 56 | version = {v0.1.0}, 57 | doi = {10.5281/zenodo.12802714}, 58 | url = {https://doi.org/10.5281/zenodo.12802714} 59 | } 60 | ``` 61 | 62 | ## Format and Schema 63 | 64 | The JSONable dictionary returned by the `info()` methods contains all of the data that the TikTok API returns. We have provided helper functions to parse that data into Pandas DataFrames, `utils.get_comment_df()`, `utils.get_video_df()` and `utils.get_user_df()` for the data from comments, videos, and users respectively. 
65 | 66 | The video dataframe will contain the following columns: 67 | |Field name | Description | 68 | |----------|----------| 69 | |`video_id`| Unique video ID | 70 | |`createtime`| UTC datetime of video creation time in YYYY-MM-DD HH:MM:SS format | 71 | |`author_name`| Unique author name | 72 | |`author_id`| Unique author ID | 73 | |`desc`| The full video description from the author | 74 | |`hashtags`| A list of hashtags used in the video description | 75 | |`share_video_id`| If the video is sharing another video, this is the video ID of that original video, else empty | 76 | |`share_video_user_id`| If the video is sharing another video, this is the user ID of the author of that video, else empty | 77 | |`share_video_user_name`| If the video is sharing another video, this is the user name of the author of that video, else empty | 78 | |`share_type`| If the video is sharing another video, this is the type of the share, stitch, duet etc. | 79 | |`mentions`| A list of users mentioned in the video description, if any | 80 | |`digg_count`| The number of likes on the video | 81 | |`share_count`| The number of times the video was shared | 82 | |`comment_count`| The number of comments on the video | 83 | |`play_count`| The number of times the video was played | 84 | 85 | The comment dataframe will contain the following columns: 86 | |Field name | Description | 87 | |----------|-----------| 88 | |`comment_id`| Unique comment ID | 89 | |`createtime`| UTC datetime of comment creation time in YYYY-MM-DD HH:MM:SS format | 90 | |`author_name`| Unique author name | 91 | |`author_id`| Unique author ID | 92 | |`text`| Text of the comment | 93 | |`mentions`| A list of users that are tagged in the comment | 94 | |`video_id`| The ID of the video the comment is on | 95 | |`comment_language`| The language of the comment, as predicted by the TikTok API | 96 | |`digg_count`| The number of likes the comment got | 97 | |`reply_comment_id`| If the comment is replying to another comment, this 
async def main():
    """Open https://www.browserscan.net/ in a PyTok-managed Chromium page.

    Navigation goes through the API object's private ``_page`` attribute.
    Nothing is returned or saved; the script only loads the page inside the
    ``async with`` scope of the PyTok instance.
    """
    async with PyTok(browser="chromium") as api:
        await api._page.goto("https://www.browserscan.net/")
async def main():
    """Dump the info of up to 1000 videos of ``hashtag_name`` to ``out.json``.

    Captchas are left for the user to solve by hand
    (``manual_captcha_solves=True``).
    """
    async with PyTok(manual_captcha_solves=True) as api:
        tag = api.hashtag(name=hashtag_name)

        # One info() round-trip per video yielded by the hashtag iterator.
        collected = [
            await clip.info()
            async for clip in tag.videos(count=1000)
        ]

        with open("out.json", "w") as out_file:
            json.dump(collected, out_file)
async def main():
    """Fetch the info of every video of each listed user into ``out.json``.

    Raises:
        AssertionError: if a user yields no videos at all.
    """
    users = ['therock']
    async with PyTok(manual_captcha_solves=True, log_captcha_solves=True) as api:
        for username in users:
            user = api.user(username=username)
            # The returned profile data is not used here; the call itself is
            # kept exactly as in the original script.
            await user.info()

            videos = []
            async for video in user.videos():
                videos.append(await video.info())

            assert len(videos) > 0, "No videos found"
            # NOTE(review): every user writes to the same file, so with more
            # than one entry in `users` only the last user's videos survive.
            with open("out.json", "w") as f:
                json.dump(videos, f)
[] 16 | async for related_video in video.related_videos(): 17 | related_videos.append(related_video) 18 | video_bytes = await video.bytes() 19 | 20 | with open("out.json", "w") as out_file: 21 | json.dump(video_data, out_file) 22 | 23 | with open("related.json", "w") as out_file: 24 | json.dump(list(related_videos), out_file) 25 | 26 | with open("out.mp4", "wb") as out_file: 27 | out_file.write(video_bytes) 28 | 29 | if __name__ == "__main__": 30 | asyncio.run(main()) 31 | 32 | -------------------------------------------------------------------------------- /pytok/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. include:: ../README.md 3 | """ 4 | __docformat__ = "restructuredtext" 5 | -------------------------------------------------------------------------------- /pytok/api/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains classes that all represent different types of data sent back by the TikTok servers. 3 | 4 | The files within this module correspond to what type of object is described and all have different methods associated with them. 5 | 6 | 7 | ### How To Interpret TikTok Data 8 | There are quite a few ambiguous keys in the JSON that TikTok returns so here's a section that tries to document some of them. 9 | 10 | **Note**: These are incomplete; if you get confused about something feel free to add it here as a PR once you figure it out. 11 | 12 | | JSON Key | Description | 13 | |------------------|-------------| 14 | | createTime | The [unix epoch](https://docs.python.org/3/library/datetime.html#datetime.date.fromtimestamp) of creation, all other time fields are also unix epochs. | 15 | | secUid & (userId or id) | Two different unique attributes that are used in conjunction to reference a specific account, so if you're storing users somewhere in a database, you should store both secUid & userId. 
TOK_DELAY = 20
CAPTCHA_DELAY = 999999


def get_login_close_element(page):
    """Return a locator for either "continue without logging in" link."""
    return page.get_by_text("Continue as guest", exact=True) \
        .or_(page.get_by_text("Continue without login", exact=True))


def get_captcha_element(page):
    """Return a locator matching any of the known captcha prompt texts."""
    # 'Rotate the shapes' is visible text, not a selector, so it has to go
    # through get_by_text(); page.locator() would parse it as a selector.
    # NOTE(review): matched as a substring because the exact on-page wording
    # is not visible from this file -- tighten with exact=True once confirmed.
    return page.get_by_text('Rotate the shapes') \
        .or_(page.get_by_text('Verify to continue:', exact=True)) \
        .or_(page.get_by_text('Click on the shapes with the same size', exact=True)) \
        .or_(page.get_by_text('Drag the slider to fit the puzzle', exact=True).first)


class Base:
    """Page-interaction helpers shared by the API objects.

    Every method works through ``self.parent``, which must expose ``_page``,
    ``logger``, ``_requests``, ``_responses``, ``_manual_captcha_solves`` and
    ``_log_captcha_solves``.
    """

    async def check_initial_call(self, url):
        """Wait for a request matching ``url`` and reject non-2xx responses.

        Raises:
            exceptions.TimeoutException: if no matching request is seen.
            exceptions.NotAvailableException: if there is no response or its
                status is 300 or above.
        """
        # wait_for_requests() already resolves to the request object itself.
        request = await self.wait_for_requests(url)
        response = await request.response()
        if response is None or response.status >= 300:
            raise exceptions.NotAvailableException("Content is not available")

    async def wait_for_content_or_captcha(self, content_tag):
        """Wait for ``content_tag`` to show, solving a captcha if one appears.

        Returns:
            The locator for the first element matching ``content_tag``.
        """
        page = self.parent._page

        content_element = page.locator(content_tag).first
        captcha_element = get_captcha_element(page)

        try:
            await expect(content_element.or_(captcha_element)).to_be_visible(timeout=TOK_DELAY * 1000)
        # NOTE(review): this only catches the builtin TimeoutError; confirm
        # which exception type expect() raises on timeout.
        except TimeoutError as e:
            raise exceptions.TimeoutException(str(e)) from e

        if await captcha_element.is_visible():
            await self.solve_captcha()
            # Must be awaited, otherwise no pause happens at all.
            await asyncio.sleep(1)
            await expect(content_element).to_be_visible(timeout=TOK_DELAY * 1000)

        return content_element

    def _or_no_content_locators(self, locator, no_content_text):
        """Extend ``locator`` with an alternative per "no content" text."""
        page = self.parent._page
        if no_content_text:
            if isinstance(no_content_text, list):
                for text in no_content_text:
                    locator = locator.or_(page.get_by_text(text, exact=True))
            elif isinstance(no_content_text, str):
                locator = locator.or_(page.get_by_text(no_content_text, exact=True))
        return locator

    async def _raise_if_no_content(self, no_content_text):
        """Raise NoContentException if a "no content" text is visible.

        ``no_content_text`` may be a string, a list of strings, or falsy;
        any other type is ignored.
        """
        if not no_content_text:
            return
        if isinstance(no_content_text, list):
            texts = no_content_text
        elif isinstance(no_content_text, str):
            texts = [no_content_text]
        else:
            return

        page = self.parent._page
        for text in texts:
            if await page.get_by_text(text, exact=True).is_visible():
                raise exceptions.NoContentException(f"Content is not available with message: '{text}'")
            self.parent.logger.debug(f"Could not find text '{text}'")

    async def wait_for_content_or_unavailable(self, content_tag, unavailable_text, no_content_text=None):
        """Wait for ``content_tag``, failing fast on unavailable/empty pages.

        Args:
            content_tag: selector of the expected content.
            unavailable_text: exact text that marks the content unavailable.
            no_content_text: optional string or list of strings, each an exact
                text that marks an empty result.

        Returns:
            The locator for the first element matching ``content_tag``; it is
            returned even if the content never became visible.

        Raises:
            exceptions.NotAvailableException: ``unavailable_text`` is visible,
                or a login prompt cannot be closed.
            exceptions.NoContentException: a ``no_content_text`` is visible.
        """
        page: Page = self.parent._page
        content_element = page.locator(content_tag).first
        unavailable_element = page.get_by_text(unavailable_text, exact=True)

        await self.check_and_resolve_refresh_button()
        await self.check_and_resolve_login_popup()

        self.parent.logger.debug(f"Checking for '{unavailable_text}'")
        if await unavailable_element.is_visible():
            raise exceptions.NotAvailableException(f"Content is not available with message: '{unavailable_text}'")

        await self._raise_if_no_content(no_content_text)

        max_tries = 10
        tries = 0
        self.parent.logger.debug("Waiting for main content to become visible")
        while not (await content_element.is_visible()) and tries < max_tries:
            await asyncio.sleep(0.5)
            await self.check_and_resolve_refresh_button()
            tries += 1

        if tries >= max_tries:
            # Last resort: visit the home page, then come back to the same URL.
            url = page.url
            await page.goto("https://www.tiktok.com")
            await asyncio.sleep(5)
            await page.goto(url)

        return content_element

    async def check_and_resolve_refresh_button(self):
        """Click the 'Refresh' text if it is visible, then pause one second."""
        page: Page = self.parent._page
        refresh_button = page.get_by_text('Refresh')
        self.parent.logger.debug("Checking for refresh button")
        if await refresh_button.is_visible():
            self.parent.logger.debug("Refresh button found, clicking")
            await refresh_button.click()
            await asyncio.sleep(1)

    async def check_and_resolve_login_popup(self):
        """Close the 'Log in to TikTok' prompt if it is showing.

        Raises:
            exceptions.NotAvailableException: the prompt is visible but its
                close button is not.
        """
        page: Page = self.parent._page
        login_popup = page.get_by_text('Log in to TikTok')
        self.parent.logger.debug("Checking for login to TikTok pop up")
        if await login_popup.is_visible():
            self.parent.logger.debug("Login prompt found, checking for close button")
            login_close = page.locator('[data-e2e="modal-close-inner-button"]')
            if await login_close.is_visible():
                await login_close.click()
                await asyncio.sleep(1)
            else:
                raise exceptions.NotAvailableException("Content is not available with message: 'Log in to TikTok'")

    async def wait_for_content_or_unavailable_or_captcha(self, content_tag, unavailable_text, no_content_text=None):
        """Like ``wait_for_content_or_unavailable`` but solves a captcha first.

        Args:
            content_tag: selector of the expected content.
            unavailable_text: exact text that marks the content unavailable.
            no_content_text: optional string or list of strings, each an exact
                text that marks an empty result.

        Returns:
            The locator for the first element matching ``content_tag``; it is
            returned even if the content never became visible.

        Raises:
            exceptions.CaptchaException: the captcha is still visible after
                the solve attempt.
            exceptions.NotAvailableException: ``unavailable_text`` is visible,
                or a login prompt cannot be closed.
            exceptions.NoContentException: a ``no_content_text`` is visible.
        """
        page: Page = self.parent._page
        content_element = page.locator(content_tag).first
        captcha_element = get_captcha_element(page)
        unavailable_element = page.get_by_text(unavailable_text, exact=True)

        await self.check_and_resolve_refresh_button()
        await self.check_and_resolve_login_popup()

        self.parent.logger.debug("Checking for captcha")
        if await captcha_element.is_visible():
            self.parent.logger.debug("Captcha found")
            await self.solve_captcha()
            await asyncio.sleep(1)
            if await captcha_element.is_visible():
                raise exceptions.CaptchaException("Captcha is still visible after solving")
            # After a solve, any of content / unavailable / no-content may show.
            expected_elements = self._or_no_content_locators(
                content_element.or_(unavailable_element), no_content_text)
            await expect(expected_elements).to_be_visible(timeout=TOK_DELAY * 1000)

        # check after resolving captcha
        await self.check_and_resolve_refresh_button()
        await self.check_and_resolve_login_popup()

        self.parent.logger.debug(f"Checking for '{unavailable_text}'")
        if await unavailable_element.is_visible():
            raise exceptions.NotAvailableException(f"Content is not available with message: '{unavailable_text}'")

        await self._raise_if_no_content(no_content_text)

        max_tries = 10
        tries = 0
        self.parent.logger.debug("Waiting for main content to become visible")
        content_is_visible = await content_element.is_visible()
        while not content_is_visible and tries < max_tries:
            await asyncio.sleep(1)
            await self.check_and_resolve_refresh_button()
            tries += 1
            content_is_visible = await content_element.is_visible()

        # NOTE: running out of tries is deliberately not an error here; the
        # locator is handed back and the caller decides what to do with it.
        return content_element

    async def check_for_unavailable_or_captcha(self, unavailable_text):
        """Solve a visible captcha, close a login prompt, check availability.

        Captcha solving is attempted up to three times; failures are printed
        and execution continues regardless.

        Raises:
            exceptions.NotAvailableException: ``unavailable_text`` is visible.
        """
        page = self.parent._page
        captcha_element = get_captcha_element(page)
        unavailable_element = page.get_by_text(unavailable_text, exact=True)

        if await captcha_element.is_visible():
            num_tries = 0
            max_tries = 3
            captcha_exceptions = []
            while num_tries < max_tries:
                num_tries += 1
                try:
                    await self.solve_captcha()
                    await asyncio.sleep(1)
                    if await captcha_element.is_visible():
                        captcha_exceptions.append(
                            exceptions.CaptchaException("Captcha is still visible after solving"))
                        continue
                    break
                except Exception as e:
                    # Best effort: remember the failure and try again.
                    captcha_exceptions.append(e)
            else:
                # Only reached when every attempt failed (no break).
                print(
                    f"Failed to solve captcha after {max_tries} tries with errors: {captcha_exceptions}, continuing anyway...")

        login_close = get_login_close_element(page)
        if await login_close.is_visible():
            try:
                # Re-check right before clicking; the prompt may have gone.
                if await login_close.is_visible():
                    await login_close.click()
            except Exception as e:
                print(f"Failed to close login with error: {e}, continuing anyway...")

        if await unavailable_element.is_visible():
            raise exceptions.NotAvailableException(f"Content is not available with message: '{unavailable_text}'")

    async def check_for_unavailable(self, unavailable_text):
        """Raise NotAvailableException if ``unavailable_text`` is visible."""
        page = self.parent._page
        unavailable_element = page.get_by_text(unavailable_text, exact=True)
        if await unavailable_element.is_visible():
            raise exceptions.NotAvailableException(f"Content is not available with message: '{unavailable_text}'")

    async def check_for_reload_button(self):
        """Click the exact-text 'Refresh' element if it is visible."""
        page = self.parent._page
        reload_button = page.get_by_text('Refresh', exact=True)
        if await reload_button.is_visible():
            await reload_button.click()

    async def wait_for_requests(self, api_path, timeout=TOK_DELAY):
        """Wait for and return the next request matching ``api_path``.

        Args:
            api_path: pattern handed to ``page.expect_request``.
            timeout: seconds to wait.

        Raises:
            exceptions.TimeoutException: on any failure while waiting.
        """
        page = self.parent._page
        try:
            async with page.expect_request(api_path, timeout=timeout * 1000) as first:
                return await first.value
        except Exception as e:
            raise exceptions.TimeoutException(str(e)) from e

    def get_requests(self, api_path):
        """searches a list of all requests thus far issued by the Playwright browser instance"""
        return [request for request in self.parent._requests if api_path in request.url]

    def get_responses(self, api_path):
        """Return the recorded responses whose URL contains ``api_path``."""
        return [response for response in self.parent._responses if api_path in response.url]

    async def get_response_body(self, response):
        """Return the body of ``response``."""
        return await response.body()

    async def scroll_to_bottom(self, speed=4):
        """Scroll down in small random steps until past the document height."""
        page = self.parent._page
        current_scroll_position = await page.evaluate(
            "() => document.documentElement.scrollTop || document.body.scrollTop;")
        new_height = current_scroll_position + 1
        while current_scroll_position <= new_height:
            current_scroll_position += speed + random.randint(-speed, speed)
            await page.evaluate(f"() => window.scrollTo(0, {current_scroll_position});")
            # The height is re-read every step because it can change.
            new_height = await page.evaluate("() => document.body.scrollHeight;")

    async def scroll_to(self, position, speed=5):
        """Scroll down in small random steps until ``position`` or the bottom."""
        page = self.parent._page
        current_scroll_position = await page.evaluate(
            "() => document.documentElement.scrollTop || document.body.scrollTop;")
        new_height = current_scroll_position + 1
        while current_scroll_position <= new_height:
            current_scroll_position += speed + random.randint(-speed, speed)
            await page.evaluate(f"() => window.scrollTo(0, {current_scroll_position});")
            new_height = await page.evaluate("() => document.body.scrollHeight;")
            if current_scroll_position > position:
                break

    async def slight_scroll_up(self, speed=4):
        """Scroll up by ``speed`` pixels per step for a random number of steps."""
        page = self.parent._page
        desired_scroll = -500
        current_scroll = 0
        while current_scroll > desired_scroll:
            # The counter advances by a random amount while each real scroll
            # is a fixed `speed`, so only the number of steps is randomised.
            current_scroll -= speed + random.randint(-speed, speed)
            await page.evaluate(f"() => window.scrollBy(0, {-speed});")

    async def scroll_down(self, amount, speed=4):
        """Scroll down by about ``amount`` pixels, stopping at the bottom."""
        page = self.parent._page

        current_scroll_position = await page.evaluate(
            "() => document.documentElement.scrollTop || document.body.scrollTop;")
        desired_position = current_scroll_position + amount
        while current_scroll_position < desired_position:
            scroll_amount = speed + random.randint(-speed, speed) * 0.5
            await page.evaluate(f"() => window.scrollBy(0, {scroll_amount});")
            new_scroll_position = await page.evaluate(
                "() => document.documentElement.scrollTop || document.body.scrollTop;")
            if new_scroll_position > current_scroll_position:
                current_scroll_position = new_scroll_position
            else:
                # we hit the bottom
                break

    async def wait_until_not_skeleton_or_captcha(self, skeleton_tag):
        """Wait until the ``data-e2e=skeleton_tag`` placeholder is not visible.

        Raises:
            exceptions.TimeoutException: the wait timed out and no captcha is
                visible.
        """
        page = self.parent._page
        content = page.locator(f'[data-e2e={skeleton_tag}]')
        try:
            await expect(content).not_to_be_visible()
        # NOTE(review): this only catches the builtin TimeoutError; confirm
        # which exception type expect() raises on timeout.
        except TimeoutError as e:
            captcha_element = get_captcha_element(page)
            if await captcha_element.is_visible():
                await self.solve_captcha()
                # Must be awaited, otherwise no pause happens at all.
                await asyncio.sleep(1)
            else:
                raise exceptions.TimeoutException(str(e)) from e

    async def check_and_wait_for_captcha(self):
        """Solve the captcha if one is visible, then pause one second."""
        page = self.parent._page
        captcha_element = get_captcha_element(page)
        if await captcha_element.is_visible():
            await self.solve_captcha()
            await asyncio.sleep(1)

    async def check_and_close_signin(self):
        """Click the guest-continue link if it is visible."""
        page = self.parent._page
        signin_element = get_login_close_element(page)
        if await signin_element.is_visible():
            await signin_element.click()

    async def solve_captcha(self):
        """Solve the captcha currently shown, manually or automatically.

        With ``parent._manual_captcha_solves`` set, this blocks on ``input()``
        until the user confirms. Otherwise the puzzle and piece images are
        taken from the recorded requests, passed to
        ``captcha_solver.CaptchaSolver``, and the slider is dragged with
        human-like mouse curves. With ``parent._log_captcha_solves`` set, the
        POST body of the first recorded ``/captcha/verify`` request is written
        to a timestamped JSON file.

        Raises:
            exceptions.EmptyResponseException: the captcha request has no
                response.
            exceptions.CaptchaException: unsupported or unrecognised captcha
                payload, or an empty puzzle/piece image.
        """
        if self.parent._manual_captcha_solves:
            input("Press Enter to continue after solving CAPTCHA:")
            await asyncio.sleep(1)
            if self.parent._log_captcha_solves:
                request = self.get_requests('/captcha/verify')[0]
                body = request.post_data
                with open(f"manual_captcha_{datetime.now().isoformat()}.json", "w") as f:
                    f.write(body)
            return

        # From here on the solution is both computed and submitted: dragging
        # the slider is what sends it to TikTok.

        # get captcha data
        request = self.get_requests('/captcha/get')[0]
        captcha_response = await request.response()
        if captcha_response is None:
            raise exceptions.EmptyResponseException
        captcha_json = await captcha_response.json()

        data = captcha_json['data']
        if 'mode' in data:
            captcha_data = data
        elif 'challenges' in data:
            captcha_data = data['challenges'][0]
        else:
            # Previously fell through and failed on an unbound variable.
            raise exceptions.CaptchaException(
                "Captcha response has neither 'mode' nor 'challenges' in its data")
        captcha_type = captcha_data['mode']
        if captcha_type not in ['slide', 'whirl']:
            raise exceptions.CaptchaException(f"Unsupported captcha type: {captcha_type}")

        # question.url1 is the puzzle image URL; the browser has already
        # requested it, so its bytes come from the recorded request.
        puzzle_req = self.get_requests(captcha_data['question']['url1'])[0]
        puzzle_response = await puzzle_req.response()
        puzzle = await puzzle_response.body()

        if not puzzle:
            raise exceptions.CaptchaException("Puzzle was not found in response")

        # question.url2 is the image of the piece that has to be moved.
        piece_req = self.get_requests(captcha_data['question']['url2'])[0]
        piece_response = await piece_req.response()
        piece = await piece_response.body()

        if not piece:
            raise exceptions.CaptchaException("Piece was not found in response")

        # The local solver decides where the piece belongs in the puzzle.
        solve = await captcha_solver.CaptchaSolver(captcha_response, puzzle, piece).solve_captcha()

        page = self.parent._page
        drag = page.locator('css=div.secsdk-captcha-drag-icon').first
        bar = page.locator('css=div.captcha_verify_slide--slidebar').first

        drag_bounding_box = await drag.bounding_box()
        bar_bounding_box = await bar.bounding_box()

        drag_centre = {
            'x': drag_bounding_box['x'] + drag_bounding_box['width'] / 2,
            'y': drag_bounding_box['y'] + drag_bounding_box['height'] / 2
        }

        # solve['maxloc'] scales the usable slider travel (bar minus handle).
        bar_effective_width = bar_bounding_box['width'] - drag_bounding_box['width']
        distance_to_drag = bar_effective_width * solve['maxloc']

        from pyclick import HumanCurve

        curve_kwargs = {
            'knotsCount': 7,
            'distortionMean': 14.3,
            'distortionStdev': 22.7,
            'distortionFrequency': 0.8,
            'targetPoints': 500
        }
        start = [int(drag_centre['x']), int(drag_centre['y'])]
        end = [int(drag_centre['x'] + distance_to_drag), int(drag_centre['y'])]

        # Move to the handle, press, drag along a second curve, release.
        for point in HumanCurve([0, 0], start, **curve_kwargs).points:
            await page.mouse.move(point[0], point[1])
        await page.mouse.down()
        for point in HumanCurve(start, end, **curve_kwargs).points:
            await page.mouse.move(point[0], point[1])
        await page.mouse.up()

        if self.parent._log_captcha_solves:
            await asyncio.sleep(1)
            request = self.get_requests('/captcha/verify')[0]
            body = request.post_data
            with open(f"automated_captcha_{datetime.now().isoformat()}.json", "w") as f:
                f.write(body)
async def info_full(self, **kwargs) -> dict:
    """
    Returns all information sent by TikTok related to this hashtag.

    Navigates the browser page to the hashtag's tag page, then reads the
    captured `api/challenge/detail` response whose URL mentions this
    hashtag's name and stores its `challengeInfo` entry in `self.as_dict`.

    - Raises:
        - TypeError: if the hashtag was created without a name.
        - ApiFailedException: if no matching challenge response was captured,
          or the response has no `challengeInfo` entry.

    Example Usage
    ```py
    hashtag_data = await api.hashtag(name='funny').info_full()
    ```
    """
    # Both the page URL and the response filter are built from the name, so
    # fail before navigating anywhere instead of loading ".../tag/None".
    if not self.name:
        raise TypeError(
            "You must provide the name when creating this class to use this method."
        )

    page = self.parent._page

    url = f"https://www.tiktok.com/tag/{self.name}"
    await page.goto(url)

    await self.wait_for_content_or_unavailable_or_captcha('[data-e2e=challenge-item]', 'Not available')
    await self.check_and_close_signin()

    # Keep only the detail responses whose URL names this hashtag.
    name_param = f"challengeName={urllib.parse.quote_plus(self.name)}"
    challenge_responses = [
        response
        for response in self.get_responses("api/challenge/detail")
        if name_param in response.url
    ]
    if not challenge_responses:
        raise ApiFailedException("Failed to get challenge request")

    rep_body = await self.get_response_body(challenge_responses[0])
    rep_d = json.loads(rep_body.decode('utf-8'))

    if 'challengeInfo' not in rep_d:
        raise ApiFailedException("Failed to get challengeInfo from response")

    self.as_dict = rep_d['challengeInfo']
    return self.as_dict
async def _get_videos_scraping(self, count=30, offset=0, **kwargs):
    """
    Yield videos for this hashtag from the `api/challenge/item_list`
    requests made by the browser page, scrolling the page between polls.

    - Parameters:
        - count (int): Keep polling until at least this many videos have been
          yielded. Every video of a response is yielded, so the total can
          exceed `count`.
        - offset (int): Accepted for signature compatibility; not used here.

    - Raises:
        - EmptyResponseException: if more than MAX_TRIES polls found no
          unprocessed item-list request for this hashtag.

    Reads `self.as_dict['challenge']['id']`, so the hashtag info must have
    been loaded first.
    """
    MAX_TRIES = 5
    data_request_path = "api/challenge/item_list"

    processed_urls = set()
    amount_yielded = 0
    tries = 0

    def pending_requests():
        # Item-list requests for this hashtag that have not been consumed yet.
        challenge_param = f"challengeID={self.as_dict['challenge']['id']}"
        return [
            request
            for request in self.get_requests(data_request_path)
            if challenge_param in request.url and request.url not in processed_urls
        ]

    while amount_yielded < count:
        await self.parent.request_delay()

        for request in pending_requests():
            processed_urls.add(request.url)
            response = await request.response()
            try:
                body = await self.get_response_body(response)
                res = json.loads(body)
            except Exception:
                # Best effort: skip a response whose body cannot be read or
                # parsed. `Exception` (not a bare except) so that task
                # cancellation and KeyboardInterrupt still propagate.
                continue
            if res.get('type') == 'verify':
                # this is the captcha denied response
                continue

            videos = res.get("itemList", [])
            amount_yielded += len(videos)
            for video in videos:
                yield self.parent.video(data=video)

            if not res.get("hasMore", False):
                self.parent.logger.info(
                    "TikTok isn't sending more TikToks beyond this point."
                )
                return

        # NOTE(review): this scrolls `tries` times, so nothing is scrolled
        # until the first empty poll has been counted - confirm that is intended.
        for _ in range(tries):
            await self.slight_scroll_up()
            await self.scroll_to_bottom()
            await self.parent.request_delay()

        # Use the same hashtag filter as above: a request for another
        # challenge is never marked processed, so counting it as "new" here
        # would keep the loop spinning forever.
        if not pending_requests():
            tries += 1
            if tries > MAX_TRIES:
                # A bare `raise` here had no active exception to re-raise.
                raise EmptyResponseException(
                    "No new hashtag video requests were captured after scrolling"
                )
def __getattr__(self, name):
    """
    Fallback hook, called only when normal attribute lookup fails.

    `id`, `name` and `as_dict` can only be loaded through the async `info()`
    method, which cannot be awaited from this synchronous hook, so a missing
    one is reported with a hint instead of being fetched.

    - Raises:
        - AttributeError: always.
    """
    # TODO: Maybe switch to using @property instead
    # Calling self.info() here would only create an un-awaited coroutine;
    # storing that in as_dict corrupts the object and then fails on .keys().
    if name in ("id", "name", "as_dict"):
        raise AttributeError(
            f"{name} is not set on PyTok.api.Hashtag; await info() to load it"
        )

    raise AttributeError(f"{name} doesn't exist on PyTok.api.Hashtag")
async def search_type(self, obj_type, count=28, offset=0, **kwargs) -> Iterator:
    """
    Searches TikTok for `self.search_term` and yields the results.

    Opens the search page, then reads the `api/search/<obj_type>` requests
    made by the page, clicking the "load more" button between passes.

    - Parameters:
        - obj_type (str): user | item
        - count (int): Keep loading until at least this many results have
          been yielded. Every result of a response is yielded, so the total
          can exceed `count`.
        - offset (int): Accepted for signature compatibility; not used here.

    - Raises:
        - TypeError: if obj_type is neither "user" nor "item".

    Just use .videos & .users
    """
    if obj_type == "user":
        subdomain, subpath = "www", "user"
        list_key, factory = "user_list", User
    elif obj_type == "item":
        subdomain, subpath = "us", "video"
        list_key, factory = "item_list", Video
    else:
        raise TypeError("invalid obj_type")

    page = self.parent._page

    url = f"https://{subdomain}.tiktok.com/search/{subpath}?q={self.search_term}"
    await page.goto(url)

    await self.wait_for_content_or_captcha('search_video-item')

    processed_urls = set()
    amount_yielded = 0
    # NOTE(review): nothing in this method ever sets this to 'requests', so
    # the second branch below is currently unreachable.
    pull_method = 'browser'

    path = f"api/search/{obj_type}"

    while amount_yielded < count:
        await self.parent.request_delay()

        if pull_method == 'browser':
            search_requests = [
                request
                for request in self.get_requests(path)
                if request.url not in processed_urls
            ]
            for request in search_requests:
                processed_urls.add(request.url)
                # get_response_body is given a response (not a request) by
                # the other scrapers in this package, so resolve it first.
                response = await request.response()
                body = await self.get_response_body(response)
                res = json.loads(body)
                if res.get('type') == 'verify':
                    # this is the captcha denied response
                    continue

                for result in res.get(list_key, []):
                    yield factory(data=result)
                    amount_yielded += 1

                if res.get("has_more", 0) == 0:
                    self.parent.logger.info(
                        "TikTok is not sending videos beyond this point."
                    )
                    return

            try:
                # Awaited: without it this is a coroutine object, not the
                # button. Presumably the helper returns the located element
                # - verify against Base.wait_for_content_or_captcha.
                load_more_button = await self.wait_for_content_or_captcha('search-load-more')
            except TimeoutError:
                return

            await load_more_button.click()

            await self.wait_until_not_skeleton_or_captcha('video-skeleton-container')

        elif pull_method == 'requests':
            cursor = res["cursor"]
            next_url = re.sub("offset=([0-9]+)", f"offset={cursor}", request.url)
            cookies = await self.parent._context.cookies()
            cookies = {cookie['name']: cookie['value'] for cookie in cookies}
            r = requests.get(next_url, headers=request.headers, cookies=cookies)
            res = r.json()

            if res.get('type') == 'verify':
                pull_method = 'browser'
                continue

            for result in res.get(list_key, []):
                yield factory(data=result)
                amount_yielded += 1

            if res.get("has_more", 0) == 0:
                self.parent.logger.info(
                    "TikTok is not sending videos beyond this point."
                )
                return
def __init__(self, id: Optional[str] = None, data: Optional[dict] = None):
    """
    You must provide the id of the sound or it will not work.

    - Parameters:
        - id (str): TikTok's ID for the sound. When `data` is given the id
          is taken from `data` instead.
        - data (dict): Raw sound data; `id`, `title` and the author are
          extracted from it.

    - Raises:
        - TypeError: if neither `data` nor `id` is provided.
    """
    if data is None and id is None:
        raise TypeError("You must provide id parameter.")

    # Give every declared attribute a value up front, so reading `title`,
    # `author` or `as_dict` on an id-only Sound yields None instead of
    # raising AttributeError.
    self.id = id
    self.title = None
    self.author = None
    self.as_dict = data

    if data is not None:
        self.__extract_from_data()
def __extract_from_data(self):
    """
    Populate `id`, `title` and `author` from the raw data in `self.as_dict`.

    `author` is built with `self.parent.user(...)` from the `authorName`
    entry, or set to None when the data has no such entry. An error is
    logged when the data carries no `id`.
    """
    data = self.as_dict

    self.id = data.get("id")
    self.title = data.get("title")

    author_name = data.get("authorName")
    # Always assign author so the attribute exists even for sounds whose
    # data has no authorName.
    self.author = (
        self.parent.user(username=author_name) if author_name is not None else None
    )

    if self.id is None:
        self.parent.logger.error(
            f"Failed to create Sound with data: {data}\nwhich has keys {data.keys()}"
        )
29 | """ 30 | 31 | raise NotImplementedError() 32 | -------------------------------------------------------------------------------- /pytok/api/user.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | import asyncio 5 | import json 6 | import re 7 | from urllib.parse import urlencode, urlparse 8 | 9 | from patchright.async_api import TimeoutError as PlaywrightTimeoutError 10 | import requests 11 | 12 | from ..exceptions import * 13 | from ..helpers import extract_tag_contents, edit_url 14 | 15 | from typing import TYPE_CHECKING, ClassVar, Iterator, Optional 16 | 17 | if TYPE_CHECKING: 18 | from ..tiktok import PyTok 19 | from .video import Video 20 | 21 | from .base import Base 22 | 23 | 24 | class User(Base): 25 | """ 26 | A TikTok User. 27 | 28 | Example Usage 29 | ```py 30 | user = api.user(username='therock') 31 | # or 32 | user_id = '5831967' 33 | sec_uid = 'MS4wLjABAAAA-VASjiXTh7wDDyXvjk10VFhMWUAoxr8bgfO1kAL1-9s' 34 | user = api.user(user_id=user_id, sec_uid=sec_uid) 35 | ``` 36 | 37 | """ 38 | 39 | parent: ClassVar[PyTok] 40 | 41 | user_id: str 42 | """The user ID of the user.""" 43 | sec_uid: str 44 | """The sec UID of the user.""" 45 | username: str 46 | """The username of the user.""" 47 | as_dict: dict 48 | """The raw data associated with this user.""" 49 | 50 | def __init__( 51 | self, 52 | username: Optional[str] = None, 53 | user_id: Optional[str] = None, 54 | sec_uid: Optional[str] = None, 55 | data: Optional[dict] = None, 56 | ): 57 | """ 58 | You must provide the username or (user_id and sec_uid) otherwise this 59 | will not function correctly. 
async def info_full(self, **kwargs) -> dict:
    """
    Returns a dictionary of information associated with this User.
    Includes statistics about this user.

    Loads the profile page (unless the browser is already on it) and reads
    the user from the last captured `api/user/detail` response; when none
    was captured, falls back to the JSON embedded in the page HTML.

    - Raises:
        - TypeError: if the user was created without a username.
        - NotAvailableException: if the page request got no response or a
          status of 300 or above.
        - TimeoutException: if Playwright timed out loading the page.
        - InvalidJSONException: if the embedded JSON holds no user data.

    Example Usage
    ```py
    user_data = await api.user(username='therock').info_full()
    ```
    """

    # TODO: Find the one using only user_id & sec_uid
    if not self.username:
        raise TypeError(
            "You must provide the username when creating this class to use this method."
        )

    url = f"https://www.tiktok.com/@{self.username}?lang=en"

    page = self.parent._page

    self.parent.logger.debug(f"Loading page: {url}")
    if page.url != url:
        try:
            async with page.expect_request(url) as event:
                await page.goto(url, timeout=60 * 1000)
                request = await event.value
                response = await request.response()
                # Playwright yields None when the request failed without
                # ever receiving a response; treat that as unavailable
                # rather than failing on `None.status`.
                if response is None or response.status >= 300:
                    raise NotAvailableException("Content is not available")
        except PlaywrightTimeoutError as ex:
            raise TimeoutException("Page load timed out") from ex

    await self.wait_for_content_or_unavailable_or_captcha('[data-e2e="user-post-item"]',
                                                            "Couldn't find this account",
                                                            no_content_text=["No content", "This account is private"])
    # resolve any remaining issues
    await asyncio.sleep(0.5)
    await self.wait_for_content_or_unavailable_or_captcha('[data-e2e="user-post-item"]',
                                                            "Couldn't find this account",
                                                            no_content_text=["No content", "This account is private"])

    data_responses = self.get_responses('api/user/detail')

    if data_responses:
        # The most recent detail response wins.
        data = await data_responses[-1].json()
        user_info = data["userInfo"]
        user = user_info["user"] | user_info["stats"]
    else:
        # get initial html data
        html_body = await page.content()

        tag_contents = extract_tag_contents(html_body)
        self.initial_json = json.loads(tag_contents)

        if 'UserModule' in self.initial_json:
            user_module = self.initial_json["UserModule"]
            user = user_module["users"][self.username] | user_module["stats"][self.username]
        elif '__DEFAULT_SCOPE__' in self.initial_json:
            user_detail = self.initial_json['__DEFAULT_SCOPE__']['webapp.user-detail']
            if user_detail['statusCode'] != 0:
                raise InvalidJSONException("Failed to find user data in HTML")
            user_info = user_detail['userInfo']
            user = user_info['user'] | user_info['stats']
        else:
            raise InvalidJSONException("Failed to find user data in HTML")

    self.as_dict = user
    self.__extract_from_data()
    return user
async def _get_videos_api(self, count, cursor, get_bytes, **kwargs) -> Iterator[Video]:
    """
    Yield this user's videos by replaying the cached item-list request with
    `requests`, advancing the cursor page by page.

    - Parameters:
        - count (int | None): Keep paging until at least this many videos
          have been yielded; None pages until TikTok reports no more.
        - cursor (int): Cursor to start paging from.
        - get_bytes: Accepted for signature compatibility; not used here.

    - Raises:
        - ApiFailedException: if the `s_v_web_id` cookie is missing, the
          HTTP status is not 200, the body is empty or not JSON, the
          response asks for verification, or it carries no usable cursor.

    Reads the request stored under `self.parent.request_cache['videos']`.
    """
    # requesting videos via the api in the context of the browser session makes tiktok kill the session
    # using requests instead
    amount_yielded = 0

    data_request = self.parent.request_cache['videos']

    all_cookies = await self.parent._context.cookies()
    verify_cookies = [cookie for cookie in all_cookies if cookie['name'] == 's_v_web_id']
    if not verify_cookies:
        raise ApiFailedException("Failed to get videos from API without verify cookies")
    verify_fp = verify_cookies[0]['value']

    # Nothing in the headers changes between pages, so build them once.
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-GB,en;q=0.9',
        'priority': 'u=1, i',
        'referer': f'https://www.tiktok.com/@{self.username}?lang=en',
        'sec-ch-ua': '"Not;A=Brand";v="24", "Chromium";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.6613.18 Safari/537.36'
    }

    while (count is None or amount_yielded < count):
        next_url = edit_url(
            data_request.url,
            {
                'cursor': cursor,
                'id': self.user_id,
                'secUid': self.sec_uid,
                'needPinnedItemIds': True,
                'post_item_list_request_type': 0,
                'verifyFp': verify_fp
            }
        )
        # Cookies are re-read for every page, as the original code did.
        cookies = await self.parent._context.cookies()
        cookies = {cookie['name']: cookie['value'] for cookie in cookies}
        # NOTE(review): requests.get is a blocking call inside a coroutine.
        r = requests.get(next_url, headers=headers, cookies=cookies)

        if r.status_code != 200:
            raise ApiFailedException(f"Failed to get videos from API with status code {r.status_code}")
        if not r.content:
            raise ApiFailedException(f"Failed to get videos from API with empty response")

        # Report a malformed payload as ApiFailedException, like the other
        # failure modes here, so a caller handling API failure sees it too.
        try:
            res = r.json()
        except ValueError as ex:
            raise ApiFailedException("Failed to decode JSON from TikTok API response") from ex

        if res.get('type') == 'verify':
            raise ApiFailedException("TikTok API is asking for verification")

        videos = res.get('itemList', [])
        try:
            cursor = int(res['cursor'])
        except (KeyError, TypeError, ValueError) as ex:
            raise ApiFailedException("TikTok API response did not include a usable cursor") from ex

        if videos:
            amount_yielded += len(videos)
            for video in videos:
                yield self.parent.video(data=video)

        has_more = res.get("hasMore")
        if not has_more:
            self.parent.logger.info(
                "TikTok isn't sending more TikToks beyond this point."
            )
            return

        await self.parent.request_delay()
async def _get_initial_videos(self, count, get_bytes):
    """
    Collect the videos from the `api/post/item_list` responses already
    captured for this user.

    - Parameters:
        - count: Accepted for signature compatibility; not used here.
        - get_bytes: Accepted for signature compatibility; not used here.

    - Returns a tuple `(videos, finished, cursor)`:
        - videos (list): video objects built from every readable response.
        - finished (bool): True when the last response that had items
          reported no more videos.
        - cursor: the cursor of that same response (0 if none had items).

    - Raises:
        - ApiFailedException: if no item-list response for this user's
          secUid was captured.

    Also stores the last matching response in
    `self.parent.request_cache['videos']`.
    """
    video_responses = [
        res
        for res in self.get_responses('api/post/item_list')
        if f"secUid={self.sec_uid}" in res.url
    ]
    if not video_responses:
        raise ApiFailedException("Failed to get videos from API")

    all_videos = []
    finished = False
    cursor = 0

    for video_response in video_responses:
        try:
            if len(video_response._body) == 0:
                continue
            video_data = await video_response.json()
            if video_data.get('itemList'):
                all_videos += [
                    self.parent.video(data=video) for video in video_data['itemList']
                ]
                finished = not video_data.get('hasMore', False)
                cursor = video_data.get('cursor', 0)
        except Exception as ex:
            # Still best effort - one unreadable response must not abort
            # the rest - but leave a trace instead of hiding the failure.
            self.parent.logger.debug(
                "Skipping unreadable item_list response %s: %s", video_response.url, ex
            )

    self.parent.request_cache['videos'] = video_responses[-1]
    return all_videos, finished, cursor
def liked(self, count: int = 30, cursor: int = 0, **kwargs) -> Iterator[Video]:
    """
    Yields the TikToks that a given user has liked.

    **Note**: The user's likes must be **public** (which is not the default option)

    - Parameters:
        - count (int): The amount of videos you want returned.
        - cursor (int): The unix epoch to get uploaded videos since.

    Iteration stops once ``count`` videos have been yielded, when the
    response has no ``itemList`` (likes are most likely private), or when
    a page after the first reports no ``hasMore``.

    Example Usage
    ```py
    for liked_video in api.user(username='public_likes').liked():
        # do something
    ```
    """
    processed = self.parent._process_kwargs(kwargs)
    kwargs["custom_device_id"] = processed.device_id

    amount_yielded = 0
    first = True

    if self.user_id is None and self.sec_uid is None:
        self.__find_attributes()

    while amount_yielded < count:
        query = {
            "count": 30,
            "id": self.user_id,
            "type": 2,
            "secUid": self.sec_uid,
            "cursor": cursor,
            "sourceType": 9,
            "appId": 1233,
            "region": processed.region,
            "priority_region": processed.region,
            "language": processed.language,
        }
        path = "api/favorite/item_list/?{}&{}".format(
            self.parent._add_url_params(), urlencode(query)
        )

        res = self.parent.get_data(path, **kwargs)

        if "itemList" not in res:
            if first:
                self.parent.logger.error("User's likes are most likely private")
            return

        # Count every video exactly once; the previous code added both
        # len(videos) and 1 per video, so it stopped at roughly count / 2.
        for video in res.get("itemList") or []:
            yield self.parent.video(data=video)
            amount_yielded += 1
            if amount_yielded >= count:
                return

        if not res.get("hasMore", False) and not first:
            self.parent.logger.info(
                "TikTok isn't sending more TikToks beyond this point."
            )
            return

        cursor = res["cursor"]
        first = False
class Counter:
    """A mutable integer tally that can be shared between generators."""

    def __init__(self):
        # running total, starts at zero
        self._counter = 0

    def add(self, n):
        """Increase the tally by ``n``."""
        self._counter = self._counter + n

    def get(self):
        """Return the current tally."""
        return self._counter
async def info(self, **kwargs) -> dict:
    """
    Returns a dictionary of all data associated with a TikTok Video.

    The data is read from the JSON embedded in the video page's HTML the
    first time it is needed and stored on ``self.as_dict``; later calls
    return the stored dictionary.

    - Raises: exceptions.NotAvailableException when the embedded video
      detail reports a non-zero ``statusCode``.

    Example Usage
    ```py
    video_data = await api.video(id='7041997751718137094').info()
    ```
    """
    if hasattr(self, 'as_dict'):
        return self.as_dict

    url = self._get_url()
    # only navigate when the browser is not already on this video
    if self.parent._page.url != url:
        await self.view()

    await self.check_and_resolve_login_popup()

    # the most recent response for the page URL holds the initial html data
    html_response = self.get_responses(url)[-1]
    html_body = await self.get_response_body(html_response)
    page_state = json.loads(extract_tag_contents(html_body))

    detail = page_state['__DEFAULT_SCOPE__']['webapp.video-detail']
    if detail['statusCode'] != 0:
        raise exceptions.NotAvailableException(
            f"Content is not available with status message: {detail['statusMsg']}")

    self.as_dict = detail['itemInfo']['itemStruct']
    return self.as_dict
async def bytes_network_info(self, **kwargs) -> dict:
    """
    Returns network data for the request that fetched this Video's bytes.

    The captured requests whose path matches the video's ``playAddr`` are
    tried in order; the first one whose response can be read wins.

    - Returns: a dict with ``server_addr`` and ``headers`` taken from that
      response.
    - Raises: Exception when no matching request was captured, or when
      none of the matching requests has a readable response (the last
      failure is attached as the cause).

    Example Usage
    ```py
    network_data = await api.video(id='7041997751718137094').bytes_network_info()
    ```
    """
    play_path = url_parsers.urlparse(self.as_dict['video']['playAddr']).path
    reqs = self.get_requests(play_path)
    if not reqs:
        # TODO load page and pull
        raise Exception("No requests found for video")

    last_error = None
    for req in reqs:
        try:
            res = await req.response()
            return {
                'server_addr': await res.server_addr(),
                'headers': await res.all_headers(),
            }
        except Exception as err:
            # keep the reason so the final error is diagnosable
            last_error = err

    raise Exception("Failed to get video bytes") from last_error
171 | 172 | Example Usage 173 | ```py 174 | api.video(id='7041997751718137094').view() 175 | ``` 176 | """ 177 | page = self.parent._page 178 | url = self._get_url() 179 | try: 180 | async with page.expect_request(url) as event: 181 | await page.goto(url) 182 | request = await event.value 183 | response = await request.response() 184 | if response.status >= 300: 185 | raise exceptions.NotAvailableException("Content is not available") 186 | # no need to check for captcha, because video data is in the html regardless 187 | await self.wait_for_content_or_unavailable('[id="main-content-video_detail"]', 'Video currently unavailable') 188 | except PlaywrightTimeoutError as e: 189 | raise exceptions.TimeoutException(str(e)) 190 | 191 | async def _related_videos(self, counter, count=20): 192 | data_request_path = "api/related/item_list" 193 | data_requests = self.get_requests(data_request_path) 194 | for req in data_requests: 195 | # parse params from url 196 | url_parsed = url_parsers.urlparse(req.url) 197 | params = url_parsers.parse_qs(url_parsed.query) 198 | if params['itemID'][0] != self.id: 199 | continue 200 | res = await req.response() 201 | if res is None: 202 | continue 203 | body = await res.body() 204 | if len(body) == 0: 205 | continue 206 | d = await res.json() 207 | for v in d.get('itemList', []): 208 | yield v 209 | counter.add(1) 210 | if counter.get() >= count: 211 | break 212 | 213 | async def related_videos(self, count=20) -> list[dict]: 214 | """ 215 | Returns a list of related 216 | TikTok Videos to the current Video. 
async def bytes(self, **kwargs) -> bytes:
    """
    Returns the bytes of a TikTok Video.

    A body already captured for the video's ``playAddr`` path is returned
    when one is available; otherwise the file is requested directly, with
    a 10 second limit.

    - Raises:
        - exceptions.NotAvailableException: the post has an empty ``playAddr``.
        - exceptions.TimeoutException: the direct request took too long.

    Example Usage
    ```py
    video_bytes = await api.video(id='7041997751718137094').bytes()

    # Saving The Video
    with open('saved_video.mp4', 'wb') as output:
        output.write(video_bytes)
    ```
    """
    bytes_url = self.as_dict['video']['playAddr']
    if len(bytes_url) == 0:
        raise exceptions.NotAvailableException("Post does not have a video")

    play_path = url_parsers.urlparse(bytes_url).path
    for res in self.get_responses(play_path):
        cached_body = getattr(res, '_body', None)
        if cached_body:
            return cached_body

    # if we don't have the bytes in a response, send the request ourselves
    try:
        return await asyncio.wait_for(self._request_bytes(bytes_url), timeout=10)
    except (asyncio.TimeoutError, TimeoutError) as err:
        # asyncio.TimeoutError is only an alias of TimeoutError from
        # Python 3.11 on, so both are caught explicitly
        raise exceptions.TimeoutException("Failed to get video bytes in time") from err
await self.parent._context.cookies() 274 | cookies = {cookie['name']: cookie['value'] for cookie in cookies} 275 | r = requests.get(bytes_url, headers=bytes_headers, cookies=cookies) 276 | if r.content is not None or len(r.content) > 0: 277 | return r.content 278 | raise Exception("Failed to get video bytes") 279 | 280 | async def _get_comments_and_req(self, count): 281 | # get request 282 | data_request_path = "api/comment/list" 283 | data_responses = self.get_responses(data_request_path) 284 | 285 | amount_yielded = 0 286 | all_comments = [] 287 | processed_urls = [] 288 | 289 | valid_data_request = None 290 | for data_response in data_responses: 291 | try: 292 | res = await data_response.json() 293 | 294 | self.parent.request_cache['comments'] = data_response.request 295 | 296 | processed_urls.append(data_response.url) 297 | 298 | comments = res.get("comments", []) 299 | 300 | amount_yielded += len(comments) 301 | all_comments += comments 302 | 303 | if amount_yielded > count: 304 | return all_comments, processed_urls, True 305 | 306 | has_more = res.get("has_more") 307 | if has_more != 1: 308 | self.parent.logger.info( 309 | "TikTok isn't sending more TikToks beyond this point." 
310 | ) 311 | return all_comments, processed_urls, True 312 | except Exception: 313 | pass 314 | 315 | return all_comments, processed_urls, False 316 | 317 | async def _get_comment_replies(self, comment, batch_size): 318 | if 'comments' not in self.parent.request_cache: 319 | return 320 | data_request = self.parent.request_cache['comments'] 321 | num_already_fetched = len( 322 | comment.get('reply_comment', []) if comment.get('reply_comment', []) is not None else []) 323 | num_comments_to_fetch = comment['reply_comment_total'] - num_already_fetched 324 | while num_comments_to_fetch > 0: 325 | 326 | url_parsed = url_parsers.urlparse(data_request.url) 327 | params = url_parsers.parse_qs(url_parsed.query) 328 | params['cursor'] = num_already_fetched 329 | del params['aweme_id'] 330 | params['count'] = min(num_comments_to_fetch, batch_size) 331 | params['item_id'] = comment['aweme_id'] 332 | params['comment_id'] = comment['cid'] 333 | params['focus_state'] = 'true' 334 | url_path = url_parsed.path.replace("api/comment/list", "api/comment/list/reply") 335 | next_url = f"{url_parsed.scheme}://{url_parsed.netloc}{url_path}?{url_parsers.urlencode(params, doseq=True)}" 336 | cookies = await self.parent._context.cookies() 337 | cookies = {cookie['name']: cookie['value'] for cookie in cookies} 338 | r = requests.get(next_url, headers=data_request.headers, cookies=cookies) 339 | res = r.json() 340 | 341 | reply_comments = res.get("comments", []) 342 | 343 | if reply_comments: 344 | comment['reply_comment'] = comment['reply_comment'] + reply_comments if comment[ 345 | 'reply_comment'] else reply_comments 346 | 347 | has_more = res.get("has_more") 348 | if has_more != 1: 349 | self.parent.logger.info( 350 | "TikTok isn't sending more TikToks beyond this point." 
async def comments(self, count=200, batch_size=100):
    """
    Yield the comments of this Video.

    With only a video id, everything is fetched through
    ``_get_api_comments``. With both id and username the page is opened,
    comments already present in captured responses are yielded first
    (after their replies were fetched), and the remainder comes from
    ``_get_api_comments``, falling back to ``_get_scroll_comments`` when
    that raises ``exceptions.ApiFailedException``.

    - Parameters:
        - count (int): passed on to the fetch helpers as the target amount.
        - batch_size (int): passed on to the reply/API fetch helpers.
    """
    if not (self.id and self.username):
        # if we only have the video id, we need to entirely rely on the api
        async for api_comment in self._get_api_comments(count, batch_size, set()):
            yield api_comment
        return

    await self.view()
    await self.wait_for_content_or_unavailable_or_captcha('css=[data-e2e=comment-level-1]',
                                                          'Be the first to comment!')
    # TODO allow multi layer comment fetch

    initial, processed_urls, finished = await self._get_comments_and_req(count)

    for captured in initial:
        await self._get_comment_replies(captured, batch_size)

    amount_yielded = len(initial)
    for captured in initial:
        yield captured

    if finished:
        return

    # so that we don't re-yield any comments previously yielded
    seen_ids = {captured['cid'] for captured in initial}
    try:
        async for api_comment in self._get_api_comments(count, batch_size, seen_ids):
            yield api_comment
    except exceptions.ApiFailedException:
        async for scrolled in self._get_scroll_comments(count, amount_yielded, processed_urls):
            yield scrolled
self.get_responses(data_request_path) 407 | data_responses = [data_response for data_response in data_responses if 408 | data_response.url not in processed_urls] 409 | 410 | if len(data_responses) == 0: 411 | if tries > 5: 412 | print(f"Not sending anymore!") 413 | break 414 | tries += 1 415 | 416 | for data_response in data_responses: 417 | try: 418 | res = await data_response.json() 419 | 420 | processed_urls.append(data_response.url) 421 | 422 | comments = res.get("comments", []) 423 | 424 | for comment in comments: 425 | await self._get_comment_replies(comment, 100) 426 | 427 | amount_yielded += len(comments) 428 | for comment in comments: 429 | yield comment 430 | 431 | if amount_yielded > count: 432 | return 433 | 434 | has_more = res.get("has_more") 435 | if has_more != 1: 436 | self.parent.logger.info( 437 | "TikTok isn't sending more TikToks beyond this point." 438 | ) 439 | return 440 | except Exception as e: 441 | processed_urls.append(data_response.url) 442 | 443 | async def _get_comments_via_requests(self, count, cursor, data_request): 444 | ms_tokens = await self.parent.get_ms_tokens() 445 | next_url = edit_url(data_request.url, {'count': count, 'cursor': cursor, 'aweme_id': self.id}) 446 | cookies = await self.parent._context.cookies() 447 | cookies = {cookie['name']: cookie['value'] for cookie in cookies} 448 | headers = await data_request.all_headers() 449 | headers = {k: v for k, v in headers.items() if not k.startswith(':')} 450 | headers['referer'] = None 451 | r = requests.get(next_url, headers=headers, cookies=cookies) 452 | 453 | if r.status_code != 200: 454 | raise Exception(f"Failed to get comments with status code {r.status_code}") 455 | 456 | if len(r.content) == 0: 457 | print("Failed to comments from API, switching to scroll") 458 | raise exceptions.ApiFailedException("No content in response") 459 | 460 | try: 461 | res = r.json() 462 | except Exception: 463 | res = json.loads(brotli.decompress(r.content).decode()) 464 | 465 | return 
async def _get_api_comments(self, count, batch_size, comment_ids):
    """
    Yield comments fetched from the comment list API, trying three strategies in turn.

    1. Navigate the browser page to the API URL, 20 comments per request.
    2. If that raises, request all ``count`` comments at once through
       ``_get_comments_via_requests`` (up to 5 attempts).
    3. If that also fails, page through ``_get_comments_via_requests``
       20 comments at a time.

    The URL of the request cached in ``self.parent.request_cache['comments']``
    is used as the template for every call.

    - Parameters:
        - count (int): target number of comments.
        - batch_size (int): forwarded to ``_get_comment_replies``.
        - comment_ids (set): ids already yielded by the caller; strategies
          1 and 2 skip comments whose ``cid`` is in this set.
    """
    data_request = self.parent.request_cache['comments']

    try:
        amount_yielded = 0
        cursor = 0
        while amount_yielded < count:
            # try directly requesting through browser
            url = edit_url(data_request.url,
                           {'count': 20, 'cursor': cursor, 'aweme_id': self.id})  # , 'msToken': ms_tokens[-1]})
            page = self.parent._page
            async with page.expect_request(url) as event:
                await page.goto(url)
                request = await event.value
                response = await request.response()
                if response.status >= 300:
                    raise exceptions.NotAvailableException("Content is not available")

            if response.status != 200:
                raise Exception(f"Failed to get comments with status code {response.status}")

            content = await response.body()
            if len(content) == 0:
                raise Exception("No content in response")

            res = await response.json()
            cursor = res.get("cursor", 0)

            comments = res.get("comments", [])
            amount_yielded += len(comments)
            for comment in comments:
                if comment['cid'] not in comment_ids:
                    try:
                        await self._get_comment_replies(comment, batch_size)
                    except Exception:
                        # replies are best effort; the comment itself is still yielded
                        pass
                    yield comment
    except Exception:
        try:
            # try getting all at once
            retries = 5
            for _ in range(retries):
                try:
                    cursor = '0'
                    res = await self._get_comments_via_requests(count, cursor, data_request)

                    comments = res.get("comments", [])
                    for comment in comments:
                        if comment['cid'] not in comment_ids:
                            try:
                                await self._get_comment_replies(comment, batch_size)
                            except Exception:
                                pass
                            yield comment

                    return
                except Exception:
                    # retry the whole single-shot request
                    pass
            else:
                print("Failed to get all comments at once")
                print("Trying batched...")
                raise Exception("Failed to get comments")
        except Exception:

            amount_yielded = len(comment_ids)
            cursor = 0
            while amount_yielded < count:
                res = await self._get_comments_via_requests(20, cursor, data_request)

                if res.get('type') == 'verify':
                    # force new request for cache; the coroutine must be
                    # awaited and needs the count argument, otherwise this
                    # line raises TypeError instead of refreshing anything
                    await self._get_comments_and_req(count)
                    data_request = self.parent.request_cache['comments']

                cursor = res.get("cursor", 0)
                comments = res.get("comments", [])

                if comments:
                    for comment in comments:
                        await self._get_comment_replies(comment, batch_size)

                    amount_yielded += len(comments)
                    for comment in comments:
                        yield comment

                has_more = res.get("has_more")
                if has_more != 1:
                    self.parent.logger.info(
                        "TikTok isn't sending more TikToks beyond this point."
                    )
                    return

                await self.parent.request_delay()
class CaptchaSolver:
    """
    Solves a TikTok captcha challenge in "slide" or "whirl" mode.

    Built from the network response that delivered the challenge plus the
    raw bytes of the two captcha images.
    """

    def __init__(self, response, puzzle, piece):
        """
        - Parameters:
            - response: the captured challenge response; its ``request``
              supplies the URL and headers reused for verification.
            - puzzle (bytes): raw puzzle image data.
            - piece (bytes): raw piece image data.
        """
        self._request = response.request
        self._response = response
        self._client = requests.Session()
        # the solvers expect base64 encoded images
        self._puzzle = base64.b64encode(puzzle)
        self._piece = base64.b64encode(piece)

    def _host(self):
        """Return the network location of the challenge request URL."""
        return urlparse(self._request.url).netloc

    def _params(self):
        """Return the raw query string of the challenge request URL."""
        return urlparse(self._request.url).query

    def _headers(self) -> dict:
        """Return the headers of the challenge request."""
        return self._request.headers

    async def _get_challenge(self) -> dict:
        """Return the parsed JSON body of the challenge response."""
        return await self._response.json()

    async def _solve_captcha(self) -> dict:
        """
        Compute the solution position for the current ``self._mode``.

        - Returns: dict with ``maxloc`` (solver result) and ``randlenght``
          (random number of drag steps between 50 and 100).
        - Raises: ValueError when the mode is neither "slide" nor "whirl".
        """
        if self._mode == "slide":
            maxloc = PuzzleSolver(self._puzzle, self._piece).get_position()
        elif self._mode == "whirl":
            maxloc = whirl_solver(self._puzzle, self._piece)
        else:
            # previously this fell through to an UnboundLocalError
            raise ValueError(f"Unsupported captcha mode: {self._mode}")
        randlength = round(
            random.random() * (100 - 50) + 50
        )
        await asyncio.sleep(1)  # don't remove delay or it will fail
        return {
            "maxloc": maxloc,
            "randlenght": randlength
        }

    @staticmethod
    def _reply_path(solve: dict, distance) -> list:
        """Build the list of drag samples that ends at ``distance``."""
        steps = solve["randlenght"]
        return [
            {
                "relative_time": i * steps,
                "x": round(distance / (steps / (i + 1))),
                "y": solve["tip"],
            }
            for i in range(steps)
        ]

    def _post_captcha(self, solve: dict) -> dict:
        """
        Submit a solution to ``/captcha/verify`` on the challenge host.

        - Parameters:
            - solve (dict): result of ``solve_captcha`` (keys ``id``,
              ``maxloc``, ``randlenght``, ``tip``).
        - Returns: the parsed JSON answer of the verification endpoint.
        - Raises: Exception when the HTTP status is not 200 or the answer
          carries a ``code`` of 500 or above.
        """
        body = {
            "id": solve["id"],
            "mode": self._mode
        }
        if self._mode == "slide":
            body.update({
                "modified_img_width": 552,
                "reply": self._reply_path(solve, solve["maxloc"]),
            })
        elif self._mode == "whirl":
            body.update({
                "modified_img_width": 340,
                "drag_width": 271,
                "reply": self._reply_path(solve, 271 * solve["maxloc"]),
            })

        # dict.update() returns None, so build a merged copy instead of
        # passing its result (and leave the request's own headers untouched)
        headers = {**self._headers(), "content-type": "application/json"}

        resp = self._client.post(
            url=(
                "https://"
                + self._host()
                + "/captcha/verify?"
                + self._params()
            ),
            headers=headers,
            json=body
        )

        if resp.status_code != 200:
            raise Exception("Captcha was not solved")

        # status code was 200, but perhaps the response was to say that the CAPTCHA failed.
        answer = resp.json()
        if answer['code'] >= 500:
            raise Exception(f"CAPTCHA server responded 200 but said: {answer['message']}")

        return answer

    async def solve_captcha(self):
        """
        Read the challenge, solve it and return the solution dict.

        The challenge is taken from ``data`` directly when it has a
        ``mode`` key, otherwise from the first entry of ``data.challenges``.

        - Returns: dict with ``maxloc``, ``randlenght``, ``id`` and ``tip``
          (``question.tip_y`` for "slide", 0 for "whirl").
        """
        captcha_challenge = await self._get_challenge()

        if 'mode' in captcha_challenge["data"]:
            captcha_challenge = captcha_challenge["data"]
        elif 'challenges' in captcha_challenge["data"]:
            captcha_challenge = captcha_challenge["data"]["challenges"][0]
        captcha_id = captcha_challenge["id"]
        self._mode = captcha_challenge["mode"]

        solve = await self._solve_captcha()

        solve['id'] = captcha_id
        if captcha_challenge["mode"] == "slide":
            solve['tip'] = captcha_challenge["question"]["tip_y"]
        elif captcha_challenge["mode"] == "whirl":
            solve['tip'] = 0
        return solve
def _sobel_operator(self, img):
    """
    Return an edge map of ``img``.

    The image is blurred with a 3x3 Gaussian kernel, converted from BGR
    to grayscale, and the absolute Sobel derivatives along x and y are
    blended with equal weight.
    """
    sobel_options = dict(
        ksize=3,
        scale=1,
        delta=0,
        borderType=cv2.BORDER_DEFAULT,
    )

    blurred = cv2.GaussianBlur(img, (3, 3), 0)
    gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)

    # first the x derivative (dx=1, dy=0), then the y derivative (dx=0, dy=1)
    abs_x, abs_y = (
        cv2.convertScaleAbs(cv2.Sobel(gray, cv2.CV_16S, dx, dy, **sobel_options))
        for dx, dy in ((1, 0), (0, 1))
    )

    return cv2.addWeighted(abs_x, 0.5, abs_y, 0.5, 0)
def whirl_solver(b64_puzzle, b64_piece):
    """
    Estimate the rotation that aligns the piece with the puzzle.

    The edge samples returned by ``_get_images_and_edges`` are compared for
    every cyclic shift of the piece edge; the shift with the largest
    element-wise product sum wins (the first one on ties, shift 0 when no
    sum is positive).

    Returns ``(resolution - best_shift) / resolution`` with a resolution of 300.
    """
    resolution = 300
    edges = _get_images_and_edges(b64_puzzle, b64_piece, resolution=resolution)
    puzzle_edge, piece_edge = edges[2], edges[3]

    # find the best match
    scores = (
        np.sum(puzzle_edge * np.roll(piece_edge, shift, axis=0))
        for shift in range(resolution)
    )
    top_score = 0
    top_shift = 0
    for shift, score in enumerate(scores):
        if score > top_score:
            top_score, top_shift = score, shift

    return (resolution - top_shift) / resolution
InvalidJSONException(TikTokException): 25 | """TikTok returned invalid JSON.""" 26 | 27 | 28 | class NotAvailableException(TikTokException): 29 | """The requested object is not available in this region.""" 30 | 31 | class NoContentException(TikTokException): 32 | """TikTok returned no content""" 33 | 34 | class TimeoutException(TikTokException): 35 | """Timed out trying to get content from TikTok""" 36 | 37 | class ApiFailedException(TikTokException): 38 | """TikTok API is failing""" 39 | 40 | class FewerVideosThanExpectedException(TikTokException): 41 | """TikTok is returning fewer videos for this user than their metadata led us to expect""" 42 | 43 | class AccountPrivateException(TikTokException): 44 | """This TikTok account is private and cannot be scraped""" -------------------------------------------------------------------------------- /pytok/helpers.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import parse as url_parsers 3 | 4 | import requests 5 | 6 | from .exceptions import * 7 | 8 | 9 | def extract_tag_contents(html): 10 | if isinstance(html, bytes): 11 | html = html.decode("utf-8") 12 | data_json_match = re.search(r"""")[0] 28 | return j_raw 29 | else: 30 | sigi_json = re.search('