├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── ntscraper ├── __init__.py └── nitter.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── instances_test.py ├── profile_test.py ├── search_test.py └── tweet_test.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | 154 | .vscode/ 155 | 156 | test.py -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lorenzo Bocchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unofficial Nitter scraper 2 | 3 | ## Note 4 | 5 | Twitter has recently made some changes which affected every third party Twitter client, including Nitter. As a result, most Nitter instances have shut down or will shut down shortly. Even local instances are affected by this, so you may not be able to scrape as many tweets as expected, if at all. 6 | 7 | ## The scraper 8 | 9 | This is a simple library to scrape Nitter instances for tweets. It can: 10 | 11 | - search and scrape tweets with a certain term 12 | 13 | - search and scrape tweets with a certain hashtag 14 | 15 | - scrape tweets from a user profile 16 | 17 | - get profile information of a user, such as display name, username, number of tweets, profile picture ... 18 | 19 | If the instance to use is not provided to the scraper, it will use a random public instance. 
If you can, please host your own instance in order to avoid overloading the public ones and to help keep Nitter alive for everyone. You can read more about that here: https://github.com/zedeus/nitter#installation.
20 | 
21 | ---
22 | 
23 | ## Installation
24 | 
25 | ```
26 | pip install ntscraper
27 | ```
28 | 
29 | ## How to use
30 | 
31 | First, initialize the library:
32 | 
33 | ```python
34 | from ntscraper import Nitter
35 | 
36 | scraper = Nitter(log_level=1, skip_instance_check=False)
37 | ```
38 | The valid logging levels are:
39 | - None = no logs
40 | - 0 = only warning and error logs
41 | - 1 = previous + informational logs (default)
42 | 
43 | The `skip_instance_check` parameter is used to skip the health check of the Nitter instances altogether during the execution of the script. If you use your own instance or trust the instance you are relying on, you can set it to `True`; otherwise it's better to leave it set to `False`.
44 | 
45 | Then, choose the appropriate function for what you want to do from the following.
46 | 
47 | ### Scrape tweets
48 | 
49 | ```python
50 | github_hash_tweets = scraper.get_tweets("github", mode='hashtag')
51 | 
52 | bezos_tweets = scraper.get_tweets("JeffBezos", mode='user')
53 | ```
54 | 
55 | Parameters:
56 | - term: search term
57 | - mode: modality to scrape the tweets. Default is 'term' which will look for tweets containing the search term. Other modes are 'hashtag' to search for a hashtag and 'user' to scrape tweets from a user profile
58 | - number: number of tweets to scrape. Default is -1 (no limit).
59 | - since: date to start scraping from, formatted as YYYY-MM-DD. Default is None
60 | - until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None
61 | - near: location to search tweets from. Default is None (anywhere)
62 | - language: language of the tweets to search. Default is None (any language). The language must be specified as a 2-letter ISO 639-1 code (e.g. 'en' for English, 'es' for Spanish, 'fr' for French ...)
63 | - to: user to which the tweets are directed. Default is None (any user). For example, if you want to search for tweets directed to @github, you would set this parameter to 'github'
64 | - replies: whether to include replies in the search. If 'filters' or 'exclude' are set, this is overridden. Default is False
65 | - filters: list of filters to apply to the search. Default is None. Valid filters are: 'nativeretweets', 'media', 'videos', 'news', 'verified', 'native_video', 'replies', 'links', 'images', 'safe', 'quote', 'pro_video'
66 | - exclude: list of filters to exclude from the search. Default is None. Valid filters are the same as above
67 | - max_retries: max retries to scrape a page. Default is 5
68 | - instance: Nitter instance to use. Default is None and will be chosen at random
69 | 
70 | Returns a dictionary with tweets and threads for the term.
71 | 
72 | #### Multiprocessing
73 | 
74 | You can also scrape multiple terms at once using multiprocessing:
75 | 
76 | ```python
77 | terms = ["github", "bezos", "musk"]
78 | 
79 | results = scraper.get_tweets(terms, mode='term')
80 | ```
81 | 
82 | Each term will be scraped in a different process. The result will be a list of dictionaries, one for each term.
83 | 
84 | The multiprocessing code needs to run in an `if __name__ == "__main__"` block to avoid errors, as shown in the sketch below. With multiprocessing, only full logging is supported. Also, the number of processes is limited to the number of available cores on your machine.
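
A minimal sketch of such a guarded script (the search terms and the `number` limit are only illustrative):

```python
from ntscraper import Nitter

def main():
    scraper = Nitter(log_level=1)
    terms = ["github", "bezos", "musk"]
    # Each term is scraped in its own process; the result is a list of
    # dictionaries, one per term, in the same order as the input list.
    results = scraper.get_tweets(terms, mode='term', number=50)
    for term, result in zip(terms, results):
        print(f"{term}: {len(result['tweets'])} tweets, {len(result['threads'])} threads")

if __name__ == "__main__":
    main()
```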
85 | 
86 | NOTE: using multiprocessing on public instances is highly discouraged since it puts too much load on the servers and could potentially also get you rate limited. Please only use it on your local instance.
87 | 
88 | ### Get single tweet
89 | 
90 | ```python
91 | tweet = scraper.get_tweet_by_id("x", "1826317783430303888")
92 | ```
93 | 
94 | Parameters:
95 | - username: username of the tweet's author
96 | - tweet_id: ID of the tweet
97 | - instance: Nitter instance to use. Default is None
98 | - max_retries: max retries to scrape a page. Default is 5
99 | 
100 | Returns a dictionary with the tweet's content.
101 | 
102 | ### Get profile information
103 | 
104 | ```python
105 | bezos_information = scraper.get_profile_info("JeffBezos")
106 | ```
107 | 
108 | Parameters:
109 | - username: username of the page to scrape
110 | - max_retries: max retries to scrape a page. Default is 5
111 | - instance: Nitter instance to use. Default is None
112 | - mode: mode of fetching profile info. 'simple' for basic info, 'detail' for detailed info including following and followers lists. Default is 'simple'
113 | 
114 | Returns a dictionary of the profile's information.
115 | 
116 | #### Multiprocessing
117 | 
118 | As with term scraping, you can also get info from multiple profiles at once using multiprocessing:
119 | 
120 | ```python
121 | usernames = ["x", "github"]
122 | 
123 | results = scraper.get_profile_info(usernames)
124 | ```
125 | 
126 | Each user will be scraped in a different process. The result will be a list of dictionaries, one for each user.
127 | 
128 | The multiprocessing code needs to run in an `if __name__ == "__main__"` block to avoid errors. With multiprocessing, only full logging is supported. Also, the number of processes is limited to the number of available cores on your machine.
129 | 
130 | NOTE: using multiprocessing on public instances is highly discouraged since it puts too much load on the servers and could potentially also get you rate limited. Please only use it on your local instance.
131 | 
132 | ### Get random Nitter instance
133 | 
134 | ```python
135 | random_instance = scraper.get_random_instance()
136 | ```
137 | 
138 | Returns a random Nitter instance.
139 | 
140 | ## Note
141 | 
142 | Due to recent changes on Twitter's side, some Nitter instances may not work properly even if they are marked as "working" on Nitter's wiki. If you have trouble scraping with a certain instance, try changing it and check if the problem persists.
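
If you host your own Nitter instance, you can also pin the scraper to it and skip the instance check entirely. A small sketch, assuming a local instance at `http://localhost:8080` (replace with your own URL, and note the example term and limits are only illustrative); with `skip_instance_check=True`, the instance must be passed explicitly to each call:

```python
from ntscraper import Nitter

LOCAL_INSTANCE = "http://localhost:8080"  # assumed local Nitter instance

# Skip the startup health check and rely only on the local instance
scraper = Nitter(instances=LOCAL_INSTANCE, skip_instance_check=True)

# With the check skipped, pass the instance explicitly on each request
tweets = scraper.get_tweets("github", mode='hashtag', number=20, instance=LOCAL_INSTANCE)
profile = scraper.get_profile_info("github", instance=LOCAL_INSTANCE)
```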
143 | 144 | ## To do list 145 | 146 | - [ ] Add scraping of individual posts with comments -------------------------------------------------------------------------------- /ntscraper/__init__.py: -------------------------------------------------------------------------------- 1 | from .nitter import Nitter -------------------------------------------------------------------------------- /ntscraper/nitter.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import random 4 | from urllib.parse import unquote, urlparse 5 | from time import sleep 6 | from base64 import b64decode 7 | from random import uniform 8 | from re import match, sub 9 | from datetime import datetime 10 | import logging 11 | from logging.handlers import QueueHandler 12 | from multiprocessing import Pool, Queue, cpu_count 13 | from sys import stdout 14 | from tqdm import tqdm 15 | 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format="%(asctime)s - %(message)s", 19 | datefmt="%d-%b-%y %H:%M:%S", 20 | handlers=[logging.StreamHandler(stdout)], 21 | ) 22 | 23 | log_queue = Queue() 24 | log_handler = QueueHandler(log_queue) 25 | root_logger = logging.getLogger() 26 | root_logger.addHandler(log_handler) 27 | 28 | valid_filters = [ 29 | "nativeretweets", 30 | "media", 31 | "videos", 32 | "news", 33 | "verified", 34 | "native_video", 35 | "replies", 36 | "links", 37 | "images", 38 | "safe", 39 | "quote", 40 | "pro_video", 41 | ] 42 | 43 | 44 | class Nitter: 45 | def __init__(self, instances=None, log_level=1, skip_instance_check=False): 46 | """ 47 | Nitter scraper 48 | :param instances: accepts a list of instances or a single instance in this format: "https://{host}:{port}", e.g. "http://localhost:8080 49 | :param log_level: logging level 50 | :param skip_instance_check: True if the health check of all instances and the instance change during execution should be skipped 51 | """ 52 | if instances: 53 | # check instances type is list or str 54 | if isinstance(instances, list): 55 | self.instances = instances 56 | elif isinstance(instances, str): 57 | self.instances = [instances] 58 | else: 59 | raise ValueError("Instances type not supported, only list and str are supported") 60 | else: 61 | self.instances = self._get_instances() 62 | if self.instances is None: 63 | raise ValueError("Could not fetch instances") 64 | self.working_instances = [] 65 | self.skip_instance_check = skip_instance_check 66 | if skip_instance_check: 67 | self.working_instances = self.instances 68 | else: 69 | self._test_all_instances("/x", no_print=True) 70 | if log_level == 0: 71 | log_level = logging.WARNING 72 | elif log_level == 1: 73 | log_level = logging.INFO 74 | elif log_level: 75 | raise ValueError("Invalid log level") 76 | 77 | logger = logging.getLogger() 78 | logger.setLevel(log_level) 79 | 80 | self.retry_count = 0 81 | self.cooldown_count = 0 82 | self.session_reset = False 83 | self.instance = "" 84 | self.r = None 85 | 86 | def _initialize_session(self, instance): 87 | """ 88 | Initialize the requests session 89 | """ 90 | if instance is None: 91 | if self.skip_instance_check: 92 | raise ValueError("No instance specified and instance check skipped") 93 | self.instance = self.get_random_instance() 94 | logging.info( 95 | f"No instance specified, using random instance {self.instance}" 96 | ) 97 | else: 98 | self.instance = instance 99 | self.r = requests.Session() 100 | self.r.headers.update( 101 | { 102 | "User-Agent": "Mozilla/5.0 (Windows NT 
10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0", 103 | "Host": self.instance.split("://")[1], 104 | } 105 | ) 106 | 107 | def _is_instance_encrypted(self): 108 | """ 109 | Check if the current instance uses encrypted media 110 | 111 | :return: True if encrypted, False otherwise 112 | """ 113 | soup = self._get_page("/x") 114 | 115 | if soup is None: 116 | raise ValueError("Invalid instance") 117 | 118 | if ( 119 | soup.find("a", class_="profile-card-avatar").find("img") 120 | and "/enc/" 121 | in soup.find("a", class_="profile-card-avatar").find("img")["src"] 122 | ): 123 | return True 124 | else: 125 | return False 126 | 127 | def _get_instances(self): 128 | """ 129 | Fetch the list of clear web Nitter instances. 130 | 131 | :return: list of Nitter instances, or None if lookup failed 132 | """ 133 | r = requests.get("https://raw.githubusercontent.com/libredirect/instances/main/data.json") 134 | if r.ok: 135 | return r.json()["nitter"]["clearnet"] 136 | else: 137 | return None 138 | 139 | def _test_all_instances(self, endpoint, no_print=False): 140 | """ 141 | Test all Nitter instances when a high number of retries is detected 142 | 143 | :param endpoint: endpoint to use 144 | :param no_print: True if no output should be printed 145 | """ 146 | if not no_print: 147 | print("High number of retries detected. Testing all instances...") 148 | working_instances = [] 149 | 150 | for instance in tqdm(self.instances, desc="Testing instances"): 151 | self._initialize_session(instance) 152 | req_session = requests.Session() 153 | req_session.headers.update( 154 | { 155 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0" 156 | } 157 | ) 158 | try: 159 | r = req_session.get( 160 | instance + endpoint, 161 | cookies={"hlsPlayback": "on"}, 162 | timeout=10, 163 | ) 164 | if r.ok: 165 | soup = BeautifulSoup(r.text, "lxml") 166 | if soup is not None and len( 167 | soup.find_all("div", class_="timeline-item") 168 | ): 169 | working_instances.append(instance) 170 | except: 171 | pass 172 | if not no_print: 173 | print("New working instances:", ", ".join(working_instances)) 174 | self.working_instances = working_instances 175 | 176 | def _get_new_instance(self, message): 177 | instance = self.get_random_instance() 178 | logging.warning(f"{message}. Trying {instance}") 179 | return instance 180 | 181 | def _check_error_page(self, soup): 182 | """ 183 | Check if the page contains an error. 
If so, print the error and return None 184 | 185 | :param soup: page to check 186 | :return: None if error is found, soup otherwise 187 | """ 188 | if not soup.find( 189 | lambda tag: tag.name == "div" 190 | and ( 191 | tag.get("class") == ["timeline-item"] 192 | or tag.get("class") == ["timeline-item", "thread"] 193 | ) 194 | ): 195 | if soup.find("div", class_="error-panel"): 196 | message = ( 197 | f"Fetching error: " 198 | + soup.find("div", class_="error-panel").find("span").text.strip() 199 | ) 200 | else: 201 | if soup.find("div", class_="timeline-header timeline-protected"): 202 | message = "Account is protected" 203 | else: 204 | message = f"Empty page on {self.instance}" 205 | logging.warning(message) 206 | soup = None 207 | return soup 208 | 209 | def _get_page(self, endpoint, max_retries=5): 210 | """ 211 | Download page from Nitter instance 212 | 213 | :param endpoint: endpoint to use 214 | :param max_retries: max number of retries, default 5 215 | :return: page content, or None if max retries reached 216 | """ 217 | keep_trying = True 218 | soup = None 219 | while keep_trying and (self.retry_count < max_retries): 220 | try: 221 | r = self.r.get( 222 | self.instance + endpoint, 223 | cookies={"hlsPlayback": "on", "infiniteScroll": ""}, 224 | timeout=10, 225 | ) 226 | except: 227 | if self.retry_count == max_retries // 2: 228 | if not self.skip_instance_check: 229 | self._test_all_instances(endpoint) 230 | if not self.working_instances: 231 | logging.warning( 232 | "All instances are unreachable. Check your request and try again." 233 | ) 234 | return None 235 | if not self.skip_instance_check: 236 | self._initialize_session( 237 | instance=self._get_new_instance(f"{self.instance} unreachable") 238 | ) 239 | self.retry_count += 1 240 | self.cooldown_count = 0 241 | self.session_reset = True 242 | sleep(1) 243 | continue 244 | soup = BeautifulSoup(r.text, "lxml") 245 | if r.ok: 246 | self.session_reset = False 247 | soup = self._check_error_page(soup) 248 | keep_trying = False 249 | else: 250 | soup = self._check_error_page(soup) 251 | if soup is None: 252 | keep_trying = False 253 | else: 254 | if self.retry_count == max_retries // 2: 255 | if not self.skip_instance_check: 256 | self._test_all_instances(endpoint) 257 | if not self.working_instances: 258 | logging.warning( 259 | "All instances are unreachable. Check your request and try again." 260 | ) 261 | soup = None 262 | keep_trying = False 263 | else: 264 | self.retry_count += 1 265 | else: 266 | if "cursor" in endpoint: 267 | if not self.session_reset: 268 | logging.warning( 269 | "Cooldown reached, trying again in 20 seconds" 270 | ) 271 | self.cooldown_count += 1 272 | sleep(20) 273 | if self.cooldown_count >= 5 and not self.session_reset: 274 | if not self.skip_instance_check: 275 | self._initialize_session() 276 | else: 277 | self._initialize_session(self.instance) 278 | self.session_reset = True 279 | self.cooldown_count = 0 280 | elif self.session_reset: 281 | if not self.skip_instance_check: 282 | self._initialize_session( 283 | self._get_new_instance( 284 | f"Error fetching {self.instance}" 285 | ) 286 | ) 287 | else: 288 | self.cooldown_count = 0 289 | if not self.skip_instance_check: 290 | self._initialize_session( 291 | self._get_new_instance( 292 | f"Error fetching {self.instance}" 293 | ) 294 | ) 295 | self.retry_count += 1 296 | sleep(2) 297 | 298 | if self.retry_count >= max_retries: 299 | logging.warning("Max retries reached. 
Check your request and try again.") 300 | soup = None 301 | self.retry_count = 0 302 | 303 | return soup 304 | 305 | def _get_quoted_media(self, quoted_tweet, is_encrypted): 306 | """ 307 | Extract media from a quoted tweet 308 | 309 | :param quoted_tweet: tweet to extract media from 310 | :param is_encrypted: True if instance uses encrypted media 311 | :return: lists of images, videos and gifs, or empty lists if no media is found 312 | """ 313 | quoted_pictures, quoted_videos, quoted_gifs = [], [], [] 314 | if quoted_tweet.find("div", class_="attachments"): 315 | if is_encrypted: 316 | quoted_pictures = [ 317 | "https://pbs.twimg.com/" 318 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 319 | .decode("utf-8") 320 | .split("?")[0] 321 | for img in quoted_tweet.find("div", class_="attachments").find_all( 322 | "img" 323 | ) 324 | ] 325 | quoted_videos = [ 326 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 327 | "utf-8" 328 | ) 329 | if "data-url" in video.attrs 330 | else video.find("source")["src"] 331 | for video in quoted_tweet.find( 332 | "div", class_="attachments" 333 | ).find_all("video", class_="") 334 | ] 335 | quoted_gifs = [ 336 | "https://" 337 | + b64decode( 338 | gif.source["src"].split("/")[-1].encode("utf-8") 339 | ).decode("utf-8") 340 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 341 | "video", class_="gif" 342 | ) 343 | ] 344 | else: 345 | quoted_pictures = [ 346 | "https://pbs.twimg.com" 347 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 348 | for img in quoted_tweet.find("div", class_="attachments").find_all( 349 | "img" 350 | ) 351 | ] 352 | quoted_videos = [ 353 | unquote("https" + video["data-url"].split("https")[1]) 354 | if "data-url" in video.attrs 355 | else unquote(video.find("source")["src"]) 356 | for video in quoted_tweet.find( 357 | "div", class_="attachments" 358 | ).find_all("video", class_="") 359 | ] 360 | quoted_gifs = [ 361 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 362 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 363 | "video", class_="gif" 364 | ) 365 | ] 366 | return quoted_pictures, quoted_videos, quoted_gifs 367 | 368 | def _get_tweet_media(self, tweet, is_encrypted): 369 | """ 370 | Extract media from a tweet 371 | 372 | :param tweet: tweet to extract media from 373 | :param is_encrypted: True if instance uses encrypted media 374 | :return: lists of images, videos and gifs, or empty lists if no media is found 375 | """ 376 | pictures, videos, gifs = [], [], [] 377 | if tweet.find("div", class_="tweet-body").find( 378 | "div", class_="attachments", recursive=False 379 | ): 380 | if is_encrypted: 381 | pictures = [ 382 | "https://pbs.twimg.com/" 383 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 384 | .decode("utf-8") 385 | .split("?")[0] 386 | for img in tweet.find("div", class_="tweet-body") 387 | .find("div", class_="attachments", recursive=False) 388 | .find_all("img") 389 | ] 390 | videos = [ 391 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 392 | "utf-8" 393 | ) 394 | if "data-url" in video.attrs 395 | else video.find("source")["src"] 396 | for video in tweet.find("div", class_="tweet-body") 397 | .find("div", class_="attachments", recursive=False) 398 | .find_all("video", class_="") 399 | ] 400 | gifs = [ 401 | "https://" 402 | + b64decode( 403 | gif.source["src"].split("/")[-1].encode("utf-8") 404 | ).decode("utf-8") 405 | for gif in tweet.find("div", class_="tweet-body") 406 | .find("div", 
class_="attachments", recursive=False) 407 | .find_all("video", class_="gif") 408 | ] 409 | else: 410 | pictures = [ 411 | "https://pbs.twimg.com" 412 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 413 | for img in tweet.find("div", class_="tweet-body") 414 | .find("div", class_="attachments", recursive=False) 415 | .find_all("img") 416 | ] 417 | videos = [ 418 | unquote("https" + video["data-url"].split("https")[1]) 419 | if "data-url" in video.attrs 420 | else video.find("source")["src"] 421 | for video in tweet.find("div", class_="tweet-body") 422 | .find("div", class_="attachments", recursive=False) 423 | .find_all("video", class_="") 424 | ] 425 | gifs = [ 426 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 427 | for gif in tweet.find("div", class_="tweet-body") 428 | .find("div", class_="attachments", recursive=False) 429 | .find_all("video", class_="gif") 430 | ] 431 | return pictures, videos, gifs 432 | 433 | def _get_tweet_stats(self, tweet): 434 | """ 435 | Extract stats from a tweet 436 | 437 | :param tweet: tweet to extract stats from 438 | :return: dictionary of stats. If a stat is not found, it is set to 0 439 | """ 440 | return { 441 | "comments": int( 442 | tweet.find_all("span", class_="tweet-stat")[0] 443 | .find("div") 444 | .text.strip() 445 | .replace(",", "") 446 | or 0 447 | ), 448 | "retweets": int( 449 | tweet.find_all("span", class_="tweet-stat")[1] 450 | .find("div") 451 | .text.strip() 452 | .replace(",", "") 453 | or 0 454 | ), 455 | "quotes": int( 456 | tweet.find_all("span", class_="tweet-stat")[2] 457 | .find("div") 458 | .text.strip() 459 | .replace(",", "") 460 | or 0 461 | ), 462 | "likes": int( 463 | tweet.find_all("span", class_="tweet-stat")[3] 464 | .find("div") 465 | .text.strip() 466 | .replace(",", "") 467 | or 0 468 | ), 469 | } 470 | 471 | def _get_user(self, tweet, is_encrypted): 472 | """ 473 | Extract user from a tweet 474 | 475 | :param tweet: tweet to extract user from 476 | :param is_encrypted: True if instance uses encrypted media 477 | :return: dictionary of user 478 | """ 479 | avatar = "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png" # Default avatar 480 | profile_id = "unknown" # Default profile ID 481 | 482 | if is_encrypted: 483 | try: 484 | avatar = "https://pbs.twimg.com/" + b64decode( 485 | tweet.find("img", class_="avatar")["src"] 486 | .split("/")[-1] 487 | .encode("utf-8") 488 | ).decode("utf-8") 489 | except: 490 | avatar = "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png" # Fallback avatar 491 | 492 | else: 493 | avatar_tag = tweet.find("img", class_="avatar") 494 | if avatar_tag and avatar_tag.has_attr("src"): 495 | avatar = unquote(avatar_tag["src"]) # Successfully getting avatar 496 | else: 497 | avatar = "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png" # Fallback avatar 498 | 499 | # Extract profile_id directly from the avatar URL if available 500 | if "profile_images" in avatar: 501 | profile_id = avatar.split("/profile_images/")[1].split("/")[0] 502 | 503 | return { 504 | "name": tweet.find("a", class_="fullname").text.strip(), 505 | "username": tweet.find("a", class_="username").text.strip(), 506 | "profile_id": profile_id, 507 | "avatar": avatar, 508 | } 509 | 510 | def _get_tweet_date(self, tweet): 511 | """ 512 | Extract date from a tweet 513 | 514 | :param tweet: tweet to extract date from 515 | :return: date of tweet 516 | """ 517 | return ( 518 | tweet.find("span", class_="tweet-date") 519 | 
.find("a")["title"] 520 | .split("/")[-1] 521 | .split("#")[0] 522 | if tweet.find("span", class_="tweet-date") 523 | else "" 524 | ) 525 | 526 | def _get_tweet_text(self, tweet): 527 | """ 528 | Extract text from a tweet 529 | 530 | :param tweet: tweet to extract text from 531 | :return: text of tweet 532 | """ 533 | return ( 534 | tweet.find("div", class_="tweet-content media-body") 535 | .text.strip() 536 | .replace("\n", " ") 537 | if tweet.find("div", class_="tweet-content media-body") 538 | else tweet.find("div", class_="quote-text").text.strip().replace("\n", " ") 539 | if tweet.find("div", class_="quote-text") 540 | else "" 541 | ) 542 | 543 | def _get_tweet_link(self, tweet): 544 | """ 545 | Extract link from a tweet 546 | 547 | :param tweet: tweet to extract link from 548 | :return: link of tweet 549 | """ 550 | tweet_date = tweet.find("span", class_="tweet-date") 551 | return "https://twitter.com" + tweet_date.find("a")["href"] if tweet_date else "" 552 | 553 | def _get_external_link(self, tweet): 554 | """ 555 | Extract external link from a tweet 556 | 557 | :param tweet: tweet to extract external link from 558 | :return: external link of tweet 559 | """ 560 | return ( 561 | tweet.find("a", class_="card-container")["href"] 562 | if tweet.find("a", class_="card-container") 563 | else "" 564 | ) 565 | 566 | def _get_replied_to(self, tweet): 567 | """ 568 | Extract the users a tweet is replying to. If the tweet is not a reply, 569 | return an empty list. 570 | 571 | :param tweet: tweet to extract replies from 572 | :return: list of users the tweet is replying to 573 | """ 574 | return ( 575 | [ 576 | user.text.strip() 577 | for user in tweet.find("div", class_="replying-to").find_all("a") 578 | ] 579 | if tweet.find("div", class_="replying-to") 580 | else [] 581 | ) 582 | 583 | def _extract_tweet(self, tweet, is_encrypted): 584 | """ 585 | Extract content from a tweet 586 | 587 | :param tweet: tweet to extract content from 588 | :param is_encrypted: True if instance uses encrypted media 589 | :return: dictionary of content for the tweet 590 | """ 591 | # Replace link text with link 592 | if tweet.find_all("a"): 593 | for link in tweet.find_all("a"): 594 | if "https" in link["href"]: 595 | link.replace_with(link["href"]) 596 | 597 | # Extract the quoted tweet 598 | quoted_tweet = ( 599 | tweet.find("div", class_="quote") 600 | if tweet.find("div", class_="quote") 601 | else None 602 | ) 603 | 604 | # Extract media from the quoted tweet 605 | if quoted_tweet: 606 | deleted = False 607 | if quoted_tweet["class"] == ["quote", "unavailable"]: 608 | deleted = True 609 | ( 610 | quoted_pictures, 611 | quoted_videos, 612 | quoted_gifs, 613 | ) = self._get_quoted_media(quoted_tweet, is_encrypted) 614 | 615 | # Extract media from the tweet 616 | pictures, videos, gifs = self._get_tweet_media(tweet, is_encrypted) 617 | 618 | # Extract the tweet id 619 | link = self._get_tweet_link(tweet) 620 | id = urlparse(link).path.rsplit("/", 1)[-1] 621 | 622 | return { 623 | "id": id, 624 | "link": link, 625 | "text": self._get_tweet_text(tweet), 626 | "user": self._get_user(tweet, is_encrypted), 627 | "date": self._get_tweet_date(tweet), 628 | "is-retweet": tweet.find("div", class_="retweet-header") is not None, 629 | "is-pinned": tweet.find("div", class_="pinned") is not None, 630 | "external-link": self._get_external_link(tweet), 631 | "replying-to": self._get_replied_to(tweet), 632 | "quoted-post": { 633 | "link": self._get_tweet_link(quoted_tweet) if not deleted else "", 634 | "text": 
self._get_tweet_text(quoted_tweet) if not deleted else "", 635 | "user": self._get_user(quoted_tweet, is_encrypted) 636 | if not deleted 637 | else {}, 638 | "date": self._get_tweet_date(quoted_tweet) if not deleted else "", 639 | "pictures": quoted_pictures, 640 | "videos": quoted_videos, 641 | "gifs": quoted_gifs, 642 | } 643 | if quoted_tweet 644 | else {}, 645 | "stats": self._get_tweet_stats(tweet), 646 | "pictures": pictures, 647 | "videos": videos, 648 | "gifs": gifs, 649 | } 650 | 651 | def _check_date_validity(self, date): 652 | """ 653 | Check if a date is valid 654 | 655 | :param date: date to check 656 | :return: True if date is valid 657 | """ 658 | to_return = True 659 | if not match(r"^\d{4}-\d{2}-\d{2}$", date): 660 | to_return = False 661 | try: 662 | year, month, day = [int(number) for number in date.split("-")] 663 | datetime(year=year, month=month, day=day) 664 | except: 665 | to_return = False 666 | 667 | if not ( 668 | datetime(year=2006, month=3, day=21) 669 | < datetime(year=year, month=month, day=day) 670 | <= datetime.now() 671 | ): 672 | to_return = False 673 | 674 | return to_return 675 | 676 | def _search( 677 | self, 678 | term, 679 | mode, 680 | number, 681 | since, 682 | until, 683 | near, 684 | language, 685 | to, 686 | replies, 687 | filters, 688 | exclude, 689 | max_retries, 690 | instance, 691 | ): 692 | """ 693 | Scrape the specified search terms from Nitter 694 | 695 | :param term: term to seach for 696 | :param mode: search mode. 697 | :param number: number of tweets to scrape. 698 | :param since: date to start scraping from. 699 | :param until: date to stop scraping at. 700 | :param near: location to search near. 701 | :param language: language of the tweets. 702 | :param to: user to which the tweets are directed. 703 | :param replies: True if both tweets and replies are needed. 704 | :param filters: list of filters to apply. 705 | :param exclude: list of filters to exclude. 706 | :param max_retries: max retries to scrape a page. 707 | :param instance: Nitter instance to use. 708 | :return: dictionary of tweets and threads for the term. 709 | """ 710 | tweets = {"tweets": [], "threads": []} 711 | if mode == "hashtag": 712 | endpoint = "/search?f=tweets&q=%23" + term 713 | elif mode == "term": 714 | endpoint = "/search?f=tweets&q=" + term 715 | elif mode == "user": 716 | if since or until or filters or exclude or near: 717 | endpoint = f"/{term}/search?f=tweets&q=" 718 | else: 719 | endpoint = f"/{term}" 720 | if replies and not filters: 721 | endpoint += "/with_replies" 722 | else: 723 | raise ValueError("Invalid mode. Use 'term', 'hashtag', or 'user'.") 724 | 725 | self._initialize_session(instance) 726 | 727 | if language: 728 | endpoint += f"+lang%3A{language}" 729 | 730 | if to: 731 | endpoint += f"+to%3A{to}" 732 | 733 | if since: 734 | if self._check_date_validity(since): 735 | endpoint += f"&since={since}" 736 | else: 737 | raise ValueError( 738 | "Invalid 'since' date. Use the YYYY-MM-DD format and make sure the date is valid." 739 | ) 740 | 741 | if until: 742 | if self._check_date_validity(until): 743 | endpoint += f"&until={until}" 744 | else: 745 | raise ValueError( 746 | "Invalid 'until' date. Use the YYYY-MM-DD format and make sure the date is valid." 747 | ) 748 | 749 | if near: 750 | endpoint += f"&near={near}" 751 | 752 | if filters: 753 | for f in filters: 754 | if f not in valid_filters: 755 | raise ValueError( 756 | f"Invalid filter '{f}'. 
Valid filters are: {', '.join(valid_filters)}" 757 | ) 758 | endpoint += f"&f-{f}=on" 759 | 760 | if exclude: 761 | for e in exclude: 762 | if e not in valid_filters: 763 | raise ValueError( 764 | f"Invalid exclusion filter '{e}'. Valid filters are: {', '.join(valid_filters)}" 765 | ) 766 | endpoint += f"&e-{e}=on" 767 | 768 | if mode != "user": 769 | if "?" in endpoint: 770 | endpoint += "&scroll=false" 771 | else: 772 | endpoint += "?scroll=false" 773 | 774 | soup = self._get_page(endpoint, max_retries) 775 | 776 | if soup is None: 777 | return tweets 778 | 779 | is_encrypted = self._is_instance_encrypted() 780 | 781 | already_scraped = set() 782 | 783 | number = float("inf") if number == -1 else number 784 | keep_scraping = True 785 | while keep_scraping: 786 | thread = [] 787 | 788 | for tweet in soup.find_all("div", class_="timeline-item"): 789 | if len(tweet["class"]) == 1: 790 | to_append = self._extract_tweet(tweet, is_encrypted) 791 | # Extract tweets 792 | if len(tweets["tweets"]) + len(tweets["threads"]) < number: 793 | if self._get_tweet_link(tweet) not in already_scraped: 794 | tweets["tweets"].append(to_append) 795 | already_scraped.add(self._get_tweet_link(tweet)) 796 | else: 797 | keep_scraping = False 798 | break 799 | else: 800 | if "thread" in tweet["class"]: 801 | to_append = self._extract_tweet(tweet, is_encrypted) 802 | # Extract threads 803 | if self._get_tweet_link(tweet) not in already_scraped: 804 | thread.append(to_append) 805 | already_scraped.add(self._get_tweet_link(tweet)) 806 | 807 | if len(tweet["class"]) == 3: 808 | tweets["threads"].append(thread) 809 | thread = [] 810 | 811 | logging.info( 812 | f"Current stats for {term}: {len(tweets['tweets'])} tweets, {len(tweets['threads'])} threads..." 813 | ) 814 | if ( 815 | not (since and until) 816 | and not (since) 817 | and len(tweets["tweets"]) + len(tweets["threads"]) >= number 818 | ): 819 | keep_scraping = False 820 | else: 821 | sleep(uniform(1, 2)) 822 | 823 | # Go to the next page 824 | show_more_buttons = soup.find_all("div", class_="show-more") 825 | if soup.find_all("div", class_="show-more"): 826 | if mode == "user": 827 | if since or until: 828 | next_page = ( 829 | f"/{term}/search?" 830 | + show_more_buttons[-1].find("a")["href"].split("?")[-1] 831 | ) 832 | else: 833 | next_page = ( 834 | f"/{term}?" 835 | + show_more_buttons[-1].find("a")["href"].split("?")[-1] 836 | ) 837 | else: 838 | next_page = "/search" + show_more_buttons[-1].find("a")["href"] 839 | soup = self._get_page(next_page, max_retries) 840 | if soup is None: 841 | keep_scraping = False 842 | else: 843 | keep_scraping = False 844 | return tweets 845 | 846 | def _search_dispatch(self, args): 847 | return self._search(*args) 848 | 849 | def get_random_instance(self): 850 | """ 851 | Get a random Nitter instance 852 | 853 | :return: URL of random Nitter instance 854 | """ 855 | return random.choice(self.working_instances) 856 | 857 | def get_tweet_by_id(self, username, tweet_id, instance=None, max_retries=5): 858 | """ 859 | Fetch a tweet by its ID. 860 | 861 | :param username: The username of the tweet. 862 | :param tweet_id: The ID of the tweet to fetch. 863 | :param instance: The specific Nitter instance to use. 864 | :param max_retries: Max retries to scrape a page. Default is 5. 865 | :return: Dictionary of the tweet content. 
866 | """ 867 | if instance: 868 | self._initialize_session(instance) 869 | else: 870 | if not self.working_instances: 871 | raise ValueError("No working instances available.") 872 | self.instance = self.get_random_instance() 873 | 874 | endpoint = f"/{username}/status/{tweet_id}" 875 | soup = self._get_page(endpoint, max_retries) 876 | 877 | if soup is None: 878 | return None 879 | 880 | is_encrypted = self._is_instance_encrypted() 881 | 882 | tweet = soup.find("div", class_="timeline-item") 883 | if tweet: 884 | return self._extract_tweet(tweet, is_encrypted) 885 | else: 886 | logging.warning(f"Tweet with ID {tweet_id} not found.") 887 | return None 888 | 889 | def get_tweets( 890 | self, 891 | terms, 892 | mode="term", 893 | number=-1, 894 | since=None, 895 | until=None, 896 | near=None, 897 | language=None, 898 | to=None, 899 | replies=False, 900 | filters=None, 901 | exclude=None, 902 | max_retries=5, 903 | instance=None, 904 | ): 905 | """ 906 | Scrape the specified term from Nitter 907 | 908 | :param terms: string/s to search for 909 | :param mode: search mode. Default is 'term', can also be 'hashtag' or 'user' 910 | :param number: number of tweets to scrape. Default is -1 (to not set a limit). 911 | :param since: date to start scraping from, formatted as YYYY-MM-DD. Default is None 912 | :param until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None 913 | :param near: near location of the tweets. Default is None (anywhere) 914 | :param language: language of the tweets. Default is None (any language) 915 | :param to: user to which the tweets are directed. Default is None (any user) 916 | :param replies: True if both tweets and replies are needed. If 'filters' or 'exclude' are set, this option will be overridden. Default is False 917 | :param filters: list of filters to apply. Default is None 918 | :param exclude: list of filters to exclude. Default is None 919 | :param max_retries: max retries to scrape a page. Default is 5 920 | :param instance: Nitter instance to use. Default is None 921 | :return: dictionary or array with dictionaries (in case of multiple terms) of the tweets and threads for the provided terms 922 | """ 923 | if type(terms) == str: 924 | term = terms.strip() 925 | 926 | return self._search( 927 | term, 928 | mode, 929 | number, 930 | since, 931 | until, 932 | near, 933 | language, 934 | to, 935 | replies, 936 | filters, 937 | exclude, 938 | max_retries, 939 | instance, 940 | ) 941 | elif len(terms) == 1: 942 | term = terms[0].strip() 943 | 944 | return self._search( 945 | term, 946 | mode, 947 | number, 948 | since, 949 | until, 950 | near, 951 | language, 952 | to, 953 | replies, 954 | filters, 955 | exclude, 956 | max_retries, 957 | instance, 958 | ) 959 | else: 960 | if len(terms) > cpu_count(): 961 | raise ValueError( 962 | f"Too many terms. You can search at most {cpu_count()} terms." 963 | ) 964 | 965 | args = [ 966 | ( 967 | term.strip(), 968 | mode, 969 | number, 970 | since, 971 | until, 972 | near, 973 | language, 974 | to, 975 | replies, 976 | filters, 977 | exclude, 978 | max_retries, 979 | instance, 980 | ) 981 | for term in terms 982 | ] 983 | with Pool(len(terms)) as p: 984 | results = list(p.map(self._search_dispatch, args)) 985 | 986 | return results 987 | 988 | def _profile_info(self, username, max_retries, instance): 989 | """ 990 | Gets the profile information for a user. 991 | 992 | :param username: username of the page to scrape 993 | :param max_retries: max retries to scrape a page. 
Default is 5 994 | :param instance: Nitter instance to use. Default is None 995 | :return: dictionary of the profile's information 996 | """ 997 | self._initialize_session(instance) 998 | username = sub(r"[^A-Za-z0-9_+-:]", "", username) 999 | soup = self._get_page(f"/{username}", max_retries) 1000 | if soup is None: 1001 | return None 1002 | 1003 | is_encrypted = self._is_instance_encrypted() 1004 | # Extract id if the banner exists, no matter if the instance uses base64 or not 1005 | if soup.find("div", class_="profile-banner").find("img") and is_encrypted: 1006 | profile_id = ( 1007 | b64decode( 1008 | soup.find("div", class_="profile-banner") 1009 | .find("img")["src"] 1010 | .split("/enc/")[1] 1011 | .encode("utf-8") 1012 | ) 1013 | .decode("utf-8") 1014 | .split("/profile_banners/")[1] 1015 | .split("/")[0] 1016 | ) 1017 | elif soup.find("div", class_="profile-banner").find("img"): 1018 | profile_id = ( 1019 | unquote(soup.find("div", class_="profile-banner").find("img")["src"]) 1020 | .split("profile_banners/")[1] 1021 | .split("/")[0] 1022 | ) 1023 | else: 1024 | profile_id = "" 1025 | 1026 | # Extract profile image, no matter if the instance uses base64 or not 1027 | if soup.find("a", class_="profile-card-avatar").find("img") and is_encrypted: 1028 | profile_image = "https://" + b64decode( 1029 | soup.find("a", class_="profile-card-avatar") 1030 | .find("img")["src"] 1031 | .split("/enc/")[1] 1032 | .encode("utf-8") 1033 | ).decode("utf-8") 1034 | elif soup.find("a", class_="profile-card-avatar").find("img"): 1035 | profile_image = ( 1036 | "https://" 1037 | + unquote( 1038 | soup.find("a", class_="profile-card-avatar").find("img")["src"] 1039 | ).split("/pic/")[1] 1040 | ) 1041 | else: 1042 | profile_image = "" 1043 | 1044 | icon_container = ( 1045 | soup.find("div", class_="photo-rail-header").find( 1046 | "div", class_="icon-container" 1047 | ) 1048 | if soup.find("div", class_="photo-rail-header") 1049 | else None 1050 | ) 1051 | 1052 | return { 1053 | "image": profile_image, 1054 | "name": soup.find("a", class_="profile-card-fullname").text.strip(), 1055 | "username": soup.find("a", class_="profile-card-username").text.strip(), 1056 | "id": profile_id, 1057 | "bio": soup.find("div", class_="profile-bio").p.text.strip() 1058 | if soup.find("div", class_="profile-bio") 1059 | else "", 1060 | "location": soup.find("div", class_="profile-location") 1061 | .find_all("span")[-1] 1062 | .text.strip() 1063 | if soup.find("div", class_="profile-location") 1064 | else "", 1065 | "website": soup.find("div", class_="profile-website").find("a")["href"] 1066 | if soup.find("div", class_="profile-website") 1067 | else "", 1068 | "joined": soup.find("div", class_="profile-joindate").find("span")["title"], 1069 | "stats": { 1070 | "tweets": int( 1071 | soup.find("ul", class_="profile-statlist") 1072 | .find("li", class_="posts") 1073 | .find_all("span")[1] 1074 | .text.strip() 1075 | .replace(",", "") 1076 | ), 1077 | "following": int( 1078 | soup.find("ul", class_="profile-statlist") 1079 | .find("li", class_="following") 1080 | .find_all("span")[1] 1081 | .text.strip() 1082 | .replace(",", "") 1083 | ), 1084 | "followers": int( 1085 | soup.find("ul", class_="profile-statlist") 1086 | .find("li", class_="followers") 1087 | .find_all("span")[1] 1088 | .text.strip() 1089 | .replace(",", "") 1090 | ), 1091 | "likes": int( 1092 | soup.find("ul", class_="profile-statlist") 1093 | .find("li", class_="likes") 1094 | .find_all("span")[1] 1095 | .text.strip() 1096 | .replace(",", "") 1097 | ), 1098 | 
"media": int( 1099 | icon_container.text.strip().replace(",", "").split(" ")[0] 1100 | if icon_container 1101 | else 0 1102 | ), 1103 | }, 1104 | } 1105 | 1106 | def _search_profile_dispatch(self, args): 1107 | return self.get_profile_info(*args) 1108 | 1109 | def get_profile_info(self, username, max_retries=5, instance=None, mode='simple'): 1110 | """ 1111 | Get profile information for a user or a list of users 1112 | 1113 | :param username: username/s of the page to scrape (str or list of str) 1114 | :param max_retries: max retries to scrape a page. Default is 5 1115 | :param instance: Nitter instance to use. Default is None 1116 | :param mode: Mode of fetching profile info. 'simple' for basic info, 'detail' for detailed info including following and followers lists. Default is 'simple' 1117 | :return: dictionary of the profile's information or list of dictionaries if username is a list. The dictionary contains the following keys: 1118 | - image: URL of the profile image 1119 | - name: Full name of the user 1120 | - username: Username of the user 1121 | - id: Profile ID 1122 | - bio: Bio of the user 1123 | - location: Location of the user 1124 | - website: Website URL of the user 1125 | - joined: Date when the user joined 1126 | - stats: Dictionary containing the following keys: 1127 | - tweets: Number of tweets 1128 | - following: Number of users the profile is following 1129 | - followers: Number of followers 1130 | - likes: Number of likes 1131 | - media: Number of media posts 1132 | - following_list: List of usernames the profile is following (only in 'detail' mode) 1133 | - followers_list: List of usernames following the profile (only in 'detail' mode) 1134 | """ 1135 | 1136 | def _get_follow_list(endpoint): 1137 | follow_list = [] 1138 | cursor = None 1139 | while True: 1140 | url = f"{endpoint}?cursor={cursor}" if cursor else endpoint 1141 | soup = self._get_page(url, max_retries) 1142 | if not soup: 1143 | break 1144 | users = [user.text.strip() for user in soup.find_all("a", class_="username")] 1145 | if not users: 1146 | break 1147 | follow_list.extend(users) 1148 | load_more = soup.find("div", class_="show-more") 1149 | if load_more and load_more.find("a"): 1150 | cursor = load_more.find("a")["href"].split("cursor=")[-1] 1151 | else: 1152 | break 1153 | return follow_list 1154 | 1155 | if isinstance(username, str): 1156 | username = username.strip() 1157 | profile_info = self._profile_info(username, max_retries, instance) 1158 | if profile_info and mode == 'detail': 1159 | profile_info["following_list"] = _get_follow_list(f"/{username}/following") 1160 | profile_info["followers_list"] = _get_follow_list(f"/{username}/followers") 1161 | return profile_info 1162 | elif len(username) == 1: 1163 | username = username[0].strip() 1164 | profile_info = self._profile_info(username, max_retries, instance) 1165 | if profile_info and mode == 'detail': 1166 | profile_info["following_list"] = _get_follow_list(f"/{username}/following") 1167 | profile_info["followers_list"] = _get_follow_list(f"/{username}/followers") 1168 | return profile_info 1169 | else: 1170 | if len(username) > cpu_count(): 1171 | raise ValueError(f"Too many usernames. 
You can use at most {cpu_count()} usernames.") 1172 | 1173 | args = [(user.strip(), max_retries, instance, mode) for user in username] 1174 | with Pool(len(username)) as p: 1175 | results = list(p.map(self._search_profile_dispatch, args)) 1176 | return results 1177 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.1 2 | requests==2.28.1 3 | setuptools==65.5.0 4 | lxml==4.9.2 5 | tqdm==4.66.1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | from os import path 4 | 5 | HERE = path.abspath(path.dirname(__file__)) 6 | 7 | with open(path.join(HERE, 'README.md'), encoding='utf-8') as f: 8 | long_description = f.read() 9 | 10 | setup( 11 | name="ntscraper", 12 | version="0.4.0", 13 | description="Unofficial library to scrape Twitter profiles and posts from Nitter instances", 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | project_urls={ 17 | 'Homepage': 'https://github.com/bocchilorenzo/ntscraper', 18 | 'Source': 'https://github.com/bocchilorenzo/ntscraper', 19 | 'Documentation': 'https://github.com/bocchilorenzo/ntscraper' 20 | }, 21 | keywords=["twitter", "nitter", "scraping"], 22 | author="Lorenzo Bocchi", 23 | author_email="lorenzobocchi99@yahoo.com", 24 | license="MIT", 25 | classifiers=[ 26 | "Intended Audience :: Developers", 27 | "License :: OSI Approved :: MIT License", 28 | "Programming Language :: Python", 29 | "Programming Language :: Python :: 3", 30 | "Programming Language :: Python :: 3.7", 31 | "Programming Language :: Python :: 3.9", 32 | "Programming Language :: Python :: 3.10", 33 | "Operating System :: OS Independent" 34 | ], 35 | packages=["ntscraper"], 36 | include_package_data=True, 37 | install_requires=["requests>=2.28", "beautifulsoup4>=4.11", "lxml>=4.9", "tqdm>=4.66"], 38 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bocchilorenzo/ntscraper/26c87edf8ed31472debe6c929dfa4d64c689102a/tests/__init__.py -------------------------------------------------------------------------------- /tests/instances_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestProfile(unittest.TestCase): 5 | def get_instances(self): 6 | """ 7 | Test retrieval of instances. Should only return updated instances. 
8 | """ 9 | nitter = Nitter() 10 | instances = nitter.__get_instances() 11 | self.assertGreater(len(instances), 0) -------------------------------------------------------------------------------- /tests/profile_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestProfile(unittest.TestCase): 5 | def scrape_profile_info(self): 6 | """ 7 | Test scraping profile info of a username (Twitter, we need a stable username) 8 | """ 9 | nitter = Nitter() 10 | profile = nitter.get_profile_info("Twitter") 11 | self.assertEqual(profile['name'], "Twitter") 12 | self.assertEqual(profile['username'], "@Twitter") 13 | self.assertEqual(profile['bio'], "What's happening?!") 14 | self.assertEqual(profile['location'], 'everywhere') 15 | self.assertEqual(profile['website'], 'https://about.twitter.com/') 16 | self.assertEqual(profile['joined'], '2:35 PM - 20 Feb 2007') 17 | self.assertGreater(profile['stats']['tweets'], 0) 18 | self.assertGreater(profile['stats']['following'], 0) 19 | self.assertGreater(profile['stats']['followers'], 0) 20 | self.assertGreater(profile['stats']['likes'], 0) 21 | self.assertGreater(profile['stats']['media'], 0) 22 | self.assertEqual(profile['image'], 'https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_400x400.jpg') 23 | 24 | def scrape_profile_tweets(self): 25 | """ 26 | Test scraping profile tweets of a username (Twitter, we need a stable username) 27 | """ 28 | nitter = Nitter() 29 | tweets = nitter.get_tweets("Twitter", 'user') 30 | self.assertGreater(len(tweets['tweets']), 0) 31 | 32 | def scrape_profile_tweets_since(self): 33 | """ 34 | Test scraping profile tweets of a username (Twitter, we need a stable username) in a certain time period 35 | """ 36 | nitter = Nitter() 37 | tweets = nitter.get_tweets("Twitter", mode='user', since='2022-12-01', until='2022-12-31', number=1) 38 | self.assertGreater(len(tweets['threads']), 1) -------------------------------------------------------------------------------- /tests/search_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestSearch(unittest.TestCase): 5 | def scrape_term(self): 6 | """ 7 | Test scraping a term 8 | """ 9 | nitter = Nitter() 10 | tweets = nitter.get_tweets("Twitter", 'term') 11 | self.assertGreater(len(tweets['tweets']), 0) 12 | 13 | def test_scrape_user(self): 14 | """ 15 | Test scraping a user 16 | """ 17 | nitter = Nitter() 18 | tweets = nitter.get_tweets("X", mode='user', number=10) 19 | self.assertGreater(len(tweets['tweets']), 0) 20 | 21 | def scrape_hashtag(self): 22 | """ 23 | Test scraping a hashtag 24 | """ 25 | nitter = Nitter() 26 | tweets = nitter.get_tweets("twitter", 'hashtag') 27 | self.assertGreater(len(tweets['tweets']), 0) 28 | 29 | def random_instance(self): 30 | """ 31 | Test whether a random instance is returned 32 | """ 33 | nitter = Nitter() 34 | self.assertIsNotNone(nitter.get_random_instance()) -------------------------------------------------------------------------------- /tests/tweet_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | 5 | class TestGetTweetById(unittest.TestCase): 6 | def test_get_tweet_by_id(self): 7 | """ 8 | Test fetching a tweet by its ID. 
9 |         """
10 |         nitter = Nitter()
11 |         tweet = nitter.get_tweet_by_id("X", "1824507305389592885", instance="https://nt.vern.cc")
12 |         self.assertIsNotNone(tweet, "Tweet should not be None")
13 |         self.assertEqual(tweet['user']['username'], "@X", "Username should match the expected username")
14 |         self.assertEqual(tweet['text'], "since it’s friday, let’s have some fun! comment with a @grok generated pic"
15 |                          " that describes your entire personality 👹")
16 |         self.assertEqual(tweet['date'], "Aug 16, 2024 · 6:03 PM UTC", "Date should match the expected date")
17 |         self.assertGreaterEqual(tweet['stats']['likes'], 3471, "Likes count should be greater than or equal to 3471")
18 |         self.assertGreaterEqual(tweet['stats']['retweets'], 303, "Retweets count should be greater than or equal to 303")
19 |         self.assertGreaterEqual(tweet['stats']['comments'], 2000, "Comments count should be greater than or equal to 2000"
20 |         )
21 | 
22 | 
23 | if __name__ == '__main__':
24 |     unittest.main()
25 | --------------------------------------------------------------------------------