├── tests ├── __init__.py ├── instances_test.py ├── search_test.py └── profile_test.py ├── ntscraper ├── __init__.py └── nitter.py ├── .gitattributes ├── requirements.txt ├── LICENSE.txt ├── setup.py ├── README.md └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ntscraper/__init__.py: -------------------------------------------------------------------------------- 1 | from .nitter import Nitter -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.1 2 | requests==2.28.1 3 | setuptools==65.5.0 4 | lxml==4.9.2 -------------------------------------------------------------------------------- /tests/instances_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestInstances(unittest.TestCase): 5 | def test_get_instances(self): 6 | """ 7 | Test retrieval of instances. Should only return updated instances. 8 | """ 9 | nitter = Nitter() 10 | instances = nitter._Nitter__get_instances() # name-mangled access to the private method 11 | self.assertGreater(len(instances), 0) -------------------------------------------------------------------------------- /tests/search_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestSearch(unittest.TestCase): 5 | def test_scrape_term(self): 6 | """ 7 | Test scraping a term 8 | """ 9 | nitter = Nitter() 10 | tweets = nitter.get_tweets("Twitter", 'term') 11 | self.assertGreater(len(tweets['tweets']), 0) 12 | 13 | def test_scrape_hashtag(self): 14 | """ 15 | Test scraping a hashtag 16 | """ 17 | nitter = Nitter() 18 | tweets = nitter.get_tweets("twitter", 'hashtag') 19 | self.assertGreater(len(tweets['tweets']), 0) 20 | 21 | def test_random_instance(self): 22 | """ 23 | Test whether a random instance is returned 24 | """ 25 | nitter = Nitter() 26 | self.assertIsNotNone(nitter.get_random_instance()) -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lorenzo Bocchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | from os import path 4 | 5 | HERE = path.abspath(path.dirname(__file__)) 6 | 7 | with open(path.join(HERE, 'README.md'), encoding='utf-8') as f: 8 | long_description = f.read() 9 | 10 | setup( 11 | name="ntscraper", 12 | version="0.1.12", 13 | description="Unofficial library to scrape Twitter profiles and posts from Nitter instances", 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | project_urls={ 17 | 'Homepage': 'https://github.com/bocchilorenzo/ntscraper', 18 | 'Source': 'https://github.com/bocchilorenzo/ntscraper', 19 | 'Documentation': 'https://github.com/bocchilorenzo/ntscraper' 20 | }, 21 | keywords=["twitter", "nitter", "scraping"], 22 | author="Lorenzo Bocchi", 23 | author_email="lorenzobocchi99@yahoo.com", 24 | license="MIT", 25 | classifiers=[ 26 | "Intended Audience :: Developers", 27 | "License :: OSI Approved :: MIT License", 28 | "Programming Language :: Python", 29 | "Programming Language :: Python :: 3", 30 | "Programming Language :: Python :: 3.7", 31 | "Programming Language :: Python :: 3.9", 32 | "Programming Language :: Python :: 3.10", 33 | "Operating System :: OS Independent" 34 | ], 35 | packages=["ntscraper"], 36 | include_package_data=True, 37 | install_requires=["requests", "beautifulsoup4", "lxml"], 38 | ) -------------------------------------------------------------------------------- /tests/profile_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestProfile(unittest.TestCase): 5 | def test_scrape_profile_info(self): 6 | """ 7 | Test scraping profile info of a username (Twitter, we need a stable username) 8 | """ 9 | nitter = Nitter() 10 | profile = nitter.get_profile_info("Twitter") 11 | self.assertEqual(profile['name'], "Twitter") 12 | self.assertEqual(profile['username'], "@Twitter") 13 | self.assertEqual(profile['bio'], "What's happening?!") 14 | self.assertEqual(profile['location'], 'everywhere') 15 | self.assertEqual(profile['website'], 'https://about.twitter.com/') 16 | self.assertEqual(profile['joined'], '2:35 PM - 20 Feb 2007') 17 | self.assertGreater(profile['stats']['tweets'], 0) 18 | self.assertGreater(profile['stats']['following'], 0) 19 | self.assertGreater(profile['stats']['followers'], 0) 20 | self.assertGreater(profile['stats']['likes'], 0) 21 | self.assertGreater(profile['stats']['media'], 0) 22 | self.assertEqual(profile['image'], 'https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_400x400.jpg') 23 | 24 | def test_scrape_profile_tweets(self): 25 | """ 26 | Test scraping profile tweets of a username (Twitter, we need a stable username) 27 | """ 28 | nitter = Nitter() 29 | tweets = nitter.get_tweets("Twitter", 'user') 30 | self.assertGreater(len(tweets['tweets']), 0) 31 | 32 | def test_scrape_profile_tweets_since(self): 33 | """ 34 | Test scraping profile tweets of a username (Twitter, we need a stable username) in a certain time period 35 | """ 36 | nitter = Nitter() 37 | tweets = 
nitter.get_tweets("Twitter", mode='user', since='2022-12-01', until='2022-12-31', number=1) 38 | self.assertGreater(len(tweets['threads']), 1) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unofficial Nitter scraper 2 | 3 | This is a simple library to scrape Nitter instances for tweets. It can: 4 | 5 | - search and scrape tweets with a certain term 6 | 7 | - search and scrape tweets with a certain hashtag 8 | 9 | - scrape tweets from a user profile 10 | 11 | - get profile information of a user, such as display name, username, number of tweets, profile picture ... 12 | 13 | If the instance to use is not provided to the scraper, it will use a random instance among those listed as "online" and "working" in https://github.com/zedeus/nitter/wiki/Instances. 14 | 15 | --- 16 | 17 | ## Installation 18 | 19 | ``` 20 | pip install ntscraper 21 | ``` 22 | 23 | ## How to use 24 | 25 | First, initialize the library: 26 | 27 | ``` 28 | from ntscraper import Nitter 29 | 30 | scraper = Nitter(log_level=1) 31 | ``` 32 | The valid logging levels are: 33 | - None = no logs 34 | - 0 = only warning and error logs 35 | - 1 = previous + informational logs (default) 36 | 37 | Then, choose the proper function for what you want to do from the following. 38 | 39 | ### Scrape tweets 40 | 41 | ``` 42 | github_hash_tweets = scraper.get_tweets("github", mode='hashtag') 43 | 44 | bezos_tweets = scraper.get_tweets("JeffBezos", mode='user') 45 | ``` 46 | 47 | Parameters: 48 | - term: search term 49 | - mode: modality to scrape the tweets. Default is 'term' which will look for tweets containing the search term. Other modes are 'hashtag' to search for a hashtag and 'user' to scrape tweets from a user profile 50 | - number: number of tweets to scrape. Default is 5. If 'since' is specified, this is bypassed. 51 | - since: date to start scraping from, formatted as YYYY-MM-DD. Default is None 52 | - until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None 53 | - max_retries: max retries to scrape a page. Default is 5 54 | - instance: Nitter instance to use. Default is None and will be chosen at random 55 | 56 | Returns a dictionary with tweets and threads for the term. 57 | 58 | ### Get profile information 59 | 60 | ``` 61 | bezos_information = scraper.get_profile_info("JeffBezos") 62 | ``` 63 | 64 | Parameters: 65 | - username: username of the page to scrape 66 | - max_retries: max retries to scrape a page. Default is 5 67 | - instance: Nitter instance to use. Default is None 68 | 69 | Returns a dictionary of the profile's information. 70 | 71 | ### Get random Nitter instance 72 | 73 | ``` 74 | random_instance = scraper.get_random_instance() 75 | ``` 76 | 77 | Returns a random Nitter instance. 78 | 79 | ## Note 80 | 81 | Due to recent changes on Twitter's side, some Nitter instances may not work properly even if they are marked as "working" on Nitter's wiki. If you have trouble scraping with a certain instance, try changing it and check if the problem persists. 
82 | 83 | ## To do list 84 | 85 | - [ ] Add scraping of individual posts with comments -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | 154 | .vscode/ 155 | 156 | test.py -------------------------------------------------------------------------------- /ntscraper/nitter.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import random 4 | from urllib.parse import unquote 5 | from time import sleep 6 | from base64 import b64decode 7 | from random import uniform 8 | from re import match 9 | from datetime import datetime 10 | import logging 11 | 12 | 13 | class Nitter: 14 | def __init__(self, log_level=1): 15 | """ 16 | Nitter scraper 17 | 18 | :param log_level: logging level. Default 1 19 | """ 20 | self.instances = self.__get_instances() 21 | if log_level == 0: 22 | log_level = logging.WARNING 23 | elif log_level == 1: 24 | log_level = logging.INFO 25 | elif log_level: 26 | raise ValueError("Invalid log level") 27 | 28 | logging.basicConfig(level=log_level, format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S') 29 | 30 | self.retry_count = 0 31 | self.cooldown_count = 0 32 | self.session_reset = False 33 | self.instance = "" 34 | 35 | def __initialize_session(self, instance): 36 | """ 37 | Initialize the requests session 38 | """ 39 | if instance is None: 40 | self.instance = self.get_random_instance() 41 | logging.info(f"No instance specified, using random instance {self.instance}") 42 | else: 43 | self.instance = instance 44 | self.r = requests.Session() 45 | self.r.headers.update( 46 | { 47 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0" 48 | } 49 | ) 50 | 51 | def __is_instance_encrypted(self): 52 | """ 53 | Check if the current instance uses encrypted media 54 | 55 | :return: True if encrypted, False otherwise 56 | """ 57 | soup = self.__get_page("/x") 58 | 59 | if soup is None: 60 | raise ValueError("Invalid instance") 61 | 62 | if ( 63 | soup.find("a", class_="profile-card-avatar").find("img") 64 | and "/enc/" 65 | in soup.find("a", class_="profile-card-avatar").find("img")["src"] 66 | ): 67 | return True 68 | else: 69 | return False 70 | 71 | def __get_instances(self): 72 | """ 73 | Fetch the list of clear web Nitter instances from the wiki 74 | 75 | :return: list of Nitter instances, or None if lookup failed 76 | """ 77 | r = requests.get("https://github.com/zedeus/nitter/wiki/Instances") 78 | instance_list = [] 79 | if r.ok: 80 | soup = BeautifulSoup(r.text, 
"lxml") 81 | official = soup.find_all("tbody")[0] 82 | instance_list.append(official.find("a")["href"]) 83 | table = soup.find_all("tbody")[1] 84 | for instance in table.find_all("tr"): 85 | columns = instance.find_all("td") 86 | if ( 87 | columns[1].text.strip() == "✅" 88 | ) and ( 89 | columns[2].text.strip() == "✅" 90 | ): 91 | url = instance.find("a")["href"] 92 | if not url.endswith(".onion"): 93 | instance_list.append(url) 94 | return instance_list 95 | else: 96 | return None 97 | 98 | def __get_new_instance(self, message): 99 | instance = self.get_random_instance() 100 | logging.warning(f"{message}. Trying {instance}") 101 | return instance 102 | 103 | def __get_page(self, endpoint, max_retries=5): 104 | """ 105 | Download page from Nitter instance 106 | 107 | :param endpoint: endpoint to use 108 | :param max_retries: max number of retries, default 5 109 | :return: page content, or None if max retries reached 110 | """ 111 | keep_trying = True 112 | soup = None 113 | while keep_trying and (self.retry_count < max_retries): 114 | try: 115 | self.r = requests.get( 116 | self.instance + endpoint, cookies={"hlsPlayback": "on", "infiniteScroll": ""}, timeout=5 117 | ) 118 | except: 119 | self.__initialize_session(instance = self.__get_new_instance(f"{self.instance} unreachable")) 120 | self.retry_count += 1 121 | self.cooldown_count = 0 122 | self.session_reset = True 123 | sleep(1) 124 | continue 125 | if self.r.ok: 126 | self.session_reset = False 127 | soup = BeautifulSoup(self.r.text, "lxml") 128 | if not soup.find( 129 | lambda tag: tag.name == "div" 130 | and (tag.get("class") == ["timeline-item"] or tag.get("class") == ["timeline-item", "thread"]) 131 | ): 132 | bottom_page = soup.find_all("div", class_="show-more") 133 | if bottom_page and bottom_page[-1].find("a").text == "Load newest": 134 | keep_trying = False 135 | soup = None 136 | else: 137 | self.__initialize_session(self.__get_new_instance(f"Empty profile on {self.instance}")) 138 | self.retry_count += 1 139 | else: 140 | keep_trying = False 141 | else: 142 | if "cursor" in endpoint: 143 | if not self.session_reset: 144 | logging.warning("Cooldown reached, trying again in 20 seconds") 145 | self.cooldown_count += 1 146 | sleep(20) 147 | if self.cooldown_count >= 5 and not self.session_reset: 148 | self.__initialize_session() 149 | self.session_reset = True 150 | self.cooldown_count = 0 151 | elif self.session_reset: 152 | self.__initialize_session(self.__get_new_instance(f"Error fetching {self.instance}")) 153 | else: 154 | self.cooldown_count = 0 155 | self.__initialize_session(self.__get_new_instance(f"Error fetching {self.instance}")) 156 | self.retry_count += 1 157 | sleep(2) 158 | current_retry_count = self.retry_count 159 | self.retry_count = 0 160 | if current_retry_count >= max_retries: 161 | logging.warning("Max retries reached. 
Check your request and try again.") 162 | return None 163 | 164 | return soup 165 | 166 | def __get_quoted_media(self, quoted_tweet, is_encrypted): 167 | """ 168 | Extract media from a quoted tweet 169 | 170 | :param quoted_tweet: tweet to extract media from 171 | :param is_encrypted: True if instance uses encrypted media 172 | :return: lists of images, videos and gifs, or empty lists if no media is found 173 | """ 174 | quoted_pictures, quoted_videos, quoted_gifs = [], [], [] 175 | if quoted_tweet.find("div", class_="attachments"): 176 | if is_encrypted: 177 | quoted_pictures = [ 178 | "https://pbs.twimg.com/" 179 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 180 | .decode("utf-8") 181 | .split("?")[0] 182 | for img in quoted_tweet.find("div", class_="attachments").find_all( 183 | "img" 184 | ) 185 | ] 186 | quoted_videos = [ 187 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 188 | "utf-8" 189 | ) if "data-url" in video.attrs 190 | else video.find("source")["src"] 191 | for video in quoted_tweet.find( 192 | "div", class_="attachments" 193 | ).find_all("video", class_="") 194 | ] 195 | quoted_gifs = [ 196 | "https://" 197 | + b64decode( 198 | gif.source["src"].split("/")[-1].encode("utf-8") 199 | ).decode("utf-8") 200 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 201 | "video", class_="gif" 202 | ) 203 | ] 204 | else: 205 | quoted_pictures = [ 206 | "https://pbs.twimg.com" 207 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 208 | for img in quoted_tweet.find("div", class_="attachments").find_all( 209 | "img" 210 | ) 211 | ] 212 | quoted_videos = [ 213 | unquote("https" + video["data-url"].split("https")[1]) 214 | if "data-url" in video.attrs 215 | else unquote(video.find("source")["src"]) 216 | for video in quoted_tweet.find( 217 | "div", class_="attachments" 218 | ).find_all("video", class_="") 219 | ] 220 | quoted_gifs = [ 221 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 222 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 223 | "video", class_="gif" 224 | ) 225 | ] 226 | return quoted_pictures, quoted_videos, quoted_gifs 227 | 228 | def __get_tweet_media(self, tweet, is_encrypted): 229 | """ 230 | Extract media from a tweet 231 | 232 | :param tweet: tweet to extract media from 233 | :param is_encrypted: True if instance uses encrypted media 234 | :return: lists of images, videos and gifs, or empty lists if no media is found 235 | """ 236 | pictures, videos, gifs = [], [], [] 237 | if tweet.find("div", class_="tweet-body").find( 238 | "div", class_="attachments", recursive=False 239 | ): 240 | if is_encrypted: 241 | pictures = [ 242 | "https://pbs.twimg.com/" 243 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 244 | .decode("utf-8") 245 | .split("?")[0] 246 | for img in tweet.find("div", class_="tweet-body") 247 | .find("div", class_="attachments", recursive=False) 248 | .find_all("img") 249 | ] 250 | videos = [ 251 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 252 | "utf-8" 253 | ) if "data-url" in video.attrs 254 | else video.find("source")["src"] 255 | for video in tweet.find("div", class_="tweet-body") 256 | .find("div", class_="attachments", recursive=False) 257 | .find_all("video", class_="") 258 | ] 259 | gifs = [ 260 | "https://" 261 | + b64decode( 262 | gif.source["src"].split("/")[-1].encode("utf-8") 263 | ).decode("utf-8") 264 | for gif in tweet.find("div", class_="tweet-body") 265 | .find("div", class_="attachments", recursive=False) 266 | 
.find_all("video", class_="gif") 267 | ] 268 | else: 269 | pictures = [ 270 | "https://pbs.twimg.com" 271 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 272 | for img in tweet.find("div", class_="tweet-body") 273 | .find("div", class_="attachments", recursive=False) 274 | .find_all("img") 275 | ] 276 | videos = [ 277 | unquote("https" + video["data-url"].split("https")[1]) 278 | if "data-url" in video.attrs 279 | else video.find("source")["src"] 280 | for video in tweet.find("div", class_="tweet-body") 281 | .find("div", class_="attachments", recursive=False) 282 | .find_all("video", class_="") 283 | ] 284 | gifs = [ 285 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 286 | for gif in tweet.find("div", class_="tweet-body") 287 | .find("div", class_="attachments", recursive=False) 288 | .find_all("video", class_="gif") 289 | ] 290 | return pictures, videos, gifs 291 | 292 | def __get_tweet_stats(self, tweet): 293 | """ 294 | Extract stats from a tweet 295 | 296 | :param tweet: tweet to extract stats from 297 | :return: dictionary of stats. If a stat is not found, it is set to 0 298 | """ 299 | return { 300 | "comments": int( 301 | tweet.find_all("span", class_="tweet-stat")[0] 302 | .find("div") 303 | .text.strip() 304 | .replace(",", "") 305 | or 0 306 | ), 307 | "retweets": int( 308 | tweet.find_all("span", class_="tweet-stat")[1] 309 | .find("div") 310 | .text.strip() 311 | .replace(",", "") 312 | or 0 313 | ), 314 | "quotes": int( 315 | tweet.find_all("span", class_="tweet-stat")[2] 316 | .find("div") 317 | .text.strip() 318 | .replace(",", "") 319 | or 0 320 | ), 321 | "likes": int( 322 | tweet.find_all("span", class_="tweet-stat")[3] 323 | .find("div") 324 | .text.strip() 325 | .replace(",", "") 326 | or 0 327 | ), 328 | } 329 | 330 | def __get_user(self, tweet, is_encrypted): 331 | """ 332 | Extract user from a tweet 333 | 334 | :param tweet: tweet to extract user from 335 | :param is_encrypted: True if instance uses encrypted media 336 | :return: dictionary of user 337 | """ 338 | if is_encrypted: 339 | avatar = "https://pbs.twimg.com/" + b64decode( 340 | tweet.find("img", class_="avatar")["src"].split("/")[-1].encode("utf-8") 341 | ).decode("utf-8") 342 | else: 343 | avatar = "https://pbs.twimg.com" + unquote( 344 | tweet.find("img", class_="avatar")["src"].split("/pic")[1] 345 | ) 346 | return { 347 | "name": tweet.find("a", class_="fullname").text.strip(), 348 | "username": tweet.find("a", class_="username").text.strip(), 349 | "avatar": avatar, 350 | } 351 | 352 | def __get_tweet_date(self, tweet): 353 | """ 354 | Extract date from a tweet 355 | 356 | :param tweet: tweet to extract date from 357 | :return: date of tweet 358 | """ 359 | return ( 360 | tweet.find("span", class_="tweet-date") 361 | .find("a")["title"] 362 | .split("/")[-1] 363 | .split("#")[0] 364 | ) 365 | 366 | def __get_tweet_text(self, tweet): 367 | """ 368 | Extract text from a tweet 369 | 370 | :param tweet: tweet to extract text from 371 | :return: text of tweet 372 | """ 373 | return ( 374 | tweet.find("div", class_="tweet-content media-body") 375 | .text.strip() 376 | .replace("\n", " ") 377 | if tweet.find("div", class_="tweet-content media-body") 378 | else tweet.find("div", class_="quote-text").text.strip().replace("\n", " ") 379 | if tweet.find("div", class_="quote-text") else "" 380 | ) 381 | 382 | def __get_tweet_link(self, tweet): 383 | """ 384 | Extract link from a tweet 385 | 386 | :param tweet: tweet to extract link from 387 | :return: link of tweet 388 | """ 389 | return 
"https://twitter.com" + tweet.find("a")["href"] 390 | 391 | def __get_external_link(self, tweet): 392 | """ 393 | Extract external link from a tweet 394 | 395 | :param tweet: tweet to extract external link from 396 | :return: external link of tweet 397 | """ 398 | return ( 399 | tweet.find("a", class_="card-container")["href"] 400 | if tweet.find("a", class_="card-container") 401 | else "" 402 | ) 403 | 404 | def __extract_tweet(self, tweet, is_encrypted): 405 | """ 406 | Extract content from a tweet 407 | 408 | :param tweet: tweet to extract content from 409 | :param is_encrypted: True if instance uses encrypted media 410 | :return: dictionary of content for the tweet 411 | """ 412 | # Replace link text with link 413 | if tweet.find_all("a"): 414 | for link in tweet.find_all("a"): 415 | if "https" in link["href"]: 416 | link.replace_with(link["href"]) 417 | 418 | # Extract the quoted tweet 419 | quoted_tweet = ( 420 | tweet.find("div", class_="quote") 421 | if tweet.find("div", class_="quote") 422 | else None 423 | ) 424 | 425 | # Extract media from the quoted tweet 426 | if quoted_tweet: 427 | deleted = False 428 | if quoted_tweet["class"] == ["quote", "unavailable"]: 429 | deleted = True 430 | ( 431 | quoted_pictures, 432 | quoted_videos, 433 | quoted_gifs, 434 | ) = self.__get_quoted_media(quoted_tweet, is_encrypted) 435 | 436 | # Extract media from the tweet 437 | pictures, videos, gifs = self.__get_tweet_media(tweet, is_encrypted) 438 | 439 | return { 440 | "link": self.__get_tweet_link(tweet), 441 | "text": self.__get_tweet_text(tweet), 442 | "user": self.__get_user(tweet, is_encrypted), 443 | "date": self.__get_tweet_date(tweet), 444 | "is-retweet": tweet.find("div", class_="retweet-header") 445 | is not None, 446 | "external-link": self.__get_external_link(tweet), 447 | "quoted-post": { 448 | "link": self.__get_tweet_link(quoted_tweet) if not deleted else "", 449 | "text": self.__get_tweet_text(quoted_tweet) if not deleted else "", 450 | "user": self.__get_user(quoted_tweet, is_encrypted) if not deleted else {}, 451 | "date": self.__get_tweet_date(quoted_tweet) if not deleted else "", 452 | "pictures": quoted_pictures, 453 | "videos": quoted_videos, 454 | "gifs": quoted_gifs, 455 | } 456 | if quoted_tweet 457 | else {}, 458 | "stats": self.__get_tweet_stats(tweet), 459 | "pictures": pictures, 460 | "videos": videos, 461 | "gifs": gifs, 462 | } 463 | 464 | def __check_date_validity(self, date): 465 | """ 466 | Check if a date is valid 467 | 468 | :param date: date to check 469 | :return: True if date is valid 470 | """ 471 | to_return = True 472 | if not match(r"^\d{4}-\d{2}-\d{2}$", date): 473 | to_return = False 474 | try: 475 | year, month, day = [int(number) for number in date.split("-")] 476 | datetime(year=year,month=month,day=day) 477 | except: 478 | to_return = False 479 | 480 | if not (datetime(year=2006, month=3, day=21) < datetime(year=year,month=month,day=day) <= datetime.now()): 481 | to_return = False 482 | 483 | return to_return 484 | 485 | def __search(self, term, mode, number, since, until, max_retries, instance): 486 | """ 487 | Scrape the specified search terms from Nitter 488 | 489 | :param term: term to seach for 490 | :param number: number of tweets to scrape. 491 | :param since: date to start scraping from. 492 | :param until: date to stop scraping at. 493 | :param max_retries: max retries to scrape a page. 494 | :param instance: Nitter instance to use. 495 | :param mode: search mode. 496 | :return: dictionary of tweets and threads for the term. 
497 | """ 498 | tweets = {"tweets": [], "threads": []} 499 | if mode == "hashtag": 500 | endpoint = "/search?f=tweets&q=%23" + term 501 | elif mode == "term": 502 | endpoint = "/search?f=tweets&q=" + term 503 | elif mode == "user": 504 | if since or until: 505 | endpoint = f"/{term}/search?f=tweets&q=" 506 | else: 507 | endpoint = f"/{term}" 508 | else: 509 | raise ValueError("Invalid mode. Use 'term', 'hashtag', or 'user'.") 510 | 511 | self.__initialize_session(instance) 512 | 513 | if since: 514 | if self.__check_date_validity(since): 515 | endpoint += f"&since={since}" 516 | else: 517 | raise ValueError("Invalid 'since' date. Use the YYYY-MM-DD format and make sure the date is valid.") 518 | 519 | if until: 520 | if self.__check_date_validity(until): 521 | endpoint += f"&until={until}" 522 | else: 523 | raise ValueError("Invalid 'until' date. Use the YYYY-MM-DD format and make sure the date is valid.") 524 | 525 | soup = self.__get_page(endpoint, max_retries) 526 | 527 | 528 | if soup is None: 529 | return None 530 | 531 | is_encrypted = self.__is_instance_encrypted() 532 | 533 | already_scraped = set() 534 | 535 | keep_scraping = True 536 | while keep_scraping: 537 | thread = [] 538 | 539 | for tweet in soup.find_all("div", class_="timeline-item"): 540 | if len(tweet["class"]) == 1: 541 | to_append = self.__extract_tweet(tweet, is_encrypted) 542 | # Extract tweets 543 | if len(tweets["tweets"]) + len(tweets["threads"]) < number or (since and until) or since: 544 | if self.__get_tweet_link(tweet) not in already_scraped: 545 | tweets["tweets"].append(to_append) 546 | already_scraped.add(self.__get_tweet_link(tweet)) 547 | else: 548 | keep_scraping = False 549 | break 550 | else: 551 | if "thread" in tweet["class"]: 552 | to_append = self.__extract_tweet(tweet, is_encrypted) 553 | # Extract threads 554 | if self.__get_tweet_link(tweet) not in already_scraped: 555 | thread.append(to_append) 556 | already_scraped.add(self.__get_tweet_link(tweet)) 557 | 558 | if len(tweet["class"]) == 3: 559 | tweets["threads"].append(thread) 560 | thread = [] 561 | 562 | logging.info(f"Current stats: {len(tweets['tweets'])} tweets, {len(tweets['threads'])} threads...") 563 | if not(since and until) and not(since) and len(tweets["tweets"]) + len(tweets["threads"]) >= number: 564 | keep_scraping = False 565 | else: 566 | sleep(uniform(1, 2)) 567 | 568 | # Go to the next page 569 | show_more_buttons = soup.find_all("div", class_="show-more") 570 | if soup.find_all("div", class_="show-more"): 571 | if mode == "user": 572 | if since or until: 573 | next_page = ( 574 | f"/{term}/search?" 575 | + show_more_buttons[-1] 576 | .find("a")["href"] 577 | .split("?")[-1] 578 | ) 579 | else: 580 | next_page = ( 581 | f"/{term}?" 582 | + show_more_buttons[-1] 583 | .find("a")["href"] 584 | .split("?")[-1] 585 | ) 586 | else: 587 | next_page = ( 588 | "/search" 589 | + show_more_buttons[-1].find("a")[ 590 | "href" 591 | ] 592 | ) 593 | soup = self.__get_page(next_page, max_retries) 594 | if soup is None: 595 | keep_scraping = False 596 | else: 597 | keep_scraping = False 598 | return tweets 599 | 600 | def get_random_instance(self): 601 | """ 602 | Get a random Nitter instance 603 | 604 | :return: URL of random Nitter instance 605 | """ 606 | return random.choice(self.instances) 607 | 608 | def get_tweets(self, term, mode='term', number=5, since=None, until=None, max_retries=5, instance=None): 609 | """ 610 | Scrape the specified term from Nitter 611 | 612 | :param term: string to search for 613 | :param mode: search mode. 
Default is 'term', can also be 'hashtag' or 'user' 614 | :param number: number of tweets to scrape. Default is 5. If 'since' is specified, this is bypassed. 615 | :param since: date to start scraping from, formatted as YYYY-MM-DD. Default is None 616 | :param until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None 617 | :param max_retries: max retries to scrape a page. Default is 5 618 | :param instance: Nitter instance to use. Default is None 619 | :return: dictionary with tweets and threads for the term 620 | """ 621 | return self.__search(term, mode, number, since, until, max_retries, instance) 622 | 623 | def get_profile_info(self, username, max_retries=5, instance=None): 624 | """ 625 | Get profile information for a user 626 | 627 | :param username: username of the page to scrape 628 | :param max_retries: max retries to scrape a page. Default is 5 629 | :param instance: Nitter instance to use. Default is None 630 | :return: dictionary of the profile's information 631 | """ 632 | self.__initialize_session(instance) 633 | soup = self.__get_page(f"/{username}", max_retries) 634 | if soup is None: 635 | return None 636 | 637 | is_encrypted = self.__is_instance_encrypted() 638 | # Extract id if the banner exists, no matter if the instance uses base64 or not 639 | if soup.find("div", class_="profile-banner").find("img") and is_encrypted: 640 | profile_id = ( 641 | b64decode( 642 | soup.find("div", class_="profile-banner") 643 | .find("img")["src"] 644 | .split("/enc/")[1] 645 | .encode("utf-8") 646 | ) 647 | .decode("utf-8") 648 | .split("/profile_banners/")[1] 649 | .split("/")[0] 650 | ) 651 | elif soup.find("div", class_="profile-banner").find("img"): 652 | profile_id = ( 653 | unquote(soup.find("div", class_="profile-banner").find("img")["src"]) 654 | .split("profile_banners/")[1] 655 | .split("/")[0] 656 | ) 657 | else: 658 | profile_id = "" 659 | 660 | # Extract profile image, no matter if the instance uses base64 or not 661 | if soup.find("a", class_="profile-card-avatar").find("img") and is_encrypted: 662 | profile_image = "https://" + b64decode( 663 | soup.find("a", class_="profile-card-avatar") 664 | .find("img")["src"] 665 | .split("/enc/")[1] 666 | .encode("utf-8") 667 | ).decode("utf-8") 668 | elif soup.find("a", class_="profile-card-avatar").find("img"): 669 | profile_image = ( 670 | "https://" 671 | + unquote( 672 | soup.find("a", class_="profile-card-avatar").find("img")["src"] 673 | ).split("/pic/")[1] 674 | ) 675 | else: 676 | profile_image = "" 677 | 678 | return { 679 | "image": profile_image, 680 | "name": soup.find("a", class_="profile-card-fullname").text.strip(), 681 | "username": soup.find("a", class_="profile-card-username").text.strip(), 682 | "id": profile_id, 683 | "bio": soup.find("div", class_="profile-bio").p.text.strip() 684 | if soup.find("div", class_="profile-bio") 685 | else "", 686 | "location": soup.find("div", class_="profile-location") 687 | .find_all("span")[-1] 688 | .text.strip() 689 | if soup.find("div", class_="profile-location") 690 | else "", 691 | "website": soup.find("div", class_="profile-website").find("a")["href"] 692 | if soup.find("div", class_="profile-website") 693 | else "", 694 | "joined": soup.find("div", class_="profile-joindate").find("span")["title"], 695 | "stats": { 696 | "tweets": int( 697 | soup.find("ul", class_="profile-statlist") 698 | .find("li", class_="posts") 699 | .find_all("span")[1] 700 | .text.strip() 701 | .replace(",", "") 702 | ), 703 | "following": int( 704 | soup.find("ul", 
class_="profile-statlist") 705 | .find("li", class_="following") 706 | .find_all("span")[1] 707 | .text.strip() 708 | .replace(",", "") 709 | ), 710 | "followers": int( 711 | soup.find("ul", class_="profile-statlist") 712 | .find("li", class_="followers") 713 | .find_all("span")[1] 714 | .text.strip() 715 | .replace(",", "") 716 | ), 717 | "likes": int( 718 | soup.find("ul", class_="profile-statlist") 719 | .find("li", class_="likes") 720 | .find_all("span")[1] 721 | .text.strip() 722 | .replace(",", "") 723 | ), 724 | "media": int( 725 | soup.find("div", class_="photo-rail-header") 726 | .find("div", class_="icon-container") 727 | .text.strip() 728 | .replace(",", "") 729 | .split(" ")[0] 730 | ) 731 | }, 732 | } 733 | --------------------------------------------------------------------------------