├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── ntscraper ├── __init__.py └── nitter.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── instances_test.py ├── profile_test.py ├── search_test.py └── tweet_test.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | 154 | .vscode/ 155 | 156 | test.py -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lorenzo Bocchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unofficial Nitter scraper 2 | 3 | ## Note 4 | 5 | Twitter has recently made some changes which affected every third party Twitter client, including Nitter. As a result, most Nitter instances have shut down or will shut down shortly. Even local instances are affected by this, so you may not be able to scrape as many tweets as expected, if at all. 6 | 7 | ## The scraper 8 | 9 | This is a simple library to scrape Nitter instances for tweets. It can: 10 | 11 | - search and scrape tweets with a certain term 12 | 13 | - search and scrape tweets with a certain hashtag 14 | 15 | - scrape tweets from a user profile 16 | 17 | - get profile information of a user, such as display name, username, number of tweets, profile picture ... 18 | 19 | If the instance to use is not provided to the scraper, it will use a random public instance. 
If you can, please host your own instance in order to avoid overloading the public ones and to help keep Nitter alive for everyone. You can read more about that here: https://github.com/zedeus/nitter#installation.
20 | 
21 | ---
22 | 
23 | ## Installation
24 | 
25 | ```
26 | pip install ntscraper
27 | ```
28 | 
29 | ## How to use
30 | 
31 | First, initialize the library:
32 | 
33 | ```python
34 | from ntscraper import Nitter
35 | 
36 | scraper = Nitter(log_level=1, skip_instance_check=False)
37 | ```
38 | The valid logging levels are:
39 | - None = no logs
40 | - 0 = only warning and error logs
41 | - 1 = previous + informational logs (default)
42 | 
43 | The `skip_instance_check` parameter is used to skip the health check of the Nitter instances altogether during the execution of the script. If you use your own instance or trust the instance you are relying on, you can set it to `True`; otherwise it's better to leave it set to `False`.
44 | 
45 | Then, choose the appropriate function for what you want to do from the following.
46 | 
47 | ### Scrape tweets
48 | 
49 | ```python
50 | github_hash_tweets = scraper.get_tweets("github", mode='hashtag')
51 | 
52 | bezos_tweets = scraper.get_tweets("JeffBezos", mode='user')
53 | ```
54 | 
55 | Parameters:
56 | - term: search term
57 | - mode: modality to scrape the tweets. Default is 'term' which will look for tweets containing the search term. Other modes are 'hashtag' to search for a hashtag and 'user' to scrape tweets from a user profile
58 | - number: number of tweets to scrape. Default is -1 (no limit).
59 | - since: date to start scraping from, formatted as YYYY-MM-DD. Default is None
60 | - until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None
61 | - near: location to search tweets from. Default is None (anywhere)
62 | - language: language of the tweets to search. Default is None (any language). The language must be specified as a 2-letter ISO 639-1 code (e.g. 'en' for English, 'es' for Spanish, 'fr' for French ...)
63 | - to: user to which the tweets are directed. Default is None (any user). For example, if you want to search for tweets directed to @github, you would set this parameter to 'github'
64 | - replies: whether to include replies in the search. If 'filters' or 'exclude' are set, this is overridden. Default is False
65 | - filters: list of filters to apply to the search. Default is None. Valid filters are: 'nativeretweets', 'media', 'videos', 'news', 'verified', 'native_video', 'replies', 'links', 'images', 'safe', 'quote', 'pro_video'
66 | - exclude: list of filters to exclude from the search. Default is None. Valid filters are the same as above
67 | - max_retries: max retries to scrape a page. Default is 5
68 | - instance: Nitter instance to use. Default is None and will be chosen at random
69 | 
70 | Returns a dictionary with tweets and threads for the term.
71 | 
72 | #### Multiprocessing
73 | 
74 | You can also scrape multiple terms at once using multiprocessing:
75 | 
76 | ```python
77 | terms = ["github", "bezos", "musk"]
78 | 
79 | results = scraper.get_tweets(terms, mode='term')
80 | ```
81 | 
82 | Each term will be scraped in a different process. The result will be a list of dictionaries, one for each term.
83 | 
84 | The multiprocessing code needs to run in an `if __name__ == "__main__"` block to avoid errors, as shown in the sketch below. With multiprocessing, only full logging is supported. Also, the number of processes is limited to the number of available cores on your machine.
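
A minimal sketch of such a guarded script (the search terms and the `number` limit are only illustrative):

```python
from ntscraper import Nitter

def main():
    scraper = Nitter(log_level=1)
    terms = ["github", "bezos", "musk"]
    # Each term is scraped in its own process; the result is a list of
    # dictionaries, one per term, in the same order as the input list.
    results = scraper.get_tweets(terms, mode='term', number=50)
    for term, result in zip(terms, results):
        print(f"{term}: {len(result['tweets'])} tweets, {len(result['threads'])} threads")

if __name__ == "__main__":
    main()
```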
85 | 
86 | NOTE: using multiprocessing on public instances is highly discouraged since it puts too much load on the servers and could potentially also get you rate limited. Please only use it on your local instance.
87 | 
88 | ### Get single tweet
89 | 
90 | ```python
91 | tweet = scraper.get_tweet_by_id("x", "1826317783430303888")
92 | ```
93 | 
94 | Parameters:
95 | - username: username of the tweet's author
96 | - tweet_id: ID of the tweet
97 | - instance: Nitter instance to use. Default is None
98 | - max_retries: max retries to scrape a page. Default is 5
99 | 
100 | Returns a dictionary with the tweet's content.
101 | 
102 | ### Get profile information
103 | 
104 | ```python
105 | bezos_information = scraper.get_profile_info("JeffBezos")
106 | ```
107 | 
108 | Parameters:
109 | - username: username of the page to scrape
110 | - max_retries: max retries to scrape a page. Default is 5
111 | - instance: Nitter instance to use. Default is None
112 | - mode: mode of fetching profile info. 'simple' for basic info, 'detail' for detailed info including following and followers lists. Default is 'simple'
113 | 
114 | Returns a dictionary of the profile's information.
115 | 
116 | #### Multiprocessing
117 | 
118 | As with term scraping, you can also get info from multiple profiles at once using multiprocessing:
119 | 
120 | ```python
121 | usernames = ["x", "github"]
122 | 
123 | results = scraper.get_profile_info(usernames)
124 | ```
125 | 
126 | Each user will be scraped in a different process. The result will be a list of dictionaries, one for each user.
127 | 
128 | The multiprocessing code needs to run in an `if __name__ == "__main__"` block to avoid errors. With multiprocessing, only full logging is supported. Also, the number of processes is limited to the number of available cores on your machine.
129 | 
130 | NOTE: using multiprocessing on public instances is highly discouraged since it puts too much load on the servers and could potentially also get you rate limited. Please only use it on your local instance.
131 | 
132 | ### Get random Nitter instance
133 | 
134 | ```python
135 | random_instance = scraper.get_random_instance()
136 | ```
137 | 
138 | Returns a random Nitter instance.
139 | 
140 | ## Note
141 | 
142 | Due to recent changes on Twitter's side, some Nitter instances may not work properly even if they are marked as "working" on Nitter's wiki. If you have trouble scraping with a certain instance, try changing it and check if the problem persists.
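
If you host your own Nitter instance, you can also pin the scraper to it and skip the instance check entirely. A small sketch, assuming a local instance at `http://localhost:8080` (replace with your own URL, and note the example term and limits are only illustrative); with `skip_instance_check=True`, the instance must be passed explicitly to each call:

```python
from ntscraper import Nitter

LOCAL_INSTANCE = "http://localhost:8080"  # assumed local Nitter instance

# Skip the startup health check and rely only on the local instance
scraper = Nitter(instances=LOCAL_INSTANCE, skip_instance_check=True)

# With the check skipped, pass the instance explicitly on each request
tweets = scraper.get_tweets("github", mode='hashtag', number=20, instance=LOCAL_INSTANCE)
profile = scraper.get_profile_info("github", instance=LOCAL_INSTANCE)
```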
143 | 144 | ## To do list 145 | 146 | - [ ] Add scraping of individual posts with comments -------------------------------------------------------------------------------- /ntscraper/__init__.py: -------------------------------------------------------------------------------- 1 | from .nitter import Nitter -------------------------------------------------------------------------------- /ntscraper/nitter.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import random 4 | from urllib.parse import unquote, urlparse 5 | from time import sleep 6 | from base64 import b64decode 7 | from random import uniform 8 | from re import match, sub 9 | from datetime import datetime 10 | import logging 11 | from logging.handlers import QueueHandler 12 | from multiprocessing import Pool, Queue, cpu_count 13 | from sys import stdout 14 | from tqdm import tqdm 15 | 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format="%(asctime)s - %(message)s", 19 | datefmt="%d-%b-%y %H:%M:%S", 20 | handlers=[logging.StreamHandler(stdout)], 21 | ) 22 | 23 | log_queue = Queue() 24 | log_handler = QueueHandler(log_queue) 25 | root_logger = logging.getLogger() 26 | root_logger.addHandler(log_handler) 27 | 28 | valid_filters = [ 29 | "nativeretweets", 30 | "media", 31 | "videos", 32 | "news", 33 | "verified", 34 | "native_video", 35 | "replies", 36 | "links", 37 | "images", 38 | "safe", 39 | "quote", 40 | "pro_video", 41 | ] 42 | 43 | 44 | class Nitter: 45 | def __init__(self, instances=None, log_level=1, skip_instance_check=False): 46 | """ 47 | Nitter scraper 48 | :param instances: accepts a list of instances or a single instance in this format: "https://{host}:{port}", e.g. "http://localhost:8080 49 | :param log_level: logging level 50 | :param skip_instance_check: True if the health check of all instances and the instance change during execution should be skipped 51 | """ 52 | if instances: 53 | # check instances type is list or str 54 | if isinstance(instances, list): 55 | self.instances = instances 56 | elif isinstance(instances, str): 57 | self.instances = [instances] 58 | else: 59 | raise ValueError("Instances type not supported, only list and str are supported") 60 | else: 61 | self.instances = self._get_instances() 62 | if self.instances is None: 63 | raise ValueError("Could not fetch instances") 64 | self.working_instances = [] 65 | self.skip_instance_check = skip_instance_check 66 | if skip_instance_check: 67 | self.working_instances = self.instances 68 | else: 69 | self._test_all_instances("/x", no_print=True) 70 | if log_level == 0: 71 | log_level = logging.WARNING 72 | elif log_level == 1: 73 | log_level = logging.INFO 74 | elif log_level: 75 | raise ValueError("Invalid log level") 76 | 77 | logger = logging.getLogger() 78 | logger.setLevel(log_level) 79 | 80 | self.retry_count = 0 81 | self.cooldown_count = 0 82 | self.session_reset = False 83 | self.instance = "" 84 | self.r = None 85 | 86 | def _initialize_session(self, instance): 87 | """ 88 | Initialize the requests session 89 | """ 90 | if instance is None: 91 | if self.skip_instance_check: 92 | raise ValueError("No instance specified and instance check skipped") 93 | self.instance = self.get_random_instance() 94 | logging.info( 95 | f"No instance specified, using random instance {self.instance}" 96 | ) 97 | else: 98 | self.instance = instance 99 | self.r = requests.Session() 100 | self.r.headers.update( 101 | { 102 | "User-Agent": "Mozilla/5.0 (Windows NT 
10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0", 103 | "Host": self.instance.split("://")[1], 104 | } 105 | ) 106 | 107 | def _is_instance_encrypted(self): 108 | """ 109 | Check if the current instance uses encrypted media 110 | 111 | :return: True if encrypted, False otherwise 112 | """ 113 | soup = self._get_page("/x") 114 | 115 | if soup is None: 116 | raise ValueError("Invalid instance") 117 | 118 | if ( 119 | soup.find("a", class_="profile-card-avatar").find("img") 120 | and "/enc/" 121 | in soup.find("a", class_="profile-card-avatar").find("img")["src"] 122 | ): 123 | return True 124 | else: 125 | return False 126 | 127 | def _get_instances(self): 128 | """ 129 | Fetch the list of clear web Nitter instances. 130 | 131 | :return: list of Nitter instances, or None if lookup failed 132 | """ 133 | r = requests.get("https://raw.githubusercontent.com/libredirect/instances/main/data.json") 134 | if r.ok: 135 | return r.json()["nitter"]["clearnet"] 136 | else: 137 | return None 138 | 139 | def _test_all_instances(self, endpoint, no_print=False): 140 | """ 141 | Test all Nitter instances when a high number of retries is detected 142 | 143 | :param endpoint: endpoint to use 144 | :param no_print: True if no output should be printed 145 | """ 146 | if not no_print: 147 | print("High number of retries detected. Testing all instances...") 148 | working_instances = [] 149 | 150 | for instance in tqdm(self.instances, desc="Testing instances"): 151 | self._initialize_session(instance) 152 | req_session = requests.Session() 153 | req_session.headers.update( 154 | { 155 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0" 156 | } 157 | ) 158 | try: 159 | r = req_session.get( 160 | instance + endpoint, 161 | cookies={"hlsPlayback": "on"}, 162 | timeout=10, 163 | ) 164 | if r.ok: 165 | soup = BeautifulSoup(r.text, "lxml") 166 | if soup is not None and len( 167 | soup.find_all("div", class_="timeline-item") 168 | ): 169 | working_instances.append(instance) 170 | except: 171 | pass 172 | if not no_print: 173 | print("New working instances:", ", ".join(working_instances)) 174 | self.working_instances = working_instances 175 | 176 | def _get_new_instance(self, message): 177 | instance = self.get_random_instance() 178 | logging.warning(f"{message}. Trying {instance}") 179 | return instance 180 | 181 | def _check_error_page(self, soup): 182 | """ 183 | Check if the page contains an error. 
If so, print the error and return None 184 | 185 | :param soup: page to check 186 | :return: None if error is found, soup otherwise 187 | """ 188 | if not soup.find( 189 | lambda tag: tag.name == "div" 190 | and ( 191 | tag.get("class") == ["timeline-item"] 192 | or tag.get("class") == ["timeline-item", "thread"] 193 | ) 194 | ): 195 | if soup.find("div", class_="error-panel"): 196 | message = ( 197 | f"Fetching error: " 198 | + soup.find("div", class_="error-panel").find("span").text.strip() 199 | ) 200 | else: 201 | if soup.find("div", class_="timeline-header timeline-protected"): 202 | message = "Account is protected" 203 | else: 204 | message = f"Empty page on {self.instance}" 205 | logging.warning(message) 206 | soup = None 207 | return soup 208 | 209 | def _get_page(self, endpoint, max_retries=5): 210 | """ 211 | Download page from Nitter instance 212 | 213 | :param endpoint: endpoint to use 214 | :param max_retries: max number of retries, default 5 215 | :return: page content, or None if max retries reached 216 | """ 217 | keep_trying = True 218 | soup = None 219 | while keep_trying and (self.retry_count < max_retries): 220 | try: 221 | r = self.r.get( 222 | self.instance + endpoint, 223 | cookies={"hlsPlayback": "on", "infiniteScroll": ""}, 224 | timeout=10, 225 | ) 226 | except: 227 | if self.retry_count == max_retries // 2: 228 | if not self.skip_instance_check: 229 | self._test_all_instances(endpoint) 230 | if not self.working_instances: 231 | logging.warning( 232 | "All instances are unreachable. Check your request and try again." 233 | ) 234 | return None 235 | if not self.skip_instance_check: 236 | self._initialize_session( 237 | instance=self._get_new_instance(f"{self.instance} unreachable") 238 | ) 239 | self.retry_count += 1 240 | self.cooldown_count = 0 241 | self.session_reset = True 242 | sleep(1) 243 | continue 244 | soup = BeautifulSoup(r.text, "lxml") 245 | if r.ok: 246 | self.session_reset = False 247 | soup = self._check_error_page(soup) 248 | keep_trying = False 249 | else: 250 | soup = self._check_error_page(soup) 251 | if soup is None: 252 | keep_trying = False 253 | else: 254 | if self.retry_count == max_retries // 2: 255 | if not self.skip_instance_check: 256 | self._test_all_instances(endpoint) 257 | if not self.working_instances: 258 | logging.warning( 259 | "All instances are unreachable. Check your request and try again." 260 | ) 261 | soup = None 262 | keep_trying = False 263 | else: 264 | self.retry_count += 1 265 | else: 266 | if "cursor" in endpoint: 267 | if not self.session_reset: 268 | logging.warning( 269 | "Cooldown reached, trying again in 20 seconds" 270 | ) 271 | self.cooldown_count += 1 272 | sleep(20) 273 | if self.cooldown_count >= 5 and not self.session_reset: 274 | if not self.skip_instance_check: 275 | self._initialize_session() 276 | else: 277 | self._initialize_session(self.instance) 278 | self.session_reset = True 279 | self.cooldown_count = 0 280 | elif self.session_reset: 281 | if not self.skip_instance_check: 282 | self._initialize_session( 283 | self._get_new_instance( 284 | f"Error fetching {self.instance}" 285 | ) 286 | ) 287 | else: 288 | self.cooldown_count = 0 289 | if not self.skip_instance_check: 290 | self._initialize_session( 291 | self._get_new_instance( 292 | f"Error fetching {self.instance}" 293 | ) 294 | ) 295 | self.retry_count += 1 296 | sleep(2) 297 | 298 | if self.retry_count >= max_retries: 299 | logging.warning("Max retries reached. 
Check your request and try again.") 300 | soup = None 301 | self.retry_count = 0 302 | 303 | return soup 304 | 305 | def _get_quoted_media(self, quoted_tweet, is_encrypted): 306 | """ 307 | Extract media from a quoted tweet 308 | 309 | :param quoted_tweet: tweet to extract media from 310 | :param is_encrypted: True if instance uses encrypted media 311 | :return: lists of images, videos and gifs, or empty lists if no media is found 312 | """ 313 | quoted_pictures, quoted_videos, quoted_gifs = [], [], [] 314 | if quoted_tweet.find("div", class_="attachments"): 315 | if is_encrypted: 316 | quoted_pictures = [ 317 | "https://pbs.twimg.com/" 318 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 319 | .decode("utf-8") 320 | .split("?")[0] 321 | for img in quoted_tweet.find("div", class_="attachments").find_all( 322 | "img" 323 | ) 324 | ] 325 | quoted_videos = [ 326 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 327 | "utf-8" 328 | ) 329 | if "data-url" in video.attrs 330 | else video.find("source")["src"] 331 | for video in quoted_tweet.find( 332 | "div", class_="attachments" 333 | ).find_all("video", class_="") 334 | ] 335 | quoted_gifs = [ 336 | "https://" 337 | + b64decode( 338 | gif.source["src"].split("/")[-1].encode("utf-8") 339 | ).decode("utf-8") 340 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 341 | "video", class_="gif" 342 | ) 343 | ] 344 | else: 345 | quoted_pictures = [ 346 | "https://pbs.twimg.com" 347 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 348 | for img in quoted_tweet.find("div", class_="attachments").find_all( 349 | "img" 350 | ) 351 | ] 352 | quoted_videos = [ 353 | unquote("https" + video["data-url"].split("https")[1]) 354 | if "data-url" in video.attrs 355 | else unquote(video.find("source")["src"]) 356 | for video in quoted_tweet.find( 357 | "div", class_="attachments" 358 | ).find_all("video", class_="") 359 | ] 360 | quoted_gifs = [ 361 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 362 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 363 | "video", class_="gif" 364 | ) 365 | ] 366 | return quoted_pictures, quoted_videos, quoted_gifs 367 | 368 | def _get_tweet_media(self, tweet, is_encrypted): 369 | """ 370 | Extract media from a tweet 371 | 372 | :param tweet: tweet to extract media from 373 | :param is_encrypted: True if instance uses encrypted media 374 | :return: lists of images, videos and gifs, or empty lists if no media is found 375 | """ 376 | pictures, videos, gifs = [], [], [] 377 | if tweet.find("div", class_="tweet-body").find( 378 | "div", class_="attachments", recursive=False 379 | ): 380 | if is_encrypted: 381 | pictures = [ 382 | "https://pbs.twimg.com/" 383 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 384 | .decode("utf-8") 385 | .split("?")[0] 386 | for img in tweet.find("div", class_="tweet-body") 387 | .find("div", class_="attachments", recursive=False) 388 | .find_all("img") 389 | ] 390 | videos = [ 391 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 392 | "utf-8" 393 | ) 394 | if "data-url" in video.attrs 395 | else video.find("source")["src"] 396 | for video in tweet.find("div", class_="tweet-body") 397 | .find("div", class_="attachments", recursive=False) 398 | .find_all("video", class_="") 399 | ] 400 | gifs = [ 401 | "https://" 402 | + b64decode( 403 | gif.source["src"].split("/")[-1].encode("utf-8") 404 | ).decode("utf-8") 405 | for gif in tweet.find("div", class_="tweet-body") 406 | .find("div", 
class_="attachments", recursive=False) 407 | .find_all("video", class_="gif") 408 | ] 409 | else: 410 | pictures = [ 411 | "https://pbs.twimg.com" 412 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 413 | for img in tweet.find("div", class_="tweet-body") 414 | .find("div", class_="attachments", recursive=False) 415 | .find_all("img") 416 | ] 417 | videos = [ 418 | unquote("https" + video["data-url"].split("https")[1]) 419 | if "data-url" in video.attrs 420 | else video.find("source")["src"] 421 | for video in tweet.find("div", class_="tweet-body") 422 | .find("div", class_="attachments", recursive=False) 423 | .find_all("video", class_="") 424 | ] 425 | gifs = [ 426 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 427 | for gif in tweet.find("div", class_="tweet-body") 428 | .find("div", class_="attachments", recursive=False) 429 | .find_all("video", class_="gif") 430 | ] 431 | return pictures, videos, gifs 432 | 433 | def _get_tweet_stats(self, tweet): 434 | """ 435 | Extract stats from a tweet 436 | 437 | :param tweet: tweet to extract stats from 438 | :return: dictionary of stats. If a stat is not found, it is set to 0 439 | """ 440 | return { 441 | "comments": int( 442 | tweet.find_all("span", class_="tweet-stat")[0] 443 | .find("div") 444 | .text.strip() 445 | .replace(",", "") 446 | or 0 447 | ), 448 | "retweets": int( 449 | tweet.find_all("span", class_="tweet-stat")[1] 450 | .find("div") 451 | .text.strip() 452 | .replace(",", "") 453 | or 0 454 | ), 455 | "quotes": int( 456 | tweet.find_all("span", class_="tweet-stat")[2] 457 | .find("div") 458 | .text.strip() 459 | .replace(",", "") 460 | or 0 461 | ), 462 | "likes": int( 463 | tweet.find_all("span", class_="tweet-stat")[3] 464 | .find("div") 465 | .text.strip() 466 | .replace(",", "") 467 | or 0 468 | ), 469 | } 470 | 471 | def _get_user(self, tweet, is_encrypted): 472 | """ 473 | Extract user from a tweet 474 | 475 | :param tweet: tweet to extract user from 476 | :param is_encrypted: True if instance uses encrypted media 477 | :return: dictionary of user 478 | """ 479 | avatar = "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png" # Default avatar 480 | profile_id = "unknown" # Default profile ID 481 | 482 | if is_encrypted: 483 | try: 484 | avatar = "https://pbs.twimg.com/" + b64decode( 485 | tweet.find("img", class_="avatar")["src"] 486 | .split("/")[-1] 487 | .encode("utf-8") 488 | ).decode("utf-8") 489 | except: 490 | avatar = "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png" # Fallback avatar 491 | 492 | else: 493 | avatar_tag = tweet.find("img", class_="avatar") 494 | if avatar_tag and avatar_tag.has_attr("src"): 495 | avatar = unquote(avatar_tag["src"]) # Successfully getting avatar 496 | else: 497 | avatar = "https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png" # Fallback avatar 498 | 499 | # Extract profile_id directly from the avatar URL if available 500 | if "profile_images" in avatar: 501 | profile_id = avatar.split("/profile_images/")[1].split("/")[0] 502 | 503 | return { 504 | "name": tweet.find("a", class_="fullname").text.strip(), 505 | "username": tweet.find("a", class_="username").text.strip(), 506 | "profile_id": profile_id, 507 | "avatar": avatar, 508 | } 509 | 510 | def _get_tweet_date(self, tweet): 511 | """ 512 | Extract date from a tweet 513 | 514 | :param tweet: tweet to extract date from 515 | :return: date of tweet 516 | """ 517 | return ( 518 | tweet.find("span", class_="tweet-date") 519 | 
.find("a")["title"] 520 | .split("/")[-1] 521 | .split("#")[0] 522 | if tweet.find("span", class_="tweet-date") 523 | else "" 524 | ) 525 | 526 | def _get_tweet_text(self, tweet): 527 | """ 528 | Extract text from a tweet 529 | 530 | :param tweet: tweet to extract text from 531 | :return: text of tweet 532 | """ 533 | return ( 534 | tweet.find("div", class_="tweet-content media-body") 535 | .text.strip() 536 | .replace("\n", " ") 537 | if tweet.find("div", class_="tweet-content media-body") 538 | else tweet.find("div", class_="quote-text").text.strip().replace("\n", " ") 539 | if tweet.find("div", class_="quote-text") 540 | else "" 541 | ) 542 | 543 | def _get_tweet_link(self, tweet): 544 | """ 545 | Extract link from a tweet 546 | 547 | :param tweet: tweet to extract link from 548 | :return: link of tweet 549 | """ 550 | tweet_date = tweet.find("span", class_="tweet-date") 551 | return "https://twitter.com" + tweet_date.find("a")["href"] if tweet_date else "" 552 | 553 | def _get_external_link(self, tweet): 554 | """ 555 | Extract external link from a tweet 556 | 557 | :param tweet: tweet to extract external link from 558 | :return: external link of tweet 559 | """ 560 | return ( 561 | tweet.find("a", class_="card-container")["href"] 562 | if tweet.find("a", class_="card-container") 563 | else "" 564 | ) 565 | 566 | def _get_replied_to(self, tweet): 567 | """ 568 | Extract the users a tweet is replying to. If the tweet is not a reply, 569 | return an empty list. 570 | 571 | :param tweet: tweet to extract replies from 572 | :return: list of users the tweet is replying to 573 | """ 574 | return ( 575 | [ 576 | user.text.strip() 577 | for user in tweet.find("div", class_="replying-to").find_all("a") 578 | ] 579 | if tweet.find("div", class_="replying-to") 580 | else [] 581 | ) 582 | 583 | def _extract_tweet(self, tweet, is_encrypted): 584 | """ 585 | Extract content from a tweet 586 | 587 | :param tweet: tweet to extract content from 588 | :param is_encrypted: True if instance uses encrypted media 589 | :return: dictionary of content for the tweet 590 | """ 591 | # Replace link text with link 592 | if tweet.find_all("a"): 593 | for link in tweet.find_all("a"): 594 | if "https" in link["href"]: 595 | link.replace_with(link["href"]) 596 | 597 | # Extract the quoted tweet 598 | quoted_tweet = ( 599 | tweet.find("div", class_="quote") 600 | if tweet.find("div", class_="quote") 601 | else None 602 | ) 603 | 604 | # Extract media from the quoted tweet 605 | if quoted_tweet: 606 | deleted = False 607 | if quoted_tweet["class"] == ["quote", "unavailable"]: 608 | deleted = True 609 | ( 610 | quoted_pictures, 611 | quoted_videos, 612 | quoted_gifs, 613 | ) = self._get_quoted_media(quoted_tweet, is_encrypted) 614 | 615 | # Extract media from the tweet 616 | pictures, videos, gifs = self._get_tweet_media(tweet, is_encrypted) 617 | 618 | # Extract the tweet id 619 | link = self._get_tweet_link(tweet) 620 | id = urlparse(link).path.rsplit("/", 1)[-1] 621 | 622 | return { 623 | "id": id, 624 | "link": link, 625 | "text": self._get_tweet_text(tweet), 626 | "user": self._get_user(tweet, is_encrypted), 627 | "date": self._get_tweet_date(tweet), 628 | "is-retweet": tweet.find("div", class_="retweet-header") is not None, 629 | "is-pinned": tweet.find("div", class_="pinned") is not None, 630 | "external-link": self._get_external_link(tweet), 631 | "replying-to": self._get_replied_to(tweet), 632 | "quoted-post": { 633 | "link": self._get_tweet_link(quoted_tweet) if not deleted else "", 634 | "text": 
self._get_tweet_text(quoted_tweet) if not deleted else "", 635 | "user": self._get_user(quoted_tweet, is_encrypted) 636 | if not deleted 637 | else {}, 638 | "date": self._get_tweet_date(quoted_tweet) if not deleted else "", 639 | "pictures": quoted_pictures, 640 | "videos": quoted_videos, 641 | "gifs": quoted_gifs, 642 | } 643 | if quoted_tweet 644 | else {}, 645 | "stats": self._get_tweet_stats(tweet), 646 | "pictures": pictures, 647 | "videos": videos, 648 | "gifs": gifs, 649 | } 650 | 651 | def _check_date_validity(self, date): 652 | """ 653 | Check if a date is valid 654 | 655 | :param date: date to check 656 | :return: True if date is valid 657 | """ 658 | to_return = True 659 | if not match(r"^\d{4}-\d{2}-\d{2}$", date): 660 | to_return = False 661 | try: 662 | year, month, day = [int(number) for number in date.split("-")] 663 | datetime(year=year, month=month, day=day) 664 | except: 665 | to_return = False 666 | 667 | if not ( 668 | datetime(year=2006, month=3, day=21) 669 | < datetime(year=year, month=month, day=day) 670 | <= datetime.now() 671 | ): 672 | to_return = False 673 | 674 | return to_return 675 | 676 | def _search( 677 | self, 678 | term, 679 | mode, 680 | number, 681 | since, 682 | until, 683 | near, 684 | language, 685 | to, 686 | replies, 687 | filters, 688 | exclude, 689 | max_retries, 690 | instance, 691 | ): 692 | """ 693 | Scrape the specified search terms from Nitter 694 | 695 | :param term: term to seach for 696 | :param mode: search mode. 697 | :param number: number of tweets to scrape. 698 | :param since: date to start scraping from. 699 | :param until: date to stop scraping at. 700 | :param near: location to search near. 701 | :param language: language of the tweets. 702 | :param to: user to which the tweets are directed. 703 | :param replies: True if both tweets and replies are needed. 704 | :param filters: list of filters to apply. 705 | :param exclude: list of filters to exclude. 706 | :param max_retries: max retries to scrape a page. 707 | :param instance: Nitter instance to use. 708 | :return: dictionary of tweets and threads for the term. 709 | """ 710 | tweets = {"tweets": [], "threads": []} 711 | if mode == "hashtag": 712 | endpoint = "/search?f=tweets&q=%23" + term 713 | elif mode == "term": 714 | endpoint = "/search?f=tweets&q=" + term 715 | elif mode == "user": 716 | if since or until or filters or exclude or near: 717 | endpoint = f"/{term}/search?f=tweets&q=" 718 | else: 719 | endpoint = f"/{term}" 720 | if replies and not filters: 721 | endpoint += "/with_replies" 722 | else: 723 | raise ValueError("Invalid mode. Use 'term', 'hashtag', or 'user'.") 724 | 725 | self._initialize_session(instance) 726 | 727 | if language: 728 | endpoint += f"+lang%3A{language}" 729 | 730 | if to: 731 | endpoint += f"+to%3A{to}" 732 | 733 | if since: 734 | if self._check_date_validity(since): 735 | endpoint += f"&since={since}" 736 | else: 737 | raise ValueError( 738 | "Invalid 'since' date. Use the YYYY-MM-DD format and make sure the date is valid." 739 | ) 740 | 741 | if until: 742 | if self._check_date_validity(until): 743 | endpoint += f"&until={until}" 744 | else: 745 | raise ValueError( 746 | "Invalid 'until' date. Use the YYYY-MM-DD format and make sure the date is valid." 747 | ) 748 | 749 | if near: 750 | endpoint += f"&near={near}" 751 | 752 | if filters: 753 | for f in filters: 754 | if f not in valid_filters: 755 | raise ValueError( 756 | f"Invalid filter '{f}'. 
Valid filters are: {', '.join(valid_filters)}" 757 | ) 758 | endpoint += f"&f-{f}=on" 759 | 760 | if exclude: 761 | for e in exclude: 762 | if e not in valid_filters: 763 | raise ValueError( 764 | f"Invalid exclusion filter '{e}'. Valid filters are: {', '.join(valid_filters)}" 765 | ) 766 | endpoint += f"&e-{e}=on" 767 | 768 | if mode != "user": 769 | if "?" in endpoint: 770 | endpoint += "&scroll=false" 771 | else: 772 | endpoint += "?scroll=false" 773 | 774 | soup = self._get_page(endpoint, max_retries) 775 | 776 | if soup is None: 777 | return tweets 778 | 779 | is_encrypted = self._is_instance_encrypted() 780 | 781 | already_scraped = set() 782 | 783 | number = float("inf") if number == -1 else number 784 | keep_scraping = True 785 | while keep_scraping: 786 | thread = [] 787 | 788 | for tweet in soup.find_all("div", class_="timeline-item"): 789 | if len(tweet["class"]) == 1: 790 | to_append = self._extract_tweet(tweet, is_encrypted) 791 | # Extract tweets 792 | if len(tweets["tweets"]) + len(tweets["threads"]) < number: 793 | if self._get_tweet_link(tweet) not in already_scraped: 794 | tweets["tweets"].append(to_append) 795 | already_scraped.add(self._get_tweet_link(tweet)) 796 | else: 797 | keep_scraping = False 798 | break 799 | else: 800 | if "thread" in tweet["class"]: 801 | to_append = self._extract_tweet(tweet, is_encrypted) 802 | # Extract threads 803 | if self._get_tweet_link(tweet) not in already_scraped: 804 | thread.append(to_append) 805 | already_scraped.add(self._get_tweet_link(tweet)) 806 | 807 | if len(tweet["class"]) == 3: 808 | tweets["threads"].append(thread) 809 | thread = [] 810 | 811 | logging.info( 812 | f"Current stats for {term}: {len(tweets['tweets'])} tweets, {len(tweets['threads'])} threads..." 813 | ) 814 | if ( 815 | not (since and until) 816 | and not (since) 817 | and len(tweets["tweets"]) + len(tweets["threads"]) >= number 818 | ): 819 | keep_scraping = False 820 | else: 821 | sleep(uniform(1, 2)) 822 | 823 | # Go to the next page 824 | show_more_buttons = soup.find_all("div", class_="show-more") 825 | if soup.find_all("div", class_="show-more"): 826 | if mode == "user": 827 | if since or until: 828 | next_page = ( 829 | f"/{term}/search?" 830 | + show_more_buttons[-1].find("a")["href"].split("?")[-1] 831 | ) 832 | else: 833 | next_page = ( 834 | f"/{term}?" 835 | + show_more_buttons[-1].find("a")["href"].split("?")[-1] 836 | ) 837 | else: 838 | next_page = "/search" + show_more_buttons[-1].find("a")["href"] 839 | soup = self._get_page(next_page, max_retries) 840 | if soup is None: 841 | keep_scraping = False 842 | else: 843 | keep_scraping = False 844 | return tweets 845 | 846 | def _search_dispatch(self, args): 847 | return self._search(*args) 848 | 849 | def get_random_instance(self): 850 | """ 851 | Get a random Nitter instance 852 | 853 | :return: URL of random Nitter instance 854 | """ 855 | return random.choice(self.working_instances) 856 | 857 | def get_tweet_by_id(self, username, tweet_id, instance=None, max_retries=5): 858 | """ 859 | Fetch a tweet by its ID. 860 | 861 | :param username: The username of the tweet. 862 | :param tweet_id: The ID of the tweet to fetch. 863 | :param instance: The specific Nitter instance to use. 864 | :param max_retries: Max retries to scrape a page. Default is 5. 865 | :return: Dictionary of the tweet content. 
866 | """ 867 | if instance: 868 | self._initialize_session(instance) 869 | else: 870 | if not self.working_instances: 871 | raise ValueError("No working instances available.") 872 | self.instance = self.get_random_instance() 873 | 874 | endpoint = f"/{username}/status/{tweet_id}" 875 | soup = self._get_page(endpoint, max_retries) 876 | 877 | if soup is None: 878 | return None 879 | 880 | is_encrypted = self._is_instance_encrypted() 881 | 882 | tweet = soup.find("div", class_="timeline-item") 883 | if tweet: 884 | return self._extract_tweet(tweet, is_encrypted) 885 | else: 886 | logging.warning(f"Tweet with ID {tweet_id} not found.") 887 | return None 888 | 889 | def get_tweets( 890 | self, 891 | terms, 892 | mode="term", 893 | number=-1, 894 | since=None, 895 | until=None, 896 | near=None, 897 | language=None, 898 | to=None, 899 | replies=False, 900 | filters=None, 901 | exclude=None, 902 | max_retries=5, 903 | instance=None, 904 | ): 905 | """ 906 | Scrape the specified term from Nitter 907 | 908 | :param terms: string/s to search for 909 | :param mode: search mode. Default is 'term', can also be 'hashtag' or 'user' 910 | :param number: number of tweets to scrape. Default is -1 (to not set a limit). 911 | :param since: date to start scraping from, formatted as YYYY-MM-DD. Default is None 912 | :param until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None 913 | :param near: near location of the tweets. Default is None (anywhere) 914 | :param language: language of the tweets. Default is None (any language) 915 | :param to: user to which the tweets are directed. Default is None (any user) 916 | :param replies: True if both tweets and replies are needed. If 'filters' or 'exclude' are set, this option will be overridden. Default is False 917 | :param filters: list of filters to apply. Default is None 918 | :param exclude: list of filters to exclude. Default is None 919 | :param max_retries: max retries to scrape a page. Default is 5 920 | :param instance: Nitter instance to use. Default is None 921 | :return: dictionary or array with dictionaries (in case of multiple terms) of the tweets and threads for the provided terms 922 | """ 923 | if type(terms) == str: 924 | term = terms.strip() 925 | 926 | return self._search( 927 | term, 928 | mode, 929 | number, 930 | since, 931 | until, 932 | near, 933 | language, 934 | to, 935 | replies, 936 | filters, 937 | exclude, 938 | max_retries, 939 | instance, 940 | ) 941 | elif len(terms) == 1: 942 | term = terms[0].strip() 943 | 944 | return self._search( 945 | term, 946 | mode, 947 | number, 948 | since, 949 | until, 950 | near, 951 | language, 952 | to, 953 | replies, 954 | filters, 955 | exclude, 956 | max_retries, 957 | instance, 958 | ) 959 | else: 960 | if len(terms) > cpu_count(): 961 | raise ValueError( 962 | f"Too many terms. You can search at most {cpu_count()} terms." 963 | ) 964 | 965 | args = [ 966 | ( 967 | term.strip(), 968 | mode, 969 | number, 970 | since, 971 | until, 972 | near, 973 | language, 974 | to, 975 | replies, 976 | filters, 977 | exclude, 978 | max_retries, 979 | instance, 980 | ) 981 | for term in terms 982 | ] 983 | with Pool(len(terms)) as p: 984 | results = list(p.map(self._search_dispatch, args)) 985 | 986 | return results 987 | 988 | def _profile_info(self, username, max_retries, instance): 989 | """ 990 | Gets the profile information for a user. 991 | 992 | :param username: username of the page to scrape 993 | :param max_retries: max retries to scrape a page. 
Default is 5 994 | :param instance: Nitter instance to use. Default is None 995 | :return: dictionary of the profile's information 996 | """ 997 | self._initialize_session(instance) 998 | username = sub(r"[^A-Za-z0-9_+-:]", "", username) 999 | soup = self._get_page(f"/{username}", max_retries) 1000 | if soup is None: 1001 | return None 1002 | 1003 | is_encrypted = self._is_instance_encrypted() 1004 | # Extract id if the banner exists, no matter if the instance uses base64 or not 1005 | if soup.find("div", class_="profile-banner").find("img") and is_encrypted: 1006 | profile_id = ( 1007 | b64decode( 1008 | soup.find("div", class_="profile-banner") 1009 | .find("img")["src"] 1010 | .split("/enc/")[1] 1011 | .encode("utf-8") 1012 | ) 1013 | .decode("utf-8") 1014 | .split("/profile_banners/")[1] 1015 | .split("/")[0] 1016 | ) 1017 | elif soup.find("div", class_="profile-banner").find("img"): 1018 | profile_id = ( 1019 | unquote(soup.find("div", class_="profile-banner").find("img")["src"]) 1020 | .split("profile_banners/")[1] 1021 | .split("/")[0] 1022 | ) 1023 | else: 1024 | profile_id = "" 1025 | 1026 | # Extract profile image, no matter if the instance uses base64 or not 1027 | if soup.find("a", class_="profile-card-avatar").find("img") and is_encrypted: 1028 | profile_image = "https://" + b64decode( 1029 | soup.find("a", class_="profile-card-avatar") 1030 | .find("img")["src"] 1031 | .split("/enc/")[1] 1032 | .encode("utf-8") 1033 | ).decode("utf-8") 1034 | elif soup.find("a", class_="profile-card-avatar").find("img"): 1035 | profile_image = ( 1036 | "https://" 1037 | + unquote( 1038 | soup.find("a", class_="profile-card-avatar").find("img")["src"] 1039 | ).split("/pic/")[1] 1040 | ) 1041 | else: 1042 | profile_image = "" 1043 | 1044 | icon_container = ( 1045 | soup.find("div", class_="photo-rail-header").find( 1046 | "div", class_="icon-container" 1047 | ) 1048 | if soup.find("div", class_="photo-rail-header") 1049 | else None 1050 | ) 1051 | 1052 | return { 1053 | "image": profile_image, 1054 | "name": soup.find("a", class_="profile-card-fullname").text.strip(), 1055 | "username": soup.find("a", class_="profile-card-username").text.strip(), 1056 | "id": profile_id, 1057 | "bio": soup.find("div", class_="profile-bio").p.text.strip() 1058 | if soup.find("div", class_="profile-bio") 1059 | else "", 1060 | "location": soup.find("div", class_="profile-location") 1061 | .find_all("span")[-1] 1062 | .text.strip() 1063 | if soup.find("div", class_="profile-location") 1064 | else "", 1065 | "website": soup.find("div", class_="profile-website").find("a")["href"] 1066 | if soup.find("div", class_="profile-website") 1067 | else "", 1068 | "joined": soup.find("div", class_="profile-joindate").find("span")["title"], 1069 | "stats": { 1070 | "tweets": int( 1071 | soup.find("ul", class_="profile-statlist") 1072 | .find("li", class_="posts") 1073 | .find_all("span")[1] 1074 | .text.strip() 1075 | .replace(",", "") 1076 | ), 1077 | "following": int( 1078 | soup.find("ul", class_="profile-statlist") 1079 | .find("li", class_="following") 1080 | .find_all("span")[1] 1081 | .text.strip() 1082 | .replace(",", "") 1083 | ), 1084 | "followers": int( 1085 | soup.find("ul", class_="profile-statlist") 1086 | .find("li", class_="followers") 1087 | .find_all("span")[1] 1088 | .text.strip() 1089 | .replace(",", "") 1090 | ), 1091 | "likes": int( 1092 | soup.find("ul", class_="profile-statlist") 1093 | .find("li", class_="likes") 1094 | .find_all("span")[1] 1095 | .text.strip() 1096 | .replace(",", "") 1097 | ), 1098 | 
"media": int( 1099 | icon_container.text.strip().replace(",", "").split(" ")[0] 1100 | if icon_container 1101 | else 0 1102 | ), 1103 | }, 1104 | } 1105 | 1106 | def _search_profile_dispatch(self, args): 1107 | return self.get_profile_info(*args) 1108 | 1109 | def get_profile_info(self, username, max_retries=5, instance=None, mode='simple'): 1110 | """ 1111 | Get profile information for a user or a list of users 1112 | 1113 | :param username: username/s of the page to scrape (str or list of str) 1114 | :param max_retries: max retries to scrape a page. Default is 5 1115 | :param instance: Nitter instance to use. Default is None 1116 | :param mode: Mode of fetching profile info. 'simple' for basic info, 'detail' for detailed info including following and followers lists. Default is 'simple' 1117 | :return: dictionary of the profile's information or list of dictionaries if username is a list. The dictionary contains the following keys: 1118 | - image: URL of the profile image 1119 | - name: Full name of the user 1120 | - username: Username of the user 1121 | - id: Profile ID 1122 | - bio: Bio of the user 1123 | - location: Location of the user 1124 | - website: Website URL of the user 1125 | - joined: Date when the user joined 1126 | - stats: Dictionary containing the following keys: 1127 | - tweets: Number of tweets 1128 | - following: Number of users the profile is following 1129 | - followers: Number of followers 1130 | - likes: Number of likes 1131 | - media: Number of media posts 1132 | - following_list: List of usernames the profile is following (only in 'detail' mode) 1133 | - followers_list: List of usernames following the profile (only in 'detail' mode) 1134 | """ 1135 | 1136 | def _get_follow_list(endpoint): 1137 | follow_list = [] 1138 | cursor = None 1139 | while True: 1140 | url = f"{endpoint}?cursor={cursor}" if cursor else endpoint 1141 | soup = self._get_page(url, max_retries) 1142 | if not soup: 1143 | break 1144 | users = [user.text.strip() for user in soup.find_all("a", class_="username")] 1145 | if not users: 1146 | break 1147 | follow_list.extend(users) 1148 | load_more = soup.find("div", class_="show-more") 1149 | if load_more and load_more.find("a"): 1150 | cursor = load_more.find("a")["href"].split("cursor=")[-1] 1151 | else: 1152 | break 1153 | return follow_list 1154 | 1155 | if isinstance(username, str): 1156 | username = username.strip() 1157 | profile_info = self._profile_info(username, max_retries, instance) 1158 | if profile_info and mode == 'detail': 1159 | profile_info["following_list"] = _get_follow_list(f"/{username}/following") 1160 | profile_info["followers_list"] = _get_follow_list(f"/{username}/followers") 1161 | return profile_info 1162 | elif len(username) == 1: 1163 | username = username[0].strip() 1164 | profile_info = self._profile_info(username, max_retries, instance) 1165 | if profile_info and mode == 'detail': 1166 | profile_info["following_list"] = _get_follow_list(f"/{username}/following") 1167 | profile_info["followers_list"] = _get_follow_list(f"/{username}/followers") 1168 | return profile_info 1169 | else: 1170 | if len(username) > cpu_count(): 1171 | raise ValueError(f"Too many usernames. 
You can use at most {cpu_count()} usernames.") 1172 | 1173 | args = [(user.strip(), max_retries, instance, mode) for user in username] 1174 | with Pool(len(username)) as p: 1175 | results = list(p.map(self._search_profile_dispatch, args)) 1176 | return results 1177 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.1 2 | requests==2.28.1 3 | setuptools==65.5.0 4 | lxml==4.9.2 5 | tqdm==4.66.1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | from os import path 4 | 5 | HERE = path.abspath(path.dirname(__file__)) 6 | 7 | with open(path.join(HERE, 'README.md'), encoding='utf-8') as f: 8 | long_description = f.read() 9 | 10 | setup( 11 | name="ntscraper", 12 | version="0.4.0", 13 | description="Unofficial library to scrape Twitter profiles and posts from Nitter instances", 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | project_urls={ 17 | 'Homepage': 'https://github.com/bocchilorenzo/ntscraper', 18 | 'Source': 'https://github.com/bocchilorenzo/ntscraper', 19 | 'Documentation': 'https://github.com/bocchilorenzo/ntscraper' 20 | }, 21 | keywords=["twitter", "nitter", "scraping"], 22 | author="Lorenzo Bocchi", 23 | author_email="lorenzobocchi99@yahoo.com", 24 | license="MIT", 25 | classifiers=[ 26 | "Intended Audience :: Developers", 27 | "License :: OSI Approved :: MIT License", 28 | "Programming Language :: Python", 29 | "Programming Language :: Python :: 3", 30 | "Programming Language :: Python :: 3.7", 31 | "Programming Language :: Python :: 3.9", 32 | "Programming Language :: Python :: 3.10", 33 | "Operating System :: OS Independent" 34 | ], 35 | packages=["ntscraper"], 36 | include_package_data=True, 37 | install_requires=["requests>=2.28", "beautifulsoup4>=4.11", "lxml>=4.9", "tqdm>=4.66"], 38 | ) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bocchilorenzo/ntscraper/26c87edf8ed31472debe6c929dfa4d64c689102a/tests/__init__.py -------------------------------------------------------------------------------- /tests/instances_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestProfile(unittest.TestCase): 5 | def get_instances(self): 6 | """ 7 | Test retrieval of instances. Should only return updated instances. 
8 | """ 9 | nitter = Nitter() 10 | instances = nitter.__get_instances() 11 | self.assertGreater(len(instances), 0) -------------------------------------------------------------------------------- /tests/profile_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestProfile(unittest.TestCase): 5 | def scrape_profile_info(self): 6 | """ 7 | Test scraping profile info of a username (Twitter, we need a stable username) 8 | """ 9 | nitter = Nitter() 10 | profile = nitter.get_profile_info("Twitter") 11 | self.assertEqual(profile['name'], "Twitter") 12 | self.assertEqual(profile['username'], "@Twitter") 13 | self.assertEqual(profile['bio'], "What's happening?!") 14 | self.assertEqual(profile['location'], 'everywhere') 15 | self.assertEqual(profile['website'], 'https://about.twitter.com/') 16 | self.assertEqual(profile['joined'], '2:35 PM - 20 Feb 2007') 17 | self.assertGreater(profile['stats']['tweets'], 0) 18 | self.assertGreater(profile['stats']['following'], 0) 19 | self.assertGreater(profile['stats']['followers'], 0) 20 | self.assertGreater(profile['stats']['likes'], 0) 21 | self.assertGreater(profile['stats']['media'], 0) 22 | self.assertEqual(profile['image'], 'https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_400x400.jpg') 23 | 24 | def scrape_profile_tweets(self): 25 | """ 26 | Test scraping profile tweets of a username (Twitter, we need a stable username) 27 | """ 28 | nitter = Nitter() 29 | tweets = nitter.get_tweets("Twitter", 'user') 30 | self.assertGreater(len(tweets['tweets']), 0) 31 | 32 | def scrape_profile_tweets_since(self): 33 | """ 34 | Test scraping profile tweets of a username (Twitter, we need a stable username) in a certain time period 35 | """ 36 | nitter = Nitter() 37 | tweets = nitter.get_tweets("Twitter", mode='user', since='2022-12-01', until='2022-12-31', number=1) 38 | self.assertGreater(len(tweets['threads']), 1) -------------------------------------------------------------------------------- /tests/search_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestSearch(unittest.TestCase): 5 | def scrape_term(self): 6 | """ 7 | Test scraping a term 8 | """ 9 | nitter = Nitter() 10 | tweets = nitter.get_tweets("Twitter", 'term') 11 | self.assertGreater(len(tweets['tweets']), 0) 12 | 13 | def test_scrape_user(self): 14 | """ 15 | Test scraping a user 16 | """ 17 | nitter = Nitter() 18 | tweets = nitter.get_tweets("X", mode='user', number=10) 19 | self.assertGreater(len(tweets['tweets']), 0) 20 | 21 | def scrape_hashtag(self): 22 | """ 23 | Test scraping a hashtag 24 | """ 25 | nitter = Nitter() 26 | tweets = nitter.get_tweets("twitter", 'hashtag') 27 | self.assertGreater(len(tweets['tweets']), 0) 28 | 29 | def random_instance(self): 30 | """ 31 | Test whether a random instance is returned 32 | """ 33 | nitter = Nitter() 34 | self.assertIsNotNone(nitter.get_random_instance()) -------------------------------------------------------------------------------- /tests/tweet_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | 5 | class TestGetTweetById(unittest.TestCase): 6 | def test_get_tweet_by_id(self): 7 | """ 8 | Test fetching a tweet by its ID. 
9 |         """
10 |         nitter = Nitter()
11 |         tweet = nitter.get_tweet_by_id("X", "1824507305389592885", instance="https://nt.vern.cc")
12 |         self.assertIsNotNone(tweet, "Tweet should not be None")
13 |         self.assertEqual(tweet['user']['username'], "@X", "Username should match the expected username")
14 |         self.assertEqual(tweet['text'], "since it’s friday, let’s have some fun! comment with a @grok generated pic"
15 |                          " that describes your entire personality 👹")
16 |         self.assertEqual(tweet['date'], "Aug 16, 2024 · 6:03 PM UTC", "Date should match the expected date")
17 |         self.assertGreaterEqual(tweet['stats']['likes'], 3471, "Likes count should be greater than or equal to 3471")
18 |         self.assertGreaterEqual(tweet['stats']['retweets'], 303, "Retweets count should be greater than or equal to 303")
19 |         self.assertGreaterEqual(tweet['stats']['comments'], 2000, "Comments count should be greater than or equal to 2000"
20 |         )
21 | 
22 | 
23 | if __name__ == '__main__':
24 |     unittest.main()
25 | --------------------------------------------------------------------------------