├── tests ├── __init__.py ├── instances_test.py ├── search_test.py └── profile_test.py ├── ntscraper ├── __init__.py └── nitter.py ├── .gitattributes ├── requirements.txt ├── LICENSE.txt ├── setup.py ├── README.md └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ntscraper/__init__.py: -------------------------------------------------------------------------------- 1 | from .nitter import Nitter -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.1 2 | requests==2.28.1 3 | setuptools==65.5.0 4 | lxml==4.9.2 -------------------------------------------------------------------------------- /tests/instances_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestInstances(unittest.TestCase): 5 | def test_get_instances(self): 6 | """ 7 | Test retrieval of instances. Should only return updated instances. 8 | """ 9 | nitter = Nitter() 10 | instances = nitter._Nitter__get_instances() # name-mangled access to the private method 11 | self.assertGreater(len(instances), 0) -------------------------------------------------------------------------------- /tests/search_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestSearch(unittest.TestCase): 5 | def test_scrape_term(self): 6 | """ 7 | Test scraping a term 8 | """ 9 | nitter = Nitter() 10 | tweets = nitter.get_tweets("Twitter", 'term') 11 | self.assertGreater(len(tweets['tweets']), 0) 12 | 13 | def test_scrape_hashtag(self): 14 | """ 15 | Test scraping a hashtag 16 | """ 17 | nitter = Nitter() 18 | tweets = nitter.get_tweets("twitter", 'hashtag') 19 | self.assertGreater(len(tweets['tweets']), 0) 20 | 21 | def test_random_instance(self): 22 | """ 23 | Test whether a random instance is returned 24 | """ 25 | nitter = Nitter() 26 | self.assertIsNotNone(nitter.get_random_instance()) -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lorenzo Bocchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | from os import path 4 | 5 | HERE = path.abspath(path.dirname(__file__)) 6 | 7 | with open(path.join(HERE, 'README.md'), encoding='utf-8') as f: 8 | long_description = f.read() 9 | 10 | setup( 11 | name="ntscraper", 12 | version="0.1.12", 13 | description="Unofficial library to scrape Twitter profiles and posts from Nitter instances", 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | project_urls={ 17 | 'Homepage': 'https://github.com/bocchilorenzo/ntscraper', 18 | 'Source': 'https://github.com/bocchilorenzo/ntscraper', 19 | 'Documentation': 'https://github.com/bocchilorenzo/ntscraper' 20 | }, 21 | keywords=["twitter", "nitter", "scraping"], 22 | author="Lorenzo Bocchi", 23 | author_email="lorenzobocchi99@yahoo.com", 24 | license="MIT", 25 | classifiers=[ 26 | "Intended Audience :: Developers", 27 | "License :: OSI Approved :: MIT License", 28 | "Programming Language :: Python", 29 | "Programming Language :: Python :: 3", 30 | "Programming Language :: Python :: 3.7", 31 | "Programming Language :: Python :: 3.9", 32 | "Programming Language :: Python :: 3.10", 33 | "Operating System :: OS Independent" 34 | ], 35 | packages=["ntscraper"], 36 | include_package_data=True, 37 | install_requires=["requests", "beautifulsoup4", "lxml"], 38 | ) -------------------------------------------------------------------------------- /tests/profile_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ntscraper import Nitter 3 | 4 | class TestProfile(unittest.TestCase): 5 | def test_scrape_profile_info(self): 6 | """ 7 | Test scraping profile info of a username (Twitter, we need a stable username) 8 | """ 9 | nitter = Nitter() 10 | profile = nitter.get_profile_info("Twitter") 11 | self.assertEqual(profile['name'], "Twitter") 12 | self.assertEqual(profile['username'], "@Twitter") 13 | self.assertEqual(profile['bio'], "What's happening?!") 14 | self.assertEqual(profile['location'], 'everywhere') 15 | self.assertEqual(profile['website'], 'https://about.twitter.com/') 16 | self.assertEqual(profile['joined'], '2:35 PM - 20 Feb 2007') 17 | self.assertGreater(profile['stats']['tweets'], 0) 18 | self.assertGreater(profile['stats']['following'], 0) 19 | self.assertGreater(profile['stats']['followers'], 0) 20 | self.assertGreater(profile['stats']['likes'], 0) 21 | self.assertGreater(profile['stats']['media'], 0) 22 | self.assertEqual(profile['image'], 'https://pbs.twimg.com/profile_images/1488548719062654976/u6qfBBkF_400x400.jpg') 23 | 24 | def test_scrape_profile_tweets(self): 25 | """ 26 | Test scraping profile tweets of a username (Twitter, we need a stable username) 27 | """ 28 | nitter = Nitter() 29 | tweets = nitter.get_tweets("Twitter", 'user') 30 | self.assertGreater(len(tweets['tweets']), 0) 31 | 32 | def test_scrape_profile_tweets_since(self): 33 | """ 34 | Test scraping profile tweets of a username (Twitter, we need a stable username) in a certain time period 35 | """ 36 | nitter = Nitter() 37 | tweets = 
nitter.get_tweets("Twitter", mode='user', since='2022-12-01', until='2022-12-31', number=1) 38 | self.assertGreater(len(tweets['threads']), 1) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unofficial Nitter scraper 2 | 3 | This is a simple library to scrape Nitter instances for tweets. It can: 4 | 5 | - search and scrape tweets with a certain term 6 | 7 | - search and scrape tweets with a certain hashtag 8 | 9 | - scrape tweets from a user profile 10 | 11 | - get profile information of a user, such as display name, username, number of tweets, profile picture ... 12 | 13 | If the instance to use is not provided to the scraper, it will use a random instance among those listed as "online" and "working" in https://github.com/zedeus/nitter/wiki/Instances. 14 | 15 | --- 16 | 17 | ## Installation 18 | 19 | ``` 20 | pip install ntscraper 21 | ``` 22 | 23 | ## How to use 24 | 25 | First, initialize the library: 26 | 27 | ``` 28 | from ntscraper import Nitter 29 | 30 | scraper = Nitter(log_level=1) 31 | ``` 32 | The valid logging levels are: 33 | - None = no logs 34 | - 0 = only warning and error logs 35 | - 1 = previous + informational logs (default) 36 | 37 | Then, choose the proper function for what you want to do from the following. 38 | 39 | ### Scrape tweets 40 | 41 | ``` 42 | github_hash_tweets = scraper.get_tweets("github", mode='hashtag') 43 | 44 | bezos_tweets = scraper.get_tweets("JeffBezos", mode='user') 45 | ``` 46 | 47 | Parameters: 48 | - term: search term 49 | - mode: modality to scrape the tweets. Default is 'term' which will look for tweets containing the search term. Other modes are 'hashtag' to search for a hashtag and 'user' to scrape tweets from a user profile 50 | - number: number of tweets to scrape. Default is 5. If 'since' is specified, this is bypassed. 51 | - since: date to start scraping from, formatted as YYYY-MM-DD. Default is None 52 | - until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None 53 | - max_retries: max retries to scrape a page. Default is 5 54 | - instance: Nitter instance to use. Default is None and will be chosen at random 55 | 56 | Returns a dictionary with tweets and threads for the term. 57 | 58 | ### Get profile information 59 | 60 | ``` 61 | bezos_information = scraper.get_profile_info("JeffBezos") 62 | ``` 63 | 64 | Parameters: 65 | - username: username of the page to scrape 66 | - max_retries: max retries to scrape a page. Default is 5 67 | - instance: Nitter instance to use. Default is None 68 | 69 | Returns a dictionary of the profile's information. 70 | 71 | ### Get random Nitter instance 72 | 73 | ``` 74 | random_instance = scraper.get_random_instance() 75 | ``` 76 | 77 | Returns a random Nitter instance. 78 | 79 | ## Note 80 | 81 | Due to recent changes on Twitter's side, some Nitter instances may not work properly even if they are marked as "working" on Nitter's wiki. If you have trouble scraping with a certain instance, try changing it and check if the problem persists. 
82 | 83 | ## To do list 84 | 85 | - [ ] Add scraping of individual posts with comments -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 152 | #.idea/ 153 | 154 | .vscode/ 155 | 156 | test.py -------------------------------------------------------------------------------- /ntscraper/nitter.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import random 4 | from urllib.parse import unquote 5 | from time import sleep 6 | from base64 import b64decode 7 | from random import uniform 8 | from re import match 9 | from datetime import datetime 10 | import logging 11 | 12 | 13 | class Nitter: 14 | def __init__(self, log_level=1): 15 | """ 16 | Nitter scraper 17 | 18 | :param log_level: logging level. Default 1 19 | """ 20 | self.instances = self.__get_instances() 21 | if log_level == 0: 22 | log_level = logging.WARNING 23 | elif log_level == 1: 24 | log_level = logging.INFO 25 | elif log_level: 26 | raise ValueError("Invalid log level") 27 | 28 | logging.basicConfig(level=log_level, format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S') 29 | 30 | self.retry_count = 0 31 | self.cooldown_count = 0 32 | self.session_reset = False 33 | self.instance = "" 34 | 35 | def __initialize_session(self, instance): 36 | """ 37 | Initialize the requests session 38 | """ 39 | if instance is None: 40 | self.instance = self.get_random_instance() 41 | logging.info(f"No instance specified, using random instance {self.instance}") 42 | else: 43 | self.instance = instance 44 | self.r = requests.Session() 45 | self.r.headers.update( 46 | { 47 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0" 48 | } 49 | ) 50 | 51 | def __is_instance_encrypted(self): 52 | """ 53 | Check if the current instance uses encrypted media 54 | 55 | :return: True if encrypted, False otherwise 56 | """ 57 | soup = self.__get_page("/x") 58 | 59 | if soup is None: 60 | raise ValueError("Invalid instance") 61 | 62 | if ( 63 | soup.find("a", class_="profile-card-avatar").find("img") 64 | and "/enc/" 65 | in soup.find("a", class_="profile-card-avatar").find("img")["src"] 66 | ): 67 | return True 68 | else: 69 | return False 70 | 71 | def __get_instances(self): 72 | """ 73 | Fetch the list of clear web Nitter instances from the wiki 74 | 75 | :return: list of Nitter instances, or None if lookup failed 76 | """ 77 | r = requests.get("https://github.com/zedeus/nitter/wiki/Instances") 78 | instance_list = [] 79 | if r.ok: 80 | soup = BeautifulSoup(r.text, 
"lxml") 81 | official = soup.find_all("tbody")[0] 82 | instance_list.append(official.find("a")["href"]) 83 | table = soup.find_all("tbody")[1] 84 | for instance in table.find_all("tr"): 85 | columns = instance.find_all("td") 86 | if ( 87 | columns[1].text.strip() == "✅" 88 | ) and ( 89 | columns[2].text.strip() == "✅" 90 | ): 91 | url = instance.find("a")["href"] 92 | if not url.endswith(".onion"): 93 | instance_list.append(url) 94 | return instance_list 95 | else: 96 | return None 97 | 98 | def __get_new_instance(self, message): 99 | instance = self.get_random_instance() 100 | logging.warning(f"{message}. Trying {instance}") 101 | return instance 102 | 103 | def __get_page(self, endpoint, max_retries=5): 104 | """ 105 | Download page from Nitter instance 106 | 107 | :param endpoint: endpoint to use 108 | :param max_retries: max number of retries, default 5 109 | :return: page content, or None if max retries reached 110 | """ 111 | keep_trying = True 112 | soup = None 113 | while keep_trying and (self.retry_count < max_retries): 114 | try: 115 | self.r = requests.get( 116 | self.instance + endpoint, cookies={"hlsPlayback": "on", "infiniteScroll": ""}, timeout=5 117 | ) 118 | except: 119 | self.__initialize_session(instance = self.__get_new_instance(f"{self.instance} unreachable")) 120 | self.retry_count += 1 121 | self.cooldown_count = 0 122 | self.session_reset = True 123 | sleep(1) 124 | continue 125 | if self.r.ok: 126 | self.session_reset = False 127 | soup = BeautifulSoup(self.r.text, "lxml") 128 | if not soup.find( 129 | lambda tag: tag.name == "div" 130 | and (tag.get("class") == ["timeline-item"] or tag.get("class") == ["timeline-item", "thread"]) 131 | ): 132 | bottom_page = soup.find_all("div", class_="show-more") 133 | if bottom_page and bottom_page[-1].find("a").text == "Load newest": 134 | keep_trying = False 135 | soup = None 136 | else: 137 | self.__initialize_session(self.__get_new_instance(f"Empty profile on {self.instance}")) 138 | self.retry_count += 1 139 | else: 140 | keep_trying = False 141 | else: 142 | if "cursor" in endpoint: 143 | if not self.session_reset: 144 | logging.warning("Cooldown reached, trying again in 20 seconds") 145 | self.cooldown_count += 1 146 | sleep(20) 147 | if self.cooldown_count >= 5 and not self.session_reset: 148 | self.__initialize_session() 149 | self.session_reset = True 150 | self.cooldown_count = 0 151 | elif self.session_reset: 152 | self.__initialize_session(self.__get_new_instance(f"Error fetching {self.instance}")) 153 | else: 154 | self.cooldown_count = 0 155 | self.__initialize_session(self.__get_new_instance(f"Error fetching {self.instance}")) 156 | self.retry_count += 1 157 | sleep(2) 158 | current_retry_count = self.retry_count 159 | self.retry_count = 0 160 | if current_retry_count >= max_retries: 161 | logging.warning("Max retries reached. 
Check your request and try again.") 162 | return None 163 | 164 | return soup 165 | 166 | def __get_quoted_media(self, quoted_tweet, is_encrypted): 167 | """ 168 | Extract media from a quoted tweet 169 | 170 | :param quoted_tweet: tweet to extract media from 171 | :param is_encrypted: True if instance uses encrypted media 172 | :return: lists of images, videos and gifs, or empty lists if no media is found 173 | """ 174 | quoted_pictures, quoted_videos, quoted_gifs = [], [], [] 175 | if quoted_tweet.find("div", class_="attachments"): 176 | if is_encrypted: 177 | quoted_pictures = [ 178 | "https://pbs.twimg.com/" 179 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 180 | .decode("utf-8") 181 | .split("?")[0] 182 | for img in quoted_tweet.find("div", class_="attachments").find_all( 183 | "img" 184 | ) 185 | ] 186 | quoted_videos = [ 187 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 188 | "utf-8" 189 | ) if "data-url" in video.attrs 190 | else video.find("source")["src"] 191 | for video in quoted_tweet.find( 192 | "div", class_="attachments" 193 | ).find_all("video", class_="") 194 | ] 195 | quoted_gifs = [ 196 | "https://" 197 | + b64decode( 198 | gif.source["src"].split("/")[-1].encode("utf-8") 199 | ).decode("utf-8") 200 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 201 | "video", class_="gif" 202 | ) 203 | ] 204 | else: 205 | quoted_pictures = [ 206 | "https://pbs.twimg.com" 207 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 208 | for img in quoted_tweet.find("div", class_="attachments").find_all( 209 | "img" 210 | ) 211 | ] 212 | quoted_videos = [ 213 | unquote("https" + video["data-url"].split("https")[1]) 214 | if "data-url" in video.attrs 215 | else unquote(video.find("source")["src"]) 216 | for video in quoted_tweet.find( 217 | "div", class_="attachments" 218 | ).find_all("video", class_="") 219 | ] 220 | quoted_gifs = [ 221 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 222 | for gif in quoted_tweet.find("div", class_="attachments").find_all( 223 | "video", class_="gif" 224 | ) 225 | ] 226 | return quoted_pictures, quoted_videos, quoted_gifs 227 | 228 | def __get_tweet_media(self, tweet, is_encrypted): 229 | """ 230 | Extract media from a tweet 231 | 232 | :param tweet: tweet to extract media from 233 | :param is_encrypted: True if instance uses encrypted media 234 | :return: lists of images, videos and gifs, or empty lists if no media is found 235 | """ 236 | pictures, videos, gifs = [], [], [] 237 | if tweet.find("div", class_="tweet-body").find( 238 | "div", class_="attachments", recursive=False 239 | ): 240 | if is_encrypted: 241 | pictures = [ 242 | "https://pbs.twimg.com/" 243 | + b64decode(img["src"].split("/")[-1].encode("utf-8")) 244 | .decode("utf-8") 245 | .split("?")[0] 246 | for img in tweet.find("div", class_="tweet-body") 247 | .find("div", class_="attachments", recursive=False) 248 | .find_all("img") 249 | ] 250 | videos = [ 251 | b64decode(video["data-url"].split("/")[-1].encode("utf-8")).decode( 252 | "utf-8" 253 | ) if "data-url" in video.attrs 254 | else video.find("source")["src"] 255 | for video in tweet.find("div", class_="tweet-body") 256 | .find("div", class_="attachments", recursive=False) 257 | .find_all("video", class_="") 258 | ] 259 | gifs = [ 260 | "https://" 261 | + b64decode( 262 | gif.source["src"].split("/")[-1].encode("utf-8") 263 | ).decode("utf-8") 264 | for gif in tweet.find("div", class_="tweet-body") 265 | .find("div", class_="attachments", recursive=False) 266 | 
.find_all("video", class_="gif") 267 | ] 268 | else: 269 | pictures = [ 270 | "https://pbs.twimg.com" 271 | + unquote(img["src"].split("/pic")[1]).split("?")[0] 272 | for img in tweet.find("div", class_="tweet-body") 273 | .find("div", class_="attachments", recursive=False) 274 | .find_all("img") 275 | ] 276 | videos = [ 277 | unquote("https" + video["data-url"].split("https")[1]) 278 | if "data-url" in video.attrs 279 | else video.find("source")["src"] 280 | for video in tweet.find("div", class_="tweet-body") 281 | .find("div", class_="attachments", recursive=False) 282 | .find_all("video", class_="") 283 | ] 284 | gifs = [ 285 | unquote("https://" + gif.source["src"].split("/pic/")[1]) 286 | for gif in tweet.find("div", class_="tweet-body") 287 | .find("div", class_="attachments", recursive=False) 288 | .find_all("video", class_="gif") 289 | ] 290 | return pictures, videos, gifs 291 | 292 | def __get_tweet_stats(self, tweet): 293 | """ 294 | Extract stats from a tweet 295 | 296 | :param tweet: tweet to extract stats from 297 | :return: dictionary of stats. If a stat is not found, it is set to 0 298 | """ 299 | return { 300 | "comments": int( 301 | tweet.find_all("span", class_="tweet-stat")[0] 302 | .find("div") 303 | .text.strip() 304 | .replace(",", "") 305 | or 0 306 | ), 307 | "retweets": int( 308 | tweet.find_all("span", class_="tweet-stat")[1] 309 | .find("div") 310 | .text.strip() 311 | .replace(",", "") 312 | or 0 313 | ), 314 | "quotes": int( 315 | tweet.find_all("span", class_="tweet-stat")[2] 316 | .find("div") 317 | .text.strip() 318 | .replace(",", "") 319 | or 0 320 | ), 321 | "likes": int( 322 | tweet.find_all("span", class_="tweet-stat")[3] 323 | .find("div") 324 | .text.strip() 325 | .replace(",", "") 326 | or 0 327 | ), 328 | } 329 | 330 | def __get_user(self, tweet, is_encrypted): 331 | """ 332 | Extract user from a tweet 333 | 334 | :param tweet: tweet to extract user from 335 | :param is_encrypted: True if instance uses encrypted media 336 | :return: dictionary of user 337 | """ 338 | if is_encrypted: 339 | avatar = "https://pbs.twimg.com/" + b64decode( 340 | tweet.find("img", class_="avatar")["src"].split("/")[-1].encode("utf-8") 341 | ).decode("utf-8") 342 | else: 343 | avatar = "https://pbs.twimg.com" + unquote( 344 | tweet.find("img", class_="avatar")["src"].split("/pic")[1] 345 | ) 346 | return { 347 | "name": tweet.find("a", class_="fullname").text.strip(), 348 | "username": tweet.find("a", class_="username").text.strip(), 349 | "avatar": avatar, 350 | } 351 | 352 | def __get_tweet_date(self, tweet): 353 | """ 354 | Extract date from a tweet 355 | 356 | :param tweet: tweet to extract date from 357 | :return: date of tweet 358 | """ 359 | return ( 360 | tweet.find("span", class_="tweet-date") 361 | .find("a")["title"] 362 | .split("/")[-1] 363 | .split("#")[0] 364 | ) 365 | 366 | def __get_tweet_text(self, tweet): 367 | """ 368 | Extract text from a tweet 369 | 370 | :param tweet: tweet to extract text from 371 | :return: text of tweet 372 | """ 373 | return ( 374 | tweet.find("div", class_="tweet-content media-body") 375 | .text.strip() 376 | .replace("\n", " ") 377 | if tweet.find("div", class_="tweet-content media-body") 378 | else tweet.find("div", class_="quote-text").text.strip().replace("\n", " ") 379 | if tweet.find("div", class_="quote-text") else "" 380 | ) 381 | 382 | def __get_tweet_link(self, tweet): 383 | """ 384 | Extract link from a tweet 385 | 386 | :param tweet: tweet to extract link from 387 | :return: link of tweet 388 | """ 389 | return 
"https://twitter.com" + tweet.find("a")["href"] 390 | 391 | def __get_external_link(self, tweet): 392 | """ 393 | Extract external link from a tweet 394 | 395 | :param tweet: tweet to extract external link from 396 | :return: external link of tweet 397 | """ 398 | return ( 399 | tweet.find("a", class_="card-container")["href"] 400 | if tweet.find("a", class_="card-container") 401 | else "" 402 | ) 403 | 404 | def __extract_tweet(self, tweet, is_encrypted): 405 | """ 406 | Extract content from a tweet 407 | 408 | :param tweet: tweet to extract content from 409 | :param is_encrypted: True if instance uses encrypted media 410 | :return: dictionary of content for the tweet 411 | """ 412 | # Replace link text with link 413 | if tweet.find_all("a"): 414 | for link in tweet.find_all("a"): 415 | if "https" in link["href"]: 416 | link.replace_with(link["href"]) 417 | 418 | # Extract the quoted tweet 419 | quoted_tweet = ( 420 | tweet.find("div", class_="quote") 421 | if tweet.find("div", class_="quote") 422 | else None 423 | ) 424 | 425 | # Extract media from the quoted tweet 426 | if quoted_tweet: 427 | deleted = False 428 | if quoted_tweet["class"] == ["quote", "unavailable"]: 429 | deleted = True 430 | ( 431 | quoted_pictures, 432 | quoted_videos, 433 | quoted_gifs, 434 | ) = self.__get_quoted_media(quoted_tweet, is_encrypted) 435 | 436 | # Extract media from the tweet 437 | pictures, videos, gifs = self.__get_tweet_media(tweet, is_encrypted) 438 | 439 | return { 440 | "link": self.__get_tweet_link(tweet), 441 | "text": self.__get_tweet_text(tweet), 442 | "user": self.__get_user(tweet, is_encrypted), 443 | "date": self.__get_tweet_date(tweet), 444 | "is-retweet": tweet.find("div", class_="retweet-header") 445 | is not None, 446 | "external-link": self.__get_external_link(tweet), 447 | "quoted-post": { 448 | "link": self.__get_tweet_link(quoted_tweet) if not deleted else "", 449 | "text": self.__get_tweet_text(quoted_tweet) if not deleted else "", 450 | "user": self.__get_user(quoted_tweet, is_encrypted) if not deleted else {}, 451 | "date": self.__get_tweet_date(quoted_tweet) if not deleted else "", 452 | "pictures": quoted_pictures, 453 | "videos": quoted_videos, 454 | "gifs": quoted_gifs, 455 | } 456 | if quoted_tweet 457 | else {}, 458 | "stats": self.__get_tweet_stats(tweet), 459 | "pictures": pictures, 460 | "videos": videos, 461 | "gifs": gifs, 462 | } 463 | 464 | def __check_date_validity(self, date): 465 | """ 466 | Check if a date is valid 467 | 468 | :param date: date to check 469 | :return: True if date is valid 470 | """ 471 | to_return = True 472 | if not match(r"^\d{4}-\d{2}-\d{2}$", date): 473 | to_return = False 474 | try: 475 | year, month, day = [int(number) for number in date.split("-")] 476 | datetime(year=year,month=month,day=day) 477 | except: 478 | to_return = False 479 | 480 | if not (datetime(year=2006, month=3, day=21) < datetime(year=year,month=month,day=day) <= datetime.now()): 481 | to_return = False 482 | 483 | return to_return 484 | 485 | def __search(self, term, mode, number, since, until, max_retries, instance): 486 | """ 487 | Scrape the specified search terms from Nitter 488 | 489 | :param term: term to seach for 490 | :param number: number of tweets to scrape. 491 | :param since: date to start scraping from. 492 | :param until: date to stop scraping at. 493 | :param max_retries: max retries to scrape a page. 494 | :param instance: Nitter instance to use. 495 | :param mode: search mode. 496 | :return: dictionary of tweets and threads for the term. 
497 | """ 498 | tweets = {"tweets": [], "threads": []} 499 | if mode == "hashtag": 500 | endpoint = "/search?f=tweets&q=%23" + term 501 | elif mode == "term": 502 | endpoint = "/search?f=tweets&q=" + term 503 | elif mode == "user": 504 | if since or until: 505 | endpoint = f"/{term}/search?f=tweets&q=" 506 | else: 507 | endpoint = f"/{term}" 508 | else: 509 | raise ValueError("Invalid mode. Use 'term', 'hashtag', or 'user'.") 510 | 511 | self.__initialize_session(instance) 512 | 513 | if since: 514 | if self.__check_date_validity(since): 515 | endpoint += f"&since={since}" 516 | else: 517 | raise ValueError("Invalid 'since' date. Use the YYYY-MM-DD format and make sure the date is valid.") 518 | 519 | if until: 520 | if self.__check_date_validity(until): 521 | endpoint += f"&until={until}" 522 | else: 523 | raise ValueError("Invalid 'until' date. Use the YYYY-MM-DD format and make sure the date is valid.") 524 | 525 | soup = self.__get_page(endpoint, max_retries) 526 | 527 | 528 | if soup is None: 529 | return None 530 | 531 | is_encrypted = self.__is_instance_encrypted() 532 | 533 | already_scraped = set() 534 | 535 | keep_scraping = True 536 | while keep_scraping: 537 | thread = [] 538 | 539 | for tweet in soup.find_all("div", class_="timeline-item"): 540 | if len(tweet["class"]) == 1: 541 | to_append = self.__extract_tweet(tweet, is_encrypted) 542 | # Extract tweets 543 | if len(tweets["tweets"]) + len(tweets["threads"]) < number or (since and until) or since: 544 | if self.__get_tweet_link(tweet) not in already_scraped: 545 | tweets["tweets"].append(to_append) 546 | already_scraped.add(self.__get_tweet_link(tweet)) 547 | else: 548 | keep_scraping = False 549 | break 550 | else: 551 | if "thread" in tweet["class"]: 552 | to_append = self.__extract_tweet(tweet, is_encrypted) 553 | # Extract threads 554 | if self.__get_tweet_link(tweet) not in already_scraped: 555 | thread.append(to_append) 556 | already_scraped.add(self.__get_tweet_link(tweet)) 557 | 558 | if len(tweet["class"]) == 3: 559 | tweets["threads"].append(thread) 560 | thread = [] 561 | 562 | logging.info(f"Current stats: {len(tweets['tweets'])} tweets, {len(tweets['threads'])} threads...") 563 | if not(since and until) and not(since) and len(tweets["tweets"]) + len(tweets["threads"]) >= number: 564 | keep_scraping = False 565 | else: 566 | sleep(uniform(1, 2)) 567 | 568 | # Go to the next page 569 | show_more_buttons = soup.find_all("div", class_="show-more") 570 | if soup.find_all("div", class_="show-more"): 571 | if mode == "user": 572 | if since or until: 573 | next_page = ( 574 | f"/{term}/search?" 575 | + show_more_buttons[-1] 576 | .find("a")["href"] 577 | .split("?")[-1] 578 | ) 579 | else: 580 | next_page = ( 581 | f"/{term}?" 582 | + show_more_buttons[-1] 583 | .find("a")["href"] 584 | .split("?")[-1] 585 | ) 586 | else: 587 | next_page = ( 588 | "/search" 589 | + show_more_buttons[-1].find("a")[ 590 | "href" 591 | ] 592 | ) 593 | soup = self.__get_page(next_page, max_retries) 594 | if soup is None: 595 | keep_scraping = False 596 | else: 597 | keep_scraping = False 598 | return tweets 599 | 600 | def get_random_instance(self): 601 | """ 602 | Get a random Nitter instance 603 | 604 | :return: URL of random Nitter instance 605 | """ 606 | return random.choice(self.instances) 607 | 608 | def get_tweets(self, term, mode='term', number=5, since=None, until=None, max_retries=5, instance=None): 609 | """ 610 | Scrape the specified term from Nitter 611 | 612 | :param term: string to search for 613 | :param mode: search mode. 
Default is 'term', can also be 'hashtag' or 'user' 614 | :param number: number of tweets to scrape. Default is 5. If 'since' is specified, this is bypassed. 615 | :param since: date to start scraping from, formatted as YYYY-MM-DD. Default is None 616 | :param until: date to stop scraping at, formatted as YYYY-MM-DD. Default is None 617 | :param max_retries: max retries to scrape a page. Default is 5 618 | :param instance: Nitter instance to use. Default is None 619 | :return: dictionary with tweets and threads for the term 620 | """ 621 | return self.__search(term, mode, number, since, until, max_retries, instance) 622 | 623 | def get_profile_info(self, username, max_retries=5, instance=None): 624 | """ 625 | Get profile information for a user 626 | 627 | :param username: username of the page to scrape 628 | :param max_retries: max retries to scrape a page. Default is 5 629 | :param instance: Nitter instance to use. Default is None 630 | :return: dictionary of the profile's information 631 | """ 632 | self.__initialize_session(instance) 633 | soup = self.__get_page(f"/{username}", max_retries) 634 | if soup is None: 635 | return None 636 | 637 | is_encrypted = self.__is_instance_encrypted() 638 | # Extract id if the banner exists, no matter if the instance uses base64 or not 639 | if soup.find("div", class_="profile-banner").find("img") and is_encrypted: 640 | profile_id = ( 641 | b64decode( 642 | soup.find("div", class_="profile-banner") 643 | .find("img")["src"] 644 | .split("/enc/")[1] 645 | .encode("utf-8") 646 | ) 647 | .decode("utf-8") 648 | .split("/profile_banners/")[1] 649 | .split("/")[0] 650 | ) 651 | elif soup.find("div", class_="profile-banner").find("img"): 652 | profile_id = ( 653 | unquote(soup.find("div", class_="profile-banner").find("img")["src"]) 654 | .split("profile_banners/")[1] 655 | .split("/")[0] 656 | ) 657 | else: 658 | profile_id = "" 659 | 660 | # Extract profile image, no matter if the instance uses base64 or not 661 | if soup.find("a", class_="profile-card-avatar").find("img") and is_encrypted: 662 | profile_image = "https://" + b64decode( 663 | soup.find("a", class_="profile-card-avatar") 664 | .find("img")["src"] 665 | .split("/enc/")[1] 666 | .encode("utf-8") 667 | ).decode("utf-8") 668 | elif soup.find("a", class_="profile-card-avatar").find("img"): 669 | profile_image = ( 670 | "https://" 671 | + unquote( 672 | soup.find("a", class_="profile-card-avatar").find("img")["src"] 673 | ).split("/pic/")[1] 674 | ) 675 | else: 676 | profile_image = "" 677 | 678 | return { 679 | "image": profile_image, 680 | "name": soup.find("a", class_="profile-card-fullname").text.strip(), 681 | "username": soup.find("a", class_="profile-card-username").text.strip(), 682 | "id": profile_id, 683 | "bio": soup.find("div", class_="profile-bio").p.text.strip() 684 | if soup.find("div", class_="profile-bio") 685 | else "", 686 | "location": soup.find("div", class_="profile-location") 687 | .find_all("span")[-1] 688 | .text.strip() 689 | if soup.find("div", class_="profile-location") 690 | else "", 691 | "website": soup.find("div", class_="profile-website").find("a")["href"] 692 | if soup.find("div", class_="profile-website") 693 | else "", 694 | "joined": soup.find("div", class_="profile-joindate").find("span")["title"], 695 | "stats": { 696 | "tweets": int( 697 | soup.find("ul", class_="profile-statlist") 698 | .find("li", class_="posts") 699 | .find_all("span")[1] 700 | .text.strip() 701 | .replace(",", "") 702 | ), 703 | "following": int( 704 | soup.find("ul", 
class_="profile-statlist") 705 | .find("li", class_="following") 706 | .find_all("span")[1] 707 | .text.strip() 708 | .replace(",", "") 709 | ), 710 | "followers": int( 711 | soup.find("ul", class_="profile-statlist") 712 | .find("li", class_="followers") 713 | .find_all("span")[1] 714 | .text.strip() 715 | .replace(",", "") 716 | ), 717 | "likes": int( 718 | soup.find("ul", class_="profile-statlist") 719 | .find("li", class_="likes") 720 | .find_all("span")[1] 721 | .text.strip() 722 | .replace(",", "") 723 | ), 724 | "media": int( 725 | soup.find("div", class_="photo-rail-header") 726 | .find("div", class_="icon-container") 727 | .text.strip() 728 | .replace(",", "") 729 | .split(" ")[0] 730 | ) 731 | }, 732 | } 733 | --------------------------------------------------------------------------------