├── README.md
├── base_interface.py
└── threads_interface.py


/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Features 🚀
 3 | 
 4 | A tool that scrapes the following data from Threads:
 5 | 
 6 | - Fetch user's and thread's unique identifiers.
 7 | - Retrieve user's details, threads, and replies.
 8 | - Retrieve thread's details and its likers.
 9 | - Save the fetched data into CSV and JSON files.
10 | 
11 | ## :file_folder: File Structure
12 | 
13 | - `base_interface.py`: Provides a basic interface for interacting with Threads.
14 | - `threads_interface.py`: A public interface for the scraper with methods for fetching and saving data.
15 | 
16 | ## :rocket: How to Use
17 | 
18 | Example:
19 | 
20 | 1. Import the `ThreadsInterface` class from `threads_interface.py`.
21 | 2. Create an instance of the `ThreadsInterface` class.
22 | 3. Use the instance to call the methods for fetching and saving data.
23 | 
24 | ## Output
25 | 
26 | 1. **Scrape User ID**
27 |     
28 |     - Input: `username`
29 |     - Output: `user_id`
30 |     - Example:
31 |         - Input: `john_doe`
32 |         - Output: `12345`
33 | 2. **Scrape Thread ID**
34 |     
35 |     - Input: `url_id` (last part of a thread's URL)
36 |     - Output: `thread_id`
37 |     - Example:
38 |         - Input: `CuXFPIeLLod`
39 |         - Output: `54321`
40 | 3. **Fetch User**
41 |     
42 |     - Input: `user_id`
43 |     - Output: User information in JSON format
44 |     - Example:
45 |         - Input: `12345`
46 |         - Output: `{ "username": "john_doe", "email": "johndoe@example.com", "date_joined": "2022-01-01" }`
47 | 4. **Fetch User Threads**
48 |     
49 |     - Input: `user_id`
50 |     - Output: List of threads posted by the user in JSON format
51 |     - Example:
52 |         - Input: `12345`
53 |         - Output: `[{ "thread_id": "54321", "title": "My first thread", "date_posted": "2022-02-02" }, {...}]`
54 | 5. **Fetch User Replies**
55 |     
56 |     - Input: `user_id`
57 |     - Output: List of replies posted by the user in JSON format
58 |     - Example:
59 |         - Input: `12345`
60 |         - Output: `[{ "reply_id": "4321", "thread_id": "54321", "content": "Great thread!", "date_posted": "2022-02-03" }, {...}]`
61 | 6. **Fetch Thread**
62 |     
63 |     - Input: `thread_id`
64 |     - Output: Thread information in JSON format
65 |     - Example:
66 |         - Input: `54321`
67 |         - Output: `{ "title": "My first thread", "content": "Hello, world!", "date_posted": "2022-02-02" }`
68 | 7. **Fetch Thread Likers**
69 |     
70 |     - Input: `thread_id`
71 |     - Output: List of users who liked the thread in JSON format
72 |     - Example:
73 |         - Input: `54321`
74 |         - Output: `[{ "user_id": "12345", "username": "john_doe" }, {...}]`
75 | 8. **Generate Scraper Token**
76 |     
77 |     - Input: None
78 |     - Output: A token for the Thread Scraper
79 |     - Example:
80 |         - Input: None
81 |         - Output: `abc123def456ghi789`
82 | 


--------------------------------------------------------------------------------
/base_interface.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Provide a basic interface for the Threads.
 3 | """
 4 | import re
 5 | import requests
 6 | 
 7 | class BaseThreadsInterface:
 8 |     """
 9 |     A basic interface for interacting with Threads.
10 |     """
11 | 
12 |     def __init__(self):
13 |         """
14 |         Initialize the object.
15 |         """
16 |         self.headers_for_html_fetching = {
17 |             'Authority': 'www.threads.net',
18 |             'Accept': (
19 |                 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
20 |                 'q=0.8,application/signed-exchange;v=b3;q=0.7'
21 |             ),
22 |             'Accept-Language': 'en-US,en;q=0.9',
23 |             'Cache-Control': 'no-cache',
24 |             'Content-Type': 'application/x-www-form-urlencoded',
25 |             'Origin': 'https://www.threads.net',
26 |             'Pragma': 'no-cache',
27 |             'Referer': 'https://www.instagram.com',
28 |             'Sec-Fetch-Dest': 'document',
29 |             'Sec-Fetch-Mode': 'navigate',
30 |             'Sec-Fetch-Site': 'cross-site',
31 |             'Sec-Fetch-User': '?1',
32 |             'Upgrade-Insecure-Requests': '1',
33 |             'User-Agent': (
34 |                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
35 |                 'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15'
36 |             ),
37 |         }
38 | 
39 |     def retrieve_user_id(self, username: str) -> int:
40 |         """
41 |         Retrieve the unique identifier for a user.
42 | 
43 |         Args:
44 |             username (str): The user's username.
45 | 
46 |         Returns:
47 |             The user's unique identifier as an integer.
48 |         """
49 |         response = requests.get(
50 |             url=f'https://www.instagram.com/{username}',
51 |             headers=self.headers_for_html_fetching,
52 |         )
53 | 
54 |         user_id_key_value = re.search('"user_id":"(\\d+)",', response.text).group()
55 |         user_id = re.search('\\d+', user_id_key_value).group()
56 | 
57 |         return int(user_id)
58 | 
59 |     def retrieve_thread_id(self, url_id: str) -> int:
60 |         """
61 |         Retrieve the unique identifier for a thread.
62 | 
63 |         Args:
64 |             url_id (str): The thread's URL identifier.
65 | 
66 |         Returns:
67 |             The thread's unique identifier as an integer.
68 |         """
69 |         alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
70 | 
71 |         thread_id = 0
72 | 
73 |         for character in url_id:
74 |             thread_id = (thread_id * 64) + alphabet.index(character)
75 | 
76 |         return thread_id
77 | 


--------------------------------------------------------------------------------
/threads_interface.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Provide a public interface for the Threads.
  3 | """
  4 | import json
  5 | import re
  6 | import requests
  7 | from base_interface import BaseThreadsInterface
  8 | 
  9 | import csv
 10 | import os
 11 | import pandas as pd
 12 | 
 13 | 
 14 | class ThreadsInterface(BaseThreadsInterface):
 15 |     """
 16 |     A public interface for interacting with Threads.
 17 | 
 18 |     Each unique endpoint requires a unique document ID, predefined by the developers.
 19 |     """
 20 |     THREADS_API_URL = 'https://www.threads.net/api/graphql'
 21 | 
 22 |     def __init__(self):
 23 |         """
 24 |         Initialize the object.
 25 |         """
 26 |         super().__init__()
 27 | 
 28 |         self.api_token = self._generate_api_token()
 29 |         self.default_headers = {
 30 |             'Authority': 'www.threads.net',
 31 |             'Accept': '*/*',
 32 |             'Accept-Language': 'en-US,en;q=0.9',
 33 |             'Cache-Control': 'no-cache',
 34 |             'Content-Type': 'application/x-www-form-urlencoded',
 35 |             'Origin': 'https://www.threads.net',
 36 |             'Pragma': 'no-cache',
 37 |             'Sec-Fetch-Site': 'same-origin',
 38 |             'X-ASBD-ID': '129477',
 39 |             'X-FB-LSD': self.api_token,
 40 |             'X-IG-App-ID': '238260118697367',
 41 |         }
 42 | 
 43 |     def retrieve_user(self, user_id: int) -> dict:
 44 |         """
 45 |         Retrieve a user.
 46 | 
 47 |         Args:
 48 |             user_id (int): The user's unique identifier.
 49 | 
 50 |         Returns:
 51 |             The user as a dictionary.
 52 |         """
 53 |         headers = self.default_headers.copy()
 54 |         headers['X-FB-Friendly-Name'] = 'BarcelonaProfileRootQuery'
 55 | 
 56 |         response = requests.post(
 57 |             url=self.THREADS_API_URL,
 58 |             headers=headers,
 59 |             data={
 60 |                 'lsd': self.api_token,
 61 |                 'variables': json.dumps(
 62 |                     {
 63 |                         'userID': user_id,
 64 |                     }
 65 |                 ),
 66 |                 'doc_id': '23996318473300828',
 67 |             },
 68 |         )
 69 | 
 70 |         return response.json()
 71 | 
 72 |     def retrieve_user_threads(self, user_id: int) -> dict:
 73 |         """
 74 |         Retrieve a user's threads.
 75 | 
 76 |         Args:
 77 |             user_id (int): The user's unique identifier.
 78 | 
 79 |         Returns:
 80 |             The list of user's threads inside a dictionary.
 81 |         """
 82 |         headers = self.default_headers.copy()
 83 |         headers['X-FB-Friendly-Name'] = 'BarcelonaProfileThreadsTabQuery'
 84 | 
 85 |         response = requests.post(
 86 |             url=self.THREADS_API_URL,
 87 |             headers=headers,
 88 |             data={
 89 |                 'lsd': self.api_token,
 90 |                 'variables': json.dumps(
 91 |                     {
 92 |                         'userID': user_id,
 93 |                     }
 94 |                 ),
 95 |                 'doc_id': '6232751443445612',
 96 |             },
 97 |         )
 98 | 
 99 |         return response.json()
100 | 
101 |     def retrieve_user_replies(self, user_id: int) -> dict:
102 |         """
103 |         Retrieve a user's replies.
104 | 
105 |         Args:
106 |             user_id (int): The user's unique identifier.
107 | 
108 |         Returns:
109 |             The list of user's replies inside a dictionary.
110 |         """
111 |         headers = self.default_headers.copy()
112 |         headers['X-FB-Friendly-Name'] = 'BarcelonaProfileRepliesTabQuery'
113 | 
114 |         response = requests.post(
115 |             url=self.THREADS_API_URL,
116 |             headers=headers,
117 |             data={
118 |                 'lsd': self.api_token,
119 |                 'variables': json.dumps(
120 |                     {
121 |                         'userID': user_id,
122 |                     }
123 |                 ),
124 |                 'doc_id': '6307072669391286',
125 |             },
126 |         )
127 | 
128 |         return response.json()
129 | 
130 |     def retrieve_thread(self, thread_id: int) -> dict:
131 |         """
132 |         Retrieve a thread.
133 | 
134 |         Args:
135 |             thread_id (int): The thread's unique identifier.
136 | 
137 |         Returns:
138 |             The thread as a dictionary.
139 |         """
140 |         headers = self.default_headers.copy()
141 |         headers['X-FB-Friendly-Name'] = 'BarcelonaPostPageQuery'
142 | 
143 |         response = requests.post(
144 |             url=self.THREADS_API_URL,
145 |             headers=headers,
146 |             data={
147 |                 'lsd': self.api_token,
148 |                 'variables': json.dumps(
149 |                     {
150 |                         'postID': thread_id,
151 |                     }
152 |                 ),
153 |                 'doc_id': '5587632691339264',
154 |             },
155 |         )
156 | 
157 |         return response.json()
158 | 
159 |     def retrieve_thread_likers(self, thread_id: int) -> dict:
160 |         """
161 |         Retrieve the likers of a thread.
162 | 
163 |         Args:
164 |             thread_id (int): The thread's unique identifier.
165 | 
166 |         Returns:
167 |             The list of likers of the thread inside a dictionary.
168 |         """
169 |         response = requests.post(
170 |             url=self.THREADS_API_URL,
171 |             headers=self.default_headers,
172 |             data={
173 |                 'lsd': self.api_token,
174 |                 'variables': json.dumps(
175 |                     {
176 |                         'mediaID': thread_id,
177 |                     }
178 |                 ),
179 |                 'doc_id': '9360915773983802',
180 |             },
181 |         )
182 | 
183 |         return response.json()
184 | 
185 |     def _generate_api_token(self) -> str:
186 |         """
187 |         Generate a token for the Threads.
188 | 
189 |         The token, called `lsd` internally, is required for any request.
190 |         For anonymous users, it is just generated automatically from the back-end and passed to the front-end.
191 | 
192 |         Returns:
193 |             The token for the Threads as a string.
194 |         """
195 |         response = requests.get(
196 |             url='https://www.instagram.com/instagram',
197 |             headers=self.headers_for_html_fetching,
198 |         )
199 | 
200 |         token_key_value = re.search(
201 |             'LSD",\\[\\],{"token":"(.*?)"},\\d+\\]', response.text).group()
202 |         token_key_value = token_key_value.replace('LSD",[],{"token":"', '')
203 |         token = token_key_value.split('"')[0]
204 | 
205 |         return token
206 | 
207 |     def save_data_to_csv(self, data: dict
208 |         """
209 |         Save the provided data into a CSV file.
210 | 
211 |         Args:
212 |             data (dict): The data to be saved.
213 |             filename (str): The filename of the CSV file.
214 |         """
215 |         # Convert the dictionary to a DataFrame
216 |         df = pd.DataFrame(data)
217 | 
218 |         # Check if file exists
219 |         if os.path.isfile(filename):
220 |             # If it exists, append without writing headers
221 |             df.to_csv(filename, mode='a', header=False, index=False)
222 |         else:
223 |             # If it doesn't exist, write the DataFrame with headers
224 |             df.to_csv(filename, index=False)
225 | 
226 |     def save_data_to_json(self, data: dict, filename: str):
227 |         """
228 |         Save the provided data into a JSON file.
229 | 
230 |         Args:
231 |             data (dict): The data to be saved.
232 |             filename (str): The filename of the JSON file.
233 |         """
234 |         with open(filename, 'a') as json_file:
235 |             json.dump(data, json_file)
236 | 


--------------------------------------------------------------------------------