├── README.md ├── base_interface.py └── threads_interface.py /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Features 🚀 3 | 4 | A tool that scrapes the following data from Threads: 5 | 6 | - Fetch user's and thread's unique identifiers. 7 | - Retrieve user's details, threads, and replies. 8 | - Retrieve thread's details and its likers. 9 | - Save the fetched data into CSV and JSON files. 10 | 11 | ## :file_folder: File Structure 12 | 13 | - `base_interface.py`: Provides a basic interface for interacting with Threads. 14 | - `threads_interface.py`: A public interface for the scraper with methods for fetching and saving data. 15 | 16 | ## :rocket: How to Use 17 | 18 | Example: 19 | 20 | 1. Import the `ThreadsInterface` class from `threads_interface.py`. 21 | 2. Create an instance of the `ThreadsInterface` class. 22 | 3. Use the instance to call the methods for fetching and saving data. 23 | 24 | ## Output 25 | 26 | 1. **Scrape User ID** 27 | 28 | - Input: `username` 29 | - Output: `user_id` 30 | - Example: 31 | - Input: `john_doe` 32 | - Output: `12345` 33 | 2. **Scrape Thread ID** 34 | 35 | - Input: `url_id` (last part of a thread's URL) 36 | - Output: `thread_id` 37 | - Example: 38 | - Input: `CuXFPIeLLod` 39 | - Output: `54321` 40 | 3. **Fetch User** 41 | 42 | - Input: `user_id` 43 | - Output: User information in JSON format 44 | - Example: 45 | - Input: `12345` 46 | - Output: `{ "username": "john_doe", "email": "johndoe@example.com", "date_joined": "2022-01-01" }` 47 | 4. **Fetch User Threads** 48 | 49 | - Input: `user_id` 50 | - Output: List of threads posted by the user in JSON format 51 | - Example: 52 | - Input: `12345` 53 | - Output: `[{ "thread_id": "54321", "title": "My first thread", "date_posted": "2022-02-02" }, {...}]` 54 | 5.
# README.md (continued) -- the numbered "Output" list resumes at item 5:
#
#    **Fetch User Replies**
#
#    - Input: `user_id`
#    - Output: List of replies posted by the user in JSON format
#    - Example:
#      - Input: `12345`
#      - Output: `[{ "reply_id": "4321", "thread_id": "54321", "content": "Great thread!", "date_posted": "2022-02-03" }, {...}]`
# 6. **Fetch Thread**
#
#    - Input: `thread_id`
#    - Output: Thread information in JSON format
#    - Example:
#      - Input: `54321`
#      - Output: `{ "title": "My first thread", "content": "Hello, world!", "date_posted": "2022-02-02" }`
# 7. **Fetch Thread Likers**
#
#    - Input: `thread_id`
#    - Output: List of users who liked the thread in JSON format
#    - Example:
#      - Input: `54321`
#      - Output: `[{ "user_id": "12345", "username": "john_doe" }, {...}]`
# 8. **Generate Scraper Token**
#
#    - Input: None
#    - Output: A token for the Thread Scraper
#    - Example:
#      - Input: None
#      - Output: `abc123def456ghi789`
#
# --------------------------------------------------------------------------------
# /base_interface.py and /threads_interface.py
# --------------------------------------------------------------------------------
"""
Provide a basic and a public interface for the Threads.

`BaseThreadsInterface` (base_interface.py) resolves user and thread identifiers.
`ThreadsInterface` (threads_interface.py) queries the GraphQL endpoint and saves
the fetched data into CSV and JSON files.
"""
import csv  # unused here, kept because the original module imported it
import json
import os
import re

import pandas as pd

# NOTE: when the two classes live in separate files, threads_interface.py needs
# `from base_interface import BaseThreadsInterface`.


class ThreadsParseError(AttributeError):
    """
    Raised when an expected value cannot be found in a fetched page.

    It subclasses `AttributeError` because the lookups used to fail with that
    type (calling `.group()` on a missing match), so existing handlers still work.
    """


class BaseThreadsInterface:
    """
    A basic interface for interacting with Threads.
    """

    # Seconds to wait for the server before giving up. Without a timeout a
    # stalled connection would block the caller forever.
    REQUEST_TIMEOUT = 30

    # Value of every character allowed in a thread's URL identifier
    # (its position in the 64-character alphabet).
    _URL_ID_VALUES = {
        character: value
        for value, character in enumerate(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
        )
    }

    def __init__(self):
        """
        Initialize the object.
        """
        self.headers_for_html_fetching = {
            'Authority': 'www.threads.net',
            'Accept': (
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
                'q=0.8,application/signed-exchange;v=b3;q=0.7'
            ),
            'Accept-Language': 'en-US,en;q=0.9',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'https://www.threads.net',
            'Pragma': 'no-cache',
            'Referer': 'https://www.instagram.com',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15'
            ),
        }

    def _request(self, method: str, url: str, **kwargs):
        """
        Send an HTTP request, applying the default timeout.

        Args:
            method (str): The HTTP method, e.g. `'GET'` or `'POST'`.
            url (str): The URL to request.
            **kwargs: Any further arguments accepted by `requests.request`.

        Returns:
            The response object returned by `requests`.
        """
        # Imported lazily so the helpers that need no network access
        # (identifier decoding, file saving) work without `requests` installed.
        import requests

        kwargs.setdefault('timeout', self.REQUEST_TIMEOUT)

        return requests.request(method, url, **kwargs)

    @staticmethod
    def _extract_user_id(html: str) -> int:
        """
        Extract the user's unique identifier from a profile page.

        Args:
            html (str): The text of the fetched page.

        Returns:
            The user's unique identifier as an integer.

        Raises:
            ThreadsParseError: If the page holds no `"user_id"` entry.
        """
        match = re.search(r'"user_id":"(\d+)",', html)

        if match is None:
            raise ThreadsParseError('Could not find a "user_id" entry in the fetched page.')

        return int(match.group(1))

    def retrieve_user_id(self, username: str) -> int:
        """
        Retrieve the unique identifier for a user.

        Args:
            username (str): The user's username.

        Returns:
            The user's unique identifier as an integer.

        Raises:
            ThreadsParseError: If the fetched page holds no user identifier.
        """
        response = self._request(
            'GET',
            f'https://www.instagram.com/{username}',
            headers=self.headers_for_html_fetching,
        )

        try:
            return self._extract_user_id(response.text)
        except ThreadsParseError as error:
            raise ThreadsParseError(
                f'Could not find the user id for username {username!r}.'
            ) from error

    def retrieve_thread_id(self, url_id: str) -> int:
        """
        Retrieve the unique identifier for a thread.

        The URL identifier is read as a base-64 number, most significant
        character first. An empty identifier yields 0.

        Args:
            url_id (str): The thread's URL identifier.

        Returns:
            The thread's unique identifier as an integer.

        Raises:
            ValueError: If `url_id` contains a character outside the alphabet.
        """
        thread_id = 0

        for character in url_id:
            value = self._URL_ID_VALUES.get(character)

            if value is None:
                raise ValueError(
                    f'Invalid character {character!r} in URL identifier {url_id!r}.'
                )

            thread_id = (thread_id * 64) + value

        return thread_id


class ThreadsInterface(BaseThreadsInterface):
    """
    A public interface for interacting with Threads.

    Each unique endpoint requires a unique document ID, predefined by the developers.
    """
    THREADS_API_URL = 'https://www.threads.net/api/graphql'

    def __init__(self):
        """
        Initialize the object.

        Fetching the API token performs a network request.
        """
        super().__init__()

        self.api_token = self._generate_api_token()
        self.default_headers = {
            'Authority': 'www.threads.net',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.9',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'https://www.threads.net',
            'Pragma': 'no-cache',
            'Sec-Fetch-Site': 'same-origin',
            'X-ASBD-ID': '129477',
            'X-FB-LSD': self.api_token,
            'X-IG-App-ID': '238260118697367',
        }

    def _prepare_query(self, doc_id: str, variables: dict, friendly_name: str = None) -> tuple:
        """
        Build the headers and the form data of an API query.

        Args:
            doc_id (str): The document ID of the endpoint.
            variables (dict): The query variables; sent JSON-encoded.
            friendly_name (str): The value of the `X-FB-Friendly-Name` header,
                or None to send the default headers only.

        Returns:
            A `(headers, data)` tuple. `headers` is a copy, so the default
            headers are never modified.
        """
        headers = self.default_headers.copy()

        if friendly_name is not None:
            headers['X-FB-Friendly-Name'] = friendly_name

        data = {
            'lsd': self.api_token,
            'variables': json.dumps(variables),
            'doc_id': doc_id,
        }

        return headers, data

    def _post_query(self, doc_id: str, variables: dict, friendly_name: str = None) -> dict:
        """
        Send a query to the Threads API and decode its JSON response.

        Args:
            doc_id (str): The document ID of the endpoint.
            variables (dict): The query variables.
            friendly_name (str): The optional `X-FB-Friendly-Name` header value.

        Returns:
            The decoded response as a dictionary.
        """
        headers, data = self._prepare_query(doc_id, variables, friendly_name)
        response = self._request('POST', self.THREADS_API_URL, headers=headers, data=data)

        return response.json()

    def retrieve_user(self, user_id: int) -> dict:
        """
        Retrieve a user.

        Args:
            user_id (int): The user's unique identifier.

        Returns:
            The user as a dictionary.
        """
        return self._post_query(
            doc_id='23996318473300828',
            variables={'userID': user_id},
            friendly_name='BarcelonaProfileRootQuery',
        )

    def retrieve_user_threads(self, user_id: int) -> dict:
        """
        Retrieve a user's threads.

        Args:
            user_id (int): The user's unique identifier.

        Returns:
            The list of user's threads inside a dictionary.
        """
        return self._post_query(
            doc_id='6232751443445612',
            variables={'userID': user_id},
            friendly_name='BarcelonaProfileThreadsTabQuery',
        )

    def retrieve_user_replies(self, user_id: int) -> dict:
        """
        Retrieve a user's replies.

        Args:
            user_id (int): The user's unique identifier.

        Returns:
            The list of user's replies inside a dictionary.
        """
        return self._post_query(
            doc_id='6307072669391286',
            variables={'userID': user_id},
            friendly_name='BarcelonaProfileRepliesTabQuery',
        )

    def retrieve_thread(self, thread_id: int) -> dict:
        """
        Retrieve a thread.

        Args:
            thread_id (int): The thread's unique identifier.

        Returns:
            The thread as a dictionary.
        """
        return self._post_query(
            doc_id='5587632691339264',
            variables={'postID': thread_id},
            friendly_name='BarcelonaPostPageQuery',
        )

    def retrieve_thread_likers(self, thread_id: int) -> dict:
        """
        Retrieve the likers of a thread.

        Args:
            thread_id (int): The thread's unique identifier.

        Returns:
            The list of likers of the thread inside a dictionary.
        """
        # This endpoint is queried with the default headers only.
        return self._post_query(
            doc_id='9360915773983802',
            variables={'mediaID': thread_id},
        )

    @staticmethod
    def _extract_api_token(html: str) -> str:
        """
        Extract the `lsd` token from a fetched page.

        Args:
            html (str): The text of the fetched page.

        Returns:
            The token as a string.

        Raises:
            ThreadsParseError: If the page holds no token.
        """
        match = re.search(r'LSD",\[\],\{"token":"(.*?)"\},\d+\]', html)

        if match is None:
            raise ThreadsParseError('Could not find the "LSD" token in the fetched page.')

        return match.group(1)

    def _generate_api_token(self) -> str:
        """
        Generate a token for the Threads.

        The token, called `lsd` internally, is required for any request.
        For anonymous users, it is just generated automatically from the back-end and passed to the front-end.

        Returns:
            The token for the Threads as a string.

        Raises:
            ThreadsParseError: If the fetched page holds no token.
        """
        response = self._request(
            'GET',
            'https://www.instagram.com/instagram',
            headers=self.headers_for_html_fetching,
        )

        return self._extract_api_token(response.text)

    def save_data_to_csv(self, data: dict, filename: str):
        """
        Save the provided data into a CSV file.

        Rows are appended when the file already exists; the header row is
        written only when the file is created.

        Args:
            data (dict): The data to be saved. A flat record whose values are
                all scalars is written as a single row.
            filename (str): The filename of the CSV file.
        """
        # pandas cannot build a frame from a dictionary of scalars only,
        # so a flat record is wrapped into a one-row list first.
        is_flat_record = (
            isinstance(data, dict)
            and bool(data)
            and all(pd.api.types.is_scalar(value) for value in data.values())
        )
        df = pd.DataFrame([data] if is_flat_record else data)

        file_exists = os.path.isfile(filename)

        df.to_csv(
            filename,
            mode='a' if file_exists else 'w',
            header=not file_exists,
            index=False,
        )

    def save_data_to_json(self, data: dict, filename: str):
        """
        Save the provided data into a JSON file.

        The data is appended as one JSON document per line, so the file stays
        parseable line by line after repeated calls.

        Args:
            data (dict): The data to be saved.
            filename (str): The filename of the JSON file.
        """
        with open(filename, 'a', encoding='utf-8') as json_file:
            json.dump(data, json_file)
            # Without a separator, appended documents would run together.
            json_file.write('\n')