├── .gitignore ├── README.md ├── __pycache__ ├── excel.cpython-310.pyc ├── excel.cpython-311.pyc ├── logger.cpython-310.pyc ├── logger.cpython-311.pyc └── tweet.cpython-311.pyc ├── excel.py ├── files └── conf.json ├── index.py ├── logger.py ├── requirements.txt └── tweet.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.xls 2 | data.json 3 | /files/conf.json 4 | /output 5 | /Archieve 6 | token 7 | file.log 8 | temp.json 9 | conf.json -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Twitter-Scrapper 2 | A small project of scrapping data from twitter 3 | 4 | - Make sure to set your access token in "./files/conf.json" 5 | - How to get your access token: https://youtu.be/uHOz7BSPXCo 6 | - Executable download: https://sourceforge.net/projects/twitter-scrapper/ 7 | - Support me: https://www.buymeacoffee.com/mostafaehab 8 | -------------------------------------------------------------------------------- /__pycache__/excel.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mostafa-Ehab/Twitter-Scrapper/89a9f1153b75d4e1b1b0b695b3bae105f6c1e755/__pycache__/excel.cpython-310.pyc -------------------------------------------------------------------------------- /__pycache__/excel.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mostafa-Ehab/Twitter-Scrapper/89a9f1153b75d4e1b1b0b695b3bae105f6c1e755/__pycache__/excel.cpython-311.pyc -------------------------------------------------------------------------------- /__pycache__/logger.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mostafa-Ehab/Twitter-Scrapper/89a9f1153b75d4e1b1b0b695b3bae105f6c1e755/__pycache__/logger.cpython-310.pyc -------------------------------------------------------------------------------- /__pycache__/logger.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mostafa-Ehab/Twitter-Scrapper/89a9f1153b75d4e1b1b0b695b3bae105f6c1e755/__pycache__/logger.cpython-311.pyc -------------------------------------------------------------------------------- /__pycache__/tweet.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mostafa-Ehab/Twitter-Scrapper/89a9f1153b75d4e1b1b0b695b3bae105f6c1e755/__pycache__/tweet.cpython-311.pyc -------------------------------------------------------------------------------- /excel.py: -------------------------------------------------------------------------------- 1 | # Writing to an excel 2 | # sheet using Python 3 | from xlwt import Workbook 4 | 5 | def Excel( 6 | data: list, 7 | output: list 8 | ): 9 | # Workbook is created 10 | wb = Workbook() 11 | 12 | # add_sheet is used to create sheet. 13 | sheet1 = wb.add_sheet('Sheet 1') 14 | 15 | for i, row in enumerate(output): 16 | sheet1.write(0, i, row) 17 | 18 | for i, row in enumerate(data): 19 | for j, cell in enumerate(output): 20 | sheet1.write(i+1, j, row[cell]) 21 | print(row) 22 | 23 | wb.save('Excel.xls') 24 | -------------------------------------------------------------------------------- /files/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "headless": false, 3 | "output_form": [ 4 | "URL", 5 | "Date", 6 | "Text", 7 | "Lang", 8 | "Likes", 9 | "Retweets", 10 | "Replies" 11 | ], 12 | "token": null, 13 | "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188" 14 | } -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.chrome.options import Options 3 | from logger import Logger 4 | import json 5 | 6 | from tweet import Tweet 7 | from excel import Excel 8 | 9 | def main(): 10 | log.warning("Loading configurations...") 11 | if not conf["token"]: 12 | log.warning("Please set your access token in './files/conf.json' file") 13 | log.warning("For more info visit this link: https://youtu.be/uHOz7BSPXCo") 14 | input("\n\tPress any key to exit...") 15 | return 16 | 17 | driver = open_driver(conf["headless"], conf["userAgent"]) 18 | driver.get("https://twitter.com/") 19 | set_token(driver, conf["token"]) 20 | driver.get("https://twitter.com/") 21 | 22 | log.warning("Starting...") 23 | data = profile_search(driver) 24 | 25 | log.warning("Saving...") 26 | Excel(data, conf["output_form"]) 27 | 28 | 29 | def profile_search( 30 | driver: webdriver.Chrome 31 | ): 32 | url = input("Enter profile URL: ") 33 | num = int(input("Enter the required number of tweets: ")) 34 | driver.get(url) 35 | 36 | log.warning("Fetching...") 37 | Ad = [] 38 | results = [] 39 | while len(results) < num: 40 | tweet = Tweet(driver, Ad) 41 | 42 | data = {} 43 | 44 | data["URL"] = tweet.get_url() 45 | data["Date"] = tweet.get_date() 46 | data["Text"] = tweet.get_text() 47 | data["Lang"] = tweet.get_lang() 48 | data["Likes"] = tweet.get_num_likes() 49 | data["Retweets"] = tweet.get_num_retweet() 50 | data["Replies"] = tweet.get_num_reply() 51 | 52 | results.append(data) 53 | 54 | json.dump(results, open("./files/temp.json", "w")) 55 | 56 | log.info(f"{len(results) + 1} : {data['URL']}") 57 | 58 | return results 59 | 60 | 61 | def open_driver( 62 | headless: bool, 63 | agent: str 64 | ) -> webdriver.Chrome: 65 | 66 | options = Options() 67 | 68 | options.add_argument('--log-level=3') 69 | options.add_argument('ignore-certificate-errors') 70 | 71 | if headless: 72 | options.add_argument('--headless') 73 | 74 | options.add_argument(f"user-agent={agent}") 75 | 76 | driver = webdriver.Chrome(options=options) 77 | 78 | return driver 79 | 80 | def set_token( 81 | driver: webdriver.Chrome, 82 | token: str 83 | ) -> None: 84 | src = f""" 85 | let date = new Date(); 86 | date.setTime(date.getTime() + (7*24*60*60*1000)); 87 | let expires = "; expires=" + date.toUTCString(); 88 | 89 | document.cookie = "auth_token={token}" + expires + "; path=/"; 90 | """ 91 | driver.execute_script(src) 92 | 93 | def load_conf() -> dict: 94 | with open("./files/conf.json", "r") as file: 95 | return json.loads(file.read()) 96 | 97 | 98 | if __name__ == "__main__": 99 | log = Logger() 100 | try: 101 | conf = load_conf() 102 | except Exception: 103 | log.warning("Sorry and error occured, Please check your config file") 104 | input("\n\tPress any key to exit...") 105 | else: 106 | main() 107 | 108 | 109 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | from termcolor import colored 2 | from datetime import datetime 3 | 4 | 5 | class Logger: 6 | def success(self, text, time=True): 7 | cprint(f"{get_time() if time else ''}{text}", "green") 8 | fprint(f"{get_time() if time else ''}{text}") 9 | 10 | def info(self, text, time=True): 11 | cprint(f"{get_time() if time else ''}{text}", "white") 12 | fprint(f"{get_time() if time else ''}{text}") 13 | 14 | def warning(self, text, time=True): 15 | cprint(f"{get_time() if time else ''}{text}", "yellow") 16 | fprint(f"{get_time() if time else ''}{text}") 17 | 18 | def error(self, text, time=True): 19 | cprint(f"{get_time() if time else ''}{text}", "red") 20 | fprint(f"{get_time() if time else ''}{text}") 21 | 22 | def end(self, color="white", num=20): 23 | cprint(("-" * num), color) 24 | 25 | 26 | def fprint(text): 27 | with open("./files/file.log", "a") as f: 28 | f.write(f"{text}\n") 29 | 30 | 31 | def cprint(text, color, end="\n"): 32 | print(colored(text, color), end=end) 33 | 34 | 35 | def get_time(): 36 | time = datetime.now() 37 | formated_time = time.strftime("%d/%m/%Y %H:%M:%S: ") 38 | return formated_time 39 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | selenium 2 | termcolor 3 | xlwt -------------------------------------------------------------------------------- /tweet.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException 4 | from selenium.webdriver.remote.webelement import WebElement 5 | from datetime import datetime 6 | from time import sleep 7 | import traceback 8 | 9 | 10 | class Tweet: 11 | def __init__(self, 12 | driver: webdriver.Chrome, 13 | Ad: list 14 | ): 15 | self.driver = driver 16 | self.Ad = Ad 17 | 18 | while True: 19 | try: 20 | self.tweet = self.__get_first_tweet() 21 | 22 | self.__remove_pinned() 23 | 24 | self.tweet_url, self.retweet = self.__get_tweet_url() 25 | self.tweet_date = self.__get_tweet_date() 26 | self.tweet_text = self.__get_tweet_text() 27 | self.tweet_lang = self.__get_tweet_lang() 28 | self.tweet_num_likes = self.__get_tweet_num_likes() 29 | self.tweet_num_retweet = self.__get_tweet_num_retweet() 30 | self.tweet_num_reply = self.__get_tweet_num_reply() 31 | 32 | except TypeError: 33 | self.Ad.append(self.tweet) 34 | sleep(1) 35 | driver.execute_script("arguments[0].scrollIntoView();", self.tweet) 36 | continue 37 | 38 | except Exception: 39 | print(traceback.format_exc()) 40 | sleep(1) 41 | # driver.execute_script("arguments[0].scrollIntoView();", self.tweet) 42 | input("An error occured: ") 43 | continue 44 | break 45 | 46 | self.__delete_tweet() 47 | 48 | 49 | def get_url(self) -> str: 50 | return self.tweet_url 51 | 52 | def get_date(self) -> str: 53 | return self.tweet_date 54 | 55 | def get_text(self) -> str: 56 | return self.tweet_text 57 | 58 | def get_lang(self) -> str: 59 | return self.tweet_lang 60 | 61 | def get_num_likes(self) -> str: 62 | return self.tweet_num_likes 63 | 64 | def get_num_retweet(self) -> str: 65 | return self.tweet_num_retweet 66 | 67 | def get_num_reply(self) -> str: 68 | return self.tweet_num_reply 69 | 70 | 71 | def __get_first_tweet(self) -> WebElement: 72 | while True: 73 | try: 74 | tweets = self.driver.find_elements(By.CSS_SELECTOR, "article[data-testid='tweet']") 75 | for tweet in tweets: 76 | if tweet not in self.Ad: 77 | return tweet 78 | except IndexError: 79 | sleep(0.5) 80 | continue 81 | 82 | 83 | def __remove_pinned(self): 84 | while True: 85 | try: 86 | if self.tweet.find_element(By.CSS_SELECTOR, 'div[data-testid="socialContext"]').get_attribute("innerText") == "Pinned": 87 | print("Skipping pinned...") 88 | raise TypeError 89 | 90 | except NoSuchElementException: 91 | pass 92 | 93 | except StaleElementReferenceException: 94 | sleep(1) 95 | continue 96 | 97 | break 98 | 99 | 100 | def __get_tweet_url(self) -> tuple[str, bool]: 101 | urls = self.tweet.find_elements(By.CSS_SELECTOR, "a") 102 | 103 | if urls[0].get_attribute("href") == urls[1].get_attribute("href"): 104 | url = urls[3].get_attribute("href") 105 | re_tweet = False 106 | else: 107 | url = urls[4].get_attribute("href") 108 | re_tweet = True 109 | 110 | return url, re_tweet 111 | 112 | 113 | def __get_tweet_date(self) -> str: 114 | # 2023-07-11T12:59:22.000Z 115 | try: 116 | date = self.tweet.find_element( 117 | By.CSS_SELECTOR, "time").get_attribute("datetime")[:10] 118 | date = datetime.strptime(date, '%Y-%m-%d') 119 | except NoSuchElementException: 120 | raise TypeError 121 | 122 | return date.strftime('%d/%m/%Y') 123 | 124 | 125 | def __get_tweet_text(self) -> str: 126 | try: 127 | element = self.tweet.find_element( 128 | By.CSS_SELECTOR, "div[data-testid='tweetText']") 129 | 130 | return element.get_attribute("innerText") 131 | except NoSuchElementException: 132 | return "" 133 | 134 | 135 | def __get_tweet_lang(self) -> str: 136 | try: 137 | element = self.tweet.find_element( 138 | By.CSS_SELECTOR, "div[data-testid='tweetText']") 139 | return element.get_attribute("lang") 140 | except NoSuchElementException: 141 | return "" 142 | 143 | # def check_media(self): 144 | # try: 145 | # self.tweet.find_element(By.XPATH, "./div/div/div[2]/div[2]/div[4]") 146 | # media = True 147 | # except NoSuchElementException: 148 | # media = False 149 | 150 | # return media 151 | 152 | def __get_tweet_num_likes(self): 153 | return self.tweet.find_element(By.CSS_SELECTOR, "button[data-testid='like']").get_attribute("innerText") 154 | 155 | def __get_tweet_num_retweet(self): 156 | return self.tweet.find_element(By.CSS_SELECTOR, "button[data-testid='retweet']").get_attribute("innerText") 157 | 158 | def __get_tweet_num_reply(self): 159 | return self.tweet.find_element(By.CSS_SELECTOR, "button[data-testid='reply']").get_attribute("innerText") 160 | 161 | 162 | def __delete_tweet(self): 163 | self.driver.execute_script(""" 164 | var element = arguments[0]; 165 | element.parentNode.removeChild(element); 166 | """, self.tweet) 167 | --------------------------------------------------------------------------------