├── MANIFEST.in
├── .gitignore
├── pyproject.toml
├── src
│   └── twitter_scraper_without_api
│       ├── __init__.py
│       ├── scraping_utilities.py
│       ├── driver_utils.py
│       ├── driver_initialisation.py
│       ├── element_finder.py
│       └── twitter_scraper.py
├── requirements.txt
├── setup.py
├── .github
│   └── workflows
│       └── python-app.yml
└── README.MD

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include README.MD
include LICENSE
include requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
firefox-geckodriver/*
geckodriver.log
src/twitter_scraper_without_api/__pycache__/*

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=42"]
build-backend = "setuptools.build_meta"

--------------------------------------------------------------------------------
/src/twitter_scraper_without_api/__init__.py:
--------------------------------------------------------------------------------
from .scraping_utilities import *
from .driver_utils import *
from .driver_initialisation import *
from .element_finder import *
from .twitter_scraper import *

--------------------------------------------------------------------------------
/src/twitter_scraper_without_api/scraping_utilities.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from inspect import currentframe
import re


class Scraping_utilities:
    """Static text-parsing helpers shared by the element finders."""

    @staticmethod
    def __parse_name(string):
        """Return the display name, dropping anything after the first '('."""
        try:
            return string.split("(")[0].strip()
        except Exception as ex:
            print("Error on line no. {}: {}".format(currentframe().f_lineno, ex))

    @staticmethod
    def __extract_digits(string):
        """Return the first run of digits in the string as an int."""
        try:
            return int(re.search(r'\d+', string).group(0))
        except Exception as ex:
            print("Error on line no. {}: {}".format(currentframe().f_lineno, ex))
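
Both helpers above are double-underscore (name-mangled) static methods, so code outside the class reaches them through the mangled names, exactly as element_finder.py does later in the package. A minimal sketch with illustrative inputs (not part of the package):

    from twitter_scraper_without_api.scraping_utilities import Scraping_utilities

    # "27 replies" -> 27; the regex grabs only the first run of digits
    count = Scraping_utilities._Scraping_utilities__extract_digits("27 replies")
    # "Jane Doe (@jane)" -> "Jane Doe"
    name = Scraping_utilities._Scraping_utilities__parse_name("Jane Doe (@jane)")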
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
async-generator==1.10
attrs==21.4.0
beautifulsoup4==4.11.1
build==0.7.0
certifi==2022.5.18.1
cffi==1.15.0
charset-normalizer==2.0.12
colorama==0.4.4
cryptography==37.0.2
distlib==0.3.4
distro==1.7.0
filelock==3.7.0
h11==0.13.0
idna==3.3
importlib-metadata==4.11.3
mozdownload==1.26.0
mozfile==2.1.0
mozinfo==1.2.2
numpy==1.21.6
outcome==1.1.0
packaging==21.3
pandas==1.3.5
pep517==0.12.0
platformdirs==2.5.2
platinfo==0.15.0
progressbar2==4.0.0
py-firefox-driver-manager==0.0.4
pycparser==2.21
pyOpenSSL==22.0.0
pyparsing==3.0.9
PySocks==1.7.1
python-dateutil==2.8.2
python-utils==3.3.0
pytz==2022.1
pywin32-ctypes==0.2.0
redo==2.0.3
requests==2.27.1
selenium==4.1.5
six==1.16.0
sniffio==1.2.0
sortedcontainers==2.4.0
soupsieve==2.3.2.post1
style==1.1.0
tomli==2.0.1
treeherder-client==5.0.0
trio==0.20.0
trio-websocket==0.9.2
typing_extensions==4.2.0
update==0.0.1
urllib3==1.26.9
virtualenv==20.14.1
wsproto==1.1.0
zipp==3.8.0

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
import os


def read_file(filename):
    with open(os.path.join(os.path.dirname(__file__), filename)) as file:
        return file.read()


lib_folder = os.path.dirname(os.path.realpath(__file__))
requirement_path = os.path.join(lib_folder, 'requirements.txt')
install_requires = []  # populated from requirements.txt, e.g. ["selenium==4.1.5", ...]
if os.path.isfile(requirement_path):
    with open(requirement_path, encoding='utf-8') as f:
        install_requires = f.read().splitlines()

setup(
    name='twitter_scraper_without_api',
    version='0.0.6',
    license='MIT',
    author='Hamed',
    author_email='hamed.minaei@gmail.com',
    description='twitter_scraper without API',
    long_description=read_file('README.MD'),
    long_description_content_type="text/markdown",
    url="https://github.com/HamedMinaeizaeim/twitter_scraper",
    project_urls={
        "Bug Tracker": "https://github.com/HamedMinaeizaeim/twitter_scraper/issues",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=install_requires,
    packages=['twitter_scraper_without_api'],
    package_dir={'': 'src'},
    python_requires=">=3.6",
)

--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        python-version: "3.10"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    #- name: Test with pytest
    #  run: |
    #    pytest

  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python
      uses: actions/setup-python@v3
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build
    - name: Build package
      run: python -m build
    - name: Publish package
      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
      with:
        user: __token__
        password: ${{ secrets.PYPI_API_TOKEN }}

--------------------------------------------------------------------------------
/src/twitter_scraper_without_api/driver_utils.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import time
from inspect import currentframe
from random import randint

try:
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.keys import Keys
except Exception as ex:
    frameinfo = currentframe()
    print("Error on line no. {} : {}".format(frameinfo.f_lineno, ex))

frameinfo = currentframe()


class Utilities:
    """This class contains all the methods related to driver behaviour,
    like scrolling and waiting for elements to appear. It contains only
    static methods, each of which accepts a driver instance as an argument."""

    @staticmethod
    def __wait_until_tweets_appear(driver):
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '[data-testid="tweet"]')))
        except WebDriverException:
            print("Tweets did not appear!")

    @staticmethod
    def __scroll_down(driver):
        try:
            # find_element_by_css_selector still exists in the pinned
            # selenium==4.1.5; it was removed in Selenium 4.3 (see note below)
            body = driver.find_element_by_css_selector('body')
            for _ in range(3):
                body.send_keys(Keys.PAGE_DOWN)
        except Exception as ex:
            print("Error on line no. {} : {}".format(frameinfo.f_lineno, ex))

    @staticmethod
    def __wait_until_completion(driver):
        """waits until the page has finished loading"""
        try:
            state = ""
            while state != "complete":
                time.sleep(randint(3, 5))
                state = driver.execute_script("return document.readyState")
        except Exception as ex:
            print(ex)
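
Utilities (and the finder methods later in the package) rely on the legacy find_element_by_* API, which was deprecated in Selenium 4.0 and removed in 4.3 — presumably why requirements.txt pins selenium==4.1.5. A hedged sketch of the By-based equivalent of __scroll_down, should that pin ever be lifted:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    def scroll_down(driver, pages=3):
        """Modern-locator rewrite; assumes a live WebDriver instance."""
        body = driver.find_element(By.CSS_SELECTOR, 'body')
        for _ in range(pages):
            body.send_keys(Keys.PAGE_DOWN)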

--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
Twitter scraper selenium

Python's package to scrape Twitter's front-end easily with selenium.

[![PyPI license](https://img.shields.io/pypi/l/ansicolortags.svg)](https://opensource.org/licenses/MIT) [![Python >=3.6.9](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/)
[![Maintenance](https://img.shields.io/badge/Maintained-Yes-green.svg)](https://github.com/shaikhsajid1111/facebook_page_scraper/graphs/commit-activity)

# Twitter_scraper_without_API

This package extracts information from Twitter without using the official Twitter API, which imposes rate limits and costs. You can search by keyword and time frame (in minutes), and you can extract an unlimited number of tweets.

## Prerequisites

- Python 3.6+
- Firefox browser

## Installation

You can install from source using

    git clone https://github.com/HamedMinaeizaeim/twitter_scraper_without_API.git

and then run

    python setup.py install

or you can run

    pip install -r requirements.txt

Alternatively, you can install from **PyPI**:

    pip install twitter_scraper_without_api

## How to use

To use this library, just import the TwitterScraper class and specify your search keyword. By default, it returns all tweets posted within the last minute; set last_n_mins to extract tweets from the last n minutes instead. Here is the code to do that:

    from twitter_scraper_without_api import TwitterScraper
    twitter = TwitterScraper('bitcoin')
    twitter.last_n_mins = 3
    twitter.fetch_data()

## Export options

You can export the data as JSON, a pandas DataFrame, or CSV:

    df = twitter.store_data('dataFrame')
    csv = twitter.store_data('csv')
    json = twitter.store_data('json')

## Privacy

There is no privacy issue with this library: the search is based on publicly available information.
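
The driver layer in the next file can also be used on its own when you want a ready-made headless Firefox session. A minimal sketch (not in the README):

    from twitter_scraper_without_api.driver_initialisation import DriverInitilizer

    driver = DriverInitilizer().set_driver_for_browser()  # fetches geckodriver if needed
    driver.get("https://twitter.com/explore")
    driver.quit()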
--------------------------------------------------------------------------------
/src/twitter_scraper_without_api/driver_initialisation.py:
--------------------------------------------------------------------------------
from selenium import webdriver
# import Firefox Options to set browser capabilities
from selenium.webdriver.firefox.options import Options as FirefoxOptions
# the driver manager downloads the geckodriver binary for the browser
from py_firefox_driver_manager import GeckoFireFoxdriverManager


class DriverInitilizer:

    def __init__(self, proxy=None):
        self.proxy = proxy

    def set_properties(self, browser_option):
        browser_option.add_argument('--headless')  # runs browser in headless mode
        browser_option.add_argument('--no-sandbox')
        browser_option.add_argument("--disable-dev-shm-usage")
        browser_option.add_argument('--ignore-certificate-errors')
        browser_option.add_argument('--disable-gpu')
        browser_option.add_argument('--log-level=3')
        browser_option.add_argument('--disable-notifications')
        browser_option.add_argument('--disable-popup-blocking')
        return browser_option

    def setup_profile(self):
        """Set up a Firefox profile that downloads files without prompting,
        skips images and stylesheets, and raises script/response timeouts.

        :return profile: the configured FirefoxProfile
        """
        profile = webdriver.FirefoxProfile()
        # profile.set_preference("browser.download.dir", self.file_location)
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                               "application/csv,application/excel,application/vnd.msexcel,application/vnd.ms-excel,text/anytext,text/comma-separated-values,text/csv,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/octet-stream")
        profile.set_preference("browser.download.manager.showWhenStarting", False)
        profile.set_preference("browser.helperApps.neverAsk.openFile",
                               "application/csv,application/excel,application/vnd.msexcel,application/vnd.ms-excel,text/anytext,text/comma-separated-values,text/csv,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/octet-stream")
        profile.set_preference("browser.helperApps.alwaysAsk.force", False)
        profile.set_preference("browser.download.manager.useWindow", False)
        profile.set_preference("browser.download.manager.focusWhenStarting", False)
        profile.set_preference("browser.download.manager.alertOnEXEOpen", False)
        profile.set_preference("browser.download.manager.showAlertOnComplete", False)
        profile.set_preference("browser.download.manager.closeWhenDone", True)
        profile.set_preference("pdfjs.disabled", True)
        profile.set_preference('permissions.default.stylesheet', 2)
        profile.set_preference('permissions.default.image', 2)
        profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
        profile.set_preference("http.response.timeout", 500)
        profile.set_preference("dom.max_script_run_time", 500)
        return profile

    def set_driver_for_browser(self):
        """Return a headless Firefox driver instance, routed through the
        configured proxy when one was supplied."""
        browser_option = FirefoxOptions()
        if self.proxy is not None:
            options = {
                'https': 'https://{}'.format(self.proxy.replace(" ", "")),
                'http': 'http://{}'.format(self.proxy.replace(" ", "")),
                'no_proxy': 'localhost, 127.0.0.1'
            }
            # NOTE: seleniumwire_options is only understood by selenium-wire's
            # Firefox driver, not plain selenium (see the note after this file)
            return webdriver.Firefox(executable_path=GeckoFireFoxdriverManager().install_geckodriver(),
                                     options=self.set_properties(browser_option), seleniumwire_options=options)

        return webdriver.Firefox(executable_path=GeckoFireFoxdriverManager().install_geckodriver(),
                                 options=self.set_properties(browser_option))
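
The proxy branch above passes seleniumwire_options, a keyword that plain selenium's webdriver.Firefox does not accept; it appears to assume the selenium-wire package, which is not listed in requirements.txt. A hedged sketch of how selenium-wire expects that mapping, with a hypothetical proxy host:

    # pip install selenium-wire
    from seleniumwire import webdriver as wire_webdriver

    proxy = "203.0.113.7:3128"  # hypothetical host:port
    driver = wire_webdriver.Firefox(seleniumwire_options={
        'proxy': {
            'https': 'https://{}'.format(proxy),
            'http': 'http://{}'.format(proxy),
            'no_proxy': 'localhost,127.0.0.1'
        }
    })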
--------------------------------------------------------------------------------
/src/twitter_scraper_without_api/element_finder.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from selenium.common.exceptions import NoSuchElementException
from .scraping_utilities import Scraping_utilities
from dateutil.parser import parse


class Finder:
    """
    This class contains all the static methods that accept a webdriver
    instance or a tweet WebElement, locate elements within it, and return
    what was found.
    Methods follow the convention:

    @staticmethod
    def __method_name(parameters):
    """

    @staticmethod
    def __fetch_all_tweets(driver):
        try:
            return driver.find_elements_by_css_selector('[data-testid="tweet"]')
        except Exception as ex:
            print("Error at method __fetch_all_tweets: {}".format(ex))

    @staticmethod
    def __find_replies(tweet):
        try:
            replies_element = tweet.find_element_by_css_selector('[data-testid="reply"]')
            replies = replies_element.get_attribute("aria-label")
            return Scraping_utilities._Scraping_utilities__extract_digits(replies)
        except Exception as ex:
            print("Error at method __find_replies: {}".format(ex))
            return ""

    @staticmethod
    def __find_shares(tweet):
        try:
            shares_element = tweet.find_element_by_css_selector('[data-testid="retweet"]')
            shares = shares_element.get_attribute("aria-label")
            return Scraping_utilities._Scraping_utilities__extract_digits(shares)
        except Exception as ex:
            print("Error at method __find_shares: {}".format(ex))
            return ""

    @staticmethod
    def __find_status(tweet):
        try:
            anchor = tweet.find_element_by_css_selector("a.r-bcqeeo.r-3s2u2q.r-qvutc0")
            return (anchor.get_attribute("href").split("/"), anchor.get_attribute("href"))
        except Exception as ex:
            print("Error at method __find_status: {}".format(ex))
            # callers unpack a (parts, href) pair, so return an empty one
            return ([], "")

    @staticmethod
    def __find_all_anchor_tags(tweet):
        try:
            return tweet.find_elements_by_tag_name('a')
        except Exception as ex:
            print("Error at method __find_all_anchor_tags: {}".format(ex))

    @staticmethod
    def __find_timestamp(tweet):
        try:
            timestamp = tweet.find_element_by_tag_name(
                "time").get_attribute("datetime")
            # posted_time = parse(timestamp).isoformat()
            return timestamp
        except Exception as ex:
            print("Error at method __find_timestamp: {}".format(ex))

    @staticmethod
    def __find_content(tweet):
        try:
            content_element = tweet.find_element_by_css_selector('div[lang]')
            return content_element.text
        except NoSuchElementException:
            return ""
        except Exception as ex:
            print("Error at method __find_content: {}".format(ex))

    @staticmethod
    def __find_like(tweet):
        try:
            like_element = tweet.find_element_by_css_selector('[data-testid="like"]')
            likes = like_element.get_attribute("aria-label")
            return Scraping_utilities._Scraping_utilities__extract_digits(likes)
        except Exception as ex:
            print("Error at method __find_like: {}".format(ex))

    @staticmethod
    def __find_images(tweet):
        try:
            image_elements = tweet.find_elements_by_css_selector(
                'div[data-testid="tweetPhoto"]')
            images = []
            for image_div in image_elements:
                href = image_div.find_element_by_tag_name("img").get_attribute("src")
                images.append(href)
            return images
        except Exception as ex:
            print("Error at method __find_images: {}".format(ex))

    @staticmethod
    def __find_videos(tweet):
        try:
            video_elements = tweet.find_elements_by_css_selector(
                'div[data-testid="videoPlayer"]')
            videos = []
            for video_div in video_elements:
                href = video_div.find_element_by_tag_name("video").get_attribute("src")
                videos.append(href)
            return videos
        except Exception as ex:
            print("Error at method __find_videos: {}".format(ex))

    @staticmethod
    def __is_retweet(tweet):
        try:
            tweet.find_element_by_css_selector('div.r-92ng3h.r-qvutc0')
            return True
        except NoSuchElementException:
            return False
        except Exception as ex:
            print("Error at method __is_retweet: {}".format(ex))
            return False

    @staticmethod
    def __find_name_from_post(tweet, is_retweet=False):
        try:
            name = "NA"
            anchors = Finder.__find_all_anchor_tags(tweet)
            if len(anchors) > 2:
                if is_retweet:
                    name = anchors[2].text.strip()
                else:
                    name = anchors[1].text.split("\n")[0]
            return name
        except Exception as ex:
            print("Error at method __find_name_from_post: {}".format(ex))

    @staticmethod
    def __find_external_link(tweet):
        try:
            card = tweet.find_element_by_css_selector('[data-testid="card.wrapper"]')
            href = card.find_element_by_tag_name('a')
            return href.get_attribute("href")
        except NoSuchElementException:
            return ""
        except Exception as ex:
            print("Error at method __find_external_link: {}".format(ex))
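
Like Scraping_utilities, every Finder method is name-mangled, which is how the scraper in the next file calls them. A minimal sketch of pulling a few fields from the current results page (assumes a driver already showing a search page):

    tweets = Finder._Finder__fetch_all_tweets(driver)
    for tweet in tweets[:3]:
        print(Finder._Finder__find_content(tweet),
              Finder._Finder__find_like(tweet),
              Finder._Finder__find_timestamp(tweet))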
--------------------------------------------------------------------------------
/src/twitter_scraper_without_api/twitter_scraper.py:
--------------------------------------------------------------------------------
import re
import time
from datetime import datetime, timedelta
from urllib.parse import quote

import dateutil.parser
import pandas as pd
import pytz

from .element_finder import Finder
from .driver_initialisation import DriverInitilizer
from .driver_utils import Utilities


class TwitterScraper:

    def __init__(self, keyword):
        self.keyword = keyword
        self.since = self.set_since()
        self.until = self.set_until()
        self.url = "https://twitter.com/search?q={}%20until%3A{}%20since%3A{}&src=typed_query&f=live".format(
            quote(keyword), self.until, self.since)
        self.driver = self.setup_driver()
        self.retry = 10
        self.data = {}
        self._last_n_mins = 1

    def __repr__(self):
        return "TwitterScraper({!r})".format(self.keyword)

    def __str__(self):
        return "TwitterScraper for keyword '{}'".format(self.keyword)

    @property
    def last_n_mins(self):
        return self._last_n_mins

    @last_n_mins.setter
    def last_n_mins(self, value):
        if str(value).isnumeric():
            self._last_n_mins = int(value)
        else:
            print("last_n_mins must be a numeric value in minutes - falling back to the default of 1 minute")
            self._last_n_mins = 1

    @staticmethod
    def str_to_datetime(str_datetime):
        """Convert Twitter's ISO timestamp (UTC) to Pacific/Auckland time."""
        datetime_old_zone = dateutil.parser.isoparse(str_datetime)
        nz_datetime_time = datetime_old_zone.replace(tzinfo=pytz.utc).astimezone(pytz.timezone("Pacific/Auckland"))
        return nz_datetime_time

    @staticmethod
    def convert_json_to_dataframe(json_data):
        frames = [pd.json_normalize(json_data[key]) for key in json_data]
        return pd.concat(frames)

    def set_since(self):
        yesterday = datetime.now() - timedelta(days=1)
        return yesterday.strftime('%Y-%m-%d')

    def set_until(self):
        tomorrow = datetime.now() + timedelta(days=1)
        return tomorrow.strftime('%Y-%m-%d')

    def __check_tweets_presence(self, tweet_list):
        if len(tweet_list) <= 0:
            self.retry -= 1

    def __check_retry(self):
        return self.retry <= 0

    def setup_driver(self):
        firefox = DriverInitilizer()
        driver = firefox.set_driver_for_browser()
        driver.set_page_load_timeout(6000)
        driver.get(self.url)
        return driver

    def obtain_info_from_tweet(self, tweet):
        name = Finder._Finder__find_name_from_post(tweet)
        status, tweet_url = Finder._Finder__find_status(tweet)
        replies = Finder._Finder__find_replies(tweet)
        retweets = Finder._Finder__find_shares(tweet)
        username = tweet_url.split("/")[3]
        status = status[-1]
        is_retweet = Finder._Finder__is_retweet(tweet)
        posted_time = Finder._Finder__find_timestamp(tweet)
        posted_time = TwitterScraper.str_to_datetime(posted_time)
        content = Finder._Finder__find_content(tweet)
        likes = Finder._Finder__find_like(tweet)
        images = Finder._Finder__find_images(tweet)
        videos = Finder._Finder__find_videos(tweet)
        hashtags = re.findall(r"#(\w+)", content)
        mentions = re.findall(r"@(\w+)", content)
        profile_picture = "https://twitter.com/{}/photo".format(username)
        link = Finder._Finder__find_external_link(tweet)
        return link, profile_picture, mentions, hashtags,\
            videos, images, likes, content, posted_time,\
            is_retweet, status, username, retweets, replies,\
            tweet_url, name

    def update_tweet_data(self, link, profile_picture, mentions, hashtags,
                          videos, images, likes, content, posted_time,
                          is_retweet, status, username, retweets, replies,
                          tweet_url, name):
        self.data[status] = {
            "tweet_id": status,
            "username": username,
            "name": name,
            "profile_picture": profile_picture,
            "replies": replies,
            "retweets": retweets,
            "likes": likes,
            "is_retweet": is_retweet,
            "posted_time": posted_time,
            "content": content,
            "hashtags": hashtags,
            "mentions": mentions,
            "images": images,
            "videos": videos,
            "tweet_url": tweet_url,
            "link": link
        }

    def fetch_data(self):
        already_fetched_posts = []
        time.sleep(4)
        present_tweets = Finder._Finder__fetch_all_tweets(self.driver)
        self.__check_tweets_presence(present_tweets)
        already_fetched_posts.extend(present_tweets)
        latest_time_now = datetime.now(pytz.timezone("Pacific/Auckland"))
        ref_date_time = latest_time_now - timedelta(minutes=self._last_n_mins)

        while (latest_time_now - ref_date_time).total_seconds() > 0:

            for tweet in present_tweets:
                link, profile_picture, mentions, hashtags, \
                    videos, images, likes, content, posted_time, \
                    is_retweet, status, username, retweets, replies, \
                    tweet_url, name = self.obtain_info_from_tweet(tweet)
                self.update_tweet_data(link, profile_picture, mentions, hashtags,
                                       videos, images, likes, content, posted_time,
                                       is_retweet, status, username, retweets, replies,
                                       tweet_url, name)

                # track the oldest timestamp seen so far, so the loop stops
                # once tweets older than the requested window appear
                if (posted_time - latest_time_now).total_seconds() < 0:
                    latest_time_now = posted_time

            Utilities._Utilities__scroll_down(self.driver)
            Utilities._Utilities__wait_until_completion(self.driver)
            Utilities._Utilities__wait_until_tweets_appear(self.driver)
            present_tweets = Finder._Finder__fetch_all_tweets(self.driver)
            present_tweets = [post for post in present_tweets if post not in already_fetched_posts]
            self.__check_tweets_presence(present_tweets)
            already_fetched_posts.extend(present_tweets)
            if self.__check_retry() is True:
                break
        self.driver.quit()

    def store_data(self, format='Json'):
        if format.lower() == 'json':
            return self.data
        elif format.lower() == 'dataframe':
            return TwitterScraper.convert_json_to_dataframe(self.data)
        elif format.lower() == 'csv':
            df = TwitterScraper.convert_json_to_dataframe(self.data)
            return df.to_csv()
        else:
            print("store_data does not support that format; use 'json', 'dataFrame' or 'csv'")

--------------------------------------------------------------------------------
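
Putting the pieces together, an end-to-end sketch that scrapes roughly five minutes of tweets and writes them to disk (the output file name is illustrative):

    from twitter_scraper_without_api import TwitterScraper

    twitter = TwitterScraper('bitcoin')
    twitter.last_n_mins = 5
    twitter.fetch_data()
    df = twitter.store_data('dataFrame')
    df.to_csv('bitcoin_tweets.csv', index=False)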