├── .env.example
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── img
│   └── advanced-search-01.png
├── main.ipynb
├── requirements.txt
├── sample-command.txt
└── scraper
    ├── __init__.py
    ├── __main__.py
    ├── progress.py
    ├── scroller.py
    ├── tweet.py
    └── twitter_scraper.py

/.env.example:
--------------------------------------------------------------------------------
1 | TWITTER_USERNAME=# Your Twitter Handle
2 | TWITTER_PASSWORD=# Your Twitter Password
3 | HEADLESS=# Headless browser option (use "yes" or "no")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.csv
6 | 
7 | # C extensions
8 | *.so
9 | 
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 | 
55 | # Translations
56 | *.mo
57 | *.pot
58 | 
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 | 
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 | 
69 | # Scrapy stuff:
70 | .scrapy
71 | 
72 | # Sphinx documentation
73 | docs/_build/
74 | 
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 | 
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 | 
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 | 
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 | 
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 | 
98 | # poetry
99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more
101 | # commonly ignored for libraries.
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 | 
105 | # pdm
106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | # in version control.
110 | # https://pdm.fming.dev/#use-with-ide
111 | .pdm.toml
112 | 
113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114 | __pypackages__/
115 | 
116 | # Celery stuff
117 | celerybeat-schedule
118 | celerybeat.pid
119 | 
120 | # SageMath parsed files
121 | *.sage.py
122 | 
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 | 
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 | 
136 | # Rope project settings
137 | .ropeproject
138 | 
139 | # mkdocs documentation
140 | /site
141 | 
142 | # mypy
143 | .mypy_cache/
144 | .dmypy.json
145 | dmypy.json
146 | 
147 | # Pyre type checker
148 | .pyre/
149 | 
150 | # pytype static type analyzer
151 | .pytype/
152 | 
153 | # Cython debug symbols
154 | cython_debug/
155 | 
156 | # PyCharm
157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159 | # and can be added to the global gitignore or merged into this file. For a more nuclear
160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161 | #.idea/
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to selenium-twitter-scraper
2 | 
3 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
4 | 
5 | - Reporting a bug
6 | - Discussing the current state of the code
7 | - Submitting a fix
8 | - Proposing new features
9 | - Becoming a maintainer
10 | 
11 | ## We Develop with GitHub
12 | 
13 | We use GitHub to host code, to track issues and feature requests, and to accept pull requests.
14 | 
15 | ## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
16 | 
17 | Pull requests are the best way to propose changes to the codebase (we use [GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)). We actively welcome your pull requests:
18 | 
19 | 1. Fork the repo and create your branch from `master`.
20 | 2. If you've added code that should be tested, add tests.
21 | 3. Ensure the test suite passes.
22 | 4. Make sure your code lints.
23 | 5. Issue that pull request!
24 | 
25 | ## Any contributions you make will be under the Apache License Version 2.0
26 | 
27 | In short, when you submit code changes, your submissions are understood to be under the same [Apache License Version 2.0](https://choosealicense.com/licenses/apache-2.0/) that covers the project. Feel free to contact the maintainers if that's a concern.
28 | 
29 | ## Report bugs using GitHub's [issues](https://github.com/godkingjay/selenium-twitter-scraper/issues)
30 | 
31 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/godkingjay/selenium-twitter-scraper/issues/new); it's that easy!
32 | 
33 | ## License
34 | 
35 | By contributing, you agree that your contributions will be licensed under the project's Apache License Version 2.0.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "[]"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!) The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # selenium-twitter-scraper
2 | 
3 | ## Setup
4 | 
5 | 1. Install dependencies
6 | 
7 | ```bash
8 | pip install -r requirements.txt
9 | ```
10 | 
11 | ## Authentication Options
12 | 
13 | ### Using Environment Variables
14 | 
15 | 1. Rename `.env.example` to `.env`.
16 | 
17 | 2. Open `.env` and update the environment variables.
18 | 
19 | ```bash
20 | TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
21 | TWITTER_PASSWORD=# Your Twitter Password
22 | HEADLESS=# Headless browser option (use "yes" or "no")
23 | ```
24 | 
25 | ### Authentication in Terminal
26 | 
27 | - Add a `username` and `password` to the command line.
28 | 
29 | ```bash
30 | python scraper --user=@elonmusk --password=password123
31 | ```
32 | 
33 | ### No Authentication Provided
34 | 
35 | - If you don't specify a username and password, the program will
36 |   prompt you to enter them.
37 | 
38 | ```bash
39 | Twitter Username: @username
40 | Password: password123
41 | ```
42 | 
43 | ---
44 | 
45 | **_Authentication Sequence Priority_**
46 | 
47 | ```bash
48 | 1. Authentication provided in terminal.
49 | 2. Authentication provided in environment variables.
50 | ```
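This priority falls out of how `scraper/__main__.py` wires up argparse: values from the environment (loaded with `python-dotenv`) only fill the argument defaults, so anything passed in the terminal wins, and whatever is still missing is prompted for interactively. A condensed sketch of that logic:

```python
import argparse
import getpass
import os

from dotenv import load_dotenv

load_dotenv()  # reads TWITTER_USERNAME / TWITTER_PASSWORD from .env into the environment

parser = argparse.ArgumentParser()
# Terminal arguments take priority automatically: the environment only supplies the default.
parser.add_argument("--user", type=str, default=os.getenv("TWITTER_USERNAME"))
parser.add_argument("--password", type=str, default=os.getenv("TWITTER_PASSWORD"))
args = parser.parse_args()

# Fall back to an interactive prompt if neither source provided a value.
username = args.user if args.user is not None else input("Twitter Username: ")
password = args.password if args.password is not None else getpass.getpass("Enter Password: ")
```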
51 | 
52 | ---
53 | 
54 | ## Usage
55 | 
56 | - Show Help
57 | 
58 | ```bash
59 | python scraper --help
60 | ```
61 | 
62 | - Basic usage
63 | 
64 | ```bash
65 | python scraper
66 | ```
67 | 
68 | - Set the maximum number of tweets to scrape. Defaults to `50`.
69 | 
70 | ```bash
71 | python scraper --tweets=500 # Scrape 500 Tweets
72 | ```
73 | 
74 | - Options and Arguments
75 | 
76 | ```bash
77 | usage: python scraper [option] ... [arg] ...
78 | 
79 | authentication options    description
80 | --user                  : Your Twitter account handle.
81 |                           e.g.
82 |                             --user=@username
83 | 
84 | --password              : Your Twitter account password.
85 |                           e.g.
86 |                             --password=password123
87 | 
88 | options:                  description
89 | -t, --tweets            : Number of tweets to scrape (default: 50).
90 |                           e.g.
91 |                             -t 500
92 |                             --tweets=500
93 | 
94 | -u, --username          : Twitter username.
95 |                           Scrape tweets from a user's profile.
96 |                           e.g.
97 |                             -u elonmusk
98 |                             --username=@elonmusk
99 | 
100 | -ht, --hashtag          : Twitter hashtag.
101 |                           Scrape tweets from a hashtag.
102 |                           e.g.
103 |                             -ht javascript
104 |                             --hashtag=javascript
105 | 
106 | -l, --list              : List ID. Scrape tweets from a list. The
107 |                           ID is taken from the x.com/list/... URL.
108 |                           e.g.
109 |                             -l "1324132413151"
110 |                             --list "1324132413151"
111 | 
112 | -q, --query             : Twitter query or search.
113 |                           Scrape tweets from a query or search.
114 |                           e.g.
115 |                             -q "Philippine Marites"
116 |                             --query="Jak Roberto anti selos"
117 | 
118 | -a, --add               : Additional data to scrape and
119 |                           save in the .csv file.
120 | 
121 |                           values:
122 |                             pd - poster's ID, followers, and following
123 | 
124 |                           e.g.
125 |                             -a "pd"
126 |                             --add="pd"
127 | 
128 |                           NOTE: Values must be separated by commas.
129 | 
130 | --latest                : Twitter latest tweets (default: True).
131 |                           Note: Only for hashtag-based
132 |                           and query-based scraping.
133 |                           usage:
134 |                             python scraper -t 500 -ht=python --latest
135 | 
136 | --top                   : Twitter top tweets (default: False).
137 |                           Note: Only for hashtag-based
138 |                           and query-based scraping.
139 |                           usage:
140 |                             python scraper -t 500 -ht=python --top
141 | 
142 | -ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
143 |                           (will scrape until no more tweets are available).
144 | ```
145 | 
146 | ### Sample Scraping Commands
147 | 
148 | - **Custom Limit Scraping**
149 | 
150 |   ```bash
151 |   python scraper -t 500
152 |   ```
153 | 
154 | - **User Profile Scraping**
155 | 
156 |   ```bash
157 |   python scraper -t 100 -u elonmusk
158 |   ```
159 | 
160 | - **Hashtag Scraping**
161 | 
162 |   - Latest
163 | 
164 |     ```bash
165 |     python scraper -t 100 -ht python --latest
166 |     ```
167 | 
168 |   - Top
169 | 
170 |     ```bash
171 |     python scraper -t 100 -ht python --top
172 |     ```
173 | 
174 | - **Query or Search Scraping**
175 |   _(Also works with Twitter's advanced search.)_
176 | 
177 |   - Latest
178 | 
179 |     ```bash
180 |     python scraper -t 100 -q "Jak Roberto Anti Selos" --latest
181 |     ```
182 | 
183 |   - Top
184 | 
185 |     ```bash
186 |     python scraper -t 100 -q "International News" --top
187 |     ```
188 | 
189 | - **Advanced Search Scraping**
190 | 
191 |   - For tweets mentioning `@elonmusk`:
192 | 
193 |     ```bash
194 |     python scraper --query="(@elonmusk)"
195 |     ```
196 | 
197 |   - For tweets that mention `@elonmusk` with at least `1000` replies from `January 01, 2020 - August 31, 2023`:
198 | 
199 |     ```bash
200 |     python scraper --query="(@elonmusk) min_replies:1000 until:2023-08-31 since:2020-01-01"
201 |     ```
202 | 
203 |   - To build a more complex `Advanced Search`, set up the query with Twitter's Advanced Search and copy the resulting query string into the program:
204 |     - **[Twitter Advanced Search](https://twitter.com/search-advanced)**
205 |       [![Image](./img/advanced-search-01.png)](./img/advanced-search-01.png)
206 | 
207 | - **Scrape Additional Data**
208 | 
209 |   ```bash
210 |   python scraper --add="pd"
211 |   ```
212 | 
213 |   | Values | Description                                        |
214 |   | :----: | :------------------------------------------------- |
215 |   |   pd   | Tweet poster's ID, followers, and following count. |
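With `--add="pd"`, these extra columns are simply appended to the standard ones in the saved CSV, so they can be pulled out with pandas after a run. A minimal sketch — the file name below is illustrative; the scraper writes to `./tweets/<timestamp>_tweets_1-<count>.csv`:

```python
import pandas as pd

# Illustrative file name; use the path the scraper prints after saving.
df = pd.read_csv("./tweets/2024-01-19_12-00-00_tweets_1-50.csv")

# "Tweeter ID", "Following", and "Followers" are only present when --add="pd" was used.
print(df[["Handle", "Tweeter ID", "Following", "Followers"]].head())
```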
216 | 
217 | ---
--------------------------------------------------------------------------------
/img/advanced-search-01.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/godkingjay/selenium-twitter-scraper/62d8ceb2f39a533d68965f309371efeeb9c676bd/img/advanced-search-01.png
--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "attachments": {},
5 |    "cell_type": "markdown",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Twitter Scraper using Selenium\n",
9 |     "\n",
10 |     "Scraper for Twitter tweets using Selenium. It can scrape tweets from:\n",
11 |     "- Home/News Feed\n",
12 |     "- User Profile Tweets\n",
13 |     "- Query or Search Tweets\n",
14 |     "- Hashtag Tweets\n",
15 |     "- Advanced Search Tweets"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "metadata": {},
22 |    "outputs": [],
23 |    "source": [
24 |     "import os\n",
25 |     "import sys\n",
26 |     "import pandas as pd\n",
27 |     "\n",
28 |     "from datetime import datetime\n",
29 |     "from fake_headers import Headers\n",
30 |     "from time import sleep\n",
31 |     "from selenium import webdriver\n",
32 |     "from selenium.webdriver import Chrome\n",
33 |     "from selenium.webdriver.common.keys import Keys\n",
34 |     "from selenium.common.exceptions import (\n",
35 |     "    NoSuchElementException,\n",
36 |     "    StaleElementReferenceException,\n",
37 |     "    WebDriverException,\n",
38 |     ")\n",
39 |     "from selenium.webdriver.common.action_chains import ActionChains\n",
40 |     "\n",
41 |     "from selenium.webdriver.chrome.webdriver import WebDriver\n",
42 |     "from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
43 |     "from selenium.webdriver.chrome.service import Service as ChromeService\n",
44 |     "\n",
45 |     "from webdriver_manager.chrome import ChromeDriverManager"
46 |    ]
47 |   },
48 |   {
49 |    "attachments": {},
50 |    "cell_type": "markdown",
51 |    "metadata": {},
52 |    "source": [
53 |     "# Progress Class\n",
54 |     "\n",
55 |     "Class for the progress of the scraper instance."
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": null,
61 |    "metadata": {},
62 |    "outputs": [],
63 |    "source": [
64 |     "class Progress:\n",
65 |     "    def __init__(self, current, total) -> None:\n",
66 |     "        self.current = current\n",
67 |     "        self.total = total\n",
68 |     "        pass\n",
69 |     "\n",
70 |     "    def print_progress(self, current) -> None:\n",
71 |     "        self.current = current\n",
72 |     "        progress = current / self.total\n",
73 |     "        bar_length = 40\n",
74 |     "        progress_bar = (\n",
75 |     "            \"[\"\n",
76 |     "            + \"=\" * int(bar_length * progress)\n",
77 |     "            + \"-\" * (bar_length - int(bar_length * progress))\n",
78 |     "            + \"]\"\n",
79 |     "        )\n",
80 |     "        sys.stdout.write(\n",
81 |     "            \"\\rProgress: {} {:.2%} {} of {}\".format(\n",
82 |     "                progress_bar, progress, current, self.total\n",
83 |     "            )\n",
84 |     "        )\n",
85 |     "        sys.stdout.flush()\n"
86 |    ]
87 |   },
88 |   {
89 |    "attachments": {},
90 |    "cell_type": "markdown",
91 |    "metadata": {},
92 |    "source": [
93 |     "# Scroller Class\n",
94 |     "\n",
95 |     "Class that controls scrolling of the web page."
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "class Scroller:\n", 105 | " def __init__(self, driver) -> None:\n", 106 | " self.driver = driver\n", 107 | " self.current_position = 0\n", 108 | " self.last_position = driver.execute_script(\"return window.pageYOffset;\")\n", 109 | " self.scrolling = True\n", 110 | " self.scroll_count = 0\n", 111 | " pass\n", 112 | "\n", 113 | " def reset(self) -> None:\n", 114 | " self.current_position = 0\n", 115 | " self.last_position = self.driver.execute_script(\"return window.pageYOffset;\")\n", 116 | " self.scroll_count = 0\n", 117 | " pass\n", 118 | "\n", 119 | " def scroll_to_top(self) -> None:\n", 120 | " self.driver.execute_script(\"window.scrollTo(0, 0);\")\n", 121 | " pass\n", 122 | "\n", 123 | " def scroll_to_bottom(self) -> None:\n", 124 | " self.driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n", 125 | " pass\n", 126 | "\n", 127 | " def update_scroll_position(self) -> None:\n", 128 | " self.current_position = self.driver.execute_script(\"return window.pageYOffset;\")\n", 129 | " pass\n" 130 | ] 131 | }, 132 | { 133 | "attachments": {}, 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "# Tweet Class\n", 138 | "\n", 139 | "Object for the tweet. Including its data." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "class Tweet:\n", 149 | " def __init__(\n", 150 | " self,\n", 151 | " card: WebDriver,\n", 152 | " driver: WebDriver,\n", 153 | " actions: ActionChains,\n", 154 | " scrape_poster_details=False\n", 155 | " ) -> None:\n", 156 | " self.card = card\n", 157 | " self.error = False\n", 158 | " self.tweet = None\n", 159 | "\n", 160 | " try:\n", 161 | " self.user = card.find_element(\n", 162 | " \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n", 163 | " ).text\n", 164 | " except NoSuchElementException:\n", 165 | " self.error = True\n", 166 | " self.user = \"skip\"\n", 167 | "\n", 168 | " try:\n", 169 | " self.handle = card.find_element(\n", 170 | " \"xpath\", './/span[contains(text(), \"@\")]'\n", 171 | " ).text\n", 172 | " except NoSuchElementException:\n", 173 | " self.error = True\n", 174 | " self.handle = \"skip\"\n", 175 | "\n", 176 | " try:\n", 177 | " self.date_time = card.find_element(\"xpath\", \".//time\").get_attribute(\n", 178 | " \"datetime\"\n", 179 | " )\n", 180 | "\n", 181 | " if self.date_time is not None:\n", 182 | " self.is_ad = False\n", 183 | " except NoSuchElementException:\n", 184 | " self.is_ad = True\n", 185 | " self.error = True\n", 186 | " self.date_time = \"skip\"\n", 187 | " \n", 188 | " if self.error:\n", 189 | " return\n", 190 | "\n", 191 | " try:\n", 192 | " card.find_element(\n", 193 | " \"xpath\", './/*[local-name()=\"svg\" and @data-testid=\"icon-verified\"]'\n", 194 | " )\n", 195 | "\n", 196 | " self.verified = True\n", 197 | " except NoSuchElementException:\n", 198 | " self.verified = False\n", 199 | "\n", 200 | " self.content = \"\"\n", 201 | " contents = card.find_elements(\n", 202 | " \"xpath\",\n", 203 | " '(.//div[@data-testid=\"tweetText\"])[1]/span | (.//div[@data-testid=\"tweetText\"])[1]/a',\n", 204 | " )\n", 205 | "\n", 206 | " for index, content in enumerate(contents):\n", 207 | " self.content += content.text\n", 208 | "\n", 209 | " try:\n", 210 | " self.reply_cnt = card.find_element(\n", 211 | " \"xpath\", 
'.//div[@data-testid=\"reply\"]//span'\n", 212 | " ).text\n", 213 | " \n", 214 | " if self.reply_cnt == \"\":\n", 215 | " self.reply_cnt = \"0\"\n", 216 | " except NoSuchElementException:\n", 217 | " self.reply_cnt = \"0\"\n", 218 | "\n", 219 | " try:\n", 220 | " self.retweet_cnt = card.find_element(\n", 221 | " \"xpath\", './/div[@data-testid=\"retweet\"]//span'\n", 222 | " ).text\n", 223 | " \n", 224 | " if self.retweet_cnt == \"\":\n", 225 | " self.retweet_cnt = \"0\"\n", 226 | " except NoSuchElementException:\n", 227 | " self.retweet_cnt = \"0\"\n", 228 | "\n", 229 | " try:\n", 230 | " self.like_cnt = card.find_element(\n", 231 | " \"xpath\", './/div[@data-testid=\"like\"]//span'\n", 232 | " ).text\n", 233 | " \n", 234 | " if self.like_cnt == \"\":\n", 235 | " self.like_cnt = \"0\"\n", 236 | " except NoSuchElementException:\n", 237 | " self.like_cnt = \"0\"\n", 238 | "\n", 239 | " try:\n", 240 | " self.analytics_cnt = card.find_element(\n", 241 | " \"xpath\", './/a[contains(@href, \"/analytics\")]//span'\n", 242 | " ).text\n", 243 | " \n", 244 | " if self.analytics_cnt == \"\":\n", 245 | " self.analytics_cnt = \"0\"\n", 246 | " except NoSuchElementException:\n", 247 | " self.analytics_cnt = \"0\"\n", 248 | "\n", 249 | " try:\n", 250 | " self.tags = card.find_elements(\n", 251 | " \"xpath\",\n", 252 | " './/a[contains(@href, \"src=hashtag_click\")]',\n", 253 | " )\n", 254 | "\n", 255 | " self.tags = [tag.text for tag in self.tags]\n", 256 | " except NoSuchElementException:\n", 257 | " self.tags = []\n", 258 | " \n", 259 | " try:\n", 260 | " self.mentions = card.find_elements(\n", 261 | " \"xpath\",\n", 262 | " '(.//div[@data-testid=\"tweetText\"])[1]//a[contains(text(), \"@\")]',\n", 263 | " )\n", 264 | "\n", 265 | " self.mentions = [mention.text for mention in self.mentions]\n", 266 | " except NoSuchElementException:\n", 267 | " self.mentions = []\n", 268 | " \n", 269 | " try:\n", 270 | " raw_emojis = card.find_elements(\n", 271 | " \"xpath\",\n", 272 | " '(.//div[@data-testid=\"tweetText\"])[1]/img[contains(@src, \"emoji\")]',\n", 273 | " )\n", 274 | " \n", 275 | " self.emojis = [emoji.get_attribute(\"alt\").encode(\"unicode-escape\").decode(\"ASCII\") for emoji in raw_emojis]\n", 276 | " except NoSuchElementException:\n", 277 | " self.emojis = []\n", 278 | " \n", 279 | " try:\n", 280 | " self.profile_img = card.find_element(\n", 281 | " \"xpath\", './/div[@data-testid=\"Tweet-User-Avatar\"]//img'\n", 282 | " ).get_attribute(\"src\")\n", 283 | " except NoSuchElementException:\n", 284 | " self.profile_img = \"\"\n", 285 | " \n", 286 | " try:\n", 287 | " self.tweet_link = self.card.find_element(\n", 288 | " \"xpath\",\n", 289 | " \".//a[contains(@href, '/status/')]\",\n", 290 | " ).get_attribute(\"href\")\n", 291 | " self.tweet_id = str(self.tweet_link.split(\"/\")[-1])\n", 292 | " except NoSuchElementException:\n", 293 | " self.tweet_link = \"\"\n", 294 | " self.tweet_id = \"\"\n", 295 | " \n", 296 | " self.following_cnt = \"0\"\n", 297 | " self.followers_cnt = \"0\"\n", 298 | " self.user_id = None\n", 299 | " \n", 300 | " if scrape_poster_details:\n", 301 | " el_name = card.find_element(\n", 302 | " \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n", 303 | " )\n", 304 | " \n", 305 | " ext_hover_card = False\n", 306 | " ext_user_id = False\n", 307 | " ext_following = False\n", 308 | " ext_followers = False\n", 309 | " hover_attempt = 0\n", 310 | " \n", 311 | " while not ext_hover_card or not ext_user_id or not ext_following or not ext_followers:\n", 312 | " try:\n", 313 | " 
actions.move_to_element(el_name).perform()\n",
314 |     "\n",
315 |     "                    hover_card = driver.find_element(\n",
316 |     "                        \"xpath\",\n",
317 |     "                        '//div[@data-testid=\"hoverCardParent\"]'\n",
318 |     "                    )\n",
319 |     "\n",
320 |     "                    ext_hover_card = True\n",
321 |     "\n",
322 |     "                    while not ext_user_id:\n",
323 |     "                        try:\n",
324 |     "                            raw_user_id = hover_card.find_element(\n",
325 |     "                                \"xpath\",\n",
326 |     "                                '(.//div[contains(@data-testid, \"-follow\")]) | (.//div[contains(@data-testid, \"-unfollow\")])'\n",
327 |     "                            ).get_attribute(\"data-testid\")\n",
328 |     "\n",
329 |     "                            if raw_user_id == \"\":\n",
330 |     "                                self.user_id = None\n",
331 |     "                            else:\n",
332 |     "                                self.user_id = str(raw_user_id.split(\"-\")[0])\n",
333 |     "\n",
334 |     "                            ext_user_id = True\n",
335 |     "                        except NoSuchElementException:\n",
336 |     "                            continue\n",
337 |     "                        except StaleElementReferenceException:\n",
338 |     "                            self.error = True\n",
339 |     "                            return\n",
340 |     "\n",
341 |     "                    while not ext_following:\n",
342 |     "                        try:\n",
343 |     "                            self.following_cnt = hover_card.find_element(\n",
344 |     "                                \"xpath\",\n",
345 |     "                                './/a[contains(@href, \"/following\")]//span'\n",
346 |     "                            ).text\n",
347 |     "\n",
348 |     "                            if self.following_cnt == \"\":\n",
349 |     "                                self.following_cnt = \"0\"\n",
350 |     "\n",
351 |     "                            ext_following = True\n",
352 |     "                        except NoSuchElementException:\n",
353 |     "                            continue\n",
354 |     "                        except StaleElementReferenceException:\n",
355 |     "                            self.error = True\n",
356 |     "                            return\n",
357 |     "\n",
358 |     "                    while not ext_followers:\n",
359 |     "                        try:\n",
360 |     "                            self.followers_cnt = hover_card.find_element(\n",
361 |     "                                \"xpath\",\n",
362 |     "                                './/a[contains(@href, \"/verified_followers\")]//span'\n",
363 |     "                            ).text\n",
364 |     "\n",
365 |     "                            if self.followers_cnt == \"\":\n",
366 |     "                                self.followers_cnt = \"0\"\n",
367 |     "\n",
368 |     "                            ext_followers = True\n",
369 |     "                        except NoSuchElementException:\n",
370 |     "                            continue\n",
371 |     "                        except StaleElementReferenceException:\n",
372 |     "                            self.error = True\n",
373 |     "                            return\n",
374 |     "                except NoSuchElementException:\n",
375 |     "                    if hover_attempt == 3:\n",
376 |     "                        self.error = True\n",
377 |     "                        return\n",
378 |     "                    hover_attempt += 1\n",
379 |     "                    sleep(0.5)\n",
380 |     "                    continue\n",
381 |     "                except StaleElementReferenceException:\n",
382 |     "                    self.error = True\n",
383 |     "                    return\n",
384 |     "\n",
385 |     "            if ext_hover_card and ext_following and ext_followers:\n",
386 |     "                actions.reset_actions()\n",
387 |     "\n",
388 |     "        self.tweet = (\n",
389 |     "            self.user,\n",
390 |     "            self.handle,\n",
391 |     "            self.date_time,\n",
392 |     "            self.verified,\n",
393 |     "            self.content,\n",
394 |     "            self.reply_cnt,\n",
395 |     "            self.retweet_cnt,\n",
396 |     "            self.like_cnt,\n",
397 |     "            self.analytics_cnt,\n",
398 |     "            self.tags,\n",
399 |     "            self.mentions,\n",
400 |     "            self.emojis,\n",
401 |     "            self.profile_img,\n",
402 |     "            self.tweet_link,\n",
403 |     "            self.tweet_id,\n",
404 |     "            self.user_id,\n",
405 |     "            self.following_cnt,\n",
406 |     "            self.followers_cnt,\n",
407 |     "        )\n",
408 |     "\n",
409 |     "        pass\n"
410 |    ]
411 |   },
412 |   {
413 |    "attachments": {},
414 |    "cell_type": "markdown",
415 |    "metadata": {},
416 |    "source": [
417 |     "# Twitter Scraper Class\n",
418 |     "\n",
419 |     "Class for the Twitter Scraper."
420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n", 429 | "\n", 430 | "class Twitter_Scraper:\n", 431 | " def __init__(\n", 432 | " self,\n", 433 | " username,\n", 434 | " password,\n", 435 | " max_tweets=50,\n", 436 | " scrape_username=None,\n", 437 | " scrape_hashtag=None,\n", 438 | " scrape_query=None,\n", 439 | " scrape_poster_details=False,\n", 440 | " scrape_latest=True,\n", 441 | " scrape_top=False,\n", 442 | " ):\n", 443 | " print(\"Initializing Twitter Scraper...\")\n", 444 | " self.username = username\n", 445 | " self.password = password\n", 446 | " self.interrupted = False\n", 447 | " self.tweet_ids = set()\n", 448 | " self.data = []\n", 449 | " self.tweet_cards = []\n", 450 | " self.scraper_details = {\n", 451 | " \"type\": None,\n", 452 | " \"username\": None,\n", 453 | " \"hashtag\": None,\n", 454 | " \"query\": None,\n", 455 | " \"tab\": None,\n", 456 | " \"poster_details\": False,\n", 457 | " }\n", 458 | " self.max_tweets = max_tweets\n", 459 | " self.progress = Progress(0, max_tweets)\n", 460 | " self.router = self.go_to_home\n", 461 | " self.driver = self._get_driver()\n", 462 | " self.actions = ActionChains(self.driver)\n", 463 | " self.scroller = Scroller(self.driver)\n", 464 | " self._config_scraper(\n", 465 | " max_tweets,\n", 466 | " scrape_username,\n", 467 | " scrape_hashtag,\n", 468 | " scrape_query,\n", 469 | " scrape_latest,\n", 470 | " scrape_top,\n", 471 | " scrape_poster_details,\n", 472 | " )\n", 473 | "\n", 474 | " def _config_scraper(\n", 475 | " self,\n", 476 | " max_tweets=50,\n", 477 | " scrape_username=None,\n", 478 | " scrape_hashtag=None,\n", 479 | " scrape_query=None,\n", 480 | " scrape_latest=True,\n", 481 | " scrape_top=False,\n", 482 | " scrape_poster_details=False,\n", 483 | " ):\n", 484 | " self.tweet_ids = set()\n", 485 | " self.data = []\n", 486 | " self.tweet_cards = []\n", 487 | " self.max_tweets = max_tweets\n", 488 | " self.progress = Progress(0, max_tweets)\n", 489 | " self.scraper_details = {\n", 490 | " \"type\": None,\n", 491 | " \"username\": scrape_username,\n", 492 | " \"hashtag\": str(scrape_hashtag).replace(\"#\", \"\")\n", 493 | " if scrape_hashtag is not None\n", 494 | " else None,\n", 495 | " \"query\": scrape_query,\n", 496 | " \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n", 497 | " \"poster_details\": scrape_poster_details,\n", 498 | " }\n", 499 | " self.router = self.go_to_home\n", 500 | " self.scroller = Scroller(self.driver)\n", 501 | "\n", 502 | " if scrape_username is not None:\n", 503 | " self.scraper_details[\"type\"] = \"Username\"\n", 504 | " self.router = self.go_to_profile\n", 505 | " elif scrape_hashtag is not None:\n", 506 | " self.scraper_details[\"type\"] = \"Hashtag\"\n", 507 | " self.router = self.go_to_hashtag\n", 508 | " elif scrape_query is not None:\n", 509 | " self.scraper_details[\"type\"] = \"Query\"\n", 510 | " self.router = self.go_to_search\n", 511 | " else:\n", 512 | " self.scraper_details[\"type\"] = \"Home\"\n", 513 | " self.router = self.go_to_home\n", 514 | " pass\n", 515 | "\n", 516 | " def _get_driver(self):\n", 517 | " print(\"Setup WebDriver...\")\n", 518 | " header = Headers().generate()[\"User-Agent\"]\n", 519 | "\n", 520 | " browser_option = ChromeOptions()\n", 521 | " browser_option.add_argument(\"--no-sandbox\")\n", 522 | " browser_option.add_argument(\"--disable-dev-shm-usage\")\n", 523 | " 
browser_option.add_argument(\"--ignore-certificate-errors\")\n", 524 | " browser_option.add_argument(\"--disable-gpu\")\n", 525 | " browser_option.add_argument(\"--log-level=3\")\n", 526 | " browser_option.add_argument(\"--disable-notifications\")\n", 527 | " browser_option.add_argument(\"--disable-popup-blocking\")\n", 528 | " browser_option.add_argument(\"--user-agent={}\".format(header))\n", 529 | "\n", 530 | " # For Hiding Browser\n", 531 | " browser_option.add_argument(\"--headless\")\n", 532 | "\n", 533 | " try:\n", 534 | " print(\"Initializing ChromeDriver...\")\n", 535 | " driver = webdriver.Chrome(\n", 536 | " options=browser_option,\n", 537 | " )\n", 538 | "\n", 539 | " print(\"WebDriver Setup Complete\")\n", 540 | " return driver\n", 541 | " except WebDriverException:\n", 542 | " try:\n", 543 | " print(\"Downloading ChromeDriver...\")\n", 544 | " chromedriver_path = ChromeDriverManager().install()\n", 545 | " chrome_service = ChromeService(executable_path=chromedriver_path)\n", 546 | "\n", 547 | " print(\"Initializing ChromeDriver...\")\n", 548 | " driver = webdriver.Chrome(\n", 549 | " service=chrome_service,\n", 550 | " options=browser_option,\n", 551 | " )\n", 552 | "\n", 553 | " print(\"WebDriver Setup Complete\")\n", 554 | " return driver\n", 555 | " except Exception as e:\n", 556 | " print(f\"Error setting up WebDriver: {e}\")\n", 557 | " sys.exit(1)\n", 558 | " pass\n", 559 | "\n", 560 | " def login(self):\n", 561 | " print()\n", 562 | " print(\"Logging in to Twitter...\")\n", 563 | "\n", 564 | " try:\n", 565 | " self.driver.maximize_window()\n", 566 | " self.driver.get(TWITTER_LOGIN_URL)\n", 567 | " sleep(3)\n", 568 | "\n", 569 | " self._input_username()\n", 570 | " self._input_unusual_activity()\n", 571 | " self._input_password()\n", 572 | "\n", 573 | " cookies = self.driver.get_cookies()\n", 574 | "\n", 575 | " auth_token = None\n", 576 | "\n", 577 | " for cookie in cookies:\n", 578 | " if cookie[\"name\"] == \"auth_token\":\n", 579 | " auth_token = cookie[\"value\"]\n", 580 | " break\n", 581 | "\n", 582 | " if auth_token is None:\n", 583 | " raise ValueError(\n", 584 | " \"\"\"This may be due to the following:\n", 585 | "\n", 586 | "- Internet connection is unstable\n", 587 | "- Username is incorrect\n", 588 | "- Password is incorrect\n", 589 | "\"\"\"\n", 590 | " )\n", 591 | "\n", 592 | " print()\n", 593 | " print(\"Login Successful\")\n", 594 | " print()\n", 595 | " except Exception as e:\n", 596 | " print()\n", 597 | " print(f\"Login Failed: {e}\")\n", 598 | " sys.exit(1)\n", 599 | "\n", 600 | " pass\n", 601 | "\n", 602 | " def _input_username(self):\n", 603 | " input_attempt = 0\n", 604 | "\n", 605 | " while True:\n", 606 | " try:\n", 607 | " username = self.driver.find_element(\n", 608 | " \"xpath\", \"//input[@autocomplete='username']\"\n", 609 | " )\n", 610 | "\n", 611 | " username.send_keys(self.username)\n", 612 | " username.send_keys(Keys.RETURN)\n", 613 | " sleep(3)\n", 614 | " break\n", 615 | " except NoSuchElementException:\n", 616 | " input_attempt += 1\n", 617 | " if input_attempt >= 3:\n", 618 | " print()\n", 619 | " print(\n", 620 | " \"\"\"There was an error inputting the username.\n", 621 | "\n", 622 | "It may be due to the following:\n", 623 | "- Internet connection is unstable\n", 624 | "- Username is incorrect\n", 625 | "- Twitter is experiencing unusual activity\"\"\"\n", 626 | " )\n", 627 | " self.driver.quit()\n", 628 | " sys.exit(1)\n", 629 | " else:\n", 630 | " print(\"Re-attempting to input username...\")\n", 631 | " sleep(2)\n", 632 | 
"\n", 633 | " def _input_unusual_activity(self):\n", 634 | " input_attempt = 0\n", 635 | "\n", 636 | " while True:\n", 637 | " try:\n", 638 | " unusual_activity = self.driver.find_element(\n", 639 | " \"xpath\", \"//input[@data-testid='ocfEnterTextTextInput']\"\n", 640 | " )\n", 641 | " unusual_activity.send_keys(self.username)\n", 642 | " unusual_activity.send_keys(Keys.RETURN)\n", 643 | " sleep(3)\n", 644 | " break\n", 645 | " except NoSuchElementException:\n", 646 | " input_attempt += 1\n", 647 | " if input_attempt >= 3:\n", 648 | " break\n", 649 | "\n", 650 | " def _input_password(self):\n", 651 | " input_attempt = 0\n", 652 | "\n", 653 | " while True:\n", 654 | " try:\n", 655 | " password = self.driver.find_element(\n", 656 | " \"xpath\", \"//input[@autocomplete='current-password']\"\n", 657 | " )\n", 658 | "\n", 659 | " password.send_keys(self.password)\n", 660 | " password.send_keys(Keys.RETURN)\n", 661 | " sleep(3)\n", 662 | " break\n", 663 | " except NoSuchElementException:\n", 664 | " input_attempt += 1\n", 665 | " if input_attempt >= 3:\n", 666 | " print()\n", 667 | " print(\n", 668 | " \"\"\"There was an error inputting the password.\n", 669 | "\n", 670 | "It may be due to the following:\n", 671 | "- Internet connection is unstable\n", 672 | "- Password is incorrect\n", 673 | "- Twitter is experiencing unusual activity\"\"\"\n", 674 | " )\n", 675 | " self.driver.quit()\n", 676 | " sys.exit(1)\n", 677 | " else:\n", 678 | " print(\"Re-attempting to input password...\")\n", 679 | " sleep(2)\n", 680 | "\n", 681 | " def go_to_home(self):\n", 682 | " self.driver.get(\"https://twitter.com/home\")\n", 683 | " sleep(3)\n", 684 | " pass\n", 685 | "\n", 686 | " def go_to_profile(self):\n", 687 | " if (\n", 688 | " self.scraper_details[\"username\"] is None\n", 689 | " or self.scraper_details[\"username\"] == \"\"\n", 690 | " ):\n", 691 | " print(\"Username is not set.\")\n", 692 | " sys.exit(1)\n", 693 | " else:\n", 694 | " self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n", 695 | " sleep(3)\n", 696 | " pass\n", 697 | "\n", 698 | " def go_to_hashtag(self):\n", 699 | " if (\n", 700 | " self.scraper_details[\"hashtag\"] is None\n", 701 | " or self.scraper_details[\"hashtag\"] == \"\"\n", 702 | " ):\n", 703 | " print(\"Hashtag is not set.\")\n", 704 | " sys.exit(1)\n", 705 | " else:\n", 706 | " url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n", 707 | " if self.scraper_details[\"tab\"] == \"Latest\":\n", 708 | " url += \"&f=live\"\n", 709 | "\n", 710 | " self.driver.get(url)\n", 711 | " sleep(3)\n", 712 | " pass\n", 713 | "\n", 714 | " def go_to_search(self):\n", 715 | " if self.scraper_details[\"query\"] is None or self.scraper_details[\"query\"] == \"\":\n", 716 | " print(\"Query is not set.\")\n", 717 | " sys.exit(1)\n", 718 | " else:\n", 719 | " url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n", 720 | " if self.scraper_details[\"tab\"] == \"Latest\":\n", 721 | " url += \"&f=live\"\n", 722 | "\n", 723 | " self.driver.get(url)\n", 724 | " sleep(3)\n", 725 | " pass\n", 726 | "\n", 727 | " def get_tweet_cards(self):\n", 728 | " self.tweet_cards = self.driver.find_elements(\n", 729 | " \"xpath\", '//article[@data-testid=\"tweet\" and not(@disabled)]'\n", 730 | " )\n", 731 | " pass\n", 732 | "\n", 733 | " def remove_hidden_cards(self):\n", 734 | " try:\n", 735 | " hidden_cards = self.driver.find_elements(\n", 736 | " \"xpath\", '//article[@data-testid=\"tweet\" and @disabled]'\n", 
737 | " )\n", 738 | "\n", 739 | " for card in hidden_cards[1:-2]:\n", 740 | " self.driver.execute_script(\n", 741 | " \"arguments[0].parentNode.parentNode.parentNode.remove();\", card\n", 742 | " )\n", 743 | " except Exception as e:\n", 744 | " return\n", 745 | " pass\n", 746 | "\n", 747 | " def scrape_tweets(\n", 748 | " self,\n", 749 | " max_tweets=50,\n", 750 | " scrape_username=None,\n", 751 | " scrape_hashtag=None,\n", 752 | " scrape_query=None,\n", 753 | " scrape_latest=True,\n", 754 | " scrape_top=False,\n", 755 | " scrape_poster_details=False,\n", 756 | " router=None,\n", 757 | " ):\n", 758 | " self._config_scraper(\n", 759 | " max_tweets,\n", 760 | " scrape_username,\n", 761 | " scrape_hashtag,\n", 762 | " scrape_query,\n", 763 | " scrape_latest,\n", 764 | " scrape_top,\n", 765 | " scrape_poster_details,\n", 766 | " )\n", 767 | "\n", 768 | " if router is None:\n", 769 | " router = self.router\n", 770 | "\n", 771 | " router()\n", 772 | "\n", 773 | " if self.scraper_details[\"type\"] == \"Username\":\n", 774 | " print(\n", 775 | " \"Scraping Tweets from @{}...\".format(self.scraper_details[\"username\"])\n", 776 | " )\n", 777 | " elif self.scraper_details[\"type\"] == \"Hashtag\":\n", 778 | " print(\n", 779 | " \"Scraping {} Tweets from #{}...\".format(\n", 780 | " self.scraper_details[\"tab\"], self.scraper_details[\"hashtag\"]\n", 781 | " )\n", 782 | " )\n", 783 | " elif self.scraper_details[\"type\"] == \"Query\":\n", 784 | " print(\n", 785 | " \"Scraping {} Tweets from {} search...\".format(\n", 786 | " self.scraper_details[\"tab\"], self.scraper_details[\"query\"]\n", 787 | " )\n", 788 | " )\n", 789 | " elif self.scraper_details[\"type\"] == \"Home\":\n", 790 | " print(\"Scraping Tweets from Home...\")\n", 791 | "\n", 792 | " self.progress.print_progress(0)\n", 793 | "\n", 794 | " refresh_count = 0\n", 795 | " added_tweets = 0\n", 796 | " empty_count = 0\n", 797 | "\n", 798 | " while self.scroller.scrolling:\n", 799 | " try:\n", 800 | " self.get_tweet_cards()\n", 801 | " added_tweets = 0\n", 802 | "\n", 803 | " for card in self.tweet_cards[-15:]:\n", 804 | " try:\n", 805 | " tweet_id = str(card)\n", 806 | "\n", 807 | " if tweet_id not in self.tweet_ids:\n", 808 | " self.tweet_ids.add(tweet_id)\n", 809 | "\n", 810 | " if not self.scraper_details[\"poster_details\"]:\n", 811 | " self.driver.execute_script(\n", 812 | " \"arguments[0].scrollIntoView();\", card\n", 813 | " )\n", 814 | "\n", 815 | " tweet = Tweet(\n", 816 | " card=card,\n", 817 | " driver=self.driver,\n", 818 | " actions=self.actions,\n", 819 | " scrape_poster_details=self.scraper_details[\n", 820 | " \"poster_details\"\n", 821 | " ],\n", 822 | " )\n", 823 | "\n", 824 | " if tweet:\n", 825 | " if not tweet.error and tweet.tweet is not None:\n", 826 | " if not tweet.is_ad:\n", 827 | " self.data.append(tweet.tweet)\n", 828 | " added_tweets += 1\n", 829 | " self.progress.print_progress(len(self.data))\n", 830 | "\n", 831 | " if len(self.data) >= self.max_tweets:\n", 832 | " self.scroller.scrolling = False\n", 833 | " break\n", 834 | " else:\n", 835 | " continue\n", 836 | " else:\n", 837 | " continue\n", 838 | " else:\n", 839 | " continue\n", 840 | " else:\n", 841 | " continue\n", 842 | " except NoSuchElementException:\n", 843 | " continue\n", 844 | "\n", 845 | " if len(self.data) >= self.max_tweets:\n", 846 | " break\n", 847 | "\n", 848 | " if added_tweets == 0:\n", 849 | " if empty_count >= 5:\n", 850 | " if refresh_count >= 3:\n", 851 | " print()\n", 852 | " print(\"No more tweets to scrape\")\n", 853 | " break\n", 
854 | " refresh_count += 1\n", 855 | " empty_count += 1\n", 856 | " sleep(1)\n", 857 | " else:\n", 858 | " empty_count = 0\n", 859 | " refresh_count = 0\n", 860 | " except StaleElementReferenceException:\n", 861 | " sleep(2)\n", 862 | " continue\n", 863 | " except KeyboardInterrupt:\n", 864 | " print(\"\\n\")\n", 865 | " print(\"Keyboard Interrupt\")\n", 866 | " self.interrupted = True\n", 867 | " break\n", 868 | " except Exception as e:\n", 869 | " print(\"\\n\")\n", 870 | " print(f\"Error scraping tweets: {e}\")\n", 871 | " break\n", 872 | "\n", 873 | " print(\"\")\n", 874 | "\n", 875 | " if len(self.data) >= self.max_tweets:\n", 876 | " print(\"Scraping Complete\")\n", 877 | " else:\n", 878 | " print(\"Scraping Incomplete\")\n", 879 | "\n", 880 | " print(\"Tweets: {} out of {}\\n\".format(len(self.data), self.max_tweets))\n", 881 | "\n", 882 | " pass\n", 883 | "\n", 884 | " def save_to_csv(self):\n", 885 | " print(\"Saving Tweets to CSV...\")\n", 886 | " now = datetime.now()\n", 887 | " folder_path = \"./tweets/\"\n", 888 | "\n", 889 | " if not os.path.exists(folder_path):\n", 890 | " os.makedirs(folder_path)\n", 891 | " print(\"Created Folder: {}\".format(folder_path))\n", 892 | "\n", 893 | " data = {\n", 894 | " \"Name\": [tweet[0] for tweet in self.data],\n", 895 | " \"Handle\": [tweet[1] for tweet in self.data],\n", 896 | " \"Timestamp\": [tweet[2] for tweet in self.data],\n", 897 | " \"Verified\": [tweet[3] for tweet in self.data],\n", 898 | " \"Content\": [tweet[4] for tweet in self.data],\n", 899 | " \"Comments\": [tweet[5] for tweet in self.data],\n", 900 | " \"Retweets\": [tweet[6] for tweet in self.data],\n", 901 | " \"Likes\": [tweet[7] for tweet in self.data],\n", 902 | " \"Analytics\": [tweet[8] for tweet in self.data],\n", 903 | " \"Tags\": [tweet[9] for tweet in self.data],\n", 904 | " \"Mentions\": [tweet[10] for tweet in self.data],\n", 905 | " \"Emojis\": [tweet[11] for tweet in self.data],\n", 906 | " \"Profile Image\": [tweet[12] for tweet in self.data],\n", 907 | " \"Tweet Link\": [tweet[13] for tweet in self.data],\n", 908 | " \"Tweet ID\": [f'tweet_id:{tweet[14]}' for tweet in self.data],\n", 909 | " }\n", 910 | "\n", 911 | " if self.scraper_details[\"poster_details\"]:\n", 912 | " data[\"Tweeter ID\"] = [f'user_id:{tweet[15]}' for tweet in self.data]\n", 913 | " data[\"Following\"] = [tweet[16] for tweet in self.data]\n", 914 | " data[\"Followers\"] = [tweet[17] for tweet in self.data]\n", 915 | "\n", 916 | " df = pd.DataFrame(data)\n", 917 | "\n", 918 | " current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n", 919 | " file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n", 920 | " pd.set_option(\"display.max_colwidth\", None)\n", 921 | " df.to_csv(file_path, index=False, encoding=\"utf-8\")\n", 922 | "\n", 923 | " print(\"CSV Saved: {}\".format(file_path))\n", 924 | "\n", 925 | " pass\n", 926 | "\n", 927 | " def get_tweets(self):\n", 928 | " return self.data" 929 | ] 930 | }, 931 | { 932 | "attachments": {}, 933 | "cell_type": "markdown", 934 | "metadata": {}, 935 | "source": [ 936 | "# Create a new instance of the Twitter Scraper class" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "metadata": {}, 943 | "outputs": [], 944 | "source": [ 945 | "USER_UNAME = os.environ['TWITTER_USERNAME']\n", 946 | "USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n", 947 | "\n", 948 | "scraper = Twitter_Scraper(\n", 949 | " username=USER_UNAME,\n", 950 | " password=USER_PASSWORD,\n", 951 | " # max_tweets=10,\n", 952 | 
" # scrape_username=\"something\",\n", 953 | " # scrape_hashtag=\"something\",\n", 954 | " # scrape_query=\"something\",\n", 955 | " # scrape_latest=False,\n", 956 | " # scrape_top=True,\n", 957 | " # scrape_poster_details=True\n", 958 | ")" 959 | ] 960 | }, 961 | { 962 | "cell_type": "code", 963 | "execution_count": null, 964 | "metadata": {}, 965 | "outputs": [], 966 | "source": [ 967 | "scraper.login()" 968 | ] 969 | }, 970 | { 971 | "attachments": {}, 972 | "cell_type": "markdown", 973 | "metadata": {}, 974 | "source": [ 975 | "# Run Twitter Scraper" 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [ 984 | "scraper.scrape_tweets(\n", 985 | " # max_tweets=100,\n", 986 | " # scrape_username=\"something\",\n", 987 | " # scrape_hashtag=\"something\",\n", 988 | " # scrape_query=\"something\",\n", 989 | " # scrape_latest=False,\n", 990 | " # scrape_top=True,\n", 991 | " # scrape_poster_details=True,\n", 992 | ")" 993 | ] 994 | }, 995 | { 996 | "attachments": {}, 997 | "cell_type": "markdown", 998 | "metadata": {}, 999 | "source": [ 1000 | "# Save Scraped Tweets in a CSV" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "metadata": {}, 1007 | "outputs": [], 1008 | "source": [ 1009 | "scraper.save_to_csv()" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [ 1018 | "scraper.driver.close()" 1019 | ] 1020 | } 1021 | ], 1022 | "metadata": { 1023 | "kernelspec": { 1024 | "display_name": "ml", 1025 | "language": "python", 1026 | "name": "python3" 1027 | }, 1028 | "language_info": { 1029 | "codemirror_mode": { 1030 | "name": "ipython", 1031 | "version": 3 1032 | }, 1033 | "file_extension": ".py", 1034 | "mimetype": "text/x-python", 1035 | "name": "python", 1036 | "nbconvert_exporter": "python", 1037 | "pygments_lexer": "ipython3", 1038 | "version": "3.11.5" 1039 | }, 1040 | "orig_nbformat": 4 1041 | }, 1042 | "nbformat": 4, 1043 | "nbformat_minor": 2 1044 | } 1045 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fake_headers>=1.0.2 2 | pandas>=2.0.3 3 | python-dotenv>=1.0.0 4 | selenium>=4.12.0 5 | webdriver_manager>=4.0.0 6 | -------------------------------------------------------------------------------- /sample-command.txt: -------------------------------------------------------------------------------- 1 | python scraper --query='("NVDA" OR "nvidia") lang:en until:2024-01-19 since:2024-01-18' -t 5000 --top 2 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godkingjay/selenium-twitter-scraper/62d8ceb2f39a533d68965f309371efeeb9c676bd/scraper/__init__.py -------------------------------------------------------------------------------- /scraper/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import getpass 5 | from twitter_scraper import Twitter_Scraper 6 | 7 | try: 8 | from dotenv import load_dotenv 9 | 10 | print("Loading .env file") 11 | load_dotenv() 12 | print("Loaded .env file\n") 13 | except Exception as e: 14 | print(f"Error loading .env file: {e}") 15 | 
    sys.exit(1)
16 | 
17 | 
18 | def main():
19 |     try:
20 |         parser = argparse.ArgumentParser(
21 |             add_help=True,
22 |             usage="python scraper [option] ... [arg] ...",
23 |             description="Twitter Scraper is a tool that allows you to scrape tweets from Twitter without using Twitter's API.",
24 |         )
25 | 
26 |         try:
27 |             parser.add_argument(
28 |                 "--mail",
29 |                 type=str,
30 |                 default=os.getenv("TWITTER_MAIL"),
31 |                 help="Your Twitter mail.",
32 |             )
33 | 
34 |             parser.add_argument(
35 |                 "--user",
36 |                 type=str,
37 |                 default=os.getenv("TWITTER_USERNAME"),
38 |                 help="Your Twitter username.",
39 |             )
40 | 
41 |             parser.add_argument(
42 |                 "--password",
43 |                 type=str,
44 |                 default=os.getenv("TWITTER_PASSWORD"),
45 |                 help="Your Twitter password.",
46 |             )
47 | 
48 |             parser.add_argument(
49 |                 "--headlessState",
50 |                 type=str,
51 |                 default=os.getenv("HEADLESS"),
52 |                 help="Headless mode? [yes/no]"
53 |             )
54 |         except Exception as e:
55 |             print(f"Error retrieving environment variables: {e}")
56 |             sys.exit(1)
57 | 
58 |         parser.add_argument(
59 |             "-t",
60 |             "--tweets",
61 |             type=int,
62 |             default=50,
63 |             help="Number of tweets to scrape (default: 50)",
64 |         )
65 | 
66 |         parser.add_argument(
67 |             "-u",
68 |             "--username",
69 |             type=str,
70 |             default=None,
71 |             help="Twitter username. Scrape tweets from a user's profile.",
72 |         )
73 | 
74 |         parser.add_argument(
75 |             "-ht",
76 |             "--hashtag",
77 |             type=str,
78 |             default=None,
79 |             help="Twitter hashtag. Scrape tweets from a hashtag.",
80 |         )
81 | 
82 |         parser.add_argument(
83 |             "--bookmarks",
84 |             action='store_true',
85 |             help="Twitter bookmarks. Scrape tweets from your bookmarks.",
86 |         )
87 | 
88 |         parser.add_argument(
89 |             "-ntl",
90 |             "--no_tweets_limit",
91 |             nargs='?',
92 |             default=False,
93 |             help="Set no limit to the number of tweets to scrape (will scrape until no more tweets are available).",
94 |         )
95 | 
96 |         parser.add_argument(
97 |             "-l",
98 |             "--list",
99 |             type=str,
100 |             default=None,
101 |             help="List ID. Scrape tweets from a list.",
102 |         )
103 | 
104 |         parser.add_argument(
105 |             "-q",
106 |             "--query",
107 |             type=str,
108 |             default=None,
109 |             help="Twitter query or search. Scrape tweets from a query or search.",
110 |         )
111 | 
112 |         parser.add_argument(
113 |             "-a",
114 |             "--add",
115 |             type=str,
116 |             default="",
117 |             help="Additional data to scrape and save in the .csv file.",
118 |         )
119 | 
120 |         parser.add_argument(
121 |             "--latest",
122 |             action="store_true",
123 |             help="Scrape latest tweets",
124 |         )
125 | 
126 |         parser.add_argument(
127 |             "--top",
128 |             action="store_true",
129 |             help="Scrape top tweets",
130 |         )
131 | 
132 |         args = parser.parse_args()
133 | 
134 |         USER_MAIL = args.mail
135 |         USER_UNAME = args.user
136 |         USER_PASSWORD = args.password
137 |         HEADLESS_MODE = args.headlessState
138 | 
139 |         if USER_UNAME is None:
140 |             USER_UNAME = input("Twitter Username: ")
141 | 
142 |         if USER_PASSWORD is None:
143 |             USER_PASSWORD = getpass.getpass("Enter Password: ")
144 | 
145 |         if HEADLESS_MODE is None:
146 |             HEADLESS_MODE = str(input("Headless? [yes/no]: ")).lower()
147 | 
148 |         print()
149 | 
150 |         tweet_type_args = []
151 | 
152 |         if args.username is not None:
153 |             tweet_type_args.append(args.username)
154 |         if args.hashtag is not None:
155 |             tweet_type_args.append(args.hashtag)
156 |         if args.list is not None:
157 |             tweet_type_args.append(args.list)
158 |         if args.query is not None:
159 |             tweet_type_args.append(args.query)
160 |         if args.bookmarks is not False:
161 |             tweet_type_args.append(args.bookmarks)
162 | 
163 |         additional_data: list = args.add.split(",")
164 | 
165 |         if len(tweet_type_args) > 1:
166 |             print("Please specify only one of --username, --hashtag, --list, --bookmarks, or --query.")
167 |             sys.exit(1)
168 | 
169 |         if args.latest and args.top:
170 |             print("Please specify either --latest or --top. Not both.")
171 |             sys.exit(1)
172 | 
173 |         if USER_UNAME is not None and USER_PASSWORD is not None:
174 |             scraper = Twitter_Scraper(
175 |                 mail=USER_MAIL,
176 |                 username=USER_UNAME,
177 |                 password=USER_PASSWORD,
178 |                 headlessState=HEADLESS_MODE
179 |             )
180 |             scraper.login()
181 |             scraper.scrape_tweets(
182 |                 max_tweets=args.tweets,
183 |                 no_tweets_limit=args.no_tweets_limit if args.no_tweets_limit is not None else True,
184 |                 scrape_username=args.username,
185 |                 scrape_hashtag=args.hashtag,
186 |                 scrape_bookmarks=args.bookmarks,
187 |                 scrape_query=args.query,
188 |                 scrape_list=args.list,
189 |                 scrape_latest=args.latest,
190 |                 scrape_top=args.top,
191 |                 scrape_poster_details="pd" in additional_data,
192 |             )
193 |             scraper.save_to_csv()
194 |             if not scraper.interrupted:
195 |                 scraper.driver.close()
196 |         else:
197 |             print(
198 |                 "Missing Twitter username or password environment variables. Please check your .env file."
199 |             )
200 |             sys.exit(1)
201 |     except KeyboardInterrupt:
202 |         print("\nScript Interrupted by user. Exiting...")
Exiting...") 203 | sys.exit(1) 204 | except Exception as e: 205 | print(f"Error: {e}") 206 | sys.exit(1) 207 | sys.exit(1) 208 | 209 | 210 | if __name__ == "__main__": 211 | main() 212 | -------------------------------------------------------------------------------- /scraper/progress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | class Progress: 5 | def __init__(self, current, total) -> None: 6 | self.current = current 7 | self.total = total 8 | pass 9 | 10 | def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None: 11 | self.current = current 12 | progress = current / self.total 13 | bar_length = 40 14 | progress_bar = ( 15 | "[" 16 | + "=" * int(bar_length * progress) 17 | + "-" * (bar_length - int(bar_length * progress)) 18 | + "]" 19 | ) 20 | if no_tweets_limit: 21 | if waiting: 22 | sys.stdout.write( 23 | "\rTweets scraped : {} - waiting to access older tweets {} min on 15 min".format( 24 | current, retry_cnt 25 | ) 26 | ) 27 | else: 28 | sys.stdout.write( 29 | "\rTweets scraped : {} ".format( 30 | current 31 | ) 32 | ) 33 | else: 34 | if waiting: 35 | sys.stdout.write( 36 | "\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format( 37 | progress_bar, progress, current, self.total, retry_cnt 38 | ) 39 | ) 40 | else: 41 | sys.stdout.write( 42 | "\rProgress: [{:<40}] {:.2%} {} of {} ".format( 43 | progress_bar, progress, current, self.total 44 | ) 45 | ) 46 | sys.stdout.flush() 47 | -------------------------------------------------------------------------------- /scraper/scroller.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | 4 | 5 | class Scroller: 6 | def __init__(self, driver) -> None: 7 | self.driver = driver 8 | self.current_position = 0 9 | self.last_position = driver.execute_script("return window.pageYOffset;") 10 | self.scrolling = True 11 | self.scroll_count = 0 12 | pass 13 | 14 | def reset(self) -> None: 15 | self.current_position = 0 16 | self.last_position = self.driver.execute_script("return window.pageYOffset;") 17 | self.scroll_count = 0 18 | pass 19 | 20 | def scroll_to_top(self) -> None: 21 | self.driver.execute_script("window.scrollTo(0, 0);") 22 | pass 23 | 24 | def scroll_to_bottom(self) -> None: 25 | self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 26 | pass 27 | 28 | def update_scroll_position(self) -> None: 29 | self.current_position = self.driver.execute_script("return window.pageYOffset;") 30 | pass 31 | -------------------------------------------------------------------------------- /scraper/tweet.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from selenium.common.exceptions import ( 3 | NoSuchElementException, 4 | StaleElementReferenceException, 5 | ) 6 | from selenium.webdriver.chrome.webdriver import WebDriver 7 | from selenium.webdriver.common.action_chains import ActionChains 8 | 9 | 10 | class Tweet: 11 | def __init__( 12 | self, 13 | card: WebDriver, 14 | driver: WebDriver, 15 | actions: ActionChains, 16 | scrape_poster_details=False, 17 | ) -> None: 18 | self.card = card 19 | self.error = False 20 | self.tweet = None 21 | 22 | try: 23 | self.user = card.find_element( 24 | "xpath", './/div[@data-testid="User-Name"]//span' 25 | ).text 26 | except NoSuchElementException: 27 | self.error = True 28 | self.user = "skip" 29 | 30 | try: 31 | self.handle = 
31 |             self.handle = card.find_element(
32 |                 "xpath", './/span[contains(text(), "@")]'
33 |             ).text
34 |         except NoSuchElementException:
35 |             self.error = True
36 |             self.handle = "skip"
37 | 
38 |         try:
39 |             self.date_time = card.find_element("xpath", ".//time").get_attribute(
40 |                 "datetime"
41 |             )
42 | 
43 |             if self.date_time is not None:
44 |                 self.is_ad = False
45 |         except NoSuchElementException:
46 |             self.is_ad = True
47 |             self.error = True
48 |             self.date_time = "skip"
49 | 
50 |         if self.error:
51 |             return
52 | 
53 |         try:
54 |             card.find_element(
55 |                 "xpath", './/*[local-name()="svg" and @data-testid="icon-verified"]'
56 |             )
57 | 
58 |             self.verified = True
59 |         except NoSuchElementException:
60 |             self.verified = False
61 | 
62 |         self.content = ""
63 |         contents = card.find_elements(
64 |             "xpath",
65 |             '(.//div[@data-testid="tweetText"])[1]/span | (.//div[@data-testid="tweetText"])[1]/a',
66 |         )
67 | 
68 |         for index, content in enumerate(contents):
69 |             self.content += content.text
70 | 
71 |         try:
72 |             self.reply_cnt = card.find_element(
73 |                 "xpath", './/button[@data-testid="reply"]//span'
74 |             ).text
75 | 
76 |             if self.reply_cnt == "":
77 |                 self.reply_cnt = "0"
78 |         except NoSuchElementException:
79 |             self.reply_cnt = "0"
80 | 
81 |         try:
82 |             self.retweet_cnt = card.find_element(
83 |                 "xpath", './/button[@data-testid="retweet"]//span'
84 |             ).text
85 | 
86 |             if self.retweet_cnt == "":
87 |                 self.retweet_cnt = "0"
88 |         except NoSuchElementException:
89 |             self.retweet_cnt = "0"
90 | 
91 |         try:
92 |             self.like_cnt = card.find_element(
93 |                 "xpath", './/button[@data-testid="like"]//span'
94 |             ).text
95 | 
96 |             if self.like_cnt == "":
97 |                 self.like_cnt = "0"
98 |         except NoSuchElementException:
99 |             self.like_cnt = "0"
100 | 
101 |         try:
102 |             self.analytics_cnt = card.find_element(
103 |                 "xpath", './/a[contains(@href, "/analytics")]//span'
104 |             ).text
105 | 
106 |             if self.analytics_cnt == "":
107 |                 self.analytics_cnt = "0"
108 |         except NoSuchElementException:
109 |             self.analytics_cnt = "0"
110 | 
111 |         try:
112 |             self.tags = card.find_elements(
113 |                 "xpath",
114 |                 './/a[contains(@href, "src=hashtag_click")]',
115 |             )
116 | 
117 |             self.tags = [tag.text for tag in self.tags]
118 |         except NoSuchElementException:
119 |             self.tags = []
120 | 
121 |         try:
122 |             self.mentions = card.find_elements(
123 |                 "xpath",
124 |                 '(.//div[@data-testid="tweetText"])[1]//a[contains(text(), "@")]',
125 |             )
126 | 
127 |             self.mentions = [mention.text for mention in self.mentions]
128 |         except NoSuchElementException:
129 |             self.mentions = []
130 | 
131 |         try:
132 |             raw_emojis = card.find_elements(
133 |                 "xpath",
134 |                 '(.//div[@data-testid="tweetText"])[1]/img[contains(@src, "emoji")]',
135 |             )
136 | 
137 |             self.emojis = [
138 |                 emoji.get_attribute("alt").encode("unicode-escape").decode("ASCII")
139 |                 for emoji in raw_emojis
140 |             ]
141 |         except NoSuchElementException:
142 |             self.emojis = []
143 | 
144 |         try:
145 |             self.profile_img = card.find_element(
146 |                 "xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
147 |             ).get_attribute("src")
148 |         except NoSuchElementException:
149 |             self.profile_img = ""
150 | 
151 |         try:
152 |             self.tweet_link = self.card.find_element(
153 |                 "xpath",
154 |                 ".//a[contains(@href, '/status/')]",
155 |             ).get_attribute("href")
156 |             self.tweet_id = str(self.tweet_link.split("/")[-1])
157 |         except NoSuchElementException:
158 |             self.tweet_link = ""
159 |             self.tweet_id = ""
160 | 
161 |         self.following_cnt = "0"
162 |         self.followers_cnt = "0"
163 |         self.user_id = None
164 | 
165 |         if scrape_poster_details:
166 |             el_name = card.find_element(
167 |                 "xpath", './/div[@data-testid="User-Name"]//span'
168 |             )
169 | 
170 |             ext_hover_card = False
171 |             ext_user_id = False
172 |             ext_following = False
173 |             ext_followers = False
174 |             hover_attempt = 0
175 | 
176 |             while (
177 |                 not ext_hover_card
178 |                 or not ext_user_id
179 |                 or not ext_following
180 |                 or not ext_followers
181 |             ):
182 |                 try:
183 |                     actions.move_to_element(el_name).perform()
184 | 
185 |                     hover_card = driver.find_element(
186 |                         "xpath", '//div[@data-testid="hoverCardParent"]'
187 |                     )
188 | 
189 |                     ext_hover_card = True
190 | 
191 |                     while not ext_user_id:
192 |                         try:
193 |                             raw_user_id = hover_card.find_element(
194 |                                 "xpath",
195 |                                 '(.//div[contains(@data-testid, "-follow")]) | (.//div[contains(@data-testid, "-unfollow")])',
196 |                             ).get_attribute("data-testid")
197 | 
198 |                             if raw_user_id == "":
199 |                                 self.user_id = None
200 |                             else:
201 |                                 self.user_id = str(raw_user_id.split("-")[0])
202 | 
203 |                             ext_user_id = True
204 |                         except NoSuchElementException:
205 |                             continue
206 |                         except StaleElementReferenceException:
207 |                             self.error = True
208 |                             return
209 | 
210 |                     while not ext_following:
211 |                         try:
212 |                             self.following_cnt = hover_card.find_element(
213 |                                 "xpath", './/a[contains(@href, "/following")]//span'
214 |                             ).text
215 | 
216 |                             if self.following_cnt == "":
217 |                                 self.following_cnt = "0"
218 | 
219 |                             ext_following = True
220 |                         except NoSuchElementException:
221 |                             continue
222 |                         except StaleElementReferenceException:
223 |                             self.error = True
224 |                             return
225 | 
226 |                     while not ext_followers:
227 |                         try:
228 |                             self.followers_cnt = hover_card.find_element(
229 |                                 "xpath",
230 |                                 './/a[contains(@href, "/verified_followers")]//span',
231 |                             ).text
232 | 
233 |                             if self.followers_cnt == "":
234 |                                 self.followers_cnt = "0"
235 | 
236 |                             ext_followers = True
237 |                         except NoSuchElementException:
238 |                             continue
239 |                         except StaleElementReferenceException:
240 |                             self.error = True
241 |                             return
242 |                 except NoSuchElementException:
243 |                     if hover_attempt == 3:
244 |                         self.error = True
245 |                         return
246 |                     hover_attempt += 1
247 |                     sleep(0.5)
248 |                     continue
249 |                 except StaleElementReferenceException:
250 |                     self.error = True
251 |                     return
252 | 
253 |             if ext_hover_card and ext_following and ext_followers:
254 |                 actions.reset_actions()
255 | 
256 |         self.tweet = (
257 |             self.user,
258 |             self.handle,
259 |             self.date_time,
260 |             self.verified,
261 |             self.content,
262 |             self.reply_cnt,
263 |             self.retweet_cnt,
264 |             self.like_cnt,
265 |             self.analytics_cnt,
266 |             self.tags,
267 |             self.mentions,
268 |             self.emojis,
269 |             self.profile_img,
270 |             self.tweet_link,
271 |             self.tweet_id,
272 |             self.user_id,
273 |             self.following_cnt,
274 |             self.followers_cnt,
275 |         )
276 | 
277 |         pass
278 | 
--------------------------------------------------------------------------------
/scraper/twitter_scraper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pandas as pd
4 | from progress import Progress
5 | from scroller import Scroller
6 | from tweet import Tweet
7 | 
8 | from datetime import datetime
9 | from fake_headers import Headers
10 | from time import sleep
11 | 
12 | from selenium import webdriver
13 | from selenium.webdriver.common.keys import Keys
14 | from selenium.common.exceptions import (
15 |     NoSuchElementException,
16 |     StaleElementReferenceException,
17 |     WebDriverException,
18 | )
19 | from selenium.webdriver.common.action_chains import ActionChains
20 | from selenium.webdriver.chrome.options import Options as ChromeOptions
21 | from selenium.webdriver.chrome.service import Service as ChromeService
22 | 
23 | from selenium.webdriver.firefox.options import Options as FirefoxOptions
24 | from selenium.webdriver.firefox.service import Service as FirefoxService
25 | 
26 | from selenium.webdriver.support.ui import WebDriverWait
27 | 
28 | from webdriver_manager.chrome import ChromeDriverManager
29 | from webdriver_manager.firefox import GeckoDriverManager
30 | 
31 | TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
32 | 
33 | 
34 | class Twitter_Scraper:
35 |     def __init__(
36 |         self,
37 |         mail,
38 |         username,
39 |         password,
40 |         headlessState,
41 |         max_tweets=50,
42 |         scrape_username=None,
43 |         scrape_hashtag=None,
44 |         scrape_query=None,
45 |         scrape_bookmarks=False,
46 |         scrape_poster_details=False,
47 |         scrape_latest=True,
48 |         scrape_top=False,
49 |         proxy=None,
50 |     ):
51 |         print("Initializing Twitter Scraper...")
52 |         self.mail = mail
53 |         self.username = username
54 |         self.password = password
55 |         self.headlessState = headlessState
56 |         self.interrupted = False
57 |         self.tweet_ids = set()
58 |         self.data = []
59 |         self.tweet_cards = []
60 |         self.scraper_details = {
61 |             "type": None,
62 |             "username": None,
63 |             "hashtag": None,
64 |             "bookmarks": False,
65 |             "query": None,
66 |             "tab": None,
67 |             "poster_details": False,
68 |         }
69 |         self.max_tweets = max_tweets
70 |         self.progress = Progress(0, max_tweets)
71 |         self.router = self.go_to_home
72 |         self.driver = self._get_driver(proxy)
73 |         self.actions = ActionChains(self.driver)
74 |         self.scroller = Scroller(self.driver)
75 |         self._config_scraper(
76 |             max_tweets,
77 |             scrape_username,
78 |             scrape_hashtag,
79 |             scrape_bookmarks,
80 |             scrape_query,
81 |             scrape_latest=scrape_latest,  # keyword arguments: _config_scraper also takes scrape_list, which __init__ does not expose
82 |             scrape_top=scrape_top,
83 |             scrape_poster_details=scrape_poster_details,
84 |         )
85 | 
86 |     def _config_scraper(
87 |         self,
88 |         max_tweets=50,
89 |         scrape_username=None,
90 |         scrape_hashtag=None,
91 |         scrape_bookmarks=False,
92 |         scrape_query=None,
93 |         scrape_list=None,
94 |         scrape_latest=True,
95 |         scrape_top=False,
96 |         scrape_poster_details=False,
97 |     ):
98 |         self.tweet_ids = set()
99 |         self.data = []
100 |         self.tweet_cards = []
101 |         self.max_tweets = max_tweets
102 |         self.progress = Progress(0, max_tweets)
103 |         self.scraper_details = {
104 |             "type": None,
105 |             "username": scrape_username,
106 |             "hashtag": str(scrape_hashtag).replace("#", "")
107 |             if scrape_hashtag is not None
108 |             else None,
109 |             "bookmarks": scrape_bookmarks,
110 |             "query": scrape_query,
111 |             "list": scrape_list,
112 |             "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
113 |             "poster_details": scrape_poster_details,
114 |         }
115 |         self.router = self.go_to_home
116 |         self.scroller = Scroller(self.driver)
117 | 
118 |         if scrape_username is not None:
119 |             self.scraper_details["type"] = "Username"
120 |             self.router = self.go_to_profile
121 |         elif scrape_hashtag is not None:
122 |             self.scraper_details["type"] = "Hashtag"
123 |             self.router = self.go_to_hashtag
124 |         elif scrape_bookmarks is not False:
125 |             self.scraper_details["type"] = "Bookmarks"
126 |             self.router = self.go_to_bookmarks
127 |         elif scrape_query is not None:
128 |             self.scraper_details["type"] = "Query"
129 |             self.router = self.go_to_search
130 |         elif scrape_list is not None:
131 |             self.scraper_details["type"] = "List"
132 |             self.router = self.go_to_list
133 |         else:
134 |             self.scraper_details["type"] = "Home"
135 |             self.router = self.go_to_home
136 |         pass
137 | 
138 |     def _get_driver(
139 |         self,
140 |         proxy=None,
141 |     ):
142 |         print("Setup WebDriver...")
143 |         # header = Headers().generate()["User-Agent"]
144 | 
145 |         # User agent of an Android smartphone device
146 |         header = "Mozilla/5.0 (Linux; Android 11; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.87 Mobile Safari/537.36"
147 | 
148 |         # browser_option = ChromeOptions()
149 |         browser_option = FirefoxOptions()
150 |         browser_option.add_argument("--no-sandbox")
151 |         browser_option.add_argument("--disable-dev-shm-usage")
152 |         browser_option.add_argument("--ignore-certificate-errors")
153 |         browser_option.add_argument("--disable-gpu")
154 |         browser_option.add_argument("--log-level=3")
155 |         browser_option.add_argument("--disable-notifications")
156 |         browser_option.add_argument("--disable-popup-blocking")
157 |         browser_option.add_argument("--user-agent={}".format(header))
158 |         if proxy is not None:
159 |             browser_option.add_argument("--proxy-server=%s" % proxy)
160 | 
161 |         # Hide the browser window only when the headless option is "yes";
162 |         # any other value keeps the browser visible
163 |         if str(self.headlessState).lower() == 'yes':
164 |             # For Hiding Browser
165 |             browser_option.add_argument("--headless")
166 | 
167 |         try:
168 |             # print("Initializing ChromeDriver...")
169 |             # driver = webdriver.Chrome(
170 |             #     options=browser_option,
171 |             # )
172 | 
173 |             print("Initializing FirefoxDriver...")
174 |             driver = webdriver.Firefox(
175 |                 options=browser_option,
176 |             )
177 | 
178 |             print("WebDriver Setup Complete")
179 |             return driver
180 |         except WebDriverException:
181 |             try:
182 |                 # print("Downloading ChromeDriver...")
183 |                 # chromedriver_path = ChromeDriverManager().install()
184 |                 # chrome_service = ChromeService(executable_path=chromedriver_path)
185 | 
186 |                 print("Downloading FirefoxDriver...")
187 |                 firefoxdriver_path = GeckoDriverManager().install()
188 |                 firefox_service = FirefoxService(executable_path=firefoxdriver_path)
189 | 
190 |                 # print("Initializing ChromeDriver...")
191 |                 # driver = webdriver.Chrome(
192 |                 #     service=chrome_service,
193 |                 #     options=browser_option,
194 |                 # )
195 | 
196 |                 print("Initializing FirefoxDriver...")
197 |                 driver = webdriver.Firefox(
198 |                     service=firefox_service,
199 |                     options=browser_option,
200 |                 )
201 | 
202 |                 print("WebDriver Setup Complete")
203 |                 return driver
204 |             except Exception as e:
205 |                 print(f"Error setting up WebDriver: {e}")
206 |                 sys.exit(1)
207 |         pass
208 | 
209 |     def login(self):
210 |         print()
211 |         print("Logging in to Twitter...")
212 | 
213 |         try:
214 |             self.driver.maximize_window()
215 |             self.driver.execute_script("document.body.style.zoom='150%'")  # set zoom to 150%
216 |             self.driver.get(TWITTER_LOGIN_URL)
217 |             sleep(3)
218 | 
219 |             self._input_username()
220 |             self._input_unusual_activity()
221 |             self._input_password()
222 | 
223 |             cookies = self.driver.get_cookies()
224 | 
225 |             auth_token = None
226 | 
227 |             for cookie in cookies:
228 |                 if cookie["name"] == "auth_token":
229 |                     auth_token = cookie["value"]
230 |                     break
231 | 
232 |             if auth_token is None:
233 |                 raise ValueError(
234 |                     """This may be due to the following:
235 | 
236 | - Internet connection is unstable
237 | - Username is incorrect
238 | - Password is incorrect
239 | """
240 |                 )
241 | 
242 |             print()
243 |             print("Login Successful")
244 |             print()
245 |         except Exception as e:
246 |             print()
247 |             print(f"Login Failed: {e}")
248 |             sys.exit(1)
249 | 
250 |         pass
251 | 
252 |     def _input_username(self):
253 |         input_attempt = 0
254 | 
255 |         while True:
256 |             try:
257 |                 username = self.driver.find_element(
258 |                     "xpath", "//input[@autocomplete='username']"
259 |                 )
260 | 
261 |                 username.send_keys(self.username)
262 |                 username.send_keys(Keys.RETURN)
263 |                 sleep(3)
264 |                 break
265 |             except NoSuchElementException:
266 |                 input_attempt += 1
267 |                 if input_attempt >= 3:
268 |                     print()
269 |                     print(
270 |                         """There was an error inputting the username.
271 | 
272 | It may be due to the following:
273 | - Internet connection is unstable
274 | - Username is incorrect
275 | - Twitter is experiencing unusual activity"""
276 |                     )
277 |                     self.driver.quit()
278 |                     sys.exit(1)
279 |                 else:
280 |                     print("Re-attempting to input username...")
281 |                     sleep(2)
282 | 
283 |     def _input_unusual_activity(self):
284 |         input_attempt = 0
285 | 
286 |         while True:
287 |             try:
288 |                 unusual_activity = self.driver.find_element(
289 |                     "xpath", "//input[@data-testid='ocfEnterTextTextInput']"
290 |                 )
291 |                 unusual_activity.send_keys(self.username)
292 |                 unusual_activity.send_keys(Keys.RETURN)
293 |                 sleep(3)
294 |                 break
295 |             except NoSuchElementException:
296 |                 input_attempt += 1
297 |                 if input_attempt >= 3:
298 |                     break
299 | 
300 |     def _input_password(self):
301 |         input_attempt = 0
302 | 
303 |         while True:
304 |             try:
305 |                 password = self.driver.find_element(
306 |                     "xpath", "//input[@autocomplete='current-password']"
307 |                 )
308 | 
309 |                 password.send_keys(self.password)
310 |                 password.send_keys(Keys.RETURN)
311 |                 sleep(3)
312 |                 break
313 |             except NoSuchElementException:
314 |                 input_attempt += 1
315 |                 if input_attempt >= 3:
316 |                     print()
317 |                     print(
318 |                         """There was an error inputting the password.
319 | 
320 | It may be due to the following:
321 | - Internet connection is unstable
322 | - Password is incorrect
323 | - Twitter is experiencing unusual activity"""
324 |                     )
325 |                     self.driver.quit()
326 |                     sys.exit(1)
327 |                 else:
328 |                     print("Re-attempting to input password...")
329 |                     sleep(2)
330 | 
331 |     def go_to_home(self):
332 |         self.driver.get("https://twitter.com/home")
333 |         sleep(3)
334 |         pass
335 | 
336 |     def go_to_profile(self):
337 |         if (
338 |             self.scraper_details["username"] is None
339 |             or self.scraper_details["username"] == ""
340 |         ):
341 |             print("Username is not set.")
342 |             sys.exit(1)
343 |         else:
344 |             self.driver.get(f"https://twitter.com/{self.scraper_details['username']}")
345 |             sleep(3)
346 |         pass
347 | 
348 |     def go_to_hashtag(self):
349 |         if (
350 |             self.scraper_details["hashtag"] is None
351 |             or self.scraper_details["hashtag"] == ""
352 |         ):
353 |             print("Hashtag is not set.")
354 |             sys.exit(1)
355 |         else:
356 |             url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click"
357 |             if self.scraper_details["tab"] == "Latest":
358 |                 url += "&f=live"
359 | 
360 |             self.driver.get(url)
361 |             sleep(3)
362 |         pass
363 | 
364 |     def go_to_bookmarks(self):
365 |         if (
366 |             self.scraper_details["bookmarks"] is False
367 |             or self.scraper_details["bookmarks"] == ""
368 |         ):
369 |             print("Bookmarks is not set.")
370 |             sys.exit(1)
371 |         else:
372 |             url = "https://twitter.com/i/bookmarks"
373 | 
374 |             self.driver.get(url)
375 |             sleep(3)
376 |         pass
377 | 
378 |     def go_to_search(self):
379 |         if self.scraper_details["query"] is None or self.scraper_details["query"] == "":
380 |             print("Query is not set.")
381 |             sys.exit(1)
382 |         else:
383 |             url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
384 |             if self.scraper_details["tab"] == "Latest":
385 |                 url += "&f=live"
386 | 
387 |             self.driver.get(url)
388 |             sleep(3)
389 |         pass
390 | 
391 |     def go_to_list(self):
392 |         if self.scraper_details["list"] is None or self.scraper_details["list"] == "":
393 |             print("List is not set.")
394 |             sys.exit(1)
395 |         else:
396 |             url = f"https://x.com/i/lists/{self.scraper_details['list']}"
397 |             self.driver.get(url)
398 |             sleep(3)
399 |         pass
400 | 
401 |     def get_tweet_cards(self):
402 |         self.tweet_cards = self.driver.find_elements(
403 |             "xpath", '//article[@data-testid="tweet" and not(@disabled)]'
404 |         )
405 |         pass
406 | 
407 |     def remove_hidden_cards(self):
408 |         try:
409 |             hidden_cards = self.driver.find_elements(
410 |                 "xpath", '//article[@data-testid="tweet" and @disabled]'
411 |             )
412 | 
413 |             for card in hidden_cards[1:-2]:
414 |                 self.driver.execute_script(
415 |                     "arguments[0].parentNode.parentNode.parentNode.remove();", card
416 |                 )
417 |         except Exception:
418 |             return
419 |         pass
420 | 
421 |     def scrape_tweets(
422 |         self,
423 |         max_tweets=50,
424 |         no_tweets_limit=False,
425 |         scrape_username=None,
426 |         scrape_hashtag=None,
427 |         scrape_bookmarks=False,
428 |         scrape_query=None,
429 |         scrape_list=None,
430 |         scrape_latest=True,
431 |         scrape_top=False,
432 |         scrape_poster_details=False,
433 |         router=None,
434 |     ):
435 |         self._config_scraper(
436 |             max_tweets,
437 |             scrape_username,
438 |             scrape_hashtag,
439 |             scrape_bookmarks,
440 |             scrape_query,
441 |             scrape_list,
442 |             scrape_latest,
443 |             scrape_top,
444 |             scrape_poster_details,
445 |         )
446 | 
447 |         if router is None:
448 |             router = self.router
449 | 
450 |         router()
451 | 
452 |         if self.scraper_details["type"] == "Username":
453 |             print(
454 |                 "Scraping Tweets from @{}...".format(self.scraper_details["username"])
455 |             )
456 |         elif self.scraper_details["type"] == "Hashtag":
457 |             print(
458 |                 "Scraping {} Tweets from #{}...".format(
459 |                     self.scraper_details["tab"], self.scraper_details["hashtag"]
460 |                 )
461 |             )
462 |         elif self.scraper_details["type"] == "Bookmarks":
463 |             print(
464 |                 "Scraping Tweets from bookmarks...")
465 |         elif self.scraper_details["type"] == "Query":
466 |             print(
467 |                 "Scraping {} Tweets from {} search...".format(
468 |                     self.scraper_details["tab"], self.scraper_details["query"]
469 |                 )
470 |             )
471 |         elif self.scraper_details["type"] == "Home":
472 |             print("Scraping Tweets from Home...")
473 | 
474 |         # Accept cookies to make the banner disappear
475 |         try:
476 |             accept_cookies_btn = self.driver.find_element(
477 |                 "xpath", "//span[text()='Refuse non-essential cookies']/../../..")
478 |             accept_cookies_btn.click()
479 |         except NoSuchElementException:
480 |             pass
481 | 
482 |         self.progress.print_progress(0, False, 0, no_tweets_limit)
483 | 
484 |         refresh_count = 0
485 |         added_tweets = 0
486 |         empty_count = 0
487 |         retry_cnt = 0
488 | 
489 |         while self.scroller.scrolling:
490 |             try:
491 |                 self.get_tweet_cards()
492 |                 added_tweets = 0
493 | 
494 |                 for card in self.tweet_cards[-15:]:
495 |                     try:
496 |                         tweet_id = str(card)  # element identity serves as a stand-in ID for de-duplication
497 | 
498 |                         if tweet_id not in self.tweet_ids:
499 |                             self.tweet_ids.add(tweet_id)
500 | 
501 |                             if not self.scraper_details["poster_details"]:
502 |                                 self.driver.execute_script(
503 |                                     "arguments[0].scrollIntoView();", card
504 |                                 )
505 | 
506 |                             tweet = Tweet(
507 |                                 card=card,
508 |                                 driver=self.driver,
509 |                                 actions=self.actions,
510 |                                 scrape_poster_details=self.scraper_details[
511 |                                     "poster_details"
512 |                                 ],
513 |                             )
514 | 
515 |                             if tweet:
516 |                                 if not tweet.error and tweet.tweet is not None:
517 |                                     if not tweet.is_ad:
518 |                                         self.data.append(tweet.tweet)
519 |                                         added_tweets += 1
520 |                                         self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
521 | 
522 |                                         if len(self.data) >= self.max_tweets and not no_tweets_limit:
523 |                                             self.scroller.scrolling = False
524 |                                             break
525 |                                     else:
526 |                                         continue
527 |                                 else:
528 |                                     continue
529 |                             else:
530 |                                 continue
531 |                         else:
532 |                             continue
533 |                     except NoSuchElementException:
534 |                         continue
535 | 
536 |                 if len(self.data) >= self.max_tweets and not no_tweets_limit:
537 |                     break
538 | 
539 |                 if added_tweets == 0:
540 |                     # Check if there is a "Retry" button and click it at regular intervals, up to a limited number of tries
541 |                     try:
542 |                         while retry_cnt < 15:
543 |                             retry_button = self.driver.find_element(
544 |                                 "xpath", "//span[text()='Retry']/../../..")
545 |                             self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
546 |                             sleep(600)
547 |                             retry_button.click()
548 |                             retry_cnt += 1
549 |                             sleep(2)
550 |                     # There is no Retry button, so the counter is reset
551 |                     except NoSuchElementException:
552 |                         retry_cnt = 0
553 |                         self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
554 | 
555 |                     if empty_count >= 5:
556 |                         if refresh_count >= 3:
557 |                             print()
558 |                             print("No more tweets to scrape")
559 |                             break
560 |                         refresh_count += 1
561 |                     empty_count += 1
562 |                     sleep(1)
563 |                 else:
564 |                     empty_count = 0
565 |                     refresh_count = 0
566 |             except StaleElementReferenceException:
567 |                 sleep(2)
568 |                 continue
569 |             except KeyboardInterrupt:
570 |                 print("\n")
571 |                 print("Keyboard Interrupt")
572 |                 self.interrupted = True
573 |                 break
574 |             except Exception as e:
575 |                 print("\n")
576 |                 print(f"Error scraping tweets: {e}")
577 |                 break
578 | 
579 |         print("")
580 | 
581 |         if len(self.data) >= self.max_tweets or no_tweets_limit:
582 |             print("Scraping Complete")
583 |         else:
584 |             print("Scraping Incomplete")
585 | 
586 |         if not no_tweets_limit:
587 |             print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
588 | 
589 |         pass
590 | 
591 |     def save_to_csv(self):
592 |         print("Saving Tweets to CSV...")
593 |         now = datetime.now()
594 |         folder_path = "./tweets/"
595 | 
596 |         if not os.path.exists(folder_path):
597 |             os.makedirs(folder_path)
598 |             print("Created Folder: {}".format(folder_path))
599 | 
600 |         data = {
601 |             "Name": [tweet[0] for tweet in self.data],
602 |             "Handle": [tweet[1] for tweet in self.data],
603 |             "Timestamp": [tweet[2] for tweet in self.data],
604 |             "Verified": [tweet[3] for tweet in self.data],
605 |             "Content": [tweet[4] for tweet in self.data],
606 |             "Comments": [tweet[5] for tweet in self.data],
607 |             "Retweets": [tweet[6] for tweet in self.data],
608 |             "Likes": [tweet[7] for tweet in self.data],
609 |             "Analytics": [tweet[8] for tweet in self.data],
610 |             "Tags": [tweet[9] for tweet in self.data],
611 |             "Mentions": [tweet[10] for tweet in self.data],
612 |             "Emojis": [tweet[11] for tweet in self.data],
613 |             "Profile Image": [tweet[12] for tweet in self.data],
614 |             "Tweet Link": [tweet[13] for tweet in self.data],
615 |             "Tweet ID": [f"tweet_id:{tweet[14]}" for tweet in self.data],
616 |         }
617 | 
618 |         if self.scraper_details["poster_details"]:
619 |             data["Tweeter ID"] = [f"user_id:{tweet[15]}" for tweet in self.data]
620 |             data["Following"] = [tweet[16] for tweet in self.data]
621 |             data["Followers"] = [tweet[17] for tweet in self.data]
622 | 
623 |         df = pd.DataFrame(data)
624 | 
625 |         current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
626 |         file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
627 |         pd.set_option("display.max_colwidth", None)
628 |         df.to_csv(file_path, index=False, encoding="utf-8")
629 | 
630 |         print("CSV Saved: {}".format(file_path))
631 | 
632 |         pass
633 | 
634 |     def get_tweets(self):
635 |         return self.data
636 | 
--------------------------------------------------------------------------------
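For orientation, a minimal end-to-end usage sketch of the package (not a file from this repository): it mirrors the flow of scraper/__main__.py and assumes it is run from inside the scraper/ directory so the flat imports resolve, with credentials supplied through the .env file. The tweet count and query below are illustrative placeholders.

# Illustrative sketch, not part of the repository.
import os

from dotenv import load_dotenv
from twitter_scraper import Twitter_Scraper

load_dotenv()  # reads TWITTER_USERNAME, TWITTER_PASSWORD, HEADLESS from .env

scraper = Twitter_Scraper(
    mail=os.getenv("TWITTER_MAIL"),
    username=os.getenv("TWITTER_USERNAME"),
    password=os.getenv("TWITTER_PASSWORD"),
    headlessState=os.getenv("HEADLESS", "yes"),
)
scraper.login()
scraper.scrape_tweets(
    max_tweets=10,  # illustrative value
    scrape_query='("NVDA" OR "nvidia") lang:en',  # illustrative query, same syntax as sample-command.txt
    scrape_top=True,
)
scraper.save_to_csv()
if not scraper.interrupted:
    scraper.driver.close()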
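The CSV written by save_to_csv() can then be loaded back for analysis with pandas. A short sketch, where the file name is a made-up placeholder following the {timestamp}_tweets_1-{count}.csv pattern produced above, and the selected columns come from the header layout in save_to_csv():

# Illustrative sketch; the file name below is a placeholder, not a real output file.
import pandas as pd

df = pd.read_csv("./tweets/2024-01-19_12-00-00_tweets_1-50.csv")
print(df[["Handle", "Timestamp", "Likes", "Content"]].head())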