├── .env.example
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── img
│   └── advanced-search-01.png
├── main.ipynb
├── requirements.txt
├── sample-command.txt
└── scraper
    ├── __init__.py
    ├── __main__.py
    ├── progress.py
    ├── scroller.py
    ├── tweet.py
    └── twitter_scraper.py

/.env.example:
--------------------------------------------------------------------------------
1 | TWITTER_USERNAME=# Your Twitter Handle
2 | TWITTER_PASSWORD=# Your Twitter Password
3 | HEADLESS=# Headless browser option (use "yes" or "no")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.csv
6 | 
7 | # C extensions
8 | *.so
9 | 
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 | 
55 | # Translations
56 | *.mo
57 | *.pot
58 | 
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 | 
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 | 
69 | # Scrapy stuff:
70 | .scrapy
71 | 
72 | # Sphinx documentation
73 | docs/_build/
74 | 
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 | 
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 | 
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 | 
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 | 
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 | 
98 | # poetry
99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | # This is especially recommended for binary packages to ensure reproducibility, and is more
101 | # commonly ignored for libraries.
102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 | 
105 | # pdm
106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | # in version control.
110 | # https://pdm.fming.dev/#use-with-ide
111 | .pdm.toml
112 | 
113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114 | __pypackages__/
115 | 
116 | # Celery stuff
117 | celerybeat-schedule
118 | celerybeat.pid
119 | 
120 | # SageMath parsed files
121 | *.sage.py
122 | 
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 | 
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 | 
136 | # Rope project settings
137 | .ropeproject
138 | 
139 | # mkdocs documentation
140 | /site
141 | 
142 | # mypy
143 | .mypy_cache/
144 | .dmypy.json
145 | dmypy.json
146 | 
147 | # Pyre type checker
148 | .pyre/
149 | 
150 | # pytype static type analyzer
151 | .pytype/
152 | 
153 | # Cython debug symbols
154 | cython_debug/
155 | 
156 | # PyCharm
157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159 | # and can be added to the global gitignore or merged into this file. For a more nuclear
160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161 | #.idea/
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to selenium-twitter-scraper
2 | 
3 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
4 | 
5 | - Reporting a bug
6 | - Discussing the current state of the code
7 | - Submitting a fix
8 | - Proposing new features
9 | - Becoming a maintainer
10 | 
11 | ## We Develop with GitHub
12 | 
13 | We use GitHub to host code, to track issues and feature requests, and to accept pull requests.
14 | 
15 | ## We Use [GitHub Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests
16 | 
17 | Pull requests are the best way to propose changes to the codebase (we use [GitHub Flow](https://docs.github.com/en/get-started/quickstart/github-flow)). We actively welcome your pull requests:
18 | 
19 | 1. Fork the repo and create your branch from `master`.
20 | 2. If you've added code that should be tested, add tests.
21 | 3. Ensure the test suite passes.
22 | 4. Make sure your code lints.
23 | 5. Issue that pull request!
24 | 
25 | ## Any contributions you make will be under the Apache License Version 2.0
26 | 
27 | In short, when you submit code changes, your submissions are understood to be under the same [Apache License Version 2.0](https://choosealicense.com/licenses/apache-2.0/) that covers the project. Feel free to contact the maintainers if that's a concern.
28 | 
29 | ## Report bugs using GitHub's [issues](https://github.com/godkingjay/selenium-twitter-scraper/issues)
30 | 
31 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/godkingjay/selenium-twitter-scraper/issues/new); it's that easy!
32 | 
33 | ## License
34 | 
35 | By contributing, you agree that your contributions will be licensed under the project's Apache License Version 2.0.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "[]"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!) The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # selenium-twitter-scraper
2 | 
3 | ## Setup
4 | 
5 | 1. Install dependencies
6 | 
7 | ```bash
8 | pip install -r requirements.txt
9 | ```
10 | 
11 | ## Authentication Options
12 | 
13 | ### Using Environment Variables
14 | 
15 | 1. Rename `.env.example` to `.env`.
16 | 
17 | 2. Open `.env` and update the environment variables.
18 | 
19 | ```bash
20 | TWITTER_USERNAME=# Your Twitter Handle (e.g. @username)
21 | TWITTER_PASSWORD=# Your Twitter Password
22 | HEADLESS=# Headless browser option (use "yes" or "no")
23 | ```
24 | 
25 | ### Authentication in Terminal
26 | 
27 | - Add a `username` and `password` to the command line.
28 | 
29 | ```bash
30 | python scraper --user=@elonmusk --password=password123
31 | ```
32 | 
33 | ### No Authentication Provided
34 | 
35 | - If you don't specify a username and password, the program will
36 |   prompt you to enter them.
37 | 
38 | ```bash
39 | Twitter Username: @username
40 | Password: password123
41 | ```
42 | 
43 | ---
44 | 
45 | **_Authentication Sequence Priority_**
46 | 
47 | ```bash
48 | 1. Authentication provided in terminal.
49 | 2. Authentication provided in environment variables.
50 | ```
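This priority falls out of how `scraper/__main__.py` wires up argparse: values from the environment (loaded with `python-dotenv`) only fill the argument defaults, so anything passed in the terminal wins, and whatever is still missing is prompted for interactively. A condensed sketch of that logic:

```python
import argparse
import getpass
import os

from dotenv import load_dotenv

load_dotenv()  # reads TWITTER_USERNAME / TWITTER_PASSWORD from .env into the environment

parser = argparse.ArgumentParser()
# Terminal arguments take priority automatically: the environment only supplies the default.
parser.add_argument("--user", type=str, default=os.getenv("TWITTER_USERNAME"))
parser.add_argument("--password", type=str, default=os.getenv("TWITTER_PASSWORD"))
args = parser.parse_args()

# Fall back to an interactive prompt if neither source provided a value.
username = args.user if args.user is not None else input("Twitter Username: ")
password = args.password if args.password is not None else getpass.getpass("Enter Password: ")
```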
51 | 
52 | ---
53 | 
54 | ## Usage
55 | 
56 | - Show Help
57 | 
58 | ```bash
59 | python scraper --help
60 | ```
61 | 
62 | - Basic usage
63 | 
64 | ```bash
65 | python scraper
66 | ```
67 | 
68 | - Set the maximum number of tweets to scrape. Defaults to `50`.
69 | 
70 | ```bash
71 | python scraper --tweets=500 # Scrape 500 Tweets
72 | ```
73 | 
74 | - Options and Arguments
75 | 
76 | ```bash
77 | usage: python scraper [option] ... [arg] ...
78 | 
79 | authentication options    description
80 | --user                  : Your Twitter account handle.
81 |                           e.g.
82 |                             --user=@username
83 | 
84 | --password              : Your Twitter account password.
85 |                           e.g.
86 |                             --password=password123
87 | 
88 | options:                  description
89 | -t, --tweets            : Number of tweets to scrape (default: 50).
90 |                           e.g.
91 |                             -t 500
92 |                             --tweets=500
93 | 
94 | -u, --username          : Twitter username.
95 |                           Scrape tweets from a user's profile.
96 |                           e.g.
97 |                             -u elonmusk
98 |                             --username=@elonmusk
99 | 
100 | -ht, --hashtag          : Twitter hashtag.
101 |                           Scrape tweets from a hashtag.
102 |                           e.g.
103 |                             -ht javascript
104 |                             --hashtag=javascript
105 | 
106 | -l, --list              : List ID. Scrape tweets from a list. The
107 |                           ID is taken from the x.com/list/... URL.
108 |                           e.g.
109 |                             -l "1324132413151"
110 |                             --list "1324132413151"
111 | 
112 | -q, --query             : Twitter query or search.
113 |                           Scrape tweets from a query or search.
114 |                           e.g.
115 |                             -q "Philippine Marites"
116 |                             --query="Jak Roberto anti selos"
117 | 
118 | -a, --add               : Additional data to scrape and
119 |                           save in the .csv file.
120 | 
121 |                           values:
122 |                             pd - poster's ID, followers, and following
123 | 
124 |                           e.g.
125 |                             -a "pd"
126 |                             --add="pd"
127 | 
128 |                           NOTE: Values must be separated by commas.
129 | 
130 | --latest                : Twitter latest tweets (default: True).
131 |                           Note: Only for hashtag-based
132 |                           and query-based scraping.
133 |                           usage:
134 |                             python scraper -t 500 -ht=python --latest
135 | 
136 | --top                   : Twitter top tweets (default: False).
137 |                           Note: Only for hashtag-based
138 |                           and query-based scraping.
139 |                           usage:
140 |                             python scraper -t 500 -ht=python --top
141 | 
142 | -ntl, --no_tweets_limit : Set no limit to the number of tweets to scrape
143 |                           (will scrape until no more tweets are available).
144 | ```
145 | 
146 | ### Sample Scraping Commands
147 | 
148 | - **Custom Limit Scraping**
149 | 
150 |   ```bash
151 |   python scraper -t 500
152 |   ```
153 | 
154 | - **User Profile Scraping**
155 | 
156 |   ```bash
157 |   python scraper -t 100 -u elonmusk
158 |   ```
159 | 
160 | - **Hashtag Scraping**
161 | 
162 |   - Latest
163 | 
164 |     ```bash
165 |     python scraper -t 100 -ht python --latest
166 |     ```
167 | 
168 |   - Top
169 | 
170 |     ```bash
171 |     python scraper -t 100 -ht python --top
172 |     ```
173 | 
174 | - **Query or Search Scraping**
175 |   _(Also works with Twitter's advanced search.)_
176 | 
177 |   - Latest
178 | 
179 |     ```bash
180 |     python scraper -t 100 -q "Jak Roberto Anti Selos" --latest
181 |     ```
182 | 
183 |   - Top
184 | 
185 |     ```bash
186 |     python scraper -t 100 -q "International News" --top
187 |     ```
188 | 
189 | - **Advanced Search Scraping**
190 | 
191 |   - For tweets mentioning `@elonmusk`:
192 | 
193 |     ```bash
194 |     python scraper --query="(@elonmusk)"
195 |     ```
196 | 
197 |   - For tweets that mention `@elonmusk` with at least `1000` replies from `January 01, 2020 - August 31, 2023`:
198 | 
199 |     ```bash
200 |     python scraper --query="(@elonmusk) min_replies:1000 until:2023-08-31 since:2020-01-01"
201 |     ```
202 | 
203 |   - To build a more complex `Advanced Search`, set up the query with Twitter's Advanced Search and copy the resulting query string into the program:
204 |     - **[Twitter Advanced Search](https://twitter.com/search-advanced)**
205 |       [![Image](./img/advanced-search-01.png)](./img/advanced-search-01.png)
206 | 
207 | - **Scrape Additional Data**
208 | 
209 |   ```bash
210 |   python scraper --add="pd"
211 |   ```
212 | 
213 |   | Values | Description                                        |
214 |   | :----: | :------------------------------------------------- |
215 |   |   pd   | Tweet poster's ID, followers, and following count. |
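With `--add="pd"`, these extra columns are simply appended to the standard ones in the saved CSV, so they can be pulled out with pandas after a run. A minimal sketch — the file name below is illustrative; the scraper writes to `./tweets/<timestamp>_tweets_1-<count>.csv`:

```python
import pandas as pd

# Illustrative file name; use the path the scraper prints after saving.
df = pd.read_csv("./tweets/2024-01-19_12-00-00_tweets_1-50.csv")

# "Tweeter ID", "Following", and "Followers" are only present when --add="pd" was used.
print(df[["Handle", "Tweeter ID", "Following", "Followers"]].head())
```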
216 | 
217 | ---
--------------------------------------------------------------------------------
/img/advanced-search-01.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/godkingjay/selenium-twitter-scraper/62d8ceb2f39a533d68965f309371efeeb9c676bd/img/advanced-search-01.png
--------------------------------------------------------------------------------
/main.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "attachments": {},
5 |    "cell_type": "markdown",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Twitter Scraper using Selenium\n",
9 |     "\n",
10 |     "Scraper for Twitter tweets using Selenium. It can scrape tweets from:\n",
11 |     "- Home/News Feed\n",
12 |     "- User Profile Tweets\n",
13 |     "- Query or Search Tweets\n",
14 |     "- Hashtag Tweets\n",
15 |     "- Advanced Search Tweets"
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "metadata": {},
22 |    "outputs": [],
23 |    "source": [
24 |     "import os\n",
25 |     "import sys\n",
26 |     "import pandas as pd\n",
27 |     "\n",
28 |     "from datetime import datetime\n",
29 |     "from fake_headers import Headers\n",
30 |     "from time import sleep\n",
31 |     "from selenium import webdriver\n",
32 |     "from selenium.webdriver import Chrome\n",
33 |     "from selenium.webdriver.common.keys import Keys\n",
34 |     "from selenium.common.exceptions import (\n",
35 |     "    NoSuchElementException,\n",
36 |     "    StaleElementReferenceException,\n",
37 |     "    WebDriverException,\n",
38 |     ")\n",
39 |     "from selenium.webdriver.common.action_chains import ActionChains\n",
40 |     "\n",
41 |     "from selenium.webdriver.chrome.webdriver import WebDriver\n",
42 |     "from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
43 |     "from selenium.webdriver.chrome.service import Service as ChromeService\n",
44 |     "\n",
45 |     "from webdriver_manager.chrome import ChromeDriverManager"
46 |    ]
47 |   },
48 |   {
49 |    "attachments": {},
50 |    "cell_type": "markdown",
51 |    "metadata": {},
52 |    "source": [
53 |     "# Progress Class\n",
54 |     "\n",
55 |     "Class for the progress of the scraper instance."
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": null,
61 |    "metadata": {},
62 |    "outputs": [],
63 |    "source": [
64 |     "class Progress:\n",
65 |     "    def __init__(self, current, total) -> None:\n",
66 |     "        self.current = current\n",
67 |     "        self.total = total\n",
68 |     "        pass\n",
69 |     "\n",
70 |     "    def print_progress(self, current) -> None:\n",
71 |     "        self.current = current\n",
72 |     "        progress = current / self.total\n",
73 |     "        bar_length = 40\n",
74 |     "        progress_bar = (\n",
75 |     "            \"[\"\n",
76 |     "            + \"=\" * int(bar_length * progress)\n",
77 |     "            + \"-\" * (bar_length - int(bar_length * progress))\n",
78 |     "            + \"]\"\n",
79 |     "        )\n",
80 |     "        sys.stdout.write(\n",
81 |     "            \"\\rProgress: {} {:.2%} {} of {}\".format(\n",
82 |     "                progress_bar, progress, current, self.total\n",
83 |     "            )\n",
84 |     "        )\n",
85 |     "        sys.stdout.flush()\n"
86 |    ]
87 |   },
88 |   {
89 |    "attachments": {},
90 |    "cell_type": "markdown",
91 |    "metadata": {},
92 |    "source": [
93 |     "# Scroller Class\n",
94 |     "\n",
95 |     "Class that controls scrolling of the web page."
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "class Scroller:\n", 105 | " def __init__(self, driver) -> None:\n", 106 | " self.driver = driver\n", 107 | " self.current_position = 0\n", 108 | " self.last_position = driver.execute_script(\"return window.pageYOffset;\")\n", 109 | " self.scrolling = True\n", 110 | " self.scroll_count = 0\n", 111 | " pass\n", 112 | "\n", 113 | " def reset(self) -> None:\n", 114 | " self.current_position = 0\n", 115 | " self.last_position = self.driver.execute_script(\"return window.pageYOffset;\")\n", 116 | " self.scroll_count = 0\n", 117 | " pass\n", 118 | "\n", 119 | " def scroll_to_top(self) -> None:\n", 120 | " self.driver.execute_script(\"window.scrollTo(0, 0);\")\n", 121 | " pass\n", 122 | "\n", 123 | " def scroll_to_bottom(self) -> None:\n", 124 | " self.driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n", 125 | " pass\n", 126 | "\n", 127 | " def update_scroll_position(self) -> None:\n", 128 | " self.current_position = self.driver.execute_script(\"return window.pageYOffset;\")\n", 129 | " pass\n" 130 | ] 131 | }, 132 | { 133 | "attachments": {}, 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "# Tweet Class\n", 138 | "\n", 139 | "Object for the tweet. Including its data." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "class Tweet:\n", 149 | " def __init__(\n", 150 | " self,\n", 151 | " card: WebDriver,\n", 152 | " driver: WebDriver,\n", 153 | " actions: ActionChains,\n", 154 | " scrape_poster_details=False\n", 155 | " ) -> None:\n", 156 | " self.card = card\n", 157 | " self.error = False\n", 158 | " self.tweet = None\n", 159 | "\n", 160 | " try:\n", 161 | " self.user = card.find_element(\n", 162 | " \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n", 163 | " ).text\n", 164 | " except NoSuchElementException:\n", 165 | " self.error = True\n", 166 | " self.user = \"skip\"\n", 167 | "\n", 168 | " try:\n", 169 | " self.handle = card.find_element(\n", 170 | " \"xpath\", './/span[contains(text(), \"@\")]'\n", 171 | " ).text\n", 172 | " except NoSuchElementException:\n", 173 | " self.error = True\n", 174 | " self.handle = \"skip\"\n", 175 | "\n", 176 | " try:\n", 177 | " self.date_time = card.find_element(\"xpath\", \".//time\").get_attribute(\n", 178 | " \"datetime\"\n", 179 | " )\n", 180 | "\n", 181 | " if self.date_time is not None:\n", 182 | " self.is_ad = False\n", 183 | " except NoSuchElementException:\n", 184 | " self.is_ad = True\n", 185 | " self.error = True\n", 186 | " self.date_time = \"skip\"\n", 187 | " \n", 188 | " if self.error:\n", 189 | " return\n", 190 | "\n", 191 | " try:\n", 192 | " card.find_element(\n", 193 | " \"xpath\", './/*[local-name()=\"svg\" and @data-testid=\"icon-verified\"]'\n", 194 | " )\n", 195 | "\n", 196 | " self.verified = True\n", 197 | " except NoSuchElementException:\n", 198 | " self.verified = False\n", 199 | "\n", 200 | " self.content = \"\"\n", 201 | " contents = card.find_elements(\n", 202 | " \"xpath\",\n", 203 | " '(.//div[@data-testid=\"tweetText\"])[1]/span | (.//div[@data-testid=\"tweetText\"])[1]/a',\n", 204 | " )\n", 205 | "\n", 206 | " for index, content in enumerate(contents):\n", 207 | " self.content += content.text\n", 208 | "\n", 209 | " try:\n", 210 | " self.reply_cnt = card.find_element(\n", 211 | " \"xpath\", 
'.//div[@data-testid=\"reply\"]//span'\n", 212 | " ).text\n", 213 | " \n", 214 | " if self.reply_cnt == \"\":\n", 215 | " self.reply_cnt = \"0\"\n", 216 | " except NoSuchElementException:\n", 217 | " self.reply_cnt = \"0\"\n", 218 | "\n", 219 | " try:\n", 220 | " self.retweet_cnt = card.find_element(\n", 221 | " \"xpath\", './/div[@data-testid=\"retweet\"]//span'\n", 222 | " ).text\n", 223 | " \n", 224 | " if self.retweet_cnt == \"\":\n", 225 | " self.retweet_cnt = \"0\"\n", 226 | " except NoSuchElementException:\n", 227 | " self.retweet_cnt = \"0\"\n", 228 | "\n", 229 | " try:\n", 230 | " self.like_cnt = card.find_element(\n", 231 | " \"xpath\", './/div[@data-testid=\"like\"]//span'\n", 232 | " ).text\n", 233 | " \n", 234 | " if self.like_cnt == \"\":\n", 235 | " self.like_cnt = \"0\"\n", 236 | " except NoSuchElementException:\n", 237 | " self.like_cnt = \"0\"\n", 238 | "\n", 239 | " try:\n", 240 | " self.analytics_cnt = card.find_element(\n", 241 | " \"xpath\", './/a[contains(@href, \"/analytics\")]//span'\n", 242 | " ).text\n", 243 | " \n", 244 | " if self.analytics_cnt == \"\":\n", 245 | " self.analytics_cnt = \"0\"\n", 246 | " except NoSuchElementException:\n", 247 | " self.analytics_cnt = \"0\"\n", 248 | "\n", 249 | " try:\n", 250 | " self.tags = card.find_elements(\n", 251 | " \"xpath\",\n", 252 | " './/a[contains(@href, \"src=hashtag_click\")]',\n", 253 | " )\n", 254 | "\n", 255 | " self.tags = [tag.text for tag in self.tags]\n", 256 | " except NoSuchElementException:\n", 257 | " self.tags = []\n", 258 | " \n", 259 | " try:\n", 260 | " self.mentions = card.find_elements(\n", 261 | " \"xpath\",\n", 262 | " '(.//div[@data-testid=\"tweetText\"])[1]//a[contains(text(), \"@\")]',\n", 263 | " )\n", 264 | "\n", 265 | " self.mentions = [mention.text for mention in self.mentions]\n", 266 | " except NoSuchElementException:\n", 267 | " self.mentions = []\n", 268 | " \n", 269 | " try:\n", 270 | " raw_emojis = card.find_elements(\n", 271 | " \"xpath\",\n", 272 | " '(.//div[@data-testid=\"tweetText\"])[1]/img[contains(@src, \"emoji\")]',\n", 273 | " )\n", 274 | " \n", 275 | " self.emojis = [emoji.get_attribute(\"alt\").encode(\"unicode-escape\").decode(\"ASCII\") for emoji in raw_emojis]\n", 276 | " except NoSuchElementException:\n", 277 | " self.emojis = []\n", 278 | " \n", 279 | " try:\n", 280 | " self.profile_img = card.find_element(\n", 281 | " \"xpath\", './/div[@data-testid=\"Tweet-User-Avatar\"]//img'\n", 282 | " ).get_attribute(\"src\")\n", 283 | " except NoSuchElementException:\n", 284 | " self.profile_img = \"\"\n", 285 | " \n", 286 | " try:\n", 287 | " self.tweet_link = self.card.find_element(\n", 288 | " \"xpath\",\n", 289 | " \".//a[contains(@href, '/status/')]\",\n", 290 | " ).get_attribute(\"href\")\n", 291 | " self.tweet_id = str(self.tweet_link.split(\"/\")[-1])\n", 292 | " except NoSuchElementException:\n", 293 | " self.tweet_link = \"\"\n", 294 | " self.tweet_id = \"\"\n", 295 | " \n", 296 | " self.following_cnt = \"0\"\n", 297 | " self.followers_cnt = \"0\"\n", 298 | " self.user_id = None\n", 299 | " \n", 300 | " if scrape_poster_details:\n", 301 | " el_name = card.find_element(\n", 302 | " \"xpath\", './/div[@data-testid=\"User-Name\"]//span'\n", 303 | " )\n", 304 | " \n", 305 | " ext_hover_card = False\n", 306 | " ext_user_id = False\n", 307 | " ext_following = False\n", 308 | " ext_followers = False\n", 309 | " hover_attempt = 0\n", 310 | " \n", 311 | " while not ext_hover_card or not ext_user_id or not ext_following or not ext_followers:\n", 312 | " try:\n", 313 | " 
actions.move_to_element(el_name).perform()\n",
314 |     "\n",
315 |     "                    hover_card = driver.find_element(\n",
316 |     "                        \"xpath\",\n",
317 |     "                        '//div[@data-testid=\"hoverCardParent\"]'\n",
318 |     "                    )\n",
319 |     "\n",
320 |     "                    ext_hover_card = True\n",
321 |     "\n",
322 |     "                    while not ext_user_id:\n",
323 |     "                        try:\n",
324 |     "                            raw_user_id = hover_card.find_element(\n",
325 |     "                                \"xpath\",\n",
326 |     "                                '(.//div[contains(@data-testid, \"-follow\")]) | (.//div[contains(@data-testid, \"-unfollow\")])'\n",
327 |     "                            ).get_attribute(\"data-testid\")\n",
328 |     "\n",
329 |     "                            if raw_user_id == \"\":\n",
330 |     "                                self.user_id = None\n",
331 |     "                            else:\n",
332 |     "                                self.user_id = str(raw_user_id.split(\"-\")[0])\n",
333 |     "\n",
334 |     "                            ext_user_id = True\n",
335 |     "                        except NoSuchElementException:\n",
336 |     "                            continue\n",
337 |     "                        except StaleElementReferenceException:\n",
338 |     "                            self.error = True\n",
339 |     "                            return\n",
340 |     "\n",
341 |     "                    while not ext_following:\n",
342 |     "                        try:\n",
343 |     "                            self.following_cnt = hover_card.find_element(\n",
344 |     "                                \"xpath\",\n",
345 |     "                                './/a[contains(@href, \"/following\")]//span'\n",
346 |     "                            ).text\n",
347 |     "\n",
348 |     "                            if self.following_cnt == \"\":\n",
349 |     "                                self.following_cnt = \"0\"\n",
350 |     "\n",
351 |     "                            ext_following = True\n",
352 |     "                        except NoSuchElementException:\n",
353 |     "                            continue\n",
354 |     "                        except StaleElementReferenceException:\n",
355 |     "                            self.error = True\n",
356 |     "                            return\n",
357 |     "\n",
358 |     "                    while not ext_followers:\n",
359 |     "                        try:\n",
360 |     "                            self.followers_cnt = hover_card.find_element(\n",
361 |     "                                \"xpath\",\n",
362 |     "                                './/a[contains(@href, \"/verified_followers\")]//span'\n",
363 |     "                            ).text\n",
364 |     "\n",
365 |     "                            if self.followers_cnt == \"\":\n",
366 |     "                                self.followers_cnt = \"0\"\n",
367 |     "\n",
368 |     "                            ext_followers = True\n",
369 |     "                        except NoSuchElementException:\n",
370 |     "                            continue\n",
371 |     "                        except StaleElementReferenceException:\n",
372 |     "                            self.error = True\n",
373 |     "                            return\n",
374 |     "                except NoSuchElementException:\n",
375 |     "                    if hover_attempt == 3:\n",
376 |     "                        self.error = True\n",
377 |     "                        return\n",
378 |     "                    hover_attempt += 1\n",
379 |     "                    sleep(0.5)\n",
380 |     "                    continue\n",
381 |     "                except StaleElementReferenceException:\n",
382 |     "                    self.error = True\n",
383 |     "                    return\n",
384 |     "\n",
385 |     "            if ext_hover_card and ext_following and ext_followers:\n",
386 |     "                actions.reset_actions()\n",
387 |     "\n",
388 |     "        self.tweet = (\n",
389 |     "            self.user,\n",
390 |     "            self.handle,\n",
391 |     "            self.date_time,\n",
392 |     "            self.verified,\n",
393 |     "            self.content,\n",
394 |     "            self.reply_cnt,\n",
395 |     "            self.retweet_cnt,\n",
396 |     "            self.like_cnt,\n",
397 |     "            self.analytics_cnt,\n",
398 |     "            self.tags,\n",
399 |     "            self.mentions,\n",
400 |     "            self.emojis,\n",
401 |     "            self.profile_img,\n",
402 |     "            self.tweet_link,\n",
403 |     "            self.tweet_id,\n",
404 |     "            self.user_id,\n",
405 |     "            self.following_cnt,\n",
406 |     "            self.followers_cnt,\n",
407 |     "        )\n",
408 |     "\n",
409 |     "        pass\n"
410 |    ]
411 |   },
412 |   {
413 |    "attachments": {},
414 |    "cell_type": "markdown",
415 |    "metadata": {},
416 |    "source": [
417 |     "# Twitter Scraper Class\n",
418 |     "\n",
419 |     "Class for the Twitter Scraper."
420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "TWITTER_LOGIN_URL = \"https://twitter.com/i/flow/login\"\n", 429 | "\n", 430 | "class Twitter_Scraper:\n", 431 | " def __init__(\n", 432 | " self,\n", 433 | " username,\n", 434 | " password,\n", 435 | " max_tweets=50,\n", 436 | " scrape_username=None,\n", 437 | " scrape_hashtag=None,\n", 438 | " scrape_query=None,\n", 439 | " scrape_poster_details=False,\n", 440 | " scrape_latest=True,\n", 441 | " scrape_top=False,\n", 442 | " ):\n", 443 | " print(\"Initializing Twitter Scraper...\")\n", 444 | " self.username = username\n", 445 | " self.password = password\n", 446 | " self.interrupted = False\n", 447 | " self.tweet_ids = set()\n", 448 | " self.data = []\n", 449 | " self.tweet_cards = []\n", 450 | " self.scraper_details = {\n", 451 | " \"type\": None,\n", 452 | " \"username\": None,\n", 453 | " \"hashtag\": None,\n", 454 | " \"query\": None,\n", 455 | " \"tab\": None,\n", 456 | " \"poster_details\": False,\n", 457 | " }\n", 458 | " self.max_tweets = max_tweets\n", 459 | " self.progress = Progress(0, max_tweets)\n", 460 | " self.router = self.go_to_home\n", 461 | " self.driver = self._get_driver()\n", 462 | " self.actions = ActionChains(self.driver)\n", 463 | " self.scroller = Scroller(self.driver)\n", 464 | " self._config_scraper(\n", 465 | " max_tweets,\n", 466 | " scrape_username,\n", 467 | " scrape_hashtag,\n", 468 | " scrape_query,\n", 469 | " scrape_latest,\n", 470 | " scrape_top,\n", 471 | " scrape_poster_details,\n", 472 | " )\n", 473 | "\n", 474 | " def _config_scraper(\n", 475 | " self,\n", 476 | " max_tweets=50,\n", 477 | " scrape_username=None,\n", 478 | " scrape_hashtag=None,\n", 479 | " scrape_query=None,\n", 480 | " scrape_latest=True,\n", 481 | " scrape_top=False,\n", 482 | " scrape_poster_details=False,\n", 483 | " ):\n", 484 | " self.tweet_ids = set()\n", 485 | " self.data = []\n", 486 | " self.tweet_cards = []\n", 487 | " self.max_tweets = max_tweets\n", 488 | " self.progress = Progress(0, max_tweets)\n", 489 | " self.scraper_details = {\n", 490 | " \"type\": None,\n", 491 | " \"username\": scrape_username,\n", 492 | " \"hashtag\": str(scrape_hashtag).replace(\"#\", \"\")\n", 493 | " if scrape_hashtag is not None\n", 494 | " else None,\n", 495 | " \"query\": scrape_query,\n", 496 | " \"tab\": \"Latest\" if scrape_latest else \"Top\" if scrape_top else \"Latest\",\n", 497 | " \"poster_details\": scrape_poster_details,\n", 498 | " }\n", 499 | " self.router = self.go_to_home\n", 500 | " self.scroller = Scroller(self.driver)\n", 501 | "\n", 502 | " if scrape_username is not None:\n", 503 | " self.scraper_details[\"type\"] = \"Username\"\n", 504 | " self.router = self.go_to_profile\n", 505 | " elif scrape_hashtag is not None:\n", 506 | " self.scraper_details[\"type\"] = \"Hashtag\"\n", 507 | " self.router = self.go_to_hashtag\n", 508 | " elif scrape_query is not None:\n", 509 | " self.scraper_details[\"type\"] = \"Query\"\n", 510 | " self.router = self.go_to_search\n", 511 | " else:\n", 512 | " self.scraper_details[\"type\"] = \"Home\"\n", 513 | " self.router = self.go_to_home\n", 514 | " pass\n", 515 | "\n", 516 | " def _get_driver(self):\n", 517 | " print(\"Setup WebDriver...\")\n", 518 | " header = Headers().generate()[\"User-Agent\"]\n", 519 | "\n", 520 | " browser_option = ChromeOptions()\n", 521 | " browser_option.add_argument(\"--no-sandbox\")\n", 522 | " browser_option.add_argument(\"--disable-dev-shm-usage\")\n", 523 | " 
browser_option.add_argument(\"--ignore-certificate-errors\")\n", 524 | " browser_option.add_argument(\"--disable-gpu\")\n", 525 | " browser_option.add_argument(\"--log-level=3\")\n", 526 | " browser_option.add_argument(\"--disable-notifications\")\n", 527 | " browser_option.add_argument(\"--disable-popup-blocking\")\n", 528 | " browser_option.add_argument(\"--user-agent={}\".format(header))\n", 529 | "\n", 530 | " # For Hiding Browser\n", 531 | " browser_option.add_argument(\"--headless\")\n", 532 | "\n", 533 | " try:\n", 534 | " print(\"Initializing ChromeDriver...\")\n", 535 | " driver = webdriver.Chrome(\n", 536 | " options=browser_option,\n", 537 | " )\n", 538 | "\n", 539 | " print(\"WebDriver Setup Complete\")\n", 540 | " return driver\n", 541 | " except WebDriverException:\n", 542 | " try:\n", 543 | " print(\"Downloading ChromeDriver...\")\n", 544 | " chromedriver_path = ChromeDriverManager().install()\n", 545 | " chrome_service = ChromeService(executable_path=chromedriver_path)\n", 546 | "\n", 547 | " print(\"Initializing ChromeDriver...\")\n", 548 | " driver = webdriver.Chrome(\n", 549 | " service=chrome_service,\n", 550 | " options=browser_option,\n", 551 | " )\n", 552 | "\n", 553 | " print(\"WebDriver Setup Complete\")\n", 554 | " return driver\n", 555 | " except Exception as e:\n", 556 | " print(f\"Error setting up WebDriver: {e}\")\n", 557 | " sys.exit(1)\n", 558 | " pass\n", 559 | "\n", 560 | " def login(self):\n", 561 | " print()\n", 562 | " print(\"Logging in to Twitter...\")\n", 563 | "\n", 564 | " try:\n", 565 | " self.driver.maximize_window()\n", 566 | " self.driver.get(TWITTER_LOGIN_URL)\n", 567 | " sleep(3)\n", 568 | "\n", 569 | " self._input_username()\n", 570 | " self._input_unusual_activity()\n", 571 | " self._input_password()\n", 572 | "\n", 573 | " cookies = self.driver.get_cookies()\n", 574 | "\n", 575 | " auth_token = None\n", 576 | "\n", 577 | " for cookie in cookies:\n", 578 | " if cookie[\"name\"] == \"auth_token\":\n", 579 | " auth_token = cookie[\"value\"]\n", 580 | " break\n", 581 | "\n", 582 | " if auth_token is None:\n", 583 | " raise ValueError(\n", 584 | " \"\"\"This may be due to the following:\n", 585 | "\n", 586 | "- Internet connection is unstable\n", 587 | "- Username is incorrect\n", 588 | "- Password is incorrect\n", 589 | "\"\"\"\n", 590 | " )\n", 591 | "\n", 592 | " print()\n", 593 | " print(\"Login Successful\")\n", 594 | " print()\n", 595 | " except Exception as e:\n", 596 | " print()\n", 597 | " print(f\"Login Failed: {e}\")\n", 598 | " sys.exit(1)\n", 599 | "\n", 600 | " pass\n", 601 | "\n", 602 | " def _input_username(self):\n", 603 | " input_attempt = 0\n", 604 | "\n", 605 | " while True:\n", 606 | " try:\n", 607 | " username = self.driver.find_element(\n", 608 | " \"xpath\", \"//input[@autocomplete='username']\"\n", 609 | " )\n", 610 | "\n", 611 | " username.send_keys(self.username)\n", 612 | " username.send_keys(Keys.RETURN)\n", 613 | " sleep(3)\n", 614 | " break\n", 615 | " except NoSuchElementException:\n", 616 | " input_attempt += 1\n", 617 | " if input_attempt >= 3:\n", 618 | " print()\n", 619 | " print(\n", 620 | " \"\"\"There was an error inputting the username.\n", 621 | "\n", 622 | "It may be due to the following:\n", 623 | "- Internet connection is unstable\n", 624 | "- Username is incorrect\n", 625 | "- Twitter is experiencing unusual activity\"\"\"\n", 626 | " )\n", 627 | " self.driver.quit()\n", 628 | " sys.exit(1)\n", 629 | " else:\n", 630 | " print(\"Re-attempting to input username...\")\n", 631 | " sleep(2)\n", 632 | 
"\n", 633 | " def _input_unusual_activity(self):\n", 634 | " input_attempt = 0\n", 635 | "\n", 636 | " while True:\n", 637 | " try:\n", 638 | " unusual_activity = self.driver.find_element(\n", 639 | " \"xpath\", \"//input[@data-testid='ocfEnterTextTextInput']\"\n", 640 | " )\n", 641 | " unusual_activity.send_keys(self.username)\n", 642 | " unusual_activity.send_keys(Keys.RETURN)\n", 643 | " sleep(3)\n", 644 | " break\n", 645 | " except NoSuchElementException:\n", 646 | " input_attempt += 1\n", 647 | " if input_attempt >= 3:\n", 648 | " break\n", 649 | "\n", 650 | " def _input_password(self):\n", 651 | " input_attempt = 0\n", 652 | "\n", 653 | " while True:\n", 654 | " try:\n", 655 | " password = self.driver.find_element(\n", 656 | " \"xpath\", \"//input[@autocomplete='current-password']\"\n", 657 | " )\n", 658 | "\n", 659 | " password.send_keys(self.password)\n", 660 | " password.send_keys(Keys.RETURN)\n", 661 | " sleep(3)\n", 662 | " break\n", 663 | " except NoSuchElementException:\n", 664 | " input_attempt += 1\n", 665 | " if input_attempt >= 3:\n", 666 | " print()\n", 667 | " print(\n", 668 | " \"\"\"There was an error inputting the password.\n", 669 | "\n", 670 | "It may be due to the following:\n", 671 | "- Internet connection is unstable\n", 672 | "- Password is incorrect\n", 673 | "- Twitter is experiencing unusual activity\"\"\"\n", 674 | " )\n", 675 | " self.driver.quit()\n", 676 | " sys.exit(1)\n", 677 | " else:\n", 678 | " print(\"Re-attempting to input password...\")\n", 679 | " sleep(2)\n", 680 | "\n", 681 | " def go_to_home(self):\n", 682 | " self.driver.get(\"https://twitter.com/home\")\n", 683 | " sleep(3)\n", 684 | " pass\n", 685 | "\n", 686 | " def go_to_profile(self):\n", 687 | " if (\n", 688 | " self.scraper_details[\"username\"] is None\n", 689 | " or self.scraper_details[\"username\"] == \"\"\n", 690 | " ):\n", 691 | " print(\"Username is not set.\")\n", 692 | " sys.exit(1)\n", 693 | " else:\n", 694 | " self.driver.get(f\"https://twitter.com/{self.scraper_details['username']}\")\n", 695 | " sleep(3)\n", 696 | " pass\n", 697 | "\n", 698 | " def go_to_hashtag(self):\n", 699 | " if (\n", 700 | " self.scraper_details[\"hashtag\"] is None\n", 701 | " or self.scraper_details[\"hashtag\"] == \"\"\n", 702 | " ):\n", 703 | " print(\"Hashtag is not set.\")\n", 704 | " sys.exit(1)\n", 705 | " else:\n", 706 | " url = f\"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click\"\n", 707 | " if self.scraper_details[\"tab\"] == \"Latest\":\n", 708 | " url += \"&f=live\"\n", 709 | "\n", 710 | " self.driver.get(url)\n", 711 | " sleep(3)\n", 712 | " pass\n", 713 | "\n", 714 | " def go_to_search(self):\n", 715 | " if self.scraper_details[\"query\"] is None or self.scraper_details[\"query\"] == \"\":\n", 716 | " print(\"Query is not set.\")\n", 717 | " sys.exit(1)\n", 718 | " else:\n", 719 | " url = f\"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query\"\n", 720 | " if self.scraper_details[\"tab\"] == \"Latest\":\n", 721 | " url += \"&f=live\"\n", 722 | "\n", 723 | " self.driver.get(url)\n", 724 | " sleep(3)\n", 725 | " pass\n", 726 | "\n", 727 | " def get_tweet_cards(self):\n", 728 | " self.tweet_cards = self.driver.find_elements(\n", 729 | " \"xpath\", '//article[@data-testid=\"tweet\" and not(@disabled)]'\n", 730 | " )\n", 731 | " pass\n", 732 | "\n", 733 | " def remove_hidden_cards(self):\n", 734 | " try:\n", 735 | " hidden_cards = self.driver.find_elements(\n", 736 | " \"xpath\", '//article[@data-testid=\"tweet\" and @disabled]'\n", 
737 | " )\n", 738 | "\n", 739 | " for card in hidden_cards[1:-2]:\n", 740 | " self.driver.execute_script(\n", 741 | " \"arguments[0].parentNode.parentNode.parentNode.remove();\", card\n", 742 | " )\n", 743 | " except Exception as e:\n", 744 | " return\n", 745 | " pass\n", 746 | "\n", 747 | " def scrape_tweets(\n", 748 | " self,\n", 749 | " max_tweets=50,\n", 750 | " scrape_username=None,\n", 751 | " scrape_hashtag=None,\n", 752 | " scrape_query=None,\n", 753 | " scrape_latest=True,\n", 754 | " scrape_top=False,\n", 755 | " scrape_poster_details=False,\n", 756 | " router=None,\n", 757 | " ):\n", 758 | " self._config_scraper(\n", 759 | " max_tweets,\n", 760 | " scrape_username,\n", 761 | " scrape_hashtag,\n", 762 | " scrape_query,\n", 763 | " scrape_latest,\n", 764 | " scrape_top,\n", 765 | " scrape_poster_details,\n", 766 | " )\n", 767 | "\n", 768 | " if router is None:\n", 769 | " router = self.router\n", 770 | "\n", 771 | " router()\n", 772 | "\n", 773 | " if self.scraper_details[\"type\"] == \"Username\":\n", 774 | " print(\n", 775 | " \"Scraping Tweets from @{}...\".format(self.scraper_details[\"username\"])\n", 776 | " )\n", 777 | " elif self.scraper_details[\"type\"] == \"Hashtag\":\n", 778 | " print(\n", 779 | " \"Scraping {} Tweets from #{}...\".format(\n", 780 | " self.scraper_details[\"tab\"], self.scraper_details[\"hashtag\"]\n", 781 | " )\n", 782 | " )\n", 783 | " elif self.scraper_details[\"type\"] == \"Query\":\n", 784 | " print(\n", 785 | " \"Scraping {} Tweets from {} search...\".format(\n", 786 | " self.scraper_details[\"tab\"], self.scraper_details[\"query\"]\n", 787 | " )\n", 788 | " )\n", 789 | " elif self.scraper_details[\"type\"] == \"Home\":\n", 790 | " print(\"Scraping Tweets from Home...\")\n", 791 | "\n", 792 | " self.progress.print_progress(0)\n", 793 | "\n", 794 | " refresh_count = 0\n", 795 | " added_tweets = 0\n", 796 | " empty_count = 0\n", 797 | "\n", 798 | " while self.scroller.scrolling:\n", 799 | " try:\n", 800 | " self.get_tweet_cards()\n", 801 | " added_tweets = 0\n", 802 | "\n", 803 | " for card in self.tweet_cards[-15:]:\n", 804 | " try:\n", 805 | " tweet_id = str(card)\n", 806 | "\n", 807 | " if tweet_id not in self.tweet_ids:\n", 808 | " self.tweet_ids.add(tweet_id)\n", 809 | "\n", 810 | " if not self.scraper_details[\"poster_details\"]:\n", 811 | " self.driver.execute_script(\n", 812 | " \"arguments[0].scrollIntoView();\", card\n", 813 | " )\n", 814 | "\n", 815 | " tweet = Tweet(\n", 816 | " card=card,\n", 817 | " driver=self.driver,\n", 818 | " actions=self.actions,\n", 819 | " scrape_poster_details=self.scraper_details[\n", 820 | " \"poster_details\"\n", 821 | " ],\n", 822 | " )\n", 823 | "\n", 824 | " if tweet:\n", 825 | " if not tweet.error and tweet.tweet is not None:\n", 826 | " if not tweet.is_ad:\n", 827 | " self.data.append(tweet.tweet)\n", 828 | " added_tweets += 1\n", 829 | " self.progress.print_progress(len(self.data))\n", 830 | "\n", 831 | " if len(self.data) >= self.max_tweets:\n", 832 | " self.scroller.scrolling = False\n", 833 | " break\n", 834 | " else:\n", 835 | " continue\n", 836 | " else:\n", 837 | " continue\n", 838 | " else:\n", 839 | " continue\n", 840 | " else:\n", 841 | " continue\n", 842 | " except NoSuchElementException:\n", 843 | " continue\n", 844 | "\n", 845 | " if len(self.data) >= self.max_tweets:\n", 846 | " break\n", 847 | "\n", 848 | " if added_tweets == 0:\n", 849 | " if empty_count >= 5:\n", 850 | " if refresh_count >= 3:\n", 851 | " print()\n", 852 | " print(\"No more tweets to scrape\")\n", 853 | " break\n", 
854 | " refresh_count += 1\n", 855 | " empty_count += 1\n", 856 | " sleep(1)\n", 857 | " else:\n", 858 | " empty_count = 0\n", 859 | " refresh_count = 0\n", 860 | " except StaleElementReferenceException:\n", 861 | " sleep(2)\n", 862 | " continue\n", 863 | " except KeyboardInterrupt:\n", 864 | " print(\"\\n\")\n", 865 | " print(\"Keyboard Interrupt\")\n", 866 | " self.interrupted = True\n", 867 | " break\n", 868 | " except Exception as e:\n", 869 | " print(\"\\n\")\n", 870 | " print(f\"Error scraping tweets: {e}\")\n", 871 | " break\n", 872 | "\n", 873 | " print(\"\")\n", 874 | "\n", 875 | " if len(self.data) >= self.max_tweets:\n", 876 | " print(\"Scraping Complete\")\n", 877 | " else:\n", 878 | " print(\"Scraping Incomplete\")\n", 879 | "\n", 880 | " print(\"Tweets: {} out of {}\\n\".format(len(self.data), self.max_tweets))\n", 881 | "\n", 882 | " pass\n", 883 | "\n", 884 | " def save_to_csv(self):\n", 885 | " print(\"Saving Tweets to CSV...\")\n", 886 | " now = datetime.now()\n", 887 | " folder_path = \"./tweets/\"\n", 888 | "\n", 889 | " if not os.path.exists(folder_path):\n", 890 | " os.makedirs(folder_path)\n", 891 | " print(\"Created Folder: {}\".format(folder_path))\n", 892 | "\n", 893 | " data = {\n", 894 | " \"Name\": [tweet[0] for tweet in self.data],\n", 895 | " \"Handle\": [tweet[1] for tweet in self.data],\n", 896 | " \"Timestamp\": [tweet[2] for tweet in self.data],\n", 897 | " \"Verified\": [tweet[3] for tweet in self.data],\n", 898 | " \"Content\": [tweet[4] for tweet in self.data],\n", 899 | " \"Comments\": [tweet[5] for tweet in self.data],\n", 900 | " \"Retweets\": [tweet[6] for tweet in self.data],\n", 901 | " \"Likes\": [tweet[7] for tweet in self.data],\n", 902 | " \"Analytics\": [tweet[8] for tweet in self.data],\n", 903 | " \"Tags\": [tweet[9] for tweet in self.data],\n", 904 | " \"Mentions\": [tweet[10] for tweet in self.data],\n", 905 | " \"Emojis\": [tweet[11] for tweet in self.data],\n", 906 | " \"Profile Image\": [tweet[12] for tweet in self.data],\n", 907 | " \"Tweet Link\": [tweet[13] for tweet in self.data],\n", 908 | " \"Tweet ID\": [f'tweet_id:{tweet[14]}' for tweet in self.data],\n", 909 | " }\n", 910 | "\n", 911 | " if self.scraper_details[\"poster_details\"]:\n", 912 | " data[\"Tweeter ID\"] = [f'user_id:{tweet[15]}' for tweet in self.data]\n", 913 | " data[\"Following\"] = [tweet[16] for tweet in self.data]\n", 914 | " data[\"Followers\"] = [tweet[17] for tweet in self.data]\n", 915 | "\n", 916 | " df = pd.DataFrame(data)\n", 917 | "\n", 918 | " current_time = now.strftime(\"%Y-%m-%d_%H-%M-%S\")\n", 919 | " file_path = f\"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv\"\n", 920 | " pd.set_option(\"display.max_colwidth\", None)\n", 921 | " df.to_csv(file_path, index=False, encoding=\"utf-8\")\n", 922 | "\n", 923 | " print(\"CSV Saved: {}\".format(file_path))\n", 924 | "\n", 925 | " pass\n", 926 | "\n", 927 | " def get_tweets(self):\n", 928 | " return self.data" 929 | ] 930 | }, 931 | { 932 | "attachments": {}, 933 | "cell_type": "markdown", 934 | "metadata": {}, 935 | "source": [ 936 | "# Create a new instance of the Twitter Scraper class" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "metadata": {}, 943 | "outputs": [], 944 | "source": [ 945 | "USER_UNAME = os.environ['TWITTER_USERNAME']\n", 946 | "USER_PASSWORD = os.environ['TWITTER_PASSWORD']\n", 947 | "\n", 948 | "scraper = Twitter_Scraper(\n", 949 | " username=USER_UNAME,\n", 950 | " password=USER_PASSWORD,\n", 951 | " # max_tweets=10,\n", 952 | 
" # scrape_username=\"something\",\n", 953 | " # scrape_hashtag=\"something\",\n", 954 | " # scrape_query=\"something\",\n", 955 | " # scrape_latest=False,\n", 956 | " # scrape_top=True,\n", 957 | " # scrape_poster_details=True\n", 958 | ")" 959 | ] 960 | }, 961 | { 962 | "cell_type": "code", 963 | "execution_count": null, 964 | "metadata": {}, 965 | "outputs": [], 966 | "source": [ 967 | "scraper.login()" 968 | ] 969 | }, 970 | { 971 | "attachments": {}, 972 | "cell_type": "markdown", 973 | "metadata": {}, 974 | "source": [ 975 | "# Run Twitter Scraper" 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [ 984 | "scraper.scrape_tweets(\n", 985 | " # max_tweets=100,\n", 986 | " # scrape_username=\"something\",\n", 987 | " # scrape_hashtag=\"something\",\n", 988 | " # scrape_query=\"something\",\n", 989 | " # scrape_latest=False,\n", 990 | " # scrape_top=True,\n", 991 | " # scrape_poster_details=True,\n", 992 | ")" 993 | ] 994 | }, 995 | { 996 | "attachments": {}, 997 | "cell_type": "markdown", 998 | "metadata": {}, 999 | "source": [ 1000 | "# Save Scraped Tweets in a CSV" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "metadata": {}, 1007 | "outputs": [], 1008 | "source": [ 1009 | "scraper.save_to_csv()" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [ 1018 | "scraper.driver.close()" 1019 | ] 1020 | } 1021 | ], 1022 | "metadata": { 1023 | "kernelspec": { 1024 | "display_name": "ml", 1025 | "language": "python", 1026 | "name": "python3" 1027 | }, 1028 | "language_info": { 1029 | "codemirror_mode": { 1030 | "name": "ipython", 1031 | "version": 3 1032 | }, 1033 | "file_extension": ".py", 1034 | "mimetype": "text/x-python", 1035 | "name": "python", 1036 | "nbconvert_exporter": "python", 1037 | "pygments_lexer": "ipython3", 1038 | "version": "3.11.5" 1039 | }, 1040 | "orig_nbformat": 4 1041 | }, 1042 | "nbformat": 4, 1043 | "nbformat_minor": 2 1044 | } 1045 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fake_headers>=1.0.2 2 | pandas>=2.0.3 3 | python-dotenv>=1.0.0 4 | selenium>=4.12.0 5 | webdriver_manager>=4.0.0 6 | -------------------------------------------------------------------------------- /sample-command.txt: -------------------------------------------------------------------------------- 1 | python scraper --query='("NVDA" OR "nvidia") lang:en until:2024-01-19 since:2024-01-18' -t 5000 --top 2 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godkingjay/selenium-twitter-scraper/62d8ceb2f39a533d68965f309371efeeb9c676bd/scraper/__init__.py -------------------------------------------------------------------------------- /scraper/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import getpass 5 | from twitter_scraper import Twitter_Scraper 6 | 7 | try: 8 | from dotenv import load_dotenv 9 | 10 | print("Loading .env file") 11 | load_dotenv() 12 | print("Loaded .env file\n") 13 | except Exception as e: 14 | print(f"Error loading .env file: {e}") 15 | 
    sys.exit(1)
16 | 
17 | 
18 | def main():
19 |     try:
20 |         parser = argparse.ArgumentParser(
21 |             add_help=True,
22 |             usage="python scraper [option] ... [arg] ...",
23 |             description="Twitter Scraper is a tool that allows you to scrape tweets from Twitter without using Twitter's API.",
24 |         )
25 | 
26 |         try:
27 |             parser.add_argument(
28 |                 "--mail",
29 |                 type=str,
30 |                 default=os.getenv("TWITTER_MAIL"),
31 |                 help="Your Twitter mail.",
32 |             )
33 | 
34 |             parser.add_argument(
35 |                 "--user",
36 |                 type=str,
37 |                 default=os.getenv("TWITTER_USERNAME"),
38 |                 help="Your Twitter username.",
39 |             )
40 | 
41 |             parser.add_argument(
42 |                 "--password",
43 |                 type=str,
44 |                 default=os.getenv("TWITTER_PASSWORD"),
45 |                 help="Your Twitter password.",
46 |             )
47 | 
48 |             parser.add_argument(
49 |                 "--headlessState",
50 |                 type=str,
51 |                 default=os.getenv("HEADLESS"),
52 |                 help="Headless mode? [yes/no]"
53 |             )
54 |         except Exception as e:
55 |             print(f"Error retrieving environment variables: {e}")
56 |             sys.exit(1)
57 | 
58 |         parser.add_argument(
59 |             "-t",
60 |             "--tweets",
61 |             type=int,
62 |             default=50,
63 |             help="Number of tweets to scrape (default: 50)",
64 |         )
65 | 
66 |         parser.add_argument(
67 |             "-u",
68 |             "--username",
69 |             type=str,
70 |             default=None,
71 |             help="Twitter username. Scrape tweets from a user's profile.",
72 |         )
73 | 
74 |         parser.add_argument(
75 |             "-ht",
76 |             "--hashtag",
77 |             type=str,
78 |             default=None,
79 |             help="Twitter hashtag. Scrape tweets from a hashtag.",
80 |         )
81 | 
82 |         parser.add_argument(
83 |             "--bookmarks",
84 |             action='store_true',
85 |             help="Twitter bookmarks. Scrape tweets from your bookmarks.",
86 |         )
87 | 
88 |         parser.add_argument(
89 |             "-ntl",
90 |             "--no_tweets_limit",
91 |             nargs='?',
92 |             default=False,
93 |             help="Set no limit to the number of tweets to scrape (will scrape until no more tweets are available).",
94 |         )
95 | 
96 |         parser.add_argument(
97 |             "-l",
98 |             "--list",
99 |             type=str,
100 |             default=None,
101 |             help="List ID. Scrape tweets from a list.",
102 |         )
103 | 
104 |         parser.add_argument(
105 |             "-q",
106 |             "--query",
107 |             type=str,
108 |             default=None,
109 |             help="Twitter query or search. Scrape tweets from a query or search.",
110 |         )
111 | 
112 |         parser.add_argument(
113 |             "-a",
114 |             "--add",
115 |             type=str,
116 |             default="",
117 |             help="Additional data to scrape and save in the .csv file.",
118 |         )
119 | 
120 |         parser.add_argument(
121 |             "--latest",
122 |             action="store_true",
123 |             help="Scrape latest tweets",
124 |         )
125 | 
126 |         parser.add_argument(
127 |             "--top",
128 |             action="store_true",
129 |             help="Scrape top tweets",
130 |         )
131 | 
132 |         args = parser.parse_args()
133 | 
134 |         USER_MAIL = args.mail
135 |         USER_UNAME = args.user
136 |         USER_PASSWORD = args.password
137 |         HEADLESS_MODE = args.headlessState
138 | 
139 |         if USER_UNAME is None:
140 |             USER_UNAME = input("Twitter Username: ")
141 | 
142 |         if USER_PASSWORD is None:
143 |             USER_PASSWORD = getpass.getpass("Enter Password: ")
144 | 
145 |         if HEADLESS_MODE is None:
146 |             HEADLESS_MODE = str(input("Headless? [yes/no]: ")).lower()
147 | 
148 |         print()
149 | 
150 |         tweet_type_args = []
151 | 
152 |         if args.username is not None:
153 |             tweet_type_args.append(args.username)
154 |         if args.hashtag is not None:
155 |             tweet_type_args.append(args.hashtag)
156 |         if args.list is not None:
157 |             tweet_type_args.append(args.list)
158 |         if args.query is not None:
159 |             tweet_type_args.append(args.query)
160 |         if args.bookmarks is not False:
161 |             tweet_type_args.append(args.bookmarks)
162 | 
163 |         additional_data: list = args.add.split(",")
164 | 
165 |         if len(tweet_type_args) > 1:
166 |             print("Please specify only one of --username, --hashtag, --list, --bookmarks, or --query.")
167 |             sys.exit(1)
168 | 
169 |         if args.latest and args.top:
170 |             print("Please specify either --latest or --top. Not both.")
171 |             sys.exit(1)
172 | 
173 |         if USER_UNAME is not None and USER_PASSWORD is not None:
174 |             scraper = Twitter_Scraper(
175 |                 mail=USER_MAIL,
176 |                 username=USER_UNAME,
177 |                 password=USER_PASSWORD,
178 |                 headlessState=HEADLESS_MODE
179 |             )
180 |             scraper.login()
181 |             scraper.scrape_tweets(
182 |                 max_tweets=args.tweets,
183 |                 no_tweets_limit=args.no_tweets_limit if args.no_tweets_limit is not None else True,
184 |                 scrape_username=args.username,
185 |                 scrape_hashtag=args.hashtag,
186 |                 scrape_bookmarks=args.bookmarks,
187 |                 scrape_query=args.query,
188 |                 scrape_list=args.list,
189 |                 scrape_latest=args.latest,
190 |                 scrape_top=args.top,
191 |                 scrape_poster_details="pd" in additional_data,
192 |             )
193 |             scraper.save_to_csv()
194 |             if not scraper.interrupted:
195 |                 scraper.driver.close()
196 |         else:
197 |             print(
198 |                 "Missing Twitter username or password environment variables. Please check your .env file."
199 |             )
200 |             sys.exit(1)
201 |     except KeyboardInterrupt:
202 |         print("\nScript Interrupted by user. Exiting...")
Exiting...") 203 | sys.exit(1) 204 | except Exception as e: 205 | print(f"Error: {e}") 206 | sys.exit(1) 207 | sys.exit(1) 208 | 209 | 210 | if __name__ == "__main__": 211 | main() 212 | -------------------------------------------------------------------------------- /scraper/progress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | class Progress: 5 | def __init__(self, current, total) -> None: 6 | self.current = current 7 | self.total = total 8 | pass 9 | 10 | def print_progress(self, current, waiting, retry_cnt, no_tweets_limit) -> None: 11 | self.current = current 12 | progress = current / self.total 13 | bar_length = 40 14 | progress_bar = ( 15 | "[" 16 | + "=" * int(bar_length * progress) 17 | + "-" * (bar_length - int(bar_length * progress)) 18 | + "]" 19 | ) 20 | if no_tweets_limit: 21 | if waiting: 22 | sys.stdout.write( 23 | "\rTweets scraped : {} - waiting to access older tweets {} min on 15 min".format( 24 | current, retry_cnt 25 | ) 26 | ) 27 | else: 28 | sys.stdout.write( 29 | "\rTweets scraped : {} ".format( 30 | current 31 | ) 32 | ) 33 | else: 34 | if waiting: 35 | sys.stdout.write( 36 | "\rProgress: [{:<40}] {:.2%} {} of {} - waiting to access older tweets {} min on 15 min".format( 37 | progress_bar, progress, current, self.total, retry_cnt 38 | ) 39 | ) 40 | else: 41 | sys.stdout.write( 42 | "\rProgress: [{:<40}] {:.2%} {} of {} ".format( 43 | progress_bar, progress, current, self.total 44 | ) 45 | ) 46 | sys.stdout.flush() 47 | -------------------------------------------------------------------------------- /scraper/scroller.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | 4 | 5 | class Scroller: 6 | def __init__(self, driver) -> None: 7 | self.driver = driver 8 | self.current_position = 0 9 | self.last_position = driver.execute_script("return window.pageYOffset;") 10 | self.scrolling = True 11 | self.scroll_count = 0 12 | pass 13 | 14 | def reset(self) -> None: 15 | self.current_position = 0 16 | self.last_position = self.driver.execute_script("return window.pageYOffset;") 17 | self.scroll_count = 0 18 | pass 19 | 20 | def scroll_to_top(self) -> None: 21 | self.driver.execute_script("window.scrollTo(0, 0);") 22 | pass 23 | 24 | def scroll_to_bottom(self) -> None: 25 | self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 26 | pass 27 | 28 | def update_scroll_position(self) -> None: 29 | self.current_position = self.driver.execute_script("return window.pageYOffset;") 30 | pass 31 | -------------------------------------------------------------------------------- /scraper/tweet.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from selenium.common.exceptions import ( 3 | NoSuchElementException, 4 | StaleElementReferenceException, 5 | ) 6 | from selenium.webdriver.chrome.webdriver import WebDriver 7 | from selenium.webdriver.common.action_chains import ActionChains 8 | 9 | 10 | class Tweet: 11 | def __init__( 12 | self, 13 | card: WebDriver, 14 | driver: WebDriver, 15 | actions: ActionChains, 16 | scrape_poster_details=False, 17 | ) -> None: 18 | self.card = card 19 | self.error = False 20 | self.tweet = None 21 | 22 | try: 23 | self.user = card.find_element( 24 | "xpath", './/div[@data-testid="User-Name"]//span' 25 | ).text 26 | except NoSuchElementException: 27 | self.error = True 28 | self.user = "skip" 29 | 30 | try: 31 | self.handle = 
31 |             self.handle = card.find_element(
32 |                 "xpath", './/span[contains(text(), "@")]'
33 |             ).text
34 |         except NoSuchElementException:
35 |             self.error = True
36 |             self.handle = "skip"
37 | 
38 |         try:
39 |             self.date_time = card.find_element("xpath", ".//time").get_attribute(
40 |                 "datetime"
41 |             )
42 | 
43 |             if self.date_time is not None:
44 |                 self.is_ad = False
45 |         except NoSuchElementException:
46 |             self.is_ad = True
47 |             self.error = True
48 |             self.date_time = "skip"
49 | 
50 |         if self.error:
51 |             return
52 | 
53 |         try:
54 |             card.find_element(
55 |                 "xpath", './/*[local-name()="svg" and @data-testid="icon-verified"]'
56 |             )
57 | 
58 |             self.verified = True
59 |         except NoSuchElementException:
60 |             self.verified = False
61 | 
62 |         self.content = ""
63 |         contents = card.find_elements(
64 |             "xpath",
65 |             '(.//div[@data-testid="tweetText"])[1]/span | (.//div[@data-testid="tweetText"])[1]/a',
66 |         )
67 | 
68 |         for index, content in enumerate(contents):
69 |             self.content += content.text
70 | 
71 |         try:
72 |             self.reply_cnt = card.find_element(
73 |                 "xpath", './/button[@data-testid="reply"]//span'
74 |             ).text
75 | 
76 |             if self.reply_cnt == "":
77 |                 self.reply_cnt = "0"
78 |         except NoSuchElementException:
79 |             self.reply_cnt = "0"
80 | 
81 |         try:
82 |             self.retweet_cnt = card.find_element(
83 |                 "xpath", './/button[@data-testid="retweet"]//span'
84 |             ).text
85 | 
86 |             if self.retweet_cnt == "":
87 |                 self.retweet_cnt = "0"
88 |         except NoSuchElementException:
89 |             self.retweet_cnt = "0"
90 | 
91 |         try:
92 |             self.like_cnt = card.find_element(
93 |                 "xpath", './/button[@data-testid="like"]//span'
94 |             ).text
95 | 
96 |             if self.like_cnt == "":
97 |                 self.like_cnt = "0"
98 |         except NoSuchElementException:
99 |             self.like_cnt = "0"
100 | 
101 |         try:
102 |             self.analytics_cnt = card.find_element(
103 |                 "xpath", './/a[contains(@href, "/analytics")]//span'
104 |             ).text
105 | 
106 |             if self.analytics_cnt == "":
107 |                 self.analytics_cnt = "0"
108 |         except NoSuchElementException:
109 |             self.analytics_cnt = "0"
110 | 
111 |         try:
112 |             self.tags = card.find_elements(
113 |                 "xpath",
114 |                 './/a[contains(@href, "src=hashtag_click")]',
115 |             )
116 | 
117 |             self.tags = [tag.text for tag in self.tags]
118 |         except NoSuchElementException:
119 |             self.tags = []
120 | 
121 |         try:
122 |             self.mentions = card.find_elements(
123 |                 "xpath",
124 |                 '(.//div[@data-testid="tweetText"])[1]//a[contains(text(), "@")]',
125 |             )
126 | 
127 |             self.mentions = [mention.text for mention in self.mentions]
128 |         except NoSuchElementException:
129 |             self.mentions = []
130 | 
131 |         try:
132 |             raw_emojis = card.find_elements(
133 |                 "xpath",
134 |                 '(.//div[@data-testid="tweetText"])[1]/img[contains(@src, "emoji")]',
135 |             )
136 | 
137 |             self.emojis = [
138 |                 emoji.get_attribute("alt").encode("unicode-escape").decode("ASCII")
139 |                 for emoji in raw_emojis
140 |             ]
141 |         except NoSuchElementException:
142 |             self.emojis = []
143 | 
144 |         try:
145 |             self.profile_img = card.find_element(
146 |                 "xpath", './/div[@data-testid="Tweet-User-Avatar"]//img'
147 |             ).get_attribute("src")
148 |         except NoSuchElementException:
149 |             self.profile_img = ""
150 | 
151 |         try:
152 |             self.tweet_link = self.card.find_element(
153 |                 "xpath",
154 |                 ".//a[contains(@href, '/status/')]",
155 |             ).get_attribute("href")
156 |             self.tweet_id = str(self.tweet_link.split("/")[-1])
157 |         except NoSuchElementException:
158 |             self.tweet_link = ""
159 |             self.tweet_id = ""
160 | 
161 |         self.following_cnt = "0"
162 |         self.followers_cnt = "0"
163 |         self.user_id = None
164 | 
165 |         if scrape_poster_details:
166 |             el_name = card.find_element(
167 |                 "xpath", './/div[@data-testid="User-Name"]//span'
168 |             )
169 | 
170 |             ext_hover_card = False
171 |             ext_user_id = False
172 |             ext_following = False
173 |             ext_followers = False
174 |             hover_attempt = 0
175 | 
176 |             while (
177 |                 not ext_hover_card
178 |                 or not ext_user_id
179 |                 or not ext_following
180 |                 or not ext_followers
181 |             ):
182 |                 try:
183 |                     actions.move_to_element(el_name).perform()
184 | 
185 |                     hover_card = driver.find_element(
186 |                         "xpath", '//div[@data-testid="hoverCardParent"]'
187 |                     )
188 | 
189 |                     ext_hover_card = True
190 | 
191 |                     while not ext_user_id:
192 |                         try:
193 |                             raw_user_id = hover_card.find_element(
194 |                                 "xpath",
195 |                                 '(.//div[contains(@data-testid, "-follow")]) | (.//div[contains(@data-testid, "-unfollow")])',
196 |                             ).get_attribute("data-testid")
197 | 
198 |                             if raw_user_id == "":
199 |                                 self.user_id = None
200 |                             else:
201 |                                 self.user_id = str(raw_user_id.split("-")[0])
202 | 
203 |                             ext_user_id = True
204 |                         except NoSuchElementException:
205 |                             continue
206 |                         except StaleElementReferenceException:
207 |                             self.error = True
208 |                             return
209 | 
210 |                     while not ext_following:
211 |                         try:
212 |                             self.following_cnt = hover_card.find_element(
213 |                                 "xpath", './/a[contains(@href, "/following")]//span'
214 |                             ).text
215 | 
216 |                             if self.following_cnt == "":
217 |                                 self.following_cnt = "0"
218 | 
219 |                             ext_following = True
220 |                         except NoSuchElementException:
221 |                             continue
222 |                         except StaleElementReferenceException:
223 |                             self.error = True
224 |                             return
225 | 
226 |                     while not ext_followers:
227 |                         try:
228 |                             self.followers_cnt = hover_card.find_element(
229 |                                 "xpath",
230 |                                 './/a[contains(@href, "/verified_followers")]//span',
231 |                             ).text
232 | 
233 |                             if self.followers_cnt == "":
234 |                                 self.followers_cnt = "0"
235 | 
236 |                             ext_followers = True
237 |                         except NoSuchElementException:
238 |                             continue
239 |                         except StaleElementReferenceException:
240 |                             self.error = True
241 |                             return
242 |                 except NoSuchElementException:
243 |                     if hover_attempt == 3:
244 |                         self.error = True
245 |                         return
246 |                     hover_attempt += 1
247 |                     sleep(0.5)
248 |                     continue
249 |                 except StaleElementReferenceException:
250 |                     self.error = True
251 |                     return
252 | 
253 |             if ext_hover_card and ext_following and ext_followers:
254 |                 actions.reset_actions()
255 | 
256 |         self.tweet = (
257 |             self.user,
258 |             self.handle,
259 |             self.date_time,
260 |             self.verified,
261 |             self.content,
262 |             self.reply_cnt,
263 |             self.retweet_cnt,
264 |             self.like_cnt,
265 |             self.analytics_cnt,
266 |             self.tags,
267 |             self.mentions,
268 |             self.emojis,
269 |             self.profile_img,
270 |             self.tweet_link,
271 |             self.tweet_id,
272 |             self.user_id,
273 |             self.following_cnt,
274 |             self.followers_cnt,
275 |         )
276 | 
277 |         pass
278 | 
--------------------------------------------------------------------------------
/scraper/twitter_scraper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pandas as pd
4 | from progress import Progress
5 | from scroller import Scroller
6 | from tweet import Tweet
7 | 
8 | from datetime import datetime
9 | from fake_headers import Headers
10 | from time import sleep
11 | 
12 | from selenium import webdriver
13 | from selenium.webdriver.common.keys import Keys
14 | from selenium.common.exceptions import (
15 |     NoSuchElementException,
16 |     StaleElementReferenceException,
17 |     WebDriverException,
18 | )
19 | from selenium.webdriver.common.action_chains import ActionChains
20 | from selenium.webdriver.chrome.options import Options as ChromeOptions
21 | from selenium.webdriver.chrome.service import Service as ChromeService
22 | 
23 | from selenium.webdriver.firefox.options import Options as FirefoxOptions
24 | from selenium.webdriver.firefox.service import Service as FirefoxService
25 | 
26 | from selenium.webdriver.support.ui import WebDriverWait
27 | 
28 | from webdriver_manager.chrome import ChromeDriverManager
29 | from webdriver_manager.firefox import GeckoDriverManager
30 | 
31 | TWITTER_LOGIN_URL = "https://twitter.com/i/flow/login"
32 | 
33 | 
34 | class Twitter_Scraper:
35 |     def __init__(
36 |         self,
37 |         mail,
38 |         username,
39 |         password,
40 |         headlessState,
41 |         max_tweets=50,
42 |         scrape_username=None,
43 |         scrape_hashtag=None,
44 |         scrape_query=None,
45 |         scrape_bookmarks=False,
46 |         scrape_poster_details=False,
47 |         scrape_latest=True,
48 |         scrape_top=False,
49 |         proxy=None,
50 |     ):
51 |         print("Initializing Twitter Scraper...")
52 |         self.mail = mail
53 |         self.username = username
54 |         self.password = password
55 |         self.headlessState = headlessState
56 |         self.interrupted = False
57 |         self.tweet_ids = set()
58 |         self.data = []
59 |         self.tweet_cards = []
60 |         self.scraper_details = {
61 |             "type": None,
62 |             "username": None,
63 |             "hashtag": None,
64 |             "bookmarks": False,
65 |             "query": None,
66 |             "tab": None,
67 |             "poster_details": False,
68 |         }
69 |         self.max_tweets = max_tweets
70 |         self.progress = Progress(0, max_tweets)
71 |         self.router = self.go_to_home
72 |         self.driver = self._get_driver(proxy)
73 |         self.actions = ActionChains(self.driver)
74 |         self.scroller = Scroller(self.driver)
75 |         self._config_scraper(
76 |             max_tweets,
77 |             scrape_username,
78 |             scrape_hashtag,
79 |             scrape_bookmarks,
80 |             scrape_query,
81 |             scrape_latest=scrape_latest,  # keyword arguments: _config_scraper also takes scrape_list, which __init__ does not expose
82 |             scrape_top=scrape_top,
83 |             scrape_poster_details=scrape_poster_details,
84 |         )
85 | 
86 |     def _config_scraper(
87 |         self,
88 |         max_tweets=50,
89 |         scrape_username=None,
90 |         scrape_hashtag=None,
91 |         scrape_bookmarks=False,
92 |         scrape_query=None,
93 |         scrape_list=None,
94 |         scrape_latest=True,
95 |         scrape_top=False,
96 |         scrape_poster_details=False,
97 |     ):
98 |         self.tweet_ids = set()
99 |         self.data = []
100 |         self.tweet_cards = []
101 |         self.max_tweets = max_tweets
102 |         self.progress = Progress(0, max_tweets)
103 |         self.scraper_details = {
104 |             "type": None,
105 |             "username": scrape_username,
106 |             "hashtag": str(scrape_hashtag).replace("#", "")
107 |             if scrape_hashtag is not None
108 |             else None,
109 |             "bookmarks": scrape_bookmarks,
110 |             "query": scrape_query,
111 |             "list": scrape_list,
112 |             "tab": "Latest" if scrape_latest else "Top" if scrape_top else "Latest",
113 |             "poster_details": scrape_poster_details,
114 |         }
115 |         self.router = self.go_to_home
116 |         self.scroller = Scroller(self.driver)
117 | 
118 |         if scrape_username is not None:
119 |             self.scraper_details["type"] = "Username"
120 |             self.router = self.go_to_profile
121 |         elif scrape_hashtag is not None:
122 |             self.scraper_details["type"] = "Hashtag"
123 |             self.router = self.go_to_hashtag
124 |         elif scrape_bookmarks is not False:
125 |             self.scraper_details["type"] = "Bookmarks"
126 |             self.router = self.go_to_bookmarks
127 |         elif scrape_query is not None:
128 |             self.scraper_details["type"] = "Query"
129 |             self.router = self.go_to_search
130 |         elif scrape_list is not None:
131 |             self.scraper_details["type"] = "List"
132 |             self.router = self.go_to_list
133 |         else:
134 |             self.scraper_details["type"] = "Home"
135 |             self.router = self.go_to_home
136 |         pass
137 | 
138 |     def _get_driver(
139 |         self,
140 |         proxy=None,
141 |     ):
142 |         print("Setup WebDriver...")
143 |         # header = Headers().generate()["User-Agent"]
144 | 
145 |         # User agent of an Android smartphone device
146 |         header = "Mozilla/5.0 (Linux; Android 11; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.87 Mobile Safari/537.36"
147 | 
148 |         # browser_option = ChromeOptions()
149 |         browser_option = FirefoxOptions()
150 |         browser_option.add_argument("--no-sandbox")
151 |         browser_option.add_argument("--disable-dev-shm-usage")
152 |         browser_option.add_argument("--ignore-certificate-errors")
153 |         browser_option.add_argument("--disable-gpu")
154 |         browser_option.add_argument("--log-level=3")
155 |         browser_option.add_argument("--disable-notifications")
156 |         browser_option.add_argument("--disable-popup-blocking")
157 |         browser_option.add_argument("--user-agent={}".format(header))
158 |         if proxy is not None:
159 |             browser_option.add_argument("--proxy-server=%s" % proxy)
160 | 
161 |         # Hide the browser window only when the headless option is "yes";
162 |         # any other value keeps the browser visible
163 |         if str(self.headlessState).lower() == 'yes':
164 |             # For Hiding Browser
165 |             browser_option.add_argument("--headless")
166 | 
167 |         try:
168 |             # print("Initializing ChromeDriver...")
169 |             # driver = webdriver.Chrome(
170 |             #     options=browser_option,
171 |             # )
172 | 
173 |             print("Initializing FirefoxDriver...")
174 |             driver = webdriver.Firefox(
175 |                 options=browser_option,
176 |             )
177 | 
178 |             print("WebDriver Setup Complete")
179 |             return driver
180 |         except WebDriverException:
181 |             try:
182 |                 # print("Downloading ChromeDriver...")
183 |                 # chromedriver_path = ChromeDriverManager().install()
184 |                 # chrome_service = ChromeService(executable_path=chromedriver_path)
185 | 
186 |                 print("Downloading FirefoxDriver...")
187 |                 firefoxdriver_path = GeckoDriverManager().install()
188 |                 firefox_service = FirefoxService(executable_path=firefoxdriver_path)
189 | 
190 |                 # print("Initializing ChromeDriver...")
191 |                 # driver = webdriver.Chrome(
192 |                 #     service=chrome_service,
193 |                 #     options=browser_option,
194 |                 # )
195 | 
196 |                 print("Initializing FirefoxDriver...")
197 |                 driver = webdriver.Firefox(
198 |                     service=firefox_service,
199 |                     options=browser_option,
200 |                 )
201 | 
202 |                 print("WebDriver Setup Complete")
203 |                 return driver
204 |             except Exception as e:
205 |                 print(f"Error setting up WebDriver: {e}")
206 |                 sys.exit(1)
207 |         pass
208 | 
209 |     def login(self):
210 |         print()
211 |         print("Logging in to Twitter...")
212 | 
213 |         try:
214 |             self.driver.maximize_window()
215 |             self.driver.execute_script("document.body.style.zoom='150%'")  # set zoom to 150%
216 |             self.driver.get(TWITTER_LOGIN_URL)
217 |             sleep(3)
218 | 
219 |             self._input_username()
220 |             self._input_unusual_activity()
221 |             self._input_password()
222 | 
223 |             cookies = self.driver.get_cookies()
224 | 
225 |             auth_token = None
226 | 
227 |             for cookie in cookies:
228 |                 if cookie["name"] == "auth_token":
229 |                     auth_token = cookie["value"]
230 |                     break
231 | 
232 |             if auth_token is None:
233 |                 raise ValueError(
234 |                     """This may be due to the following:
235 | 
236 | - Internet connection is unstable
237 | - Username is incorrect
238 | - Password is incorrect
239 | """
240 |                 )
241 | 
242 |             print()
243 |             print("Login Successful")
244 |             print()
245 |         except Exception as e:
246 |             print()
247 |             print(f"Login Failed: {e}")
248 |             sys.exit(1)
249 | 
250 |         pass
251 | 
252 |     def _input_username(self):
253 |         input_attempt = 0
254 | 
255 |         while True:
256 |             try:
257 |                 username = self.driver.find_element(
258 |                     "xpath", "//input[@autocomplete='username']"
259 |                 )
260 | 
261 |                 username.send_keys(self.username)
262 |                 username.send_keys(Keys.RETURN)
263 |                 sleep(3)
264 |                 break
265 |             except NoSuchElementException:
266 |                 input_attempt += 1
267 |                 if input_attempt >= 3:
268 |                     print()
269 |                     print(
270 |                         """There was an error inputting the username.
271 | 
272 | It may be due to the following:
273 | - Internet connection is unstable
274 | - Username is incorrect
275 | - Twitter is experiencing unusual activity"""
276 |                     )
277 |                     self.driver.quit()
278 |                     sys.exit(1)
279 |                 else:
280 |                     print("Re-attempting to input username...")
281 |                     sleep(2)
282 | 
283 |     def _input_unusual_activity(self):
284 |         input_attempt = 0
285 | 
286 |         while True:
287 |             try:
288 |                 unusual_activity = self.driver.find_element(
289 |                     "xpath", "//input[@data-testid='ocfEnterTextTextInput']"
290 |                 )
291 |                 unusual_activity.send_keys(self.username)
292 |                 unusual_activity.send_keys(Keys.RETURN)
293 |                 sleep(3)
294 |                 break
295 |             except NoSuchElementException:
296 |                 input_attempt += 1
297 |                 if input_attempt >= 3:
298 |                     break
299 | 
300 |     def _input_password(self):
301 |         input_attempt = 0
302 | 
303 |         while True:
304 |             try:
305 |                 password = self.driver.find_element(
306 |                     "xpath", "//input[@autocomplete='current-password']"
307 |                 )
308 | 
309 |                 password.send_keys(self.password)
310 |                 password.send_keys(Keys.RETURN)
311 |                 sleep(3)
312 |                 break
313 |             except NoSuchElementException:
314 |                 input_attempt += 1
315 |                 if input_attempt >= 3:
316 |                     print()
317 |                     print(
318 |                         """There was an error inputting the password.
319 | 
320 | It may be due to the following:
321 | - Internet connection is unstable
322 | - Password is incorrect
323 | - Twitter is experiencing unusual activity"""
324 |                     )
325 |                     self.driver.quit()
326 |                     sys.exit(1)
327 |                 else:
328 |                     print("Re-attempting to input password...")
329 |                     sleep(2)
330 | 
331 |     def go_to_home(self):
332 |         self.driver.get("https://twitter.com/home")
333 |         sleep(3)
334 |         pass
335 | 
336 |     def go_to_profile(self):
337 |         if (
338 |             self.scraper_details["username"] is None
339 |             or self.scraper_details["username"] == ""
340 |         ):
341 |             print("Username is not set.")
342 |             sys.exit(1)
343 |         else:
344 |             self.driver.get(f"https://twitter.com/{self.scraper_details['username']}")
345 |             sleep(3)
346 |         pass
347 | 
348 |     def go_to_hashtag(self):
349 |         if (
350 |             self.scraper_details["hashtag"] is None
351 |             or self.scraper_details["hashtag"] == ""
352 |         ):
353 |             print("Hashtag is not set.")
354 |             sys.exit(1)
355 |         else:
356 |             url = f"https://twitter.com/hashtag/{self.scraper_details['hashtag']}?src=hashtag_click"
357 |             if self.scraper_details["tab"] == "Latest":
358 |                 url += "&f=live"
359 | 
360 |             self.driver.get(url)
361 |             sleep(3)
362 |         pass
363 | 
364 |     def go_to_bookmarks(self):
365 |         if (
366 |             self.scraper_details["bookmarks"] is False
367 |             or self.scraper_details["bookmarks"] == ""
368 |         ):
369 |             print("Bookmarks is not set.")
370 |             sys.exit(1)
371 |         else:
372 |             url = "https://twitter.com/i/bookmarks"
373 | 
374 |             self.driver.get(url)
375 |             sleep(3)
376 |         pass
377 | 
378 |     def go_to_search(self):
379 |         if self.scraper_details["query"] is None or self.scraper_details["query"] == "":
380 |             print("Query is not set.")
381 |             sys.exit(1)
382 |         else:
383 |             url = f"https://twitter.com/search?q={self.scraper_details['query']}&src=typed_query"
384 |             if self.scraper_details["tab"] == "Latest":
385 |                 url += "&f=live"
386 | 
387 |             self.driver.get(url)
388 |             sleep(3)
389 |         pass
390 | 
391 |     def go_to_list(self):
392 |         if self.scraper_details["list"] is None or self.scraper_details["list"] == "":
393 |             print("List is not set.")
394 |             sys.exit(1)
395 |         else:
396 |             url = f"https://x.com/i/lists/{self.scraper_details['list']}"
397 |             self.driver.get(url)
398 |             sleep(3)
399 |         pass
400 | 
401 |     def get_tweet_cards(self):
402 |         self.tweet_cards = self.driver.find_elements(
403 |             "xpath", '//article[@data-testid="tweet" and not(@disabled)]'
404 |         )
405 |         pass
406 | 
407 |     def remove_hidden_cards(self):
408 |         try:
409 |             hidden_cards = self.driver.find_elements(
410 |                 "xpath", '//article[@data-testid="tweet" and @disabled]'
411 |             )
412 | 
413 |             for card in hidden_cards[1:-2]:
414 |                 self.driver.execute_script(
415 |                     "arguments[0].parentNode.parentNode.parentNode.remove();", card
416 |                 )
417 |         except Exception:
418 |             return
419 |         pass
420 | 
421 |     def scrape_tweets(
422 |         self,
423 |         max_tweets=50,
424 |         no_tweets_limit=False,
425 |         scrape_username=None,
426 |         scrape_hashtag=None,
427 |         scrape_bookmarks=False,
428 |         scrape_query=None,
429 |         scrape_list=None,
430 |         scrape_latest=True,
431 |         scrape_top=False,
432 |         scrape_poster_details=False,
433 |         router=None,
434 |     ):
435 |         self._config_scraper(
436 |             max_tweets,
437 |             scrape_username,
438 |             scrape_hashtag,
439 |             scrape_bookmarks,
440 |             scrape_query,
441 |             scrape_list,
442 |             scrape_latest,
443 |             scrape_top,
444 |             scrape_poster_details,
445 |         )
446 | 
447 |         if router is None:
448 |             router = self.router
449 | 
450 |         router()
451 | 
452 |         if self.scraper_details["type"] == "Username":
453 |             print(
454 |                 "Scraping Tweets from @{}...".format(self.scraper_details["username"])
455 |             )
456 |         elif self.scraper_details["type"] == "Hashtag":
457 |             print(
458 |                 "Scraping {} Tweets from #{}...".format(
459 |                     self.scraper_details["tab"], self.scraper_details["hashtag"]
460 |                 )
461 |             )
462 |         elif self.scraper_details["type"] == "Bookmarks":
463 |             print(
464 |                 "Scraping Tweets from bookmarks...")
465 |         elif self.scraper_details["type"] == "Query":
466 |             print(
467 |                 "Scraping {} Tweets from {} search...".format(
468 |                     self.scraper_details["tab"], self.scraper_details["query"]
469 |                 )
470 |             )
471 |         elif self.scraper_details["type"] == "Home":
472 |             print("Scraping Tweets from Home...")
473 | 
474 |         # Accept cookies to make the banner disappear
475 |         try:
476 |             accept_cookies_btn = self.driver.find_element(
477 |                 "xpath", "//span[text()='Refuse non-essential cookies']/../../..")
478 |             accept_cookies_btn.click()
479 |         except NoSuchElementException:
480 |             pass
481 | 
482 |         self.progress.print_progress(0, False, 0, no_tweets_limit)
483 | 
484 |         refresh_count = 0
485 |         added_tweets = 0
486 |         empty_count = 0
487 |         retry_cnt = 0
488 | 
489 |         while self.scroller.scrolling:
490 |             try:
491 |                 self.get_tweet_cards()
492 |                 added_tweets = 0
493 | 
494 |                 for card in self.tweet_cards[-15:]:
495 |                     try:
496 |                         tweet_id = str(card)  # element identity serves as a stand-in ID for de-duplication
497 | 
498 |                         if tweet_id not in self.tweet_ids:
499 |                             self.tweet_ids.add(tweet_id)
500 | 
501 |                             if not self.scraper_details["poster_details"]:
502 |                                 self.driver.execute_script(
503 |                                     "arguments[0].scrollIntoView();", card
504 |                                 )
505 | 
506 |                             tweet = Tweet(
507 |                                 card=card,
508 |                                 driver=self.driver,
509 |                                 actions=self.actions,
510 |                                 scrape_poster_details=self.scraper_details[
511 |                                     "poster_details"
512 |                                 ],
513 |                             )
514 | 
515 |                             if tweet:
516 |                                 if not tweet.error and tweet.tweet is not None:
517 |                                     if not tweet.is_ad:
518 |                                         self.data.append(tweet.tweet)
519 |                                         added_tweets += 1
520 |                                         self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
521 | 
522 |                                         if len(self.data) >= self.max_tweets and not no_tweets_limit:
523 |                                             self.scroller.scrolling = False
524 |                                             break
525 |                                     else:
526 |                                         continue
527 |                                 else:
528 |                                     continue
529 |                             else:
530 |                                 continue
531 |                         else:
532 |                             continue
533 |                     except NoSuchElementException:
534 |                         continue
535 | 
536 |                 if len(self.data) >= self.max_tweets and not no_tweets_limit:
537 |                     break
538 | 
539 |                 if added_tweets == 0:
540 |                     # Check if there is a "Retry" button and click it at regular intervals, up to a limited number of tries
541 |                     try:
542 |                         while retry_cnt < 15:
543 |                             retry_button = self.driver.find_element(
544 |                                 "xpath", "//span[text()='Retry']/../../..")
545 |                             self.progress.print_progress(len(self.data), True, retry_cnt, no_tweets_limit)
546 |                             sleep(600)
547 |                             retry_button.click()
548 |                             retry_cnt += 1
549 |                             sleep(2)
550 |                     # There is no Retry button, so the counter is reset
551 |                     except NoSuchElementException:
552 |                         retry_cnt = 0
553 |                         self.progress.print_progress(len(self.data), False, 0, no_tweets_limit)
554 | 
555 |                     if empty_count >= 5:
556 |                         if refresh_count >= 3:
557 |                             print()
558 |                             print("No more tweets to scrape")
559 |                             break
560 |                         refresh_count += 1
561 |                     empty_count += 1
562 |                     sleep(1)
563 |                 else:
564 |                     empty_count = 0
565 |                     refresh_count = 0
566 |             except StaleElementReferenceException:
567 |                 sleep(2)
568 |                 continue
569 |             except KeyboardInterrupt:
570 |                 print("\n")
571 |                 print("Keyboard Interrupt")
572 |                 self.interrupted = True
573 |                 break
574 |             except Exception as e:
575 |                 print("\n")
576 |                 print(f"Error scraping tweets: {e}")
577 |                 break
578 | 
579 |         print("")
580 | 
581 |         if len(self.data) >= self.max_tweets or no_tweets_limit:
582 |             print("Scraping Complete")
583 |         else:
584 |             print("Scraping Incomplete")
585 | 
586 |         if not no_tweets_limit:
587 |             print("Tweets: {} out of {}\n".format(len(self.data), self.max_tweets))
588 | 
589 |         pass
590 | 
591 |     def save_to_csv(self):
592 |         print("Saving Tweets to CSV...")
593 |         now = datetime.now()
594 |         folder_path = "./tweets/"
595 | 
596 |         if not os.path.exists(folder_path):
597 |             os.makedirs(folder_path)
598 |             print("Created Folder: {}".format(folder_path))
599 | 
600 |         data = {
601 |             "Name": [tweet[0] for tweet in self.data],
602 |             "Handle": [tweet[1] for tweet in self.data],
603 |             "Timestamp": [tweet[2] for tweet in self.data],
604 |             "Verified": [tweet[3] for tweet in self.data],
605 |             "Content": [tweet[4] for tweet in self.data],
606 |             "Comments": [tweet[5] for tweet in self.data],
607 |             "Retweets": [tweet[6] for tweet in self.data],
608 |             "Likes": [tweet[7] for tweet in self.data],
609 |             "Analytics": [tweet[8] for tweet in self.data],
610 |             "Tags": [tweet[9] for tweet in self.data],
611 |             "Mentions": [tweet[10] for tweet in self.data],
612 |             "Emojis": [tweet[11] for tweet in self.data],
613 |             "Profile Image": [tweet[12] for tweet in self.data],
614 |             "Tweet Link": [tweet[13] for tweet in self.data],
615 |             "Tweet ID": [f"tweet_id:{tweet[14]}" for tweet in self.data],
616 |         }
617 | 
618 |         if self.scraper_details["poster_details"]:
619 |             data["Tweeter ID"] = [f"user_id:{tweet[15]}" for tweet in self.data]
620 |             data["Following"] = [tweet[16] for tweet in self.data]
621 |             data["Followers"] = [tweet[17] for tweet in self.data]
622 | 
623 |         df = pd.DataFrame(data)
624 | 
625 |         current_time = now.strftime("%Y-%m-%d_%H-%M-%S")
626 |         file_path = f"{folder_path}{current_time}_tweets_1-{len(self.data)}.csv"
627 |         pd.set_option("display.max_colwidth", None)
628 |         df.to_csv(file_path, index=False, encoding="utf-8")
629 | 
630 |         print("CSV Saved: {}".format(file_path))
631 | 
632 |         pass
633 | 
634 |     def get_tweets(self):
635 |         return self.data
636 | 
--------------------------------------------------------------------------------
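For orientation, a minimal end-to-end usage sketch of the package (not a file from this repository): it mirrors the flow of scraper/__main__.py and assumes it is run from inside the scraper/ directory so the flat imports resolve, with credentials supplied through the .env file. The tweet count and query below are illustrative placeholders.

# Illustrative sketch, not part of the repository.
import os

from dotenv import load_dotenv
from twitter_scraper import Twitter_Scraper

load_dotenv()  # reads TWITTER_USERNAME, TWITTER_PASSWORD, HEADLESS from .env

scraper = Twitter_Scraper(
    mail=os.getenv("TWITTER_MAIL"),
    username=os.getenv("TWITTER_USERNAME"),
    password=os.getenv("TWITTER_PASSWORD"),
    headlessState=os.getenv("HEADLESS", "yes"),
)
scraper.login()
scraper.scrape_tweets(
    max_tweets=10,  # illustrative value
    scrape_query='("NVDA" OR "nvidia") lang:en',  # illustrative query, same syntax as sample-command.txt
    scrape_top=True,
)
scraper.save_to_csv()
if not scraper.interrupted:
    scraper.driver.close()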
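The CSV written by save_to_csv() can then be loaded back for analysis with pandas. A short sketch, where the file name is a made-up placeholder following the {timestamp}_tweets_1-{count}.csv pattern produced above, and the selected columns come from the header layout in save_to_csv():

# Illustrative sketch; the file name below is a placeholder, not a real output file.
import pandas as pd

df = pd.read_csv("./tweets/2024-01-19_12-00-00_tweets_1-50.csv")
print(df[["Handle", "Timestamp", "Likes", "Content"]].head())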