├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE.md
└── ISSUE_TEMPLATE
│ └── ISSUE_TEMPLATE.md
├── .gitignore
├── .idea
├── .gitignore
├── inspectionProfiles
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
├── twint.iml
└── vcs.xml
├── .travis.yml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── automate.py
├── csrf.py
├── elasticsearch
├── README.md
├── dashboard.json
├── index-follow.json
├── index-tweets.json
├── index-user.json
└── visualizations.json
├── requirements.txt
├── setup.py
├── test.py
└── twint
├── __init__.py
├── __version__.py
├── cli.py
├── config.py
├── datelock.py
├── feed.py
├── format.py
├── get.py
├── output.py
├── run.py
├── storage
├── __init__.py
├── db.py
├── elasticsearch.py
├── panda.py
├── write.py
└── write_meta.py
├── token.py
├── tweet.py
├── url.py
├── user.py
└── verbose.py
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | patreon: twintproject
3 | custom: paypal.me/noneprivacy
4 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | # Issue Template
2 | Please use this template!
3 |
4 | ## Initial Check
5 | > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
6 |
7 | >Make sure you've checked the following:
8 |
 9 | - [ ] Python version is 3.6 or later;
10 | - [ ] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/minamotorin/twint.git@origin/master#egg=twint`;
11 | - [ ] I have searched the issues and there are no duplicates of this issue/question/request (please link to related issues of twintproject/twint for reference).
12 |
13 | ## Command Ran
14 | >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
15 |
16 | ## Description of Issue
17 | >Please use **as much detail as possible.**
18 |
19 | ## Environment Details
20 | >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ### Initial Check
2 | > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
3 |
4 | >Make sure you've checked the following:
5 |
6 | - [ ] Python version is 3.6;
7 | - [ ] Using the latest version of Twint;
8 | - [ ] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`;
9 |
10 | ### Command Ran
11 | >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
12 |
13 | ### Description of Issue
14 | >Please use **as much detail as possible.**
15 |
16 | ### Environment Details
17 | >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | tweets.db
6 | # C extensions
7 | *.so
8 |
9 | config.ini
10 | twint/storage/mysql.py
11 |
12 | # Node Dependency directories
13 | node_modules/
14 | jspm_packages/
15 | tests/
16 | # Distribution / packaging
17 | .Python
18 | env/
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 |
35 | venv
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | .hypothesis/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | chromedriver
88 |
89 | # SageMath parsed files
90 | *.sage.py
91 |
92 | # dotenv
93 | .env
94 |
95 | # virtualenv
96 | .venv
97 | venv/
98 | ENV/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 |
113 | # output
114 | *.csv
115 | *.json
116 | *.txt
117 |
118 | test_twint.py
119 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/twint.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: bionic
2 | language: python
3 | python:
4 | - "3.6"
5 | - "3.7"
6 | - "3.8"
7 | - "nightly"
8 | matrix:
9 | allow_failures:
10 | - python: "nightly"
11 | - python: "3.8"
12 | install:
13 | - pip install -r requirements.txt
14 | script:
15 | - python test.py
16 | deploy:
17 | provider: pypi
18 | user: "codyzacharias"
19 | password:
20 | secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM=
21 | on:
22 | tags: true
23 | python: "3.7"
24 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6-buster
2 | LABEL maintainer="codyzacharias@pm.me"
3 |
4 | WORKDIR /root
5 |
6 | RUN git clone --depth=1 https://github.com/twintproject/twint.git && \
7 | cd /root/twint && \
8 | pip3 install . -r requirements.txt
9 |
10 | CMD /bin/bash
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Cody Zacharias
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 20220207.0
2 |
3 | # About this fork
4 |
5 | [This repository](https://github.com/minamotorin/twint) is the fork of [https://github.com/twintproject/twint](https://github.com/twintproject/twint) and for myself.
6 |
7 | Modified by [minamotorin](https://github.com/minamotorin).
8 |
9 | ## Updates from twintproject/twint
10 |
11 | ### twint.token.RefreshTokenException: Could not find the Guest token in HTML
12 |
13 | This problem doesn't happen recently.
14 |
15 | #### Related
16 |
17 | - [twintproject/twint#1320](https://github.com/twintproject/twint/issues/1320)
18 | - [twintproject/twint#1322](https://github.com/twintproject/twint/pull/1322)
19 | - [twintproject/twint#1328](https://github.com/twintproject/twint/pull/1328)
20 | - [twintproject/twint#1061](https://github.com/twintproject/twint/issues/1061)
21 | - [twintproject/twint#1114](https://github.com/twintproject/twint/issues/1114)
22 |
23 | ### json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
24 |
25 | The fix is **not complete**.
26 | `twint.run.Profile` will work but `twint.run.db` will not.
27 | This means [`test.py`](./test.py) causes an error.
28 |
29 | I think this is because the fields of the result table are not exactly the same as the traditional ones.
30 |
31 | #### Related
32 |
33 | - [twintproject/twint#1335](https://github.com/twintproject/twint/issues/1335)
34 |
35 | ### [-] TWINT requires Python version 3.6+.
36 |
37 | #### Related
38 |
39 | - [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1344)
40 | - [twintproject/twint#1345](https://github.com/twintproject/twint/pull/1345)
41 | - [twintproject/twint#1346](https://github.com/twintproject/twint/issues/1346)
42 | - [twintproject/twint#1309](https://github.com/twintproject/twint/pull/1309)
43 | - [twintproject/twint#1313](https://github.com/twintproject/twint/issues/1313)
44 |
45 | ## References
46 |
47 | - [snscrape](https://github.com/JustAnotherArchivist/snscrape)
48 | - [gallery-dl](https://github.com/mikf/gallery-dl)
49 |
50 | ## License
51 |
52 | This repository is also under the [MIT License](https://opensource.org/licenses/mit-license.php).
53 |
54 | ---
55 |
56 | # TWINT - Twitter Intelligence Tool
57 | 
58 | 
59 |
60 | [](https://pypi.org/project/twint/) [](https://travis-ci.org/twintproject/twint) [](https://www.python.org/download/releases/3.0/) [](https://github.com/haccer/tweep/blob/master/LICENSE) [](https://pepy.tech/project/twint) [](https://pepy.tech/project/twint/week) [](https://www.patreon.com/twintproject) 
61 |
62 | >No authentication. No API. No limits.
63 |
64 | Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API.
65 |
66 | Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail and phone numbers. I find this very useful, and you can get really creative with it too.
67 |
68 | Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation.
69 |
70 | ## tl;dr Benefits
71 | Some of the benefits of using Twint vs Twitter API:
72 | - Can fetch almost __all__ Tweets (Twitter API limits to last 3200 Tweets only);
73 | - Fast initial setup;
74 | - Can be used anonymously and without Twitter sign up;
75 | - **No rate limitations**.
76 |
77 | ## Limits imposed by Twitter
78 | Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets.
79 |
80 | ## Requirements
81 | - Python 3.6;
82 | - aiohttp;
83 | - aiodns;
84 | - beautifulsoup4;
85 | - cchardet;
86 | - dataclasses
87 | - elasticsearch;
88 | - pysocks;
89 | - pandas (>=0.23.0);
90 | - aiohttp_socks;
91 | - schedule;
92 | - geopy;
93 | - fake-useragent;
94 | - py-googletransx.
95 |
96 | ## Installing
97 |
98 | **Git:**
99 | ```bash
100 | git clone --depth=1 https://github.com/twintproject/twint.git
101 | cd twint
102 | pip3 install . -r requirements.txt
103 | ```
104 |
105 | **Pip:**
106 | ```bash
107 | pip3 install twint
108 | ```
109 |
110 | or
111 |
112 | ```bash
113 | pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
114 | ```
115 |
116 | **Pipenv**:
117 | ```bash
118 | pipenv install git+https://github.com/twintproject/twint.git#egg=twint
119 | ```
120 |
121 | ### March 2, 2021 Update
122 |
123 | **Added**: Dockerfile
124 |
125 | Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them.
126 |
127 | ## CLI Basic Examples and Combos
128 | A few simple examples to help you understand the basics:
129 |
130 | - `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**).
131 | - `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
132 | - `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
133 | - `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
134 | - `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
135 | - `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
136 | - `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
137 | - `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
138 | - `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
139 | - `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump.
140 | - `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
141 | - `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch
142 | - `twint -u username -o file.json --json` - Scrape Tweets and save as a json file.
143 | - `twint -u username --database tweets.db` - Save Tweets to a SQLite database.
144 | - `twint -u username --followers` - Scrape a Twitter user's followers.
145 | - `twint -u username --following` - Scrape who a Twitter user follows.
146 | - `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweets).
147 | - `twint -u username --following --user-full` - Collect full user information a person follows
148 | - `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, including **retweets** & **replies**).
149 | - `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile.
150 | - `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id.
151 |
152 | More detail about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands)
153 |
154 | ## Module Example
155 |
156 | Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)**
157 |
158 | ```python
159 | import twint
160 |
161 | # Configure
162 | c = twint.Config()
163 | c.Username = "realDonaldTrump"
164 | c.Search = "great"
165 |
166 | # Run
167 | twint.run.Search(c)
168 | ```
169 | > Output
170 |
171 | `955511208597184512 2018-01-22 18:43:19 GMT pineapples are the best fruit`
172 |
173 | ```python
174 | import twint
175 |
176 | c = twint.Config()
177 |
178 | c.Username = "noneprivacy"
179 | c.Custom["tweet"] = ["id"]
180 | c.Custom["user"] = ["bio"]
181 | c.Limit = 10
182 | c.Store_csv = True
183 | c.Output = "none"
184 |
185 | twint.run.Search(c)
186 | ```
187 |
188 | ## Storing Options
189 | - Write to file;
190 | - CSV;
191 | - JSON;
192 | - SQLite;
193 | - Elasticsearch.
194 |
195 | ## Elasticsearch Setup
196 |
197 | Details on setting up Elasticsearch with Twint is located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch).
198 |
199 | ## Graph Visualization
200 | 
201 |
202 | [Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph).
203 |
204 | We are developing a Twint Desktop App.
205 |
206 | 
207 |
208 | ## FAQ
209 | > I tried scraping tweets from a user, I know that they exist but I'm not getting them
210 |
211 | Twitter can shadow-ban accounts, which means that their tweets will not be available via search. To solve this, pass `--profile-full` if you are using Twint via CLI or, if are using Twint as module, add `config.Profile_full = True`. Please note that this process will be quite slow.
212 | ## More Examples
213 |
214 | #### Followers/Following
215 |
216 | > To get only follower usernames/following usernames
217 |
218 | `twint -u username --followers`
219 |
220 | `twint -u username --following`
221 |
222 | > To get user info of followers/following users
223 |
224 | `twint -u username --followers --user-full`
225 |
226 | `twint -u username --following --user-full`
227 |
228 | #### userlist
229 |
230 | > To get only user info of user
231 |
232 | `twint -u username --user-full`
233 |
234 | > To get user info of users from a userlist
235 |
236 | `twint --userlist inputlist --user-full`
237 |
238 |
239 | #### tweet translation (experimental)
240 |
241 | > To get 100 english tweets and translate them to italian
242 |
243 | `twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100`
244 |
245 | or
246 |
247 | ```python
248 | import twint
249 |
250 | c = twint.Config()
251 | c.Username = "noneprivacy"
252 | c.Limit = 100
253 | c.Store_csv = True
254 | c.Output = "none.csv"
255 | c.Lang = "en"
256 | c.Translate = True
257 | c.TranslateDest = "it"
258 | twint.run.Search(c)
259 | ```
260 |
261 | Notes:
262 | - [Google translate has some quotas](https://cloud.google.com/translate/quotas)
263 |
264 | ## Featured Blog Posts:
265 | - [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/)
266 | - [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/)
267 | - [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f)
268 | - [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/)
269 |
270 | ## Contact
271 |
272 | If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team)
273 |
--------------------------------------------------------------------------------
/automate.py:
--------------------------------------------------------------------------------
1 | import twint
2 | import schedule
3 | import time
4 |
5 | # you can change the name of each "job" after "def" if you'd like.
6 | def jobone():
7 | print ("Fetching Tweets")
8 | c = twint.Config()
9 | # choose username (optional)
10 | c.Username = "insert username here"
11 | # choose search term (optional)
12 | c.Search = "insert search term here"
13 | # choose beginning time (narrow results)
14 | c.Since = "2018-01-01"
15 | # set limit on total tweets
16 | c.Limit = 1000
17 | # no idea, but makes the csv format properly
18 | c.Store_csv = True
19 | # format of the csv
20 | c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
21 | # change the name of the csv file
22 | c.Output = "filename.csv"
23 | twint.run.Search(c)
24 |
25 | def jobtwo():
26 | print ("Fetching Tweets")
27 | c = twint.Config()
28 | # choose username (optional)
29 | c.Username = "insert username here"
30 | # choose search term (optional)
31 | c.Search = "insert search term here"
32 | # choose beginning time (narrow results)
33 | c.Since = "2018-01-01"
34 | # set limit on total tweets
35 | c.Limit = 1000
36 | # no idea, but makes the csv format properly
37 | c.Store_csv = True
38 | # format of the csv
39 | c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
40 | # change the name of the csv file
41 | c.Output = "filename2.csv"
42 | twint.run.Search(c)
43 |
44 | # run once when you start the program
45 |
46 | jobone()
47 | jobtwo()
48 |
49 | # run every minute(s), hour, day at, day of the week, day of the week and time. Use "#" to block out which ones you don't want to use. Remove it to active. Also, replace "jobone" and "jobtwo" with your new function names (if applicable)
50 |
51 | # schedule.every(1).minutes.do(jobone)
52 | schedule.every().hour.do(jobone)
53 | # schedule.every().day.at("10:30").do(jobone)
54 | # schedule.every().monday.do(jobone)
55 | # schedule.every().wednesday.at("13:15").do(jobone)
56 |
57 | # schedule.every(1).minutes.do(jobtwo)
58 | schedule.every().hour.do(jobtwo)
59 | # schedule.every().day.at("10:30").do(jobtwo)
60 | # schedule.every().monday.do(jobtwo)
61 | # schedule.every().wednesday.at("13:15").do(jobtwo)
62 |
63 | while True:
64 | schedule.run_pending()
65 | time.sleep(1)
66 |
--------------------------------------------------------------------------------
/csrf.py:
--------------------------------------------------------------------------------
1 | import twint.run
2 | from twint import Config
3 |
4 | config = Config()
5 | config.User_id = "narendramodi"
6 | config.Search = True
7 |
8 | twint.run.Search(config)
9 |
10 | # https://twitter.com/i/api/graphql/Bhlf1dYJ3bYCKmLfeEQ31A/UserByScreenName?variables=%7B%22screen_name%22%3A%20%22narendramodi%22%2C%20%22withSafetyModeUserFields%22%3A%20false%2C%20%22withSuperFollowsUserFields%22%3A%20false%7D
11 | # https://twitter.com/i/api/graphql/Bhlf1dYJ3bYCKmLfeEQ31A/UserByScreenName?variables=%7B%22screen_name%22%3A%20%22narendramodi%22%2C%20%22withSafetyModeUserFields%22%3A%20false%2C%20%22withSuperFollowsUserFields%22%3A%20false%7D
--------------------------------------------------------------------------------
/elasticsearch/README.md:
--------------------------------------------------------------------------------
1 | # Elasticsearch How-To
2 |
3 | 
4 |
5 | Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch)
6 |
--------------------------------------------------------------------------------
/elasticsearch/dashboard.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "_id": "e6d65380-bfe2-11e8-961a-d371b24d5d1d",
4 | "_type": "dashboard",
5 | "_source": {
6 | "title": "Twint Dashboard",
7 | "hits": 0,
8 | "description": "",
9 | "panelsJSON": "[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":40,\"h\":17,\"i\":\"1\"},\"embeddableConfig\":{},\"id\":\"d47421c0-bfd5-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":40,\"y\":6,\"w\":8,\"h\":11,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"e2b89640-bfd4-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":32,\"w\":20,\"h\":17,\"i\":\"3\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"8a8bb420-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":17,\"w\":33,\"h\":15,\"i\":\"4\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"a8d3ee70-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":40,\"y\":0,\"w\":8,\"h\":6,\"i\":\"6\"},\"embeddableConfig\":{},\"id\":\"37cd72e0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":33,\"y\":17,\"w\":15,\"h\":15,\"i\":\"7\"},\"embeddableConfig\":{},\"id\":\"149ecbc0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":20,\"y\":32,\"w\":28,\"h\":17,\"i\":\"8\"},\"version\":\"6.3.2\",\"type\":\"visualization\",\"id\":\"b45ec590-c267-11e8-bcd4-3956fe930db7\",\"embeddableConfig\":{}}]",
10 | "optionsJSON": "{\"darkTheme\":true,\"hidePanelTitles\":true,\"useMargins\":true}",
11 | "version": 1,
12 | "timeRestore": false,
13 | "kibanaSavedObjectMeta": {
14 | "searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}"
15 | }
16 | }
17 | }
18 | ]
--------------------------------------------------------------------------------
/elasticsearch/index-follow.json:
--------------------------------------------------------------------------------
1 | PUT twintgraph
2 | {
3 | "mappings": {
4 | "items": {
5 | "properties": {
6 | "user": {"type": "keyword"},
7 | "follow": {"type": "keyword"},
8 | "essid": {"type": "keyword"}
9 | }
10 | }
11 | },
12 | "settings": {
13 | "number_of_shards": 1
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/elasticsearch/index-tweets.json:
--------------------------------------------------------------------------------
1 | PUT twinttweets
2 | {
3 | "mappings": {
4 | "items": {
5 | "properties": {
6 | "id": {"type": "long"},
7 | "conversation_id": {"type": "long"},
8 | "created_at": {"type": "long"},
9 | "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
10 | "timezone": {"type": "keyword"},
11 | "place": {"type": "keyword"},
12 | "location": {"type": "keyword"},
13 | "tweet": {"type": "text"},
14 | "hashtags": {"type": "keyword"},
15 | "cashtags": {"type": "keyword"},
16 | "user_id": {"type": "long"},
17 | "user_id_str": {"type": "keyword"},
18 | "username": {"type": "keyword"},
19 | "name": {"type": "text"},
20 | "profile_image_url": {"type": "text"},
21 | "day": {"type": "integer"},
22 | "hour": {"type": "integer"},
23 | "link": {"type": "text"},
24 | "retweet": {"type": "text"},
25 | "essid": {"type": "keyword"},
26 | "nlikes": {"type": "integer"},
27 | "nreplies": {"type": "integer"},
28 | "nretweets": {"type": "integer"},
29 | "quote_url": {"type": "text"},
30 | "video": {"type": "integer"},
31 | "thumbnail": {"type": "text"},
32 | "search": {"type": "text"},
33 | "near": {"type": "text"},
34 | "geo_near": {"type": "geo_point"},
35 | "geo_tweet": {"type": "geo_point"},
36 | "photos": {"type": "text"},
37 | "mentions": {"type": "text"},
38 | "translation": {"type": "text"},
39 | "trans_src": {"type": "keyword"},
40 | "trans_dev": {"type": "keyword"}
41 | }
42 | }
43 | }
44 | ,
45 | "settings": {
46 | "number_of_shards": 1
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/elasticsearch/index-user.json:
--------------------------------------------------------------------------------
1 | PUT twintuser
2 | {
3 | "mappings": {
4 | "items": {
5 | "properties": {
6 | "id": {"type": "keyword"},
7 | "name": {"type": "keyword"},
8 | "username": {"type": "keyword"},
9 | "bio": {"type": "text"},
10 | "location": {"type": "keyword"},
11 | "url": {"type": "text"},
12 | "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
13 | "join_date": {"type": "date", "format": "yyyy-MM-dd"},
14 | "join_time": {"type": "date", "format": "HH:mm:ss"},
15 | "tweets": {"type": "integer"},
16 | "following": {"type": "integer"},
17 | "followers": {"type": "integer"},
18 | "likes": {"type": "integer"},
19 | "media": {"type": "integer"},
20 | "private": {"type": "integer"},
21 | "verified": {"type": "integer"},
22 | "avatar": {"type": "text"},
23 | "background_image": {"type": "text"},
24 | "session": {"type": "keyword"},
25 | "geo_user": {"type": "geo_point"}
26 | }
27 | }
28 | }
29 | ,
30 | "settings": {
31 | "number_of_shards": 1
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/elasticsearch/visualizations.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "_id": "d47421c0-bfd5-11e8-8858-bbc566841533",
4 | "_type": "visualization",
5 | "_source": {
6 | "title": "Activity [twinttweets]",
7 | "visState": "{\"title\":\"Activity [twinttweets]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"area\",\"mode\":\"stacked\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true,\"interpolate\":\"cardinal\"}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":true},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"date\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{},\"customLabel\":\"Days\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"customLabel\":\"User ids\"}}]}",
8 | "uiStateJSON": "{}",
9 | "description": "",
10 | "version": 1,
11 | "kibanaSavedObjectMeta": {
12 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
13 | }
14 | }
15 | },
16 | {
17 | "_id": "e2b89640-bfd4-11e8-8858-bbc566841533",
18 | "_type": "visualization",
19 | "_source": {
20 | "title": "Activity - pie [twinttweets]",
21 | "visState": "{\"aggs\":[{\"enabled\":true,\"id\":\"1\",\"params\":{},\"schema\":\"metric\",\"type\":\"count\"},{\"enabled\":true,\"id\":\"2\",\"params\":{\"field\":\"user_id\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"size\":5},\"schema\":\"segment\",\"type\":\"terms\"}],\"params\":{\"addLegend\":true,\"addTooltip\":true,\"isDonut\":true,\"labels\":{\"last_level\":true,\"show\":false,\"truncate\":100,\"values\":true},\"legendPosition\":\"right\",\"type\":\"pie\"},\"title\":\"Activity - pie [twinttweets]\",\"type\":\"pie\"}",
22 | "uiStateJSON": "{}",
23 | "description": "",
24 | "version": 1,
25 | "kibanaSavedObjectMeta": {
26 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
27 | }
28 | }
29 | },
30 | {
31 | "_id": "37cd72e0-bfe4-11e8-961a-d371b24d5d1d",
32 | "_type": "visualization",
33 | "_source": {
34 | "title": "Tweets Count [twinttweet]",
35 | "visState": "{\"title\":\"Tweets Count [twinttweet]\",\"type\":\"metric\",\"params\":{\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\",\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"None\",\"colorsRange\":[{\"from\":0,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":false,\"subText\":\"\",\"fontSize\":33}}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}}]}",
36 | "uiStateJSON": "{}",
37 | "description": "",
38 | "version": 1,
39 | "kibanaSavedObjectMeta": {
40 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
41 | }
42 | }
43 | },
44 | {
45 | "_id": "149ecbc0-bfe4-11e8-961a-d371b24d5d1d",
46 | "_type": "visualization",
47 | "_source": {
48 | "title": "Word Cloud [twinttweets]",
49 | "visState": "{\"title\":\"Word Cloud [twinttweets]\",\"type\":\"tagcloud\",\"params\":{\"scale\":\"linear\",\"orientation\":\"single\",\"minFontSize\":10,\"maxFontSize\":50,\"showLabel\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"username\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}}]}",
50 | "uiStateJSON": "{}",
51 | "description": "",
52 | "version": 1,
53 | "kibanaSavedObjectMeta": {
54 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
55 | }
56 | }
57 | },
58 | {
59 | "_id": "a8d3ee70-bfd9-11e8-8858-bbc566841533",
60 | "_type": "visualization",
61 | "_source": {
62 | "title": "Day-activity [twinttweet]",
63 | "visState": "{\"title\":\"Day-activity [twinttweet]\",\"type\":\"histogram\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"position\":\"bottom\",\"scale\":{\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{},\"type\":\"category\"}],\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-3\"},\"legendPosition\":\"right\",\"orderBucketsBySum\":false,\"seriesParams\":[{\"data\":{\"id\":\"1\",\"label\":\"Tweets\"},\"drawLinesBetweenPoints\":true,\"mode\":\"normal\",\"show\":\"true\",\"showCircles\":true,\"type\":\"histogram\",\"valueAxis\":\"ValueAxis-3\"}],\"times\":[],\"type\":\"histogram\",\"valueAxes\":[{\"id\":\"ValueAxis-3\",\"labels\":{\"filter\":false,\"rotate\":0,\"show\":true,\"truncate\":100},\"name\":\"LeftAxis-1\",\"position\":\"left\",\"scale\":{\"mode\":\"normal\",\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{\"text\":\"Tweets\"},\"type\":\"value\"}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{\"min\":0,\"max\":23}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"asc\",\"orderBy\":\"_term\",\"customLabel\":\"\"}}]}",
64 | "uiStateJSON": "{\"vis\":{\"legendOpen\":true}}",
65 | "description": "",
66 | "version": 1,
67 | "kibanaSavedObjectMeta": {
68 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
69 | }
70 | }
71 | },
72 | {
73 | "_id": "8a8bb420-bfd9-11e8-8858-bbc566841533",
74 | "_type": "visualization",
75 | "_source": {
76 | "title": "Week-activity [twinttweet]",
77 | "visState": "{\"title\":\"Week-activity [twinttweet]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"histogram\",\"mode\":\"normal\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{},\"customLabel\":\"Days of the week\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\",\"customLabel\":\"\"}}]}",
78 | "uiStateJSON": "{}",
79 | "description": "",
80 | "version": 1,
81 | "kibanaSavedObjectMeta": {
82 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
83 | }
84 | }
85 | },
86 | {
87 | "_id": "b45ec590-c267-11e8-bcd4-3956fe930db7",
88 | "_type": "visualization",
89 | "_source": {
90 | "title": "Heat-map [twinttweets]",
91 | "visState": "{\"title\":\"Heat-map [twinttweets]\",\"type\":\"heatmap\",\"params\":{\"type\":\"heatmap\",\"addTooltip\":true,\"addLegend\":true,\"enableHover\":true,\"legendPosition\":\"right\",\"times\":[],\"colorsNumber\":10,\"colorSchema\":\"Reds\",\"setColorRange\":false,\"colorsRange\":[{\"from\":0,\"to\":10},{\"from\":10,\"to\":100},{\"from\":100,\"to\":200},{\"from\":200,\"to\":500},{\"from\":500,\"to\":1000},{\"from\":1000,\"to\":2000},{\"from\":2000,\"to\":3000},{\"from\":3000,\"to\":4000},{\"from\":4000,\"to\":5000},{\"from\":7000,\"to\":null}],\"invertColors\":false,\"percentageMode\":false,\"valueAxes\":[{\"show\":false,\"id\":\"ValueAxis-1\",\"type\":\"value\",\"scale\":{\"type\":\"linear\",\"defaultYExtents\":true},\"labels\":{\"show\":false,\"rotate\":270,\"overwriteColor\":false,\"color\":\"#555\"}}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"group\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{\"min\":0,\"max\":2}}}]}",
92 | "uiStateJSON": "{\"vis\":{\"defaultColors\":{\"3 - 592\":\"rgb(255,245,240)\",\"592 - 1.180\":\"rgb(254,228,216)\",\"1.180 - 1.769\":\"rgb(253,202,181)\",\"1.769 - 2.357\":\"rgb(252,171,142)\",\"2.357 - 2.945\":\"rgb(252,138,106)\",\"2.945 - 3.534\":\"rgb(251,106,74)\",\"3.534 - 4.122\":\"rgb(241,68,50)\",\"4.122 - 4.711\":\"rgb(217,38,35)\",\"4.711 - 5.299\":\"rgb(188,20,26)\",\"5.299 - 5.887\":\"rgb(152,12,19)\"},\"colors\":{\"3 - 592\":\"#FCEACA\",\"592 - 1.180\":\"#F9E2D2\",\"1.180 - 1.769\":\"#F9BA8F\"}}}",
93 | "description": "",
94 | "version": 1,
95 | "kibanaSavedObjectMeta": {
96 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"filter\":[],\"query\":{\"language\":\"lucene\",\"query\":\"\"}}"
97 | }
98 | }
99 | }
100 | ]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiodns==3.0.0
2 | aiohttp==3.7.3
3 | aiohttp-socks==0.4.1
4 | aiosignal==1.2.0
5 | async-timeout==3.0.1
6 | attrs==21.4.0
7 | beautifulsoup4==4.10.0
8 | cchardet==2.1.7
9 | certifi==2021.10.8
10 | cffi==1.15.0
11 | chardet==3.0.4
12 | charset-normalizer==2.0.12
13 | dataclasses==0.6
14 | elastic-transport==8.1.0
15 | elasticsearch==8.1.1
16 | fake-useragent==0.1.11
17 | frozenlist==1.3.0
18 | geographiclib==1.52
19 | geopy==2.2.0
20 | googletransx==2.4.2
21 | idna==3.3
22 | multidict==6.0.2
23 | numpy==1.22.3
24 | pandas==1.4.1
25 | pycares==4.1.2
26 | pycparser==2.21
27 | PySocks==1.7.1
28 | python-dateutil==2.8.2
29 | pytz==2022.1
30 | requests==2.27.1
31 | schedule==1.1.0
32 | six==1.16.0
33 | soupsieve==2.3.1
34 | typing-extensions==4.1.1
35 | urllib3==1.26.9
36 | yarl==1.7.2
37 | setuptools~=57.0.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3
"""setuptools build/installation script for the twint package."""
from setuptools import setup
import os

# Package meta-data
NAME = 'twint'
DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
URL = 'https://github.com/twintproject/twint'
EMAIL = 'codyzacharias@pm.me'
AUTHOR = 'Cody Zacharias'
REQUIRES_PYTHON = '>=3.6.0'
VERSION = None  # None -> version is read from twint/__version__.py below

# Packages required
REQUIRED = [
    'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses',
    'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
    'schedule', 'geopy', 'fake-useragent', 'googletransx'
]

here = os.path.abspath(os.path.dirname(__file__))

# Use the README as the long description shown on PyPI.
# On Python 3 the builtin open() is identical to the legacy io.open().
with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = '\n' + f.read()

# Load the package's __version__.py (executed into `about` so the
# package itself is never imported during installation).
about = {}
if not VERSION:
    with open(os.path.join(here, NAME, '__version__.py')) as f:
        exec(f.read(), about)
else:
    about['__version__'] = VERSION

setup(
    name=NAME,
    version=about['__version__'],
    description=DESCRIPTION,
    long_description=long_description,
    long_description_content_type="text/markdown",
    author=AUTHOR,
    author_email=EMAIL,
    python_requires=REQUIRES_PYTHON,
    url=URL,
    packages=['twint', 'twint.storage'],
    entry_points={
        'console_scripts': [
            'twint = twint.cli:run_as_command',
        ],
    },
    install_requires=REQUIRED,
    dependency_links=[
        'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans'
    ],
    license='MIT',
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: Implementation :: CPython',
    ],
)
66 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import twint
2 | import os
3 |
4 | '''
5 | Test.py - Testing TWINT to make sure everything works.
6 | '''
7 |
8 |
9 | def test_reg(c, run):
10 | print("[+] Beginning vanilla test in {}".format(str(run)))
11 | run(c)
12 |
13 |
14 | def test_db(c, run):
15 | print("[+] Beginning DB test in {}".format(str(run)))
16 | c.Database = "test_twint.db"
17 | run(c)
18 |
19 |
def custom(c, run, _type):
    """Run with custom output columns selected for both tweets and users."""
    fields = ["id", "username"]
    print("[+] Beginning custom {} test in {}".format(_type, str(run)))
    c.Custom['tweet'] = list(fields)
    c.Custom['user'] = list(fields)
    run(c)
25 |
26 |
def test_json(c, run):
    """JSON-output test: a custom-field pass via custom(), then a plain pass."""
    c.Store_json = True
    c.Output = "test_twint.json"
    # First exercise custom column selection with JSON storage enabled.
    custom(c, run, "JSON")
    # Then a plain JSON run with the same config.
    print("[+] Beginning JSON test in {}".format(run))
    run(c)
33 |
34 |
def test_csv(c, run):
    """CSV-output test: a custom-field pass via custom(), then a plain pass."""
    c.Store_csv = True
    c.Output = "test_twint.csv"
    # First exercise custom column selection with CSV storage enabled.
    custom(c, run, "CSV")
    # Then a plain CSV run with the same config.
    print("[+] Beginning CSV test in {}".format(run))
    run(c)
41 |
42 |
def main():
    """Drive every storage test against a subset of twint run modes."""

    def make_config(full_user):
        # Separate config objects are necessary between run groups.
        cfg = twint.Config()
        cfg.Username = "verified"
        cfg.Limit = 20
        cfg.Store_object = True
        if full_user:
            cfg.User_full = True
        return cfg

    c = make_config(False)
    f = make_config(True)

    runs = [
        twint.run.Profile,  # this doesn't
        twint.run.Search,  # this works
        twint.run.Following,
        twint.run.Followers,
        twint.run.Favorites,
    ]

    tests = [test_reg, test_json, test_csv, test_db]

    # Something breaks if we don't split these up
    for run in runs[:3]:
        if run == twint.run.Search:
            c.Since = "2012-1-1 20:30:22"
            c.Until = "2017-1-1"
        else:
            c.Since = ""
            c.Until = ""

        for test in tests:
            test(c, run)

    for run in runs[3:]:
        for test in tests:
            test(f, run)

    # Remove artifacts produced by the DB/JSON/CSV tests.
    for _file in ("test_twint.db", "test_twint.json", "test_twint.csv"):
        os.remove(_file)

    print("[+] Testing complete!")


if __name__ == '__main__':
    main()
93 |
--------------------------------------------------------------------------------
/twint/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | TWINT - Twitter Intelligence Tool (formerly known as Tweep).
3 |
4 | See wiki on Github for in-depth details.
5 | https://github.com/twintproject/twint/wiki
6 |
7 | Licensed under MIT License
8 | Copyright (c) 2018 Cody Zacharias
9 | '''
10 | import logging, os
11 |
12 | from .config import Config
13 | from .__version__ import __version__
14 | from . import run
15 |
16 | _levels = {
17 | 'info': logging.INFO,
18 | 'debug': logging.DEBUG
19 | }
20 |
21 | _level = os.getenv('TWINT_DEBUG', 'info')
22 | _logLevel = _levels[_level]
23 |
24 | if _level == "debug":
25 | logger = logging.getLogger()
26 | _output_fn = 'twint.log'
27 | logger.setLevel(_logLevel)
28 | formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s')
29 | fileHandler = logging.FileHandler(_output_fn)
30 | fileHandler.setLevel(_logLevel)
31 | fileHandler.setFormatter(formatter)
32 | logger.addHandler(fileHandler)
33 |
--------------------------------------------------------------------------------
/twint/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (2, 1, 21)
2 |
3 | __version__ = '.'.join(map(str, VERSION))
4 |
--------------------------------------------------------------------------------
/twint/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | '''
3 | Twint.py - Twitter Intelligence Tool (formerly known as Tweep).
4 |
5 | See wiki on Github for in-depth details.
6 | https://github.com/twintproject/twint/wiki
7 |
8 | Licensed under MIT License
9 | Copyright (c) 2018 The Twint Project
10 | '''
11 | import sys
12 | import os
13 | import argparse
14 |
15 | from . import run
16 | from . import config
17 | from . import storage
18 |
19 |
def error(_error, message):
    """Print an error to stdout and terminate.

    Exits with status 1 so shells and scripts can detect the failure;
    the previous sys.exit(0) reported success on every error path.
    """
    print("[-] {}: {}".format(_error, message))
    sys.exit(1)
25 |
26 |
def check(args):
    """Validate parsed CLI arguments.

    Flags contradictory or incomplete argument combinations via error()
    (which terminates the process); returns None when args are acceptable.
    """
    if args.username is not None or args.userlist or args.members_list:
        if args.verified:
            error("Contradicting Args",
                  "Please use --verified in combination with -s.")
        if args.userid:
            error("Contradicting Args",
                  "--userid and -u cannot be used together.")
        if args.all:
            error("Contradicting Args",
                  "--all and -u cannot be used together.")
    elif args.search and args.timeline:
        error("Contradicting Args",
              "--s and --tl cannot be used together.")
    elif args.timeline and not args.username:
        error("Error", "-tl cannot be used without -u.")
    elif args.search is None:
        if args.custom_query is not None:
            pass
        elif (args.geo or args.near) is None and not (args.all or args.userid):
            error("Error", "Please use at least -u, -s, -g or --near.")
    elif args.all and args.userid:
        error("Contradicting Args",
              "--all and --userid cannot be used together")
    # Output-file requirements for the file-based storage backends.
    if args.output is None:
        if args.csv:
            error("Error", "Please specify an output file (Example: -o file.csv).")
        elif args.json:
            error("Error", "Please specify an output file (Example: -o file.json).")
    if args.backoff_exponent <= 0:
        error("Error", "Please specify a positive value for backoff_exponent")
    if args.min_wait_time < 0:
        error("Error", "Please specify a non negative value for min_wait_time")
62 |
63 |
def loadUserList(ul, _type):
    """Load a user list from a file path or a comma-separated string.

    ul: path to a file with one user per line, or a "user1,user2,..." string.
    _type: "search" returns a URL-encoded "from:" OR-query fragment;
           any other value returns the plain list of usernames.
    """
    path = os.path.abspath(ul)
    if os.path.exists(path):
        # Context manager so the file handle is always closed
        # (the previous open().read() leaked the handle).
        with open(path, "r") as f:
            userlist = f.read().splitlines()
    else:
        userlist = ul.split(",")
    if _type == "search":
        un = ""
        for user in userlist:
            un += "%20OR%20from%3A" + user
        # Drop the leading "%20OR%20from%3A" separator (15 characters).
        return un[15:]
    return userlist
77 |
78 |
def initialize(args):
    """Build a config.Config populated from parsed CLI arguments.

    Copies each command-line value onto the matching Config attribute
    and returns the Config instance consumed by the run.* entry points.
    """
    c = config.Config()
    # Target selection
    c.Username = args.username
    c.User_id = args.userid
    c.Search = args.search
    c.Geo = args.geo
    c.Location = args.location
    c.Near = args.near
    c.Lang = args.lang
    # Output destinations
    c.Output = args.output
    c.Elasticsearch = args.elasticsearch
    # Date/content filters
    c.Year = args.year
    c.Since = args.since
    c.Until = args.until
    c.Email = args.email
    c.Phone = args.phone
    c.Verified = args.verified
    # Storage formats
    c.Store_csv = args.csv
    c.Tabs = args.tabs
    c.Store_json = args.json
    c.Utc = args.utc
    c.Full_text = args.full_text
    c.Show_hashtags = args.hashtags
    c.Show_cashtags = args.cashtags
    c.Limit = args.limit
    c.Count = args.count
    c.Stats = args.stats
    c.Database = args.database
    c.To = args.to
    c.All = args.all
    c.Essid = args.essid
    c.Format = args.format
    c.User_full = args.user_full
    # Disabled upstream; kept for reference.
    # c.Profile_full = args.profile_full
    c.Pandas_type = args.pandas_type
    # Elasticsearch index names
    c.Index_tweets = args.index_tweets
    c.Index_follow = args.index_follow
    c.Index_users = args.index_users
    c.Debug = args.debug
    c.Resume = args.resume
    # Media filters
    c.Images = args.images
    c.Videos = args.videos
    c.Media = args.media
    c.Replies = args.replies
    c.Pandas_clean = args.pandas_clean
    # Proxy / Tor settings
    c.Proxy_host = args.proxy_host
    c.Proxy_port = args.proxy_port
    c.Proxy_type = args.proxy_type
    c.Tor_control_port = args.tor_control_port
    c.Tor_control_password = args.tor_control_password
    c.Retweets = args.retweets
    c.Custom_query = args.custom_query
    c.Popular_tweets = args.popular_tweets
    c.Skip_certs = args.skip_certs
    c.Hide_output = args.hide_output
    c.Native_retweets = args.native_retweets
    c.Min_likes = args.min_likes
    c.Min_retweets = args.min_retweets
    c.Min_replies = args.min_replies
    c.Links = args.links
    c.Source = args.source
    c.Members_list = args.members_list
    c.Filter_retweets = args.filter_retweets
    # Translation (googletransx)
    c.Translate = args.translate
    c.TranslateDest = args.translate_dest
    # Rate-limit backoff tuning
    c.Backoff_exponent = args.backoff_exponent
    c.Min_wait_time = args.min_wait_time
    return c
149 |
150 |
def options():
    """Build the twint argument parser and parse sys.argv.

    Returns the argparse.Namespace consumed by check() and initialize().
    """
    ap = argparse.ArgumentParser(prog="twint",
                                 usage="python3 %(prog)s [options]",
                                 description="TWINT - An Advanced Twitter Scraping Tool.")
    ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
    ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
    ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
    ap.add_argument("--near", help="Near a specified city.")
    ap.add_argument("--location", help="Show user's location (Experimental).", action="store_true")
    ap.add_argument("-l", "--lang", help="Search for Tweets in a specific language.")
    ap.add_argument("-o", "--output", help="Save output to a file.")
    ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
    ap.add_argument("--year", help="Filter Tweets before specified year.")
    ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
                    metavar="DATE")
    ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
                    metavar="DATE")
    ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
    ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
    ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
                    action="store_true")
    ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
    ap.add_argument("--tabs", help="Separate CSV fields with tab characters, not commas.", action="store_true")
    ap.add_argument("--json", help="Write as .json file", action="store_true")
    ap.add_argument("--utc", help="Treat time as UTC.", action="store_true")
    ap.add_argument("--full-text", help="Preserve full Tweet text.", action="store_true")
    ap.add_argument("--hashtags", help="Output hashtags in separate column.", action="store_true")
    ap.add_argument("--cashtags", help="Output cashtags in separate column.", action="store_true")
    ap.add_argument("--userid", help="Twitter user id.")
    ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
    ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
                    action="store_true")
    ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
                    action="store_true")
    ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.")
    ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME")
    ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME")
    ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true")
    ap.add_argument("--following", help="Scrape a person's follows", action="store_true")
    ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
    ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
    ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
    ap.add_argument("--proxy-port", help="The port of the proxy server.")
    ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051)
    ap.add_argument("--tor-control-password",
                    help="If proxy-host is set to tor, this is the password for the control port",
                    default="my_password")
    ap.add_argument("--essid",
                    help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
                    nargs="?", default="")
    ap.add_argument("--userlist", help="Userlist from list or file.")
    ap.add_argument("--retweets",
                    help="Include user's Retweets (Warning: limited).",
                    action="store_true")
    ap.add_argument("--format", help="Custom output format (See wiki for details).")
    ap.add_argument("--user-full",
                    help="Collect all user information (Use with followers or following only).",
                    action="store_true")
    # I am removing this feature for the time being, because it is no longer required, default method will do this
    # ap.add_argument("--profile-full",
    #                 help="Slow, but effective method of collecting a user's Tweets and RT.",
    #                 action="store_true")
    ap.add_argument(
        "-tl",
        "--timeline",
        help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)",
        action="store_true",
    )
    ap.add_argument("--translate",
                    help="Get tweets translated by Google Translate.",
                    action="store_true")
    ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).",
                    default="en")
    ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
    ap.add_argument("--pandas-type",
                    help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
    ap.add_argument("-it", "--index-tweets",
                    help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
    ap.add_argument("-if", "--index-follow",
                    help="Custom Elasticsearch Index name for Follows.",
                    nargs="?", default="twintgraph")
    ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
                    nargs="?", default="twintuser")
    ap.add_argument("--debug",
                    help="Store information in debug logs", action="store_true")
    ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID")
    ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
    ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
    ap.add_argument("--media",
                    help="Display Tweets with only images or videos.", action="store_true")
    ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
    ap.add_argument("-pc", "--pandas-clean",
                    help="Automatically clean Pandas dataframe at every scrape.")
    ap.add_argument("-cq", "--custom-query", help="Custom search query.")
    ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.",
                    action="store_true")
    ap.add_argument("-sc", "--skip-certs", help="Skip certs verification, useful for SSC.", action="store_false")
    ap.add_argument("-ho", "--hide-output", help="Hide output, no tweets will be displayed.", action="store_true")
    ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true")
    ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.")
    ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.")
    ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.")
    ap.add_argument("--links", help="Include or exclude tweets containing one or more links. If not specified" +
                                    " you will get both tweets that might contain links or not.")
    ap.add_argument("--source", help="Filter the tweets for specific source client.")
    ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.")
    ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true")
    ap.add_argument("--backoff-exponent", help="Specify a exponent for the polynomial backoff in case of errors.",
                    type=float, default=3.0)
    ap.add_argument("--min-wait-time", type=float, default=15,
                    help="specify a minimum wait time in case of scraping limit error. This value will be adjusted by twint if the value provided does not satisfy the limits constraints")
    args = ap.parse_args()

    return args
267 |
268 |
def main():
    """Main: parse and validate args, then dispatch to the matching run mode.

    The previous version repeated an identical --userlist fan-out block
    five times; it is factored into a single local helper.
    """
    args = options()
    check(args)

    if args.pandas_clean:
        storage.panda.clean()

    c = initialize(args)

    if args.userlist:
        c.Query = loadUserList(args.userlist, "search")

    if args.pandas_clean:
        storage.panda.clean()

    def _dispatch(action, list_type):
        """Run *action* once, or once per user when --userlist is given."""
        if args.userlist:
            for _user in loadUserList(args.userlist, list_type):
                args.username = _user
                action(initialize(args))
        else:
            action(c)

    if args.favorites:
        _dispatch(run.Favorites, "favorites")
    elif args.following:
        _dispatch(run.Following, "following")
    elif args.followers:
        _dispatch(run.Followers, "followers")
    elif args.retweets:  # or args.profile_full:
        _dispatch(run.Profile, "profile")
    elif args.user_full:
        _dispatch(run.Lookup, "userlist")
    elif args.timeline:
        run.Profile(c)
    else:
        run.Search(c)
335 |
336 |
def run_as_command():
    """Console-script entry point: refuse Python < 3.6, then run main()."""
    major, minor = sys.version_info.major, sys.version_info.minor
    if major < 3 or (major == 3 and minor < 6):
        print("[-] TWINT requires Python version 3.6+.")
        sys.exit(0)

    main()


if __name__ == '__main__':
    main()
347 |
--------------------------------------------------------------------------------
/twint/config.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Config:
    """Runtime configuration for a twint run.

    Every field has a default, so ``Config()`` is immediately usable; the
    CLI and library callers then override individual attributes.

    Bug fix: ``Custom`` used to be a single class-level dict shared (and
    mutated) across all Config instances; it is now created per instance
    via ``field(default_factory=...)``. ``All`` and ``Format`` were bare
    class attributes and are now proper annotated fields.
    """
    Username: Optional[str] = None
    User_id: Optional[str] = None
    Search: Optional[str] = None
    Lookup: bool = False
    Geo: str = ""
    Location: bool = False
    Near: Optional[str] = None
    Lang: Optional[str] = None
    Output: Optional[str] = None
    Elasticsearch: object = None
    Year: Optional[int] = None
    Since: Optional[str] = None
    Until: Optional[str] = None
    Email: Optional[str] = None
    Phone: Optional[str] = None
    Verified: bool = False
    Store_csv: bool = False
    Store_json: bool = False
    # Per-output-kind field overrides; per-instance to avoid shared mutable state.
    Custom: dict = field(default_factory=lambda: {"tweet": None, "user": None, "username": None})
    Show_hashtags: bool = False
    Show_cashtags: bool = False
    Limit: Optional[int] = None
    Count: Optional[int] = None
    Stats: bool = False
    Database: object = None
    To: Optional[str] = None
    All: object = None
    Debug: bool = False
    Format: Optional[str] = None
    Essid: str = ""
    Profile: bool = False
    Followers: bool = False
    Following: bool = False
    Favorites: bool = False
    TwitterSearch: bool = False
    User_full: bool = False
    # Profile_full: bool = False
    Store_object: bool = False
    Store_object_tweets_list: Optional[list] = None
    Store_object_users_list: Optional[list] = None
    Store_object_follow_list: Optional[list] = None
    Pandas_type: Optional[type] = None
    Pandas: bool = False
    Index_tweets: str = "twinttweets"
    Index_follow: str = "twintgraph"
    Index_users: str = "twintuser"
    Retries_count: int = 10
    Resume: object = None
    Images: bool = False
    Videos: bool = False
    Media: bool = False
    Replies: bool = False
    Pandas_clean: bool = True
    Lowercase: bool = True
    Pandas_au: bool = True
    Proxy_host: str = ""
    Proxy_port: int = 0
    Proxy_type: object = None
    Tor_control_port: int = 9051
    Tor_control_password: Optional[str] = None
    Retweets: bool = False
    Query: Optional[str] = None
    Hide_output: bool = False
    Custom_query: str = ""
    Popular_tweets: bool = False
    Skip_certs: bool = False
    Native_retweets: bool = False
    Min_likes: int = 0
    Min_retweets: int = 0
    Min_replies: int = 0
    Links: Optional[str] = None
    Source: Optional[str] = None
    Members_list: Optional[str] = None
    Filter_retweets: bool = False
    Translate: bool = False
    TranslateSrc: str = "en"
    TranslateDest: str = "en"
    Backoff_exponent: float = 3.0
    Min_wait_time: int = 0
    Bearer_token: Optional[str] = None
    Guest_token: Optional[str] = None
    deleted: Optional[list] = None
88 |
--------------------------------------------------------------------------------
/twint/datelock.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import logging as logme
4 |
5 | from .tweet import utc_to_local
6 |
7 |
class Datelock:
    # Date-window state produced by Set(); callers filter tweets against it.
    until = None            # datetime upper bound of the window
    since = None            # datetime lower bound of the window
    _since_def_user = None  # True when `since` was supplied by the user (see Set)
12 |
13 |
def convertToDateTime(string):
    """Normalise a date or datetime string to 'YYYY-MM-DD HH:MM:SS' form.

    A bare date gains a midnight time component, a full datetime passes
    through unchanged, and anything else collapses to the empty string.
    """
    parts = string.split()
    if len(parts) == 2:
        return string
    return string + " 00:00:00" if len(parts) == 1 else ""
23 |
24 |
def Set(Until, Since):
    """Build a Datelock covering [Since, Until], both converted to local time.

    Missing bounds fall back to "now" (upper) and Twitter's launch date
    (lower); `_since_def_user` records whether Since came from the user.
    """
    logme.debug(__name__+':Set')
    lock = Datelock()

    # Upper bound: user-supplied timestamp, otherwise the current moment.
    if Until:
        parsed = datetime.datetime.strptime(convertToDateTime(Until), "%Y-%m-%d %H:%M:%S")
        lock.until = utc_to_local(parsed)
    else:
        lock.until = datetime.datetime.today()

    # Lower bound: user-supplied, otherwise the 2006-03-21 default.
    lock._since_def_user = bool(Since)
    raw = convertToDateTime(Since) if Since else "2006-03-21 00:00:00"
    parsed = datetime.datetime.strptime(raw, "%Y-%m-%d %H:%M:%S")
    lock.since = utc_to_local(parsed)

    return lock
45 |
--------------------------------------------------------------------------------
/twint/feed.py:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 |
4 | from bs4 import BeautifulSoup
5 | from re import findall
6 | from json import loads
7 |
8 | import logging as logme
9 |
10 | from .tweet import utc_to_local, Tweet_formats
11 |
12 |
class NoMoreTweetsException(Exception):
    """Raised when a timeline response contains no further tweets to page through."""
    # The previous explicit __init__ only forwarded to super() and was
    # redundant: Exception already stores and renders its message.
16 |
17 |
def Follow(response):
    """Parse a followers/following HTML page into (follow, cursor).

    `follow` is the list of screen-name table cells; `cursor` is the
    pagination token scraped from the "more" button's href.
    NOTE(review): when the regex finds no token, `cursor` is returned as
    the raw list of button nodes rather than a string — callers appear to
    rely on truthiness only; confirm before changing.
    """
    logme.debug(__name__ + ':Follow')
    soup = BeautifulSoup(response, "html.parser")
    follow = soup.find_all("td", "info fifty screenname")
    cursor = soup.find_all("div", "w-button-more")
    try:
        # The next-page token is embedded in the button's query string.
        cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
    except IndexError:
        logme.critical(__name__ + ':Follow:IndexError')

    return follow, cursor
29 |
30 |
# TODO: this won't be used by --profile-full anymore. if it isn't used anywhere else, perhaps remove this in future
def Mobile(response):
    """Parse a mobile-Twitter profile page into (tweets, max_id).

    `tweets` are the metadata spans; `max_id` is the pagination token from
    the "more" button. NOTE(review): if no token is found, `max_id` stays
    bound to the raw list of button nodes rather than a string.
    """
    logme.debug(__name__ + ':Mobile')
    soup = BeautifulSoup(response, "html.parser")
    tweets = soup.find_all("span", "metadata")
    max_id = soup.find_all("div", "w-button-more")
    try:
        max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
    except Exception as e:
        logme.critical(__name__ + ':Mobile:' + str(e))

    return tweets, max_id
43 |
44 |
def MobileFav(response):
    """Parse a mobile-Twitter favorites page into (tweets, max_id).

    Mirrors Mobile()/Follow(): tweet tables plus the pagination token from
    the "more" button.
    """
    logme.debug(__name__ + ':MobileFav')
    soup = BeautifulSoup(response, "html.parser")
    tweets = soup.find_all("table", "tweet")
    max_id = soup.find_all("div", "w-button-more")
    try:
        max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
    except Exception as e:
        # Consistency fix: report through the module logger like the
        # sibling parsers instead of printing to stdout.
        logme.critical(__name__ + ':MobileFav:' + str(e))

    return tweets, max_id
55 |
56 |
57 | def _get_cursor(response):
58 | if isinstance(response, dict): # case 1
59 | try:
60 | next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][
61 | 'operation']['cursor']['value']
62 | except KeyError:
63 | # this is needed because after the first request location of cursor is changed
64 | next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][
65 | 'cursor']['value']
66 | else: # case 2
67 | next_cursor = response[-1]['content']['value']
68 | return next_cursor
69 |
70 |
def Json(response):
    """Parse the legacy JSON search payload into (feed, min_position).

    `feed` is the list of tweet <div> nodes from the embedded HTML;
    `min_position` is the pagination token.
    """
    logme.debug(__name__ + ':Json')
    json_response = loads(response)
    html = json_response["items_html"]
    soup = BeautifulSoup(html, "html.parser")
    feed = soup.find_all("div", "tweet")
    return feed, json_response["min_position"]
78 |
79 |
def parse_tweets(config, response):
    """Parse a timeline JSON payload into (feed, next_cursor).

    Handles two response shapes: the legacy 'globalObjects' search/profile
    payload (case 1) and the GraphQL user-timeline payload (case 2).
    Deleted tweets are recorded in config.deleted and skipped; promoted
    tweets (ads) are skipped silently.

    Raises NoMoreTweetsException when the payload contains no tweets, and
    ValueError when a timeline entry carries no recognisable tweet id.
    """
    logme.debug(__name__ + ':parse_tweets')
    response = loads(response)
    feed = []
    if 'globalObjects' in response:
        if len(response['globalObjects']['tweets']) == 0:
            msg = 'No more data!'
            raise NoMoreTweetsException(msg)
        for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']:
            # this will handle the cases when the timeline entry is a tweet
            if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or
                                                             timeline_entry['entryId'].startswith('tweet-')):
                if 'tweet' in timeline_entry['content']['item']['content']:
                    _id = timeline_entry['content']['item']['content']['tweet']['id']
                    # skip the ads
                    if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']:
                        continue
                elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \
                        timeline_entry['content']['item']['content']['tombstone']:
                    _id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id']
                else:
                    _id = None
                if _id is None:
                    raise ValueError('Unable to find ID of tweet in timeline.')
                try:
                    temp_obj = response['globalObjects']['tweets'][_id]
                except KeyError:
                    logme.info('encountered a deleted tweet with id {}'.format(_id))

                    config.deleted.append(_id)
                    continue
                temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
                # Retweets carry extra metadata about the original tweet.
                if 'retweeted_status_id_str' in temp_obj:
                    rt_id = temp_obj['retweeted_status_id_str']
                    _dt = response['globalObjects']['tweets'][rt_id]['created_at']
                    _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
                    _dt = utc_to_local(_dt)
                    _dt = str(_dt.strftime(Tweet_formats['datetime']))
                    temp_obj['retweet_data'] = {
                        'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'],
                        'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'],
                        'retweet_id': rt_id,
                        'retweet_date': _dt,
                    }
                feed.append(temp_obj)
        next_cursor = _get_cursor(response)  # case 1
    else:
        response = response['data']['user']['result']['timeline']
        entries = response['timeline']['instructions']
        for e in entries:
            if e.get('entries'):
                entries = e['entries']
                break
        # Presumably only the two cursor entries remain when the timeline
        # is exhausted — treated as "no more data".
        if len(entries) == 2:
            msg = 'No more data!'
            raise NoMoreTweetsException(msg)
        for timeline_entry in entries:
            if timeline_entry['content'].get('itemContent'):
                try:
                    temp_obj = timeline_entry['content']['itemContent']['tweet_results']['result']['legacy']
                    temp_obj['user_data'] = timeline_entry['content']['itemContent']['tweet_results']['result']['core']['user_results']['result']['legacy']
                    feed.append(temp_obj)
                except KeyError:
                    # Bug fix: the original bare `next` expression here was
                    # a no-op; `continue` makes the skip explicit.
                    continue
        next_cursor = _get_cursor(entries)  # case 2
    return feed, next_cursor
146 |
--------------------------------------------------------------------------------
/twint/format.py:
--------------------------------------------------------------------------------
1 | import logging as logme
2 |
def Tweet(config, t):
    """Render tweet `t` as one line of text.

    With config.Format set, placeholder tokens in the template are
    substituted in a fixed order; otherwise a default "<user> text" line
    is built. Hashtags, cashtags, stats and translation info are appended
    according to the config flags.
    """
    if config.Format:
        logme.debug(__name__+':Tweet:Format')
        # Substitutions are applied in this exact order (a replaced value
        # containing a later token would itself be substituted).
        substitutions = (
            ("{id}", t.id_str),
            ("{conversation_id}", t.conversation_id),
            ("{date}", t.datestamp),
            ("{time}", t.timestamp),
            ("{user_id}", t.user_id_str),
            ("{username}", t.username),
            ("{name}", t.name),
            ("{place}", t.place),
            ("{timezone}", t.timezone),
            ("{urls}", ",".join(t.urls)),
            ("{photos}", ",".join(t.photos)),
            ("{video}", str(t.video)),
            ("{thumbnail}", t.thumbnail),
            ("{tweet}", t.tweet),
            ("{language}", t.lang),
            ("{hashtags}", ",".join(t.hashtags)),
            ("{cashtags}", ",".join(t.cashtags)),
            ("{replies}", t.replies_count),
            ("{retweets}", t.retweets_count),
            ("{likes}", t.likes_count),
            ("{link}", t.link),
            ("{is_retweet}", str(t.retweet)),
            ("{user_rt_id}", str(t.user_rt_id)),
            ("{quote_url}", t.quote_url),
            ("{near}", t.near),
            ("{geo}", t.geo),
            ("{mentions}", ",".join(t.mentions)),
            ("{translate}", t.translate),
            ("{trans_src}", t.trans_src),
            ("{trans_dest}", t.trans_dest),
        )
        output = config.Format
        for token, value in substitutions:
            output = output.replace(token, value)
    else:
        logme.debug(__name__+':Tweet:notFormat')
        output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "

        # TODO: someone who is familiar with this code, needs to take a look at what this is
        # if t.retweet:
        #     output += "RT "

        output += f"<{t.username}> {t.tweet}"

    if config.Show_hashtags:
        output += f" {','.join(t.hashtags)}"
    if config.Show_cashtags:
        output += f" {','.join(t.cashtags)}"
    if config.Stats:
        output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
    if config.Translate:
        output += f" {t.translate} {t.trans_src} {t.trans_dest}"
    return output
57 |
def User(_format, u):
    """Render user `u` as one line of text.

    With `_format` set, placeholder tokens are substituted; otherwise the
    default pipe-separated summary line is produced.
    """
    if _format:
        logme.debug(__name__+':User:Format')
        substitutions = (
            ("{id}", str(u.id)),
            ("{name}", u.name),
            ("{username}", u.username),
            ("{bio}", u.bio),
            ("{location}", u.location),
            ("{url}", u.url),
            ("{join_date}", u.join_date),
            ("{join_time}", u.join_time),
            ("{tweets}", str(u.tweets)),
            ("{following}", str(u.following)),
            ("{followers}", str(u.followers)),
            ("{likes}", str(u.likes)),
            ("{media}", str(u.media_count)),
            ("{private}", str(u.is_private)),
            ("{verified}", str(u.is_verified)),
            ("{avatar}", u.avatar),
            # A missing background image renders as the empty string.
            ("{background_image}", u.background_image if u.background_image else ""),
        )
        output = _format
        for token, value in substitutions:
            output = output.replace(token, value)
    else:
        logme.debug(__name__+':User:notFormat')
        output = (
            f"{u.id} | {u.name} | @{u.username} | Private: "
            f"{u.is_private} | Verified: {u.is_verified} |"
            f" Bio: {u.bio} | Location: {u.location} | Url: "
            f"{u.url} | Joined: {u.join_date} {u.join_time} "
            f"| Tweets: {u.tweets} | Following: {u.following}"
            f" | Followers: {u.followers} | Likes: {u.likes} "
            f"| Media: {u.media_count} | Avatar: {u.avatar}"
        )

    return output
92 |
--------------------------------------------------------------------------------
/twint/get.py:
--------------------------------------------------------------------------------
1 | from async_timeout import timeout
2 | from datetime import datetime
3 | from bs4 import BeautifulSoup
4 | import sys
5 | import socket
6 | import aiohttp
7 | from fake_useragent import UserAgent
8 | import asyncio
9 | import concurrent.futures
10 | import random
11 | from json import loads, dumps
12 | from aiohttp_socks import ProxyConnector, ProxyType
13 | from urllib.parse import quote
14 | import time
15 |
16 | from . import url
17 | from .output import Tweets, Users
18 | from .token import TokenExpiryException
19 |
20 | import logging as logme
21 |
# Module-level HTTP proxy URL; set by get_connector() when an http proxy is
# configured, and passed to every session.get() call in Response().
httpproxy = None

# Fallback desktop user-agent strings used by RandomUserAgent() when the
# fake_useragent service is unavailable. The Chrome entries are kept for
# reference but disabled.
user_agent_list = [
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.113 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.90 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.90 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.90 Safari/537.36',
    # 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/44.0.2403.157 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.x0.3112.113 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/57.0.2987.133 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/57.0.2987.133 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/55.0.2883.87 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/55.0.2883.87 Safari/537.36',

    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET x '
    'CLR 3.5.30729)',
]
61 |
62 |
def dict_to_url(dct):
    """Serialise `dct` to JSON and percent-encode it for use as a URL
    query parameter (some Twitter endpoints require this format)."""
    serialized = dumps(dct)
    return quote(serialized)
67 |
68 |
def get_connector(config):
    """Build the aiohttp connector for the configured proxy, if any.

    Returns a ProxyConnector for tor/socks proxies, or None when no proxy
    is configured. An http proxy also returns None: it is recorded in the
    module-level `httpproxy` and applied per-request in Response().
    Exits the process on inconsistent proxy arguments.
    """
    logme.debug(__name__ + ':get_connector')
    _connector = None
    if config.Proxy_host:
        if config.Proxy_host.lower() == "tor":
            # Default local Tor SOCKS port.
            _connector = ProxyConnector(
                host='127.0.0.1',
                port=9050,
                rdns=True)
        elif config.Proxy_port and config.Proxy_type:
            if config.Proxy_type.lower() == "socks5":
                _type = ProxyType.SOCKS5
            elif config.Proxy_type.lower() == "socks4":
                _type = ProxyType.SOCKS4
            elif config.Proxy_type.lower() == "http":
                # http proxies are handled per-request, not via a connector.
                global httpproxy
                httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port)
                return _connector
            else:
                logme.critical("get_connector:proxy-type-error")
                print("Error: Proxy types allowed are: http, socks5 and socks4. No https.")
                sys.exit(1)
            _connector = ProxyConnector(
                proxy_type=_type,
                host=config.Proxy_host,
                port=config.Proxy_port,
                rdns=True)
        else:
            logme.critical(__name__ + ':get_connector:proxy-port-type-error')
            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
            sys.exit(1)
    else:
        # A proxy port/type without a host is an argument error.
        if config.Proxy_port or config.Proxy_type:
            logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
            sys.exit(1)

    return _connector
107 |
108 |
async def RequestUrl(config, init):
    """Build the request URL for the configured mode and fetch it.

    Dispatches to the profile/search/following/followers/favorites URL
    builders, issues the request with guest-token auth headers, and
    returns the response body text.
    """
    logme.debug(__name__ + ':RequestUrl')
    _connector = get_connector(config)
    _serialQuery = ""
    params = []
    _url = ""
    _headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]

    # TODO : do this later
    if config.Profile:
        logme.debug(__name__ + ':RequestUrl:Profile')
        _url, params, _serialQuery = url.SearchProfile(config, init)
    elif config.TwitterSearch:
        logme.debug(__name__ + ':RequestUrl:TwitterSearch')
        _url, params, _serialQuery = await url.Search(config, init)
    else:
        if config.Following:
            logme.debug(__name__ + ':RequestUrl:Following')
            _url = await url.Following(config.Username, init)
        elif config.Followers:
            logme.debug(__name__ + ':RequestUrl:Followers')
            _url = await url.Followers(config.Username, init)
        else:
            logme.debug(__name__ + ':RequestUrl:Favorites')
            _url = await url.Favorites(config.Username, init)
        _serialQuery = _url

    response = await Request(_url, params=params, connector=_connector, headers=_headers)

    if config.Debug:
        # Bug fix: the log file handle was opened inline inside print() and
        # never closed; a context manager guarantees it is released.
        with open("twint-request_urls.log", "a", encoding="utf-8") as debug_log:
            print(_serialQuery, file=debug_log)

    return response
142 |
143 |
def ForceNewTorIdentity(config):
    """Ask the local Tor control port for a fresh circuit (SIGNAL NEWNYM).

    Authenticates with config.Tor_control_password against
    config.Tor_control_port on localhost. Failures are reported on stderr
    and logged, never raised. Always returns None.
    """
    logme.debug(__name__ + ':ForceNewTorIdentity')
    try:
        # Bug fix: the control socket was never closed and leaked until
        # garbage collection; the `with` block closes it deterministically.
        with socket.create_connection(('127.0.0.1', config.Tor_control_port)) as tor_c:
            tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode())
            response = tor_c.recv(1024)
        if response != b'250 OK\r\n250 OK\r\n':
            sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
            logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
    except Exception as e:
        logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
        sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
        sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')
157 |
158 |
async def Request(_url, connector=None, params=None, headers=None):
    """Open a one-shot aiohttp session and fetch `_url` via Response()."""
    logme.debug(__name__ + ':Request:Connector')
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        return await Response(session, _url, params)
163 |
164 |
async def Response(session, _url, params=None):
    """GET `_url` with retries and return the response body text.

    Raises TokenExpiryException on HTTP 429 (rate limited) or after all
    connection retries are exhausted.
    """
    logme.debug(__name__ + ':Response')
    retries = 5
    wait = 10  # No basis, maybe work with 0
    for attempt in range(retries + 1):
        try:
            with timeout(120):
                async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
                    resp = await response.text()
                    if response.status == 429:  # 429 implies Too many requests i.e. Rate Limit Exceeded
                        raise TokenExpiryException(loads(resp)['errors'][0]['message'])
                    return resp
        except aiohttp.client_exceptions.ClientConnectorError as exc:
            if attempt < retries:
                retrying = ', retrying'
                level = logme.WARNING
            else:
                retrying = ''
                level = logme.ERROR
            logme.log(level, f'Error retrieving {_url}: {exc!r}{retrying}')
            if attempt < retries:
                # Bug fix: time.sleep() blocked the whole event loop inside
                # this coroutine; asyncio.sleep yields control while waiting.
                await asyncio.sleep(wait)
            else:
                logme.fatal(f'{retries + 1} requests to {_url} failed, giving up.')
                raise TokenExpiryException(f'{exc!r}')
189 | raise TokenExpiryException(f'{exc!r}')
190 |
191 |
async def RandomUserAgent(wa=None):
    """Return a user-agent string.

    With `wa` truthy, return a fixed desktop Chrome UA; otherwise ask
    fake_useragent for a random one, falling back to the built-in
    `user_agent_list` on any failure.
    """
    logme.debug(__name__ + ':RandomUserAgent')
    if wa:
        return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    try:
        return UserAgent(verify_ssl=False, use_cache_server=False).random
    except Exception:
        # Bug fix: was a bare `except:` that also swallowed
        # KeyboardInterrupt/SystemExit.
        return random.choice(user_agent_list)
200 |
async def Username(_id, bearer_token, guest_token):
    """Resolve a screen name via the GraphQL UserByScreenName endpoint.

    `_id` is the screen name to look up; returns the canonical
    `screen_name` field from the response payload.
    """
    logme.debug(__name__ + ':Username')
    _dct = {'screen_name': _id, 'withSafetyModeUserFields': False, 'withSuperFollowsUserFields': False}
    _url = "https://twitter.com/i/api/graphql/Bhlf1dYJ3bYCKmLfeEQ31A/UserByScreenName?variables={}".format(
        dict_to_url(_dct))
    _headers = {
        'authorization': bearer_token,
        'x-guest-token': guest_token,
    }
    # Bug fix: removed a leftover debug print of the request payload/URL.
    r = await Request(_url, headers=_headers)
    j_r = loads(r)
    username = j_r['data']['user']['result']['legacy']['screen_name']
    return username
215 |
216 |
async def Tweet(url, config, conn):
    """Fetch a single tweet page and feed its tweet nodes to output.Tweets.

    All errors are logged and swallowed so one bad tweet does not abort a
    run. NOTE(review): Tweets() is called here with four arguments while
    output.Tweets accepts three — any resulting TypeError is silenced by
    this handler; confirm which signature is intended.
    """
    logme.debug(__name__ + ':Tweet')
    try:
        response = await Request(url)
        soup = BeautifulSoup(response, "html.parser")
        tweets = soup.find_all("div", "tweet")
        await Tweets(tweets, config, conn, url)
    except Exception as e:
        logme.critical(__name__ + ':Tweet:' + str(e))
226 |
227 |
async def User(username, config, conn, user_id=False):
    """Fetch a user profile via the GraphQL UserByScreenName endpoint.

    With `user_id=True`, return the account's rest_id (or None when the
    key is missing); otherwise forward the parsed payload to output.Users.
    Unexpected errors are logged and re-raised.
    """
    logme.debug(__name__ + ':User')
    _dct = {'screen_name': username, 'withHighlightedLabel': False}
    _url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}' \
        .format(dict_to_url(_dct))
    _headers = {
        'authorization': config.Bearer_token,
        'x-guest-token': config.Guest_token,
    }
    try:
        response = await Request(_url, headers=_headers)
        j_r = loads(response)
        if user_id:
            # Caller only wants the numeric id, not the full profile.
            try:
                _id = j_r['data']['user']['rest_id']
                return _id
            except KeyError as e:
                logme.critical(__name__ + ':User:' + str(e))
                return
        await Users(j_r, config, conn)
    except Exception as e:
        logme.critical(__name__ + ':User:' + str(e))
        raise
251 |
252 |
def Limit(Limit, count):
    """Return True once `count` has reached the configured limit.

    `Limit` may be None (meaning "no limit") or a numeric value / numeric
    string. Returns an explicit bool instead of the previous implicit
    None for the "keep going" case (same truthiness for callers).
    """
    logme.debug(__name__ + ':Limit')
    return Limit is not None and count >= int(Limit)
257 |
258 |
async def Multi(feed, config, conn):
    """Fetch each item in `feed` (tweet pages or user pages) and return the
    number of items processed. All exceptions are swallowed (see notes).
    """
    logme.debug(__name__ + ':Multi')
    count = 0
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            loop = asyncio.get_event_loop()
            futures = []
            for tweet in feed:
                count += 1
                # NOTE(review): Config defines no Profile_full attribute (it
                # is commented out in config.py), so this access may raise
                # AttributeError, silenced by the outer except — confirm.
                if config.Favorites or config.Profile_full:
                    logme.debug(__name__ + ':Multi:Favorites-profileFull')
                    link = tweet.find("a")["href"]
                    url = f"https://twitter.com{link}&lang=en"
                elif config.User_full:
                    logme.debug(__name__ + ':Multi:userFull')
                    username = tweet.find("a")["name"]
                    url = f"http://twitter.com/{username}?lang=en"
                else:
                    logme.debug(__name__ + ':Multi:else-url')
                    link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
                    url = f"https://twitter.com{link}?lang=en"

                if config.User_full:
                    logme.debug(__name__ + ':Multi:user-full-Run')
                    # NOTE(review): `await User(...)` runs the coroutine and
                    # hands its RESULT (None) to run_in_executor, which later
                    # fails with "'NoneType' object is not callable" — exactly
                    # what the comment in the except block describes.
                    futures.append(loop.run_in_executor(executor, await User(url,
                                                                  config, conn)))
                else:
                    logme.debug(__name__ + ':Multi:notUser-full-Run')
                    futures.append(loop.run_in_executor(executor, await Tweet(url,
                                                                  config, conn)))
            logme.debug(__name__ + ':Multi:asyncioGather')
            await asyncio.gather(*futures)
    except Exception as e:
        # TODO: fix error not error
        # print(str(e) + " [x] get.Multi")
        # will return "'NoneType' object is not callable"
        # but still works
        # logme.critical(__name__+':Multi:' + str(e))
        pass

    return count
300 |
--------------------------------------------------------------------------------
/twint/output.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from . import format, get
4 | from .tweet import Tweet
5 | from .user import User
6 | from .storage import db, elasticsearch, write, panda
7 |
8 | import logging as logme
9 |
10 | follows_list = []
11 | tweets_list = []
12 | users_list = []
13 |
14 | author_list = {''}
15 | author_list.pop()
16 |
17 | # used by Pandas
18 | _follows_object = {}
19 |
20 |
21 | def _formatDateTime(datetimestamp):
22 | try:
23 | return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
24 | except ValueError:
25 | return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())
26 |
27 |
28 | def _clean_follow_list():
29 | logme.debug(__name__ + ':clean_follow_list')
30 | global _follows_object
31 | _follows_object = {}
32 |
33 |
def clean_lists():
    """Reset the module-level result accumulators used by Store_object."""
    logme.debug(__name__ + ':clean_lists')
    global follows_list, tweets_list, users_list
    follows_list = []
    tweets_list = []
    users_list = []
42 |
43 |
def datecheck(datetimestamp, config):
    """Return True when `datetimestamp` falls inside the configured
    [Since, Until] window; each bound is optional."""
    logme.debug(__name__ + ':datecheck')
    if config.Since:
        logme.debug(__name__ + ':datecheck:SinceTrue')
        if _formatDateTime(datetimestamp) < _formatDateTime(config.Since):
            return False
    if config.Until:
        logme.debug(__name__ + ':datecheck:UntilTrue')
        if _formatDateTime(datetimestamp) > _formatDateTime(config.Until):
            return False
    logme.debug(__name__ + ':datecheck:dateRangeFalse')
    return True
64 |
65 |
# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
# `tweets` list along with the other tweets
def is_tweet(tw):
    """Return True when `tw` looks like a tweet node, i.e. it carries a
    "data-item-id" key/attribute."""
    try:
        tw["data-item-id"]
        logme.debug(__name__ + ':is_tweet:True')
        return True
    except (KeyError, TypeError):
        # Bug fix: was a bare `except:` that also caught KeyboardInterrupt
        # and SystemExit; only a missing key or a non-subscriptable input
        # means "not a tweet".
        logme.critical(__name__ + ':is_tweet:False')
        return False
76 |
77 |
def _output(obj, output, config, **extra):
    """Route a formatted result to the configured sinks.

    `obj` is the parsed object (tweet, user, or a plain username string),
    `output` the already-formatted text line. Handles optional
    lowercasing, file output (csv/json/text), and console printing.
    """
    logme.debug(__name__ + ':_output')
    if config.Lowercase:
        if isinstance(obj, str):
            logme.debug(__name__ + ':_output:Lowercase:username')
            obj = obj.lower()
        elif obj.__class__.__name__ == "user":
            # User objects are deliberately left untouched.
            logme.debug(__name__ + ':_output:Lowercase:user')
            pass
        elif obj.__class__.__name__ == "tweet":
            # Lowercase the author plus every mention/hashtag/cashtag in place.
            logme.debug(__name__ + ':_output:Lowercase:tweet')
            obj.username = obj.username.lower()
            author_list.update({obj.username})
            for dct in obj.mentions:
                for key, val in dct.items():
                    dct[key] = val.lower()
            for i in range(len(obj.hashtags)):
                obj.hashtags[i] = obj.hashtags[i].lower()
            for i in range(len(obj.cashtags)):
                obj.cashtags[i] = obj.cashtags[i].lower()
        else:
            logme.info('_output:Lowercase:hiddenTweetFound')
            print("[x] Hidden tweet found, account suspended due to violation of TOS")
            return
    if config.Output != None:
        if config.Store_csv:
            try:
                write.Csv(obj, config)
                logme.debug(__name__ + ':_output:CSV')
            except Exception as e:
                logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
                print(str(e) + " [x] output._output")
        elif config.Store_json:
            write.Json(obj, config)
            logme.debug(__name__ + ':_output:JSON')
        else:
            write.Text(output, config.Output)
            logme.debug(__name__ + ':_output:Text')

    if config.Elasticsearch:
        # Progress dot instead of the full line when indexing to ES.
        logme.debug(__name__ + ':_output:Elasticsearch')
        print("", end=".", flush=True)
    else:
        if not config.Hide_output:
            try:
                print(output.replace('\n', ' '))
            except UnicodeEncodeError:
                logme.critical(__name__ + ':_output:UnicodeEncodeError')
                print("unicode error [x] output._output")
127 |
128 |
async def checkData(tweet, config, conn):
    """Parse a raw tweet, filter it by the configured date window, and
    dispatch it to every enabled sink (database, pandas, store-object
    lists, Elasticsearch, and text/console output)."""
    logme.debug(__name__ + ':checkData')
    tweet = Tweet(tweet, config)
    if not tweet.datestamp:
        # A missing datestamp indicates a hidden/suspended-account tweet.
        logme.critical(__name__ + ':checkData:hiddenTweetFound')
        print("[x] Hidden tweet found, account suspended due to violation of TOS")
        return
    if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
        output = format.Tweet(config, tweet)
        if config.Database:
            logme.debug(__name__ + ':checkData:Database')
            db.tweets(conn, tweet, config)
        if config.Pandas:
            logme.debug(__name__ + ':checkData:Pandas')
            panda.update(tweet, config)
        if config.Store_object:
            logme.debug(__name__ + ':checkData:Store_object')
            # Prefer a caller-supplied list; fall back to the module global.
            if hasattr(config.Store_object_tweets_list, 'append'):
                config.Store_object_tweets_list.append(tweet)
            else:
                tweets_list.append(tweet)
        if config.Elasticsearch:
            logme.debug(__name__ + ':checkData:Elasticsearch')
            elasticsearch.Tweet(tweet, config)
        _output(tweet, output, config)
    # else:
    #     logme.critical(__name__+':checkData:copyrightedTweet')
156 |
157 |
async def Tweets(tweets, config, conn):
    """Fan raw tweet payload(s) out to checkData based on the scrape mode.

    Favorites/location runs receive an iterable of tweets; search/profile
    runs receive a single payload; other modes filter by author unless
    retweets are wanted.
    """
    logme.debug(__name__ + ':Tweets')
    if config.Favorites or config.Location:
        logme.debug(__name__ + ':Tweets:fav+full+loc')
        for tw in tweets:
            await checkData(tw, config, conn)
    elif config.TwitterSearch or config.Profile:
        logme.debug(__name__ + ':Tweets:TwitterSearch')
        await checkData(tweets, config, conn)
    else:
        logme.debug(__name__ + ':Tweets:else')
        # Only keep tweets authored by the target user unless Retweets is set.
        if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
            await checkData(tweets, config, conn)
171 |
172 |
async def Users(u, config, conn):
    """Parse a user payload and dispatch it to every enabled sink
    (database, Elasticsearch, store-object lists, pandas, and output)."""
    logme.debug(__name__ + ':User')
    global users_list

    user = User(u)
    output = format.User(config.Format, user)

    if config.Database:
        logme.debug(__name__ + ':User:Database')
        db.user(conn, config, user)

    if config.Elasticsearch:
        logme.debug(__name__ + ':User:Elasticsearch')
        # Temporarily reformat join date/time for the ES index, then
        # restore the original display values afterwards.
        _save_date = user.join_date
        _save_time = user.join_time
        user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
        user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
        elasticsearch.UserProfile(user, config)
        user.join_date = _save_date
        user.join_time = _save_time

    if config.Store_object:
        logme.debug(__name__ + ':User:Store_object')

        # Prefer caller-supplied lists (follow takes priority); fall back
        # to the module-level accumulator.
        if hasattr(config.Store_object_follow_list, 'append'):
            config.Store_object_follow_list.append(user)
        elif hasattr(config.Store_object_users_list, 'append'):
            config.Store_object_users_list.append(user)
        else:
            users_list.append(user) # twint.user.user

    if config.Pandas:
        logme.debug(__name__ + ':User:Pandas+user')
        panda.update(user, config)

    _output(user, output, config)
209 |
210 |
async def Username(username, config, conn):
    """Persist one follower/following username to every enabled sink."""
    logme.debug(__name__ + ':Username')
    global _follows_object
    global follows_list
    # "following" when scraping who the target follows, "followers" otherwise.
    relation = config.Following * "following" + config.Followers * "followers"

    if config.Database:
        logme.debug(__name__ + ':Username:Database')
        db.follow(conn, config.Username, config.Followers, username)

    if config.Elasticsearch:
        logme.debug(__name__ + ':Username:Elasticsearch')
        elasticsearch.Follow(username, config)

    if config.Store_object:
        custom_list = config.Store_object_follow_list
        if hasattr(custom_list, 'append'):
            custom_list.append(username)
        else:
            follows_list.append(username)  # twint.user.user

    if config.Pandas:
        logme.debug(__name__ + ':Username:object+pandas')
        try:
            _ = _follows_object[config.Username][relation]
        except KeyError:
            # Note: intentionally replaces any existing entry for this user.
            _follows_object.update({config.Username: {relation: []}})
        _follows_object[config.Username][relation].append(username)
        if config.Pandas_au:
            logme.debug(__name__ + ':Username:object+pandas+au')
            panda.update(_follows_object[config.Username], config)
    _output(username, username, config)
242 |
--------------------------------------------------------------------------------
/twint/run.py:
--------------------------------------------------------------------------------
1 | import sys, os, datetime
2 | from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
3 |
4 | from . import datelock, feed, get, output, verbose, storage
5 | from .token import TokenExpiryException
6 | from . import token
7 | from .storage import db
8 | from .feed import NoMoreTweetsException
9 |
10 | import logging as logme
11 |
12 | import time
13 |
14 | bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \
15 | '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
16 |
17 |
class Twint:
    """One scraping session.

    Wires together token handling, feed fetching/parsing and output
    dispatching for every scrape mode (search, profile, favorites,
    followers/following, lookup).
    """

    def __init__(self, config):
        logme.debug(__name__ + ':Twint:__init__')
        if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
            logme.debug(__name__ + ':Twint:__init__:Resume')
            self.init = self.get_resume(config.Resume)
        else:
            self.init = -1

        config.deleted = []
        self.feed: list = [-1]
        self.count = 0
        self.user_agent = ""
        self.config = config
        self.config.Bearer_token = bearer
        # TODO might have to make some adjustments for it to work with multi-treading
        # USAGE : to get a new guest token simply do `self.token.refresh()`
        self.token = token.Token(config)
        self.token.refresh()
        self.conn = db.Conn(config.Database)
        self.d = datelock.Set(self.config.Until, self.config.Since)
        verbose.Elastic(config.Elasticsearch)

        if self.config.Store_object:
            logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
            output._clean_follow_list()

        if self.config.Pandas_clean:
            logme.debug(__name__ + ':Twint:__init__:pandas_clean')
            storage.panda.clean()

    def get_resume(self, resumeFile):
        """Return the last scroll cursor saved in *resumeFile* ('-1' when missing)."""
        if not os.path.exists(resumeFile):
            return '-1'
        with open(resumeFile, 'r') as rFile:
            _init = rFile.readlines()[-1].strip('\n')
            return _init

    async def Feed(self):
        """Fetch and parse one batch of results into self.feed/self.init,
        retrying with exponential backoff on transient failures."""
        logme.debug(__name__ + ':Twint:Feed')
        consecutive_errors_count = 0
        while True:
            # this will receive a JSON string, parse it into a `dict` and do the required stuff
            try:
                response = await get.RequestUrl(self.config, self.init)
            except TokenExpiryException as e:
                # Guest tokens expire periodically; refresh and retry once.
                logme.debug(__name__ + ':Twint:Feed:' + str(e))
                self.token.refresh()
                response = await get.RequestUrl(self.config, self.init)

            if self.config.Debug:
                print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))

            self.feed = []
            try:
                if self.config.Favorites:
                    self.feed, self.init = feed.MobileFav(response)
                    favorite_err_cnt = 0
                    if len(self.feed) == 0 and len(self.init) == 0:
                        # Retry the mobile favorites page a few times with a fresh UA.
                        while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5:
                            self.user_agent = await get.RandomUserAgent(wa=False)
                            response = await get.RequestUrl(self.config, self.init,
                                                            headers=[("User-Agent", self.user_agent)])
                            self.feed, self.init = feed.MobileFav(response)
                            favorite_err_cnt += 1
                            time.sleep(1)
                        if favorite_err_cnt == 5:
                            print("Favorite page could not be fetched")
                    if not self.count % 40:
                        time.sleep(5)
                elif self.config.Followers or self.config.Following:
                    self.feed, self.init = feed.Follow(response)
                    if not self.count % 40:
                        time.sleep(5)
                elif self.config.Profile or self.config.TwitterSearch:
                    try:
                        self.feed, self.init = feed.parse_tweets(self.config, response)
                    except NoMoreTweetsException as e:
                        logme.debug(__name__ + ':Twint:Feed:' + str(e))
                        print('[!] ' + str(e) + ' Scraping will stop now.')
                        print('found {} deleted tweets in this search.'.format(len(self.config.deleted)))
                        break
                break
            except TimeoutError as e:
                if self.config.Proxy_host.lower() == "tor":
                    print("[?] Timed out, changing Tor identity...")
                    if self.config.Tor_control_password is None:
                        logme.critical(__name__ + ':Twint:Feed:tor-password')
                        sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n")
                        sys.stderr.write(
                            "Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors"
                            "-controller-interface-directly\r\n")
                        break
                    else:
                        get.ForceNewTorIdentity(self.config)
                        continue
                else:
                    logme.critical(__name__ + ':Twint:Feed:' + str(e))
                    print(str(e))
                    break
            except Exception as e:
                if self.config.Profile or self.config.Favorites:
                    print("[!] Twitter does not return more data, scrape stops here.")
                    break

                logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
                # Sometimes Twitter says there is no data. But it's a lie.
                # raise
                consecutive_errors_count += 1
                if consecutive_errors_count < self.config.Retries_count:
                    # skip to the next iteration if wait time does not satisfy limit constraints
                    delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)

                    # if the delay is less than users set min wait time then replace delay
                    if self.config.Min_wait_time > delay:
                        delay = self.config.Min_wait_time

                    sys.stderr.write('sleeping for {} secs\n'.format(delay))
                    time.sleep(delay)
                    self.user_agent = await get.RandomUserAgent(wa=True)
                    continue
                logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
                sys.stderr.write(str(e) + " [x] run.Feed")
                sys.stderr.write(
                    "[!] if you get this error but you know for sure that more tweets exist, please open an issue and "
                    "we will investigate it!")
                break
        if self.config.Resume:
            print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))

    async def follow(self):
        """Collect one page of followers/following names."""
        await self.Feed()
        if self.config.User_full:
            logme.debug(__name__ + ':Twint:follow:userFull')
            self.count += await get.Multi(self.feed, self.config, self.conn)
        else:
            logme.debug(__name__ + ':Twint:follow:notUserFull')
            for user in self.feed:
                self.count += 1
                username = user.find("a")["name"]
                await output.Username(username, self.config, self.conn)

    async def favorite(self):
        """Parse one page of liked tweets from the mobile site markup and
        append the parsed dicts to config.favorited_tweets_list."""
        logme.debug(__name__ + ':Twint:favorite')
        await self.Feed()
        favorited_tweets_list = []
        for tweet in self.feed:
            tweet_dict = {}
            self.count += 1
            # BUG FIX: date_str must exist before the try block -- the except
            # handler below prints it, and previously raised NameError when an
            # early field lookup failed.
            date_str = ''
            try:
                tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id']
                t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"]
                tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1]
                tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ',
                                                                                                                '')
                tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text
                date_str = tweet.find("td", {"class": "timestamp"}).find("a").text
                # test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"]
                # date_str = test_dates[3]
                if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"):  # 25m 1h
                    dateu = str(datetime.date.today())
                    tweet_dict['date'] = dateu
                elif ',' in date_str:  # Aug 21, 2019
                    sp = date_str.replace(',', '').split(' ')
                    date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2]
                    dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                    tweet_dict['date'] = dateu
                elif len(date_str.split(' ')) == 3:  # 28 Jun 19
                    sp = date_str.split(' ')
                    if len(sp[2]) == 2:
                        sp[2] = '20' + sp[2]
                    date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2]
                    dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                    tweet_dict['date'] = dateu
                else:  # Aug 21
                    sp = date_str.split(' ')
                    date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year)
                    dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                    tweet_dict['date'] = dateu

                favorited_tweets_list.append(tweet_dict)

            except Exception as e:
                logme.critical(__name__ + ':Twint:favorite:favorite_field_lack')
                print("shit: ", date_str, " ", str(e))

        try:
            self.config.favorited_tweets_list += favorited_tweets_list
        except AttributeError:
            self.config.favorited_tweets_list = favorited_tweets_list

    async def profile(self):
        """Dispatch one page of profile tweets to the output layer."""
        await self.Feed()
        logme.debug(__name__ + ':Twint:profile')
        for tweet in self.feed:
            self.count += 1
            await output.Tweets(tweet, self.config, self.conn)

    async def tweets(self):
        """Dispatch one page of search results to the output layer."""
        await self.Feed()
        # TODO : need to take care of this later
        if self.config.Location:
            logme.debug(__name__ + ':Twint:tweets:location')
            self.count += await get.Multi(self.feed, self.config, self.conn)
        else:
            logme.debug(__name__ + ':Twint:tweets:notLocation')
            for tweet in self.feed:
                self.count += 1
                await output.Tweets(tweet, self.config, self.conn)

    async def main(self, callback=None):
        """Entry coroutine: schedule run() and await it, with optional done-callback."""
        task = ensure_future(self.run())  # Might be changed to create_task in 3.7+.

        if callback:
            task.add_done_callback(callback)

        await task

    async def run(self):
        """Top-level scrape loop: resolve user id/name, then loop the
        mode-specific scraper until the feed dries up or the limit is hit."""
        if self.config.TwitterSearch:
            self.user_agent = await get.RandomUserAgent(wa=True)
        else:
            self.user_agent = await get.RandomUserAgent()

        if self.config.User_id is not None and self.config.Username is None:
            logme.debug(__name__ + ':Twint:main:user_id')
            self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                      self.config.Guest_token)

        if self.config.Username is not None and self.config.User_id is None:
            logme.debug(__name__ + ':Twint:main:username')

            self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True)
            if self.config.User_id is None:
                raise ValueError("Cannot find twitter account with name = " + self.config.Username)

        # TODO : will need to modify it to work with the new endpoints
        if self.config.TwitterSearch and self.config.Since and self.config.Until:
            logme.debug(__name__ + ':Twint:main:search+since+until')
            # NOTE(review): loop termination relies on the feed drying up or the
            # limit being reached; self.d is not visibly advanced here -- confirm
            # against twint.datelock before changing this loop.
            while self.d.since < self.d.until:
                self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S")
                self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S")
                if len(self.feed) > 0:
                    await self.tweets()
                else:
                    logme.debug(__name__ + ':Twint:main:gettingNewTweets')
                    break

                if get.Limit(self.config.Limit, self.count):
                    break
        elif self.config.Lookup:
            await self.Lookup()
        else:
            logme.debug(__name__ + ':Twint:main:not-search+since+until')
            while True:
                if len(self.feed) > 0:
                    if self.config.Followers or self.config.Following:
                        logme.debug(__name__ + ':Twint:main:follow')
                        await self.follow()
                    elif self.config.Favorites:
                        logme.debug(__name__ + ':Twint:main:favorites')
                        await self.favorite()
                    elif self.config.Profile:
                        logme.debug(__name__ + ':Twint:main:profile')
                        await self.profile()
                    elif self.config.TwitterSearch:
                        logme.debug(__name__ + ':Twint:main:twitter-search')
                        await self.tweets()
                else:
                    logme.debug(__name__ + ':Twint:main:no-more-tweets')
                    break

                # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
                if get.Limit(self.config.Limit, self.count):
                    logme.debug(__name__ + ':Twint:main:reachedLimit')
                    break

        if self.config.Count:
            verbose.Count(self.count, self.config)

    async def Lookup(self):
        """Fetch and store profile information for the configured user."""
        logme.debug(__name__ + ':Twint:Lookup')

        try:
            if self.config.User_id is not None and self.config.Username is None:
                logme.debug(__name__ + ':Twint:Lookup:user_id')
                self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                          self.config.Guest_token)
            await get.User(self.config.Username, self.config, db.Conn(self.config.Database))

        except Exception:
            logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.')
            raise
312 |
313 |
def run(config, callback=None):
    """Obtain (or create) an asyncio event loop and run one Twint session.

    *callback* is forwarded to Twint.main as a task done-callback.
    """
    logme.debug(__name__ + ':run')
    try:
        get_event_loop()
    except RuntimeError as e:
        # Worker threads have no default loop; create one on demand.
        if "no current event loop" not in str(e):
            logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.')
            raise
        set_event_loop(new_event_loop())
    except Exception as e:
        logme.exception(
            __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.')
        raise

    get_event_loop().run_until_complete(Twint(config).main(callback))
330 |
331 |
def Favorites(config):
    """Scrape the configured user's liked tweets."""
    logme.debug(__name__ + ':Favorites')
    config.Favorites = True
    for mode in ('Following', 'Followers', 'Profile', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
342 |
343 |
def Followers(config):
    """Scrape the accounts that follow the configured user."""
    logme.debug(__name__ + ':Followers')
    config.Followers = True
    for mode in ('Following', 'Profile', 'Favorites', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("followers")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
359 |
360 |
def Following(config):
    """Scrape the accounts the configured user follows."""
    logme.debug(__name__ + ':Following')
    config.Following = True
    for mode in ('Followers', 'Profile', 'Favorites', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("following")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
376 |
377 |
def Lookup(config):
    """Fetch profile information for the configured user."""
    logme.debug(__name__ + ':Lookup')
    config.Profile = False
    config.Lookup = True
    config.Favorites = False
    # BUG FIX: was `config.FOllowing = False`, which set a misspelled
    # attribute and left config.Following in whatever state it had before.
    config.Following = False
    config.Followers = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("user")
389 |
390 |
def Profile(config):
    """Scrape tweets from the configured user's profile timeline."""
    logme.debug(__name__ + ':Profile')
    config.Profile = True
    for mode in ('Favorites', 'Following', 'Followers', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
401 |
402 |
def Search(config, callback=None):
    """Run a Twitter search with the configured query.

    *callback* is invoked when the underlying scrape task completes.
    """
    logme.debug(__name__ + ':Search')
    config.TwitterSearch = True
    for mode in ('Favorites', 'Following', 'Followers', 'Profile'):
        setattr(config, mode, False)
    run(config, callback)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
413 |
--------------------------------------------------------------------------------
/twint/storage/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woluxwolu/twint/21acd871948cc79c8e95bc72bc3b9064cb968012/twint/storage/__init__.py
--------------------------------------------------------------------------------
/twint/storage/db.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import sys
3 | import time
4 | import hashlib
5 |
6 | from datetime import datetime
7 |
def Conn(database):
    """Open (or create) the SQLite database at *database*.

    Returns the connection, or '' when database output is disabled.
    Exits the process if the database cannot be initialised.
    """
    if not database:
        return ""
    print("[+] Inserting into Database: " + str(database))
    conn = init(database)
    if isinstance(conn, str):
        # init() returns the error message as a string on failure.
        print(conn)
        sys.exit(1)
    return conn
19 |
def init(db):
    """Create (if needed) every table twint writes to and return the connection.

    Mirrors the contract expected by Conn(): on any failure the exception
    text is returned as a string instead of raising.
    """
    schema = (
        # One row per observed profile snapshot, versioned by content hash.
        """CREATE TABLE IF NOT EXISTS users (
            id integer not null,
            id_str text not null,
            name text,
            username text not null,
            bio text,
            location text,
            url text,
            join_date text not null,
            join_time text not null,
            tweets integer,
            following integer,
            followers integer,
            likes integer,
            media integer,
            private integer not null,
            verified integer not null,
            profile_image_url text not null,
            background_image text,
            hex_dig text not null,
            time_update integer not null,
            CONSTRAINT users_pk PRIMARY KEY (id, hex_dig)
        );""",
        """CREATE TABLE IF NOT EXISTS tweets (
            id integer not null,
            id_str text not null,
            tweet text default '',
            language text default '',
            conversation_id text not null,
            created_at integer not null,
            date text not null,
            time text not null,
            timezone text not null,
            place text default '',
            replies_count integer,
            likes_count integer,
            retweets_count integer,
            user_id integer not null,
            user_id_str text not null,
            screen_name text not null,
            name text default '',
            link text,
            mentions text,
            hashtags text,
            cashtags text,
            urls text,
            photos text,
            thumbnail text,
            quote_url text,
            video integer,
            geo text,
            near text,
            source text,
            time_update integer not null,
            `translate` text default '',
            trans_src text default '',
            trans_dest text default '',
            PRIMARY KEY (id)
        );""",
        """CREATE TABLE IF NOT EXISTS retweets (
            user_id integer not null,
            username text not null,
            tweet_id integer not null,
            retweet_id integer not null,
            retweet_date integer,
            CONSTRAINT retweets_pk PRIMARY KEY (user_id, tweet_id),
            CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id),
            CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
        );""",
        """CREATE TABLE IF NOT EXISTS replies (
            tweet_id integer not null,
            user_id integer not null,
            username text not null,
            CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id),
            CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
        );""",
        """CREATE TABLE IF NOT EXISTS favorites (
            user_id integer not null,
            tweet_id integer not null,
            CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id),
            CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id),
            CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
        );""",
        """CREATE TABLE IF NOT EXISTS followers (
            id integer not null,
            follower_id integer not null,
            CONSTRAINT followers_pk PRIMARY KEY (id, follower_id),
            CONSTRAINT id_fk FOREIGN KEY (id) REFERENCES users(id),
            CONSTRAINT follower_id_fk FOREIGN KEY (follower_id) REFERENCES users(id)
        );""",
        """CREATE TABLE IF NOT EXISTS following (
            id integer not null,
            following_id integer not null,
            CONSTRAINT following_pk PRIMARY KEY (id, following_id),
            CONSTRAINT id_fk FOREIGN KEY (id) REFERENCES users(id),
            CONSTRAINT following_id_fk FOREIGN KEY (following_id) REFERENCES users(id)
        );""",
        """CREATE TABLE IF NOT EXISTS followers_names (
            user text not null,
            time_update integer not null,
            follower text not null,
            PRIMARY KEY (user, follower)
        );""",
        """CREATE TABLE IF NOT EXISTS following_names (
            user text not null,
            time_update integer not null,
            follows text not null,
            PRIMARY KEY (user, follows)
        );""",
    )
    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        for statement in schema:
            cursor.execute(statement)
        return conn
    except Exception as e:
        return str(e)
182 |
def fTable(Followers):
    """Return the *_names table matching the relation direction."""
    return "followers_names" if Followers else "following_names"
190 |
def uTable(Followers):
    """Return the id-link table matching the relation direction."""
    return "followers" if Followers else "following"
198 |
def follow(conn, Username, Followers, User):
    """Insert one follow relation row; duplicate rows are silently ignored."""
    try:
        cursor = conn.cursor()
        now_ms = round(time.time() * 1000)
        target_table = fTable(Followers)
        # Column order is (user, time_update, follower/follows) -- see init().
        cursor.execute(f"INSERT INTO {target_table} VALUES(?,?,?)", (User, now_ms, Username,))
        conn.commit()
    except sqlite3.IntegrityError:
        # Primary key (user, follower) collision: row already recorded.
        pass
210 |
def get_hash_id(conn, id):
    """Return the stored profile hash for user *id*, or -1 when unknown."""
    cursor = conn.cursor()
    cursor.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,))
    row = cursor.fetchone()
    return row[0] if row else -1
216 |
def user(conn, config, User):
    """Insert a snapshot of *User* into `users` (only when the profile changed),
    and link it into followers/following when scraping those relations."""
    try:
        cursor = conn.cursor()
        now_ms = round(time.time() * 1000)
        fields = [int(User.id), User.id, User.name, User.username, User.bio,
                  User.location, User.url, User.join_date, User.join_time,
                  User.tweets, User.following, User.followers, User.likes,
                  User.media_count, User.is_private, User.is_verified,
                  User.avatar, User.background_image]

        # The hash fingerprints the whole profile; a changed hash means a
        # new snapshot row must be written.
        hex_dig = hashlib.sha256(','.join(str(v) for v in fields).encode()).hexdigest()
        if get_hash_id(conn, User.id) != hex_dig:
            placeholders = ",".join("?" * 20)
            cursor.execute("INSERT INTO users VALUES(" + placeholders + ")",
                           tuple(fields) + (hex_dig, now_ms,))

        if config.Followers or config.Following:
            relation_table = uTable(config.Followers)
            cursor.execute(f"INSERT INTO {relation_table} VALUES(?,?)",
                           (config.User_id, int(User.id)))

        conn.commit()
    except sqlite3.IntegrityError:
        # Duplicate snapshot or relation row: safe to ignore.
        pass
241 |
def tweets(conn, Tweet, config):
    """Insert one tweet, plus its favorite/retweet/reply link rows.

    Duplicate inserts (IntegrityError) are silently ignored.
    """
    try:
        cursor = conn.cursor()
        now_ms = round(time.time() * 1000)
        row = (Tweet.id, Tweet.id_str, Tweet.tweet, Tweet.lang, Tweet.conversation_id,
               Tweet.datetime, Tweet.datestamp, Tweet.timestamp, Tweet.timezone,
               Tweet.place, Tweet.replies_count, Tweet.likes_count, Tweet.retweets_count,
               Tweet.user_id, Tweet.user_id_str, Tweet.username, Tweet.name, Tweet.link,
               ",".join(Tweet.mentions), ",".join(Tweet.hashtags), ",".join(Tweet.cashtags),
               ",".join(Tweet.urls), ",".join(Tweet.photos), Tweet.thumbnail,
               Tweet.quote_url, Tweet.video, Tweet.geo, Tweet.near, Tweet.source,
               now_ms, Tweet.translate, Tweet.trans_src, Tweet.trans_dest)
        placeholders = ",".join("?" * len(row))
        cursor.execute("INSERT INTO tweets VALUES(" + placeholders + ")", row)

        if config.Favorites:
            # Link the tweet to the account whose likes are being scraped.
            cursor.execute('INSERT INTO favorites VALUES(?,?)', (config.User_id, Tweet.id))

        if Tweet.retweet:
            rt_epoch = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S"))
            cursor.execute('INSERT INTO retweets VALUES(?,?,?,?,?)',
                           (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id,
                            int(Tweet.retweet_id), rt_epoch))

        if Tweet.reply_to:
            for reply in Tweet.reply_to:
                cursor.execute('INSERT INTO replies VALUES(?,?,?)',
                               (Tweet.id, int(reply['user_id']), reply['username']))

        conn.commit()
    except sqlite3.IntegrityError:
        pass
298 |
--------------------------------------------------------------------------------
/twint/storage/elasticsearch.py:
--------------------------------------------------------------------------------
1 | ## TODO - Fix Weekday situation
2 | from elasticsearch import Elasticsearch, helpers
3 | from geopy.geocoders import Nominatim
4 | from datetime import datetime
5 | import contextlib
6 | import sys
7 |
8 | _index_tweet_status = False
9 | _index_follow_status = False
10 | _index_user_status = False
11 | _is_near_def = False
12 | _is_location_def = False
13 | _near = {}
14 | _location = {}
15 |
16 | geolocator = Nominatim(user_agent="twint-1.2")
17 |
class RecycleObject(object):
    """A stand-in stream that silently discards everything written to it."""

    def write(self, junk):
        """Drop *junk* without writing anywhere."""
        pass

    def flush(self):
        """No-op; nothing is ever buffered."""
        pass
21 |
def getLocation(place, **options):
    """Geocode *place* with the module-level geolocator.

    When called with near=True or location=True, update the corresponding
    module-level cache and return True; otherwise return a lat/lon dict
    ({} when geocoding failed).
    """
    location = geolocator.geocode(place, timeout=1000)
    if not location:
        return {}
    coords = {"lat": location.latitude, "lon": location.longitude}
    if options.get("near"):
        global _near
        _near = coords
        return True
    if options.get("location"):
        global _location
        _location = coords
        return True
    return coords
36 |
def handleIndexResponse(response):
    """Interpret an Elasticsearch index-creation response.

    Returns True when the index already exists or was fully created,
    False when shard allocation was not acknowledged.
    """
    try:
        # A 400 response means the index already exists.
        if response["status"] == 400:
            return True
    except KeyError:
        pass
    if response["acknowledged"]:
        print("[+] Index \"" + response["index"] + "\" created!")
    else:
        print("[x] error index creation :: storage.elasticsearch.handleIndexCreation")
    if not response["shards_acknowledged"]:
        print("[x] error with shards :: storage.elasticsearch.HandleIndexCreation")
        return False
    print("[+] Shards acknowledged, everything is ready to be used!")
    return True
53 |
def createIndex(config, instance, **scope):
    """Create the tweet/follow/user index (chosen via scope=...) on *instance*.

    Index names come from the config (Index_tweets / Index_follow /
    Index_users). Returns the result of handleIndexResponse, or False when
    the scope kwarg is unrecognised. Creation output is silenced via
    nostdout(); ignore=400 makes an already-existing index non-fatal.
    """
    if scope.get("scope") == "tweet":
        tweets_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "long"},
                    "conversation_id": {"type": "long"},
                    "created_at": {"type": "text"},
                    "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "timezone": {"type": "keyword"},
                    "place": {"type": "keyword"},
                    "location": {"type": "keyword"},
                    "tweet": {"type": "text"},
                    "lang": {"type": "keyword"},
                    "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "user_id_str": {"type": "keyword"},
                    "username": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "name": {"type": "text"},
                    "profile_image_url": {"type": "text"},
                    "day": {"type": "integer"},
                    "hour": {"type": "integer"},
                    "link": {"type": "text"},
                    "retweet": {"type": "text"},
                    "essid": {"type": "keyword"},
                    "nlikes": {"type": "integer"},
                    "nreplies": {"type": "integer"},
                    "nretweets": {"type": "integer"},
                    "quote_url": {"type": "text"},
                    "video": {"type":"integer"},
                    "thumbnail": {"type":"text"},
                    "search": {"type": "text"},
                    "near": {"type": "text"},
                    "geo_near": {"type": "geo_point"},
                    "geo_tweet": {"type": "geo_point"},
                    "photos": {"type": "text"},
                    "user_rt_id": {"type": "keyword"},
                    "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "source": {"type": "keyword"},
                    "user_rt": {"type": "keyword"},
                    "retweet_id": {"type": "keyword"},
                    "reply_to": {
                        "type": "nested",
                        "properties": {
                            "user_id": {"type": "keyword"},
                            "username": {"type": "keyword"}
                        }
                    },
                    "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True},
                    "urls": {"type": "keyword"},
                    "translate": {"type": "text"},
                    "trans_src": {"type": "keyword"},
                    "trans_dest": {"type": "keyword"},
                }
            },
            "settings": {
                "number_of_shards": 1,
                "analysis": {
                    "normalizer": {
                        # Lowercase/asciifold hashtags, usernames and mentions
                        # so searches are case- and accent-insensitive.
                        "hashtag_normalizer": {
                            "type": "custom",
                            "char_filter": [],
                            "filter": ["lowercase", "asciifolding"]
                        }
                    }
                }
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "follow":
        follow_body = {
            "mappings": {
                "properties": {
                    "user": {"type": "keyword"},
                    "follow": {"type": "keyword"},
                    "essid": {"type": "keyword"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "user":
        user_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "name": {"type": "keyword"},
                    "username": {"type": "keyword"},
                    "bio": {"type": "text"},
                    "location": {"type": "keyword"},
                    "url": {"type": "text"},
                    "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "tweets": {"type": "integer"},
                    "following": {"type": "integer"},
                    "followers": {"type": "integer"},
                    "likes": {"type": "integer"},
                    "media": {"type": "integer"},
                    "private": {"type": "integer"},
                    "verified": {"type": "integer"},
                    "avatar": {"type": "text"},
                    "background_image": {"type": "text"},
                    "session": {"type": "keyword"},
                    "geo_user": {"type": "geo_point"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400)
        return handleIndexResponse(resp)
    else:
        print("[x] error index pre-creation :: storage.elasticsearch.createIndex")
        return False
175 |
@contextlib.contextmanager
def nostdout():
    """Silence stdout for the duration of the with-block.

    Stdout is swapped for a no-op RecycleObject sink; try/finally
    guarantees the real stdout is restored even when the body raises
    (the original left stdout swallowed on exception).
    """
    savestdout = sys.stdout
    sys.stdout = RecycleObject()
    try:
        yield
    finally:
        sys.stdout = savestdout
182 |
def weekday(day):
    """Map an English day name to its ISO weekday number (Monday=1 … Sunday=7).

    Raises KeyError for any other string.
    """
    return {
        "Monday": 1,
        "Tuesday": 2,
        "Wednesday": 3,
        "Thursday": 4,
        "Friday": 5,
        "Saturday": 6,
        "Sunday": 7,
    }[day]
195 |
def Tweet(Tweet, config):
    """Index a single scraped tweet into Elasticsearch.

    Builds one bulk action from the Tweet object, lazily creating the
    tweets index on first use.  Optional fields (retweet info, media,
    locations, translation) are added only when present.
    """
    global _index_tweet_status
    global _is_near_def
    date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z")

    actions = []

    # Some scrape paths produce tweets without a retweet attribute at all.
    try:
        retweet = Tweet.retweet
    except AttributeError:
        retweet = None

    dt = f"{Tweet.datestamp} {Tweet.timestamp}"

    j_data = {
        "_index": config.Index_tweets,
        "_id": str(Tweet.id) + "_raw_" + config.Essid,
        "_source": {
            "id": str(Tweet.id),
            "conversation_id": Tweet.conversation_id,
            "created_at": Tweet.datetime,
            "date": dt,
            "timezone": Tweet.timezone,
            "place": Tweet.place,
            "tweet": Tweet.tweet,
            "language": Tweet.lang,
            "hashtags": Tweet.hashtags,
            "cashtags": Tweet.cashtags,
            "user_id_str": Tweet.user_id_str,
            "username": Tweet.username,
            "name": Tweet.name,
            "day": date_obj.weekday(),
            "hour": date_obj.hour,
            "link": Tweet.link,
            "retweet": retweet,
            "essid": config.Essid,
            "nlikes": int(Tweet.likes_count),
            "nreplies": int(Tweet.replies_count),
            "nretweets": int(Tweet.retweets_count),
            "quote_url": Tweet.quote_url,
            "video": Tweet.video,
            "search": str(config.Search),
            "near": config.Near
        }
    }
    if retweet is not None:
        j_data["_source"].update({
            "user_rt_id": Tweet.user_rt_id,
            "user_rt": Tweet.user_rt,
            "retweet_id": Tweet.retweet_id,
            "retweet_date": Tweet.retweet_date,
        })
    if Tweet.reply_to:
        j_data["_source"].update({"reply_to": Tweet.reply_to})
    if Tweet.photos:
        j_data["_source"].update({"photos": list(Tweet.photos)})
    if Tweet.thumbnail:
        j_data["_source"].update({"thumbnail": Tweet.thumbnail})
    if Tweet.mentions:
        j_data["_source"].update({"mentions": list(Tweet.mentions)})
    if Tweet.urls:
        j_data["_source"].update({"urls": list(Tweet.urls)})
    if config.Near or config.Geo:
        # Geocode the configured search location only once per run; the
        # result is cached in the module-level _near by getLocation().
        if not _is_near_def:
            __geo = config.Geo if config.Geo else ""
            __near = config.Near if config.Near else ""
            _is_near_def = getLocation(__near + __geo, near=True)
        if _near:
            j_data["_source"].update({"geo_near": _near})
    if Tweet.place:
        # Geocode the tweet's own place once and reuse the result
        # (previously getLocation() was called twice per tweet).
        _t_place = getLocation(Tweet.place)
        if _t_place:
            j_data["_source"].update({"geo_tweet": _t_place})
    if Tweet.source:
        # Bug fix: the attribute set by twint.tweet is `source` (lowercase);
        # `Tweet.Source` raised AttributeError whenever --source was used.
        j_data["_source"].update({"source": Tweet.source})
    if config.Translate:
        j_data["_source"].update({
            "translate": Tweet.translate,
            "trans_src": Tweet.trans_src,
            "trans_dest": Tweet.trans_dest,
        })

    actions.append(j_data)

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_tweet_status:
        _index_tweet_status = createIndex(config, es, scope="tweet")
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
295 |
def Follow(user, config):
    """Index one follower/following relation into Elasticsearch."""
    global _index_follow_status

    # Orient the edge depending on which relation was scraped.
    if config.Following:
        source_user, target_user = config.Username, user
    else:
        source_user, target_user = user, config.Username

    doc = {
        "_index": config.Index_follow,
        "_id": f"{source_user}_{target_user}_{config.Essid}",
        "_source": {
            "user": source_user,
            "follow": target_user,
            "essid": config.Essid
        }
    }

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_follow_status:
        _index_follow_status = createIndex(config, es, scope="follow")
    with nostdout():
        helpers.bulk(es, [doc], chunk_size=2000, request_timeout=200)
323 |
def UserProfile(user, config):
    """Index one scraped user profile into Elasticsearch."""
    global _index_user_status
    global _is_location_def

    doc = {
        "_index": config.Index_users,
        "_id": "_".join((user.id, user.join_date, user.join_time, config.Essid)),
        "_source": {
            "id": user.id,
            "name": user.name,
            "username": user.username,
            "bio": user.bio,
            "location": user.location,
            "url": user.url,
            "join_datetime": user.join_date + " " + user.join_time,
            "tweets": user.tweets,
            "following": user.following,
            "followers": user.followers,
            "likes": user.likes,
            "media": user.media_count,
            "private": user.is_private,
            "verified": user.is_verified,
            "avatar": user.avatar,
            "background_image": user.background_image,
            "session": config.Essid
        }
    }
    if config.Location:
        # Geocode the profile location only once per run; getLocation()
        # caches the result in the module-level _location.
        if not _is_location_def:
            _is_location_def = getLocation(user.location, location=True)
        if _location:
            doc["_source"]["geo_user"] = _location

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_user_status:
        _index_user_status = createIndex(config, es, scope="user")
    with nostdout():
        helpers.bulk(es, [doc], chunk_size=2000, request_timeout=200)
365 |
--------------------------------------------------------------------------------
/twint/storage/panda.py:
--------------------------------------------------------------------------------
1 | import datetime, pandas as pd, warnings
2 | from time import strftime, localtime
3 | from twint.tweet import Tweet_formats
4 |
# Lazily built DataFrames; populated by _autoget() and reset by clean().
Tweets_df = None
Follow_df = None
User_df = None

# Row buffers: update() appends serialized objects here, _concat() turns
# a buffer into a DataFrame when the caller asks for one.
_object_blocks = {
    "tweet": [],
    "user": [],
    "following": [],
    "followers": []
}

# ISO-style weekday numbering (Monday=1 … Sunday=7) keyed by English name.
weekdays = {
    "Monday": 1,
    "Tuesday": 2,
    "Wednesday": 3,
    "Thursday": 4,
    "Friday": 5,
    "Saturday": 6,
    "Sunday": 7,
}

# Kind of the object most recently passed to update() ("tweet", "user",
# "following" or "followers").
_type = ""
27 |
def _concat(df, _type):
    """Turn the buffered rows of `_type` into a DataFrame, appending to
    `df` when one already exists."""
    block = pd.DataFrame(_object_blocks[_type])
    return block if df is None else pd.concat([df, block], sort=True)
35 |
36 | def _autoget(_type):
37 | global Tweets_df
38 | global Follow_df
39 | global User_df
40 |
41 | if _type == "tweet":
42 | Tweets_df = _concat(Tweets_df, _type)
43 | elif _type == "followers" or _type == "following":
44 | Follow_df = _concat(Follow_df, _type)
45 | elif _type == "user":
46 | User_df = _concat(User_df, _type)
47 | else:
48 | error("[x] Wrong type of object passed")
49 |
50 |
def update(object, config):
    """Buffer one scraped object (tweet / user / follow dict) into
    _object_blocks for later DataFrame construction via _autoget().

    The detected kind is stored in the module-global _type so _autoget()
    knows which buffer to flush.
    """
    global _type

    if object.__class__.__name__ == "tweet":
        _type = "tweet"
    elif object.__class__.__name__ == "user":
        _type = "user"
    elif object.__class__.__name__ == "dict":
        # Follow results arrive as a plain dict; exactly one of
        # config.Following / config.Followers is expected to be truthy, so
        # the string multiplication selects the matching label.
        _type = config.Following*"following" + config.Followers*"followers"

    if _type == "tweet":
        Tweet = object
        # Epoch milliseconds; divided back by 1000 below for localtime().
        datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
        day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
        dt = f"{object.datestamp} {object.timestamp}"
        _data = {
            "id": str(Tweet.id),
            "conversation_id": Tweet.conversation_id,
            "created_at": datetime_ms,
            "date": dt,
            "timezone": Tweet.timezone,
            "place": Tweet.place,
            "tweet": Tweet.tweet,
            "language": Tweet.lang,
            "hashtags": Tweet.hashtags,
            "cashtags": Tweet.cashtags,
            "user_id": Tweet.user_id,
            "user_id_str": Tweet.user_id_str,
            "username": Tweet.username,
            "name": Tweet.name,
            "day": day,
            "hour": strftime("%H", localtime(datetime_ms/1000)),
            "link": Tweet.link,
            "urls": Tweet.urls,
            "photos": Tweet.photos,
            "video": Tweet.video,
            "thumbnail": Tweet.thumbnail,
            "retweet": Tweet.retweet,
            "nlikes": int(Tweet.likes_count),
            "nreplies": int(Tweet.replies_count),
            "nretweets": int(Tweet.retweets_count),
            "quote_url": Tweet.quote_url,
            "search": str(config.Search),
            "near": Tweet.near,
            "geo": Tweet.geo,
            "source": Tweet.source,
            "user_rt_id": Tweet.user_rt_id,
            "user_rt": Tweet.user_rt,
            "retweet_id": Tweet.retweet_id,
            "reply_to": Tweet.reply_to,
            "retweet_date": Tweet.retweet_date,
            "translate": Tweet.translate,
            "trans_src": Tweet.trans_src,
            "trans_dest": Tweet.trans_dest
        }
        _object_blocks[_type].append(_data)
    elif _type == "user":
        user = object
        # Some profile payloads lack a banner image; default to "".
        try:
            background_image = user.background_image
        except:
            background_image = ""
        _data = {
            "id": user.id,
            "name": user.name,
            "username": user.username,
            "bio": user.bio,
            "url": user.url,
            "join_datetime": user.join_date + " " + user.join_time,
            "join_date": user.join_date,
            "join_time": user.join_time,
            "tweets": user.tweets,
            "location": user.location,
            "following": user.following,
            "followers": user.followers,
            "likes": user.likes,
            "media": user.media_count,
            "private": user.is_private,
            "verified": user.is_verified,
            "avatar": user.avatar,
            "background_image": background_image,
        }
        _object_blocks[_type].append(_data)
    elif _type == "followers" or _type == "following":
        # Single {relation: {username: names}} mapping; note this REPLACES
        # the buffer for this relation rather than appending to it.
        _data = {
            config.Following*"following" + config.Followers*"followers" :
            {config.Username: object[_type]}
        }
        _object_blocks[_type] = _data
    else:
        print("Wrong type of object passed!")
147 |
148 |
def clean():
    """Reset every row buffer and drop the cached DataFrames."""
    global Tweets_df
    global Follow_df
    global User_df
    for kind in ("tweet", "following", "followers", "user"):
        _object_blocks[kind].clear()
    Tweets_df = None
    Follow_df = None
    User_df = None
160 |
def save(_filename, _dataframe, **options):
    """Persist a DataFrame to disk.

    Default format is HDF5 (``<filename>.h5`` under key ``dataname``,
    "twint" by default); ``type="Pickle"`` writes ``<filename>.pkl``.
    """
    _dataname = options.get("dataname") or "twint"
    fmt = options.get("type")

    if not fmt:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _store = pd.HDFStore(_filename + ".h5")
            _store[_dataname] = _dataframe
            _store.close()
    elif fmt == "Pickle":
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _dataframe.to_pickle(_filename + ".pkl")
    else:
        print("""Please specify: filename, DataFrame, DataFrame name and type
              (HDF5, default, or Pickle)""")
180 |
def read(_filename, **options):
    """Load a DataFrame previously written by save().

    Default format is HDF5 (``<filename>.h5`` under key ``dataname``,
    "twint" by default); ``type="Pickle"`` reads ``<filename>.pkl``.
    """
    _dataname = options.get("dataname") or "twint"
    fmt = options.get("type")

    if not fmt:
        _store = pd.HDFStore(_filename + ".h5")
        return _store[_dataname]
    elif fmt == "Pickle":
        return pd.read_pickle(_filename + ".pkl")
    else:
        print("""Please specify: DataFrame, DataFrame name (twint as default),
          filename and type (HDF5, default, or Pickle""")
197 |
--------------------------------------------------------------------------------
/twint/storage/write.py:
--------------------------------------------------------------------------------
1 | from . import write_meta as meta
2 | import csv
3 | import json
4 | import os
5 |
def outputExt(objType, fType):
    """Default output filename component for an object/file type,
    e.g. ("tweet", "csv") -> "/tweets.csv"."""
    kind = "username" if objType == "str" else objType
    return f"/{kind}s.{fType}"
12 |
def addExt(base, objType, fType):
    """If `base` carries no extension, treat it as a directory: create it
    and append the default filename for this object/file type."""
    if '.' not in base:
        createDirIfMissing(base)
        base += outputExt(objType, fType)
    return base
19 |
def Text(entry, f):
    """Append `entry` (newlines flattened to spaces) as one line to file `f`.

    Uses a context manager so the handle is closed deterministically —
    the original opened the file inline in print() and leaked the handle.
    """
    with open(f, "a", encoding="utf-8") as out:
        print(entry.replace('\n', ' '), file=out)
22 |
def Type(config):
    """Kind of object being written ("user", "username" or "tweet"),
    derived from the run configuration."""
    if config.User_full:
        return "user"
    if config.Followers or config.Following:
        return "username"
    return "tweet"
32 |
def struct(obj, custom, _type):
    """Return (fieldnames, row) for writing `obj` of kind `_type`.

    When `custom` lists field names, only those fields are emitted, in
    that order; otherwise the full default field set is used.
    """
    if custom:
        fieldnames = custom
        # Serialize the object once, not once per requested field
        # (the original called meta.Data() inside the loop).
        data = meta.Data(obj, _type)
        row = {f: data[f] for f in fieldnames}
    else:
        fieldnames = meta.Fieldnames(_type)
        row = meta.Data(obj, _type)

    return fieldnames, row
44 |
def createDirIfMissing(dirname):
    """Create `dirname` (with parents) unless something already exists
    at that path."""
    if os.path.exists(dirname):
        return
    os.makedirs(dirname)
48 |
def Csv(obj, config):
    """Append one object as a CSV row, writing the header row when the
    file is first created.  Tab dialect is used when config has Tabs."""
    kind = obj.__class__.__name__
    if kind == "str":
        kind = "username"
    fieldnames, row = struct(obj, config.Custom[kind], kind)

    path = addExt(config.Output, kind, "csv")
    dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel'

    if not os.path.exists(path):
        with open(path, "w", newline='', encoding="utf-8") as csv_file:
            csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect).writeheader()

    with open(path, "a", newline='', encoding="utf-8") as csv_file:
        csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect).writerow(row)
66 |
def Json(obj, config):
    """Append one object as a single JSON line to the output file."""
    kind = obj.__class__.__name__
    if kind == "str":
        kind = "username"
    _, data = struct(obj, config.Custom[kind], kind)

    path = addExt(config.Output, kind, "json")

    with open(path, "a", newline='', encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False)
        json_file.write("\n")
78 |
--------------------------------------------------------------------------------
/twint/storage/write_meta.py:
--------------------------------------------------------------------------------
def tweetData(t):
    """Serialize a tweet object into a flat dict for CSV/JSON output."""
    return dict(
        id=int(t.id),
        conversation_id=t.conversation_id,
        created_at=t.datetime,
        date=t.datestamp,
        time=t.timestamp,
        timezone=t.timezone,
        user_id=t.user_id,
        username=t.username,
        name=t.name,
        place=t.place,
        tweet=t.tweet,
        language=t.lang,
        mentions=t.mentions,
        urls=t.urls,
        photos=t.photos,
        replies_count=int(t.replies_count),
        retweets_count=int(t.retweets_count),
        likes_count=int(t.likes_count),
        hashtags=t.hashtags,
        cashtags=t.cashtags,
        link=t.link,
        retweet=t.retweet,
        quote_url=t.quote_url,
        video=t.video,
        thumbnail=t.thumbnail,
        near=t.near,
        geo=t.geo,
        source=t.source,
        user_rt_id=t.user_rt_id,
        user_rt=t.user_rt,
        retweet_id=t.retweet_id,
        reply_to=t.reply_to,
        retweet_date=t.retweet_date,
        translate=t.translate,
        trans_src=t.trans_src,
        trans_dest=t.trans_dest,
    )
41 |
def tweetFieldnames():
    """Column order for tweet CSV output (matches tweetData's keys)."""
    return ("id conversation_id created_at date time timezone user_id "
            "username name place tweet language mentions urls photos "
            "replies_count retweets_count likes_count hashtags cashtags "
            "link retweet quote_url video thumbnail near geo source "
            "user_rt_id user_rt retweet_id reply_to retweet_date "
            "translate trans_src trans_dest").split()
82 |
def userData(u):
    """Serialize a user object into a flat dict for CSV/JSON output."""
    return dict(
        id=int(u.id),
        name=u.name,
        username=u.username,
        bio=u.bio,
        location=u.location,
        url=u.url,
        join_date=u.join_date,
        join_time=u.join_time,
        tweets=int(u.tweets),
        following=int(u.following),
        followers=int(u.followers),
        likes=int(u.likes),
        media=int(u.media_count),
        private=u.is_private,
        verified=u.is_verified,
        profile_image_url=u.avatar,
        background_image=u.background_image,
    )
104 |
def userFieldnames():
    """Column order for user CSV output (matches userData's keys)."""
    return ("id name username bio location url join_date join_time tweets "
            "following followers likes media private verified "
            "profile_image_url background_image").split()
126 |
def usernameData(u):
    """Wrap a bare username string in the writer's row format."""
    return dict(username=u)
129 |
def usernameFieldnames():
    """Single output column used for follower/following lists."""
    return ["username"]
132 |
def Data(obj, _type):
    """Serialize `obj` according to its kind; tweets are the default."""
    if _type == "user":
        return userData(obj)
    if _type == "username":
        return usernameData(obj)
    return tweetData(obj)
142 |
def Fieldnames(_type):
    """Output column list for a given kind; tweets are the default."""
    if _type == "user":
        return userFieldnames()
    if _type == "username":
        return usernameFieldnames()
    return tweetFieldnames()
152 |
--------------------------------------------------------------------------------
/twint/token.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 |
4 | import requests
5 | import logging as logme
6 |
7 |
class TokenExpiryException(Exception):
    """Raised when the guest token has expired and must be refreshed."""
    # Exception.__init__ already stores the message; the forwarding
    # __init__ the original defined added nothing.
11 |
12 |
class RefreshTokenException(Exception):
    """Raised when a new guest token could not be obtained."""
    # Exception.__init__ already stores the message; the forwarding
    # __init__ the original defined added nothing.
16 |
17 |
class Token:
    """Obtains the Twitter guest token used for anonymous API requests.

    Fetches twitter.com, scrapes the token from the HTML, and falls back
    to the guest/activate.json endpoint; retries with exponential back-off.
    """

    def __init__(self, config):
        # One session so cookies (e.g. ct0) persist between the HTML fetch
        # and the activate.json fallback in refresh().
        self._session = requests.Session()
        self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'})
        self.config = config
        self._retries = 5       # retry attempts after the first request
        self._timeout = 10      # per-request timeout, seconds
        self.url = 'https://twitter.com'

    def _request(self):
        """GET self.url with retries; returns the response or raises
        RefreshTokenException after exhausting all attempts."""
        for attempt in range(self._retries + 1):
            # The request is newly prepared on each retry because of potential cookie updates.
            req = self._session.prepare_request(requests.Request('GET', self.url))
            logme.debug(f'Retrieving {req.url}')
            try:
                r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
            except requests.exceptions.RequestException as exc:
                if attempt < self._retries:
                    retrying = ', retrying'
                    level = logme.WARNING
                else:
                    retrying = ''
                    level = logme.ERROR
                logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
            else:
                # NOTE(review): success is always True and msg always ''
                # here — this branch returns unconditionally; the variables
                # look like remnants of a removed validity check.
                success, msg = (True, None)
                msg = f': {msg}' if msg else ''

                if success:
                    logme.debug(f'{req.url} retrieved successfully{msg}')
                    return r
            if attempt < self._retries:
                # TODO : might wanna tweak this back-off timer
                sleep_time = 2.0 * 2 ** attempt
                logme.info(f'Waiting {sleep_time:.0f} seconds')
                time.sleep(sleep_time)
            else:
                msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
                logme.fatal(msg)
                self.config.Guest_token = None
                raise RefreshTokenException(msg)

    def refresh(self):
        """Store a fresh guest token in self.config.Guest_token.

        Tries the token embedded in the twitter.com HTML first, then the
        api.twitter.com guest/activate.json endpoint; raises
        RefreshTokenException when neither yields a token.
        """
        logme.debug('Retrieving guest token')
        res = self._request()
        match = re.search(r'\("gt=(\d+);', res.text)
        if match:
            logme.debug('Found guest token in HTML')
            self.config.Guest_token = str(match.group(1))
        else:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
                'authority': 'api.twitter.com',
                'content-length': '0',
                'authorization': self.config.Bearer_token,
                'x-twitter-client-language': 'en',
                'x-csrf-token': res.cookies.get("ct0"),
                'x-twitter-active-user': 'yes',
                'content-type': 'application/x-www-form-urlencoded',
                'accept': '*/*',
                'sec-gpc': '1',
                'origin': 'https://twitter.com',
                'sec-fetch-site': 'same-site',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://twitter.com/',
                'accept-language': 'en-US',
            }
            self._session.headers.update(headers)
            req = self._session.prepare_request(requests.Request('POST', 'https://api.twitter.com/1.1/guest/activate.json'))
            res = self._session.send(req, allow_redirects=True, timeout=self._timeout)
            if 'guest_token' in res.json():
                logme.debug('Found guest token in JSON')
                self.config.Guest_token = res.json()['guest_token']
            else:
                self.config.Guest_token = None
                raise RefreshTokenException('Could not find the Guest token in HTML')
95 |
--------------------------------------------------------------------------------
/twint/tweet.py:
--------------------------------------------------------------------------------
1 | from time import strftime, localtime
2 | from datetime import datetime, timezone
3 |
4 | import logging as logme
5 | from googletransx import Translator
6 | # ref.
7 | # - https://github.com/x0rzkov/py-googletrans#basic-usage
8 | translator = Translator()
9 |
10 |
class tweet:
    """Bare attribute container for one scraped tweet.

    Attributes are assigned dynamically by Tweet() below; the class only
    tags instances with type = "tweet" so writers can dispatch on kind.
    The original's empty __init__ was redundant and has been removed.
    """
    type = "tweet"
18 |
19 |
def utc_to_local(utc_dt):
    """Reinterpret a naive datetime as UTC and convert it to local time."""
    aware = utc_dt.replace(tzinfo=timezone.utc)
    return aware.astimezone(tz=None)
22 |
23 |
# strftime/strptime patterns used for all user-facing tweet timestamps.
Tweet_formats = {
    'datetime': '%Y-%m-%d %H:%M:%S %Z',
    'datestamp': '%Y-%m-%d',
    'timestamp': '%H:%M:%S'
}
29 |
30 |
31 | def _get_mentions(tw):
32 | """Extract mentions from tweet
33 | """
34 | logme.debug(__name__ + ':get_mentions')
35 | try:
36 | mentions = [
37 | {
38 | 'screen_name': _mention['screen_name'],
39 | 'name': _mention['name'],
40 | 'id': _mention['id_str'],
41 | } for _mention in tw['entities']['user_mentions']
42 | if tw['display_text_range'][0] < _mention['indices'][0]
43 | ]
44 | except KeyError:
45 | mentions = []
46 | return mentions
47 |
48 |
49 | def _get_reply_to(tw):
50 | try:
51 | reply_to = [
52 | {
53 | 'screen_name': _mention['screen_name'],
54 | 'name': _mention['name'],
55 | 'id': _mention['id_str'],
56 | } for _mention in tw['entities']['user_mentions']
57 | if tw['display_text_range'][0] > _mention['indices'][1]
58 | ]
59 | except KeyError:
60 | reply_to = []
61 | return reply_to
62 |
63 |
def getText(tw, config):
    """Return the tweet text; unless Full_text is set, pad links with a
    leading space and flatten newlines so word-splitting output survives."""
    logme.debug(__name__ + ':getText')
    text = tw['full_text']
    if config.Full_text:
        return text
    for needle, replacement in (("http", " http"),
                                ("pic.twitter", " pic.twitter"),
                                ("\n", " ")):
        text = text.replace(needle, replacement)
    return text
75 |
76 |
def Tweet(tw, config):
    """Build a tweet object from a raw API payload dict.

    Missing optional payload keys (media, entities, retweet data) fall
    back to empty defaults; timestamps are rendered in local time unless
    config.Utc is set.  Raises on an invalid translation destination.
    """
    logme.debug(__name__ + ':Tweet')
    t = tweet()
    t.id = int(tw['id_str'])
    t.id_str = tw["id_str"]
    t.conversation_id = tw["conversation_id_str"]

    # parsing date to user-friendly format
    _dt = tw['created_at']
    _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
    if not config.Utc:
        _dt = utc_to_local(_dt)
    t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
    # date is of the format year,
    t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
    t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
    t.user_id = int(tw["user_id_str"])
    t.user_id_str = tw["user_id_str"]
    t.username = tw["user_data"]['screen_name']
    t.name = tw["user_data"]['name']
    t.place = tw['geo'] if 'geo' in tw and tw['geo'] else ""
    t.timezone = _dt.strftime("%z")
    t.mentions = _get_mentions(tw)
    t.reply_to = _get_reply_to(tw)
    try:
        t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
    except KeyError:
        t.urls = []
    try:
        # Only photo-type media whose expanded URL is a /photo/ permalink.
        t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
                    _img['expanded_url'].find('/photo/') != -1]
    except KeyError:
        t.photos = []
    try:
        t.video = 1 if len(tw['extended_entities']['media']) else 0
    except KeyError:
        t.video = 0
    try:
        t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
    except KeyError:
        t.thumbnail = ''
    t.tweet = getText(tw, config)
    t.lang = tw['lang']
    try:
        t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
    except KeyError:
        t.hashtags = []
    try:
        t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
    except KeyError:
        t.cashtags = []
    t.replies_count = tw['reply_count']
    t.retweets_count = tw['retweet_count']
    t.likes_count = tw['favorite_count']
    t.link = f"https://twitter.com/{t.username}/status/{t.id}"
    # NOTE(review): if 'retweet_data' exists but lacks 'user_rt_id', the
    # retweet attributes are never assigned — presumably the feed always
    # populates them together; confirm against feed.py.
    try:
        if 'user_rt_id' in tw['retweet_data']:
            t.retweet = True
            t.retweet_id = tw['retweet_data']['retweet_id']
            t.retweet_date = tw['retweet_data']['retweet_date']
            t.user_rt = tw['retweet_data']['user_rt']
            t.user_rt_id = tw['retweet_data']['user_rt_id']
    except KeyError:
        t.retweet = False
        t.retweet_id = ''
        t.retweet_date = ''
        t.user_rt = ''
        t.user_rt_id = ''
    try:
        t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
    except KeyError:
        # means that the quoted tweet have been deleted
        t.quote_url = 0
    t.near = config.Near if config.Near else ""
    t.geo = config.Geo if config.Geo else ""
    t.source = config.Source if config.Source else ""
    t.translate = ''
    t.trans_src = ''
    t.trans_dest = ''
    if config.Translate:
        try:
            ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
            t.translate = ts.text
            t.trans_src = ts.src
            t.trans_dest = ts.dest
        # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
        except ValueError as e:
            logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
            raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
    return t
169 |
--------------------------------------------------------------------------------
/twint/url.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import json
3 | from sys import platform
4 | import logging as logme
5 | from urllib.parse import urlencode
6 | from urllib.parse import quote
7 |
# Endpoints: the legacy mobile site (profile/follow pages) and the
# adaptive search API used for full-text queries.
mobile = "https://mobile.twitter.com"
base = "https://api.twitter.com/2/search/adaptive.json"
10 |
11 |
12 | def _sanitizeQuery(_url, params):
13 | _serialQuery = ""
14 | _serialQuery = urlencode(params, quote_via=quote)
15 | _serialQuery = _url + "?" + _serialQuery
16 | return _serialQuery
17 |
18 |
19 | def _formatDate(date):
20 | if "win" in platform:
21 | return f'\"{date.split()[0]}\"'
22 | try:
23 | return int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp())
24 | except ValueError:
25 | return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
26 |
27 |
async def Favorites(username, init):
    """Build the mobile-site favorites page URL; '-1' means first page."""
    logme.debug(__name__ + ':Favorites')
    page_url = f"{mobile}/{username}/favorites?lang=en"
    return page_url if init == '-1' else f"{page_url}&max_id={init}"
36 |
37 |
async def Followers(username, init):
    """Build the mobile-site followers page URL; '-1' means first page."""
    logme.debug(__name__ + ':Followers')
    page_url = f"{mobile}/{username}/followers?lang=en"
    return page_url if init == '-1' else f"{page_url}&cursor={init}"
46 |
47 |
async def Following(username, init):
    """Build the mobile-site following page URL; '-1' means first page."""
    logme.debug(__name__ + ':Following')
    page_url = f"{mobile}/{username}/following?lang=en"
    return page_url if init == '-1' else f"{page_url}&cursor={init}"
56 |
57 |
async def MobileProfile(username, init):
    """Build the mobile-site profile page URL; '-1' means first page."""
    logme.debug(__name__ + ':MobileProfile')
    page_url = f"{mobile}/{username}?lang=en"
    return page_url if init == '-1' else f"{page_url}&max_id={init}"
66 |
67 |
async def Search(config, init):
    """Build the adaptive-search API request for the configured query.

    Returns (url, params, serialized_url); `init` is the pagination
    cursor ('-1' for the first page).  Search operators from the config
    are concatenated into the `q` parameter.
    """
    logme.debug(__name__ + ':Search')
    url = base
    tweet_count = 100
    q = ""
    # Fixed parameters expected by the adaptive.json endpoint; the
    # commented-out entries are known flags deliberately left disabled.
    params = [
        # ('include_blocking', '1'),
        # ('include_blocked_by', '1'),
        # ('include_followed_by', '1'),
        # ('include_want_retweets', '1'),
        # ('include_mute_edge', '1'),
        # ('include_can_dm', '1'),
        ('include_can_media_tag', '1'),
        # ('skip_status', '1'),
        # ('include_cards', '1'),
        ('include_ext_alt_text', 'true'),
        ('include_quote_count', 'true'),
        ('include_reply_count', '1'),
        ('tweet_mode', 'extended'),
        ('include_entities', 'true'),
        ('include_user_entities', 'true'),
        ('include_ext_media_availability', 'true'),
        ('send_error_codes', 'true'),
        ('simple_quoted_tweet', 'true'),
        ('count', tweet_count),
        ('query_source', 'typed_query'),
        # ('pc', '1'),
        ('cursor', str(init)),
        ('spelling_corrections', '1'),
        ('ext', 'mediaStats%2ChighlightedLabel'),
        ('tweet_search_mode', 'live'),  # this can be handled better, maybe take an argument and set it then
    ]
    if not config.Popular_tweets:
        params.append(('f', 'tweets'))
    if config.Lang:
        params.append(("l", config.Lang))
        params.append(("lang", "en"))
    if config.Query:
        q += f" from:{config.Query}"
    if config.Username:
        q += f" from:{config.Username}"
    if config.Geo:
        config.Geo = config.Geo.replace(" ", "")
        q += f" geocode:{config.Geo}"
    if config.Search:

        q += f" {config.Search}"
    if config.Year:
        q += f" until:{config.Year}-1-1"
    if config.Since:
        q += f" since:{_formatDate(config.Since)}"
    if config.Until:
        q += f" until:{_formatDate(config.Until)}"
    if config.Email:
        q += ' "mail" OR "email" OR'
        q += ' "gmail" OR "e-mail"'
    if config.Phone:
        q += ' "phone" OR "call me" OR "text me"'
    if config.Verified:
        q += " filter:verified"
    if config.To:
        q += f" to:{config.To}"
    if config.All:
        q += f" to:{config.All} OR from:{config.All} OR @{config.All}"
    if config.Near:
        q += f' near:"{config.Near}"'
    if config.Images:
        q += " filter:images"
    if config.Videos:
        q += " filter:videos"
    if config.Media:
        q += " filter:media"
    if config.Replies:
        q += " filter:replies"
    # although this filter can still be used, but I found it broken in my preliminary testing, needs more testing
    if config.Native_retweets:
        q += " filter:nativeretweets"
    if config.Min_likes:
        q += f" min_faves:{config.Min_likes}"
    if config.Min_retweets:
        q += f" min_retweets:{config.Min_retweets}"
    if config.Min_replies:
        q += f" min_replies:{config.Min_replies}"
    if config.Links == "include":
        q += " filter:links"
    elif config.Links == "exclude":
        q += " exclude:links"
    if config.Source:
        q += f" source:\"{config.Source}\""
    if config.Members_list:
        q += f" list:{config.Members_list}"
    if config.Filter_retweets:
        q += f" exclude:nativeretweets exclude:retweets"
    # Custom_query overrides everything assembled above.
    if config.Custom_query:
        q = config.Custom_query

    q = q.strip()
    params.append(("q", q))
    _serialQuery = _sanitizeQuery(url, params)
    return url, params, _serialQuery
168 |
169 |
def SearchProfile(config, init=None):
    """Build the GraphQL UserTweetsAndReplies request for a profile timeline.

    :param config: twint config object; reads ``config.User_id``.
    :param init: pagination cursor string from a previous response, or None
        for the first page.
    :return: tuple ``(url, params, serial_query)`` — the fully serialized
        query is returned as both the url and the serial form, with an empty
        params list, since everything is encoded into the query string.
    """
    logme.debug(__name__ + ':SearchProfile')
    _url = 'https://twitter.com/i/api/graphql/CwLU7qTfeu0doqhSr6tW4A/UserTweetsAndReplies'
    tweet_count = 100
    variables = {
        "userId": config.User_id,
        "count": tweet_count,
        "includePromotedContent": True,
        "withCommunity": True,
        "withSuperFollowsUserFields": True,
        "withBirdwatchPivots": False,
        "withDownvotePerspective": False,
        "withReactionsMetadata": False,
        "withReactionsPerspective": False,
        "withSuperFollowsTweetFields": True,
        "withVoice": True,
        "withV2Timeline": False,
        "__fs_interactive_text": False,
        "__fs_dont_mention_me_view_api_enabled": False,
    }
    # isinstance is the idiomatic type check (was `type(init) == str`);
    # a non-string init (e.g. the first-page default) means no cursor yet
    if isinstance(init, str):
        variables['cursor'] = init
    # compact separators keep the URL-encoded JSON payload as small as possible
    params = [('variables', json.dumps(variables, separators=(',', ':')))]

    _serialQuery = _sanitizeQuery(_url, params)
    return _serialQuery, [], _serialQuery
196 |
--------------------------------------------------------------------------------
/twint/user.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging as logme
3 |
4 |
class user:
    """Plain attribute container for a scraped Twitter profile.

    Instances start empty; the ``User`` parser fills in fields
    (id, name, username, bio, counters, ...) one by one.
    """

    # record-type tag consumed by the output/storage layers
    type = "user"

    def __init__(self):
        pass
10 |
11 |
# strftime patterns used to render a profile's creation timestamp
# into the user-facing join_date / join_time fields
User_formats = dict(
    join_date='%Y-%m-%d',
    join_time='%H:%M:%S %Z',
)
16 |
17 |
# ur object must be a json from the endpoint https://api.twitter.com/graphql
def User(ur):
    """Parse a GraphQL user response into a ``user`` object.

    :param ur: decoded JSON dict expected to contain ``data.user``.
    :return: a populated ``user`` instance.
    :raises KeyError: if the expected ``data.user`` structure is missing.
    """
    logme.debug(__name__ + ':User')
    # BUG FIX: was `and`, which — when 'data' was missing — still evaluated
    # ur['data'] in the second operand and raised a raw KeyError before this
    # explicit guard could fire. Either missing level makes the payload
    # unusable, hence `or`.
    if 'data' not in ur or 'user' not in ur['data']:
        msg = 'malformed json! cannot be parsed to get user data'
        logme.fatal(msg)
        raise KeyError(msg)
    _usr = user()
    _usr.id = ur['data']['user']['rest_id']
    # all profile fields of interest live under the 'legacy' sub-document;
    # hoist the lookup instead of repeating the full path on every line
    legacy = ur['data']['user']['legacy']
    _usr.name = legacy['name']
    _usr.username = legacy['screen_name']
    _usr.bio = legacy['description']
    _usr.location = legacy['location']
    # BUG FIX: the original re-assigned legacy['url'] unconditionally right
    # after its presence check, raising KeyError for profiles without a url.
    _usr.url = legacy.get('url', "")
    # parse Twitter's "%a %b %d %H:%M:%S %z %Y" timestamp and re-render it
    # in the user-friendly formats declared in User_formats
    _dt = legacy['created_at']
    _dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
    _usr.join_date = _dt.strftime(User_formats['join_date'])
    _usr.join_time = _dt.strftime(User_formats['join_time'])

    # counters — always coerced to `int`
    _usr.tweets = int(legacy['statuses_count'])
    _usr.following = int(legacy['friends_count'])
    _usr.followers = int(legacy['followers_count'])
    _usr.likes = int(legacy['favourites_count'])
    _usr.media_count = int(legacy['media_count'])

    _usr.is_private = legacy['protected']
    _usr.is_verified = legacy['verified']
    _usr.avatar = legacy['profile_image_url_https']
    # banner is optional; default to an empty string like the url field
    _usr.background_image = legacy.get('profile_banner_url', "")
    # TODO : future implementation
    # legacy_extended_profile is also available in some cases which can be used to get DOB of user
    return _usr
58 |
--------------------------------------------------------------------------------
/twint/verbose.py:
--------------------------------------------------------------------------------
def Count(count, config):
    """Print the end-of-run summary of how many items were collected.

    The wording depends on which mode ran (Followers / Following /
    Favorites / plain tweet search); a "from @username" suffix is appended
    whenever config.Username is set, regardless of the mode.
    """
    if config.Followers:
        detail = f"all {count} users who follow @{config.Username}"
    elif config.Following:
        detail = f"all {count} users who @{config.Username} follows"
    elif config.Favorites:
        detail = f"{count} Tweets that @{config.Username} liked"
    else:
        detail = f"{count} Tweets"
    suffix = f" from @{config.Username}" if config.Username else ""
    print("[+] Finished: Successfully collected " + detail + suffix + ".")
15 |
def Elastic(elasticsearch):
    """Announce the Elasticsearch instance being indexed to, when one is set."""
    if not elasticsearch:
        return
    print(f"[+] Indexing to Elasticsearch @ {elasticsearch}")
19 |
--------------------------------------------------------------------------------