├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE.md └── ISSUE_TEMPLATE │ └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── automate.py ├── elasticsearch ├── README.md ├── dashboard.json ├── index-follow.json ├── index-tweets.json ├── index-user.json └── visualizations.json ├── requirements.txt ├── setup.py ├── test.py └── twint ├── __init__.py ├── __version__.py ├── cli.py ├── config.py ├── datelock.py ├── feed.py ├── format.py ├── get.py ├── output.py ├── run.py ├── storage ├── __init__.py ├── db.py ├── elasticsearch.py ├── panda.py ├── write.py └── write_meta.py ├── token.py ├── tweet.py ├── url.py ├── user.py └── verbose.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | patreon: twintproject 3 | custom: paypal.me/noneprivacy 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Issue Template 2 | Please use this template! 3 | 4 | ### Initial Check 5 | > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks. 6 | 7 | >Make sure you've checked the following: 8 | 9 | - [] Python version is 3.6; 10 | - [] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`; 11 | - [] I have searched the issues and there are no duplicates of this issue/question/request. 12 | 13 | ### Command Ran 14 | >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue. 
15 | 16 | ### Description of Issue 17 | >Please use **as much detail as possible.** 18 | 19 | ### Environment Details 20 | >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal? 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Initial Check 2 | > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks. 3 | 4 | >Make sure you've checked the following: 5 | 6 | - [ ] Python version is 3.6; 7 | - [ ] Using the latest version of Twint; 8 | - [ ] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`; 9 | 10 | ### Command Ran 11 | >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue. 12 | 13 | ### Description of Issue 14 | >Please use **as much detail as possible.** 15 | 16 | ### Environment Details 17 | >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal? 
18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | tweets.db 6 | # C extensions 7 | *.so 8 | 9 | config.ini 10 | twint/storage/mysql.py 11 | 12 | # Node Dependency directories 13 | node_modules/ 14 | jspm_packages/ 15 | tests/ 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # dotenv 90 | .env 91 | 92 | # virtualenv 93 | .venv 94 | venv/ 95 | ENV/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | # output 111 | *.csv 
112 | *.json 113 | *.txt 114 | 115 | test_twint.py 116 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: bionic 2 | language: python 3 | python: 4 | - "3.6" 5 | - "3.7" 6 | - "3.8" 7 | - "nightly" 8 | matrix: 9 | allow_failures: 10 | - python: "nightly" 11 | - python: "3.8" 12 | install: 13 | - pip install -r requirements.txt 14 | script: 15 | - python test.py 16 | deploy: 17 | provider: pypi 18 | user: "codyzacharias" 19 | password: 20 | secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM= 21 | on: 22 | tags: true 23 | python: "3.7" 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-buster 2 | LABEL maintainer="codyzacharias@pm.me" 3 | 4 | WORKDIR /root 5 | 6 | RUN git clone --depth=1 https://github.com/twintproject/twint.git && \ 7 | cd /root/twint && \ 8 | pip3 install . 
-r requirements.txt 9 | 10 | CMD /bin/bash 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Cody Zacharias 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TWINT - Twitter Intelligence Tool 2 | ![2](https://i.imgur.com/iaH3s7z.png) 3 | ![3](https://i.imgur.com/hVeCrqL.png) 4 | 5 | [![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/twintproject/twint.svg?branch=master)](https://travis-ci.org/twintproject/twint) [![Python 3.6|3.7|3.8](https://img.shields.io/badge/Python-3.6%2F3.7%2F3.8-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/twint)](https://pepy.tech/project/twint) [![Downloads](https://pepy.tech/badge/twint/week)](https://pepy.tech/project/twint/week) [![Patreon](https://img.shields.io/endpoint.svg?url=https:%2F%2Fshieldsio-patreon.herokuapp.com%2Ftwintproject)](https://www.patreon.com/twintproject) ![](https://img.shields.io/twitter/follow/noneprivacy.svg?label=Follow&style=social) 6 | 7 | >No authentication. No API. No limits. 8 | 9 | Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API. 10 | 11 | Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail and phone numbers. I find this very useful, and you can get really creative with it too. 
12 | 13 | Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation. 14 | 15 | ## tl;dr Benefits 16 | Some of the benefits of using Twint vs Twitter API: 17 | - Can fetch almost __all__ Tweets (Twitter API limits to last 3200 Tweets only); 18 | - Fast initial setup; 19 | - Can be used anonymously and without Twitter sign up; 20 | - **No rate limitations**. 21 | 22 | ## Limits imposed by Twitter 23 | Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets. 24 | 25 | ## Requirements 26 | - Python 3.6; 27 | - aiohttp; 28 | - aiodns; 29 | - beautifulsoup4; 30 | - cchardet; 31 | - dataclasses 32 | - elasticsearch; 33 | - pysocks; 34 | - pandas (>=0.23.0); 35 | - aiohttp_socks; 36 | - schedule; 37 | - geopy; 38 | - fake-useragent; 39 | - py-googletransx. 40 | 41 | ## Installing 42 | 43 | **Git:** 44 | ```bash 45 | git clone --depth=1 https://github.com/twintproject/twint.git 46 | cd twint 47 | pip3 install . -r requirements.txt 48 | ``` 49 | 50 | **Pip:** 51 | ```bash 52 | pip3 install twint 53 | ``` 54 | 55 | or 56 | 57 | ```bash 58 | pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint 59 | ``` 60 | 61 | **Pipenv**: 62 | ```bash 63 | pipenv install git+https://github.com/twintproject/twint.git#egg=twint 64 | ``` 65 | 66 | ### March 2, 2021 Update 67 | 68 | **Added**: Dockerfile 69 | 70 | Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them. 71 | 72 | ## CLI Basic Examples and Combos 73 | A few simple examples to help you understand the basics: 74 | 75 | - `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**). 
76 | - `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_. 77 | - `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets. 78 | - `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014. 79 | - `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15. 80 | - `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00. 81 | - `twint -u username -o file.txt` - Scrape Tweets and save to file.txt. 82 | - `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file. 83 | - `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses. 84 | - `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump. 85 | - `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file. 86 | - `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch 87 | - `twint -u username -o file.json --json` - Scrape Tweets and save as a json file. 88 | - `twint -u username --database tweets.db` - Save Tweets to a SQLite database. 89 | - `twint -u username --followers` - Scrape a Twitter user's followers. 90 | - `twint -u username --following` - Scrape who a Twitter user follows. 91 | - `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweet). 92 | - `twint -u username --following --user-full` - Collect full user information a person follows 93 | - `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, including **retweets** & **replies**). 94 | - `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile. 
95 | - `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id. 96 | 97 | More detail about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands) 98 | 99 | ## Module Example 100 | 101 | Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)** 102 | 103 | ```python 104 | import twint 105 | 106 | # Configure 107 | c = twint.Config() 108 | c.Username = "realDonaldTrump" 109 | c.Search = "great" 110 | 111 | # Run 112 | twint.run.Search(c) 113 | ``` 114 | > Output 115 | 116 | `955511208597184512 2018-01-22 18:43:19 GMT pineapples are the best fruit` 117 | 118 | ```python 119 | import twint 120 | 121 | c = twint.Config() 122 | 123 | c.Username = "noneprivacy" 124 | c.Custom["tweet"] = ["id"] 125 | c.Custom["user"] = ["bio"] 126 | c.Limit = 10 127 | c.Store_csv = True 128 | c.Output = "none" 129 | 130 | twint.run.Search(c) 131 | ``` 132 | 133 | ## Storing Options 134 | - Write to file; 135 | - CSV; 136 | - JSON; 137 | - SQLite; 138 | - Elasticsearch. 139 | 140 | ## Elasticsearch Setup 141 | 142 | Details on setting up Elasticsearch with Twint is located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch). 143 | 144 | ## Graph Visualization 145 | ![graph](https://i.imgur.com/EEJqB8n.png) 146 | 147 | [Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph). 148 | 149 | We are developing a Twint Desktop App. 150 | 151 | ![4](https://i.imgur.com/DzcfIgL.png) 152 | 153 | ## FAQ 154 | > I tried scraping tweets from a user, I know that they exist but I'm not getting them 155 | 156 | Twitter can shadow-ban accounts, which means that their tweets will not be available via search. 
To solve this, pass `--profile-full` if you are using Twint via CLI or, if are using Twint as module, add `config.Profile_full = True`. Please note that this process will be quite slow. 157 | ## More Examples 158 | 159 | #### Followers/Following 160 | 161 | > To get only follower usernames/following usernames 162 | 163 | `twint -u username --followers` 164 | 165 | `twint -u username --following` 166 | 167 | > To get user info of followers/following users 168 | 169 | `twint -u username --followers --user-full` 170 | 171 | `twint -u username --following --user-full` 172 | 173 | #### userlist 174 | 175 | > To get only user info of user 176 | 177 | `twint -u username --user-full` 178 | 179 | > To get user info of users from a userlist 180 | 181 | `twint --userlist inputlist --user-full` 182 | 183 | 184 | #### tweet translation (experimental) 185 | 186 | > To get 100 english tweets and translate them to italian 187 | 188 | `twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100` 189 | 190 | or 191 | 192 | ```python 193 | import twint 194 | 195 | c = twint.Config() 196 | c.Username = "noneprivacy" 197 | c.Limit = 100 198 | c.Store_csv = True 199 | c.Output = "none.csv" 200 | c.Lang = "en" 201 | c.Translate = True 202 | c.TranslateDest = "it" 203 | twint.run.Search(c) 204 | ``` 205 | 206 | Notes: 207 | - [Google translate has some quotas](https://cloud.google.com/translate/quotas) 208 | 209 | ## Featured Blog Posts: 210 | - [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/) 211 | - [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/) 212 | - [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f) 213 | - [Loading tweets into Kafka and 
Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/) 214 | 215 | ## Contact 216 | 217 | If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team) 218 | -------------------------------------------------------------------------------- /automate.py: -------------------------------------------------------------------------------- 1 | import twint 2 | import schedule 3 | import time 4 | 5 | # you can change the name of each "job" after "def" if you'd like. 6 | def jobone(): 7 | print ("Fetching Tweets") 8 | c = twint.Config() 9 | # choose username (optional) 10 | c.Username = "insert username here" 11 | # choose search term (optional) 12 | c.Search = "insert search term here" 13 | # choose beginning time (narrow results) 14 | c.Since = "2018-01-01" 15 | # set limit on total tweets 16 | c.Limit = 1000 17 | # no idea, but makes the csv format properly 18 | c.Store_csv = True 19 | # format of the csv 20 | c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"] 21 | # change the name of the csv file 22 | c.Output = "filename.csv" 23 | twint.run.Search(c) 24 | 25 | def jobtwo(): 26 | print ("Fetching Tweets") 27 | c = twint.Config() 28 | # choose username (optional) 29 | c.Username = "insert username here" 30 | # choose search term (optional) 31 | c.Search = "insert search term here" 32 | # choose beginning time (narrow results) 33 | c.Since = "2018-01-01" 34 | # set limit on total tweets 35 | c.Limit = 1000 36 | # no idea, but makes the csv format properly 37 | c.Store_csv = True 38 | # format of the csv 39 | c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"] 40 | # change the name of the csv file 41 | c.Output = "filename2.csv" 42 | twint.run.Search(c) 43 | 44 | # run once when you start the program 45 | 46 | jobone() 47 | 
jobtwo() 48 | 49 | # run every minute(s), hour, day at, day of the week, day of the week and time. Use "#" to block out which ones you don't want to use. Remove it to active. Also, replace "jobone" and "jobtwo" with your new function names (if applicable) 50 | 51 | # schedule.every(1).minutes.do(jobone) 52 | schedule.every().hour.do(jobone) 53 | # schedule.every().day.at("10:30").do(jobone) 54 | # schedule.every().monday.do(jobone) 55 | # schedule.every().wednesday.at("13:15").do(jobone) 56 | 57 | # schedule.every(1).minutes.do(jobtwo) 58 | schedule.every().hour.do(jobtwo) 59 | # schedule.every().day.at("10:30").do(jobtwo) 60 | # schedule.every().monday.do(jobtwo) 61 | # schedule.every().wednesday.at("13:15").do(jobtwo) 62 | 63 | while True: 64 | schedule.run_pending() 65 | time.sleep(1) 66 | -------------------------------------------------------------------------------- /elasticsearch/README.md: -------------------------------------------------------------------------------- 1 | # Elasticsearch How-To 2 | 3 | ![dashboard](https://i.imgur.com/BEbtdo5.png) 4 | 5 | Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch) 6 | -------------------------------------------------------------------------------- /elasticsearch/dashboard.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_id": "e6d65380-bfe2-11e8-961a-d371b24d5d1d", 4 | "_type": "dashboard", 5 | "_source": { 6 | "title": "Twint Dashboard", 7 | "hits": 0, 8 | "description": "", 9 | "panelsJSON": 
"[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":40,\"h\":17,\"i\":\"1\"},\"embeddableConfig\":{},\"id\":\"d47421c0-bfd5-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":40,\"y\":6,\"w\":8,\"h\":11,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"e2b89640-bfd4-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":32,\"w\":20,\"h\":17,\"i\":\"3\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"8a8bb420-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":17,\"w\":33,\"h\":15,\"i\":\"4\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"a8d3ee70-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":40,\"y\":0,\"w\":8,\"h\":6,\"i\":\"6\"},\"embeddableConfig\":{},\"id\":\"37cd72e0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":33,\"y\":17,\"w\":15,\"h\":15,\"i\":\"7\"},\"embeddableConfig\":{},\"id\":\"149ecbc0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":20,\"y\":32,\"w\":28,\"h\":17,\"i\":\"8\"},\"version\":\"6.3.2\",\"type\":\"visualization\",\"id\":\"b45ec590-c267-11e8-bcd4-3956fe930db7\",\"embeddableConfig\":{}}]", 10 | "optionsJSON": "{\"darkTheme\":true,\"hidePanelTitles\":true,\"useMargins\":true}", 11 | "version": 1, 12 | "timeRestore": false, 13 | "kibanaSavedObjectMeta": { 14 | "searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}" 15 | } 16 | } 17 | } 18 | ] -------------------------------------------------------------------------------- /elasticsearch/index-follow.json: 
-------------------------------------------------------------------------------- 1 | PUT twintgraph 2 | { 3 | "mappings": { 4 | "items": { 5 | "properties": { 6 | "user": {"type": "keyword"}, 7 | "follow": {"type": "keyword"}, 8 | "essid": {"type": "keyword"} 9 | } 10 | } 11 | }, 12 | "settings": { 13 | "number_of_shards": 1 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /elasticsearch/index-tweets.json: -------------------------------------------------------------------------------- 1 | PUT twinttweets 2 | { 3 | "mappings": { 4 | "items": { 5 | "properties": { 6 | "id": {"type": "long"}, 7 | "conversation_id": {"type": "long"}, 8 | "created_at": {"type": "long"}, 9 | "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, 10 | "timezone": {"type": "keyword"}, 11 | "place": {"type": "keyword"}, 12 | "location": {"type": "keyword"}, 13 | "tweet": {"type": "text"}, 14 | "hashtags": {"type": "keyword"}, 15 | "cashtags": {"type": "keyword"}, 16 | "user_id": {"type": "long"}, 17 | "user_id_str": {"type": "keyword"}, 18 | "username": {"type": "keyword"}, 19 | "name": {"type": "text"}, 20 | "profile_image_url": {"type": "text"}, 21 | "day": {"type": "integer"}, 22 | "hour": {"type": "integer"}, 23 | "link": {"type": "text"}, 24 | "retweet": {"type": "text"}, 25 | "essid": {"type": "keyword"}, 26 | "nlikes": {"type": "integer"}, 27 | "nreplies": {"type": "integer"}, 28 | "nretweets": {"type": "integer"}, 29 | "quote_url": {"type": "text"}, 30 | "video": {"type": "integer"}, 31 | "thumbnail": {"type": "text"}, 32 | "search": {"type": "text"}, 33 | "near": {"type": "text"}, 34 | "geo_near": {"type": "geo_point"}, 35 | "geo_tweet": {"type": "geo_point"}, 36 | "photos": {"type": "text"}, 37 | "mentions": {"type": "text"}, 38 | "translation": {"type": "text"}, 39 | "trans_src": {"type": "keyword"}, 40 | "trans_dev": {"type": "keyword"} 41 | } 42 | } 43 | } 44 | , 45 | "settings": { 46 | "number_of_shards": 1 47 | } 48 | }
49 | -------------------------------------------------------------------------------- /elasticsearch/index-user.json: -------------------------------------------------------------------------------- 1 | PUT twintuser 2 | { 3 | "mappings": { 4 | "items": { 5 | "properties": { 6 | "id": {"type": "keyword"}, 7 | "name": {"type": "keyword"}, 8 | "username": {"type": "keyword"}, 9 | "bio": {"type": "text"}, 10 | "location": {"type": "keyword"}, 11 | "url": {"type": "text"}, 12 | "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, 13 | "join_date": {"type": "date", "format": "yyyy-MM-dd"}, 14 | "join_time": {"type": "date", "format": "HH:mm:ss"}, 15 | "tweets": {"type": "integer"}, 16 | "following": {"type": "integer"}, 17 | "followers": {"type": "integer"}, 18 | "likes": {"type": "integer"}, 19 | "media": {"type": "integer"}, 20 | "private": {"type": "integer"}, 21 | "verified": {"type": "integer"}, 22 | "avatar": {"type": "text"}, 23 | "background_image": {"type": "text"}, 24 | "session": {"type": "keyword"}, 25 | "geo_user": {"type": "geo_point"} 26 | } 27 | } 28 | } 29 | , 30 | "settings": { 31 | "number_of_shards": 1 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /elasticsearch/visualizations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_id": "d47421c0-bfd5-11e8-8858-bbc566841533", 4 | "_type": "visualization", 5 | "_source": { 6 | "title": "Activity [twinttweets]", 7 | "visState": "{\"title\":\"Activity 
[twinttweets]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"area\",\"mode\":\"stacked\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true,\"interpolate\":\"cardinal\"}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":true},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"date\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{},\"customLabel\":\"Days\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"customLabel\":\"User ids\"}}]}", 8 | "uiStateJSON": "{}", 9 | "description": "", 10 | "version": 1, 11 | "kibanaSavedObjectMeta": { 12 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}" 13 | } 14 | } 15 | }, 16 | { 17 
| "_id": "e2b89640-bfd4-11e8-8858-bbc566841533", 18 | "_type": "visualization", 19 | "_source": { 20 | "title": "Activity - pie [twinttweets]", 21 | "visState": "{\"aggs\":[{\"enabled\":true,\"id\":\"1\",\"params\":{},\"schema\":\"metric\",\"type\":\"count\"},{\"enabled\":true,\"id\":\"2\",\"params\":{\"field\":\"user_id\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"size\":5},\"schema\":\"segment\",\"type\":\"terms\"}],\"params\":{\"addLegend\":true,\"addTooltip\":true,\"isDonut\":true,\"labels\":{\"last_level\":true,\"show\":false,\"truncate\":100,\"values\":true},\"legendPosition\":\"right\",\"type\":\"pie\"},\"title\":\"Activity - pie [twinttweets]\",\"type\":\"pie\"}", 22 | "uiStateJSON": "{}", 23 | "description": "", 24 | "version": 1, 25 | "kibanaSavedObjectMeta": { 26 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}" 27 | } 28 | } 29 | }, 30 | { 31 | "_id": "37cd72e0-bfe4-11e8-961a-d371b24d5d1d", 32 | "_type": "visualization", 33 | "_source": { 34 | "title": "Tweets Count [twinttweet]", 35 | "visState": "{\"title\":\"Tweets Count [twinttweet]\",\"type\":\"metric\",\"params\":{\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\",\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"None\",\"colorsRange\":[{\"from\":0,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":false,\"subText\":\"\",\"fontSize\":33}}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}}]}", 36 | "uiStateJSON": "{}", 37 | "description": "", 38 | "version": 1, 39 | "kibanaSavedObjectMeta": { 40 | "searchSourceJSON": 
"{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}" 41 | } 42 | } 43 | }, 44 | { 45 | "_id": "149ecbc0-bfe4-11e8-961a-d371b24d5d1d", 46 | "_type": "visualization", 47 | "_source": { 48 | "title": "Word Cloud [twinttweets]", 49 | "visState": "{\"title\":\"Word Cloud [twinttweets]\",\"type\":\"tagcloud\",\"params\":{\"scale\":\"linear\",\"orientation\":\"single\",\"minFontSize\":10,\"maxFontSize\":50,\"showLabel\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"username\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}}]}", 50 | "uiStateJSON": "{}", 51 | "description": "", 52 | "version": 1, 53 | "kibanaSavedObjectMeta": { 54 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}" 55 | } 56 | } 57 | }, 58 | { 59 | "_id": "a8d3ee70-bfd9-11e8-8858-bbc566841533", 60 | "_type": "visualization", 61 | "_source": { 62 | "title": "Day-activity [twinttweet]", 63 | "visState": "{\"title\":\"Day-activity 
[twinttweet]\",\"type\":\"histogram\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"position\":\"bottom\",\"scale\":{\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{},\"type\":\"category\"}],\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-3\"},\"legendPosition\":\"right\",\"orderBucketsBySum\":false,\"seriesParams\":[{\"data\":{\"id\":\"1\",\"label\":\"Tweets\"},\"drawLinesBetweenPoints\":true,\"mode\":\"normal\",\"show\":\"true\",\"showCircles\":true,\"type\":\"histogram\",\"valueAxis\":\"ValueAxis-3\"}],\"times\":[],\"type\":\"histogram\",\"valueAxes\":[{\"id\":\"ValueAxis-3\",\"labels\":{\"filter\":false,\"rotate\":0,\"show\":true,\"truncate\":100},\"name\":\"LeftAxis-1\",\"position\":\"left\",\"scale\":{\"mode\":\"normal\",\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{\"text\":\"Tweets\"},\"type\":\"value\"}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{\"min\":0,\"max\":23}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"asc\",\"orderBy\":\"_term\",\"customLabel\":\"\"}}]}", 64 | "uiStateJSON": "{\"vis\":{\"legendOpen\":true}}", 65 | "description": "", 66 | "version": 1, 67 | "kibanaSavedObjectMeta": { 68 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}" 69 | } 70 | } 71 | }, 72 | { 
73 | "_id": "8a8bb420-bfd9-11e8-8858-bbc566841533", 74 | "_type": "visualization", 75 | "_source": { 76 | "title": "Week-activity [twinttweet]", 77 | "visState": "{\"title\":\"Week-activity [twinttweet]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"histogram\",\"mode\":\"normal\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{},\"customLabel\":\"Days of the week\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\",\"customLabel\":\"\"}}]}", 78 | "uiStateJSON": "{}", 79 | "description": "", 80 | "version": 1, 81 | "kibanaSavedObjectMeta": { 82 | "searchSourceJSON": 
"{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}" 83 | } 84 | } 85 | }, 86 | { 87 | "_id": "b45ec590-c267-11e8-bcd4-3956fe930db7", 88 | "_type": "visualization", 89 | "_source": { 90 | "title": "Heat-map [twinttweets]", 91 | "visState": "{\"title\":\"Heat-map [twinttweets]\",\"type\":\"heatmap\",\"params\":{\"type\":\"heatmap\",\"addTooltip\":true,\"addLegend\":true,\"enableHover\":true,\"legendPosition\":\"right\",\"times\":[],\"colorsNumber\":10,\"colorSchema\":\"Reds\",\"setColorRange\":false,\"colorsRange\":[{\"from\":0,\"to\":10},{\"from\":10,\"to\":100},{\"from\":100,\"to\":200},{\"from\":200,\"to\":500},{\"from\":500,\"to\":1000},{\"from\":1000,\"to\":2000},{\"from\":2000,\"to\":3000},{\"from\":3000,\"to\":4000},{\"from\":4000,\"to\":5000},{\"from\":7000,\"to\":null}],\"invertColors\":false,\"percentageMode\":false,\"valueAxes\":[{\"show\":false,\"id\":\"ValueAxis-1\",\"type\":\"value\",\"scale\":{\"type\":\"linear\",\"defaultYExtents\":true},\"labels\":{\"show\":false,\"rotate\":270,\"overwriteColor\":false,\"color\":\"#555\"}}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"group\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{\"min\":0,\"max\":2}}}]}", 92 | "uiStateJSON": "{\"vis\":{\"defaultColors\":{\"3 - 592\":\"rgb(255,245,240)\",\"592 - 1.180\":\"rgb(254,228,216)\",\"1.180 - 1.769\":\"rgb(253,202,181)\",\"1.769 - 2.357\":\"rgb(252,171,142)\",\"2.357 - 2.945\":\"rgb(252,138,106)\",\"2.945 - 3.534\":\"rgb(251,106,74)\",\"3.534 - 4.122\":\"rgb(241,68,50)\",\"4.122 - 
4.711\":\"rgb(217,38,35)\",\"4.711 - 5.299\":\"rgb(188,20,26)\",\"5.299 - 5.887\":\"rgb(152,12,19)\"},\"colors\":{\"3 - 592\":\"#FCEACA\",\"592 - 1.180\":\"#F9E2D2\",\"1.180 - 1.769\":\"#F9BA8F\"}}}", 93 | "description": "", 94 | "version": 1, 95 | "kibanaSavedObjectMeta": { 96 | "searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"filter\":[],\"query\":{\"language\":\"lucene\",\"query\":\"\"}}" 97 | } 98 | } 99 | } 100 | ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | aiodns 3 | beautifulsoup4 4 | cchardet 5 | dataclasses 6 | elasticsearch 7 | pysocks 8 | pandas>=0.23.0 9 | aiohttp_socks<=0.4.1 10 | schedule 11 | geopy 12 | fake-useragent 13 | googletransx 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | from setuptools import setup 3 | import io 4 | import os 5 | 6 | # Package meta-data 7 | NAME = 'twint' 8 | DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.' 
9 | URL = 'https://github.com/twintproject/twint' 10 | EMAIL = 'codyzacharias@pm.me' 11 | AUTHOR = 'Cody Zacharias' 12 | REQUIRES_PYTHON = '>=3.6.0' 13 | VERSION = None 14 | 15 | # Packages required 16 | REQUIRED = [ 17 | 'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses', 18 | 'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks', 19 | 'schedule', 'geopy', 'fake-useragent', 'googletransx' 20 | ] 21 | 22 | here = os.path.abspath(os.path.dirname(__file__)) 23 | 24 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 25 | long_description = '\n' + f.read() 26 | 27 | # Load the package's __version__.py 28 | about = {} 29 | if not VERSION: 30 | with open(os.path.join(here, NAME, '__version__.py')) as f: 31 | exec(f.read(), about) 32 | else: 33 | about['__version__'] = VERSION 34 | 35 | setup( 36 | name=NAME, 37 | version=about['__version__'], 38 | description=DESCRIPTION, 39 | long_description=long_description, 40 | long_description_content_type="text/markdown", 41 | author=AUTHOR, 42 | author_email=EMAIL, 43 | python_requires=REQUIRES_PYTHON, 44 | url=URL, 45 | packages=['twint', 'twint.storage'], 46 | entry_points={ 47 | 'console_scripts': [ 48 | 'twint = twint.cli:run_as_command', 49 | ], 50 | }, 51 | install_requires=REQUIRED, 52 | dependency_links=[ 53 | 'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans' 54 | ], 55 | license='MIT', 56 | classifiers=[ 57 | 'License :: OSI Approved :: MIT License', 58 | 'Programming Language :: Python', 59 | 'Programming Language :: Python :: 3', 60 | 'Programming Language :: Python :: 3.6', 61 | 'Programming Language :: Python :: 3.7', 62 | 'Programming Language :: Python :: 3.8', 63 | 'Programming Language :: Python :: Implementation :: CPython', 64 | ], 65 | ) 66 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import twint 2 | import os 3 | 4 | ''' 5 | 
Test.py - Testing TWINT to make sure everything works. 6 | ''' 7 | 8 | 9 | def test_reg(c, run): 10 | print("[+] Beginning vanilla test in {}".format(str(run))) 11 | run(c) 12 | 13 | 14 | def test_db(c, run): 15 | print("[+] Beginning DB test in {}".format(str(run))) 16 | c.Database = "test_twint.db" 17 | run(c) 18 | 19 | 20 | def custom(c, run, _type): 21 | print("[+] Beginning custom {} test in {}".format(_type, str(run))) 22 | c.Custom['tweet'] = ["id", "username"] 23 | c.Custom['user'] = ["id", "username"] 24 | run(c) 25 | 26 | 27 | def test_json(c, run): 28 | c.Store_json = True 29 | c.Output = "test_twint.json" 30 | custom(c, run, "JSON") 31 | print("[+] Beginning JSON test in {}".format(str(run))) 32 | run(c) 33 | 34 | 35 | def test_csv(c, run): 36 | c.Store_csv = True 37 | c.Output = "test_twint.csv" 38 | custom(c, run, "CSV") 39 | print("[+] Beginning CSV test in {}".format(str(run))) 40 | run(c) 41 | 42 | 43 | def main(): 44 | c = twint.Config() 45 | c.Username = "verified" 46 | c.Limit = 20 47 | c.Store_object = True 48 | 49 | # Separate objects are necessary. 
50 | 51 | f = twint.Config() 52 | f.Username = "verified" 53 | f.Limit = 20 54 | f.Store_object = True 55 | f.User_full = True 56 | 57 | runs = [ 58 | twint.run.Profile, # this doesn't 59 | twint.run.Search, # this works 60 | twint.run.Following, 61 | twint.run.Followers, 62 | twint.run.Favorites, 63 | ] 64 | 65 | tests = [test_reg, test_json, test_csv, test_db] 66 | 67 | # Something breaks if we don't split these up 68 | 69 | for run in runs[:3]: 70 | if run == twint.run.Search: 71 | c.Since = "2012-1-1 20:30:22" 72 | c.Until = "2017-1-1" 73 | else: 74 | c.Since = "" 75 | c.Until = "" 76 | 77 | for test in tests: 78 | test(c, run) 79 | 80 | for run in runs[3:]: 81 | for test in tests: 82 | test(f, run) 83 | 84 | files = ["test_twint.db", "test_twint.json", "test_twint.csv"] 85 | for _file in files: 86 | os.remove(_file) 87 | 88 | print("[+] Testing complete!") 89 | 90 | 91 | if __name__ == '__main__': 92 | main() 93 | -------------------------------------------------------------------------------- /twint/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | TWINT - Twitter Intelligence Tool (formerly known as Tweep). 3 | 4 | See wiki on Github for in-depth details. 5 | https://github.com/twintproject/twint/wiki 6 | 7 | Licensed under MIT License 8 | Copyright (c) 2018 Cody Zacharias 9 | ''' 10 | import logging, os 11 | 12 | from .config import Config 13 | from .__version__ import __version__ 14 | from . 
import run 15 | 16 | _levels = { 17 | 'info': logging.INFO, 18 | 'debug': logging.DEBUG 19 | } 20 | 21 | _level = os.getenv('TWINT_DEBUG', 'info') 22 | _logLevel = _levels[_level] 23 | 24 | if _level == "debug": 25 | logger = logging.getLogger() 26 | _output_fn = 'twint.log' 27 | logger.setLevel(_logLevel) 28 | formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s') 29 | fileHandler = logging.FileHandler(_output_fn) 30 | fileHandler.setLevel(_logLevel) 31 | fileHandler.setFormatter(formatter) 32 | logger.addHandler(fileHandler) 33 | -------------------------------------------------------------------------------- /twint/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (2, 1, 21) 2 | 3 | __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /twint/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Twint.py - Twitter Intelligence Tool (formerly known as Tweep). 4 | 5 | See wiki on Github for in-depth details. 6 | https://github.com/twintproject/twint/wiki 7 | 8 | Licensed under MIT License 9 | Copyright (c) 2018 The Twint Project 10 | ''' 11 | import sys 12 | import os 13 | import argparse 14 | 15 | from . import run 16 | from . import config 17 | from . 
def error(_error, message):
    """Print an error in ``[-] kind: message`` form to stdout and exit.

    Exits with status 0 (kept for backward compatibility with existing
    scripts that wrap the CLI).
    """
    print("[-] {}: {}".format(_error, message))
    sys.exit(0)


def check(args):
    """Validate parsed CLI arguments; terminate via error() on conflicts.

    The branches mirror the mutually-exclusive argument combinations
    documented in the wiki (username vs. userid vs. --all, search vs.
    timeline, storage flags requiring -o, positive backoff parameters).
    """
    if args.username is not None or args.userlist or args.members_list:
        if args.verified:
            error("Contradicting Args",
                  "Please use --verified in combination with -s.")
        if args.userid:
            error("Contradicting Args",
                  "--userid and -u cannot be used together.")
        if args.all:
            error("Contradicting Args",
                  "--all and -u cannot be used together.")
    elif args.search and args.timeline:
        error("Contradicting Args",
              "--s and --tl cannot be used together.")
    elif args.timeline and not args.username:
        error("Error", "-tl cannot be used without -u.")
    elif args.search is None:
        if args.custom_query is not None:
            pass
        elif (args.geo or args.near) is None and not (args.all or args.userid):
            error("Error", "Please use at least -u, -s, -g or --near.")
    elif args.all and args.userid:
        error("Contradicting Args",
              "--all and --userid cannot be used together")
    if args.output is None:
        if args.csv:
            error("Error", "Please specify an output file (Example: -o file.csv).")
        elif args.json:
            error("Error", "Please specify an output file (Example: -o file.json).")
    if args.backoff_exponent <= 0:
        # message typo fixed: "specifiy" -> "specify"
        error("Error", "Please specify a positive value for backoff_exponent")
    if args.min_wait_time < 0:
        error("Error", "Please specify a non negative value for min_wait_time")


def loadUserList(ul, _type):
    """Load users from a file path or a comma-separated string.

    Returns a ``%20OR%20from%3A``-joined query fragment when *_type* is
    "search", otherwise the plain list of usernames.
    """
    path = os.path.abspath(ul)
    if os.path.exists(path):
        # use a context manager so the file handle is always closed
        # (the original open(...).read() leaked the descriptor)
        with open(path, "r") as f:
            userlist = f.read().splitlines()
    else:
        userlist = ul.split(",")
    if _type == "search":
        un = ""
        for user in userlist:
            un += "%20OR%20from%3A" + user
        # strip the leading "%20OR%20from%3A" (15 chars) from the first user
        return un[15:]
    return userlist
def initialize(args):
    """ Set default values for config from args

    Copies every parsed CLI argument onto a fresh config.Config so the
    rest of the library never touches argparse directly. Returns the
    populated Config.
    """
    c = config.Config()
    # --- target selection ---
    c.Username = args.username
    c.User_id = args.userid
    c.Search = args.search
    c.Geo = args.geo
    c.Location = args.location
    c.Near = args.near
    c.Lang = args.lang
    # --- output / storage backends ---
    c.Output = args.output
    c.Elasticsearch = args.elasticsearch
    # --- date / content filters ---
    c.Year = args.year
    c.Since = args.since
    c.Until = args.until
    c.Email = args.email
    c.Phone = args.phone
    c.Verified = args.verified
    c.Store_csv = args.csv
    # NOTE(review): Tabs is not a declared Config field; this relies on
    # Python allowing new instance attributes — confirm against config.py.
    c.Tabs = args.tabs
    c.Store_json = args.json
    c.Show_hashtags = args.hashtags
    c.Show_cashtags = args.cashtags
    c.Limit = args.limit
    c.Count = args.count
    c.Stats = args.stats
    c.Database = args.database
    c.To = args.to
    c.All = args.all
    # --- Elasticsearch session / formatting ---
    c.Essid = args.essid
    c.Format = args.format
    c.User_full = args.user_full
    # c.Profile_full = args.profile_full
    c.Pandas_type = args.pandas_type
    c.Index_tweets = args.index_tweets
    c.Index_follow = args.index_follow
    c.Index_users = args.index_users
    c.Debug = args.debug
    c.Resume = args.resume
    # --- media filters ---
    c.Images = args.images
    c.Videos = args.videos
    c.Media = args.media
    c.Replies = args.replies
    c.Pandas_clean = args.pandas_clean
    # --- proxy / tor ---
    c.Proxy_host = args.proxy_host
    c.Proxy_port = args.proxy_port
    c.Proxy_type = args.proxy_type
    c.Tor_control_port = args.tor_control_port
    c.Tor_control_password = args.tor_control_password
    c.Retweets = args.retweets
    c.Custom_query = args.custom_query
    c.Popular_tweets = args.popular_tweets
    c.Skip_certs = args.skip_certs
    c.Hide_output = args.hide_output
    # --- engagement filters ---
    c.Native_retweets = args.native_retweets
    c.Min_likes = args.min_likes
    c.Min_retweets = args.min_retweets
    c.Min_replies = args.min_replies
    c.Links = args.links
    c.Source = args.source
    c.Members_list = args.members_list
    c.Filter_retweets = args.filter_retweets
    # --- translation / rate-limit backoff ---
    c.Translate = args.translate
    c.TranslateDest = args.translate_dest
    c.Backoff_exponent = args.backoff_exponent
    c.Min_wait_time = args.min_wait_time
    return c
tab characters, not commas.", action="store_true") 174 | ap.add_argument("--json", help="Write as .json file", action="store_true") 175 | ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true") 176 | ap.add_argument("--cashtags", help="Output cashtags in seperate column.", action="store_true") 177 | ap.add_argument("--userid", help="Twitter user id.") 178 | ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).") 179 | ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.", 180 | action="store_true") 181 | ap.add_argument("--stats", help="Show number of replies, retweets, and likes.", 182 | action="store_true") 183 | ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.") 184 | ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME") 185 | ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME") 186 | ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true") 187 | ap.add_argument("--following", help="Scrape a person's follows", action="store_true") 188 | ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true") 189 | ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.") 190 | ap.add_argument("--proxy-host", help="Proxy hostname or IP.") 191 | ap.add_argument("--proxy-port", help="The port of the proxy server.") 192 | ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051) 193 | ap.add_argument("--tor-control-password", 194 | help="If proxy-host is set to tor, this is the password for the control port", 195 | default="my_password") 196 | ap.add_argument("--essid", 197 | help="Elasticsearch Session ID, use this to differentiate scraping sessions.", 198 | nargs="?", default="") 199 | ap.add_argument("--userlist", help="Userlist from list or file.") 200 | 
ap.add_argument("--retweets", 201 | help="Include user's Retweets (Warning: limited).", 202 | action="store_true") 203 | ap.add_argument("--format", help="Custom output format (See wiki for details).") 204 | ap.add_argument("--user-full", 205 | help="Collect all user information (Use with followers or following only).", 206 | action="store_true") 207 | # I am removing this this feature for the time being, because it is no longer required, default method will do this 208 | # ap.add_argument("--profile-full", 209 | # help="Slow, but effective method of collecting a user's Tweets and RT.", 210 | # action="store_true") 211 | ap.add_argument( 212 | "-tl", 213 | "--timeline", 214 | help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)", 215 | action="store_true", 216 | ) 217 | ap.add_argument("--translate", 218 | help="Get tweets translated by Google Translate.", 219 | action="store_true") 220 | ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).", 221 | default="en") 222 | ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.") 223 | ap.add_argument("--pandas-type", 224 | help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5") 225 | ap.add_argument("-it", "--index-tweets", 226 | help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets") 227 | ap.add_argument("-if", "--index-follow", 228 | help="Custom Elasticsearch Index name for Follows.", 229 | nargs="?", default="twintgraph") 230 | ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.", 231 | nargs="?", default="twintuser") 232 | ap.add_argument("--debug", 233 | help="Store information in debug logs", action="store_true") 234 | ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID") 235 | ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true") 236 | ap.add_argument("--images", help="Display only Tweets with 
images.", action="store_true") 237 | ap.add_argument("--media", 238 | help="Display Tweets with only images or videos.", action="store_true") 239 | ap.add_argument("--replies", help="Display replies to a subject.", action="store_true") 240 | ap.add_argument("-pc", "--pandas-clean", 241 | help="Automatically clean Pandas dataframe at every scrape.") 242 | ap.add_argument("-cq", "--custom-query", help="Custom search query.") 243 | ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.", 244 | action="store_true") 245 | ap.add_argument("-sc", "--skip-certs", help="Skip certs verification, useful for SSC.", action="store_false") 246 | ap.add_argument("-ho", "--hide-output", help="Hide output, no tweets will be displayed.", action="store_true") 247 | ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true") 248 | ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.") 249 | ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.") 250 | ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.") 251 | ap.add_argument("--links", help="Include or exclude tweets containing one o more links. If not specified" + 252 | " you will get both tweets that might contain links or not.") 253 | ap.add_argument("--source", help="Filter the tweets for specific source client.") 254 | ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.") 255 | ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true") 256 | ap.add_argument("--backoff-exponent", help="Specify a exponent for the polynomial backoff in case of errors.", 257 | type=float, default=3.0) 258 | ap.add_argument("--min-wait-time", type=float, default=15, 259 | help="specifiy a minimum wait time in case of scraping limit error. 
def _run_with_userlist(args, c, list_type, runner):
    """Run *runner* once per user in args.userlist, or once with config *c*.

    Rebuilds the config per user so every scrape starts from a clean state,
    exactly as the original per-branch copies did.
    """
    if args.userlist:
        _userlist = loadUserList(args.userlist, list_type)
        for _user in _userlist:
            args.username = _user
            c = initialize(args)
            runner(c)
    else:
        runner(c)


def main():
    """ Main

    Parse and validate CLI arguments, build the config, then dispatch to
    the matching run.* entry point.
    """
    args = options()
    check(args)

    if args.pandas_clean:
        # NOTE(review): the original called storage.panda.clean() twice in a
        # row (before and after initialize()); one call is sufficient.
        storage.panda.clean()

    c = initialize(args)

    if args.userlist:
        c.Query = loadUserList(args.userlist, "search")

    if args.favorites:
        _run_with_userlist(args, c, "favorites", run.Favorites)
    elif args.following:
        _run_with_userlist(args, c, "following", run.Following)
    elif args.followers:
        _run_with_userlist(args, c, "followers", run.Followers)
    elif args.retweets:  # or args.profile_full:
        _run_with_userlist(args, c, "profile", run.Profile)
    elif args.user_full:
        _run_with_userlist(args, c, "userlist", run.Lookup)
    elif args.timeline:
        run.Profile(c)
    else:
        run.Search(c)
from dataclasses import dataclass
from typing import Optional


@dataclass
class Config:
    """Runtime configuration for a twint scraping session.

    Every attribute is read by the run/url/output modules; defaults mean
    "no filter / feature disabled". Field order is part of the dataclass
    __init__ signature and must not change.
    """
    Username: Optional[str] = None
    User_id: Optional[str] = None
    Search: Optional[str] = None
    Lookup: bool = False
    Geo: str = ""
    Location: bool = False
    Near: Optional[str] = None
    Lang: Optional[str] = None
    Output: Optional[str] = None
    Elasticsearch: object = None
    Year: Optional[int] = None
    Since: Optional[str] = None
    Until: Optional[str] = None
    Email: Optional[str] = None
    Phone: Optional[str] = None
    Verified: bool = False
    Store_csv: bool = False
    Store_json: bool = False
    # Unannotated on purpose: NOT a dataclass field, so the __init__
    # signature is unchanged. A fresh per-instance copy is made in
    # __post_init__ below — the bare class-level dict was shared by every
    # Config instance, so one session's custom columns leaked into others.
    Custom = {"tweet": None, "user": None, "username": None}
    Show_hashtags: bool = False
    Show_cashtags: bool = False
    Limit: Optional[int] = None
    Count: Optional[int] = None
    Stats: bool = False
    Database: object = None
    To: Optional[str] = None
    All = None
    Debug: bool = False
    Format = None
    Essid: str = ""
    Profile: bool = False
    Followers: bool = False
    Following: bool = False
    Favorites: bool = False
    TwitterSearch: bool = False
    User_full: bool = False
    # Profile_full: bool = False
    Store_object: bool = False
    Store_object_tweets_list: Optional[list] = None
    Store_object_users_list: Optional[list] = None
    Store_object_follow_list: Optional[list] = None
    Pandas_type: Optional[type] = None
    Pandas: bool = False
    Index_tweets: str = "twinttweets"
    Index_follow: str = "twintgraph"
    Index_users: str = "twintuser"
    Retries_count: int = 10
    Resume: object = None
    Images: bool = False
    Videos: bool = False
    Media: bool = False
    Replies: bool = False
    Pandas_clean: bool = True
    Lowercase: bool = True
    Pandas_au: bool = True
    Proxy_host: str = ""
    Proxy_port: int = 0
    Proxy_type: object = None
    Tor_control_port: int = 9051
    Tor_control_password: Optional[str] = None
    Retweets: bool = False
    Query: Optional[str] = None
    Hide_output: bool = False
    Custom_query: str = ""
    Popular_tweets: bool = False
    Skip_certs: bool = False
    Native_retweets: bool = False
    Min_likes: int = 0
    Min_retweets: int = 0
    Min_replies: int = 0
    Links: Optional[str] = None
    Source: Optional[str] = None
    Members_list: Optional[str] = None
    Filter_retweets: bool = False
    Translate: bool = False
    TranslateSrc: str = "en"
    TranslateDest: str = "en"
    Backoff_exponent: float = 3.0
    Min_wait_time: int = 0
    Bearer_token: Optional[str] = None
    Guest_token: Optional[str] = None
    deleted: Optional[list] = None

    def __post_init__(self):
        # Give each instance its own Custom dict (fixes the shared
        # mutable class-attribute bug described above).
        self.Custom = {"tweet": None, "user": None, "username": None}
d._since_def_user = True 39 | else: 40 | d.since = datetime.datetime.strptime("2006-03-21 00:00:00", "%Y-%m-%d %H:%M:%S") 41 | d.since = utc_to_local(d.since) 42 | d._since_def_user = False 43 | 44 | return d 45 | -------------------------------------------------------------------------------- /twint/feed.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | 4 | from bs4 import BeautifulSoup 5 | from re import findall 6 | from json import loads 7 | 8 | import logging as logme 9 | 10 | from .tweet import utc_to_local, Tweet_formats 11 | 12 | 13 | class NoMoreTweetsException(Exception): 14 | def __init__(self, msg): 15 | super().__init__(msg) 16 | 17 | 18 | def Follow(response): 19 | logme.debug(__name__ + ':Follow') 20 | soup = BeautifulSoup(response, "html.parser") 21 | follow = soup.find_all("td", "info fifty screenname") 22 | cursor = soup.find_all("div", "w-button-more") 23 | try: 24 | cursor = findall(r'cursor=(.*?)">', str(cursor))[0] 25 | except IndexError: 26 | logme.critical(__name__ + ':Follow:IndexError') 27 | 28 | return follow, cursor 29 | 30 | 31 | # TODO: this won't be used by --profile-full anymore. 
if it isn't used anywhere else, perhaps remove this in future 32 | def Mobile(response): 33 | logme.debug(__name__ + ':Mobile') 34 | soup = BeautifulSoup(response, "html.parser") 35 | tweets = soup.find_all("span", "metadata") 36 | max_id = soup.find_all("div", "w-button-more") 37 | try: 38 | max_id = findall(r'max_id=(.*?)">', str(max_id))[0] 39 | except Exception as e: 40 | logme.critical(__name__ + ':Mobile:' + str(e)) 41 | 42 | return tweets, max_id 43 | 44 | 45 | def MobileFav(response): 46 | soup = BeautifulSoup(response, "html.parser") 47 | tweets = soup.find_all("table", "tweet") 48 | max_id = soup.find_all("div", "w-button-more") 49 | try: 50 | max_id = findall(r'max_id=(.*?)">', str(max_id))[0] 51 | except Exception as e: 52 | print(str(e) + " [x] feed.MobileFav") 53 | 54 | return tweets, max_id 55 | 56 | 57 | def _get_cursor(response): 58 | try: 59 | next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][ 60 | 'operation']['cursor']['value'] 61 | except KeyError: 62 | # this is needed because after the first request location of cursor is changed 63 | next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][ 64 | 'cursor']['value'] 65 | return next_cursor 66 | 67 | 68 | def Json(response): 69 | logme.debug(__name__ + ':Json') 70 | json_response = loads(response) 71 | html = json_response["items_html"] 72 | soup = BeautifulSoup(html, "html.parser") 73 | feed = soup.find_all("div", "tweet") 74 | return feed, json_response["min_position"] 75 | 76 | 77 | def parse_tweets(config, response): 78 | logme.debug(__name__ + ':parse_tweets') 79 | response = loads(response) 80 | if len(response['globalObjects']['tweets']) == 0: 81 | msg = 'No more data!' 
82 | raise NoMoreTweetsException(msg) 83 | feed = [] 84 | for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']: 85 | # this will handle the cases when the timeline entry is a tweet 86 | if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or 87 | timeline_entry['entryId'].startswith('tweet-')): 88 | if 'tweet' in timeline_entry['content']['item']['content']: 89 | _id = timeline_entry['content']['item']['content']['tweet']['id'] 90 | # skip the ads 91 | if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']: 92 | continue 93 | elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \ 94 | timeline_entry['content']['item']['content']['tombstone']: 95 | _id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id'] 96 | else: 97 | _id = None 98 | if _id is None: 99 | raise ValueError('Unable to find ID of tweet in timeline.') 100 | try: 101 | temp_obj = response['globalObjects']['tweets'][_id] 102 | except KeyError: 103 | logme.info('encountered a deleted tweet with id {}'.format(_id)) 104 | 105 | config.deleted.append(_id) 106 | continue 107 | temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']] 108 | if 'retweeted_status_id_str' in temp_obj: 109 | rt_id = temp_obj['retweeted_status_id_str'] 110 | _dt = response['globalObjects']['tweets'][rt_id]['created_at'] 111 | _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y') 112 | _dt = utc_to_local(_dt) 113 | _dt = str(_dt.strftime(Tweet_formats['datetime'])) 114 | temp_obj['retweet_data'] = { 115 | 'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'], 116 | 'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'], 117 | 'retweet_id': rt_id, 118 | 'retweet_date': _dt, 119 | } 120 | feed.append(temp_obj) 121 | next_cursor = _get_cursor(response) 122 | return feed, next_cursor 123 | 
# --- twint/format.py ---
import logging as logme


def Tweet(config, t):
    """Render tweet `t` as one line of text.

    Uses the user template in config.Format when present (placeholder
    substitution), otherwise the default "<id> <date> ... <user> text" line.
    """
    if config.Format:
        logme.debug(__name__ + ':Tweet:Format')
        output = config.Format.replace("{id}", t.id_str)
        output = output.replace("{conversation_id}", t.conversation_id)
        output = output.replace("{date}", t.datestamp)
        output = output.replace("{time}", t.timestamp)
        output = output.replace("{user_id}", t.user_id_str)
        output = output.replace("{username}", t.username)
        output = output.replace("{name}", t.name)
        output = output.replace("{place}", t.place)
        output = output.replace("{timezone}", t.timezone)
        output = output.replace("{urls}", ",".join(t.urls))
        output = output.replace("{photos}", ",".join(t.photos))
        output = output.replace("{video}", str(t.video))
        output = output.replace("{thumbnail}", t.thumbnail)
        output = output.replace("{tweet}", t.tweet)
        output = output.replace("{language}", t.lang)
        output = output.replace("{hashtags}", ",".join(t.hashtags))
        output = output.replace("{cashtags}", ",".join(t.cashtags))
        output = output.replace("{replies}", t.replies_count)
        output = output.replace("{retweets}", t.retweets_count)
        output = output.replace("{likes}", t.likes_count)
        output = output.replace("{link}", t.link)
        output = output.replace("{is_retweet}", str(t.retweet))
        output = output.replace("{user_rt_id}", str(t.user_rt_id))
        output = output.replace("{quote_url}", t.quote_url)
        output = output.replace("{near}", t.near)
        output = output.replace("{geo}", t.geo)
        output = output.replace("{mentions}", ",".join(t.mentions))
        output = output.replace("{translate}", t.translate)
        output = output.replace("{trans_src}", t.trans_src)
        output = output.replace("{trans_dest}", t.trans_dest)
    else:
        logme.debug(__name__ + ':Tweet:notFormat')
        output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "

        # TODO: someone who is familiar with this code, needs to take a look at what this is
        # if t.retweet:
        #     output += "RT "

        output += f"<{t.username}> {t.tweet}"

        if config.Show_hashtags:
            hashtags = ",".join(t.hashtags)
            output += f" {hashtags}"
        if config.Show_cashtags:
            cashtags = ",".join(t.cashtags)
            output += f" {cashtags}"
        if config.Stats:
            output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
        if config.Translate:
            output += f" {t.translate} {t.trans_src} {t.trans_dest}"
    return output


def User(_format, u):
    """Render user `u` via template `_format`, or the default pipe-separated line."""
    if _format:
        logme.debug(__name__ + ':User:Format')
        output = _format.replace("{id}", str(u.id))
        output = output.replace("{name}", u.name)
        output = output.replace("{username}", u.username)
        output = output.replace("{bio}", u.bio)
        output = output.replace("{location}", u.location)
        output = output.replace("{url}", u.url)
        output = output.replace("{join_date}", u.join_date)
        output = output.replace("{join_time}", u.join_time)
        output = output.replace("{tweets}", str(u.tweets))
        output = output.replace("{following}", str(u.following))
        output = output.replace("{followers}", str(u.followers))
        output = output.replace("{likes}", str(u.likes))
        output = output.replace("{media}", str(u.media_count))
        output = output.replace("{private}", str(u.is_private))
        output = output.replace("{verified}", str(u.is_verified))
        output = output.replace("{avatar}", u.avatar)
        if u.background_image:
            output = output.replace("{background_image}", u.background_image)
        else:
            output = output.replace("{background_image}", "")
    else:
        logme.debug(__name__ + ':User:notFormat')
        output = f"{u.id} | {u.name} | @{u.username} | Private: "
        output += f"{u.is_private} | Verified: {u.is_verified} |"
        output += f" Bio: {u.bio} | Location: {u.location} | Url: "
        output += f"{u.url} | Joined: {u.join_date} {u.join_time} "
        output += f"| Tweets: {u.tweets} | Following: {u.following}"
        output += f" | Followers: {u.followers} | Likes: {u.likes} "
        output += f"| Media: {u.media_count} | Avatar: {u.avatar}"

    return output


# --- twint/get.py ---
from async_timeout import timeout
from datetime import datetime
from bs4 import BeautifulSoup
import sys
import socket
import aiohttp
from fake_useragent import UserAgent
import asyncio
import concurrent.futures
import random
from json import loads, dumps
from aiohttp_socks import ProxyConnector, ProxyType
from urllib.parse import quote

from . import url
from .output import Tweets, Users
from .token import TokenExpiryException

import logging as logme

# Set by get_connector() when an HTTP proxy is configured; passed to aiohttp
# per request (HTTP proxies use the `proxy=` argument, not a connector).
httpproxy = None

# Fallback desktop user agents used when fake_useragent cannot be reached.
user_agent_list = [
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
    'CLR 3.5.30729)',
]


# function to convert python `dict` to json and then encode it to be passed in the url as a parameter
# some urls require this format
def dict_to_url(dct):
    return quote(dumps(dct))


def get_connector(config):
    """Build an aiohttp connector for the configured SOCKS/Tor proxy.

    HTTP proxies are handled via the module-level `httpproxy` URL instead of
    a connector. Exits the process on inconsistent proxy settings.
    """
    logme.debug(__name__ + ':get_connector')
    _connector = None
    if config.Proxy_host:
        if config.Proxy_host.lower() == "tor":
            _connector = ProxyConnector(
                host='127.0.0.1',
                port=9050,
                rdns=True)
        elif config.Proxy_port and config.Proxy_type:
            if config.Proxy_type.lower() == "socks5":
                _type = ProxyType.SOCKS5
            elif config.Proxy_type.lower() == "socks4":
                _type = ProxyType.SOCKS4
            elif config.Proxy_type.lower() == "http":
                global httpproxy
                httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port)
                return _connector
            else:
                logme.critical("get_connector:proxy-type-error")
                print("Error: Proxy types allowed are: http, socks5 and socks4. No https.")
                sys.exit(1)
            _connector = ProxyConnector(
                proxy_type=_type,
                host=config.Proxy_host,
                port=config.Proxy_port,
                rdns=True)
        else:
            logme.critical(__name__ + ':get_connector:proxy-port-type-error')
            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
            sys.exit(1)
    else:
        if config.Proxy_port or config.Proxy_type:
            logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
            sys.exit(1)

    return _connector


async def RequestUrl(config, init, headers=None):
    """Build the URL for the configured scrape mode and fetch it.

    headers: optional extra (name, value) pairs merged with the token
    headers. run.Twint.Feed already calls this function with a
    headers=[("User-Agent", ...)] keyword, which previously raised a
    TypeError because the parameter did not exist.
    """
    logme.debug(__name__ + ':RequestUrl')
    _connector = get_connector(config)
    _serialQuery = ""
    params = []
    _url = ""
    _headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]
    if headers:
        _headers.extend(headers)

    # TODO : do this later
    if config.Profile:
        logme.debug(__name__ + ':RequestUrl:Profile')
        _url, params, _serialQuery = url.SearchProfile(config, init)
    elif config.TwitterSearch:
        logme.debug(__name__ + ':RequestUrl:TwitterSearch')
        _url, params, _serialQuery = await url.Search(config, init)
    else:
        if config.Following:
            logme.debug(__name__ + ':RequestUrl:Following')
            _url = await url.Following(config.Username, init)
        elif config.Followers:
            logme.debug(__name__ + ':RequestUrl:Followers')
            _url = await url.Followers(config.Username, init)
        else:
            logme.debug(__name__ + ':RequestUrl:Favorites')
            _url = await url.Favorites(config.Username, init)
        _serialQuery = _url

    response = await Request(_url, params=params, connector=_connector, headers=_headers)

    if config.Debug:
        print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8"))

    return response


def ForceNewTorIdentity(config):
    """Ask the local Tor control port for a fresh circuit (SIGNAL NEWNYM)."""
    logme.debug(__name__ + ':ForceNewTorIdentity')
    try:
        tor_c = socket.create_connection(('127.0.0.1', config.Tor_control_port))
        tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode())
        response = tor_c.recv(1024)
        if response != b'250 OK\r\n250 OK\r\n':
            sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
            logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
    except Exception as e:
        logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
        sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
        sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')


async def Request(_url, connector=None, params=None, headers=None):
    """Open a throwaway ClientSession and GET `_url`, returning the body text."""
    logme.debug(__name__ + ':Request:Connector')
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        return await Response(session, _url, params)


async def Response(session, _url, params=None):
    """GET `_url` with a 120 s timeout; raise TokenExpiryException on HTTP 429."""
    logme.debug(__name__ + ':Response')
    with timeout(120):
        async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
            resp = await response.text()
            if response.status == 429:  # 429 implies Too many requests i.e. Rate Limit Exceeded
                raise TokenExpiryException(loads(resp)['errors'][0]['message'])
            return resp


async def RandomUserAgent(wa=None):
    """Return a user-agent string: a fixed Chrome UA when `wa`, else a random one."""
    logme.debug(__name__ + ':RandomUserAgent')
    try:
        if wa:
            return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        return UserAgent(verify_ssl=False, use_cache_server=False).random
    except Exception:
        # fake_useragent may fail to fetch its data; fall back to the static list
        return random.choice(user_agent_list)


async def Username(_id, bearer_token, guest_token):
    """Resolve a numeric user id to a screen name via the GraphQL API."""
    logme.debug(__name__ + ':Username')
    _dct = {'userId': _id, 'withHighlightedLabel': False}
    _url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct))
    _headers = {
        'authorization': bearer_token,
        'x-guest-token': guest_token,
    }
    r = await Request(_url, headers=_headers)
    j_r = loads(r)
    username = j_r['data']['user']['legacy']['screen_name']
    return username


async def Tweet(url, config, conn):
    """Fetch a single tweet page and feed its tweet divs to output.Tweets."""
    logme.debug(__name__ + ':Tweet')
    try:
        response = await Request(url)
        soup = BeautifulSoup(response, "html.parser")
        tweets = soup.find_all("div", "tweet")
        await Tweets(tweets, config, conn, url)
    except Exception as e:
        logme.critical(__name__ + ':Tweet:' + str(e))


async def User(username, config, conn, user_id=False):
    """Fetch a user profile; return the rest_id when `user_id`, else store it."""
    logme.debug(__name__ + ':User')
    _dct = {'screen_name': username, 'withHighlightedLabel': False}
    _url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'\
        .format(dict_to_url(_dct))
    _headers = {
        'authorization': config.Bearer_token,
        'x-guest-token': config.Guest_token,
    }
    try:
        response = await Request(_url, headers=_headers)
        j_r = loads(response)
        if user_id:
            try:
                _id = j_r['data']['user']['rest_id']
                return _id
            except KeyError as e:
                logme.critical(__name__ + ':User:' + str(e))
                return
        await Users(j_r, config, conn)
    except Exception as e:
        logme.critical(__name__ + ':User:' + str(e))
        raise


def Limit(Limit, count):
    """Return True once `count` reaches the user-configured limit (if any)."""
    logme.debug(__name__ + ':Limit')
    if Limit is not None and count >= int(Limit):
        return True


async def Multi(feed, config, conn):
    """Fetch every tweet/user page referenced by `feed` concurrently.

    Returns the number of feed entries handled. The old implementation
    awaited User()/Tweet() and passed their None result to
    loop.run_in_executor, which made asyncio.gather fail with
    "'NoneType' object is not callable" (as its own comment admitted);
    scheduling the coroutines directly fixes that.
    """
    logme.debug(__name__ + ':Multi')
    count = 0
    try:
        futures = []
        for tweet in feed:
            count += 1
            if config.Favorites or config.Profile_full:
                logme.debug(__name__ + ':Multi:Favorites-profileFull')
                link = tweet.find("a")["href"]
                url = f"https://twitter.com{link}&lang=en"
            elif config.User_full:
                logme.debug(__name__ + ':Multi:userFull')
                username = tweet.find("a")["name"]
                url = f"http://twitter.com/{username}?lang=en"
            else:
                logme.debug(__name__ + ':Multi:else-url')
                link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
                url = f"https://twitter.com{link}?lang=en"

            if config.User_full:
                logme.debug(__name__ + ':Multi:user-full-Run')
                futures.append(User(url, config, conn))
            else:
                logme.debug(__name__ + ':Multi:notUser-full-Run')
                futures.append(Tweet(url, config, conn))
        logme.debug(__name__ + ':Multi:asyncioGather')
        await asyncio.gather(*futures)
    except Exception as e:
        logme.critical(__name__ + ':Multi:' + str(e))

    return count
# --- twint/output.py ---
from datetime import datetime

from . import format, get
from .tweet import Tweet
from .user import User
from .storage import db, elasticsearch, write, panda

import logging as logme

follows_list = []
tweets_list = []
users_list = []

# Usernames seen so far when lowercasing authors
# (was `{''}` followed by pop(); a plain empty set is equivalent and clearer).
author_list = set()

# used by Pandas
_follows_object = {}


def _formatDateTime(datetimestamp):
    """Convert 'YYYY-MM-DD[ HH:MM:SS]' to an integer unix timestamp."""
    try:
        return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
    except ValueError:
        return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())


def _clean_follow_list():
    """Reset the Pandas follow accumulator."""
    logme.debug(__name__ + ':clean_follow_list')
    global _follows_object
    _follows_object = {}


def clean_lists():
    """Reset the module-level result accumulators."""
    logme.debug(__name__ + ':clean_lists')
    global follows_list
    global tweets_list
    global users_list
    follows_list = []
    tweets_list = []
    users_list = []


def datecheck(datetimestamp, config):
    """Return True when `datetimestamp` falls inside config.Since/Until."""
    logme.debug(__name__ + ':datecheck')
    if config.Since:
        logme.debug(__name__ + ':datecheck:SinceTrue')
        d = _formatDateTime(datetimestamp)
        s = _formatDateTime(config.Since)
        if d < s:
            return False
    if config.Until:
        logme.debug(__name__ + ':datecheck:UntilTrue')
        d = _formatDateTime(datetimestamp)
        s = _formatDateTime(config.Until)
        if d > s:
            return False
    logme.debug(__name__ + ':datecheck:dateRangeFalse')
    return True


# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
# `tweets` list along with the other tweets
def is_tweet(tw):
    """Return True when `tw` carries a data-item-id attribute (i.e. is a tweet)."""
    try:
        tw["data-item-id"]
        logme.debug(__name__ + ':is_tweet:True')
        return True
    except Exception:
        logme.critical(__name__ + ':is_tweet:False')
        return False


def _output(obj, output, config, **extra):
    """Route one tweet/user/username to the configured sinks (file/ES/stdout)."""
    logme.debug(__name__ + ':_output')
    if config.Lowercase:
        if isinstance(obj, str):
            logme.debug(__name__ + ':_output:Lowercase:username')
            obj = obj.lower()
        elif obj.__class__.__name__ == "user":
            logme.debug(__name__ + ':_output:Lowercase:user')
            pass
        elif obj.__class__.__name__ == "tweet":
            logme.debug(__name__ + ':_output:Lowercase:tweet')
            obj.username = obj.username.lower()
            author_list.update({obj.username})
            for dct in obj.mentions:
                for key, val in dct.items():
                    dct[key] = val.lower()
            for i in range(len(obj.hashtags)):
                obj.hashtags[i] = obj.hashtags[i].lower()
            for i in range(len(obj.cashtags)):
                obj.cashtags[i] = obj.cashtags[i].lower()
        else:
            logme.info('_output:Lowercase:hiddenTweetFound')
            print("[x] Hidden tweet found, account suspended due to violation of TOS")
            return
    if config.Output is not None:
        if config.Store_csv:
            try:
                write.Csv(obj, config)
                logme.debug(__name__ + ':_output:CSV')
            except Exception as e:
                logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
                print(str(e) + " [x] output._output")
        elif config.Store_json:
            write.Json(obj, config)
            logme.debug(__name__ + ':_output:JSON')
        else:
            write.Text(output, config.Output)
            logme.debug(__name__ + ':_output:Text')

    if config.Elasticsearch:
        logme.debug(__name__ + ':_output:Elasticsearch')
        print("", end=".", flush=True)
    else:
        if not config.Hide_output:
            try:
                print(output.replace('\n', ' '))
            except UnicodeEncodeError:
                logme.critical(__name__ + ':_output:UnicodeEncodeError')
                print("unicode error [x] output._output")


async def checkData(tweet, config, conn):
    """Validate, date-filter, store and emit a single raw tweet."""
    logme.debug(__name__ + ':checkData')
    tweet = Tweet(tweet, config)
    if not tweet.datestamp:
        logme.critical(__name__ + ':checkData:hiddenTweetFound')
        print("[x] Hidden tweet found, account suspended due to violation of TOS")
        return
    if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
        output = format.Tweet(config, tweet)
        if config.Database:
            logme.debug(__name__ + ':checkData:Database')
            db.tweets(conn, tweet, config)
        if config.Pandas:
            logme.debug(__name__ + ':checkData:Pandas')
            panda.update(tweet, config)
        if config.Store_object:
            logme.debug(__name__ + ':checkData:Store_object')
            if hasattr(config.Store_object_tweets_list, 'append'):
                config.Store_object_tweets_list.append(tweet)
            else:
                tweets_list.append(tweet)
        if config.Elasticsearch:
            logme.debug(__name__ + ':checkData:Elasticsearch')
            elasticsearch.Tweet(tweet, config)
        _output(tweet, output, config)
    # else:
    #     logme.critical(__name__+':checkData:copyrightedTweet')


async def Tweets(tweets, config, conn, url=''):
    """Dispatch scraped tweets to checkData.

    url: optional source URL; get.Tweet already passes a 4th positional
    argument, which previously raised a TypeError because this parameter
    did not exist.
    """
    logme.debug(__name__ + ':Tweets')
    if config.Favorites or config.Location:
        logme.debug(__name__ + ':Tweets:fav+full+loc')
        for tw in tweets:
            await checkData(tw, config, conn)
    elif config.TwitterSearch or config.Profile:
        logme.debug(__name__ + ':Tweets:TwitterSearch')
        await checkData(tweets, config, conn)
    else:
        logme.debug(__name__ + ':Tweets:else')
        if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
            await checkData(tweets, config, conn)


async def Users(u, config, conn):
    """Store and emit one user profile."""
    logme.debug(__name__ + ':User')
    global users_list

    user = User(u)
    output = format.User(config.Format, user)

    if config.Database:
        logme.debug(__name__ + ':User:Database')
        db.user(conn, config, user)

    if config.Elasticsearch:
        logme.debug(__name__ + ':User:Elasticsearch')
        # ES needs ISO-ish date/time; restore the original strings afterwards.
        _save_date = user.join_date
        _save_time = user.join_time
        user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
        user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
        elasticsearch.UserProfile(user, config)
        user.join_date = _save_date
        user.join_time = _save_time

    if config.Store_object:
        logme.debug(__name__ + ':User:Store_object')
        if hasattr(config.Store_object_follow_list, 'append'):
            config.Store_object_follow_list.append(user)
        elif hasattr(config.Store_object_users_list, 'append'):
            config.Store_object_users_list.append(user)
        else:
            users_list.append(user)  # twint.user.user

    if config.Pandas:
        logme.debug(__name__ + ':User:Pandas+user')
        panda.update(user, config)

    _output(user, output, config)


async def Username(username, config, conn):
    """Store and emit one followed/following username."""
    logme.debug(__name__ + ':Username')
    global _follows_object
    global follows_list
    follow_var = config.Following * "following" + config.Followers * "followers"

    if config.Database:
        logme.debug(__name__ + ':Username:Database')
        db.follow(conn, config.Username, config.Followers, username)

    if config.Elasticsearch:
        logme.debug(__name__ + ':Username:Elasticsearch')
        elasticsearch.Follow(username, config)

    if config.Store_object:
        if hasattr(config.Store_object_follow_list, 'append'):
            config.Store_object_follow_list.append(username)
        else:
            follows_list.append(username)  # twint.user.user

    if config.Pandas:
        logme.debug(__name__ + ':Username:object+pandas')
        try:
            _ = _follows_object[config.Username][follow_var]
        except KeyError:
            _follows_object.update({config.Username: {follow_var: []}})
        _follows_object[config.Username][follow_var].append(username)
        if config.Pandas_au:
            logme.debug(__name__ + ':Username:object+pandas+au')
            panda.update(_follows_object[config.Username], config)
    _output(username, username, config)


# --- twint/run.py (portion visible in this chunk) ---
import sys, os, datetime
from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop

from . import datelock, feed, get, output, verbose, storage
from .token import TokenExpiryException
from . import token
from .storage import db
from .feed import NoMoreTweetsException

import logging as logme

import time

bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \
         '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'


class Twint:
    def __init__(self, config):
        """Prepare a scrape run: resume point, guest token, DB, date bounds."""
        logme.debug(__name__ + ':Twint:__init__')
        if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
            logme.debug(__name__ + ':Twint:__init__:Resume')
            self.init = self.get_resume(config.Resume)
        else:
            self.init = -1

        config.deleted = []
        self.feed: list = [-1]
        self.count = 0
        self.user_agent = ""
        self.config = config
        self.config.Bearer_token = bearer
        # TODO might have to make some adjustments for it to work with multi-treading
        # USAGE : to get a new guest token simply do `self.token.refresh()`
        self.token = token.Token(config)
        self.token.refresh()
        self.conn = db.Conn(config.Database)
        self.d = datelock.Set(self.config.Until, self.config.Since)
        verbose.Elastic(config.Elasticsearch)

        if self.config.Store_object:
            logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
            output._clean_follow_list()

        if self.config.Pandas_clean:
            logme.debug(__name__ + ':Twint:__init__:pandas_clean')
            storage.panda.clean()

    def get_resume(self, resumeFile):
        """Return the last saved cursor from `resumeFile`, or '-1' if absent."""
        if not os.path.exists(resumeFile):
            return '-1'
        with open(resumeFile, 'r') as rFile:
            _init = rFile.readlines()[-1].strip('\n')
            return _init

    async def Feed(self):
        """Fetch one page of results into self.feed, with retry/backoff."""
        logme.debug(__name__ + ':Twint:Feed')
        consecutive_errors_count = 0
        while True:
            # this will receive a JSON string, parse it into a `dict` and do the required stuff
            try:
                response = await get.RequestUrl(self.config, self.init)
            except TokenExpiryException as e:
                logme.debug(__name__ + 'Twint:Feed:' + str(e))
                self.token.refresh()
                response = await get.RequestUrl(self.config, self.init)

            if self.config.Debug:
                print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))

            self.feed = []
            try:
                if self.config.Favorites:
                    self.feed, self.init = feed.MobileFav(response)
                    favorite_err_cnt = 0
                    if len(self.feed) == 0 and len(self.init) == 0:
                        while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5:
                            self.user_agent = await get.RandomUserAgent(wa=False)
                            response = await get.RequestUrl(self.config, self.init,
                                                            headers=[("User-Agent", self.user_agent)])
                            self.feed, self.init = feed.MobileFav(response)
                            favorite_err_cnt += 1
                            time.sleep(1)
                        if favorite_err_cnt == 5:
                            print("Favorite page could not be fetched")
                    if not self.count % 40:
                        time.sleep(5)
                elif self.config.Followers or self.config.Following:
                    self.feed, self.init = feed.Follow(response)
                    if not self.count % 40:
                        time.sleep(5)
                elif self.config.Profile or self.config.TwitterSearch:
                    try:
                        self.feed, self.init = feed.parse_tweets(self.config, response)
                    except NoMoreTweetsException as e:
                        logme.debug(__name__ + ':Twint:Feed:' + str(e))
                        print('[!] ' + str(e) + ' Scraping will stop now.')
                        print('found {} deleted tweets in this search.'.format(len(self.config.deleted)))
                        break
                break
            except TimeoutError as e:
                if self.config.Proxy_host.lower() == "tor":
                    print("[?] Timed out, changing Tor identity...")
                    if self.config.Tor_control_password is None:
                        logme.critical(__name__ + ':Twint:Feed:tor-password')
                        sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n")
                        sys.stderr.write(
                            "Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors"
                            "-controller-interface-directly\r\n")
                        break
                    else:
                        get.ForceNewTorIdentity(self.config)
                        continue
                else:
                    logme.critical(__name__ + ':Twint:Feed:' + str(e))
                    print(str(e))
                    break
            except Exception as e:
                if self.config.Profile or self.config.Favorites:
                    print("[!] Twitter does not return more data, scrape stops here.")
                    break

                logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
                # Sometimes Twitter says there is no data. But it's a lie.
                # raise
                consecutive_errors_count += 1
                if consecutive_errors_count < self.config.Retries_count:
                    # skip to the next iteration if wait time does not satisfy limit constraints
                    delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)

                    # if the delay is less than users set min wait time then replace delay
                    if self.config.Min_wait_time > delay:
                        delay = self.config.Min_wait_time

                    sys.stderr.write('sleeping for {} secs\n'.format(delay))
                    time.sleep(delay)
                    self.user_agent = await get.RandomUserAgent(wa=True)
                    continue
                logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
                sys.stderr.write(str(e) + " [x] run.Feed")
                sys.stderr.write(
                    "[!] if you get this error but you know for sure that more tweets exist, please open an issue and "
                    "we will investigate it!")
                break
        if self.config.Resume:
            print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))

    async def follow(self):
        """Scrape followers/following pages and emit each username."""
        await self.Feed()
        if self.config.User_full:
            logme.debug(__name__ + ':Twint:follow:userFull')
            self.count += await get.Multi(self.feed, self.config, self.conn)
        else:
            logme.debug(__name__ + ':Twint:follow:notUserFull')
            for user in self.feed:
                self.count += 1
                username = user.find("a")["name"]
                await output.Username(username, self.config, self.conn)

    async def favorite(self):
        """Scrape a favorites page into config.favorited_tweets_list."""
        logme.debug(__name__ + ':Twint:favorite')
        await self.Feed()
        favorited_tweets_list = []
        for tweet in self.feed:
            tweet_dict = {}
            self.count += 1
            try:
                tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id']
                t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"]
                tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1]
                tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ',
                                                                                                                 '')
                tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text
                date_str = tweet.find("td", {"class": "timestamp"}).find("a").text
                # test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"]
                if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"):  # 25m 1h
                    tweet_dict['date'] = str(datetime.date.today())
                elif ',' in date_str:  # Aug 21, 2019
                    sp = date_str.replace(',', '').split(' ')
                    date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2]
                    tweet_dict['date'] = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                elif len(date_str.split(' ')) == 3:  # 28 Jun 19
                    sp = date_str.split(' ')
                    if len(sp[2]) == 2:
                        sp[2] = '20' + sp[2]
                    date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2]
                    tweet_dict['date'] = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                else:  # Aug 21
                    sp = date_str.split(' ')
                    date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year)
                    tweet_dict['date'] = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")

                favorited_tweets_list.append(tweet_dict)

            except Exception as e:
                logme.critical(__name__ + ':Twint:favorite:favorite_field_lack')
                # date_str may be unbound if parsing failed before it was read,
                # so only report the underlying error here.
                print("[!] Favorite tweet could not be parsed: " + str(e))

        try:
            self.config.favorited_tweets_list += favorited_tweets_list
        except AttributeError:
            self.config.favorited_tweets_list = favorited_tweets_list

    async def profile(self):
        """Scrape a profile timeline and emit each tweet."""
        await self.Feed()
        logme.debug(__name__ + ':Twint:profile')
        for tweet in self.feed:
            self.count += 1
            await output.Tweets(tweet, self.config, self.conn)

    async def tweets(self):
        """Scrape search results and emit each tweet."""
        await self.Feed()
        # TODO : need to take care of this later
        if self.config.Location:
            logme.debug(__name__ + ':Twint:tweets:location')
            self.count += await get.Multi(self.feed, self.config, self.conn)
        else:
            logme.debug(__name__ + ':Twint:tweets:notLocation')
            for tweet in self.feed:
                self.count += 1
                await output.Tweets(tweet, self.config, self.conn)

    async def main(self, callback=None):
        # NOTE(review): the remainder of this method lies beyond the reviewed
        # chunk; only its first statement is visible here.
        task = ensure_future(self.run())  # Might be changed to create_task in 3.7+.
231 | 232 | if callback: 233 | task.add_done_callback(callback) 234 | 235 | await task 236 | 237 | async def run(self): 238 | if self.config.TwitterSearch: 239 | self.user_agent = await get.RandomUserAgent(wa=True) 240 | else: 241 | self.user_agent = await get.RandomUserAgent() 242 | 243 | if self.config.User_id is not None and self.config.Username is None: 244 | logme.debug(__name__ + ':Twint:main:user_id') 245 | self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token, 246 | self.config.Guest_token) 247 | 248 | if self.config.Username is not None and self.config.User_id is None: 249 | logme.debug(__name__ + ':Twint:main:username') 250 | 251 | self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True) 252 | if self.config.User_id is None: 253 | raise ValueError("Cannot find twitter account with name = " + self.config.Username) 254 | 255 | # TODO : will need to modify it to work with the new endpoints 256 | if self.config.TwitterSearch and self.config.Since and self.config.Until: 257 | logme.debug(__name__ + ':Twint:main:search+since+until') 258 | while self.d.since < self.d.until: 259 | self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S") 260 | self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S") 261 | if len(self.feed) > 0: 262 | await self.tweets() 263 | else: 264 | logme.debug(__name__ + ':Twint:main:gettingNewTweets') 265 | break 266 | 267 | if get.Limit(self.config.Limit, self.count): 268 | break 269 | elif self.config.Lookup: 270 | await self.Lookup() 271 | else: 272 | logme.debug(__name__ + ':Twint:main:not-search+since+until') 273 | while True: 274 | if len(self.feed) > 0: 275 | if self.config.Followers or self.config.Following: 276 | logme.debug(__name__ + ':Twint:main:follow') 277 | await self.follow() 278 | elif self.config.Favorites: 279 | logme.debug(__name__ + ':Twint:main:favorites') 280 | await self.favorite() 281 | elif 
self.config.Profile: 282 | logme.debug(__name__ + ':Twint:main:profile') 283 | await self.profile() 284 | elif self.config.TwitterSearch: 285 | logme.debug(__name__ + ':Twint:main:twitter-search') 286 | await self.tweets() 287 | else: 288 | logme.debug(__name__ + ':Twint:main:no-more-tweets') 289 | break 290 | 291 | # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2') 292 | if get.Limit(self.config.Limit, self.count): 293 | logme.debug(__name__ + ':Twint:main:reachedLimit') 294 | break 295 | 296 | if self.config.Count: 297 | verbose.Count(self.count, self.config) 298 | 299 | async def Lookup(self): 300 | logme.debug(__name__ + ':Twint:Lookup') 301 | 302 | try: 303 | if self.config.User_id is not None and self.config.Username is None: 304 | logme.debug(__name__ + ':Twint:Lookup:user_id') 305 | self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token, 306 | self.config.Guest_token) 307 | await get.User(self.config.Username, self.config, db.Conn(self.config.Database)) 308 | 309 | except Exception as e: 310 | logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.') 311 | raise 312 | 313 | 314 | def run(config, callback=None): 315 | logme.debug(__name__ + ':run') 316 | try: 317 | get_event_loop() 318 | except RuntimeError as e: 319 | if "no current event loop" in str(e): 320 | set_event_loop(new_event_loop()) 321 | else: 322 | logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.') 323 | raise 324 | except Exception as e: 325 | logme.exception( 326 | __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.') 327 | raise 328 | 329 | get_event_loop().run_until_complete(Twint(config).main(callback)) 330 | 331 | 332 | def Favorites(config): 333 | logme.debug(__name__ + ':Favorites') 334 | config.Favorites = True 335 | config.Following = False 336 | config.Followers = False 337 | config.Profile = False 338 | 
config.TwitterSearch = False 339 | run(config) 340 | if config.Pandas_au: 341 | storage.panda._autoget("tweet") 342 | 343 | 344 | def Followers(config): 345 | logme.debug(__name__ + ':Followers') 346 | config.Followers = True 347 | config.Following = False 348 | config.Profile = False 349 | config.Favorites = False 350 | config.TwitterSearch = False 351 | run(config) 352 | if config.Pandas_au: 353 | storage.panda._autoget("followers") 354 | if config.User_full: 355 | storage.panda._autoget("user") 356 | if config.Pandas_clean and not config.Store_object: 357 | # storage.panda.clean() 358 | output._clean_follow_list() 359 | 360 | 361 | def Following(config): 362 | logme.debug(__name__ + ':Following') 363 | config.Following = True 364 | config.Followers = False 365 | config.Profile = False 366 | config.Favorites = False 367 | config.TwitterSearch = False 368 | run(config) 369 | if config.Pandas_au: 370 | storage.panda._autoget("following") 371 | if config.User_full: 372 | storage.panda._autoget("user") 373 | if config.Pandas_clean and not config.Store_object: 374 | # storage.panda.clean() 375 | output._clean_follow_list() 376 | 377 | 378 | def Lookup(config): 379 | logme.debug(__name__ + ':Lookup') 380 | config.Profile = False 381 | config.Lookup = True 382 | config.Favorites = False 383 | config.FOllowing = False 384 | config.Followers = False 385 | config.TwitterSearch = False 386 | run(config) 387 | if config.Pandas_au: 388 | storage.panda._autoget("user") 389 | 390 | 391 | def Profile(config): 392 | logme.debug(__name__ + ':Profile') 393 | config.Profile = True 394 | config.Favorites = False 395 | config.Following = False 396 | config.Followers = False 397 | config.TwitterSearch = False 398 | run(config) 399 | if config.Pandas_au: 400 | storage.panda._autoget("tweet") 401 | 402 | 403 | def Search(config, callback=None): 404 | logme.debug(__name__ + ':Search') 405 | config.TwitterSearch = True 406 | config.Favorites = False 407 | config.Following = False 408 | 
config.Followers = False 409 | config.Profile = False 410 | run(config, callback) 411 | if config.Pandas_au: 412 | storage.panda._autoget("tweet") 413 | -------------------------------------------------------------------------------- /twint/storage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twintproject/twint/e7c8a0c764f6879188e5c21e25fb6f1f856a7221/twint/storage/__init__.py -------------------------------------------------------------------------------- /twint/storage/db.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import sys 3 | import time 4 | import hashlib 5 | 6 | from datetime import datetime 7 | 8 | def Conn(database): 9 | if database: 10 | print("[+] Inserting into Database: " + str(database)) 11 | conn = init(database) 12 | if isinstance(conn, str): # error 13 | print(conn) 14 | sys.exit(1) 15 | else: 16 | conn = "" 17 | 18 | return conn 19 | 20 | def init(db): 21 | try: 22 | conn = sqlite3.connect(db) 23 | cursor = conn.cursor() 24 | 25 | table_users = """ 26 | CREATE TABLE IF NOT EXISTS 27 | users( 28 | id integer not null, 29 | id_str text not null, 30 | name text, 31 | username text not null, 32 | bio text, 33 | location text, 34 | url text, 35 | join_date text not null, 36 | join_time text not null, 37 | tweets integer, 38 | following integer, 39 | followers integer, 40 | likes integer, 41 | media integer, 42 | private integer not null, 43 | verified integer not null, 44 | profile_image_url text not null, 45 | background_image text, 46 | hex_dig text not null, 47 | time_update integer not null, 48 | CONSTRAINT users_pk PRIMARY KEY (id, hex_dig) 49 | ); 50 | """ 51 | cursor.execute(table_users) 52 | 53 | table_tweets = """ 54 | CREATE TABLE IF NOT EXISTS 55 | tweets ( 56 | id integer not null, 57 | id_str text not null, 58 | tweet text default '', 59 | language text default '', 60 | conversation_id text not 
null, 61 | created_at integer not null, 62 | date text not null, 63 | time text not null, 64 | timezone text not null, 65 | place text default '', 66 | replies_count integer, 67 | likes_count integer, 68 | retweets_count integer, 69 | user_id integer not null, 70 | user_id_str text not null, 71 | screen_name text not null, 72 | name text default '', 73 | link text, 74 | mentions text, 75 | hashtags text, 76 | cashtags text, 77 | urls text, 78 | photos text, 79 | thumbnail text, 80 | quote_url text, 81 | video integer, 82 | geo text, 83 | near text, 84 | source text, 85 | time_update integer not null, 86 | `translate` text default '', 87 | trans_src text default '', 88 | trans_dest text default '', 89 | PRIMARY KEY (id) 90 | ); 91 | """ 92 | cursor.execute(table_tweets) 93 | 94 | table_retweets = """ 95 | CREATE TABLE IF NOT EXISTS 96 | retweets( 97 | user_id integer not null, 98 | username text not null, 99 | tweet_id integer not null, 100 | retweet_id integer not null, 101 | retweet_date integer, 102 | CONSTRAINT retweets_pk PRIMARY KEY(user_id, tweet_id), 103 | CONSTRAINT user_id_fk FOREIGN KEY(user_id) REFERENCES users(id), 104 | CONSTRAINT tweet_id_fk FOREIGN KEY(tweet_id) REFERENCES tweets(id) 105 | ); 106 | """ 107 | cursor.execute(table_retweets) 108 | 109 | table_reply_to = """ 110 | CREATE TABLE IF NOT EXISTS 111 | replies( 112 | tweet_id integer not null, 113 | user_id integer not null, 114 | username text not null, 115 | CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id), 116 | CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id) 117 | ); 118 | """ 119 | cursor.execute(table_reply_to) 120 | 121 | table_favorites = """ 122 | CREATE TABLE IF NOT EXISTS 123 | favorites( 124 | user_id integer not null, 125 | tweet_id integer not null, 126 | CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id), 127 | CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id), 128 | CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id) 
129 | ); 130 | """ 131 | cursor.execute(table_favorites) 132 | 133 | table_followers = """ 134 | CREATE TABLE IF NOT EXISTS 135 | followers ( 136 | id integer not null, 137 | follower_id integer not null, 138 | CONSTRAINT followers_pk PRIMARY KEY (id, follower_id), 139 | CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id), 140 | CONSTRAINT follower_id_fk FOREIGN KEY(follower_id) REFERENCES users(id) 141 | ); 142 | """ 143 | cursor.execute(table_followers) 144 | 145 | table_following = """ 146 | CREATE TABLE IF NOT EXISTS 147 | following ( 148 | id integer not null, 149 | following_id integer not null, 150 | CONSTRAINT following_pk PRIMARY KEY (id, following_id), 151 | CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id), 152 | CONSTRAINT following_id_fk FOREIGN KEY(following_id) REFERENCES users(id) 153 | ); 154 | """ 155 | cursor.execute(table_following) 156 | 157 | table_followers_names = """ 158 | CREATE TABLE IF NOT EXISTS 159 | followers_names ( 160 | user text not null, 161 | time_update integer not null, 162 | follower text not null, 163 | PRIMARY KEY (user, follower) 164 | ); 165 | """ 166 | cursor.execute(table_followers_names) 167 | 168 | table_following_names = """ 169 | CREATE TABLE IF NOT EXISTS 170 | following_names ( 171 | user text not null, 172 | time_update integer not null, 173 | follows text not null, 174 | PRIMARY KEY (user, follows) 175 | ); 176 | """ 177 | cursor.execute(table_following_names) 178 | 179 | return conn 180 | except Exception as e: 181 | return str(e) 182 | 183 | def fTable(Followers): 184 | if Followers: 185 | table = "followers_names" 186 | else: 187 | table = "following_names" 188 | 189 | return table 190 | 191 | def uTable(Followers): 192 | if Followers: 193 | table = "followers" 194 | else: 195 | table = "following" 196 | 197 | return table 198 | 199 | def follow(conn, Username, Followers, User): 200 | try: 201 | time_ms = round(time.time()*1000) 202 | cursor = conn.cursor() 203 | entry = (User, time_ms, Username,) 204 | 
table = fTable(Followers) 205 | query = f"INSERT INTO {table} VALUES(?,?,?)" 206 | cursor.execute(query, entry) 207 | conn.commit() 208 | except sqlite3.IntegrityError: 209 | pass 210 | 211 | def get_hash_id(conn, id): 212 | cursor = conn.cursor() 213 | cursor.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,)) 214 | resultset = cursor.fetchall() 215 | return resultset[0][0] if resultset else -1 216 | 217 | def user(conn, config, User): 218 | try: 219 | time_ms = round(time.time()*1000) 220 | cursor = conn.cursor() 221 | user = [int(User.id), User.id, User.name, User.username, User.bio, User.location, User.url,User.join_date, User.join_time, User.tweets, User.following, User.followers, User.likes, User.media_count, User.is_private, User.is_verified, User.avatar, User.background_image] 222 | 223 | hex_dig = hashlib.sha256(','.join(str(v) for v in user).encode()).hexdigest() 224 | entry = tuple(user) + (hex_dig,time_ms,) 225 | old_hash = get_hash_id(conn, User.id) 226 | 227 | if old_hash == -1 or old_hash != hex_dig: 228 | query = f"INSERT INTO users VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" 229 | cursor.execute(query, entry) 230 | else: 231 | pass 232 | 233 | if config.Followers or config.Following: 234 | table = uTable(config.Followers) 235 | query = f"INSERT INTO {table} VALUES(?,?)" 236 | cursor.execute(query, (config.User_id, int(User.id))) 237 | 238 | conn.commit() 239 | except sqlite3.IntegrityError: 240 | pass 241 | 242 | def tweets(conn, Tweet, config): 243 | try: 244 | time_ms = round(time.time()*1000) 245 | cursor = conn.cursor() 246 | entry = (Tweet.id, 247 | Tweet.id_str, 248 | Tweet.tweet, 249 | Tweet.lang, 250 | Tweet.conversation_id, 251 | Tweet.datetime, 252 | Tweet.datestamp, 253 | Tweet.timestamp, 254 | Tweet.timezone, 255 | Tweet.place, 256 | Tweet.replies_count, 257 | Tweet.likes_count, 258 | Tweet.retweets_count, 259 | Tweet.user_id, 260 | Tweet.user_id_str, 261 | Tweet.username, 262 | Tweet.name, 263 | Tweet.link, 264 | 
",".join(Tweet.mentions), 265 | ",".join(Tweet.hashtags), 266 | ",".join(Tweet.cashtags), 267 | ",".join(Tweet.urls), 268 | ",".join(Tweet.photos), 269 | Tweet.thumbnail, 270 | Tweet.quote_url, 271 | Tweet.video, 272 | Tweet.geo, 273 | Tweet.near, 274 | Tweet.source, 275 | time_ms, 276 | Tweet.translate, 277 | Tweet.trans_src, 278 | Tweet.trans_dest) 279 | cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry) 280 | 281 | if config.Favorites: 282 | query = 'INSERT INTO favorites VALUES(?,?)' 283 | cursor.execute(query, (config.User_id, Tweet.id)) 284 | 285 | if Tweet.retweet: 286 | query = 'INSERT INTO retweets VALUES(?,?,?,?,?)' 287 | _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S")) 288 | cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d)) 289 | 290 | if Tweet.reply_to: 291 | for reply in Tweet.reply_to: 292 | query = 'INSERT INTO replies VALUES(?,?,?)' 293 | cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username'])) 294 | 295 | conn.commit() 296 | except sqlite3.IntegrityError: 297 | pass 298 | -------------------------------------------------------------------------------- /twint/storage/elasticsearch.py: -------------------------------------------------------------------------------- 1 | ## TODO - Fix Weekday situation 2 | from elasticsearch import Elasticsearch, helpers 3 | from geopy.geocoders import Nominatim 4 | from datetime import datetime 5 | import contextlib 6 | import sys 7 | 8 | _index_tweet_status = False 9 | _index_follow_status = False 10 | _index_user_status = False 11 | _is_near_def = False 12 | _is_location_def = False 13 | _near = {} 14 | _location = {} 15 | 16 | geolocator = Nominatim(user_agent="twint-1.2") 17 | 18 | class RecycleObject(object): 19 | def write(self, junk): pass 20 | def flush(self): pass 21 | 22 | def getLocation(place, **options): 23 | location = 
geolocator.geocode(place,timeout=1000) 24 | if location: 25 | if options.get("near"): 26 | global _near 27 | _near = {"lat": location.latitude, "lon": location.longitude} 28 | return True 29 | elif options.get("location"): 30 | global _location 31 | _location = {"lat": location.latitude, "lon": location.longitude} 32 | return True 33 | return {"lat": location.latitude, "lon": location.longitude} 34 | else: 35 | return {} 36 | 37 | def handleIndexResponse(response): 38 | try: 39 | if response["status"] == 400: 40 | return True 41 | except KeyError: 42 | pass 43 | if response["acknowledged"]: 44 | print("[+] Index \"" + response["index"] + "\" created!") 45 | else: 46 | print("[x] error index creation :: storage.elasticsearch.handleIndexCreation") 47 | if response["shards_acknowledged"]: 48 | print("[+] Shards acknowledged, everything is ready to be used!") 49 | return True 50 | else: 51 | print("[x] error with shards :: storage.elasticsearch.HandleIndexCreation") 52 | return False 53 | 54 | def createIndex(config, instance, **scope): 55 | if scope.get("scope") == "tweet": 56 | tweets_body = { 57 | "mappings": { 58 | "properties": { 59 | "id": {"type": "long"}, 60 | "conversation_id": {"type": "long"}, 61 | "created_at": {"type": "text"}, 62 | "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, 63 | "timezone": {"type": "keyword"}, 64 | "place": {"type": "keyword"}, 65 | "location": {"type": "keyword"}, 66 | "tweet": {"type": "text"}, 67 | "lang": {"type": "keyword"}, 68 | "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"}, 69 | "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"}, 70 | "user_id_str": {"type": "keyword"}, 71 | "username": {"type": "keyword", "normalizer": "hashtag_normalizer"}, 72 | "name": {"type": "text"}, 73 | "profile_image_url": {"type": "text"}, 74 | "day": {"type": "integer"}, 75 | "hour": {"type": "integer"}, 76 | "link": {"type": "text"}, 77 | "retweet": {"type": "text"}, 78 | "essid": {"type": 
"keyword"}, 79 | "nlikes": {"type": "integer"}, 80 | "nreplies": {"type": "integer"}, 81 | "nretweets": {"type": "integer"}, 82 | "quote_url": {"type": "text"}, 83 | "video": {"type":"integer"}, 84 | "thumbnail": {"type":"text"}, 85 | "search": {"type": "text"}, 86 | "near": {"type": "text"}, 87 | "geo_near": {"type": "geo_point"}, 88 | "geo_tweet": {"type": "geo_point"}, 89 | "photos": {"type": "text"}, 90 | "user_rt_id": {"type": "keyword"}, 91 | "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"}, 92 | "source": {"type": "keyword"}, 93 | "user_rt": {"type": "keyword"}, 94 | "retweet_id": {"type": "keyword"}, 95 | "reply_to": { 96 | "type": "nested", 97 | "properties": { 98 | "user_id": {"type": "keyword"}, 99 | "username": {"type": "keyword"} 100 | } 101 | }, 102 | "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True}, 103 | "urls": {"type": "keyword"}, 104 | "translate": {"type": "text"}, 105 | "trans_src": {"type": "keyword"}, 106 | "trans_dest": {"type": "keyword"}, 107 | } 108 | }, 109 | "settings": { 110 | "number_of_shards": 1, 111 | "analysis": { 112 | "normalizer": { 113 | "hashtag_normalizer": { 114 | "type": "custom", 115 | "char_filter": [], 116 | "filter": ["lowercase", "asciifolding"] 117 | } 118 | } 119 | } 120 | } 121 | } 122 | with nostdout(): 123 | resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400) 124 | return handleIndexResponse(resp) 125 | elif scope.get("scope") == "follow": 126 | follow_body = { 127 | "mappings": { 128 | "properties": { 129 | "user": {"type": "keyword"}, 130 | "follow": {"type": "keyword"}, 131 | "essid": {"type": "keyword"} 132 | } 133 | }, 134 | "settings": { 135 | "number_of_shards": 1 136 | } 137 | } 138 | with nostdout(): 139 | resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400) 140 | return handleIndexResponse(resp) 141 | elif scope.get("scope") == "user": 142 | user_body = { 143 | "mappings": 
{ 144 | "properties": { 145 | "id": {"type": "keyword"}, 146 | "name": {"type": "keyword"}, 147 | "username": {"type": "keyword"}, 148 | "bio": {"type": "text"}, 149 | "location": {"type": "keyword"}, 150 | "url": {"type": "text"}, 151 | "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, 152 | "tweets": {"type": "integer"}, 153 | "following": {"type": "integer"}, 154 | "followers": {"type": "integer"}, 155 | "likes": {"type": "integer"}, 156 | "media": {"type": "integer"}, 157 | "private": {"type": "integer"}, 158 | "verified": {"type": "integer"}, 159 | "avatar": {"type": "text"}, 160 | "background_image": {"type": "text"}, 161 | "session": {"type": "keyword"}, 162 | "geo_user": {"type": "geo_point"} 163 | } 164 | }, 165 | "settings": { 166 | "number_of_shards": 1 167 | } 168 | } 169 | with nostdout(): 170 | resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400) 171 | return handleIndexResponse(resp) 172 | else: 173 | print("[x] error index pre-creation :: storage.elasticsearch.createIndex") 174 | return False 175 | 176 | @contextlib.contextmanager 177 | def nostdout(): 178 | savestdout = sys.stdout 179 | sys.stdout = RecycleObject() 180 | yield 181 | sys.stdout = savestdout 182 | 183 | def weekday(day): 184 | weekdays = { 185 | "Monday": 1, 186 | "Tuesday": 2, 187 | "Wednesday": 3, 188 | "Thursday": 4, 189 | "Friday": 5, 190 | "Saturday": 6, 191 | "Sunday": 7, 192 | } 193 | 194 | return weekdays[day] 195 | 196 | def Tweet(Tweet, config): 197 | global _index_tweet_status 198 | global _is_near_def 199 | date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z") 200 | 201 | actions = [] 202 | 203 | try: 204 | retweet = Tweet.retweet 205 | except AttributeError: 206 | retweet = None 207 | 208 | dt = f"{Tweet.datestamp} {Tweet.timestamp}" 209 | 210 | j_data = { 211 | "_index": config.Index_tweets, 212 | "_id": str(Tweet.id) + "_raw_" + config.Essid, 213 | "_source": { 214 | "id": str(Tweet.id), 215 | 
"conversation_id": Tweet.conversation_id, 216 | "created_at": Tweet.datetime, 217 | "date": dt, 218 | "timezone": Tweet.timezone, 219 | "place": Tweet.place, 220 | "tweet": Tweet.tweet, 221 | "language": Tweet.lang, 222 | "hashtags": Tweet.hashtags, 223 | "cashtags": Tweet.cashtags, 224 | "user_id_str": Tweet.user_id_str, 225 | "username": Tweet.username, 226 | "name": Tweet.name, 227 | "day": date_obj.weekday(), 228 | "hour": date_obj.hour, 229 | "link": Tweet.link, 230 | "retweet": retweet, 231 | "essid": config.Essid, 232 | "nlikes": int(Tweet.likes_count), 233 | "nreplies": int(Tweet.replies_count), 234 | "nretweets": int(Tweet.retweets_count), 235 | "quote_url": Tweet.quote_url, 236 | "video": Tweet.video, 237 | "search": str(config.Search), 238 | "near": config.Near 239 | } 240 | } 241 | if retweet is not None: 242 | j_data["_source"].update({"user_rt_id": Tweet.user_rt_id}) 243 | j_data["_source"].update({"user_rt": Tweet.user_rt}) 244 | j_data["_source"].update({"retweet_id": Tweet.retweet_id}) 245 | j_data["_source"].update({"retweet_date": Tweet.retweet_date}) 246 | if Tweet.reply_to: 247 | j_data["_source"].update({"reply_to": Tweet.reply_to}) 248 | if Tweet.photos: 249 | _photos = [] 250 | for photo in Tweet.photos: 251 | _photos.append(photo) 252 | j_data["_source"].update({"photos": _photos}) 253 | if Tweet.thumbnail: 254 | j_data["_source"].update({"thumbnail": Tweet.thumbnail}) 255 | if Tweet.mentions: 256 | _mentions = [] 257 | for mention in Tweet.mentions: 258 | _mentions.append(mention) 259 | j_data["_source"].update({"mentions": _mentions}) 260 | if Tweet.urls: 261 | _urls = [] 262 | for url in Tweet.urls: 263 | _urls.append(url) 264 | j_data["_source"].update({"urls": _urls}) 265 | if config.Near or config.Geo: 266 | if not _is_near_def: 267 | __geo = "" 268 | __near = "" 269 | if config.Geo: 270 | __geo = config.Geo 271 | if config.Near: 272 | __near = config.Near 273 | _is_near_def = getLocation(__near + __geo, near=True) 274 | if _near: 275 
| j_data["_source"].update({"geo_near": _near}) 276 | if Tweet.place: 277 | _t_place = getLocation(Tweet.place) 278 | if _t_place: 279 | j_data["_source"].update({"geo_tweet": getLocation(Tweet.place)}) 280 | if Tweet.source: 281 | j_data["_source"].update({"source": Tweet.Source}) 282 | if config.Translate: 283 | j_data["_source"].update({"translate": Tweet.translate}) 284 | j_data["_source"].update({"trans_src": Tweet.trans_src}) 285 | j_data["_source"].update({"trans_dest": Tweet.trans_dest}) 286 | 287 | actions.append(j_data) 288 | 289 | es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs) 290 | if not _index_tweet_status: 291 | _index_tweet_status = createIndex(config, es, scope="tweet") 292 | with nostdout(): 293 | helpers.bulk(es, actions, chunk_size=2000, request_timeout=200) 294 | actions = [] 295 | 296 | def Follow(user, config): 297 | global _index_follow_status 298 | actions = [] 299 | 300 | if config.Following: 301 | _user = config.Username 302 | _follow = user 303 | else: 304 | _user = user 305 | _follow = config.Username 306 | j_data = { 307 | "_index": config.Index_follow, 308 | "_id": _user + "_" + _follow + "_" + config.Essid, 309 | "_source": { 310 | "user": _user, 311 | "follow": _follow, 312 | "essid": config.Essid 313 | } 314 | } 315 | actions.append(j_data) 316 | 317 | es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs) 318 | if not _index_follow_status: 319 | _index_follow_status = createIndex(config, es, scope="follow") 320 | with nostdout(): 321 | helpers.bulk(es, actions, chunk_size=2000, request_timeout=200) 322 | actions = [] 323 | 324 | def UserProfile(user, config): 325 | global _index_user_status 326 | global _is_location_def 327 | actions = [] 328 | 329 | j_data = { 330 | "_index": config.Index_users, 331 | "_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid, 332 | "_source": { 333 | "id": user.id, 334 | "name": user.name, 335 | "username": user.username, 336 | 
"bio": user.bio, 337 | "location": user.location, 338 | "url": user.url, 339 | "join_datetime": user.join_date + " " + user.join_time, 340 | "tweets": user.tweets, 341 | "following": user.following, 342 | "followers": user.followers, 343 | "likes": user.likes, 344 | "media": user.media_count, 345 | "private": user.is_private, 346 | "verified": user.is_verified, 347 | "avatar": user.avatar, 348 | "background_image": user.background_image, 349 | "session": config.Essid 350 | } 351 | } 352 | if config.Location: 353 | if not _is_location_def: 354 | _is_location_def = getLocation(user.location, location=True) 355 | if _location: 356 | j_data["_source"].update({"geo_user": _location}) 357 | actions.append(j_data) 358 | 359 | es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs) 360 | if not _index_user_status: 361 | _index_user_status = createIndex(config, es, scope="user") 362 | with nostdout(): 363 | helpers.bulk(es, actions, chunk_size=2000, request_timeout=200) 364 | actions = [] 365 | -------------------------------------------------------------------------------- /twint/storage/panda.py: -------------------------------------------------------------------------------- 1 | import datetime, pandas as pd, warnings 2 | from time import strftime, localtime 3 | from twint.tweet import Tweet_formats 4 | 5 | Tweets_df = None 6 | Follow_df = None 7 | User_df = None 8 | 9 | _object_blocks = { 10 | "tweet": [], 11 | "user": [], 12 | "following": [], 13 | "followers": [] 14 | } 15 | 16 | weekdays = { 17 | "Monday": 1, 18 | "Tuesday": 2, 19 | "Wednesday": 3, 20 | "Thursday": 4, 21 | "Friday": 5, 22 | "Saturday": 6, 23 | "Sunday": 7, 24 | } 25 | 26 | _type = "" 27 | 28 | def _concat(df, _type): 29 | if df is None: 30 | df = pd.DataFrame(_object_blocks[_type]) 31 | else: 32 | _df = pd.DataFrame(_object_blocks[_type]) 33 | df = pd.concat([df, _df], sort=True) 34 | return df 35 | 36 | def _autoget(_type): 37 | global Tweets_df 38 | global Follow_df 39 | global 
User_df 40 | 41 | if _type == "tweet": 42 | Tweets_df = _concat(Tweets_df, _type) 43 | elif _type == "followers" or _type == "following": 44 | Follow_df = _concat(Follow_df, _type) 45 | elif _type == "user": 46 | User_df = _concat(User_df, _type) 47 | else: 48 | error("[x] Wrong type of object passed") 49 | 50 | 51 | def update(object, config): 52 | global _type 53 | 54 | #try: 55 | # _type = ((object.__class__.__name__ == "tweet")*"tweet" + 56 | # (object.__class__.__name__ == "user")*"user") 57 | #except AttributeError: 58 | # _type = config.Following*"following" + config.Followers*"followers" 59 | if object.__class__.__name__ == "tweet": 60 | _type = "tweet" 61 | elif object.__class__.__name__ == "user": 62 | _type = "user" 63 | elif object.__class__.__name__ == "dict": 64 | _type = config.Following*"following" + config.Followers*"followers" 65 | 66 | if _type == "tweet": 67 | Tweet = object 68 | datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000 69 | day = weekdays[strftime("%A", localtime(datetime_ms/1000))] 70 | dt = f"{object.datestamp} {object.timestamp}" 71 | _data = { 72 | "id": str(Tweet.id), 73 | "conversation_id": Tweet.conversation_id, 74 | "created_at": datetime_ms, 75 | "date": dt, 76 | "timezone": Tweet.timezone, 77 | "place": Tweet.place, 78 | "tweet": Tweet.tweet, 79 | "language": Tweet.lang, 80 | "hashtags": Tweet.hashtags, 81 | "cashtags": Tweet.cashtags, 82 | "user_id": Tweet.user_id, 83 | "user_id_str": Tweet.user_id_str, 84 | "username": Tweet.username, 85 | "name": Tweet.name, 86 | "day": day, 87 | "hour": strftime("%H", localtime(datetime_ms/1000)), 88 | "link": Tweet.link, 89 | "urls": Tweet.urls, 90 | "photos": Tweet.photos, 91 | "video": Tweet.video, 92 | "thumbnail": Tweet.thumbnail, 93 | "retweet": Tweet.retweet, 94 | "nlikes": int(Tweet.likes_count), 95 | "nreplies": int(Tweet.replies_count), 96 | "nretweets": int(Tweet.retweets_count), 97 | "quote_url": Tweet.quote_url, 98 | 
"search": str(config.Search), 99 | "near": Tweet.near, 100 | "geo": Tweet.geo, 101 | "source": Tweet.source, 102 | "user_rt_id": Tweet.user_rt_id, 103 | "user_rt": Tweet.user_rt, 104 | "retweet_id": Tweet.retweet_id, 105 | "reply_to": Tweet.reply_to, 106 | "retweet_date": Tweet.retweet_date, 107 | "translate": Tweet.translate, 108 | "trans_src": Tweet.trans_src, 109 | "trans_dest": Tweet.trans_dest 110 | } 111 | _object_blocks[_type].append(_data) 112 | elif _type == "user": 113 | user = object 114 | try: 115 | background_image = user.background_image 116 | except: 117 | background_image = "" 118 | _data = { 119 | "id": user.id, 120 | "name": user.name, 121 | "username": user.username, 122 | "bio": user.bio, 123 | "url": user.url, 124 | "join_datetime": user.join_date + " " + user.join_time, 125 | "join_date": user.join_date, 126 | "join_time": user.join_time, 127 | "tweets": user.tweets, 128 | "location": user.location, 129 | "following": user.following, 130 | "followers": user.followers, 131 | "likes": user.likes, 132 | "media": user.media_count, 133 | "private": user.is_private, 134 | "verified": user.is_verified, 135 | "avatar": user.avatar, 136 | "background_image": background_image, 137 | } 138 | _object_blocks[_type].append(_data) 139 | elif _type == "followers" or _type == "following": 140 | _data = { 141 | config.Following*"following" + config.Followers*"followers" : 142 | {config.Username: object[_type]} 143 | } 144 | _object_blocks[_type] = _data 145 | else: 146 | print("Wrong type of object passed!") 147 | 148 | 149 | def clean(): 150 | global Tweets_df 151 | global Follow_df 152 | global User_df 153 | _object_blocks["tweet"].clear() 154 | _object_blocks["following"].clear() 155 | _object_blocks["followers"].clear() 156 | _object_blocks["user"].clear() 157 | Tweets_df = None 158 | Follow_df = None 159 | User_df = None 160 | 161 | def save(_filename, _dataframe, **options): 162 | if options.get("dataname"): 163 | _dataname = options.get("dataname") 164 | 
else: 165 | _dataname = "twint" 166 | 167 | if not options.get("type"): 168 | with warnings.catch_warnings(): 169 | warnings.simplefilter("ignore") 170 | _store = pd.HDFStore(_filename + ".h5") 171 | _store[_dataname] = _dataframe 172 | _store.close() 173 | elif options.get("type") == "Pickle": 174 | with warnings.catch_warnings(): 175 | warnings.simplefilter("ignore") 176 | _dataframe.to_pickle(_filename + ".pkl") 177 | else: 178 | print("""Please specify: filename, DataFrame, DataFrame name and type 179 | (HDF5, default, or Pickle)""") 180 | 181 | def read(_filename, **options): 182 | if not options.get("dataname"): 183 | _dataname = "twint" 184 | else: 185 | _dataname = options.get("dataname") 186 | 187 | if not options.get("type"): 188 | _store = pd.HDFStore(_filename + ".h5") 189 | _df = _store[_dataname] 190 | return _df 191 | elif options.get("type") == "Pickle": 192 | _df = pd.read_pickle(_filename + ".pkl") 193 | return _df 194 | else: 195 | print("""Please specify: DataFrame, DataFrame name (twint as default), 196 | filename and type (HDF5, default, or Pickle""") 197 | -------------------------------------------------------------------------------- /twint/storage/write.py: -------------------------------------------------------------------------------- 1 | from . 
def outputExt(objType, fType):
    """Return the default filename suffix ("/<type>s.<ext>") for *objType*."""
    if objType == "str":
        objType = "username"
    outExt = f"/{objType}s.{fType}"

    return outExt

def addExt(base, objType, fType):
    """If *base* has no extension, treat it as a directory (creating it if
    needed) and append the default filename for *objType*/*fType*."""
    if len(base.split('.')) == 1:
        createDirIfMissing(base)
        base += outputExt(objType, fType)

    return base

def Text(entry, f):
    """Append *entry* (newlines flattened to spaces) as one line to file *f*."""
    # BUG FIX: the handle was opened inline via print(file=open(...)) and
    # never closed — a resource leak; `with` guarantees close/flush.
    with open(f, "a", encoding="utf-8") as output:
        print(entry.replace('\n', ' '), file=output)

def Type(config):
    """Map the run configuration to the object type being written."""
    if config.User_full:
        _type = "user"
    elif config.Followers or config.Following:
        _type = "username"
    else:
        _type = "tweet"

    return _type

def struct(obj, custom, _type):
    """Return (fieldnames, row) for *obj*; *custom* restricts the columns."""
    if custom:
        fieldnames = custom
        row = {}
        for f in fieldnames:
            row[f] = meta.Data(obj, _type)[f]
    else:
        fieldnames = meta.Fieldnames(_type)
        row = meta.Data(obj, _type)

    return fieldnames, row

def createDirIfMissing(dirname):
    """Create *dirname* (and parents) if it does not already exist."""
    if not os.path.exists(dirname):
        os.makedirs(dirname)

def Csv(obj, config):
    """Append *obj* as one CSV row, writing the header on first creation."""
    _obj_type = obj.__class__.__name__
    if _obj_type == "str":
        _obj_type = "username"
    fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)

    base = addExt(config.Output, _obj_type, "csv")
    # Tab-separated output when the config carries a Tabs flag.
    dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel'

    if not (os.path.exists(base)):
        with open(base, "w", newline='', encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
            writer.writeheader()

    with open(base, "a", newline='', encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
        writer.writerow(row)

def Json(obj, config):
    """Append *obj* as one JSON object per line (JSON Lines format)."""
    _obj_type = obj.__class__.__name__
    if _obj_type == "str":
        _obj_type = "username"
    null, data = struct(obj, config.Custom[_obj_type], _obj_type)

    base = addExt(config.Output, _obj_type, "json")

    with open(base, "a", newline='', encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False)
        json_file.write("\n")
def userData(u):
    """Flatten a scraped user object into a CSV/JSON-ready dict."""
    return {
        "id": int(u.id),
        "name": u.name,
        "username": u.username,
        "bio": u.bio,
        "location": u.location,
        "url": u.url,
        "join_date": u.join_date,
        "join_time": u.join_time,
        "tweets": int(u.tweets),
        "following": int(u.following),
        "followers": int(u.followers),
        "likes": int(u.likes),
        "media": int(u.media_count),
        "private": u.is_private,
        "verified": u.is_verified,
        "profile_image_url": u.avatar,
        "background_image": u.background_image,
    }

def userFieldnames():
    """Column order for user records."""
    return [
        "id",
        "name",
        "username",
        "bio",
        "location",
        "url",
        "join_date",
        "join_time",
        "tweets",
        "following",
        "followers",
        "likes",
        "media",
        "private",
        "verified",
        "profile_image_url",
        "background_image",
    ]

def usernameData(u):
    """Wrap a bare username string as a record."""
    return {"username": u}

def usernameFieldnames():
    """Column order for username records."""
    return ["username"]

def Data(obj, _type):
    """Serialize *obj* according to *_type*; anything unknown is a tweet."""
    if _type == "user":
        return userData(obj)
    if _type == "username":
        return usernameData(obj)
    return tweetData(obj)

def Fieldnames(_type):
    """Column order matching Data() for *_type*; anything unknown is a tweet."""
    if _type == "user":
        return userFieldnames()
    if _type == "username":
        return usernameFieldnames()
    return tweetFieldnames()
class TokenExpiryException(Exception):
    """Raised when the guest token is rejected by Twitter as expired."""
    def __init__(self, msg):
        super().__init__(msg)


class RefreshTokenException(Exception):
    """Raised when a fresh guest token could not be obtained."""
    def __init__(self, msg):
        super().__init__(msg)


class Token:
    """Fetches the twitter.com homepage and extracts a guest token from it."""

    def __init__(self, config):
        self._session = requests.Session()
        self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'})
        self.config = config
        self._retries = 5
        self._timeout = 10
        self.url = 'https://twitter.com'

    def _request(self):
        """GET self.url with retries and exponential back-off.

        Returns the response on success; raises RefreshTokenException after
        all retries fail (and clears config.Guest_token).
        """
        for attempt in range(self._retries + 1):
            # The request is newly prepared on each retry because of potential cookie updates.
            req = self._session.prepare_request(requests.Request('GET', self.url))
            logme.debug(f'Retrieving {req.url}')
            # BUG FIX: `success` was only assigned on the no-exception path, so a
            # failed first attempt crashed with UnboundLocalError at `if success:`
            # below instead of retrying. Initialize it per attempt.
            success = False
            try:
                r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
            except requests.exceptions.RequestException as exc:
                if attempt < self._retries:
                    retrying = ', retrying'
                    level = logme.WARNING
                else:
                    retrying = ''
                    level = logme.ERROR
                logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
            else:
                success, msg = (True, None)
                msg = f': {msg}' if msg else ''

            if success:
                logme.debug(f'{req.url} retrieved successfully{msg}')
                return r
            if attempt < self._retries:
                # TODO : might wanna tweak this back-off timer
                sleep_time = 2.0 * 2 ** attempt
                logme.info(f'Waiting {sleep_time:.0f} seconds')
                time.sleep(sleep_time)
            else:
                msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
                logme.fatal(msg)
                self.config.Guest_token = None
                raise RefreshTokenException(msg)

    def refresh(self):
        """Fetch the homepage and store the guest token on config.Guest_token.

        Raises RefreshTokenException if the token cannot be found in the HTML.
        """
        logme.debug('Retrieving guest token')
        res = self._request()
        match = re.search(r'\("gt=(\d+);', res.text)
        if match:
            logme.debug('Found guest token in HTML')
            self.config.Guest_token = str(match.group(1))
        else:
            self.config.Guest_token = None
            raise RefreshTokenException('Could not find the Guest token in HTML')
def getText(tw):
    """Return the tweet's full text, space-padding links and flattening newlines.

    Padding "http"/"pic.twitter" keeps URLs from fusing with the preceding
    word in plain-text output; newlines are flattened so each tweet stays on
    one output line.
    """
    logme.debug(__name__ + ':getText')
    text = tw['full_text']
    text = text.replace("http", " http")
    text = text.replace("pic.twitter", " pic.twitter")
    text = text.replace("\n", " ")

    return text


def Tweet(tw, config):
    """Build a `tweet` object from one raw adaptive-search JSON entry *tw*.

    Missing optional sections (entities, media, retweet data, quoted tweet)
    are tolerated via targeted KeyError handling; config supplies the
    search-context fields (near/geo/source) and optional translation.
    """
    logme.debug(__name__ + ':Tweet')
    t = tweet()
    t.id = int(tw['id_str'])
    t.id_str = tw["id_str"]
    t.conversation_id = tw["conversation_id_str"]

    # parsing date to user-friendly format
    _dt = tw['created_at']
    _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
    # Convert the UTC timestamp to the machine's local timezone before formatting.
    _dt = utc_to_local(_dt)
    t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
    # date is of the format year,
    t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
    t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
    t.user_id = int(tw["user_id_str"])
    t.user_id_str = tw["user_id_str"]
    t.username = tw["user_data"]['screen_name']
    t.name = tw["user_data"]['name']
    t.place = tw['geo'] if 'geo' in tw and tw['geo'] else ""
    # NOTE(review): this is the scraping machine's UTC offset, not the
    # tweet author's timezone — confirm downstream consumers expect that.
    t.timezone = strftime("%z", localtime())
    t.mentions = _get_mentions(tw)
    t.reply_to = _get_reply_to(tw)
    try:
        t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
    except KeyError:
        t.urls = []
    try:
        # Only direct photo attachments (expanded URL contains '/photo/').
        t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
                    _img['expanded_url'].find('/photo/') != -1]
    except KeyError:
        t.photos = []
    try:
        # Boolean-as-int flag: 1 when any extended media is attached.
        t.video = 1 if len(tw['extended_entities']['media']) else 0
    except KeyError:
        t.video = 0
    try:
        t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
    except KeyError:
        t.thumbnail = ''
    t.tweet = getText(tw)
    t.lang = tw['lang']
    try:
        t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
    except KeyError:
        t.hashtags = []
    try:
        t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
    except KeyError:
        t.cashtags = []
    t.replies_count = tw['reply_count']
    t.retweets_count = tw['retweet_count']
    t.likes_count = tw['favorite_count']
    t.link = f"https://twitter.com/{t.username}/status/{t.id}"
    try:
        # 'retweet_data' is only attached for retweets; its absence raises
        # KeyError and falls into the non-retweet defaults below.
        if 'user_rt_id' in tw['retweet_data']:
            t.retweet = True
            t.retweet_id = tw['retweet_data']['retweet_id']
            t.retweet_date = tw['retweet_data']['retweet_date']
            t.user_rt = tw['retweet_data']['user_rt']
            t.user_rt_id = tw['retweet_data']['user_rt_id']
    except KeyError:
        t.retweet = False
        t.retweet_id = ''
        t.retweet_date = ''
        t.user_rt = ''
        t.user_rt_id = ''
    try:
        t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
    except KeyError:
        # means that the quoted tweet have been deleted
        # NOTE(review): 0 here vs '' above is a type inconsistency — confirm
        # downstream formatters tolerate an int in this field.
        t.quote_url = 0
    t.near = config.Near if config.Near else ""
    t.geo = config.Geo if config.Geo else ""
    t.source = config.Source if config.Source else ""
    t.translate = ''
    t.trans_src = ''
    t.trans_dest = ''
    if config.Translate:
        try:
            ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
            t.translate = ts.text
            t.trans_src = ts.src
            t.trans_dest = ts.dest
        # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
        except ValueError as e:
            logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
            raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
    return t
async def Search(config, init):
    """Build the adaptive-search request for one page of results.

    Translates the run configuration into Twitter's advanced-search operators
    (from:/to:/since:/until:/filter:... appended to `q`) plus the fixed query
    parameters the endpoint expects. *init* is the pagination cursor ('-1'
    for the first page). Returns (url, params, serialized_query).
    """
    logme.debug(__name__ + ':Search')
    url = base
    tweet_count = 100
    q = ""
    params = [
        # ('include_blocking', '1'),
        # ('include_blocked_by', '1'),
        # ('include_followed_by', '1'),
        # ('include_want_retweets', '1'),
        # ('include_mute_edge', '1'),
        # ('include_can_dm', '1'),
        ('include_can_media_tag', '1'),
        # ('skip_status', '1'),
        # ('include_cards', '1'),
        ('include_ext_alt_text', 'true'),
        ('include_quote_count', 'true'),
        ('include_reply_count', '1'),
        ('tweet_mode', 'extended'),
        ('include_entities', 'true'),
        ('include_user_entities', 'true'),
        ('include_ext_media_availability', 'true'),
        ('send_error_codes', 'true'),
        ('simple_quoted_tweet', 'true'),
        ('count', tweet_count),
        # ('query_source', 'typed_query'),
        # ('pc', '1'),
        ('cursor', str(init)),
        ('spelling_corrections', '1'),
        ('ext', 'mediaStats%2ChighlightedLabel'),
        ('tweet_search_mode', 'live'),  # this can be handled better, maybe take an argument and set it then
    ]
    # 'f=tweets' requests the chronological feed instead of "Top" results.
    if not config.Popular_tweets:
        params.append(('f', 'tweets'))
    if config.Lang:
        params.append(("l", config.Lang))
        params.append(("lang", "en"))
    # NOTE(review): Query and Username both map to a from: operator, so
    # setting both produces two from: clauses — confirm that is intended.
    if config.Query:
        q += f" from:{config.Query}"
    if config.Username:
        q += f" from:{config.Username}"
    if config.Geo:
        # geocode: does not tolerate spaces in "lat,lon,radius".
        config.Geo = config.Geo.replace(" ", "")
        q += f" geocode:{config.Geo}"
    if config.Search:

        q += f" {config.Search}"
    if config.Year:
        # Year N means "tweets before N": until the first day of that year.
        q += f" until:{config.Year}-1-1"
    if config.Since:
        q += f" since:{_formatDate(config.Since)}"
    if config.Until:
        q += f" until:{_formatDate(config.Until)}"
    if config.Email:
        q += ' "mail" OR "email" OR'
        q += ' "gmail" OR "e-mail"'
    if config.Phone:
        q += ' "phone" OR "call me" OR "text me"'
    if config.Verified:
        q += " filter:verified"
    if config.To:
        q += f" to:{config.To}"
    if config.All:
        q += f" to:{config.All} OR from:{config.All} OR @{config.All}"
    if config.Near:
        q += f' near:"{config.Near}"'
    if config.Images:
        q += " filter:images"
    if config.Videos:
        q += " filter:videos"
    if config.Media:
        q += " filter:media"
    if config.Replies:
        q += " filter:replies"
    # although this filter can still be used, but I found it broken in my preliminary testing, needs more testing
    if config.Native_retweets:
        q += " filter:nativeretweets"
    if config.Min_likes:
        q += f" min_faves:{config.Min_likes}"
    if config.Min_retweets:
        q += f" min_retweets:{config.Min_retweets}"
    if config.Min_replies:
        q += f" min_replies:{config.Min_replies}"
    if config.Links == "include":
        q += " filter:links"
    elif config.Links == "exclude":
        q += " exclude:links"
    if config.Source:
        q += f" source:\"{config.Source}\""
    if config.Members_list:
        q += f" list:{config.Members_list}"
    if config.Filter_retweets:
        q += f" exclude:nativeretweets exclude:retweets"
    # A custom query, when given, replaces everything built above.
    if config.Custom_query:
        q = config.Custom_query

    q = q.strip()
    params.append(("q", q))
    _serialQuery = _sanitizeQuery(url, params)
    return url, params, _serialQuery
class user:
    """Plain attribute container for scraped profile data (filled by User())."""
    type = "user"

    def __init__(self):
        pass


User_formats = {
    'join_date': '%Y-%m-%d',
    'join_time': '%H:%M:%S %Z'
}


# ur object must be a json from the endpoint https://api.twitter.com/graphql
def User(ur):
    """Parse a GraphQL user payload into a `user` object.

    Raises KeyError with an explanatory message when the payload is missing
    the expected 'data'/'user' envelope.
    """
    logme.debug(__name__ + ':User')
    # BUG FIX: was `and`, which never caught a payload missing only 'user'
    # and crashed with a bare KeyError on ur['data'] when 'data' was missing.
    if 'data' not in ur or 'user' not in ur['data']:
        msg = 'malformed json! cannot be parsed to get user data'
        logme.fatal(msg)
        raise KeyError(msg)
    _usr = user()
    _usr.id = ur['data']['user']['rest_id']
    _legacy = ur['data']['user']['legacy']
    _usr.name = _legacy['name']
    _usr.username = _legacy['screen_name']
    _usr.bio = _legacy['description']
    _usr.location = _legacy['location']
    _usr.url = _legacy['url']
    # parsing date to user-friendly format
    _dt = datetime.datetime.strptime(_legacy['created_at'], '%a %b %d %H:%M:%S %z %Y')
    # date is of the format year,
    _usr.join_date = _dt.strftime(User_formats['join_date'])
    _usr.join_time = _dt.strftime(User_formats['join_time'])

    # :type `int`
    _usr.tweets = int(_legacy['statuses_count'])
    _usr.following = int(_legacy['friends_count'])
    _usr.followers = int(_legacy['followers_count'])
    _usr.likes = int(_legacy['favourites_count'])
    _usr.media_count = int(_legacy['media_count'])

    _usr.is_private = _legacy['protected']
    _usr.is_verified = _legacy['verified']
    _usr.avatar = _legacy['profile_image_url_https']
    # BUG FIX: 'profile_banner_url' is absent for users without a banner
    # (panda.py already guards this attribute); .get() yields None instead
    # of aborting the whole parse with a KeyError.
    _usr.background_image = _legacy.get('profile_banner_url')
    # TODO : future implementation
    # legacy_extended_profile is also available in some cases which can be used to get DOB of user
    return _usr
def Elastic(elasticsearch):
    """Announce the Elasticsearch instance being indexed to, if one is set."""
    if not elasticsearch:
        return
    print("[+] Indexing to Elasticsearch @ " + str(elasticsearch))