├── __init__.py ├── .vscode ├── settings.json └── launch.json ├── requirements.txt ├── config.py ├── config.ini ├── worker.py ├── tasks.py ├── README.md ├── .gitignore ├── arguments.py ├── elasticsearch └── index-tweets.json └── app.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/bin/python3" 3 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask-cors 3 | Celery 4 | twint 5 | numpy 6 | aiohttp_socks -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | config = configparser.ConfigParser() 3 | config.read('config.ini') 4 | if config["DEFAULT"]["DEV"]: 5 | config = config['DEV'] 6 | else: 7 | config = config['PROD'] -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | ; config.ini 2 | [DEFAULT] 3 | DEV = Yes 4 | 5 | [DEV] 6 | HOST = localhost 7 | PORT = 5000 8 | CELERY_BROKER_URL = pyamqp://guest@localhost// 9 | ALLOW_CORS = Yes 10 | FLASK_DEBUG = Yes 11 | 12 | [PROD] 13 | HOST = localhost 14 | PORT = 5001 15 | CELERY_BROKER_URL = pyamqp://guest@localhost// 16 | ALLOW_CORS = No 17 | FLASK_DEBUG = No -------------------------------------------------------------------------------- /worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | from celery import Celery 3 | from config import config 4 | 5 | CELERY_BROKER_URL = 
config['CELERY_BROKER_URL'] 6 | 7 | celery = Celery('flask_server', backend='amqp') 8 | celery.conf.update({ 9 | 'broker_url': config['CELERY_BROKER_URL'], 10 | 'backend': 'amqp', 11 | 'imports': ( 12 | 'tasks', 13 | ), 14 | 'task_routes': { 15 | 'fetch': {'queue': 'fetching'} 16 | # 'save': {'queue': 'saving'} 17 | }, 18 | 'task_serializer': 'json', 19 | 'result_serializer': 'json', 20 | 'accept_content': ['json']}) 21 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | import time 2 | from worker import celery 3 | from config import config 4 | import twint 5 | from arguments import TwintArguments 6 | 7 | @celery.task(name='fetch') 8 | def fetch(args): 9 | # Merge user arguments with default Twint config 10 | config = TwintArguments() 11 | config.__dict__.update(args) 12 | print("Start fetch task %s %s -> %s" % 13 | (config.id, config.Since, config.Until)) 14 | # Run 15 | twint.run.Search(config) 16 | # Finished 17 | return "Fetch task finished %s %s -> %s" % \ 18 | (config.id, config.Since, config.Until) 19 | 20 | # # Save/report progress if you want or example to firebase 21 | # @celery.task(name='save') 22 | # def save(args): 23 | # db = firebase.database() 24 | # # Make process entry on firebase 25 | # db.child("Processes").child(args['id']).set(args) 26 | # return "Progress saved" -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File (Integrated Terminal)", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "args": ["-m"] 14 | }, 15 | { 16 | "name": "Python: Flask", 17 | "type": "python", 18 | "request": "launch", 19 | "module": "flask", 20 | "env": { 21 | "FLASK_APP": "app.py" 22 | }, 23 | "args": [ 24 | "run" 25 | 26 | ], 27 | "jinja": true 28 | }, 29 | ] 30 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### [TWINT](https://github.com/twintproject/twint) Flask-Celery Server 2 | Optimized tweet scraping 3 | 4 | #### See also [Twint Kibana](https://github.com/Nedja995/twint_kibana) 5 | 6 | #### Requirements 7 | - Python3, [Twint](https://github.com/twintproject/twint), Flask, Celery 8 | - Elasticsearch (v7) 9 | - RabbitMQ 10 | - (optional) Flower 11 | 12 | #### Run server 13 | 14 | 1. Run Celery workers: 15 | - `$ celery worker --app=worker.celery --hostname=worker.fetching@%h --queues=fetching --loglevel=info` 16 | - (Optional) Worker for the progress-reporting task, if it is implemented: `$ celery worker --app=worker.celery --hostname=worker.saving@%h --queues=saving --loglevel=info` 17 | 18 | 2. Run Flask server: `$ python3 app.py` 19 | 20 | - (Optional) Monitor Celery with Flower: `$ celery -A app.celery flower --broker='pyamqp://guest@localhost//'` 21 | 22 | #### Use 23 | 24 | 1. Create ES index with [index-tweets.json](elasticsearch/index-tweets.json) 25 | 26 | 2. 
Start fetching tweets 27 | - Arguments are mapped to [twint config](https://github.com/twintproject/twint/blob/master/twint/config.py) 28 | - I mainly use it with Elasticsearch, so I have not tested it with other arguments 29 | - `Since`, `Until`, and either `Search` or `Username` are required 30 | ``` 31 | POST http://localhost:5000/fetch 32 | { 33 | "Since": "2019-2-1", 34 | "Until": "2019-3-1", 35 | "Search": "", 36 | // or 37 | "Username": "", 38 | "Elasticsearch": "localhost:9200", 39 | "Index_tweets": "" 40 | } 41 | ``` 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | # https://github.com/twintproject/twint/blob/master/twint/config.py 2 | # https://github.com/twintproject/twint/blob/master/twint/cli.py 3 | class TwintArguments(object): 4 | Username = None 5 | User_id = None 6 | Search = None 7 | Geo = "" 8 | Location = False 9 | Near = None 10 | Lang = None 11 | Output = None 12 | Elasticsearch = None 13 | Timedelta = None 14 | Year = None 15 | Since = None 16 | Until = None 17 | Email = False 18 | Phone = False 19 | Verified = False 20 | Store_csv = False 21 | Store_json = False 22 | Custom = {"tweet": None, "user": None, "username": None} 23 | Show_hashtags = False 24 | Show_cashtags = False 25 | Limit = None 26 | Count = None 27 | 
Stats = False 28 | Database = None 29 | To = None 30 | All = None 31 | Debug = False 32 | Format = None 33 | Essid = "" 34 | Profile = False 35 | Followers = False 36 | Following = False 37 | Favorites = False 38 | TwitterSearch = False 39 | User_full = False 40 | Profile_full = False 41 | Store_object = False 42 | Pandas_type = None 43 | Pandas = False 44 | Index_tweets = "twinttweets" 45 | Index_follow = "twintgraph" 46 | Index_users = "twintuser" 47 | Debug = False 48 | Retries_count = 10 49 | Resume = None 50 | Images = False 51 | Videos = False 52 | Media = False 53 | Replies = False 54 | Pandas_clean = True 55 | Lowercase = True 56 | Pandas_au = True 57 | Proxy_host = None 58 | Proxy_port = 0 59 | Proxy_type = None 60 | Tor_control_port = 9051 61 | Tor_control_password = None 62 | Retweets = False 63 | Query = None 64 | Hide_output = False 65 | Get_replies = False 66 | Near = "" 67 | Custom_query = "" 68 | Popular_tweets = False -------------------------------------------------------------------------------- /elasticsearch/index-tweets.json: -------------------------------------------------------------------------------- 1 | PUT twinttweets 2 | { 3 | "mappings": { 4 | "properties": { 5 | "id": {"type": "long"}, 6 | "conversation_id": {"type": "long"}, 7 | "created_at": {"type": "long"}, 8 | "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"}, 9 | "timezone": {"type": "keyword"}, 10 | "place": {"type": "keyword"}, 11 | "location": {"type": "keyword"}, 12 | "tweet": { 13 | "type": "text", 14 | "fielddata": true, 15 | "search_analyzer": "analyzer_shingle", 16 | "analyzer": "analyzer_shingle", 17 | "fields": { 18 | "raw": { 19 | "type": "keyword" 20 | } 21 | } 22 | }, 23 | "hashtags": {"type": "keyword"}, 24 | "cashtags": {"type": "keyword"}, 25 | "user_id": {"type": "long"}, 26 | "user_id_str": {"type": "keyword"}, 27 | "username": {"type": "keyword"}, 28 | "name": {"type": "text"}, 29 | "profile_image_url": {"type": "text"}, 30 | "day": {"type": 
"integer"}, 31 | "hour": {"type": "integer"}, 32 | "link": {"type": "text"}, 33 | "retweet": {"type": "text"}, 34 | "essid": {"type": "keyword"}, 35 | "nlikes": {"type": "integer"}, 36 | "nreplies": {"type": "integer"}, 37 | "nretweets": {"type": "integer"}, 38 | "quote_url": {"type": "text"}, 39 | "video": {"type":"integer"}, 40 | "search": {"type": "text"}, 41 | "near": {"type": "text"}, 42 | "geo_near": {"type": "geo_point"}, 43 | "geo_tweet": {"type": "geo_point"}, 44 | "photos": {"type": "text"} 45 | } 46 | }, 47 | "settings": { 48 | "number_of_shards": 3, 49 | "number_of_replicas": 2, 50 | "analysis": { 51 | "analyzer": { 52 | "analyzer_shingle": { 53 | "tokenizer": "standard", 54 | "filter": [ 55 | "lowercase", 56 | "filter_shingle" 57 | ] 58 | } 59 | }, 60 | "filter": { 61 | "filter_shingle": { 62 | "type": "shingle", 63 | "max_shingle_size": 5, 64 | "min_shingle_size": 2, 65 | "output_unigrams": "false" 66 | } 67 | } 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from datetime import datetime, timedelta 3 | import time 4 | import copy 5 | import json 6 | # dependencies 7 | from flask import Flask, jsonify, request 8 | from celery import group 9 | # import uuid 10 | # project dependencies 11 | from config import config 12 | from tasks import fetch 13 | from worker import celery 14 | 15 | # Date format from arguments. 
Also required for Twint 16 | dtformat = "%Y-%m-%d" 17 | 18 | # 19 | # Initialize Flask 20 | app = Flask('twint_server') 21 | 22 | # Development on localhost 23 | if config['ALLOW_CORS']: 24 | from flask_cors import CORS 25 | CORS(app) 26 | 27 | class Empty(object): 28 | pass 29 | 30 | # 31 | # REST Endpoint 32 | @app.route("/fetch", methods=['POST']) 33 | def fetch_tweets(): 34 | print("Fetching request") 35 | config = Empty() 36 | config.__dict__ = request.json 37 | # 38 | # Required arguments 39 | since = config.Since 40 | until = config.Until 41 | # args.maximum_instances = 4 # depends on worker concurency parametar 42 | request_days = 1 #request.json['request_days'] 43 | since_iter = datetime.strptime(since, dtformat).date() 44 | until = datetime.strptime(until, dtformat).date() 45 | # 46 | # Prepaire arguments for processes. 47 | arguments = [] 48 | end = since_iter + timedelta(days=request_days) 49 | i = 0 50 | while end < until: 51 | if i > 0: 52 | since_iter = since_iter + timedelta(days=request_days) 53 | end = since_iter + timedelta(days=request_days) 54 | if end > until: 55 | end = until 56 | argument = copy.deepcopy(config) 57 | argument.Since = since_iter.strftime(dtformat) 58 | argument.Until = end.strftime(dtformat) 59 | argument.id = i 60 | arguments.append(argument.__dict__) 61 | i += 1 62 | # print("Number of processes %s" % len(arguments)) 63 | # 64 | # Make processes with arguments 65 | jobs = group(fetch.s(item) for item in arguments) 66 | # Start jobs 67 | jobsResult = jobs.apply_async() 68 | 69 | # Return info 70 | return "Fetching started.Processes count: %s" % len(arguments) 71 | 72 | # # 73 | # # Feature to track state in two way: 74 | # # 1. Return celery processes ids 75 | # # 2. Use aditional task to save celery ids to server in group 76 | # # 77 | # ids = [] 78 | # for i in jobsResult: 79 | # ids.append({"id": i.id, "status": "PENDING"}) 80 | 81 | # # 82 | # # 1. 
Return celery processes ids 83 | # return jsonify(ids) 84 | 85 | # # 86 | # # 2. Use aditional task to save celery ids to server in group 87 | # group_id = uuid.uuid4() 88 | # res = save.s({ 89 | # "name": search, 90 | # "status": "STARTED", 91 | # "progress": 0, 92 | # "id": group_id, 93 | # "ids": ids, 94 | # "since": since, 95 | # "until": until.strftime(dtformat), 96 | # "elasticsearch": elasticsearch, 97 | # "index_tweets": index_tweets, 98 | # "created_at": datetime.now() 99 | # }).apply_async() 100 | # return jsonify(group_id) 101 | 102 | if __name__ == "__main__": 103 | from os import environ 104 | port = int(environ.get("PORT", config['PORT'])) 105 | app.run(host=config['HOST'], port=port, debug=config['FLASK_DEBUG']) --------------------------------------------------------------------------------