├── __init__.py
├── .vscode
    ├── settings.json
    └── launch.json
├── requirements.txt
├── config.py
├── config.ini
├── worker.py
├── tasks.py
├── README.md
├── .gitignore
├── arguments.py
├── elasticsearch
    └── index-tweets.json
└── app.py


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "python.pythonPath": "/usr/bin/python3"
3 | }


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask
2 | flask-cors
3 | Celery
4 | twint
5 | numpy
6 | aiohttp_socks


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
"""Load ``config.ini`` and expose the active environment section as ``config``."""
import configparser

_parser = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open, so a missing config.ini
# surfaces below as a KeyError on the section lookup.
_parser.read('config.ini')

# Option values are plain strings: testing the raw value is true for any
# non-empty text, including "No". getboolean() interprets yes/no, on/off,
# true/false and 1/0. Without a DEV flag the production section is used.
_use_dev = _parser.getboolean('DEFAULT', 'DEV', fallback=False)

# Section proxy of the selected environment. Besides item access it offers
# getboolean()/getint() for typed reads of individual options.
config = _parser['DEV'] if _use_dev else _parser['PROD']


--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
 1 | ; config.ini
 2 | [DEFAULT]
 3 | DEV = Yes
 4 | 
 5 | [DEV]
 6 | HOST = localhost
 7 | PORT = 5000
 8 | CELERY_BROKER_URL = pyamqp://guest@localhost//
 9 | ALLOW_CORS = Yes
10 | FLASK_DEBUG = Yes
11 | 
12 | [PROD]
13 | HOST = localhost
14 | PORT = 5001
15 | CELERY_BROKER_URL = pyamqp://guest@localhost//
16 | ALLOW_CORS = No
17 | FLASK_DEBUG = No


--------------------------------------------------------------------------------
/worker.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from celery import Celery
 3 | from config import config
 4 | 
 5 | CELERY_BROKER_URL = config['CELERY_BROKER_URL']
 6 | 
 7 | celery = Celery('flask_server', backend='amqp')
 8 | celery.conf.update({
 9 |     'broker_url': config['CELERY_BROKER_URL'],
10 |     'backend': 'amqp', 
11 |     'imports': (
12 |         'tasks',
13 |     ),
14 |     'task_routes': {
15 |         'fetch': {'queue': 'fetching'}
16 |         # 'save': {'queue': 'saving'}
17 |     },
18 |     'task_serializer': 'json',
19 |     'result_serializer': 'json',
20 |     'accept_content': ['json']})
21 | 


--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | from worker import celery
 3 | from config import config
 4 | import twint
 5 | from arguments import TwintArguments
 6 | 
 7 | @celery.task(name='fetch')
 8 | def fetch(args):
 9 |     # Merge user arguments with default Twint config
10 |     config = TwintArguments()
11 |     config.__dict__.update(args)
12 |     print("Start fetch task %s %s -> %s" % 
13 |         (config.id, config.Since, config.Until))
14 |     # Run
15 |     twint.run.Search(config)
16 |     # Finished
17 |     return "Fetch task finished %s %s -> %s" % \
18 |         (config.id, config.Since, config.Until)
19 | 
20 | # # Save/report progress if you want, for example to Firebase
21 | # @celery.task(name='save')
22 | # def save(args):
23 | #     db = firebase.database()
24 | #     # Make process entry on firebase
25 | #     db.child("Processes").child(args['id']).set(args)
26 | #     return "Progress saved"


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     // Use IntelliSense to learn about possible attributes.
 3 |     // Hover to view descriptions of existing attributes.
 4 |     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 5 |     "version": "0.2.0",
 6 |     "configurations": [
 7 |         {
 8 |             "name": "Python: Current File (Integrated Terminal)",
 9 |             "type": "python",
10 |             "request": "launch",
11 |             "program": "${file}",
12 |             "console": "integratedTerminal",
13 |             "args": ["-m"]
14 |         },
15 |         {
16 |             "name": "Python: Flask",
17 |             "type": "python",
18 |             "request": "launch",
19 |             "module": "flask",
20 |             "env": {
21 |                 "FLASK_APP": "app.py"
22 |             },
23 |             "args": [
24 |                 "run"
25 |                
26 |             ],
27 |             "jinja": true
28 |         },
29 |     ]
30 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### [TWINT](https://github.com/twintproject/twint) Flask-Celery Server
 2 | Optimized tweets scraping
 3 | 
 4 | #### See also [Twint Kibana](https://github.com/Nedja995/twint_kibana)
 5 | 
 6 | #### Requirements
 7 | - Python3, [Twint](https://github.com/twintproject/twint), Flask, Celery
 8 | - Elasticsearch(v7)
 9 | - RabbitMQ
10 | - (optional) Flower 
11 | 
12 | #### Run server
13 | 
14 | 1. Run Celery workers: 
15 | - `$ celery worker --app=worker.celery --hostname=worker.fetching@%h --queues=fetching --loglevel=info`
16 | - (Optional) task for reporting progress if it is implemented `$ celery worker --app=worker.celery --hostname=worker.saving@%h --queues=saving --loglevel=info`
17 | 
18 | 2. Run Flask server: `$ python3 app.py`
19 | 
20 | - (Optional) Monitor Celery with Flower: `$ celery -A app.celery flower --broker='pyamqp://guest@localhost//'`
21 | 
22 | #### Use
23 | 
24 | 1. Create ES index with [index-tweets.json](elasticsearch/index-tweets.json)
25 | 
26 | 2. Start tweets fetching
27 | - arguments are mapped to [twint config](https://github.com/twintproject/twint/blob/master/twint/config.py)
28 | - I mainly use it with elasticsearch so I did not test with other arguments
29 | - `Since`, `Until` and either `Search` or `Username` are required
30 | ```
31 |   POST  http://localhost:5000/fetch
32 |   {
33 |     "Since": "2019-2-1",
34 |     "Until": "2019-3-1",
35 |     "Search": "<keyword>",
36 |     // or
37 |     "Username": "<username>",
38 |     "Elasticsearch": "localhost:9200",
39 |     "Index_tweets": "<es index name>"
40 |   }
41 | ```
42 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/arguments.py:
--------------------------------------------------------------------------------
 1 | # https://github.com/twintproject/twint/blob/master/twint/config.py
 2 | # https://github.com/twintproject/twint/blob/master/twint/cli.py
 3 | class TwintArguments(object):
 4 |     Username = None
 5 |     User_id = None
 6 |     Search = None
 7 |     Geo = ""
 8 |     Location = False
 9 |     Near = None
10 |     Lang = None
11 |     Output = None
12 |     Elasticsearch = None
13 |     Timedelta = None
14 |     Year = None
15 |     Since = None
16 |     Until = None
17 |     Email = False
18 |     Phone = False
19 |     Verified = False
20 |     Store_csv = False
21 |     Store_json = False
22 |     Custom = {"tweet": None, "user": None, "username": None}
23 |     Show_hashtags = False
24 |     Show_cashtags = False
25 |     Limit = None
26 |     Count = None
27 |     Stats = False
28 |     Database = None
29 |     To = None
30 |     All = None
31 |     Debug = False
32 |     Format = None
33 |     Essid = ""
34 |     Profile = False
35 |     Followers = False
36 |     Following = False
37 |     Favorites = False
38 |     TwitterSearch = False
39 |     User_full = False
40 |     Profile_full = False
41 |     Store_object = False
42 |     Pandas_type = None
43 |     Pandas = False
44 |     Index_tweets = "twinttweets"
45 |     Index_follow = "twintgraph"
46 |     Index_users = "twintuser"
47 |     Debug = False
48 |     Retries_count = 10
49 |     Resume = None
50 |     Images = False
51 |     Videos = False
52 |     Media = False
53 |     Replies = False
54 |     Pandas_clean = True
55 |     Lowercase = True
56 |     Pandas_au = True
57 |     Proxy_host = None
58 |     Proxy_port = 0
59 |     Proxy_type = None
60 |     Tor_control_port = 9051
61 |     Tor_control_password = None
62 |     Retweets = False
63 |     Query = None
64 |     Hide_output = False
65 |     Get_replies = False
66 |     Near = ""
67 |     Custom_query = ""
68 |     Popular_tweets = False


--------------------------------------------------------------------------------
/elasticsearch/index-tweets.json:
--------------------------------------------------------------------------------
 1 | PUT twinttweets
 2 | {
 3 | 	"mappings": {
 4 | 		"properties": {
 5 | 			"id": {"type": "long"},
 6 | 			"conversation_id": {"type": "long"},
 7 | 			"created_at": {"type": "long"},
 8 | 			"date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
 9 | 			"timezone": {"type": "keyword"},
10 | 			"place": {"type": "keyword"},
11 | 			"location": {"type": "keyword"},
12 | 			 "tweet": {
13 |         "type": "text",
14 |         "fielddata": true,
15 |         "search_analyzer": "analyzer_shingle",
16 |         "analyzer": "analyzer_shingle",
17 |         "fields": {
18 |           "raw": {
19 |             "type": "keyword"
20 |           }
21 |         }
22 |       },
23 | 			"hashtags": {"type": "keyword"},
24 | 			"cashtags": {"type": "keyword"},
25 | 			"user_id": {"type": "long"},
26 | 			"user_id_str": {"type": "keyword"},
27 | 			"username": {"type": "keyword"},
28 | 			"name": {"type": "text"},
29 | 			"profile_image_url": {"type": "text"},
30 | 			"day": {"type": "integer"},
31 | 			"hour": {"type": "integer"},
32 | 			"link": {"type": "text"},
33 | 			"retweet": {"type": "text"},
34 | 			"essid": {"type": "keyword"},
35 | 			"nlikes": {"type": "integer"},
36 | 			"nreplies": {"type": "integer"},
37 | 			"nretweets": {"type": "integer"},
38 | 			"quote_url": {"type": "text"},
39 | 			"video": {"type":"integer"},
40 | 			"search": {"type": "text"},
41 | 			"near": {"type": "text"},
42 | 			"geo_near": {"type": "geo_point"},
43 | 			"geo_tweet": {"type": "geo_point"},
44 | 			"photos": {"type": "text"}
45 | 		}
46 | 	},
47 | 	"settings": {
48 | 	    "number_of_shards": 3,
49 |       "number_of_replicas": 2,
50 |       "analysis": {
51 |         "analyzer": {
52 |           "analyzer_shingle": {
53 |             "tokenizer": "standard",
54 |             "filter": [
55 |               "lowercase",
56 |               "filter_shingle"
57 |             ]
58 |           }
59 |         },
60 |         "filter": {
61 |           "filter_shingle": {
62 |             "type": "shingle",
63 |             "max_shingle_size": 5,
64 |             "min_shingle_size": 2,
65 |             "output_unigrams": "false"
66 |           }
67 |         }
68 |       }
69 | 	}
70 | }


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from datetime import datetime, timedelta
  3 | import time
  4 | import copy
  5 | import json
  6 | # dependencies
  7 | from flask import Flask, jsonify, request
  8 | from celery import group
  9 | # import uuid
 10 | # project dependencies
 11 | from config import config
 12 | from tasks import fetch
 13 | from worker import celery
 14 | 
 15 | # Date format from arguments. Also required for Twint
 16 | dtformat = "%Y-%m-%d"
 17 | 
 18 | #
 19 | # Initialize Flask
 20 | app = Flask('twint_server')
 21 | 
 22 | # Development on localhost
 23 | if config['ALLOW_CORS']:
 24 |     from flask_cors import CORS
 25 |     CORS(app)
 26 | 
 27 | class Empty(object):
 28 |     pass
 29 | 
 30 | #
 31 | # REST Endpoint
 32 | @app.route("/fetch", methods=['POST'])
 33 | def fetch_tweets():
 34 |     print("Fetching request")
 35 |     config = Empty()
 36 |     config.__dict__ = request.json
 37 |     #
 38 |     # Required arguments
 39 |     since = config.Since
 40 |     until = config.Until
 41 |     # args.maximum_instances = 4 # depends on worker concurency parametar
 42 |     request_days = 1 #request.json['request_days']
 43 |     since_iter = datetime.strptime(since, dtformat).date()
 44 |     until = datetime.strptime(until, dtformat).date()
 45 |     #
 46 |     # Prepaire arguments for processes.
 47 |     arguments = []
 48 |     end = since_iter + timedelta(days=request_days)
 49 |     i = 0
 50 |     while end < until:
 51 |         if i > 0:
 52 |             since_iter = since_iter + timedelta(days=request_days)
 53 |             end = since_iter + timedelta(days=request_days)
 54 |         if end > until:
 55 |             end = until
 56 |         argument = copy.deepcopy(config)
 57 |         argument.Since = since_iter.strftime(dtformat)
 58 |         argument.Until = end.strftime(dtformat)
 59 |         argument.id = i
 60 |         arguments.append(argument.__dict__)
 61 |         i += 1
 62 |     # print("Number of processes %s" % len(arguments))
 63 |     #
 64 |     # Make processes with arguments
 65 |     jobs = group(fetch.s(item) for item in arguments)
 66 |     # Start jobs
 67 |     jobsResult = jobs.apply_async()
 68 | 
 69 |     # Return info
 70 |     return "Fetching started.Processes count: %s" % len(arguments)
 71 | 
 72 |     # #
 73 |     # # Feature to track state in two way:
 74 |     # # 1. Return celery processes ids 
 75 |     # # 2. Use aditional task to save celery ids to server in group
 76 |     # #
 77 |     # ids = []
 78 |     # for i in jobsResult:
 79 |     #     ids.append({"id": i.id, "status": "PENDING"})
 80 |     
 81 |     # #
 82 |     # # 1. Return celery processes ids 
 83 |     # return jsonify(ids)
 84 | 
 85 |     # #
 86 |     # # 2. Use aditional task to save celery ids to server in group
 87 |     # group_id = uuid.uuid4()
 88 |     # res = save.s({
 89 |     #     "name": search,
 90 |     #     "status": "STARTED",
 91 |     #     "progress": 0,
 92 |     #     "id": group_id,
 93 |     #     "ids": ids,
 94 |     #     "since": since,
 95 |     #     "until": until.strftime(dtformat),
 96 |     #     "elasticsearch": elasticsearch,
 97 |     #     "index_tweets": index_tweets,
 98 |     #     "created_at": datetime.now()
 99 |     # }).apply_async()
100 |     # return jsonify(group_id)
101 | 
if __name__ == "__main__":
    from os import environ
    # A PORT environment variable, when set, overrides the configured port.
    port = int(environ.get("PORT", config['PORT']))
    # FLASK_DEBUG is stored as text ("Yes"/"No"). Passing the raw string
    # would switch debug mode on for any non-empty value, including "No".
    debug = config.getboolean('FLASK_DEBUG', fallback=False)
    app.run(host=config['HOST'], port=port, debug=debug)


--------------------------------------------------------------------------------