├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── assets ├── architecture.drawio └── architecture.png ├── ci └── Jenkinsfile ├── client ├── data_client.py └── task_client.py ├── docker-compose.yml ├── rabbit.conf ├── requirements.txt └── src ├── command_server.py ├── command_service.py ├── configuration ├── command_server_config.py ├── mysql_config.py ├── proxy_config.py ├── rabbit_config.py ├── upload_file_config.py ├── webhook_config.py └── worker_config.py ├── dao ├── search_by_task_dao.py ├── session_dao.py ├── user_details_task_dao.py ├── user_favorites_task_dao.py ├── user_followers_task_dao.py ├── user_followings_task_dao.py └── user_tweets_task_dao.py ├── data_server.py ├── model ├── hashtag_scrap_params.py ├── scrap_type.py ├── time_interval.py └── user_scrap_params.py ├── scrap_service.py ├── scrap_worker.py ├── upload_result_file_service.py └── utils ├── command_utils.py ├── commands_mysql_utils.py ├── directory_utils.py ├── docker_logs.py ├── init_database.sql ├── interval_utils.py ├── params_encoder.py ├── rabbit_send_utils.py ├── sqlite_util.py ├── time_utils.py └── tor_utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | # static files generated from Django application using `collectstatic` 145 | media 146 | static 147 | 148 | data 149 | db_data -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | # static files generated from Django application using `collectstatic` 145 | media 146 | static 147 | 148 | data 149 | db_data 150 | 151 | src/test_run.py -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1 3 | 4 | RUN apt-get update && apt-get install tor -y 5 | 6 | COPY requirements.txt ./ 7 | RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt 8 | RUN pip install --user --upgrade git+https://github.com/himanshudabas/twint.git@origin/twint-fixes#egg=twint 9 | 10 | COPY ./src /app 11 | 12 | WORKDIR /app -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Twint-Distributed 2 | ## No long supported 3 | I have many problems with twint. 4 | I decided to stop developing the library. 5 | If you liked my solution, maybe you will be interested in my library - https://github.com/markowanga/stweet. 6 | 7 | ## Description 8 | Sometimes there is a need to scrap many enormous tweet data in short time. 9 | This project help to do this task. Solution is based on Twint — popular tool 10 | to scrap twitter data. 11 | 12 | ![Image of architecture](assets/architecture.png) 13 | 14 | ## Main concepts 15 | - Prepare architecture of microservices, which is scalable and can be 16 | distributed for many machines 17 | - Divide single scrap tasks for small task 18 | - Support that wne worker have error and the elementary task can be repeated 19 | on other instance 20 | - Workaround twitter limit, which disallow to download many data from one ip address 21 | - All data are gathered into one location 22 | - Use docker whenever possible 23 | 24 | ## How it works 25 | 1. User add commands to scrap by HTTP request 26 | 2. As a request result, server add commands to RabbitMQ for scrap data, 27 | the time bounds can be divided for small intervals 28 | 3. Workers get the messages from RabbitMQ to scrap data — they do this job 29 | 4. When elementary task has been finished the data is upload to server 30 | 5. Server save all received data to central storage 31 | -------------------------------------------------------------------------------- /assets/architecture.drawio: -------------------------------------------------------------------------------- 1 | 5Vpdd6M2EP01fkwPCIPx48ZJtw/ZtrvZnk0eBRKgWCAq5Njur6+ExDexna4dtlnnnASNRh/M1b0zEM+cVbr7yGGefGII0xmw0G7m3MwAsG3fl3+UZa8tCwtoQ8wJMk6N4Z78g43RMtYNQbjoOArGqCB51xiyLMOh6Ngg52zbdYsY7a6awxgPDPchpEPrN4JEUt2Xt2w6fsMkTszSPljojhRWzuZOigQitm2ZnNuZs+KMCX2V7laYquBVcdHjfn2ht94Yx5k4ZUCUxSm6o3v/s/cUp+FfD9dfoqt6c2Jf3TFGMgCmybhIWMwySG8b6zVnmwxhNa0lW43PHWO5NNrS+ISF2Bs04UYwaUpESk2v3DHfP7Qbj2qyX9yqebMzk+vW3rT0XtUGX4yBMRVsw0N84MarswR5jMUhv2UNlTzjmKVYbkgO5JhCQZ67G4HmsMW1X4OHvDCQvAYeMAk8OyIeKgDktQZn4ZpmA45qTIgN+E5oyqEfOIf7lkPOSCaK1sx/KoN0MJLmVMpkBA24bpd2x/yX1kF/4B30lxd6x82hqm/9v5+z+TTnrJIB64eXgbk9pQyYeZ8h3ZiVVixNYYbULWL+TOQ99vFr0FGh3iZE4PsclsHYylTdRSIilK4YZbwc60Su+pH2QnC2xq0er/yoESwTLbv+1Ng8Yy7w7jA6w2CaAXOrx4CKQdtWAq4SV9LKvRVzzh5/d5osaWTY7sjwm6kwOJEa7iQq3FNJZ35YVY/4X0ZVwYC2X2AQEPHp83nZ6oc4DMfYGvju3LXOw0rQY6UzPSvtQRjfsjiqmfjYJuIoK5tE1+S2x3YSvHyic0+td6fMc+6AMPehPP055uqcMb5WF2dkDnKxj+ZjzPFB4JR57gzMcZY95lhTM8deDgL9FRbrGfCoXPo6kFH2YnWFoIABLIbVhXyYzdVluKdEhp8fD32gcboLagMM13GJ3h8bIWfBL6hbhL1xdUOLZWCdS92cXs0xgpE3ApF/KYicn7UiB/6JSgWmfTL3BxS6kWS5UEGOIPajURJ4oY+D6Dwk8H64FA+GQrVSJw/SMraMq7d3U2pTBF7QJi/w3DPlj8UJ+eNNtWk+n1abukWUdUSbJn6Mcpb/h8rLGRKtqgaKHGYdsL2/N+qddfkK4CqCKaEybB9mShg8mOZl+BxHlVUJps9YkBAOepR3+bucpCgxV1PYIN/1+vS6qjNjPC2Z3+remgM/0k/lccL8Su4/JFk86iJJKa4gJXGmu0MJb1lstrqVhmRmBau1u7JTSlFWRHLSanopFZWDLFhRd/XBcIRDqWGCsGxkfEQZFCN2RIqcQhNzkpXyVINSVW6/Y6zeDkkSSehUES3/ft0SoW8PlalK4yvPi4ZYDxxSm1KSF4rClbAitgkovq3tr8tl2JZl92JMNJfewoFnEk3QE03bOTGXLS6mmsO3eN9wkDC2VqtFJTCGJVlQaLL06/EnFqiDQTJSJO8EqH7RASx3aqDe52No/7Xq9NWd+7M+48y9E2sC/ZTx1i9X3Ve+XD3if6F/WXnvk6W9fxc64HIslc3m6wcal+ZLHM7tvw== -------------------------------------------------------------------------------- /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/markowanga/twint-distributed/edec4fb4f054102b43642b5a1e863f5144e39d6f/assets/architecture.png -------------------------------------------------------------------------------- /ci/Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | agent none 3 | environment { 4 | registry = "marcinwatroba/twint-distributed" 5 | registryCredential = 'marcinwatroba_dockerhub' 6 | } 7 | stages { 8 | stage('checkout sources') { 9 | agent any 10 | steps { 11 | git branch: 'master', credentialsId: 'markowanga_github', 12 | url: 'https://github.com/markowanga/twint-distributed.git' 13 | } 14 | } 15 | stage('build & push docker') { 16 | agent { 17 | docker { 18 | image 'docker' 19 | args '-u root -v /var/run/docker.sock:/var/run/docker.sock' 20 | } 21 | } 22 | steps { 23 | script { 24 | dockerImageWithNumber = docker.build(registry + ":$BUILD_NUMBER", './') 25 | dockerImageLatest = docker.build(registry + ":latest", './') 26 | docker.withRegistry( '', registryCredential ) { 27 | dockerImageWithNumber.push() 28 | dockerImageLatest.push() 29 | } 30 | } 31 | } 32 | } 33 | } 34 | post { 35 | always { 36 | node('master') { 37 | sh 'rm -rf *' 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /client/data_client.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import requests 4 | 5 | 6 | class TwintDistributedDataClient: 7 | 8 | def __init__(self, data_server_host): 9 | self.data_server_host = data_server_host 10 | return 11 | 12 | def get_searched_tweets(self, to_search: str) -> pd.DataFrame: 13 | return self.__call_get_request('/get_searched_tweets/' + to_search) 14 | 15 | def get_user_tweets(self, username: str) -> pd.DataFrame: 16 | return self.__call_get_request('/get_user_tweets/' + username) 17 | 18 | def get_user_details(self, username: str) -> pd.DataFrame: 19 | return self.__call_get_request('/get_user_details/' + username) 20 | 21 | def get_user_followers(self, username: str) -> pd.DataFrame: 22 | return self.__call_get_request('/get_user_followers/' + username) 23 | 24 | def get_user_followings(self, username: str) -> pd.DataFrame: 25 | return self.__call_get_request('/get_user_followings/' + username) 26 | 27 | def get_user_favorites(self, username: str) -> pd.DataFrame: 28 | return self.__call_get_request('/get_user_favorites/' + username) 29 | 30 | def __call_get_request(self, path: str) -> pd.DataFrame: 31 | response = requests.get(self.data_server_host + requests.utils.quote(path)) 32 | return pd.read_json(response.content) 33 | 34 | 35 | def main(): 36 | client = TwintDistributedDataClient('http://twitterdata.theliver.pl') 37 | users = ['AndrzejDuda', 'M_K_Blonska', 'pawel_tanajno', 'jakubiak_marek', 'mir_piotrowski', 'krzysztofbosak', 38 | 'szymon_holownia', 'KosiniakKamysz', 'Grzywa_Slawomir', 'RobertBiedron', 'trzaskowski_'] 39 | for user in users: 40 | responses = [ 41 | client.get_user_tweets(user).sort_values(by='created_at')[['created_at']], 42 | # client.get_user_details(user), 43 | # client.get_user_followers(user), 44 | # client.get_user_followings(user), 45 | # client.get_user_favorites(user) 46 | ] 47 | print(user) 48 | print([it for it in responses]) 49 | return 50 | 51 | 52 | main() 53 | -------------------------------------------------------------------------------- /client/task_client.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from enum import Enum 3 | from typing import Optional, Dict 4 | 5 | import requests 6 | 7 | 8 | class ScrapInterval(Enum): 9 | HOUR = 1 10 | DAY = 2 11 | MONTH = 3 12 | QUARTER_OF_YEAR = 4 13 | YEAR = 5 14 | 15 | def get_parameter_name(self) -> str: 16 | return { 17 | ScrapInterval.HOUR: 'hour', 18 | ScrapInterval.DAY: 'day', 19 | ScrapInterval.MONTH: 'month', 20 | ScrapInterval.QUARTER_OF_YEAR: 'quarter_of_year', 21 | ScrapInterval.YEAR: 'year' 22 | }[self] 23 | 24 | 25 | class TwintDistributedTaskClient: 26 | """ 27 | Client to add scrap tasks. 28 | 29 | interval_type -- this is interval time of subtask, big scrap task can be divided for smaller, 30 | it's better for scaling 31 | 32 | queue_name -- it's name of RabbitMQ queue, tasks can be adding to different queues 33 | 34 | scrap_series -- it can be group of tasks, when all tasks in group will be finished, the webhook with finish 35 | information will be send to parametrized host 36 | """ 37 | 38 | def __init__(self, command_server_host): 39 | self.command_server_host = command_server_host 40 | return 41 | 42 | def add_user_tweets_to_scrap(self, username: str, interval_type: ScrapInterval, queue_name: str, scrap_series: str, 43 | since: Optional[datetime], until: Optional[datetime]): 44 | post_data = { 45 | 'username': username, 46 | 'interval_type': interval_type.get_parameter_name(), 47 | 'queue_name': queue_name, 48 | 'scrap_series': scrap_series, 49 | 'since': since.isoformat() if since is not None else None, 50 | 'until': until.isoformat() if until is not None else None 51 | } 52 | self.__call_post_request('/add_user_tweets_to_scrap', post_data) 53 | return 54 | 55 | def add_user_details_to_scrap(self, username: str, queue_name: str, scrap_series: str): 56 | post_data = { 57 | 'username': username, 58 | 'queue_name': queue_name, 59 | 'scrap_series': scrap_series 60 | } 61 | self.__call_post_request('/add_user_details_to_scrap', post_data) 62 | return 63 | 64 | def add_user_followings_to_scrap(self, username: str, queue_name: str, scrap_series: str): 65 | post_data = { 66 | 'username': username, 67 | 'queue_name': queue_name, 68 | 'scrap_series': scrap_series 69 | } 70 | self.__call_post_request('/add_user_followings_to_scrap', post_data) 71 | return 72 | 73 | def add_user_followers_to_scrap(self, username: str, queue_name: str, scrap_series: str): 74 | post_data = { 75 | 'username': username, 76 | 'queue_name': queue_name, 77 | 'scrap_series': scrap_series 78 | } 79 | self.__call_post_request('/add_user_followers_to_scrap', post_data) 80 | return 81 | 82 | def add_user_favorites_to_scrap(self, username: str, queue_name: str, scrap_series: str): 83 | post_data = { 84 | 'username': username, 85 | 'queue_name': queue_name, 86 | 'scrap_series': scrap_series 87 | } 88 | self.__call_post_request('/add_user_favorites_to_scrap', post_data) 89 | return 90 | 91 | def add_search_to_scrap(self, to_search: str, interval_type: ScrapInterval, queue_name: str, scrap_series: str, 92 | since: Optional[datetime], until: Optional[datetime], language: Optional[str]): 93 | post_data = { 94 | 'to_search': to_search, 95 | 'interval_type': interval_type.get_parameter_name(), 96 | 'queue_name': queue_name, 97 | 'scrap_series': scrap_series, 98 | 'since': since.isoformat() if since is not None else None, 99 | 'until': until.isoformat() if since is not None else None, 100 | 'language': language 101 | } 102 | self.__call_post_request('/add_search_to_scrap', post_data) 103 | return 104 | 105 | def __call_post_request(self, path: str, post_data: Dict[str, any]): 106 | url = self.command_server_host + path 107 | response = requests.post(url, data=post_data) 108 | if response.status_code >= 400: 109 | print("ERR path code:", response.status_code) 110 | return 111 | 112 | 113 | # def main(): 114 | # result = requests.get('http://api.pandemocje.pl/api/hashtag_distribution') 115 | # hashtag_count_dict = result.json()['plot_raw'] 116 | # hashtags = [it for it in hashtag_count_dict.keys() if hashtag_count_dict[it] > 100] 117 | # client = TwintDistributedTaskClient('http://192.168.0.124:5000') 118 | # for hashtag in hashtags: 119 | # print(hashtag) 120 | # client.add_search_to_scrap(hashtag, ScrapInterval.MONTH, 'bot_detection', 'hashtag_analyse', since=None, 121 | # until=None, language='pl') 122 | # return 123 | # 124 | # 125 | # def for_kajdanowicz(): 126 | # users = ['AndrzejDuda', 'M_K_Blonska', 'pawel_tanajno', 'jakubiak_marek', 'mir_piotrowski', 'krzysztofbosak', 127 | # 'szymon_holownia', 'KosiniakKamysz', 'Grzywa_Slawomir', 'RobertBiedron', 'trzaskowski_'] 128 | # client = TwintDistributedTaskClient('http://192.168.0.124:5000') 129 | # 130 | # scrap_since = datetime.now() - timedelta(days=60) 131 | # print(scrap_since.isoformat()) 132 | # 133 | # for user in users: 134 | # print(user) 135 | # client.add_user_tweets_to_scrap(user, ScrapInterval.MONTH, 'bot_detection', 'kajdanowicz', 136 | # since=None, until=None) 137 | # 138 | # 139 | # # main() 140 | # for_kajdanowicz() 141 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | # database for commands 4 | twint_distributed_mysql_db: 5 | image: mysql:8.0.19 6 | command: --default-authentication-plugin=mysql_native_password 7 | container_name: twint_distributed_mysql_db 8 | restart: unless-stopped 9 | environment: 10 | MYSQL_ROOT_PASSWORD: test1234 11 | volumes: 12 | - ./db_data:/var/lib/mysql 13 | - /etc/localtime:/etc/localtime:ro 14 | - /etc/timezone:/etc/timezone:ro 15 | # ports: 16 | # - 3306:3306 17 | 18 | # queue to serve tasks for scrap 19 | twint_distributed_rabbitmq: 20 | image: rabbitmq:3.8.3-management 21 | volumes: 22 | - ./rabbit_data:/var/lib/rabbitmq 23 | - /etc/localtime:/etc/localtime:ro 24 | - /etc/timezone:/etc/timezone:ro 25 | ports: 26 | - 15672:15672 # management plugin 27 | # - 5672:5672 # RabbitMQ 28 | 29 | # command server to preview added and finished tasks 30 | twint_distributed_command_server: 31 | image: marcinwatroba/twint-distributed:latest 32 | restart: unless-stopped 33 | command: [python, -u, command_server.py] 34 | ports: 35 | - 5000:5000 36 | environment: 37 | - RABBIT_HOST=twint_distributed_rabbitmq 38 | - RABBIT_USERNAME=guest 39 | - RABBIT_PASSWORD=guest 40 | - MYSQL_HOST=twint_distributed_mysql_db 41 | - MYSQL_PORT=3306 42 | - MYSQL_USER=root 43 | - MYSQL_PASSWORD=test1234 44 | - WEBHOOK_HOST=no_host # fill variable when you want get webhook after session finished 45 | volumes: 46 | - /etc/localtime:/etc/localtime:ro 47 | - /etc/timezone:/etc/timezone:ro 48 | 49 | # consumer -- this service can be scaled 50 | twint_distributed_scrapper_consumer: 51 | image: marcinwatroba/twint-distributed:latest 52 | restart: unless-stopped 53 | command: [python, -u, scrap_worker.py] 54 | environment: 55 | - RABBIT_HOST=twint_distributed_rabbitmq 56 | - RABBIT_USERNAME=guest 57 | - RABBIT_PASSWORD=guest 58 | - UPLOAD_FILE_HOST=[upload_file_host] 59 | - QUEUE_NAME=bot_detection 60 | - COMMAND_SERVER_HOST=twint_distributed_command_server 61 | volumes: 62 | - /etc/localtime:/etc/localtime:ro 63 | - /etc/timezone:/etc/timezone:ro 64 | 65 | # service to save data 66 | twint_distributed_data_server: 67 | image: marcinwatroba/twint-distributed:latest 68 | restart: unless-stopped 69 | command: [python, -u, data_server.py] 70 | volumes: 71 | - ./data:/data 72 | - /etc/localtime:/etc/localtime:ro 73 | - /etc/timezone:/etc/timezone:ro 74 | ports: 75 | - 5001:5000 76 | -------------------------------------------------------------------------------- /rabbit.conf: -------------------------------------------------------------------------------- 1 | loopback_users.guest = false 2 | listeners.tcp.default = 5672 3 | log.console.level = warning 4 | management.tcp.inactivity_timeout = 120000 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | requests 3 | pysocks 4 | pandas 5 | numpy 6 | pika 7 | python-dateutil 8 | pymongo 9 | dacite 10 | aiohttp_socks 11 | mysql-connector-python 12 | pymysql 13 | # -e git://github.com/twintproject/twint.git#egg=twint -------------------------------------------------------------------------------- /src/command_server.py: -------------------------------------------------------------------------------- 1 | import time 2 | from uuid import uuid4 3 | 4 | from dateutil.parser import parse as date_parser 5 | from flask import Flask 6 | from flask import request, jsonify 7 | 8 | import command_service 9 | import utils.docker_logs as docker_logs 10 | import utils.interval_utils as interval_utils 11 | from model.scrap_type import ScrapType 12 | from utils import commands_mysql_utils 13 | 14 | logger = docker_logs.get_logger('command_server') 15 | app = Flask(__name__) 16 | 17 | 18 | def get_new_index() -> str: 19 | return str(uuid4()) 20 | 21 | 22 | def get_success_response(): 23 | return jsonify({'status': 'SUCCESS'}) 24 | 25 | 26 | @app.route("/add_user_tweets_to_scrap", methods=['POST']) 27 | def add_user_tweets_to_scrap(): 28 | command_service.add_user_tweets_to_scrap( 29 | username=request.form['username'], 30 | since=date_parser(request.form['since']) if 'since' in request.form else None, 31 | until=date_parser(request.form['until']) if 'since' in request.form else None, 32 | queue_name=request.form['queue_name'], 33 | scrap_series=request.form['scrap_series'], 34 | interval_type=interval_utils.TimeIntervalType.get_from_string(request.form['interval_type']) 35 | ) 36 | return get_success_response() 37 | 38 | 39 | @app.route("/add_user_details_to_scrap", methods=['POST']) 40 | def add_user_details_to_scrap(): 41 | command_service.add_user_details_to_scrap( 42 | username=request.form['username'], 43 | queue_name=request.form['queue_name'], 44 | scrap_series=request.form['scrap_series'] 45 | ) 46 | return get_success_response() 47 | 48 | 49 | @app.route("/add_user_followings_to_scrap", methods=['POST']) 50 | def add_user_followings_to_scrap(): 51 | command_service.add_user_followings_to_scrap( 52 | username=request.form['username'], 53 | queue_name=request.form['queue_name'], 54 | scrap_series=request.form['scrap_series'] 55 | ) 56 | return get_success_response() 57 | 58 | 59 | @app.route("/add_user_followers_to_scrap", methods=['POST']) 60 | def add_user_followers_to_scrap(): 61 | command_service.add_user_followers_to_scrap( 62 | username=request.form['username'], 63 | queue_name=request.form['queue_name'], 64 | scrap_series=request.form['scrap_series'] 65 | ) 66 | return get_success_response() 67 | 68 | 69 | @app.route("/add_user_favorites_to_scrap", methods=['POST']) 70 | def add_user_favorites_to_scrap(): 71 | command_service.add_user_favorites_to_scrap( 72 | username=request.form['username'], 73 | queue_name=request.form['queue_name'], 74 | scrap_series=request.form['scrap_series'] 75 | ) 76 | return get_success_response() 77 | 78 | 79 | @app.route("/add_search_to_scrap", methods=['POST']) 80 | def add_search_to_scrap(): 81 | command_service.add_search_to_scrap( 82 | phrase=request.form['to_search'], 83 | since=date_parser(request.form['since']) if 'since' in request.form else None, 84 | until=date_parser(request.form['until']) if 'until' in request.form else None, 85 | language=request.form['language'] if 'language' in request.form else None, 86 | queue_name=request.form['queue_name'], 87 | scrap_series=request.form['scrap_series'], 88 | interval_type=interval_utils.TimeIntervalType.get_from_string(request.form['interval_type']) 89 | ) 90 | return get_success_response() 91 | 92 | 93 | @app.route("/set_task_as_finished", methods=['POST']) 94 | def set_task_as_finished(): 95 | logger.info(request.form) 96 | command_service.set_task_as_finished(request.form['task_id'], ScrapType[request.form['task_type']]) 97 | return get_success_response() 98 | 99 | 100 | @app.route("/set_sub_task_as_finished", methods=['POST']) 101 | def set_sub_task_as_finished(): 102 | logger.info(request.form) 103 | command_service.set_sub_task_as_finished(request.form['sub_task_id'], ScrapType[request.form['task_type']]) 104 | return get_success_response() 105 | 106 | 107 | @app.route("/get_all_tasks", methods=['GET']) 108 | def get_all_scrapped_users(): 109 | return jsonify(command_service.get_all_scrapped_tasks()) 110 | 111 | 112 | def wait_for_mysql(): 113 | try_count = 100 114 | success = False 115 | while try_count > 0 and not success: 116 | try: 117 | commands_mysql_utils.get_db_connection_base().close() 118 | success = True 119 | except Exception: 120 | try_count = try_count - 1 121 | logger.info("error during connect to mysql") 122 | logger.info("wait 3 seconds for next try") 123 | time.sleep(3) 124 | if success: 125 | return 126 | else: 127 | raise Exception("can't connect with mysql") 128 | 129 | 130 | if __name__ == "__main__": 131 | wait_for_mysql() 132 | commands_mysql_utils.prepare_database() 133 | app.run(host="0.0.0.0", debug=True) 134 | -------------------------------------------------------------------------------- /src/command_service.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List, Optional 3 | from uuid import uuid4 4 | 5 | import pandas as pd 6 | import requests 7 | from pandas._libs.tslibs.nattype import NaT 8 | from pandas._libs.tslibs.timestamps import Timestamp 9 | 10 | import model.hashtag_scrap_params as hashtag_scrap_params 11 | import utils.docker_logs as docker_logs 12 | import utils.interval_utils as interval_utils 13 | from configuration import webhook_config 14 | from dao import user_favorites_task_dao, user_followings_task_dao, \ 15 | user_followers_task_dao, user_details_task_dao, search_by_task_dao, user_tweets_task_dao, session_dao 16 | from model.scrap_type import ScrapType 17 | from model.time_interval import TimeInterval 18 | from model.user_scrap_params import UserFavoritesScrapTaskParams, UserFollowersScrapTaskParams, \ 19 | UserFollowingScrapTaskParams, \ 20 | UserDetailsScrapTaskParams, UserTweetsScrapTaskParams 21 | from utils.params_encoder import ParamsEncoder 22 | from utils.rabbit_send_utils import send_to_rabbit 23 | 24 | logger = docker_logs.get_logger('command_service') 25 | 26 | 27 | def get_new_index() -> str: 28 | return str(uuid4()) 29 | 30 | 31 | def get_scrap_session_id_by_name(scrap_session_name: str) -> str: 32 | session_id = session_dao.get_scrap_session_id_by_name(scrap_session_name) 33 | if session_id is None: 34 | session_id = get_new_index() 35 | session_dao.add_session(session_id, scrap_session_name) 36 | return session_id 37 | 38 | 39 | def get_interval_list(since: datetime, until: datetime, 40 | interval_type: interval_utils.TimeIntervalType) -> List[TimeInterval]: 41 | return interval_utils.get_list_interval(since, until, interval_type) 42 | 43 | 44 | def add_user_tweets_to_scrap(username: str, since: datetime, until: datetime, queue_name: str, scrap_series: str, 45 | interval_type: interval_utils.TimeIntervalType): 46 | intervals = get_interval_list(since, until, interval_type=interval_type) 47 | since_non_null = sorted([it.since for it in intervals])[0] 48 | until_non_null = sorted([it.until for it in intervals])[-1] 49 | scrap_session_id = get_scrap_session_id_by_name(scrap_series) 50 | task_id = get_new_index() 51 | user_tweets_task_dao.add_task(task_id, username, since_non_null, until_non_null, datetime.now(), scrap_session_id, 52 | queue_name) 53 | for interval in get_interval_list(since=since, until=until, interval_type=interval_type): 54 | params = UserTweetsScrapTaskParams( 55 | task_id=get_new_index(), 56 | username=username, 57 | since=interval.since, 58 | until=interval.until, 59 | scrap_series=scrap_series, 60 | queue_name=queue_name 61 | ) 62 | user_tweets_task_dao.add_sub_task(params.task_id, task_id, params.since, params.until, datetime.now()) 63 | params_str = ParamsEncoder().default(params) 64 | logger.info(params_str + " " + params.queue_name) 65 | send_to_rabbit(params.queue_name, params_str) 66 | return 67 | 68 | 69 | def add_user_details_to_scrap(username: str, queue_name: str, scrap_series: str): 70 | scrap_session_id = get_scrap_session_id_by_name(scrap_series) 71 | params = UserDetailsScrapTaskParams( 72 | task_id=get_new_index(), 73 | username=username, 74 | queue_name=queue_name, 75 | scrap_series=scrap_series 76 | ) 77 | user_details_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name) 78 | params_str = ParamsEncoder().default(params) 79 | logger.info(params_str + " " + params.queue_name) 80 | send_to_rabbit(params.queue_name, params_str) 81 | return 82 | 83 | 84 | def add_user_followings_to_scrap(username: str, queue_name: str, scrap_series: str): 85 | scrap_session_id = get_scrap_session_id_by_name(scrap_series) 86 | params = UserFollowingScrapTaskParams( 87 | task_id=get_new_index(), 88 | username=username, 89 | queue_name=queue_name, 90 | scrap_series=scrap_series 91 | ) 92 | user_followings_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name) 93 | params_str = ParamsEncoder().default(params) 94 | logger.info(params_str + " " + queue_name) 95 | send_to_rabbit(queue_name, params_str) 96 | return 97 | 98 | 99 | def add_user_followers_to_scrap(username: str, queue_name: str, scrap_series: str): 100 | scrap_session_id = get_scrap_session_id_by_name(scrap_series) 101 | params = UserFollowersScrapTaskParams( 102 | task_id=get_new_index(), 103 | username=username, 104 | queue_name=queue_name, 105 | scrap_series=scrap_series 106 | ) 107 | user_followers_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name) 108 | params_str = ParamsEncoder().default(params) 109 | logger.info(params_str + " " + queue_name) 110 | send_to_rabbit(queue_name, params_str) 111 | return 112 | 113 | 114 | def add_user_favorites_to_scrap(username: str, queue_name: str, scrap_series: str): 115 | scrap_session_id = get_scrap_session_id_by_name(scrap_series) 116 | params = UserFavoritesScrapTaskParams( 117 | task_id=get_new_index(), 118 | username=username, 119 | queue_name=queue_name, 120 | scrap_series=scrap_series 121 | ) 122 | user_favorites_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name) 123 | params_str = ParamsEncoder().default(params) 124 | logger.info(params_str + " " + queue_name) 125 | send_to_rabbit(queue_name, params_str) 126 | return 127 | 128 | 129 | def add_search_to_scrap(phrase: str, since: Optional[datetime], until: Optional[datetime], language: Optional[str], 130 | queue_name: str, scrap_series: str, interval_type: interval_utils.TimeIntervalType): 131 | intervals = get_interval_list(since, until, interval_type=interval_type) 132 | since_non_null = sorted([it.since for it in intervals])[0] 133 | until_non_null = sorted([it.until for it in intervals])[-1] 134 | scrap_session_id = get_scrap_session_id_by_name(scrap_series) 135 | task_id = get_new_index() 136 | search_by_task_dao.add_task(task_id, phrase, since_non_null, until_non_null, datetime.now(), scrap_session_id, 137 | queue_name) 138 | for interval in intervals: 139 | params = hashtag_scrap_params.PhraseScrapTaskParams( 140 | task_id=get_new_index(), 141 | phrase=phrase, 142 | since=interval.since, 143 | until=interval.until, 144 | language=language, 145 | queue_name=queue_name, 146 | scrap_series=scrap_series 147 | ) 148 | search_by_task_dao.add_sub_task(params.task_id, task_id, params.since, params.until, datetime.now()) 149 | params_str = ParamsEncoder().default(params) 150 | logger.info(params_str + " " + params.queue_name) 151 | send_to_rabbit(params.queue_name, params_str) 152 | return 153 | 154 | 155 | def send_session_finished_webhook(scrap_session_name: str): 156 | post_data = { 157 | 'scrap_session_name': scrap_session_name, 158 | } 159 | url = webhook_config.get_webhook_host() + '/scrap_session_finished' 160 | requests.post(url, data=post_data) 161 | return 162 | 163 | 164 | def support_finish_session(session_id: str): 165 | count = session_dao.get_not_finished_session_tasks_count(session_id) 166 | if count == 0: 167 | session_name = session_dao.get_scrap_session_name_by_id(session_id) 168 | logger.info('finished session ' + session_name) 169 | if webhook_config.is_webhook_configured(): 170 | send_session_finished_webhook(session_name) 171 | else: 172 | logger.info('webhook not configured') 173 | else: 174 | logger.info('count to finish session: ' + str(count)) 175 | return 176 | 177 | 178 | def set_task_as_finished(task_id: str, task_type: ScrapType): 179 | scrap_session_id = '' 180 | if task_type == ScrapType.USER_FAVORITES: 181 | user_favorites_task_dao.set_task_finished(task_id, datetime.now()) 182 | scrap_session_id = user_favorites_task_dao.get_session_id(task_id) 183 | elif task_type == ScrapType.USER_FOLLOWINGS: 184 | user_followings_task_dao.set_task_finished(task_id, datetime.now()) 185 | scrap_session_id = user_followings_task_dao.get_session_id(task_id) 186 | elif task_type == ScrapType.USER_FOLLOWERS: 187 | user_followers_task_dao.set_task_finished(task_id, datetime.now()) 188 | scrap_session_id = user_followers_task_dao.get_session_id(task_id) 189 | elif task_type == ScrapType.USER_DETAILS: 190 | user_details_task_dao.set_task_finished(task_id, datetime.now()) 191 | scrap_session_id = user_details_task_dao.get_session_id(task_id) 192 | else: 193 | raise Exception("Bad type") 194 | 195 | support_finish_session(scrap_session_id) 196 | return 197 | 198 | 199 | def set_sub_task_as_finished(sub_task_id: str, task_type: ScrapType): 200 | if task_type == ScrapType.SEARCH_BY_PHRASE: 201 | dao = search_by_task_dao 202 | elif task_type == ScrapType.USER_TWEETS: 203 | dao = user_tweets_task_dao 204 | else: 205 | raise Exception("Bad type") 206 | 207 | task_id = dao.get_task_id_sub_task_id(sub_task_id) 208 | dao.set_sub_task_finished(sub_task_id, datetime.now()) 209 | not_finished_sub_tasks_count = dao.get_all_not_finished_sub_tasks_by_task_id(task_id).size 210 | if not_finished_sub_tasks_count == 0: 211 | dao.set_task_finished(task_id, datetime.now()) 212 | scrap_session_id = dao.get_session_id(task_id) 213 | support_finish_session(scrap_session_id) 214 | return 215 | 216 | 217 | def map_value_to_string(value) -> Optional[any]: 218 | # print(value.__class__) 219 | if isinstance(value, Timestamp): 220 | print(value) 221 | return value.isoformat() 222 | elif value is NaT: 223 | return None 224 | else: 225 | return value 226 | 227 | 228 | def data_frame_to_json_list(df: pd.DataFrame): 229 | df_list = [dict(row) for index, row in df.iterrows()] 230 | df_list = [ 231 | {key: map_value_to_string(row[key]) for key in row.keys()} 232 | for row in df_list 233 | ] 234 | return df_list 235 | 236 | 237 | def get_all_scrapped_tasks(): 238 | df_dict = dict({ 239 | ScrapType.SEARCH_BY_PHRASE.name: search_by_task_dao.get_all_tasks(), 240 | ScrapType.USER_FOLLOWERS.name: user_followers_task_dao.get_all_tasks(), 241 | ScrapType.USER_FOLLOWINGS.name: user_followings_task_dao.get_all_tasks(), 242 | ScrapType.USER_FAVORITES.name: user_favorites_task_dao.get_all_tasks(), 243 | ScrapType.USER_DETAILS.name: user_details_task_dao.get_all_tasks(), 244 | ScrapType.USER_TWEETS.name: user_tweets_task_dao.get_all_tasks() 245 | }) 246 | return {it: data_frame_to_json_list(df_dict[it]) for it in df_dict.keys()} 247 | -------------------------------------------------------------------------------- /src/configuration/command_server_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | _command_server_host = os.environ['COMMAND_SERVER_HOST'] 4 | 5 | 6 | def get_command_server_host() -> str: 7 | return _command_server_host 8 | -------------------------------------------------------------------------------- /src/configuration/mysql_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mysql 4 | from mysql.connector import MySQLConnection 5 | 6 | db_hostname = os.environ['MYSQL_HOST'] 7 | db_port = os.environ['MYSQL_PORT'] 8 | db_username = os.environ['MYSQL_USER'] 9 | db_password = os.environ['MYSQL_PASSWORD'] 10 | TWINT_DISTRIBUTED_DATABASE = 'twint_distributed_tasks' 11 | 12 | 13 | def get_db_connection() -> MySQLConnection: 14 | return mysql.connector.connect( 15 | host=db_hostname, 16 | port=db_port, 17 | user=db_username, 18 | passwd=db_password, 19 | database=TWINT_DISTRIBUTED_DATABASE 20 | ) 21 | 22 | 23 | def get_db_connection_base() -> MySQLConnection: 24 | return mysql.connector.connect( 25 | host=db_hostname, 26 | port=db_port, 27 | user=db_username, 28 | passwd=db_password 29 | ) 30 | -------------------------------------------------------------------------------- /src/configuration/proxy_config.py: -------------------------------------------------------------------------------- 1 | class ProxyConfig: 2 | 3 | def __init__(self, host: str, port: int, proxy_type: str): 4 | self._host = host 5 | self._port = port 6 | self._proxy_type = proxy_type 7 | return 8 | 9 | def get_host(self): 10 | return self._host 11 | 12 | def get_port(self): 13 | return self._port 14 | 15 | def get_proxy_type(self): 16 | return self._proxy_type 17 | 18 | def to_string(self): 19 | return 'ProxyConfig(host=' + self.get_host() + '; port=' + str( 20 | self.get_port()) + '; proxy_type=' + self.get_proxy_type() + ')' 21 | 22 | 23 | default_proxy_config = ProxyConfig('localhost', 9050, 'socks5') 24 | -------------------------------------------------------------------------------- /src/configuration/rabbit_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pika 4 | 5 | _rabbit_host = os.environ['RABBIT_HOST'] 6 | _rabbit_username = os.environ['RABBIT_USERNAME'] 7 | _rabbit_password = os.environ['RABBIT_PASSWORD'] 8 | 9 | 10 | def get_rabbit_connection_config() -> pika.ConnectionParameters: 11 | return pika.ConnectionParameters( 12 | host=_rabbit_host, 13 | credentials=pika.credentials.PlainCredentials( 14 | username=_rabbit_username, 15 | password=_rabbit_password 16 | ), 17 | heartbeat=30 18 | ) 19 | -------------------------------------------------------------------------------- /src/configuration/upload_file_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | _upload_file_host = os.environ['UPLOAD_FILE_HOST'] 4 | 5 | 6 | def get_upload_file_host() -> str: 7 | return _upload_file_host 8 | -------------------------------------------------------------------------------- /src/configuration/webhook_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def is_webhook_configured() -> bool: 5 | return 'WEBHOOK_HOST' in os.environ and os.environ['WEBHOOK_HOST'] != 'no_host' 6 | 7 | 8 | def get_webhook_host() -> str: 9 | return os.environ['WEBHOOK_HOST'] 10 | -------------------------------------------------------------------------------- /src/configuration/worker_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_queue_name() -> str: 5 | return os.environ['QUEUE_NAME'] 6 | -------------------------------------------------------------------------------- /src/dao/search_by_task_dao.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query 4 | 5 | 6 | def add_task(task_id: str, phrase: str, since: datetime, until: datetime, created: datetime, scrap_session_id: str, 7 | queue_name: str): 8 | execute_sql_modify( 9 | '''INSERT INTO twint_distributed_tasks.SearchTweetScrapTasks(task_id, phrase, since, until, created, finished, 10 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s, %s, %s);''', 11 | [task_id, phrase, since, until, created, None, scrap_session_id, queue_name]) 12 | return 13 | 14 | 15 | def add_sub_task(sub_task_id: str, task_id: str, since: datetime, until: datetime, created: datetime): 16 | print('task_id', task_id) 17 | execute_sql_modify( 18 | '''INSERT INTO twint_distributed_tasks.SearchTweetScrapSubTasks(sub_task_id, task_id, since, until, created, 19 | finished) VALUE (%s, %s, %s, %s, %s, %s);''', 20 | [sub_task_id, task_id, since, until, created, None]) 21 | return 22 | 23 | 24 | def set_task_finished(task_id: str, finished: datetime): 25 | execute_sql_modify( 26 | '''UPDATE twint_distributed_tasks.SearchTweetScrapTasks 27 | SET finished = %s 28 | WHERE task_id = %s''', 29 | [finished, task_id]) 30 | return 31 | 32 | 33 | def set_sub_task_finished(sub_task_id: str, finished: datetime): 34 | execute_sql_modify( 35 | '''UPDATE twint_distributed_tasks.SearchTweetScrapSubTasks 36 | SET finished = %s 37 | WHERE sub_task_id = %s''', 38 | [finished, sub_task_id]) 39 | return 40 | 41 | 42 | def get_all_not_finished_sub_tasks_by_task_id(task_id: str): 43 | return execute_sql_query( 44 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapSubTasks WHERE task_id=%s AND finished IS NULL', 45 | [task_id]) 46 | 47 | 48 | def get_session_id(task_id: str) -> str: 49 | return execute_sql_query( 50 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapTasks WHERE task_id=%s', 51 | [task_id] 52 | )['scrap_session_id'].to_numpy()[0] 53 | 54 | 55 | def get_task_id_sub_task_id(sub_task_id: str) -> str: 56 | return execute_sql_query( 57 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapSubTasks WHERE sub_task_id=%s', 58 | [sub_task_id] 59 | )['task_id'].to_numpy()[0] 60 | 61 | 62 | def get_all_tasks_by_username(phrase: str): 63 | return execute_sql_query( 64 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapTasks WHERE phrase=%s', 65 | [phrase] 66 | ) 67 | 68 | 69 | def get_all_tasks(): 70 | return execute_sql_query( 71 | '''SELECT task_id, phrase, since, until, language, created, finished, queue_name, scrap_session_name 72 | FROM twint_distributed_tasks.SearchTweetScrapTasks t 73 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''', 74 | []) 75 | -------------------------------------------------------------------------------- /src/dao/session_dao.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query 4 | 5 | 6 | def add_session(scrap_session_id: str, scrap_session_name: str): 7 | execute_sql_modify( 8 | '''INSERT INTO twint_distributed_tasks.ScrapSession(scrap_session_id, scrap_session_name) VALUE (%s, %s);''', 9 | [scrap_session_id, scrap_session_name]) 10 | return 11 | 12 | 13 | def get_scrap_session_id_by_name(scrap_session_name: str) -> Optional[str]: 14 | values = list( 15 | execute_sql_query( 16 | 'SELECT * FROM twint_distributed_tasks.ScrapSession WHERE scrap_session_name=%s', 17 | [scrap_session_name] 18 | )['scrap_session_id'].to_numpy()) 19 | return values[0] if len(values) > 0 else None 20 | 21 | 22 | def get_scrap_session_name_by_id(scrap_session_name: str) -> Optional[str]: 23 | values = list( 24 | execute_sql_query( 25 | 'SELECT * FROM twint_distributed_tasks.ScrapSession WHERE scrap_session_id=%s', 26 | [scrap_session_name] 27 | )['scrap_session_name'].to_numpy()) 28 | return values[0] if len(values) > 0 else None 29 | 30 | 31 | def get_not_finished_session_tasks_count(scrap_session_id: str) -> int: 32 | queries = [ 33 | '''SELECT COUNT(*) FROM twint_distributed_tasks.SearchTweetScrapTasks 34 | WHERE scrap_session_id=%s AND finished IS NULL''', 35 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserDetailsScrapTasks 36 | WHERE scrap_session_id=%s AND finished IS NULL''', 37 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserTweetScrapTasks 38 | WHERE scrap_session_id=%s AND finished IS NULL''', 39 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserFollowersScrapTasks 40 | WHERE scrap_session_id=%s AND finished IS NULL''', 41 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserFollowingScrapTasks 42 | WHERE scrap_session_id=%s AND finished IS NULL''', 43 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserFavoritesScrapTasks 44 | WHERE scrap_session_id=%s AND finished IS NULL''' 45 | ] 46 | return sum([execute_sql_query(query, [scrap_session_id]).to_numpy()[0] for query in queries]) 47 | 48 | 49 | def get_all_sessions(): 50 | return execute_sql_query('SELECT * FROM twint_distributed_tasks.ScrapSession', []) 51 | -------------------------------------------------------------------------------- /src/dao/user_details_task_dao.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query 4 | 5 | 6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str): 7 | execute_sql_modify( 8 | '''INSERT INTO twint_distributed_tasks.UserDetailsScrapTasks(task_id, username, created, finished, 9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''', 10 | [task_id, username, created, None, scrap_session_id, queue_name]) 11 | return 12 | 13 | 14 | def set_task_finished(task_id: str, finished: datetime): 15 | execute_sql_modify( 16 | '''UPDATE twint_distributed_tasks.UserDetailsScrapTasks 17 | SET finished = %s 18 | WHERE task_id = %s''', 19 | [finished, task_id]) 20 | return 21 | 22 | 23 | def get_session_id(task_id: str) -> str: 24 | return execute_sql_query( 25 | 'SELECT * FROM twint_distributed_tasks.UserDetailsScrapTasks WHERE task_id=%s', 26 | [task_id] 27 | )['scrap_session_id'].to_numpy()[0] 28 | 29 | 30 | def get_all_by_username(username: str): 31 | return execute_sql_query( 32 | 'SELECT * FROM twint_distributed_tasks.UserDetailsScrapTasks WHERE username=%s', 33 | [username] 34 | ) 35 | 36 | 37 | def get_all_tasks(): 38 | return execute_sql_query( 39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name 40 | FROM twint_distributed_tasks.UserDetailsScrapTasks t 41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''', 42 | []) 43 | -------------------------------------------------------------------------------- /src/dao/user_favorites_task_dao.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query 4 | 5 | 6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str): 7 | execute_sql_modify( 8 | '''INSERT INTO twint_distributed_tasks.UserFavoritesScrapTasks(task_id, username, created, finished, 9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''', 10 | [task_id, username, created, None, scrap_session_id, queue_name]) 11 | return 12 | 13 | 14 | def set_task_finished(task_id: str, finished: datetime): 15 | execute_sql_modify( 16 | '''UPDATE twint_distributed_tasks.UserFavoritesScrapTasks 17 | SET finished = %s 18 | WHERE task_id = %s''', 19 | [finished, task_id]) 20 | return 21 | 22 | 23 | def get_session_id(task_id: str) -> str: 24 | return execute_sql_query( 25 | 'SELECT * FROM twint_distributed_tasks.UserFavoritesScrapTasks WHERE task_id=%s', 26 | [task_id] 27 | )['scrap_session_id'].to_numpy()[0] 28 | 29 | 30 | def get_all_by_username(username: str): 31 | return execute_sql_query( 32 | 'SELECT * FROM twint_distributed_tasks.UserFavoritesScrapTasks WHERE username=%s', 33 | [username] 34 | ) 35 | 36 | 37 | def get_all_tasks(): 38 | return execute_sql_query( 39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name 40 | FROM twint_distributed_tasks.UserFavoritesScrapTasks t 41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''', 42 | []) 43 | -------------------------------------------------------------------------------- /src/dao/user_followers_task_dao.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query 4 | 5 | 6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str): 7 | execute_sql_modify( 8 | '''INSERT INTO twint_distributed_tasks.UserFollowersScrapTasks(task_id, username, created, finished, 9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''', 10 | [task_id, username, created, None, scrap_session_id, queue_name]) 11 | return 12 | 13 | 14 | def set_task_finished(task_id: str, finished: datetime): 15 | execute_sql_modify( 16 | '''UPDATE twint_distributed_tasks.UserFollowersScrapTasks 17 | SET finished = %s 18 | WHERE task_id = %s''', 19 | [finished, task_id]) 20 | return 21 | 22 | 23 | def get_session_id(task_id: str) -> str: 24 | return execute_sql_query( 25 | 'SELECT * FROM twint_distributed_tasks.UserFollowersScrapTasks WHERE task_id=%s', 26 | [task_id] 27 | )['scrap_session_id'].to_numpy()[0] 28 | 29 | 30 | def get_all_by_username(username: str): 31 | return execute_sql_query( 32 | 'SELECT * FROM twint_distributed_tasks.UserFollowersScrapTasks WHERE username=%s', 33 | [username] 34 | ) 35 | 36 | 37 | def get_all_tasks(): 38 | return execute_sql_query( 39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name 40 | FROM twint_distributed_tasks.UserFollowersScrapTasks t 41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''', 42 | []) 43 | -------------------------------------------------------------------------------- /src/dao/user_followings_task_dao.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query 4 | 5 | 6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str): 7 | execute_sql_modify( 8 | '''INSERT INTO twint_distributed_tasks.UserFollowingScrapTasks(task_id, username, created, finished, 9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''', 10 | [task_id, username, created, None, scrap_session_id, queue_name]) 11 | return 12 | 13 | 14 | def set_task_finished(task_id: str, finished: datetime): 15 | execute_sql_modify( 16 | '''UPDATE twint_distributed_tasks.UserFollowingScrapTasks 17 | SET finished = %s 18 | WHERE task_id = %s''', 19 | [finished, task_id]) 20 | return 21 | 22 | 23 | def get_session_id(task_id: str) -> str: 24 | return execute_sql_query( 25 | 'SELECT * FROM twint_distributed_tasks.UserFollowingScrapTasks WHERE task_id=%s', 26 | [task_id] 27 | )['scrap_session_id'].to_numpy()[0] 28 | 29 | 30 | def get_all_by_username(username: str): 31 | return execute_sql_query( 32 | 'SELECT * FROM twint_distributed_tasks.UserFollowingScrapTasks WHERE username=%s', 33 | [username] 34 | ) 35 | 36 | 37 | def get_all_tasks(): 38 | return execute_sql_query( 39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name 40 | FROM twint_distributed_tasks.UserFollowingScrapTasks t 41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''', 42 | []) 43 | -------------------------------------------------------------------------------- /src/dao/user_tweets_task_dao.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query 4 | 5 | 6 | def add_task(task_id: str, username: str, since: datetime, until: datetime, created: datetime, 7 | scrap_session_id: str, queue_name: str): 8 | execute_sql_modify( 9 | '''INSERT INTO twint_distributed_tasks.UserTweetScrapTasks(task_id, username, since, until, created, finished, 10 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s, %s, %s);''', 11 | [task_id, username, since, until, created, None, scrap_session_id, queue_name]) 12 | return 13 | 14 | 15 | def add_sub_task(sub_task_id: str, task_id: str, since: datetime, until: datetime, created: datetime): 16 | execute_sql_modify( 17 | '''INSERT INTO twint_distributed_tasks.UserTweetScrapSubTasks(sub_task_id, task_id, since, until, created, 18 | finished) VALUE (%s, %s, %s, %s, %s, %s);''', 19 | [sub_task_id, task_id, since, until, created, None]) 20 | return 21 | 22 | 23 | def set_task_finished(task_id: str, finished: datetime): 24 | execute_sql_modify( 25 | '''UPDATE twint_distributed_tasks.UserTweetScrapTasks 26 | SET finished = %s 27 | WHERE task_id = %s''', 28 | [finished, task_id]) 29 | return 30 | 31 | 32 | def set_sub_task_finished(sub_task_id: str, finished: datetime): 33 | execute_sql_modify( 34 | '''UPDATE twint_distributed_tasks.UserTweetScrapSubTasks 35 | SET finished = %s 36 | WHERE sub_task_id = %s''', 37 | [finished, sub_task_id]) 38 | return 39 | 40 | 41 | def get_session_id(task_id: str) -> str: 42 | return execute_sql_query( 43 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapTasks WHERE task_id=%s', 44 | [task_id] 45 | )['scrap_session_id'].to_numpy()[0] 46 | 47 | 48 | def get_all_not_finished_sub_tasks_by_task_id(task_id: str): 49 | return execute_sql_query( 50 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapSubTasks WHERE task_id=%s AND finished IS NULL', 51 | [task_id]) 52 | 53 | 54 | def get_task_id_sub_task_id(sub_task_id: str) -> str: 55 | return execute_sql_query( 56 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapSubTasks WHERE sub_task_id=%s', 57 | [sub_task_id] 58 | )['task_id'].to_numpy()[0] 59 | 60 | 61 | def get_all_tasks_by_username(username: str): 62 | return execute_sql_query( 63 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapTasks WHERE username=%s', 64 | [username] 65 | ) 66 | 67 | 68 | def get_all_tasks(): 69 | return execute_sql_query( 70 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name, since, until 71 | FROM twint_distributed_tasks.UserTweetScrapTasks t 72 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''', 73 | []) 74 | -------------------------------------------------------------------------------- /src/data_server.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from flask import Flask, Response 3 | from flask import request, jsonify 4 | 5 | import utils.directory_utils as directory_utils 6 | import utils.docker_logs as docker_logs 7 | import utils.sqlite_util as sqlite_util 8 | 9 | logger = docker_logs.get_logger('data_server') 10 | 11 | app = Flask(__name__) 12 | app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False 13 | 14 | ROOT_DATA_DIR = '/data' 15 | 16 | 17 | def get_success_response(): 18 | return jsonify({'status': 'SUCCESS'}) 19 | 20 | 21 | def get_data_to_save_directory(data_type: str, sub_series: str) -> str: 22 | return ROOT_DATA_DIR + '/scrap_data/' + data_type + '/' + sub_series 23 | 24 | 25 | def df_to_json_response(df: pd.DataFrame) -> Response: 26 | return Response( 27 | df.to_json(orient="records", date_format='iso'), 28 | mimetype='application/json' 29 | ) 30 | 31 | 32 | @app.route("/upload_result_file", methods=['POST']) 33 | def upload_result_file(): 34 | file = request.files['file'] 35 | data = request.form 36 | sub_series = data['sub_series'] 37 | filename = data['filename'] 38 | data_type = data['data_type'] 39 | 40 | file_directory = get_data_to_save_directory(data_type, sub_series) 41 | file_path = file_directory + '/' + filename 42 | 43 | directory_utils.prepare_directory(file_directory) 44 | file.save(file_path) 45 | 46 | return get_success_response() 47 | 48 | 49 | @app.route("/get_user_details/", methods=['GET']) 50 | def get_user_details(username: str): 51 | user_folder_name = 'u_' + username 52 | user_details_db_file = 'ud_' + username + '.db' 53 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_details' + '/' + user_folder_name + '/' + user_details_db_file 54 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM users') 55 | return df_to_json_response(df) 56 | 57 | 58 | @app.route("/get_user_tweets/", methods=['GET']) 59 | def get_user_tweets(username: str): 60 | logger.info('get_user_tweets ' + username + ' start read tweets') 61 | user_folder_name = 'u_' + username 62 | base_directory_path = ROOT_DATA_DIR + '/scrap_data/user_tweets' + '/' + user_folder_name + '/' 63 | db_files = directory_utils.get_db_files_path_list_from_directory(base_directory_path) 64 | merged_data_df = pd.concat([ 65 | sqlite_util.get_df_from_sqlite_db(db_file, 'SELECT * FROM tweets') 66 | for db_file in db_files 67 | ]) 68 | logger.info('get_user_tweets ' + username + ' processing finished') 69 | df_without_duplicates = merged_data_df.drop_duplicates(subset="id_str") 70 | return df_to_json_response(df_without_duplicates) 71 | 72 | 73 | @app.route("/get_searched_tweets/", methods=['GET']) 74 | def get_searched_tweets(to_search: str): 75 | logger.info('get_searched_tweets ' + to_search + ' start read tweets') 76 | phrase_folder_name = 's_' + to_search 77 | base_directory_path = ROOT_DATA_DIR + '/scrap_data/search_by_phrase' + '/' + phrase_folder_name + '/' 78 | db_files = directory_utils.get_db_files_path_list_from_directory(base_directory_path) 79 | merged_data_df = pd.concat([ 80 | sqlite_util.get_df_from_sqlite_db(db_file, 'SELECT * FROM tweets') 81 | for db_file in db_files 82 | ]) 83 | logger.info('get_searched_tweets ' + to_search + ' start remove duplicates') 84 | df_without_duplicates = merged_data_df.drop_duplicates(subset="id_str") 85 | logger.info('get_searched_tweets ' + to_search + ' processing finished') 86 | return df_to_json_response(df_without_duplicates) 87 | 88 | 89 | @app.route("/get_user_followers/", methods=['GET']) 90 | def get_user_followers(username: str): 91 | user_folder_name = 'u_' + username 92 | user_details_db_file = 'ufe_' + username + '.db' 93 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_followers' + '/' + user_folder_name + '/' + user_details_db_file 94 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM followers_names')['user'] 95 | return df_to_json_response(df) 96 | 97 | 98 | @app.route("/get_user_followings/", methods=['GET']) 99 | def get_user_followings(username: str): 100 | user_folder_name = 'u_' + username 101 | user_details_db_file = 'ufi_' + username + '.db' 102 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_followings' + '/' + user_folder_name + '/' + user_details_db_file 103 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM following_names')['user'] 104 | return df_to_json_response(df) 105 | 106 | 107 | @app.route("/get_user_favorites/", methods=['GET']) 108 | def get_user_favorites(username: str): 109 | user_folder_name = 'u_' + username 110 | user_details_db_file = 'ufa_' + username + '.db' 111 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_favorites' + '/' + user_folder_name + '/' + user_details_db_file 112 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM favorites')['tweet_id'] 113 | return df_to_json_response(df) 114 | 115 | 116 | if __name__ == "__main__": 117 | app.run(host="0.0.0.0", debug=True) 118 | -------------------------------------------------------------------------------- /src/model/hashtag_scrap_params.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dataclasses import dataclass 3 | from typing import Optional 4 | 5 | from dateutil.parser import parse as date_parser 6 | 7 | import utils.time_utils as time_utils 8 | from model.scrap_type import ScrapType 9 | from model.time_interval import TimeInterval 10 | 11 | 12 | @dataclass(frozen=True) 13 | class PhraseScrapTaskParams: 14 | task_id: str 15 | phrase: str 16 | since: datetime.datetime 17 | until: datetime.datetime 18 | language: Optional[str] 19 | scrap_series: str 20 | queue_name: str 21 | type: ScrapType 22 | 23 | def __init__( 24 | self, 25 | task_id: str, 26 | phrase: str, 27 | since: datetime.datetime, 28 | until: datetime.datetime, 29 | language: Optional[str], 30 | scrap_series: str, 31 | queue_name: str 32 | ): 33 | object.__setattr__(self, 'task_id', task_id) 34 | object.__setattr__(self, 'phrase', phrase) 35 | object.__setattr__(self, 'since', time_utils.remove_microseconds_from_datetime(since)) 36 | object.__setattr__(self, 'until', time_utils.remove_microseconds_from_datetime(until)) 37 | object.__setattr__(self, 'type', ScrapType.SEARCH_BY_PHRASE) 38 | object.__setattr__(self, 'scrap_series', scrap_series) 39 | object.__setattr__(self, 'language', language) 40 | object.__setattr__(self, 'queue_name', queue_name) 41 | return 42 | 43 | def get_time_interval(self): 44 | return TimeInterval(self.since, self.until) 45 | 46 | @staticmethod 47 | def from_dict(dictionary): 48 | return PhraseScrapTaskParams( 49 | dictionary['task_id'], 50 | dictionary['phrase'], 51 | date_parser(dictionary['since']), 52 | date_parser(dictionary['until']), 53 | dictionary['language'], 54 | dictionary['scrap_series'], 55 | dictionary['queue_name'] 56 | ) 57 | -------------------------------------------------------------------------------- /src/model/scrap_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ScrapType(Enum): 5 | SEARCH_BY_PHRASE = 1 6 | USER_DETAILS = 2 7 | USER_TWEETS = 3 8 | USER_FOLLOWERS = 4 9 | USER_FOLLOWINGS = 5 10 | USER_FAVORITES = 6 11 | -------------------------------------------------------------------------------- /src/model/time_interval.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass(frozen=True) 6 | class TimeInterval: 7 | since: datetime.datetime 8 | until: datetime.datetime 9 | 10 | def __init__(self, since: datetime.datetime, until: datetime.datetime): 11 | object.__setattr__(self, 'since', since) 12 | object.__setattr__(self, 'until', until) 13 | return 14 | -------------------------------------------------------------------------------- /src/model/user_scrap_params.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from dataclasses import dataclass 3 | 4 | from dateutil.parser import parse as date_parser 5 | 6 | import utils.time_utils as time_utils 7 | from model.scrap_type import ScrapType 8 | from model.time_interval import TimeInterval 9 | 10 | 11 | @dataclass(frozen=True) 12 | class UserTweetsScrapTaskParams: 13 | task_id: str 14 | username: str 15 | since: datetime.datetime 16 | until: datetime.datetime 17 | type: ScrapType 18 | scrap_series: str 19 | queue_name: str 20 | 21 | def __init__( 22 | self, 23 | task_id: str, 24 | username: str, 25 | since: datetime.datetime, 26 | until: datetime.datetime, 27 | scrap_series: str, 28 | queue_name: str 29 | ): 30 | object.__setattr__(self, 'task_id', task_id) 31 | object.__setattr__(self, 'username', username) 32 | object.__setattr__(self, 'since', time_utils.remove_microseconds_from_datetime(since)) 33 | object.__setattr__(self, 'until', time_utils.remove_microseconds_from_datetime(until)) 34 | object.__setattr__(self, 'type', ScrapType.USER_TWEETS) 35 | object.__setattr__(self, 'scrap_series', scrap_series) 36 | object.__setattr__(self, 'queue_name', queue_name) 37 | return 38 | 39 | def get_time_interval(self): 40 | return TimeInterval(self.since, self.until) 41 | 42 | @staticmethod 43 | def from_dict(dictionary): 44 | return UserTweetsScrapTaskParams( 45 | dictionary['task_id'], 46 | dictionary['username'], 47 | date_parser(dictionary['since']), 48 | date_parser(dictionary['until']), 49 | dictionary['scrap_series'], 50 | dictionary['queue_name'] 51 | ) 52 | 53 | 54 | @dataclass(frozen=True) 55 | class UserDetailsScrapTaskParams: 56 | task_id: str 57 | username: str 58 | scrap_series: str 59 | type: ScrapType 60 | queue_name: str 61 | 62 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str): 63 | object.__setattr__(self, 'task_id', task_id) 64 | object.__setattr__(self, 'username', username) 65 | object.__setattr__(self, 'scrap_series', scrap_series) 66 | object.__setattr__(self, 'type', ScrapType.USER_DETAILS) 67 | object.__setattr__(self, 'queue_name', queue_name) 68 | return 69 | 70 | @staticmethod 71 | def from_dict(dictionary): 72 | return UserDetailsScrapTaskParams( 73 | dictionary['task_id'], 74 | dictionary['username'], 75 | dictionary['scrap_series'], 76 | dictionary['queue_name'] 77 | ) 78 | 79 | 80 | @dataclass(frozen=True) 81 | class UserFollowersScrapTaskParams: 82 | task_id: str 83 | username: str 84 | scrap_series: str 85 | type: ScrapType 86 | queue_name: str 87 | 88 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str): 89 | object.__setattr__(self, 'task_id', task_id) 90 | object.__setattr__(self, 'username', username) 91 | object.__setattr__(self, 'scrap_series', scrap_series) 92 | object.__setattr__(self, 'type', ScrapType.USER_FOLLOWERS) 93 | object.__setattr__(self, 'queue_name', queue_name) 94 | return 95 | 96 | @staticmethod 97 | def from_dict(dictionary): 98 | return UserFollowersScrapTaskParams( 99 | dictionary['task_id'], 100 | dictionary['username'], 101 | dictionary['scrap_series'], 102 | dictionary['queue_name'] 103 | ) 104 | 105 | 106 | @dataclass(frozen=True) 107 | class UserFollowingScrapTaskParams: 108 | task_id: str 109 | username: str 110 | scrap_series: str 111 | type: ScrapType 112 | queue_name: str 113 | 114 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str): 115 | object.__setattr__(self, 'task_id', task_id) 116 | object.__setattr__(self, 'username', username) 117 | object.__setattr__(self, 'scrap_series', scrap_series) 118 | object.__setattr__(self, 'type', ScrapType.USER_FOLLOWINGS) 119 | object.__setattr__(self, 'queue_name', queue_name) 120 | return 121 | 122 | @staticmethod 123 | def from_dict(dictionary): 124 | return UserFollowingScrapTaskParams( 125 | dictionary['task_id'], 126 | dictionary['username'], 127 | dictionary['scrap_series'], 128 | dictionary['queue_name'] 129 | 130 | ) 131 | 132 | 133 | @dataclass(frozen=True) 134 | class UserFavoritesScrapTaskParams: 135 | task_id: str 136 | username: str 137 | scrap_series: str 138 | type: ScrapType 139 | queue_name: str 140 | 141 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str): 142 | object.__setattr__(self, 'task_id', task_id) 143 | object.__setattr__(self, 'username', username) 144 | object.__setattr__(self, 'scrap_series', scrap_series) 145 | object.__setattr__(self, 'type', ScrapType.USER_FAVORITES) 146 | object.__setattr__(self, 'queue_name', queue_name) 147 | return 148 | 149 | @staticmethod 150 | def from_dict(dictionary): 151 | return UserFavoritesScrapTaskParams( 152 | dictionary['task_id'], 153 | dictionary['username'], 154 | dictionary['scrap_series'], 155 | dictionary['queue_name'] 156 | ) 157 | -------------------------------------------------------------------------------- /src/scrap_service.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import twint 4 | 5 | import utils.docker_logs as docker_logs 6 | from configuration.proxy_config import ProxyConfig 7 | from model.hashtag_scrap_params import PhraseScrapTaskParams 8 | from model.time_interval import TimeInterval 9 | from model.user_scrap_params import UserTweetsScrapTaskParams, UserDetailsScrapTaskParams 10 | from utils.time_utils import remove_microseconds_from_datetime 11 | 12 | logger = docker_logs.get_logger('scrap_service') 13 | 14 | 15 | def get_common_config( 16 | interval: Optional[TimeInterval], 17 | db_file_path: str, 18 | proxy_config: Optional[ProxyConfig] 19 | ) -> twint.Config: 20 | twint_config = twint.Config() 21 | 22 | twint_config.Store_object = False 23 | twint_config.Hide_output = True 24 | twint_config.Retries_count = 100 25 | twint_config.Min_wait_time = 90 26 | twint_config.Backoff_exponent = 3.0 27 | 28 | if interval is not None: 29 | twint_config.Since = str(remove_microseconds_from_datetime(interval.since)) 30 | twint_config.Until = str(remove_microseconds_from_datetime(interval.until)) 31 | 32 | if proxy_config is not None: 33 | twint_config.Proxy_host = proxy_config.get_host() 34 | twint_config.Proxy_port = proxy_config.get_port() 35 | twint_config.Proxy_type = proxy_config.get_proxy_type() 36 | 37 | twint_config.Database = db_file_path 38 | 39 | return twint_config 40 | 41 | 42 | def search_tweets( 43 | search_params: PhraseScrapTaskParams, 44 | db_file_path: str, 45 | proxy_config: Optional[ProxyConfig] 46 | ): 47 | logger.info('start scrap for search: ' + search_params.phrase) 48 | twint_config = get_common_config(search_params.get_time_interval(), db_file_path, proxy_config) 49 | twint_config.Search = search_params.phrase 50 | if search_params.language is not None: 51 | twint_config.Lang = search_params.language 52 | twint.run.Search(twint_config) 53 | logger.info('finish scrap for search: ' + search_params.phrase) 54 | return 55 | 56 | 57 | def get_user_details( 58 | params: UserDetailsScrapTaskParams, 59 | db_file_path: str, 60 | proxy_config: Optional[ProxyConfig] 61 | ): 62 | logger.info('start scrap user details: ' + params.username) 63 | twint_config = get_common_config(None, db_file_path, proxy_config) 64 | twint_config.Username = params.username 65 | twint.run.Lookup(twint_config) 66 | logger.info('finish scrap user details: ' + params.username) 67 | return 68 | 69 | 70 | def get_user_favorites( 71 | params: UserDetailsScrapTaskParams, 72 | db_file_path: str, 73 | proxy_config: Optional[ProxyConfig] 74 | ): 75 | logger.info('start scrap user favorites: ' + params.username) 76 | twint_config = get_common_config(None, db_file_path, proxy_config) 77 | twint_config.Username = params.username 78 | twint.run.Favorites(twint_config) 79 | logger.info('finish scrap user favorites: ' + params.username) 80 | return 81 | 82 | 83 | def get_user_followers( 84 | params: UserDetailsScrapTaskParams, 85 | db_file_path: str, 86 | proxy_config: Optional[ProxyConfig] 87 | ): 88 | logger.info('start scrap user followers: ' + params.username) 89 | twint_config = get_common_config(None, db_file_path, proxy_config) 90 | twint_config.Username = params.username 91 | twint.run.Followers(twint_config) 92 | logger.info('finish scrap user followers: ' + params.username) 93 | return 94 | 95 | 96 | def get_user_following( 97 | params: UserDetailsScrapTaskParams, 98 | db_file_path: str, 99 | proxy_config: Optional[ProxyConfig] 100 | ): 101 | logger.info('start scrap user following: ' + params.username) 102 | twint_config = get_common_config(None, db_file_path, proxy_config) 103 | twint_config.Username = params.username 104 | twint.run.Following(twint_config) 105 | logger.info('finish scrap user following: ' + params.username) 106 | return 107 | 108 | 109 | def get_user_tweets( 110 | params: UserTweetsScrapTaskParams, 111 | db_file_path: str, 112 | proxy_config: Optional[ProxyConfig] 113 | ): 114 | logger.info('start scrap for user: ' + params.username) 115 | twint_config = get_common_config(params.get_time_interval(), db_file_path, proxy_config) 116 | twint_config.Username = params.username 117 | twint.run.Search(twint_config) 118 | logger.info('finish scrap for search: ' + params.username) 119 | return 120 | -------------------------------------------------------------------------------- /src/scrap_worker.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import functools 3 | import json 4 | import threading 5 | import time 6 | 7 | import pika 8 | import requests 9 | 10 | import scrap_service 11 | import utils.docker_logs as docker_logs 12 | import utils.tor_utils as tor_utils 13 | from configuration import rabbit_config, worker_config, command_server_config, proxy_config 14 | from model.hashtag_scrap_params import PhraseScrapTaskParams 15 | from model.scrap_type import ScrapType 16 | from model.user_scrap_params import UserTweetsScrapTaskParams, UserDetailsScrapTaskParams 17 | from upload_result_file_service import upload_result_file 18 | from utils import command_utils 19 | 20 | logger = docker_logs.get_logger('command_server') 21 | tor_utils.prepare_tor() 22 | 23 | 24 | def set_task_finished(task_type: ScrapType, task_id: str): 25 | post_data = { 26 | 'task_type': task_type.name, 27 | 'task_id': task_id 28 | } 29 | url = command_server_config.get_command_server_host() + '/set_task_as_finished' 30 | response = requests.post(url, data=post_data) 31 | if response.status_code >= 400: 32 | print("ERR set_task_finished code:", response.status_code) 33 | raise Exception('error in set_task_as_finished') 34 | return 35 | 36 | 37 | def set_sub_task_finished(task_type: ScrapType, sub_task_id: str): 38 | post_data = { 39 | 'task_type': task_type.name, 40 | 'sub_task_id': sub_task_id 41 | } 42 | url = command_server_config.get_command_server_host() + '/set_sub_task_as_finished' 43 | response = requests.post(url, data=post_data) 44 | if response.status_code >= 400: 45 | print("ERR set_task_finished code:", response.status_code) 46 | raise Exception('error in set_sub_task_as_finished') 47 | return 48 | 49 | 50 | def d2s(value: datetime.datetime) -> str: 51 | return str(value).replace(':', '').replace('-', '').replace(' ', '-') 52 | 53 | 54 | def get_search_by_filename(params: PhraseScrapTaskParams) -> str: 55 | language_part = ('_lang=' + params.language) if params.language is not None else '' 56 | 57 | return 's_' + params.phrase + '_' + d2s(params.since) + '_' + d2s( 58 | params.until) + language_part + '.db' 59 | 60 | 61 | def get_user_tweets_filename(params: UserTweetsScrapTaskParams) -> str: 62 | return 'ut_' + params.username + '_' + d2s(params.since) + '_' + d2s( 63 | params.until) + '.db' 64 | 65 | 66 | def get_user_details_filename(params: UserDetailsScrapTaskParams) -> str: 67 | return 'ud_' + params.username + '.db' 68 | 69 | 70 | def get_user_favorites_filename(params: UserDetailsScrapTaskParams) -> str: 71 | return 'ufa_' + params.username + '.db' 72 | 73 | 74 | def get_user_followers_filename(params: UserDetailsScrapTaskParams) -> str: 75 | return 'ufe_' + params.username + '.db' 76 | 77 | 78 | def get_user_following_filename(params: UserDetailsScrapTaskParams) -> str: 79 | return 'ufi_' + params.username + '.db' 80 | 81 | 82 | def scrap_by_search_to_file(parsed_body): 83 | params = PhraseScrapTaskParams.from_dict(parsed_body) 84 | filename = get_search_by_filename(params) 85 | scrap_service.search_tweets(params, filename, proxy_config.default_proxy_config) 86 | set_sub_task_finished(ScrapType.SEARCH_BY_PHRASE, params.task_id) 87 | return { 88 | 'filename': filename, 89 | 'series': parsed_body['scrap_series'], 90 | 'sub_series': 's_' + params.phrase, 91 | } 92 | 93 | 94 | def scrap_user_tweets_to_file(parsed_body): 95 | params: UserTweetsScrapTaskParams = UserTweetsScrapTaskParams.from_dict(parsed_body) 96 | filename = get_user_tweets_filename(params) 97 | scrap_service.get_user_tweets(params, filename, proxy_config.default_proxy_config) 98 | set_sub_task_finished(ScrapType.USER_TWEETS, params.task_id) 99 | return { 100 | 'filename': filename, 101 | 'series': parsed_body['scrap_series'], 102 | 'sub_series': 'u_' + params.username, 103 | } 104 | 105 | 106 | def scrap_user_details_to_file(parsed_body): 107 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body) 108 | filename = get_user_details_filename(params) 109 | scrap_service.get_user_details(params, filename, proxy_config.default_proxy_config) 110 | set_task_finished(ScrapType.USER_DETAILS, params.task_id) 111 | return { 112 | 'filename': filename, 113 | 'series': parsed_body['scrap_series'], 114 | 'sub_series': 'u_' + params.username, 115 | } 116 | 117 | 118 | def scrap_user_favorites_to_file(parsed_body): 119 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body) 120 | filename = get_user_favorites_filename(params) 121 | scrap_service.get_user_favorites(params, filename, proxy_config.default_proxy_config) 122 | set_task_finished(ScrapType.USER_FAVORITES, params.task_id) 123 | return { 124 | 'filename': filename, 125 | 'series': parsed_body['scrap_series'], 126 | 'sub_series': 'u_' + params.username, 127 | } 128 | 129 | 130 | def scrap_user_following_to_file(parsed_body): 131 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body) 132 | filename = get_user_following_filename(params) 133 | scrap_service.get_user_following(params, filename, proxy_config.default_proxy_config) 134 | set_task_finished(ScrapType.USER_FOLLOWINGS, params.task_id) 135 | return { 136 | 'filename': filename, 137 | 'series': parsed_body['scrap_series'], 138 | 'sub_series': 'u_' + params.username, 139 | } 140 | 141 | 142 | def scrap_user_followers_to_file(parsed_body): 143 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body) 144 | filename = get_user_followers_filename(params) 145 | scrap_service.get_user_followers(params, filename, proxy_config.default_proxy_config) 146 | set_task_finished(ScrapType.USER_FOLLOWERS, params.task_id) 147 | return { 148 | 'filename': filename, 149 | 'series': parsed_body['scrap_series'], 150 | 'sub_series': 'u_' + params.username, 151 | } 152 | 153 | 154 | def get_scrap_method(scrap_type: ScrapType): 155 | return { 156 | ScrapType.SEARCH_BY_PHRASE: scrap_by_search_to_file, 157 | ScrapType.USER_DETAILS: scrap_user_details_to_file, 158 | ScrapType.USER_TWEETS: scrap_user_tweets_to_file, 159 | ScrapType.USER_FOLLOWINGS: scrap_user_following_to_file, 160 | ScrapType.USER_FOLLOWERS: scrap_user_followers_to_file, 161 | ScrapType.USER_FAVORITES: scrap_user_favorites_to_file 162 | }[scrap_type] 163 | 164 | 165 | def ack_message(ch, delivery_tag): 166 | if ch.is_open: 167 | ch.basic_ack(delivery_tag) 168 | else: 169 | logger.error("send ack when channel is close") 170 | 171 | 172 | def process_message(body): 173 | logger.info(" [x] Received %r" % body) 174 | body_string = body.decode("utf-8") 175 | parsed_body = json.loads(body_string) 176 | message_type: ScrapType = [it for it in ScrapType if parsed_body['type'] in str(it)][0] 177 | logger.info('message_type: ' + str(message_type)) 178 | 179 | try_count = 100 180 | is_success = False 181 | while not is_success and try_count > 0: 182 | try: 183 | logger.info('start new job for scrap user') 184 | logger.info('job_details: ' + body_string) 185 | scrap_result = get_scrap_method(message_type)(parsed_body) 186 | upload_result_file( 187 | series=scrap_result['series'], 188 | sub_series=scrap_result['sub_series'], 189 | filename=scrap_result['filename'], 190 | filepath=scrap_result['filename'], 191 | scrap_type=message_type 192 | ) 193 | command_utils.run_bash_command('rm ' + scrap_result['filename']) 194 | is_success = True 195 | logger.info('finished successful: ' + str(parsed_body)) 196 | except Exception as exception: 197 | try_count = try_count - 1 198 | logger.error("Error during work") 199 | logger.exception(exception) 200 | logger.info('sleep for 60 secs in case of error') 201 | time.sleep(60) 202 | return is_success 203 | 204 | 205 | def do_work(conn, ch, delivery_tag, body): 206 | thread_id = threading.get_ident() 207 | logger.info('Thread id: %s Delivery tag: %s Message body: %s', thread_id, delivery_tag, body) 208 | if process_message(body): 209 | cb = functools.partial(ack_message, ch, delivery_tag) 210 | conn.add_callback_threadsafe(cb) 211 | return 212 | 213 | 214 | def prepare_rabbit_connect() -> pika.BlockingConnection: 215 | try_count = 100 216 | while try_count > 0: 217 | try: 218 | return pika.BlockingConnection(rabbit_config.get_rabbit_connection_config()) 219 | except Exception: 220 | try_count = try_count - 1 221 | logger.info("error during connect to rabbitMQ") 222 | logger.info("wait 3 seconds for next try") 223 | time.sleep(3) 224 | raise Exception("can't connect with rabbitMQ") 225 | 226 | 227 | def on_message(ch, method_frame, _header_frame, body, args): 228 | # (conn, thrds) = args 229 | (conn) = args 230 | delivery_tag = method_frame.delivery_tag 231 | t = threading.Thread(target=do_work, args=(conn, ch, delivery_tag, body)) 232 | t.start() 233 | # thrds.append(t) 234 | 235 | 236 | connection = prepare_rabbit_connect() 237 | channel = connection.channel() 238 | 239 | channel.queue_declare(queue=worker_config.get_queue_name(), durable=True) 240 | channel.basic_qos(prefetch_count=1) 241 | 242 | # threads = [] 243 | # on_message_callback = functools.partial(on_message, args=(connection, threads)) 244 | on_message_callback = functools.partial(on_message, args=(connection)) 245 | channel.basic_consume(queue=worker_config.get_queue_name(), on_message_callback=on_message_callback) 246 | 247 | channel.start_consuming() 248 | -------------------------------------------------------------------------------- /src/upload_result_file_service.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import configuration.upload_file_config as upload_file_config 4 | import utils.docker_logs as docker_logs 5 | from model.scrap_type import ScrapType 6 | 7 | logger = docker_logs.get_logger('upload_result_file_service') 8 | 9 | 10 | def upload_result_file( 11 | series: str, 12 | sub_series: str, 13 | filename: str, 14 | filepath: str, 15 | scrap_type: ScrapType 16 | ): 17 | post_data = { 18 | 'series': series, 19 | 'sub_series': sub_series, 20 | 'filename': filename, 21 | 'data_type': scrap_type.name.lower() 22 | } 23 | url = upload_file_config.get_upload_file_host() + '/upload_result_file' 24 | post_files = {'file': open(filepath, 'rb')} 25 | response = requests.post(url, data=post_data, files=post_files) 26 | logger.info('upload request response with code: ' + str(response.status_code)) 27 | if response.status_code >= 400: 28 | raise Exception() 29 | return 30 | -------------------------------------------------------------------------------- /src/utils/command_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import utils.docker_logs as docker_logs 4 | 5 | command_logger = docker_logs.get_logger('command_runner') 6 | 7 | 8 | def run_bash_command(command: str): 9 | command_logger.info('execute shell command: ' + command) 10 | os.system(command, ) 11 | command_logger.info('finish executing: ' + command) 12 | return 13 | -------------------------------------------------------------------------------- /src/utils/commands_mysql_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pandas as pd 4 | 5 | from configuration.mysql_config import get_db_connection, get_db_connection_base 6 | 7 | 8 | def execute_sql_modify(query: str, params: List): 9 | connection = get_db_connection() 10 | cursor = connection.cursor() 11 | cursor.execute(query, params) 12 | connection.commit() 13 | return 14 | 15 | 16 | def execute_sql_query(query: str, params: List = None): 17 | if params is None: 18 | params = list() 19 | connection = get_db_connection() 20 | df = pd.read_sql_query(query, connection, params=params) 21 | connection.close() 22 | return df 23 | 24 | 25 | def is_db_initialized() -> bool: 26 | return 'twint_distributed_tasks' in list( 27 | pd.read_sql("SHOW DATABASES", get_db_connection_base())['Database'].to_numpy() 28 | ) 29 | 30 | 31 | def initialize_database(): 32 | print('initialize_database start') 33 | connection = get_db_connection_base() 34 | cursor = connection.cursor() 35 | command = " ".join(open('utils/init_database.sql').readlines()) 36 | print('command', command) 37 | cursor.execute(command) 38 | connection.commit() 39 | print('initialize_database finish') 40 | return 41 | 42 | 43 | def prepare_database(): 44 | if not is_db_initialized(): 45 | print('database is not initialized') 46 | initialize_database() 47 | else: 48 | print('database is initialized') 49 | return 50 | -------------------------------------------------------------------------------- /src/utils/directory_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os import walk 3 | from typing import List 4 | 5 | import utils.command_utils as command_utils 6 | 7 | 8 | def get_db_files_path_list_from_directory(directory_path: str) -> List[str]: 9 | db_files = [] 10 | for (dirpath, dirnames, filenames) in walk(directory_path): 11 | db_files.extend([dirpath + '/' + it for it in filenames if '.db' in it]) 12 | return db_files 13 | 14 | 15 | def prepare_directory(directory: str): 16 | if not os.path.exists(directory): 17 | command_utils.run_bash_command('mkdir -p ' + directory) 18 | return 19 | -------------------------------------------------------------------------------- /src/utils/docker_logs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | loggers = dict() 4 | 5 | 6 | def get_logger(mod_name): 7 | if mod_name in loggers: 8 | return loggers[mod_name] 9 | else: 10 | logger = logging.getLogger(mod_name) 11 | handler = logging.StreamHandler() 12 | formatter = logging.Formatter('%(asctime)s [%(name)-30s] %(levelname)-8s %(message)s') 13 | handler.setFormatter(formatter) 14 | for it in logger.handlers: 15 | logger.removeHandler(it) 16 | logger.addHandler(handler) 17 | logger.setLevel(logging.INFO) 18 | logger.propagate = False 19 | loggers[mod_name] = mod_name 20 | return logger 21 | -------------------------------------------------------------------------------- /src/utils/init_database.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE twint_distributed_tasks; 2 | USE twint_distributed_tasks; 3 | 4 | 5 | CREATE TABLE ScrapSession 6 | ( 7 | scrap_session_id VARCHAR(50) NOT NULL PRIMARY KEY, 8 | scrap_session_name VARCHAR(512) NOT NULL 9 | ); 10 | 11 | 12 | CREATE TABLE UserTweetScrapTasks 13 | ( 14 | task_id VARCHAR(50) NOT NULL PRIMARY KEY, 15 | username VARCHAR(200) NOT NULL, 16 | since DATETIME NOT NULL, 17 | until DATETIME NOT NULL, 18 | created DATETIME NOT NULL, 19 | finished DATETIME, 20 | scrap_session_id VARCHAR(50) NOT NULL, 21 | queue_name VARCHAR(512) NOT NULL, 22 | CONSTRAINT UserTweetScrapTasks_fk_scrap_session_id 23 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id) 24 | ); 25 | 26 | CREATE TABLE UserTweetScrapSubTasks 27 | ( 28 | sub_task_id VARCHAR(50) NOT NULL PRIMARY KEY, 29 | task_id VARCHAR(50) NOT NULL, 30 | since DATETIME NOT NULL, 31 | until DATETIME NOT NULL, 32 | created DATETIME NOT NULL, 33 | finished DATETIME, 34 | CONSTRAINT UserTweetScrapSubTasks_fk_task_id 35 | FOREIGN KEY (task_id) REFERENCES UserTweetScrapTasks (task_id) 36 | ); 37 | 38 | CREATE TABLE UserDetailsScrapTasks 39 | ( 40 | task_id VARCHAR(50) NOT NULL PRIMARY KEY, 41 | username VARCHAR(200) NOT NULL, 42 | created DATETIME NOT NULL, 43 | finished DATETIME, 44 | scrap_session_id VARCHAR(50) NOT NULL, 45 | queue_name VARCHAR(512) NOT NULL, 46 | CONSTRAINT UserDetailsScrapTasks_fk_scrap_session_id 47 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id) 48 | ); 49 | 50 | CREATE TABLE UserFollowersScrapTasks 51 | ( 52 | task_id VARCHAR(50) NOT NULL PRIMARY KEY, 53 | username VARCHAR(200) NOT NULL, 54 | created DATETIME NOT NULL, 55 | finished DATETIME, 56 | scrap_session_id VARCHAR(50) NOT NULL, 57 | queue_name VARCHAR(512) NOT NULL, 58 | CONSTRAINT UserFollowersScrapTasks_fk_scrap_session_id 59 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id) 60 | ); 61 | 62 | CREATE TABLE UserFollowingScrapTasks 63 | ( 64 | task_id VARCHAR(50) NOT NULL PRIMARY KEY, 65 | username VARCHAR(200) NOT NULL, 66 | created DATETIME NOT NULL, 67 | finished DATETIME, 68 | scrap_session_id VARCHAR(50) NOT NULL, 69 | queue_name VARCHAR(512) NOT NULL, 70 | CONSTRAINT UserFollowingScrapTasks_fk_scrap_session_id 71 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id) 72 | ); 73 | 74 | CREATE TABLE UserFavoritesScrapTasks 75 | ( 76 | task_id VARCHAR(50) NOT NULL PRIMARY KEY, 77 | username VARCHAR(200) NOT NULL, 78 | created DATETIME NOT NULL, 79 | finished DATETIME, 80 | scrap_session_id VARCHAR(50) NOT NULL, 81 | queue_name VARCHAR(512) NOT NULL, 82 | CONSTRAINT UserFavoritesScrapTasks_fk_scrap_session_id 83 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id) 84 | ); 85 | 86 | CREATE TABLE SearchTweetScrapTasks 87 | ( 88 | task_id VARCHAR(50) NOT NULL PRIMARY KEY, 89 | phrase VARCHAR(200) NOT NULL, 90 | since DATETIME NOT NULL, 91 | until DATETIME NOT NULL, 92 | language VARCHAR(10), 93 | created DATETIME NOT NULL, 94 | finished DATETIME, 95 | scrap_session_id VARCHAR(50) NOT NULL, 96 | queue_name VARCHAR(512) NOT NULL, 97 | CONSTRAINT SearchTweetScrapTasks_fk_scrap_session_id 98 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id) 99 | ); 100 | 101 | CREATE TABLE SearchTweetScrapSubTasks 102 | ( 103 | sub_task_id VARCHAR(50) NOT NULL PRIMARY KEY, 104 | task_id VARCHAR(50) NOT NULL, 105 | since DATETIME NOT NULL, 106 | until DATETIME NOT NULL, 107 | created DATETIME NOT NULL, 108 | finished DATETIME, 109 | CONSTRAINT SearchTweetScrapSubTasks_fk_task_id 110 | FOREIGN KEY (task_id) REFERENCES SearchTweetScrapTasks (task_id) 111 | ); 112 | -------------------------------------------------------------------------------- /src/utils/interval_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from enum import Enum 3 | from typing import List, Union 4 | 5 | from dateutil.parser import parse 6 | from dateutil.relativedelta import relativedelta 7 | 8 | import utils.time_utils as time_utils 9 | from model.time_interval import TimeInterval 10 | 11 | TWITTER_START_TIME = parse('2006-03-21 00:00:00') 12 | 13 | 14 | class TimeIntervalType(Enum): 15 | HOUR = 1 16 | DAY = 2 17 | MONTH = 3 18 | QUARTER_OF_YEAR = 4 19 | YEAR = 5 20 | 21 | @staticmethod 22 | def get_from_string(value: str): 23 | return { 24 | 'hour': TimeIntervalType.HOUR, 25 | 'day': TimeIntervalType.DAY, 26 | 'month': TimeIntervalType.MONTH, 27 | 'quarter_of_year': TimeIntervalType.QUARTER_OF_YEAR, 28 | 'year': TimeIntervalType.YEAR 29 | }[value] 30 | 31 | def get_relativedelta(self): 32 | return { 33 | TimeIntervalType.HOUR: relativedelta(hours=1), 34 | TimeIntervalType.DAY: relativedelta(days=1), 35 | TimeIntervalType.MONTH: relativedelta(months=1), 36 | TimeIntervalType.QUARTER_OF_YEAR: relativedelta(months=3), 37 | TimeIntervalType.YEAR: relativedelta(years=1) 38 | }[self] 39 | 40 | 41 | def get_list_interval( 42 | start: Union[datetime.datetime, None], 43 | end: Union[datetime.datetime, None], 44 | interval_type: TimeIntervalType 45 | ) -> List[TimeInterval]: 46 | fixed_start = start if start is not None else TWITTER_START_TIME 47 | fixed_end = end if end is not None else time_utils.remove_microseconds_from_datetime(datetime.datetime.now()) 48 | current_time = fixed_start 49 | intervals_to_return = [] 50 | while current_time < fixed_end: 51 | interval_start_time = current_time 52 | current_time = current_time + interval_type.get_relativedelta() 53 | interval_end_time = current_time - relativedelta(seconds=1) 54 | intervals_to_return.append(TimeInterval( 55 | interval_start_time, 56 | interval_end_time if interval_end_time < fixed_end else fixed_end 57 | )) 58 | return intervals_to_return 59 | -------------------------------------------------------------------------------- /src/utils/params_encoder.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ParamsEncoder(json.JSONEncoder): 5 | def default(self, o) -> str: 6 | dictionary = dict(o.__dict__) 7 | for key in dictionary.keys(): 8 | dictionary[key] = str(dictionary[key]) 9 | return json.dumps(dictionary) 10 | -------------------------------------------------------------------------------- /src/utils/rabbit_send_utils.py: -------------------------------------------------------------------------------- 1 | import pika 2 | 3 | import utils.docker_logs as docker_logs 4 | from configuration.rabbit_config import get_rabbit_connection_config 5 | 6 | logger = docker_logs.get_logger('rabbit_send') 7 | 8 | 9 | def send_to_rabbit(queue: str, body: str): 10 | logger.info('send_to_rabbit ' + queue + ' ' + body) 11 | connection = pika.BlockingConnection(get_rabbit_connection_config()) 12 | channel = connection.channel() 13 | channel.queue_declare(queue=queue, durable=True) 14 | channel.basic_publish(exchange='', routing_key=queue, body=body, properties=pika.BasicProperties(delivery_mode=2)) 15 | connection.close() 16 | return 17 | -------------------------------------------------------------------------------- /src/utils/sqlite_util.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | import pandas as pd 4 | 5 | from utils.docker_logs import get_logger 6 | 7 | logger = get_logger('sqlite_util') 8 | 9 | 10 | def get_df_from_sqlite_db(db_filename: str, query: str): 11 | con = sqlite3.connect(db_filename) 12 | df = pd.read_sql_query(query, con) 13 | con.close() 14 | return df 15 | -------------------------------------------------------------------------------- /src/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | 4 | def date_to_string(date: datetime.date) -> str: 5 | return date.today().isoformat() 6 | 7 | 8 | def remove_microseconds_from_datetime(value: datetime.datetime): 9 | return value.replace(microsecond=0) 10 | -------------------------------------------------------------------------------- /src/utils/tor_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | import utils.command_utils as command_utils 6 | import utils.docker_logs as docker_logs 7 | 8 | logger = docker_logs.get_logger('tor_utils') 9 | 10 | 11 | def _start_tor(): 12 | logger.info('start tor proxy') 13 | command_utils.run_bash_command('tor &') 14 | return 15 | 16 | 17 | def _wait_until_tor_works(): 18 | logger.info('wait until tor works') 19 | code = '' 20 | while code != '200': 21 | try: 22 | logger.info('tor check request') 23 | proxies = { 24 | 'http': 'socks5://127.0.0.1:9050', 25 | 'https': 'socks5://127.0.0.1:9050' 26 | } 27 | r = requests.get('http://jsonip.com/', proxies=proxies) 28 | code = str(r.status_code) 29 | logger.info('response_code: ' + code) 30 | except Exception as err: 31 | logger.error(err) 32 | logger.info('not works yet, waiting..') 33 | time.sleep(2) 34 | logger.info('tor works') 35 | return 36 | 37 | 38 | def prepare_tor(): 39 | _start_tor() 40 | _wait_until_tor_works() 41 | return 42 | --------------------------------------------------------------------------------