├── .dockerignore
├── .gitignore
├── Dockerfile
├── README.md
├── assets
│   ├── architecture.drawio
│   └── architecture.png
├── ci
│   └── Jenkinsfile
├── client
│   ├── data_client.py
│   └── task_client.py
├── docker-compose.yml
├── rabbit.conf
├── requirements.txt
└── src
    ├── command_server.py
    ├── command_service.py
    ├── configuration
    │   ├── command_server_config.py
    │   ├── mysql_config.py
    │   ├── proxy_config.py
    │   ├── rabbit_config.py
    │   ├── upload_file_config.py
    │   ├── webhook_config.py
    │   └── worker_config.py
    ├── dao
    │   ├── search_by_task_dao.py
    │   ├── session_dao.py
    │   ├── user_details_task_dao.py
    │   ├── user_favorites_task_dao.py
    │   ├── user_followers_task_dao.py
    │   ├── user_followings_task_dao.py
    │   └── user_tweets_task_dao.py
    ├── data_server.py
    ├── model
    │   ├── hashtag_scrap_params.py
    │   ├── scrap_type.py
    │   ├── time_interval.py
    │   └── user_scrap_params.py
    ├── scrap_service.py
    ├── scrap_worker.py
    ├── upload_result_file_service.py
    └── utils
        ├── command_utils.py
        ├── commands_mysql_utils.py
        ├── directory_utils.py
        ├── docker_logs.py
        ├── init_database.sql
        ├── interval_utils.py
        ├── params_encoder.py
        ├── rabbit_send_utils.py
        ├── sqlite_util.py
        ├── time_utils.py
        └── tor_utils.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .DS_Store
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | # .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102 | __pypackages__/
103 |
104 | # Celery stuff
105 | celerybeat-schedule
106 | celerybeat.pid
107 |
108 | # SageMath parsed files
109 | *.sage.py
110 |
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 |
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 |
124 | # Rope project settings
125 | .ropeproject
126 |
127 | # mkdocs documentation
128 | /site
129 |
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 |
135 | # Pyre type checker
136 | .pyre/
137 |
138 | # pytype static type analyzer
139 | .pytype/
140 |
141 | # Cython debug symbols
142 | cython_debug/
143 |
144 | # static files generated from Django application using `collectstatic`
145 | media
146 | static
147 |
148 | data
149 | db_data
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .DS_Store
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | # .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102 | __pypackages__/
103 |
104 | # Celery stuff
105 | celerybeat-schedule
106 | celerybeat.pid
107 |
108 | # SageMath parsed files
109 | *.sage.py
110 |
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 |
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 |
124 | # Rope project settings
125 | .ropeproject
126 |
127 | # mkdocs documentation
128 | /site
129 |
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 |
135 | # Pyre type checker
136 | .pyre/
137 |
138 | # pytype static type analyzer
139 | .pytype/
140 |
141 | # Cython debug symbols
142 | cython_debug/
143 |
144 | # static files generated from Django application using `collectstatic`
145 | media
146 | static
147 |
148 | data
149 | db_data
150 |
151 | src/test_run.py
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 PYTHONUNBUFFERED=1
3 |
4 | RUN apt-get update && apt-get install tor -y
5 |
6 | COPY requirements.txt ./
7 | RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt
8 | RUN pip install --user --upgrade git+https://github.com/himanshudabas/twint.git@origin/twint-fixes#egg=twint
9 |
10 | COPY ./src /app
11 |
12 | WORKDIR /app
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Twint-Distributed
2 | ## No longer supported
3 | I have had many problems with Twint.
4 | I decided to stop developing this library.
5 | If you liked my solution, you may be interested in my other library - https://github.com/markowanga/stweet.
6 |
7 | ## Description
8 | Sometimes there is a need to scrape an enormous amount of tweet data in a short time.
9 | This project helps with that task. The solution is based on Twint, a popular tool
10 | for scraping Twitter data.
11 |
12 | ![architecture](assets/architecture.png)
13 |
14 | ## Main concepts
15 | - A microservice architecture that is scalable and can be
16 | distributed across many machines
17 | - A single scrape task is divided into small subtasks
18 | - When a worker fails, the elementary task can be repeated
19 | on another instance
20 | - Works around the Twitter limit that prevents downloading large amounts of data from one IP address
21 | - All data is gathered in one location
22 | - Docker is used wherever possible
23 |
24 | ## How it works
25 | 1. The user adds scrape commands via an HTTP request (see the example below)
26 | 2. As a result of the request, the server publishes scrape commands to RabbitMQ;
27 | the time range can be divided into small intervals
28 | 3. Workers consume the messages from RabbitMQ and perform the scraping
29 | 4. When an elementary task is finished, the data is uploaded to the data server
30 | 5. The server saves all received data in central storage
31 |
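32 | ## Example: adding a scrape task
33 | A minimal sketch of step 1, using the helper from `client/task_client.py` (run from the `client`
34 | directory; it mirrors the commented-out examples at the bottom of that file). It assumes the
35 | command server is reachable on port 5000 as published in `docker-compose.yml`, that a worker
36 | listens on the `bot_detection` queue, and `example_series` is just an illustrative series name.
37 |
38 | ```python
39 | from task_client import TwintDistributedTaskClient, ScrapInterval
40 |
41 | client = TwintDistributedTaskClient('http://localhost:5000')
42 | # split the user's timeline into month-long subtasks and route them to the bot_detection queue
43 | client.add_user_tweets_to_scrap(
44 |     'AndrzejDuda', ScrapInterval.MONTH, 'bot_detection', 'example_series',
45 |     since=None, until=None)
46 | ```
47 |
48 | Progress of added and finished tasks can then be checked with `GET /get_all_tasks` on the command server.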
--------------------------------------------------------------------------------
/assets/architecture.drawio:
--------------------------------------------------------------------------------
1 | 5Vpdd6M2EP01fkwPCIPx48ZJtw/ZtrvZnk0eBRKgWCAq5Njur6+ExDexna4dtlnnnASNRh/M1b0zEM+cVbr7yGGefGII0xmw0G7m3MwAsG3fl3+UZa8tCwtoQ8wJMk6N4Z78g43RMtYNQbjoOArGqCB51xiyLMOh6Ngg52zbdYsY7a6awxgPDPchpEPrN4JEUt2Xt2w6fsMkTszSPljojhRWzuZOigQitm2ZnNuZs+KMCX2V7laYquBVcdHjfn2ht94Yx5k4ZUCUxSm6o3v/s/cUp+FfD9dfoqt6c2Jf3TFGMgCmybhIWMwySG8b6zVnmwxhNa0lW43PHWO5NNrS+ISF2Bs04UYwaUpESk2v3DHfP7Qbj2qyX9yqebMzk+vW3rT0XtUGX4yBMRVsw0N84MarswR5jMUhv2UNlTzjmKVYbkgO5JhCQZ67G4HmsMW1X4OHvDCQvAYeMAk8OyIeKgDktQZn4ZpmA45qTIgN+E5oyqEfOIf7lkPOSCaK1sx/KoN0MJLmVMpkBA24bpd2x/yX1kF/4B30lxd6x82hqm/9v5+z+TTnrJIB64eXgbk9pQyYeZ8h3ZiVVixNYYbULWL+TOQ99vFr0FGh3iZE4PsclsHYylTdRSIilK4YZbwc60Su+pH2QnC2xq0er/yoESwTLbv+1Ng8Yy7w7jA6w2CaAXOrx4CKQdtWAq4SV9LKvRVzzh5/d5osaWTY7sjwm6kwOJEa7iQq3FNJZ35YVY/4X0ZVwYC2X2AQEPHp83nZ6oc4DMfYGvju3LXOw0rQY6UzPSvtQRjfsjiqmfjYJuIoK5tE1+S2x3YSvHyic0+td6fMc+6AMPehPP055uqcMb5WF2dkDnKxj+ZjzPFB4JR57gzMcZY95lhTM8deDgL9FRbrGfCoXPo6kFH2YnWFoIABLIbVhXyYzdVluKdEhp8fD32gcboLagMM13GJ3h8bIWfBL6hbhL1xdUOLZWCdS92cXs0xgpE3ApF/KYicn7UiB/6JSgWmfTL3BxS6kWS5UEGOIPajURJ4oY+D6Dwk8H64FA+GQrVSJw/SMraMq7d3U2pTBF7QJi/w3DPlj8UJ+eNNtWk+n1abukWUdUSbJn6Mcpb/h8rLGRKtqgaKHGYdsL2/N+qddfkK4CqCKaEybB9mShg8mOZl+BxHlVUJps9YkBAOepR3+bucpCgxV1PYIN/1+vS6qjNjPC2Z3+remgM/0k/lccL8Su4/JFk86iJJKa4gJXGmu0MJb1lstrqVhmRmBau1u7JTSlFWRHLSanopFZWDLFhRd/XBcIRDqWGCsGxkfEQZFCN2RIqcQhNzkpXyVINSVW6/Y6zeDkkSSehUES3/ft0SoW8PlalK4yvPi4ZYDxxSm1KSF4rClbAitgkovq3tr8tl2JZl92JMNJfewoFnEk3QE03bOTGXLS6mmsO3eN9wkDC2VqtFJTCGJVlQaLL06/EnFqiDQTJSJO8EqH7RASx3aqDe52No/7Xq9NWd+7M+48y9E2sC/ZTx1i9X3Ve+XD3if6F/WXnvk6W9fxc64HIslc3m6wcal+ZLHM7tvw==
--------------------------------------------------------------------------------
/assets/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markowanga/twint-distributed/edec4fb4f054102b43642b5a1e863f5144e39d6f/assets/architecture.png
--------------------------------------------------------------------------------
/ci/Jenkinsfile:
--------------------------------------------------------------------------------
1 | pipeline {
2 | agent none
3 | environment {
4 | registry = "marcinwatroba/twint-distributed"
5 | registryCredential = 'marcinwatroba_dockerhub'
6 | }
7 | stages {
8 | stage('checkout sources') {
9 | agent any
10 | steps {
11 | git branch: 'master', credentialsId: 'markowanga_github',
12 | url: 'https://github.com/markowanga/twint-distributed.git'
13 | }
14 | }
15 | stage('build & push docker') {
16 | agent {
17 | docker {
18 | image 'docker'
19 | args '-u root -v /var/run/docker.sock:/var/run/docker.sock'
20 | }
21 | }
22 | steps {
23 | script {
24 | dockerImageWithNumber = docker.build(registry + ":$BUILD_NUMBER", './')
25 | dockerImageLatest = docker.build(registry + ":latest", './')
26 | docker.withRegistry( '', registryCredential ) {
27 | dockerImageWithNumber.push()
28 | dockerImageLatest.push()
29 | }
30 | }
31 | }
32 | }
33 | }
34 | post {
35 | always {
36 | node('master') {
37 | sh 'rm -rf *'
38 | }
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/client/data_client.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | import requests
4 |
5 |
6 | class TwintDistributedDataClient:
7 |
8 | def __init__(self, data_server_host):
9 | self.data_server_host = data_server_host
10 | return
11 |
12 | def get_searched_tweets(self, to_search: str) -> pd.DataFrame:
13 | return self.__call_get_request('/get_searched_tweets/' + to_search)
14 |
15 | def get_user_tweets(self, username: str) -> pd.DataFrame:
16 | return self.__call_get_request('/get_user_tweets/' + username)
17 |
18 | def get_user_details(self, username: str) -> pd.DataFrame:
19 | return self.__call_get_request('/get_user_details/' + username)
20 |
21 | def get_user_followers(self, username: str) -> pd.DataFrame:
22 | return self.__call_get_request('/get_user_followers/' + username)
23 |
24 | def get_user_followings(self, username: str) -> pd.DataFrame:
25 | return self.__call_get_request('/get_user_followings/' + username)
26 |
27 | def get_user_favorites(self, username: str) -> pd.DataFrame:
28 | return self.__call_get_request('/get_user_favorites/' + username)
29 |
30 | def __call_get_request(self, path: str) -> pd.DataFrame:
31 | response = requests.get(self.data_server_host + requests.utils.quote(path))
32 | return pd.read_json(response.content)
33 |
34 |
35 | def main():
36 | client = TwintDistributedDataClient('http://twitterdata.theliver.pl')
37 | users = ['AndrzejDuda', 'M_K_Blonska', 'pawel_tanajno', 'jakubiak_marek', 'mir_piotrowski', 'krzysztofbosak',
38 | 'szymon_holownia', 'KosiniakKamysz', 'Grzywa_Slawomir', 'RobertBiedron', 'trzaskowski_']
39 | for user in users:
40 | responses = [
41 | client.get_user_tweets(user).sort_values(by='created_at')[['created_at']],
42 | # client.get_user_details(user),
43 | # client.get_user_followers(user),
44 | # client.get_user_followings(user),
45 | # client.get_user_favorites(user)
46 | ]
47 | print(user)
48 | print([it for it in responses])
49 | return
50 |
51 |
52 | main()
53 |
--------------------------------------------------------------------------------
/client/task_client.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from enum import Enum
3 | from typing import Optional, Dict
4 |
5 | import requests
6 |
7 |
8 | class ScrapInterval(Enum):
9 | HOUR = 1
10 | DAY = 2
11 | MONTH = 3
12 | QUARTER_OF_YEAR = 4
13 | YEAR = 5
14 |
15 | def get_parameter_name(self) -> str:
16 | return {
17 | ScrapInterval.HOUR: 'hour',
18 | ScrapInterval.DAY: 'day',
19 | ScrapInterval.MONTH: 'month',
20 | ScrapInterval.QUARTER_OF_YEAR: 'quarter_of_year',
21 | ScrapInterval.YEAR: 'year'
22 | }[self]
23 |
24 |
25 | class TwintDistributedTaskClient:
26 | """
27 | Client for adding scrape tasks.
28 |
29 | interval_type -- the time interval of a single subtask; a big scrape task can be divided into smaller ones,
30 | which is better for scaling
31 |
32 | queue_name -- the name of the RabbitMQ queue; tasks can be added to different queues
33 |
34 | scrap_series -- a group of tasks; when all tasks in the group are finished, a webhook with the finish
35 | information is sent to the configured host
36 | """
37 |
38 | def __init__(self, command_server_host):
39 | self.command_server_host = command_server_host
40 | return
41 |
42 | def add_user_tweets_to_scrap(self, username: str, interval_type: ScrapInterval, queue_name: str, scrap_series: str,
43 | since: Optional[datetime], until: Optional[datetime]):
44 | post_data = {
45 | 'username': username,
46 | 'interval_type': interval_type.get_parameter_name(),
47 | 'queue_name': queue_name,
48 | 'scrap_series': scrap_series,
49 | 'since': since.isoformat() if since is not None else None,
50 | 'until': until.isoformat() if until is not None else None
51 | }
52 | self.__call_post_request('/add_user_tweets_to_scrap', post_data)
53 | return
54 |
55 | def add_user_details_to_scrap(self, username: str, queue_name: str, scrap_series: str):
56 | post_data = {
57 | 'username': username,
58 | 'queue_name': queue_name,
59 | 'scrap_series': scrap_series
60 | }
61 | self.__call_post_request('/add_user_details_to_scrap', post_data)
62 | return
63 |
64 | def add_user_followings_to_scrap(self, username: str, queue_name: str, scrap_series: str):
65 | post_data = {
66 | 'username': username,
67 | 'queue_name': queue_name,
68 | 'scrap_series': scrap_series
69 | }
70 | self.__call_post_request('/add_user_followings_to_scrap', post_data)
71 | return
72 |
73 | def add_user_followers_to_scrap(self, username: str, queue_name: str, scrap_series: str):
74 | post_data = {
75 | 'username': username,
76 | 'queue_name': queue_name,
77 | 'scrap_series': scrap_series
78 | }
79 | self.__call_post_request('/add_user_followers_to_scrap', post_data)
80 | return
81 |
82 | def add_user_favorites_to_scrap(self, username: str, queue_name: str, scrap_series: str):
83 | post_data = {
84 | 'username': username,
85 | 'queue_name': queue_name,
86 | 'scrap_series': scrap_series
87 | }
88 | self.__call_post_request('/add_user_favorites_to_scrap', post_data)
89 | return
90 |
91 | def add_search_to_scrap(self, to_search: str, interval_type: ScrapInterval, queue_name: str, scrap_series: str,
92 | since: Optional[datetime], until: Optional[datetime], language: Optional[str]):
93 | post_data = {
94 | 'to_search': to_search,
95 | 'interval_type': interval_type.get_parameter_name(),
96 | 'queue_name': queue_name,
97 | 'scrap_series': scrap_series,
98 | 'since': since.isoformat() if since is not None else None,
99 | 'until': until.isoformat() if until is not None else None,
100 | 'language': language
101 | }
102 | self.__call_post_request('/add_search_to_scrap', post_data)
103 | return
104 |
105 | def __call_post_request(self, path: str, post_data: Dict[str, any]):
106 | url = self.command_server_host + path
107 | response = requests.post(url, data=post_data)
108 | if response.status_code >= 400:
109 | print("ERR path code:", response.status_code)
110 | return
111 |
112 |
113 | # def main():
114 | # result = requests.get('http://api.pandemocje.pl/api/hashtag_distribution')
115 | # hashtag_count_dict = result.json()['plot_raw']
116 | # hashtags = [it for it in hashtag_count_dict.keys() if hashtag_count_dict[it] > 100]
117 | # client = TwintDistributedTaskClient('http://192.168.0.124:5000')
118 | # for hashtag in hashtags:
119 | # print(hashtag)
120 | # client.add_search_to_scrap(hashtag, ScrapInterval.MONTH, 'bot_detection', 'hashtag_analyse', since=None,
121 | # until=None, language='pl')
122 | # return
123 | #
124 | #
125 | # def for_kajdanowicz():
126 | # users = ['AndrzejDuda', 'M_K_Blonska', 'pawel_tanajno', 'jakubiak_marek', 'mir_piotrowski', 'krzysztofbosak',
127 | # 'szymon_holownia', 'KosiniakKamysz', 'Grzywa_Slawomir', 'RobertBiedron', 'trzaskowski_']
128 | # client = TwintDistributedTaskClient('http://192.168.0.124:5000')
129 | #
130 | # scrap_since = datetime.now() - timedelta(days=60)
131 | # print(scrap_since.isoformat())
132 | #
133 | # for user in users:
134 | # print(user)
135 | # client.add_user_tweets_to_scrap(user, ScrapInterval.MONTH, 'bot_detection', 'kajdanowicz',
136 | # since=None, until=None)
137 | #
138 | #
139 | # # main()
140 | # for_kajdanowicz()
141 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3.7"
2 | services:
3 | # database for commands
4 | twint_distributed_mysql_db:
5 | image: mysql:8.0.19
6 | command: --default-authentication-plugin=mysql_native_password
7 | container_name: twint_distributed_mysql_db
8 | restart: unless-stopped
9 | environment:
10 | MYSQL_ROOT_PASSWORD: test1234
11 | volumes:
12 | - ./db_data:/var/lib/mysql
13 | - /etc/localtime:/etc/localtime:ro
14 | - /etc/timezone:/etc/timezone:ro
15 | # ports:
16 | # - 3306:3306
17 |
18 | # queue that serves scrape tasks
19 | twint_distributed_rabbitmq:
20 | image: rabbitmq:3.8.3-management
21 | volumes:
22 | - ./rabbit_data:/var/lib/rabbitmq
23 | - /etc/localtime:/etc/localtime:ro
24 | - /etc/timezone:/etc/timezone:ro
25 | ports:
26 | - 15672:15672 # management plugin
27 | # - 5672:5672 # RabbitMQ
28 |
29 | # command server to preview added and finished tasks
30 | twint_distributed_command_server:
31 | image: marcinwatroba/twint-distributed:latest
32 | restart: unless-stopped
33 | command: [python, -u, command_server.py]
34 | ports:
35 | - 5000:5000
36 | environment:
37 | - RABBIT_HOST=twint_distributed_rabbitmq
38 | - RABBIT_USERNAME=guest
39 | - RABBIT_PASSWORD=guest
40 | - MYSQL_HOST=twint_distributed_mysql_db
41 | - MYSQL_PORT=3306
42 | - MYSQL_USER=root
43 | - MYSQL_PASSWORD=test1234
44 | - WEBHOOK_HOST=no_host # set this variable if you want to receive a webhook after a session finishes
45 | volumes:
46 | - /etc/localtime:/etc/localtime:ro
47 | - /etc/timezone:/etc/timezone:ro
48 |
49 | # consumer -- this service can be scaled
50 | twint_distributed_scrapper_consumer:
51 | image: marcinwatroba/twint-distributed:latest
52 | restart: unless-stopped
53 | command: [python, -u, scrap_worker.py]
54 | environment:
55 | - RABBIT_HOST=twint_distributed_rabbitmq
56 | - RABBIT_USERNAME=guest
57 | - RABBIT_PASSWORD=guest
58 | - UPLOAD_FILE_HOST=[upload_file_host]
59 | - QUEUE_NAME=bot_detection
60 | - COMMAND_SERVER_HOST=twint_distributed_command_server
61 | volumes:
62 | - /etc/localtime:/etc/localtime:ro
63 | - /etc/timezone:/etc/timezone:ro
64 |
65 | # service to save data
66 | twint_distributed_data_server:
67 | image: marcinwatroba/twint-distributed:latest
68 | restart: unless-stopped
69 | command: [python, -u, data_server.py]
70 | volumes:
71 | - ./data:/data
72 | - /etc/localtime:/etc/localtime:ro
73 | - /etc/timezone:/etc/timezone:ro
74 | ports:
75 | - 5001:5000
76 |
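77 | # The consumer can be scaled out by starting more replicas, e.g.:
78 | #   docker-compose up -d --scale twint_distributed_scrapper_consumer=4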
--------------------------------------------------------------------------------
/rabbit.conf:
--------------------------------------------------------------------------------
1 | loopback_users.guest = false
2 | listeners.tcp.default = 5672
3 | log.console.level = warning
4 | management.tcp.inactivity_timeout = 120000
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask
2 | requests
3 | pysocks
4 | pandas
5 | numpy
6 | pika
7 | python-dateutil
8 | pymongo
9 | dacite
10 | aiohttp_socks
11 | mysql-connector-python
12 | pymysql
13 | # -e git://github.com/twintproject/twint.git#egg=twint
--------------------------------------------------------------------------------
/src/command_server.py:
--------------------------------------------------------------------------------
1 | import time
2 | from uuid import uuid4
3 |
4 | from dateutil.parser import parse as date_parser
5 | from flask import Flask
6 | from flask import request, jsonify
7 |
8 | import command_service
9 | import utils.docker_logs as docker_logs
10 | import utils.interval_utils as interval_utils
11 | from model.scrap_type import ScrapType
12 | from utils import commands_mysql_utils
13 |
14 | logger = docker_logs.get_logger('command_server')
15 | app = Flask(__name__)
16 |
17 |
18 | def get_new_index() -> str:
19 | return str(uuid4())
20 |
21 |
22 | def get_success_response():
23 | return jsonify({'status': 'SUCCESS'})
24 |
25 |
26 | @app.route("/add_user_tweets_to_scrap", methods=['POST'])
27 | def add_user_tweets_to_scrap():
28 | command_service.add_user_tweets_to_scrap(
29 | username=request.form['username'],
30 | since=date_parser(request.form['since']) if 'since' in request.form else None,
31 | until=date_parser(request.form['until']) if 'until' in request.form else None,
32 | queue_name=request.form['queue_name'],
33 | scrap_series=request.form['scrap_series'],
34 | interval_type=interval_utils.TimeIntervalType.get_from_string(request.form['interval_type'])
35 | )
36 | return get_success_response()
37 |
38 |
39 | @app.route("/add_user_details_to_scrap", methods=['POST'])
40 | def add_user_details_to_scrap():
41 | command_service.add_user_details_to_scrap(
42 | username=request.form['username'],
43 | queue_name=request.form['queue_name'],
44 | scrap_series=request.form['scrap_series']
45 | )
46 | return get_success_response()
47 |
48 |
49 | @app.route("/add_user_followings_to_scrap", methods=['POST'])
50 | def add_user_followings_to_scrap():
51 | command_service.add_user_followings_to_scrap(
52 | username=request.form['username'],
53 | queue_name=request.form['queue_name'],
54 | scrap_series=request.form['scrap_series']
55 | )
56 | return get_success_response()
57 |
58 |
59 | @app.route("/add_user_followers_to_scrap", methods=['POST'])
60 | def add_user_followers_to_scrap():
61 | command_service.add_user_followers_to_scrap(
62 | username=request.form['username'],
63 | queue_name=request.form['queue_name'],
64 | scrap_series=request.form['scrap_series']
65 | )
66 | return get_success_response()
67 |
68 |
69 | @app.route("/add_user_favorites_to_scrap", methods=['POST'])
70 | def add_user_favorites_to_scrap():
71 | command_service.add_user_favorites_to_scrap(
72 | username=request.form['username'],
73 | queue_name=request.form['queue_name'],
74 | scrap_series=request.form['scrap_series']
75 | )
76 | return get_success_response()
77 |
78 |
79 | @app.route("/add_search_to_scrap", methods=['POST'])
80 | def add_search_to_scrap():
81 | command_service.add_search_to_scrap(
82 | phrase=request.form['to_search'],
83 | since=date_parser(request.form['since']) if 'since' in request.form else None,
84 | until=date_parser(request.form['until']) if 'until' in request.form else None,
85 | language=request.form['language'] if 'language' in request.form else None,
86 | queue_name=request.form['queue_name'],
87 | scrap_series=request.form['scrap_series'],
88 | interval_type=interval_utils.TimeIntervalType.get_from_string(request.form['interval_type'])
89 | )
90 | return get_success_response()
91 |
92 |
93 | @app.route("/set_task_as_finished", methods=['POST'])
94 | def set_task_as_finished():
95 | logger.info(request.form)
96 | command_service.set_task_as_finished(request.form['task_id'], ScrapType[request.form['task_type']])
97 | return get_success_response()
98 |
99 |
100 | @app.route("/set_sub_task_as_finished", methods=['POST'])
101 | def set_sub_task_as_finished():
102 | logger.info(request.form)
103 | command_service.set_sub_task_as_finished(request.form['sub_task_id'], ScrapType[request.form['task_type']])
104 | return get_success_response()
105 |
106 |
107 | @app.route("/get_all_tasks", methods=['GET'])
108 | def get_all_scrapped_users():
109 | return jsonify(command_service.get_all_scrapped_tasks())
110 |
111 |
112 | def wait_for_mysql():
113 | try_count = 100
114 | success = False
115 | while try_count > 0 and not success:
116 | try:
117 | commands_mysql_utils.get_db_connection_base().close()
118 | success = True
119 | except Exception:
120 | try_count = try_count - 1
121 | logger.info("error during connect to mysql")
122 | logger.info("wait 3 seconds for next try")
123 | time.sleep(3)
124 | if success:
125 | return
126 | else:
127 | raise Exception("can't connect with mysql")
128 |
129 |
130 | if __name__ == "__main__":
131 | wait_for_mysql()
132 | commands_mysql_utils.prepare_database()
133 | app.run(host="0.0.0.0", debug=True)
134 |
--------------------------------------------------------------------------------
/src/command_service.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from typing import List, Optional
3 | from uuid import uuid4
4 |
5 | import pandas as pd
6 | import requests
7 | from pandas._libs.tslibs.nattype import NaT
8 | from pandas._libs.tslibs.timestamps import Timestamp
9 |
10 | import model.hashtag_scrap_params as hashtag_scrap_params
11 | import utils.docker_logs as docker_logs
12 | import utils.interval_utils as interval_utils
13 | from configuration import webhook_config
14 | from dao import user_favorites_task_dao, user_followings_task_dao, \
15 | user_followers_task_dao, user_details_task_dao, search_by_task_dao, user_tweets_task_dao, session_dao
16 | from model.scrap_type import ScrapType
17 | from model.time_interval import TimeInterval
18 | from model.user_scrap_params import UserFavoritesScrapTaskParams, UserFollowersScrapTaskParams, \
19 | UserFollowingScrapTaskParams, \
20 | UserDetailsScrapTaskParams, UserTweetsScrapTaskParams
21 | from utils.params_encoder import ParamsEncoder
22 | from utils.rabbit_send_utils import send_to_rabbit
23 |
24 | logger = docker_logs.get_logger('command_service')
25 |
26 |
27 | def get_new_index() -> str:
28 | return str(uuid4())
29 |
30 |
31 | def get_scrap_session_id_by_name(scrap_session_name: str) -> str:
32 | session_id = session_dao.get_scrap_session_id_by_name(scrap_session_name)
33 | if session_id is None:
34 | session_id = get_new_index()
35 | session_dao.add_session(session_id, scrap_session_name)
36 | return session_id
37 |
38 |
39 | def get_interval_list(since: Optional[datetime], until: Optional[datetime],
40 | interval_type: interval_utils.TimeIntervalType) -> List[TimeInterval]:
41 | return interval_utils.get_list_interval(since, until, interval_type)
42 |
43 |
44 | def add_user_tweets_to_scrap(username: str, since: Optional[datetime], until: Optional[datetime], queue_name: str, scrap_series: str,
45 | interval_type: interval_utils.TimeIntervalType):
46 | intervals = get_interval_list(since, until, interval_type=interval_type)
47 | since_non_null = sorted([it.since for it in intervals])[0]
48 | until_non_null = sorted([it.until for it in intervals])[-1]
49 | scrap_session_id = get_scrap_session_id_by_name(scrap_series)
50 | task_id = get_new_index()
51 | user_tweets_task_dao.add_task(task_id, username, since_non_null, until_non_null, datetime.now(), scrap_session_id,
52 | queue_name)
53 | for interval in intervals:
54 | params = UserTweetsScrapTaskParams(
55 | task_id=get_new_index(),
56 | username=username,
57 | since=interval.since,
58 | until=interval.until,
59 | scrap_series=scrap_series,
60 | queue_name=queue_name
61 | )
62 | user_tweets_task_dao.add_sub_task(params.task_id, task_id, params.since, params.until, datetime.now())
63 | params_str = ParamsEncoder().default(params)
64 | logger.info(params_str + " " + params.queue_name)
65 | send_to_rabbit(params.queue_name, params_str)
66 | return
67 |
68 |
69 | def add_user_details_to_scrap(username: str, queue_name: str, scrap_series: str):
70 | scrap_session_id = get_scrap_session_id_by_name(scrap_series)
71 | params = UserDetailsScrapTaskParams(
72 | task_id=get_new_index(),
73 | username=username,
74 | queue_name=queue_name,
75 | scrap_series=scrap_series
76 | )
77 | user_details_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name)
78 | params_str = ParamsEncoder().default(params)
79 | logger.info(params_str + " " + params.queue_name)
80 | send_to_rabbit(params.queue_name, params_str)
81 | return
82 |
83 |
84 | def add_user_followings_to_scrap(username: str, queue_name: str, scrap_series: str):
85 | scrap_session_id = get_scrap_session_id_by_name(scrap_series)
86 | params = UserFollowingScrapTaskParams(
87 | task_id=get_new_index(),
88 | username=username,
89 | queue_name=queue_name,
90 | scrap_series=scrap_series
91 | )
92 | user_followings_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name)
93 | params_str = ParamsEncoder().default(params)
94 | logger.info(params_str + " " + queue_name)
95 | send_to_rabbit(queue_name, params_str)
96 | return
97 |
98 |
99 | def add_user_followers_to_scrap(username: str, queue_name: str, scrap_series: str):
100 | scrap_session_id = get_scrap_session_id_by_name(scrap_series)
101 | params = UserFollowersScrapTaskParams(
102 | task_id=get_new_index(),
103 | username=username,
104 | queue_name=queue_name,
105 | scrap_series=scrap_series
106 | )
107 | user_followers_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name)
108 | params_str = ParamsEncoder().default(params)
109 | logger.info(params_str + " " + queue_name)
110 | send_to_rabbit(queue_name, params_str)
111 | return
112 |
113 |
114 | def add_user_favorites_to_scrap(username: str, queue_name: str, scrap_series: str):
115 | scrap_session_id = get_scrap_session_id_by_name(scrap_series)
116 | params = UserFavoritesScrapTaskParams(
117 | task_id=get_new_index(),
118 | username=username,
119 | queue_name=queue_name,
120 | scrap_series=scrap_series
121 | )
122 | user_favorites_task_dao.add_task(params.task_id, username, datetime.now(), scrap_session_id, queue_name)
123 | params_str = ParamsEncoder().default(params)
124 | logger.info(params_str + " " + queue_name)
125 | send_to_rabbit(queue_name, params_str)
126 | return
127 |
128 |
129 | def add_search_to_scrap(phrase: str, since: Optional[datetime], until: Optional[datetime], language: Optional[str],
130 | queue_name: str, scrap_series: str, interval_type: interval_utils.TimeIntervalType):
131 | intervals = get_interval_list(since, until, interval_type=interval_type)
132 | since_non_null = sorted([it.since for it in intervals])[0]
133 | until_non_null = sorted([it.until for it in intervals])[-1]
134 | scrap_session_id = get_scrap_session_id_by_name(scrap_series)
135 | task_id = get_new_index()
136 | search_by_task_dao.add_task(task_id, phrase, since_non_null, until_non_null, datetime.now(), scrap_session_id,
137 | queue_name)
138 | for interval in intervals:
139 | params = hashtag_scrap_params.PhraseScrapTaskParams(
140 | task_id=get_new_index(),
141 | phrase=phrase,
142 | since=interval.since,
143 | until=interval.until,
144 | language=language,
145 | queue_name=queue_name,
146 | scrap_series=scrap_series
147 | )
148 | search_by_task_dao.add_sub_task(params.task_id, task_id, params.since, params.until, datetime.now())
149 | params_str = ParamsEncoder().default(params)
150 | logger.info(params_str + " " + params.queue_name)
151 | send_to_rabbit(params.queue_name, params_str)
152 | return
153 |
154 |
155 | def send_session_finished_webhook(scrap_session_name: str):
156 | post_data = {
157 | 'scrap_session_name': scrap_session_name,
158 | }
159 | url = webhook_config.get_webhook_host() + '/scrap_session_finished'
160 | requests.post(url, data=post_data)
161 | return
162 |
163 |
164 | def support_finish_session(session_id: str):
165 | count = session_dao.get_not_finished_session_tasks_count(session_id)
166 | if count == 0:
167 | session_name = session_dao.get_scrap_session_name_by_id(session_id)
168 | logger.info('finished session ' + session_name)
169 | if webhook_config.is_webhook_configured():
170 | send_session_finished_webhook(session_name)
171 | else:
172 | logger.info('webhook not configured')
173 | else:
174 | logger.info('count to finish session: ' + str(count))
175 | return
176 |
177 |
178 | def set_task_as_finished(task_id: str, task_type: ScrapType):
179 | scrap_session_id = ''
180 | if task_type == ScrapType.USER_FAVORITES:
181 | user_favorites_task_dao.set_task_finished(task_id, datetime.now())
182 | scrap_session_id = user_favorites_task_dao.get_session_id(task_id)
183 | elif task_type == ScrapType.USER_FOLLOWINGS:
184 | user_followings_task_dao.set_task_finished(task_id, datetime.now())
185 | scrap_session_id = user_followings_task_dao.get_session_id(task_id)
186 | elif task_type == ScrapType.USER_FOLLOWERS:
187 | user_followers_task_dao.set_task_finished(task_id, datetime.now())
188 | scrap_session_id = user_followers_task_dao.get_session_id(task_id)
189 | elif task_type == ScrapType.USER_DETAILS:
190 | user_details_task_dao.set_task_finished(task_id, datetime.now())
191 | scrap_session_id = user_details_task_dao.get_session_id(task_id)
192 | else:
193 | raise Exception("Bad type")
194 |
195 | support_finish_session(scrap_session_id)
196 | return
197 |
198 |
199 | def set_sub_task_as_finished(sub_task_id: str, task_type: ScrapType):
200 | if task_type == ScrapType.SEARCH_BY_PHRASE:
201 | dao = search_by_task_dao
202 | elif task_type == ScrapType.USER_TWEETS:
203 | dao = user_tweets_task_dao
204 | else:
205 | raise Exception("Bad type")
206 |
207 | task_id = dao.get_task_id_sub_task_id(sub_task_id)
208 | dao.set_sub_task_finished(sub_task_id, datetime.now())
209 | not_finished_sub_tasks_count = dao.get_all_not_finished_sub_tasks_by_task_id(task_id).size
210 | if not_finished_sub_tasks_count == 0:
211 | dao.set_task_finished(task_id, datetime.now())
212 | scrap_session_id = dao.get_session_id(task_id)
213 | support_finish_session(scrap_session_id)
214 | return
215 |
216 |
217 | def map_value_to_string(value) -> Optional[any]:
218 | # print(value.__class__)
219 | if isinstance(value, Timestamp):
220 | print(value)
221 | return value.isoformat()
222 | elif value is NaT:
223 | return None
224 | else:
225 | return value
226 |
227 |
228 | def data_frame_to_json_list(df: pd.DataFrame):
229 | df_list = [dict(row) for index, row in df.iterrows()]
230 | df_list = [
231 | {key: map_value_to_string(row[key]) for key in row.keys()}
232 | for row in df_list
233 | ]
234 | return df_list
235 |
236 |
237 | def get_all_scrapped_tasks():
238 | df_dict = dict({
239 | ScrapType.SEARCH_BY_PHRASE.name: search_by_task_dao.get_all_tasks(),
240 | ScrapType.USER_FOLLOWERS.name: user_followers_task_dao.get_all_tasks(),
241 | ScrapType.USER_FOLLOWINGS.name: user_followings_task_dao.get_all_tasks(),
242 | ScrapType.USER_FAVORITES.name: user_favorites_task_dao.get_all_tasks(),
243 | ScrapType.USER_DETAILS.name: user_details_task_dao.get_all_tasks(),
244 | ScrapType.USER_TWEETS.name: user_tweets_task_dao.get_all_tasks()
245 | })
246 | return {it: data_frame_to_json_list(df_dict[it]) for it in df_dict.keys()}
247 |
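248 | # A minimal sketch of a receiver for the webhook sent by send_session_finished_webhook() above.
249 | # It would run on the host configured via WEBHOOK_HOST; the route and the form field match the
250 | # URL and payload built in that function:
251 | #
252 | #     from flask import Flask, request
253 | #     app = Flask(__name__)
254 | #
255 | #     @app.route('/scrap_session_finished', methods=['POST'])
256 | #     def scrap_session_finished():
257 | #         print('finished scrap session:', request.form['scrap_session_name'])
258 | #         return 'OK'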
--------------------------------------------------------------------------------
/src/configuration/command_server_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | _command_server_host = os.environ['COMMAND_SERVER_HOST']
4 |
5 |
6 | def get_command_server_host() -> str:
7 | return _command_server_host
8 |
--------------------------------------------------------------------------------
/src/configuration/mysql_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import mysql
4 | from mysql.connector import MySQLConnection
5 |
6 | db_hostname = os.environ['MYSQL_HOST']
7 | db_port = os.environ['MYSQL_PORT']
8 | db_username = os.environ['MYSQL_USER']
9 | db_password = os.environ['MYSQL_PASSWORD']
10 | TWINT_DISTRIBUTED_DATABASE = 'twint_distributed_tasks'
11 |
12 |
13 | def get_db_connection() -> MySQLConnection:
14 | return mysql.connector.connect(
15 | host=db_hostname,
16 | port=db_port,
17 | user=db_username,
18 | passwd=db_password,
19 | database=TWINT_DISTRIBUTED_DATABASE
20 | )
21 |
22 |
23 | def get_db_connection_base() -> MySQLConnection:
24 | return mysql.connector.connect(
25 | host=db_hostname,
26 | port=db_port,
27 | user=db_username,
28 | passwd=db_password
29 | )
30 |
--------------------------------------------------------------------------------
/src/configuration/proxy_config.py:
--------------------------------------------------------------------------------
1 | class ProxyConfig:
2 |
3 | def __init__(self, host: str, port: int, proxy_type: str):
4 | self._host = host
5 | self._port = port
6 | self._proxy_type = proxy_type
7 | return
8 |
9 | def get_host(self):
10 | return self._host
11 |
12 | def get_port(self):
13 | return self._port
14 |
15 | def get_proxy_type(self):
16 | return self._proxy_type
17 |
18 | def to_string(self):
19 | return 'ProxyConfig(host=' + self.get_host() + '; port=' + str(
20 | self.get_port()) + '; proxy_type=' + self.get_proxy_type() + ')'
21 |
22 |
23 | default_proxy_config = ProxyConfig('localhost', 9050, 'socks5')
24 |
--------------------------------------------------------------------------------
/src/configuration/rabbit_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pika
4 |
5 | _rabbit_host = os.environ['RABBIT_HOST']
6 | _rabbit_username = os.environ['RABBIT_USERNAME']
7 | _rabbit_password = os.environ['RABBIT_PASSWORD']
8 |
9 |
10 | def get_rabbit_connection_config() -> pika.ConnectionParameters:
11 | return pika.ConnectionParameters(
12 | host=_rabbit_host,
13 | credentials=pika.credentials.PlainCredentials(
14 | username=_rabbit_username,
15 | password=_rabbit_password
16 | ),
17 | heartbeat=30
18 | )
19 |
--------------------------------------------------------------------------------
/src/configuration/upload_file_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | _upload_file_host = os.environ['UPLOAD_FILE_HOST']
4 |
5 |
6 | def get_upload_file_host() -> str:
7 | return _upload_file_host
8 |
--------------------------------------------------------------------------------
/src/configuration/webhook_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def is_webhook_configured() -> bool:
5 | return 'WEBHOOK_HOST' in os.environ and os.environ['WEBHOOK_HOST'] != 'no_host'
6 |
7 |
8 | def get_webhook_host() -> str:
9 | return os.environ['WEBHOOK_HOST']
10 |
--------------------------------------------------------------------------------
/src/configuration/worker_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def get_queue_name() -> str:
5 | return os.environ['QUEUE_NAME']
6 |
--------------------------------------------------------------------------------
/src/dao/search_by_task_dao.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query
4 |
5 |
6 | def add_task(task_id: str, phrase: str, since: datetime, until: datetime, created: datetime, scrap_session_id: str,
7 | queue_name: str):
8 | execute_sql_modify(
9 | '''INSERT INTO twint_distributed_tasks.SearchTweetScrapTasks(task_id, phrase, since, until, created, finished,
10 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s, %s, %s);''',
11 | [task_id, phrase, since, until, created, None, scrap_session_id, queue_name])
12 | return
13 |
14 |
15 | def add_sub_task(sub_task_id: str, task_id: str, since: datetime, until: datetime, created: datetime):
16 | print('task_id', task_id)
17 | execute_sql_modify(
18 | '''INSERT INTO twint_distributed_tasks.SearchTweetScrapSubTasks(sub_task_id, task_id, since, until, created,
19 | finished) VALUE (%s, %s, %s, %s, %s, %s);''',
20 | [sub_task_id, task_id, since, until, created, None])
21 | return
22 |
23 |
24 | def set_task_finished(task_id: str, finished: datetime):
25 | execute_sql_modify(
26 | '''UPDATE twint_distributed_tasks.SearchTweetScrapTasks
27 | SET finished = %s
28 | WHERE task_id = %s''',
29 | [finished, task_id])
30 | return
31 |
32 |
33 | def set_sub_task_finished(sub_task_id: str, finished: datetime):
34 | execute_sql_modify(
35 | '''UPDATE twint_distributed_tasks.SearchTweetScrapSubTasks
36 | SET finished = %s
37 | WHERE sub_task_id = %s''',
38 | [finished, sub_task_id])
39 | return
40 |
41 |
42 | def get_all_not_finished_sub_tasks_by_task_id(task_id: str):
43 | return execute_sql_query(
44 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapSubTasks WHERE task_id=%s AND finished IS NULL',
45 | [task_id])
46 |
47 |
48 | def get_session_id(task_id: str) -> str:
49 | return execute_sql_query(
50 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapTasks WHERE task_id=%s',
51 | [task_id]
52 | )['scrap_session_id'].to_numpy()[0]
53 |
54 |
55 | def get_task_id_sub_task_id(sub_task_id: str) -> str:
56 | return execute_sql_query(
57 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapSubTasks WHERE sub_task_id=%s',
58 | [sub_task_id]
59 | )['task_id'].to_numpy()[0]
60 |
61 |
62 | def get_all_tasks_by_username(phrase: str):
63 | return execute_sql_query(
64 | 'SELECT * FROM twint_distributed_tasks.SearchTweetScrapTasks WHERE phrase=%s',
65 | [phrase]
66 | )
67 |
68 |
69 | def get_all_tasks():
70 | return execute_sql_query(
71 | '''SELECT task_id, phrase, since, until, language, created, finished, queue_name, scrap_session_name
72 | FROM twint_distributed_tasks.SearchTweetScrapTasks t
73 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''',
74 | [])
75 |
--------------------------------------------------------------------------------
/src/dao/session_dao.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query
4 |
5 |
6 | def add_session(scrap_session_id: str, scrap_session_name: str):
7 | execute_sql_modify(
8 | '''INSERT INTO twint_distributed_tasks.ScrapSession(scrap_session_id, scrap_session_name) VALUE (%s, %s);''',
9 | [scrap_session_id, scrap_session_name])
10 | return
11 |
12 |
13 | def get_scrap_session_id_by_name(scrap_session_name: str) -> Optional[str]:
14 | values = list(
15 | execute_sql_query(
16 | 'SELECT * FROM twint_distributed_tasks.ScrapSession WHERE scrap_session_name=%s',
17 | [scrap_session_name]
18 | )['scrap_session_id'].to_numpy())
19 | return values[0] if len(values) > 0 else None
20 |
21 |
22 | def get_scrap_session_name_by_id(scrap_session_id: str) -> Optional[str]:
23 | values = list(
24 | execute_sql_query(
25 | 'SELECT * FROM twint_distributed_tasks.ScrapSession WHERE scrap_session_id=%s',
26 | [scrap_session_id]
27 | )['scrap_session_name'].to_numpy())
28 | return values[0] if len(values) > 0 else None
29 |
30 |
31 | def get_not_finished_session_tasks_count(scrap_session_id: str) -> int:
32 | queries = [
33 | '''SELECT COUNT(*) FROM twint_distributed_tasks.SearchTweetScrapTasks
34 | WHERE scrap_session_id=%s AND finished IS NULL''',
35 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserDetailsScrapTasks
36 | WHERE scrap_session_id=%s AND finished IS NULL''',
37 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserTweetScrapTasks
38 | WHERE scrap_session_id=%s AND finished IS NULL''',
39 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserFollowersScrapTasks
40 | WHERE scrap_session_id=%s AND finished IS NULL''',
41 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserFollowingScrapTasks
42 | WHERE scrap_session_id=%s AND finished IS NULL''',
43 | '''SELECT COUNT(*) FROM twint_distributed_tasks.UserFavoritesScrapTasks
44 | WHERE scrap_session_id=%s AND finished IS NULL'''
45 | ]
46 | return int(sum([execute_sql_query(query, [scrap_session_id]).to_numpy()[0][0] for query in queries]))
47 |
48 |
49 | def get_all_sessions():
50 | return execute_sql_query('SELECT * FROM twint_distributed_tasks.ScrapSession', [])
51 |
--------------------------------------------------------------------------------
/src/dao/user_details_task_dao.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query
4 |
5 |
6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str):
7 | execute_sql_modify(
8 | '''INSERT INTO twint_distributed_tasks.UserDetailsScrapTasks(task_id, username, created, finished,
9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''',
10 | [task_id, username, created, None, scrap_session_id, queue_name])
11 | return
12 |
13 |
14 | def set_task_finished(task_id: str, finished: datetime):
15 | execute_sql_modify(
16 | '''UPDATE twint_distributed_tasks.UserDetailsScrapTasks
17 | SET finished = %s
18 | WHERE task_id = %s''',
19 | [finished, task_id])
20 | return
21 |
22 |
23 | def get_session_id(task_id: str) -> str:
24 | return execute_sql_query(
25 | 'SELECT * FROM twint_distributed_tasks.UserDetailsScrapTasks WHERE task_id=%s',
26 | [task_id]
27 | )['scrap_session_id'].to_numpy()[0]
28 |
29 |
30 | def get_all_by_username(username: str):
31 | return execute_sql_query(
32 | 'SELECT * FROM twint_distributed_tasks.UserDetailsScrapTasks WHERE username=%s',
33 | [username]
34 | )
35 |
36 |
37 | def get_all_tasks():
38 | return execute_sql_query(
39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name
40 | FROM twint_distributed_tasks.UserDetailsScrapTasks t
41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''',
42 | [])
43 |
--------------------------------------------------------------------------------
/src/dao/user_favorites_task_dao.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query
4 |
5 |
6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str):
7 | execute_sql_modify(
8 | '''INSERT INTO twint_distributed_tasks.UserFavoritesScrapTasks(task_id, username, created, finished,
9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''',
10 | [task_id, username, created, None, scrap_session_id, queue_name])
11 | return
12 |
13 |
14 | def set_task_finished(task_id: str, finished: datetime):
15 | execute_sql_modify(
16 | '''UPDATE twint_distributed_tasks.UserFavoritesScrapTasks
17 | SET finished = %s
18 | WHERE task_id = %s''',
19 | [finished, task_id])
20 | return
21 |
22 |
23 | def get_session_id(task_id: str) -> str:
24 | return execute_sql_query(
25 | 'SELECT * FROM twint_distributed_tasks.UserFavoritesScrapTasks WHERE task_id=%s',
26 | [task_id]
27 | )['scrap_session_id'].to_numpy()[0]
28 |
29 |
30 | def get_all_by_username(username: str):
31 | return execute_sql_query(
32 | 'SELECT * FROM twint_distributed_tasks.UserFavoritesScrapTasks WHERE username=%s',
33 | [username]
34 | )
35 |
36 |
37 | def get_all_tasks():
38 | return execute_sql_query(
39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name
40 | FROM twint_distributed_tasks.UserFavoritesScrapTasks t
41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''',
42 | [])
43 |
--------------------------------------------------------------------------------
/src/dao/user_followers_task_dao.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query
4 |
5 |
6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str):
7 | execute_sql_modify(
8 | '''INSERT INTO twint_distributed_tasks.UserFollowersScrapTasks(task_id, username, created, finished,
9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''',
10 | [task_id, username, created, None, scrap_session_id, queue_name])
11 | return
12 |
13 |
14 | def set_task_finished(task_id: str, finished: datetime):
15 | execute_sql_modify(
16 | '''UPDATE twint_distributed_tasks.UserFollowersScrapTasks
17 | SET finished = %s
18 | WHERE task_id = %s''',
19 | [finished, task_id])
20 | return
21 |
22 |
23 | def get_session_id(task_id: str) -> str:
24 | return execute_sql_query(
25 | 'SELECT * FROM twint_distributed_tasks.UserFollowersScrapTasks WHERE task_id=%s',
26 | [task_id]
27 | )['scrap_session_id'].to_numpy()[0]
28 |
29 |
30 | def get_all_by_username(username: str):
31 | return execute_sql_query(
32 | 'SELECT * FROM twint_distributed_tasks.UserFollowersScrapTasks WHERE username=%s',
33 | [username]
34 | )
35 |
36 |
37 | def get_all_tasks():
38 | return execute_sql_query(
39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name
40 | FROM twint_distributed_tasks.UserFollowersScrapTasks t
41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''',
42 | [])
43 |
--------------------------------------------------------------------------------
/src/dao/user_followings_task_dao.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query
4 |
5 |
6 | def add_task(task_id: str, username: str, created: datetime, scrap_session_id: str, queue_name: str):
7 | execute_sql_modify(
8 | '''INSERT INTO twint_distributed_tasks.UserFollowingScrapTasks(task_id, username, created, finished,
9 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s);''',
10 | [task_id, username, created, None, scrap_session_id, queue_name])
11 | return
12 |
13 |
14 | def set_task_finished(task_id: str, finished: datetime):
15 | execute_sql_modify(
16 | '''UPDATE twint_distributed_tasks.UserFollowingScrapTasks
17 | SET finished = %s
18 | WHERE task_id = %s''',
19 | [finished, task_id])
20 | return
21 |
22 |
23 | def get_session_id(task_id: str) -> str:
24 | return execute_sql_query(
25 | 'SELECT * FROM twint_distributed_tasks.UserFollowingScrapTasks WHERE task_id=%s',
26 | [task_id]
27 | )['scrap_session_id'].to_numpy()[0]
28 |
29 |
30 | def get_all_by_username(username: str):
31 | return execute_sql_query(
32 | 'SELECT * FROM twint_distributed_tasks.UserFollowingScrapTasks WHERE username=%s',
33 | [username]
34 | )
35 |
36 |
37 | def get_all_tasks():
38 | return execute_sql_query(
39 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name
40 | FROM twint_distributed_tasks.UserFollowingScrapTasks t
41 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''',
42 | [])
43 |
--------------------------------------------------------------------------------
/src/dao/user_tweets_task_dao.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from utils.commands_mysql_utils import execute_sql_modify, execute_sql_query
4 |
5 |
6 | def add_task(task_id: str, username: str, since: datetime, until: datetime, created: datetime,
7 | scrap_session_id: str, queue_name: str):
8 | execute_sql_modify(
9 | '''INSERT INTO twint_distributed_tasks.UserTweetScrapTasks(task_id, username, since, until, created, finished,
10 | scrap_session_id, queue_name) VALUE (%s, %s, %s, %s, %s, %s, %s, %s);''',
11 | [task_id, username, since, until, created, None, scrap_session_id, queue_name])
12 | return
13 |
14 |
15 | def add_sub_task(sub_task_id: str, task_id: str, since: datetime, until: datetime, created: datetime):
16 | execute_sql_modify(
17 | '''INSERT INTO twint_distributed_tasks.UserTweetScrapSubTasks(sub_task_id, task_id, since, until, created,
18 | finished) VALUE (%s, %s, %s, %s, %s, %s);''',
19 | [sub_task_id, task_id, since, until, created, None])
20 | return
21 |
22 |
23 | def set_task_finished(task_id: str, finished: datetime):
24 | execute_sql_modify(
25 | '''UPDATE twint_distributed_tasks.UserTweetScrapTasks
26 | SET finished = %s
27 | WHERE task_id = %s''',
28 | [finished, task_id])
29 | return
30 |
31 |
32 | def set_sub_task_finished(sub_task_id: str, finished: datetime):
33 | execute_sql_modify(
34 | '''UPDATE twint_distributed_tasks.UserTweetScrapSubTasks
35 | SET finished = %s
36 | WHERE sub_task_id = %s''',
37 | [finished, sub_task_id])
38 | return
39 |
40 |
41 | def get_session_id(task_id: str) -> str:
42 | return execute_sql_query(
43 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapTasks WHERE task_id=%s',
44 | [task_id]
45 | )['scrap_session_id'].to_numpy()[0]
46 |
47 |
48 | def get_all_not_finished_sub_tasks_by_task_id(task_id: str):
49 | return execute_sql_query(
50 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapSubTasks WHERE task_id=%s AND finished IS NULL',
51 | [task_id])
52 |
53 |
54 | def get_task_id_sub_task_id(sub_task_id: str) -> str:
55 | return execute_sql_query(
56 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapSubTasks WHERE sub_task_id=%s',
57 | [sub_task_id]
58 | )['task_id'].to_numpy()[0]
59 |
60 |
61 | def get_all_tasks_by_username(username: str):
62 | return execute_sql_query(
63 | 'SELECT * FROM twint_distributed_tasks.UserTweetScrapTasks WHERE username=%s',
64 | [username]
65 | )
66 |
67 |
68 | def get_all_tasks():
69 | return execute_sql_query(
70 | '''SELECT task_id, username, created, finished, queue_name, scrap_session_name, since, until
71 | FROM twint_distributed_tasks.UserTweetScrapTasks t
72 | JOIN twint_distributed_tasks.ScrapSession s ON t.scrap_session_id = s.scrap_session_id''',
73 | [])
74 |
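A minimal usage sketch for this DAO (assuming src/ is on the import path, the schema from utils/init_database.sql exists, and a ScrapSession row with the id used below has already been inserted; all identifiers and the queue name are placeholders):

import uuid
from datetime import datetime

import dao.user_tweets_task_dao as user_tweets_task_dao

session_id = 'example-session-id'   # must reference an existing ScrapSession row
task_id = str(uuid.uuid4())
sub_task_id = str(uuid.uuid4())

# register a task covering a year plus one month-sized sub-task
user_tweets_task_dao.add_task(
    task_id, 'some_username',
    datetime(2020, 1, 1), datetime(2020, 12, 31, 23, 59, 59),
    datetime.now(), session_id, 'example_queue')
user_tweets_task_dao.add_sub_task(
    sub_task_id, task_id,
    datetime(2020, 1, 1), datetime(2020, 1, 31, 23, 59, 59),
    datetime.now())

# when a worker reports completion, close the sub-task and, if it was the last
# open one, the parent task as well
user_tweets_task_dao.set_sub_task_finished(sub_task_id, datetime.now())
if user_tweets_task_dao.get_all_not_finished_sub_tasks_by_task_id(task_id).empty:
    user_tweets_task_dao.set_task_finished(task_id, datetime.now())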
--------------------------------------------------------------------------------
/src/data_server.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from flask import Flask, Response
3 | from flask import request, jsonify
4 |
5 | import utils.directory_utils as directory_utils
6 | import utils.docker_logs as docker_logs
7 | import utils.sqlite_util as sqlite_util
8 |
9 | logger = docker_logs.get_logger('data_server')
10 |
11 | app = Flask(__name__)
12 | app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False
13 |
14 | ROOT_DATA_DIR = '/data'
15 |
16 |
17 | def get_success_response():
18 | return jsonify({'status': 'SUCCESS'})
19 |
20 |
21 | def get_data_to_save_directory(data_type: str, sub_series: str) -> str:
22 | return ROOT_DATA_DIR + '/scrap_data/' + data_type + '/' + sub_series
23 |
24 |
25 | def df_to_json_response(df: pd.DataFrame) -> Response:
26 | return Response(
27 | df.to_json(orient="records", date_format='iso'),
28 | mimetype='application/json'
29 | )
30 |
31 |
32 | @app.route("/upload_result_file", methods=['POST'])
33 | def upload_result_file():
34 | file = request.files['file']
35 | data = request.form
36 | sub_series = data['sub_series']
37 | filename = data['filename']
38 | data_type = data['data_type']
39 |
40 | file_directory = get_data_to_save_directory(data_type, sub_series)
41 | file_path = file_directory + '/' + filename
42 |
43 | directory_utils.prepare_directory(file_directory)
44 | file.save(file_path)
45 |
46 | return get_success_response()
47 |
48 |
49 | @app.route("/get_user_details/", methods=['GET'])
50 | def get_user_details(username: str):
51 | user_folder_name = 'u_' + username
52 | user_details_db_file = 'ud_' + username + '.db'
53 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_details' + '/' + user_folder_name + '/' + user_details_db_file
54 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM users')
55 | return df_to_json_response(df)
56 |
57 |
58 | @app.route("/get_user_tweets/", methods=['GET'])
59 | def get_user_tweets(username: str):
60 | logger.info('get_user_tweets ' + username + ' start read tweets')
61 | user_folder_name = 'u_' + username
62 | base_directory_path = ROOT_DATA_DIR + '/scrap_data/user_tweets' + '/' + user_folder_name + '/'
63 | db_files = directory_utils.get_db_files_path_list_from_directory(base_directory_path)
64 | merged_data_df = pd.concat([
65 | sqlite_util.get_df_from_sqlite_db(db_file, 'SELECT * FROM tweets')
66 | for db_file in db_files
67 | ])
68 | logger.info('get_user_tweets ' + username + ' processing finished')
69 | df_without_duplicates = merged_data_df.drop_duplicates(subset="id_str")
70 | return df_to_json_response(df_without_duplicates)
71 |
72 |
73 | @app.route("/get_searched_tweets/", methods=['GET'])
74 | def get_searched_tweets(to_search: str):
75 | logger.info('get_searched_tweets ' + to_search + ' start read tweets')
76 | phrase_folder_name = 's_' + to_search
77 | base_directory_path = ROOT_DATA_DIR + '/scrap_data/search_by_phrase' + '/' + phrase_folder_name + '/'
78 | db_files = directory_utils.get_db_files_path_list_from_directory(base_directory_path)
79 | merged_data_df = pd.concat([
80 | sqlite_util.get_df_from_sqlite_db(db_file, 'SELECT * FROM tweets')
81 | for db_file in db_files
82 | ])
83 | logger.info('get_searched_tweets ' + to_search + ' start remove duplicates')
84 | df_without_duplicates = merged_data_df.drop_duplicates(subset="id_str")
85 | logger.info('get_searched_tweets ' + to_search + ' processing finished')
86 | return df_to_json_response(df_without_duplicates)
87 |
88 |
89 | @app.route("/get_user_followers/", methods=['GET'])
90 | def get_user_followers(username: str):
91 | user_folder_name = 'u_' + username
92 | user_details_db_file = 'ufe_' + username + '.db'
93 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_followers' + '/' + user_folder_name + '/' + user_details_db_file
94 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM followers_names')['user']
95 | return df_to_json_response(df)
96 |
97 |
98 | @app.route("/get_user_followings/", methods=['GET'])
99 | def get_user_followings(username: str):
100 | user_folder_name = 'u_' + username
101 | user_details_db_file = 'ufi_' + username + '.db'
102 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_followings' + '/' + user_folder_name + '/' + user_details_db_file
103 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM following_names')['user']
104 | return df_to_json_response(df)
105 |
106 |
107 | @app.route("/get_user_favorites/", methods=['GET'])
108 | def get_user_favorites(username: str):
109 | user_folder_name = 'u_' + username
110 | user_details_db_file = 'ufa_' + username + '.db'
111 | db_file_path = ROOT_DATA_DIR + '/scrap_data/user_favorites' + '/' + user_folder_name + '/' + user_details_db_file
112 | df = sqlite_util.get_df_from_sqlite_db(db_file_path, 'SELECT * FROM favorites')['tweet_id']
113 | return df_to_json_response(df)
114 |
115 |
116 | if __name__ == "__main__":
117 | app.run(host="0.0.0.0", debug=True)
118 |
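A short client-side sketch against these endpoints; the host, port and username are assumptions (Flask's default port 5000, no reverse proxy), and reading tweets back only works once at least one result file for that user has been uploaded:

import requests

BASE_URL = 'http://localhost:5000'  # assumed address of the data server

# upload a result .db file, the same way the scrap worker does
with open('ud_some_username.db', 'rb') as db_file:
    response = requests.post(
        BASE_URL + '/upload_result_file',
        data={'sub_series': 'u_some_username',
              'filename': 'ud_some_username.db',
              'data_type': 'user_details'},
        files={'file': db_file})
print(response.json())  # {'status': 'SUCCESS'}

# read the merged, de-duplicated tweets back as a JSON list
tweets = requests.get(BASE_URL + '/get_user_tweets/some_username').json()
print(len(tweets))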
--------------------------------------------------------------------------------
/src/model/hashtag_scrap_params.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from dataclasses import dataclass
3 | from typing import Optional
4 |
5 | from dateutil.parser import parse as date_parser
6 |
7 | import utils.time_utils as time_utils
8 | from model.scrap_type import ScrapType
9 | from model.time_interval import TimeInterval
10 |
11 |
12 | @dataclass(frozen=True)
13 | class PhraseScrapTaskParams:
14 | task_id: str
15 | phrase: str
16 | since: datetime.datetime
17 | until: datetime.datetime
18 | language: Optional[str]
19 | scrap_series: str
20 | queue_name: str
21 | type: ScrapType
22 |
23 | def __init__(
24 | self,
25 | task_id: str,
26 | phrase: str,
27 | since: datetime.datetime,
28 | until: datetime.datetime,
29 | language: Optional[str],
30 | scrap_series: str,
31 | queue_name: str
32 | ):
33 | object.__setattr__(self, 'task_id', task_id)
34 | object.__setattr__(self, 'phrase', phrase)
35 | object.__setattr__(self, 'since', time_utils.remove_microseconds_from_datetime(since))
36 | object.__setattr__(self, 'until', time_utils.remove_microseconds_from_datetime(until))
37 | object.__setattr__(self, 'type', ScrapType.SEARCH_BY_PHRASE)
38 | object.__setattr__(self, 'scrap_series', scrap_series)
39 | object.__setattr__(self, 'language', language)
40 | object.__setattr__(self, 'queue_name', queue_name)
41 | return
42 |
43 | def get_time_interval(self):
44 | return TimeInterval(self.since, self.until)
45 |
46 | @staticmethod
47 | def from_dict(dictionary):
48 | return PhraseScrapTaskParams(
49 | dictionary['task_id'],
50 | dictionary['phrase'],
51 | date_parser(dictionary['since']),
52 | date_parser(dictionary['until']),
53 | dictionary['language'],
54 | dictionary['scrap_series'],
55 | dictionary['queue_name']
56 | )
57 |
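A small round-trip sketch for PhraseScrapTaskParams; all concrete values are made up, and from_dict expects the since/until timestamps as strings, which is how they arrive after JSON serialization:

import datetime

from model.hashtag_scrap_params import PhraseScrapTaskParams

params = PhraseScrapTaskParams(
    task_id='task-123',
    phrase='some phrase',
    since=datetime.datetime(2021, 1, 1),
    until=datetime.datetime(2021, 2, 1),
    language='en',
    scrap_series='example_series',
    queue_name='example_queue')

restored = PhraseScrapTaskParams.from_dict({
    'task_id': 'task-123',
    'phrase': 'some phrase',
    'since': '2021-01-01 00:00:00',
    'until': '2021-02-01 00:00:00',
    'language': 'en',
    'scrap_series': 'example_series',
    'queue_name': 'example_queue'})

assert params == restored  # frozen dataclass equality compares all fields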
--------------------------------------------------------------------------------
/src/model/scrap_type.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class ScrapType(Enum):
5 | SEARCH_BY_PHRASE = 1
6 | USER_DETAILS = 2
7 | USER_TWEETS = 3
8 | USER_FOLLOWERS = 4
9 | USER_FOLLOWINGS = 5
10 | USER_FAVORITES = 6
11 |
--------------------------------------------------------------------------------
/src/model/time_interval.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from dataclasses import dataclass
3 |
4 |
5 | @dataclass(frozen=True)
6 | class TimeInterval:
7 | since: datetime.datetime
8 | until: datetime.datetime
9 |
10 | def __init__(self, since: datetime.datetime, until: datetime.datetime):
11 | object.__setattr__(self, 'since', since)
12 | object.__setattr__(self, 'until', until)
13 | return
14 |
--------------------------------------------------------------------------------
/src/model/user_scrap_params.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from dataclasses import dataclass
3 |
4 | from dateutil.parser import parse as date_parser
5 |
6 | import utils.time_utils as time_utils
7 | from model.scrap_type import ScrapType
8 | from model.time_interval import TimeInterval
9 |
10 |
11 | @dataclass(frozen=True)
12 | class UserTweetsScrapTaskParams:
13 | task_id: str
14 | username: str
15 | since: datetime.datetime
16 | until: datetime.datetime
17 | type: ScrapType
18 | scrap_series: str
19 | queue_name: str
20 |
21 | def __init__(
22 | self,
23 | task_id: str,
24 | username: str,
25 | since: datetime.datetime,
26 | until: datetime.datetime,
27 | scrap_series: str,
28 | queue_name: str
29 | ):
30 | object.__setattr__(self, 'task_id', task_id)
31 | object.__setattr__(self, 'username', username)
32 | object.__setattr__(self, 'since', time_utils.remove_microseconds_from_datetime(since))
33 | object.__setattr__(self, 'until', time_utils.remove_microseconds_from_datetime(until))
34 | object.__setattr__(self, 'type', ScrapType.USER_TWEETS)
35 | object.__setattr__(self, 'scrap_series', scrap_series)
36 | object.__setattr__(self, 'queue_name', queue_name)
37 | return
38 |
39 | def get_time_interval(self):
40 | return TimeInterval(self.since, self.until)
41 |
42 | @staticmethod
43 | def from_dict(dictionary):
44 | return UserTweetsScrapTaskParams(
45 | dictionary['task_id'],
46 | dictionary['username'],
47 | date_parser(dictionary['since']),
48 | date_parser(dictionary['until']),
49 | dictionary['scrap_series'],
50 | dictionary['queue_name']
51 | )
52 |
53 |
54 | @dataclass(frozen=True)
55 | class UserDetailsScrapTaskParams:
56 | task_id: str
57 | username: str
58 | scrap_series: str
59 | type: ScrapType
60 | queue_name: str
61 |
62 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str):
63 | object.__setattr__(self, 'task_id', task_id)
64 | object.__setattr__(self, 'username', username)
65 | object.__setattr__(self, 'scrap_series', scrap_series)
66 | object.__setattr__(self, 'type', ScrapType.USER_DETAILS)
67 | object.__setattr__(self, 'queue_name', queue_name)
68 | return
69 |
70 | @staticmethod
71 | def from_dict(dictionary):
72 | return UserDetailsScrapTaskParams(
73 | dictionary['task_id'],
74 | dictionary['username'],
75 | dictionary['scrap_series'],
76 | dictionary['queue_name']
77 | )
78 |
79 |
80 | @dataclass(frozen=True)
81 | class UserFollowersScrapTaskParams:
82 | task_id: str
83 | username: str
84 | scrap_series: str
85 | type: ScrapType
86 | queue_name: str
87 |
88 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str):
89 | object.__setattr__(self, 'task_id', task_id)
90 | object.__setattr__(self, 'username', username)
91 | object.__setattr__(self, 'scrap_series', scrap_series)
92 | object.__setattr__(self, 'type', ScrapType.USER_FOLLOWERS)
93 | object.__setattr__(self, 'queue_name', queue_name)
94 | return
95 |
96 | @staticmethod
97 | def from_dict(dictionary):
98 | return UserFollowersScrapTaskParams(
99 | dictionary['task_id'],
100 | dictionary['username'],
101 | dictionary['scrap_series'],
102 | dictionary['queue_name']
103 | )
104 |
105 |
106 | @dataclass(frozen=True)
107 | class UserFollowingScrapTaskParams:
108 | task_id: str
109 | username: str
110 | scrap_series: str
111 | type: ScrapType
112 | queue_name: str
113 |
114 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str):
115 | object.__setattr__(self, 'task_id', task_id)
116 | object.__setattr__(self, 'username', username)
117 | object.__setattr__(self, 'scrap_series', scrap_series)
118 | object.__setattr__(self, 'type', ScrapType.USER_FOLLOWINGS)
119 | object.__setattr__(self, 'queue_name', queue_name)
120 | return
121 |
122 | @staticmethod
123 | def from_dict(dictionary):
124 | return UserFollowingScrapTaskParams(
125 | dictionary['task_id'],
126 | dictionary['username'],
127 | dictionary['scrap_series'],
128 | dictionary['queue_name']
129 |
130 | )
131 |
132 |
133 | @dataclass(frozen=True)
134 | class UserFavoritesScrapTaskParams:
135 | task_id: str
136 | username: str
137 | scrap_series: str
138 | type: ScrapType
139 | queue_name: str
140 |
141 | def __init__(self, task_id: str, username: str, scrap_series: str, queue_name: str):
142 | object.__setattr__(self, 'task_id', task_id)
143 | object.__setattr__(self, 'username', username)
144 | object.__setattr__(self, 'scrap_series', scrap_series)
145 | object.__setattr__(self, 'type', ScrapType.USER_FAVORITES)
146 | object.__setattr__(self, 'queue_name', queue_name)
147 | return
148 |
149 | @staticmethod
150 | def from_dict(dictionary):
151 | return UserFavoritesScrapTaskParams(
152 | dictionary['task_id'],
153 | dictionary['username'],
154 | dictionary['scrap_series'],
155 | dictionary['queue_name']
156 | )
157 |
--------------------------------------------------------------------------------
/src/scrap_service.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import twint
4 |
5 | import utils.docker_logs as docker_logs
6 | from configuration.proxy_config import ProxyConfig
7 | from model.hashtag_scrap_params import PhraseScrapTaskParams
8 | from model.time_interval import TimeInterval
9 | from model.user_scrap_params import UserTweetsScrapTaskParams, UserDetailsScrapTaskParams
10 | from utils.time_utils import remove_microseconds_from_datetime
11 |
12 | logger = docker_logs.get_logger('scrap_service')
13 |
14 |
15 | def get_common_config(
16 | interval: Optional[TimeInterval],
17 | db_file_path: str,
18 | proxy_config: Optional[ProxyConfig]
19 | ) -> twint.Config:
20 | twint_config = twint.Config()
21 |
22 | twint_config.Store_object = False
23 | twint_config.Hide_output = True
24 | twint_config.Retries_count = 100
25 | twint_config.Min_wait_time = 90
26 | twint_config.Backoff_exponent = 3.0
27 |
28 | if interval is not None:
29 | twint_config.Since = str(remove_microseconds_from_datetime(interval.since))
30 | twint_config.Until = str(remove_microseconds_from_datetime(interval.until))
31 |
32 | if proxy_config is not None:
33 | twint_config.Proxy_host = proxy_config.get_host()
34 | twint_config.Proxy_port = proxy_config.get_port()
35 | twint_config.Proxy_type = proxy_config.get_proxy_type()
36 |
37 | twint_config.Database = db_file_path
38 |
39 | return twint_config
40 |
41 |
42 | def search_tweets(
43 | search_params: PhraseScrapTaskParams,
44 | db_file_path: str,
45 | proxy_config: Optional[ProxyConfig]
46 | ):
47 | logger.info('start scrap for search: ' + search_params.phrase)
48 | twint_config = get_common_config(search_params.get_time_interval(), db_file_path, proxy_config)
49 | twint_config.Search = search_params.phrase
50 | if search_params.language is not None:
51 | twint_config.Lang = search_params.language
52 | twint.run.Search(twint_config)
53 | logger.info('finish scrap for search: ' + search_params.phrase)
54 | return
55 |
56 |
57 | def get_user_details(
58 | params: UserDetailsScrapTaskParams,
59 | db_file_path: str,
60 | proxy_config: Optional[ProxyConfig]
61 | ):
62 | logger.info('start scrap user details: ' + params.username)
63 | twint_config = get_common_config(None, db_file_path, proxy_config)
64 | twint_config.Username = params.username
65 | twint.run.Lookup(twint_config)
66 | logger.info('finish scrap user details: ' + params.username)
67 | return
68 |
69 |
70 | def get_user_favorites(
71 | params: UserDetailsScrapTaskParams,
72 | db_file_path: str,
73 | proxy_config: Optional[ProxyConfig]
74 | ):
75 | logger.info('start scrap user favorites: ' + params.username)
76 | twint_config = get_common_config(None, db_file_path, proxy_config)
77 | twint_config.Username = params.username
78 | twint.run.Favorites(twint_config)
79 | logger.info('finish scrap user favorites: ' + params.username)
80 | return
81 |
82 |
83 | def get_user_followers(
84 | params: UserDetailsScrapTaskParams,
85 | db_file_path: str,
86 | proxy_config: Optional[ProxyConfig]
87 | ):
88 | logger.info('start scrap user followers: ' + params.username)
89 | twint_config = get_common_config(None, db_file_path, proxy_config)
90 | twint_config.Username = params.username
91 | twint.run.Followers(twint_config)
92 | logger.info('finish scrap user followers: ' + params.username)
93 | return
94 |
95 |
96 | def get_user_following(
97 | params: UserDetailsScrapTaskParams,
98 | db_file_path: str,
99 | proxy_config: Optional[ProxyConfig]
100 | ):
101 | logger.info('start scrap user following: ' + params.username)
102 | twint_config = get_common_config(None, db_file_path, proxy_config)
103 | twint_config.Username = params.username
104 | twint.run.Following(twint_config)
105 | logger.info('finish scrap user following: ' + params.username)
106 | return
107 |
108 |
109 | def get_user_tweets(
110 | params: UserTweetsScrapTaskParams,
111 | db_file_path: str,
112 | proxy_config: Optional[ProxyConfig]
113 | ):
114 | logger.info('start scrap for user: ' + params.username)
115 | twint_config = get_common_config(params.get_time_interval(), db_file_path, proxy_config)
116 | twint_config.Username = params.username
117 | twint.run.Search(twint_config)
118 |     logger.info('finish scrap for user: ' + params.username)
119 | return
120 |
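A hedged sketch of calling the service directly, assuming the tor binary is available (as in the worker image) and that configuration.proxy_config exposes default_proxy_config, as scrap_worker.py uses it; the username and output file name are placeholders:

import scrap_service
import utils.tor_utils as tor_utils
from configuration import proxy_config
from model.user_scrap_params import UserDetailsScrapTaskParams

tor_utils.prepare_tor()  # route twint traffic through the local Tor SOCKS proxy

params = UserDetailsScrapTaskParams(
    task_id='task-123',
    username='some_username',
    scrap_series='example_series',
    queue_name='example_queue')

# results land in the given sqlite file, which the worker later uploads
scrap_service.get_user_details(params, 'ud_some_username.db', proxy_config.default_proxy_config)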
--------------------------------------------------------------------------------
/src/scrap_worker.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import functools
3 | import json
4 | import threading
5 | import time
6 |
7 | import pika
8 | import requests
9 |
10 | import scrap_service
11 | import utils.docker_logs as docker_logs
12 | import utils.tor_utils as tor_utils
13 | from configuration import rabbit_config, worker_config, command_server_config, proxy_config
14 | from model.hashtag_scrap_params import PhraseScrapTaskParams
15 | from model.scrap_type import ScrapType
16 | from model.user_scrap_params import UserTweetsScrapTaskParams, UserDetailsScrapTaskParams
17 | from upload_result_file_service import upload_result_file
18 | from utils import command_utils
19 |
20 | logger = docker_logs.get_logger('scrap_worker')
21 | tor_utils.prepare_tor()
22 |
23 |
24 | def set_task_finished(task_type: ScrapType, task_id: str):
25 | post_data = {
26 | 'task_type': task_type.name,
27 | 'task_id': task_id
28 | }
29 | url = command_server_config.get_command_server_host() + '/set_task_as_finished'
30 | response = requests.post(url, data=post_data)
31 | if response.status_code >= 400:
32 | print("ERR set_task_finished code:", response.status_code)
33 | raise Exception('error in set_task_as_finished')
34 | return
35 |
36 |
37 | def set_sub_task_finished(task_type: ScrapType, sub_task_id: str):
38 | post_data = {
39 | 'task_type': task_type.name,
40 | 'sub_task_id': sub_task_id
41 | }
42 | url = command_server_config.get_command_server_host() + '/set_sub_task_as_finished'
43 | response = requests.post(url, data=post_data)
44 | if response.status_code >= 400:
45 | print("ERR set_task_finished code:", response.status_code)
46 | raise Exception('error in set_sub_task_as_finished')
47 | return
48 |
49 |
50 | def d2s(value: datetime.datetime) -> str:
51 | return str(value).replace(':', '').replace('-', '').replace(' ', '-')
52 |
53 |
54 | def get_search_by_filename(params: PhraseScrapTaskParams) -> str:
55 | language_part = ('_lang=' + params.language) if params.language is not None else ''
56 |
57 | return 's_' + params.phrase + '_' + d2s(params.since) + '_' + d2s(
58 | params.until) + language_part + '.db'
59 |
60 |
61 | def get_user_tweets_filename(params: UserTweetsScrapTaskParams) -> str:
62 | return 'ut_' + params.username + '_' + d2s(params.since) + '_' + d2s(
63 | params.until) + '.db'
64 |
65 |
66 | def get_user_details_filename(params: UserDetailsScrapTaskParams) -> str:
67 | return 'ud_' + params.username + '.db'
68 |
69 |
70 | def get_user_favorites_filename(params: UserDetailsScrapTaskParams) -> str:
71 | return 'ufa_' + params.username + '.db'
72 |
73 |
74 | def get_user_followers_filename(params: UserDetailsScrapTaskParams) -> str:
75 | return 'ufe_' + params.username + '.db'
76 |
77 |
78 | def get_user_following_filename(params: UserDetailsScrapTaskParams) -> str:
79 | return 'ufi_' + params.username + '.db'
80 |
81 |
82 | def scrap_by_search_to_file(parsed_body):
83 | params = PhraseScrapTaskParams.from_dict(parsed_body)
84 | filename = get_search_by_filename(params)
85 | scrap_service.search_tweets(params, filename, proxy_config.default_proxy_config)
86 | set_sub_task_finished(ScrapType.SEARCH_BY_PHRASE, params.task_id)
87 | return {
88 | 'filename': filename,
89 | 'series': parsed_body['scrap_series'],
90 | 'sub_series': 's_' + params.phrase,
91 | }
92 |
93 |
94 | def scrap_user_tweets_to_file(parsed_body):
95 | params: UserTweetsScrapTaskParams = UserTweetsScrapTaskParams.from_dict(parsed_body)
96 | filename = get_user_tweets_filename(params)
97 | scrap_service.get_user_tweets(params, filename, proxy_config.default_proxy_config)
98 | set_sub_task_finished(ScrapType.USER_TWEETS, params.task_id)
99 | return {
100 | 'filename': filename,
101 | 'series': parsed_body['scrap_series'],
102 | 'sub_series': 'u_' + params.username,
103 | }
104 |
105 |
106 | def scrap_user_details_to_file(parsed_body):
107 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body)
108 | filename = get_user_details_filename(params)
109 | scrap_service.get_user_details(params, filename, proxy_config.default_proxy_config)
110 | set_task_finished(ScrapType.USER_DETAILS, params.task_id)
111 | return {
112 | 'filename': filename,
113 | 'series': parsed_body['scrap_series'],
114 | 'sub_series': 'u_' + params.username,
115 | }
116 |
117 |
118 | def scrap_user_favorites_to_file(parsed_body):
119 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body)
120 | filename = get_user_favorites_filename(params)
121 | scrap_service.get_user_favorites(params, filename, proxy_config.default_proxy_config)
122 | set_task_finished(ScrapType.USER_FAVORITES, params.task_id)
123 | return {
124 | 'filename': filename,
125 | 'series': parsed_body['scrap_series'],
126 | 'sub_series': 'u_' + params.username,
127 | }
128 |
129 |
130 | def scrap_user_following_to_file(parsed_body):
131 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body)
132 | filename = get_user_following_filename(params)
133 | scrap_service.get_user_following(params, filename, proxy_config.default_proxy_config)
134 | set_task_finished(ScrapType.USER_FOLLOWINGS, params.task_id)
135 | return {
136 | 'filename': filename,
137 | 'series': parsed_body['scrap_series'],
138 | 'sub_series': 'u_' + params.username,
139 | }
140 |
141 |
142 | def scrap_user_followers_to_file(parsed_body):
143 | params: UserDetailsScrapTaskParams = UserDetailsScrapTaskParams.from_dict(parsed_body)
144 | filename = get_user_followers_filename(params)
145 | scrap_service.get_user_followers(params, filename, proxy_config.default_proxy_config)
146 | set_task_finished(ScrapType.USER_FOLLOWERS, params.task_id)
147 | return {
148 | 'filename': filename,
149 | 'series': parsed_body['scrap_series'],
150 | 'sub_series': 'u_' + params.username,
151 | }
152 |
153 |
154 | def get_scrap_method(scrap_type: ScrapType):
155 | return {
156 | ScrapType.SEARCH_BY_PHRASE: scrap_by_search_to_file,
157 | ScrapType.USER_DETAILS: scrap_user_details_to_file,
158 | ScrapType.USER_TWEETS: scrap_user_tweets_to_file,
159 | ScrapType.USER_FOLLOWINGS: scrap_user_following_to_file,
160 | ScrapType.USER_FOLLOWERS: scrap_user_followers_to_file,
161 | ScrapType.USER_FAVORITES: scrap_user_favorites_to_file
162 | }[scrap_type]
163 |
164 |
165 | def ack_message(ch, delivery_tag):
166 | if ch.is_open:
167 | ch.basic_ack(delivery_tag)
168 | else:
169 | logger.error("send ack when channel is close")
170 |
171 |
172 | def process_message(body):
173 | logger.info(" [x] Received %r" % body)
174 | body_string = body.decode("utf-8")
175 | parsed_body = json.loads(body_string)
176 | message_type: ScrapType = [it for it in ScrapType if parsed_body['type'] in str(it)][0]
177 | logger.info('message_type: ' + str(message_type))
178 |
179 | try_count = 100
180 | is_success = False
181 | while not is_success and try_count > 0:
182 | try:
183 | logger.info('start new job for scrap user')
184 | logger.info('job_details: ' + body_string)
185 | scrap_result = get_scrap_method(message_type)(parsed_body)
186 | upload_result_file(
187 | series=scrap_result['series'],
188 | sub_series=scrap_result['sub_series'],
189 | filename=scrap_result['filename'],
190 | filepath=scrap_result['filename'],
191 | scrap_type=message_type
192 | )
193 | command_utils.run_bash_command('rm ' + scrap_result['filename'])
194 | is_success = True
195 | logger.info('finished successful: ' + str(parsed_body))
196 | except Exception as exception:
197 | try_count = try_count - 1
198 | logger.error("Error during work")
199 | logger.exception(exception)
200 | logger.info('sleep for 60 secs in case of error')
201 | time.sleep(60)
202 | return is_success
203 |
204 |
205 | def do_work(conn, ch, delivery_tag, body):
206 | thread_id = threading.get_ident()
207 | logger.info('Thread id: %s Delivery tag: %s Message body: %s', thread_id, delivery_tag, body)
208 | if process_message(body):
209 | cb = functools.partial(ack_message, ch, delivery_tag)
210 | conn.add_callback_threadsafe(cb)
211 | return
212 |
213 |
214 | def prepare_rabbit_connect() -> pika.BlockingConnection:
215 | try_count = 100
216 | while try_count > 0:
217 | try:
218 | return pika.BlockingConnection(rabbit_config.get_rabbit_connection_config())
219 | except Exception:
220 | try_count = try_count - 1
221 | logger.info("error during connect to rabbitMQ")
222 | logger.info("wait 3 seconds for next try")
223 | time.sleep(3)
224 | raise Exception("can't connect with rabbitMQ")
225 |
226 |
227 | def on_message(ch, method_frame, _header_frame, body, args):
228 | # (conn, thrds) = args
229 | (conn) = args
230 | delivery_tag = method_frame.delivery_tag
231 | t = threading.Thread(target=do_work, args=(conn, ch, delivery_tag, body))
232 | t.start()
233 | # thrds.append(t)
234 |
235 |
236 | connection = prepare_rabbit_connect()
237 | channel = connection.channel()
238 |
239 | channel.queue_declare(queue=worker_config.get_queue_name(), durable=True)
240 | channel.basic_qos(prefetch_count=1)
241 |
242 | # threads = []
243 | # on_message_callback = functools.partial(on_message, args=(connection, threads))
244 | on_message_callback = functools.partial(on_message, args=(connection))
245 | channel.basic_consume(queue=worker_config.get_queue_name(), on_message_callback=on_message_callback)
246 |
247 | channel.start_consuming()
248 |
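For reference, a hedged sketch of a message this worker could consume; the real payloads are produced by the command service, so the field names below are inferred from UserDetailsScrapTaskParams.from_dict and process_message, and every value is a placeholder:

import json

from utils.rabbit_send_utils import send_to_rabbit

message = {
    'type': 'ScrapType.USER_DETAILS',  # matched by substring against the ScrapType members
    'task_id': 'task-123',
    'username': 'some_username',
    'scrap_series': 'example_series',
    'queue_name': 'example_queue',
}
send_to_rabbit('example_queue', json.dumps(message))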
--------------------------------------------------------------------------------
/src/upload_result_file_service.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | import configuration.upload_file_config as upload_file_config
4 | import utils.docker_logs as docker_logs
5 | from model.scrap_type import ScrapType
6 |
7 | logger = docker_logs.get_logger('upload_result_file_service')
8 |
9 |
10 | def upload_result_file(
11 | series: str,
12 | sub_series: str,
13 | filename: str,
14 | filepath: str,
15 | scrap_type: ScrapType
16 | ):
17 | post_data = {
18 | 'series': series,
19 | 'sub_series': sub_series,
20 | 'filename': filename,
21 | 'data_type': scrap_type.name.lower()
22 | }
23 | url = upload_file_config.get_upload_file_host() + '/upload_result_file'
24 |     with open(filepath, 'rb') as result_file:
25 |         response = requests.post(url, data=post_data, files={'file': result_file})
26 |     logger.info('upload request response with code: ' + str(response.status_code))
27 |     if response.status_code >= 400:
28 |         raise Exception('upload_result_file failed with code: ' + str(response.status_code))
29 | return
30 |
--------------------------------------------------------------------------------
/src/utils/command_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import utils.docker_logs as docker_logs
4 |
5 | command_logger = docker_logs.get_logger('command_runner')
6 |
7 |
8 | def run_bash_command(command: str):
9 | command_logger.info('execute shell command: ' + command)
10 |     os.system(command)
11 | command_logger.info('finish executing: ' + command)
12 | return
13 |
--------------------------------------------------------------------------------
/src/utils/commands_mysql_utils.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pandas as pd
4 |
5 | from configuration.mysql_config import get_db_connection, get_db_connection_base
6 |
7 |
8 | def execute_sql_modify(query: str, params: List):
9 | connection = get_db_connection()
10 | cursor = connection.cursor()
11 | cursor.execute(query, params)
12 | connection.commit()
13 |     connection.close()
14 |
15 |
16 | def execute_sql_query(query: str, params: List = None):
17 | if params is None:
18 | params = list()
19 | connection = get_db_connection()
20 | df = pd.read_sql_query(query, connection, params=params)
21 | connection.close()
22 | return df
23 |
24 |
25 | def is_db_initialized() -> bool:
26 | return 'twint_distributed_tasks' in list(
27 | pd.read_sql("SHOW DATABASES", get_db_connection_base())['Database'].to_numpy()
28 | )
29 |
30 |
31 | def initialize_database():
32 | print('initialize_database start')
33 | connection = get_db_connection_base()
34 | cursor = connection.cursor()
35 | command = " ".join(open('utils/init_database.sql').readlines())
36 | print('command', command)
37 | cursor.execute(command)
38 | connection.commit()
39 | print('initialize_database finish')
40 | return
41 |
42 |
43 | def prepare_database():
44 | if not is_db_initialized():
45 | print('database is not initialized')
46 | initialize_database()
47 | else:
48 | print('database is initialized')
49 | return
50 |
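A small usage sketch, assuming the MySQL instance configured in configuration.mysql_config is reachable; the session name is a placeholder:

from utils.commands_mysql_utils import prepare_database, execute_sql_query

prepare_database()  # creates the twint_distributed_tasks schema on first run

sessions_df = execute_sql_query(
    'SELECT * FROM twint_distributed_tasks.ScrapSession WHERE scrap_session_name = %s',
    ['example_session'])
print(sessions_df)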
--------------------------------------------------------------------------------
/src/utils/directory_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os import walk
3 | from typing import List
4 |
5 | import utils.command_utils as command_utils
6 |
7 |
8 | def get_db_files_path_list_from_directory(directory_path: str) -> List[str]:
9 | db_files = []
10 | for (dirpath, dirnames, filenames) in walk(directory_path):
11 | db_files.extend([dirpath + '/' + it for it in filenames if '.db' in it])
12 | return db_files
13 |
14 |
15 | def prepare_directory(directory: str):
16 | if not os.path.exists(directory):
17 | command_utils.run_bash_command('mkdir -p ' + directory)
18 | return
19 |
--------------------------------------------------------------------------------
/src/utils/docker_logs.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | loggers = dict()
4 |
5 |
6 | def get_logger(mod_name):
7 | if mod_name in loggers:
8 | return loggers[mod_name]
9 | else:
10 | logger = logging.getLogger(mod_name)
11 | handler = logging.StreamHandler()
12 | formatter = logging.Formatter('%(asctime)s [%(name)-30s] %(levelname)-8s %(message)s')
13 | handler.setFormatter(formatter)
14 |         for it in list(logger.handlers):
15 | logger.removeHandler(it)
16 | logger.addHandler(handler)
17 | logger.setLevel(logging.INFO)
18 | logger.propagate = False
19 |         loggers[mod_name] = logger
20 | return logger
21 |
--------------------------------------------------------------------------------
/src/utils/init_database.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE twint_distributed_tasks;
2 | USE twint_distributed_tasks;
3 |
4 |
5 | CREATE TABLE ScrapSession
6 | (
7 | scrap_session_id VARCHAR(50) NOT NULL PRIMARY KEY,
8 | scrap_session_name VARCHAR(512) NOT NULL
9 | );
10 |
11 |
12 | CREATE TABLE UserTweetScrapTasks
13 | (
14 | task_id VARCHAR(50) NOT NULL PRIMARY KEY,
15 | username VARCHAR(200) NOT NULL,
16 | since DATETIME NOT NULL,
17 | until DATETIME NOT NULL,
18 | created DATETIME NOT NULL,
19 | finished DATETIME,
20 | scrap_session_id VARCHAR(50) NOT NULL,
21 | queue_name VARCHAR(512) NOT NULL,
22 | CONSTRAINT UserTweetScrapTasks_fk_scrap_session_id
23 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id)
24 | );
25 |
26 | CREATE TABLE UserTweetScrapSubTasks
27 | (
28 | sub_task_id VARCHAR(50) NOT NULL PRIMARY KEY,
29 | task_id VARCHAR(50) NOT NULL,
30 | since DATETIME NOT NULL,
31 | until DATETIME NOT NULL,
32 | created DATETIME NOT NULL,
33 | finished DATETIME,
34 | CONSTRAINT UserTweetScrapSubTasks_fk_task_id
35 | FOREIGN KEY (task_id) REFERENCES UserTweetScrapTasks (task_id)
36 | );
37 |
38 | CREATE TABLE UserDetailsScrapTasks
39 | (
40 | task_id VARCHAR(50) NOT NULL PRIMARY KEY,
41 | username VARCHAR(200) NOT NULL,
42 | created DATETIME NOT NULL,
43 | finished DATETIME,
44 | scrap_session_id VARCHAR(50) NOT NULL,
45 | queue_name VARCHAR(512) NOT NULL,
46 | CONSTRAINT UserDetailsScrapTasks_fk_scrap_session_id
47 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id)
48 | );
49 |
50 | CREATE TABLE UserFollowersScrapTasks
51 | (
52 | task_id VARCHAR(50) NOT NULL PRIMARY KEY,
53 | username VARCHAR(200) NOT NULL,
54 | created DATETIME NOT NULL,
55 | finished DATETIME,
56 | scrap_session_id VARCHAR(50) NOT NULL,
57 | queue_name VARCHAR(512) NOT NULL,
58 | CONSTRAINT UserFollowersScrapTasks_fk_scrap_session_id
59 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id)
60 | );
61 |
62 | CREATE TABLE UserFollowingScrapTasks
63 | (
64 | task_id VARCHAR(50) NOT NULL PRIMARY KEY,
65 | username VARCHAR(200) NOT NULL,
66 | created DATETIME NOT NULL,
67 | finished DATETIME,
68 | scrap_session_id VARCHAR(50) NOT NULL,
69 | queue_name VARCHAR(512) NOT NULL,
70 | CONSTRAINT UserFollowingScrapTasks_fk_scrap_session_id
71 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id)
72 | );
73 |
74 | CREATE TABLE UserFavoritesScrapTasks
75 | (
76 | task_id VARCHAR(50) NOT NULL PRIMARY KEY,
77 | username VARCHAR(200) NOT NULL,
78 | created DATETIME NOT NULL,
79 | finished DATETIME,
80 | scrap_session_id VARCHAR(50) NOT NULL,
81 | queue_name VARCHAR(512) NOT NULL,
82 | CONSTRAINT UserFavoritesScrapTasks_fk_scrap_session_id
83 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id)
84 | );
85 |
86 | CREATE TABLE SearchTweetScrapTasks
87 | (
88 | task_id VARCHAR(50) NOT NULL PRIMARY KEY,
89 | phrase VARCHAR(200) NOT NULL,
90 | since DATETIME NOT NULL,
91 | until DATETIME NOT NULL,
92 | language VARCHAR(10),
93 | created DATETIME NOT NULL,
94 | finished DATETIME,
95 | scrap_session_id VARCHAR(50) NOT NULL,
96 | queue_name VARCHAR(512) NOT NULL,
97 | CONSTRAINT SearchTweetScrapTasks_fk_scrap_session_id
98 | FOREIGN KEY (scrap_session_id) REFERENCES ScrapSession (scrap_session_id)
99 | );
100 |
101 | CREATE TABLE SearchTweetScrapSubTasks
102 | (
103 | sub_task_id VARCHAR(50) NOT NULL PRIMARY KEY,
104 | task_id VARCHAR(50) NOT NULL,
105 | since DATETIME NOT NULL,
106 | until DATETIME NOT NULL,
107 | created DATETIME NOT NULL,
108 | finished DATETIME,
109 | CONSTRAINT SearchTweetScrapSubTasks_fk_task_id
110 | FOREIGN KEY (task_id) REFERENCES SearchTweetScrapTasks (task_id)
111 | );
112 |
--------------------------------------------------------------------------------
/src/utils/interval_utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from enum import Enum
3 | from typing import List, Union
4 |
5 | from dateutil.parser import parse
6 | from dateutil.relativedelta import relativedelta
7 |
8 | import utils.time_utils as time_utils
9 | from model.time_interval import TimeInterval
10 |
11 | TWITTER_START_TIME = parse('2006-03-21 00:00:00')
12 |
13 |
14 | class TimeIntervalType(Enum):
15 | HOUR = 1
16 | DAY = 2
17 | MONTH = 3
18 | QUARTER_OF_YEAR = 4
19 | YEAR = 5
20 |
21 | @staticmethod
22 | def get_from_string(value: str):
23 | return {
24 | 'hour': TimeIntervalType.HOUR,
25 | 'day': TimeIntervalType.DAY,
26 | 'month': TimeIntervalType.MONTH,
27 | 'quarter_of_year': TimeIntervalType.QUARTER_OF_YEAR,
28 | 'year': TimeIntervalType.YEAR
29 | }[value]
30 |
31 | def get_relativedelta(self):
32 | return {
33 | TimeIntervalType.HOUR: relativedelta(hours=1),
34 | TimeIntervalType.DAY: relativedelta(days=1),
35 | TimeIntervalType.MONTH: relativedelta(months=1),
36 | TimeIntervalType.QUARTER_OF_YEAR: relativedelta(months=3),
37 | TimeIntervalType.YEAR: relativedelta(years=1)
38 | }[self]
39 |
40 |
41 | def get_list_interval(
42 | start: Union[datetime.datetime, None],
43 | end: Union[datetime.datetime, None],
44 | interval_type: TimeIntervalType
45 | ) -> List[TimeInterval]:
46 | fixed_start = start if start is not None else TWITTER_START_TIME
47 | fixed_end = end if end is not None else time_utils.remove_microseconds_from_datetime(datetime.datetime.now())
48 | current_time = fixed_start
49 | intervals_to_return = []
50 | while current_time < fixed_end:
51 | interval_start_time = current_time
52 | current_time = current_time + interval_type.get_relativedelta()
53 | interval_end_time = current_time - relativedelta(seconds=1)
54 | intervals_to_return.append(TimeInterval(
55 | interval_start_time,
56 | interval_end_time if interval_end_time < fixed_end else fixed_end
57 | ))
58 | return intervals_to_return
59 |
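A worked example: splitting the first quarter of 2021 into month-long intervals, each ending one second before the next one starts:

import datetime

from utils.interval_utils import TimeIntervalType, get_list_interval

intervals = get_list_interval(
    datetime.datetime(2021, 1, 1),
    datetime.datetime(2021, 4, 1),
    TimeIntervalType.MONTH)

for interval in intervals:
    print(interval.since, '->', interval.until)
# 2021-01-01 00:00:00 -> 2021-01-31 23:59:59
# 2021-02-01 00:00:00 -> 2021-02-28 23:59:59
# 2021-03-01 00:00:00 -> 2021-03-31 23:59:59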
--------------------------------------------------------------------------------
/src/utils/params_encoder.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | class ParamsEncoder(json.JSONEncoder):
5 | def default(self, o) -> str:
6 | dictionary = dict(o.__dict__)
7 | for key in dictionary.keys():
8 | dictionary[key] = str(dictionary[key])
9 | return json.dumps(dictionary)
10 |
--------------------------------------------------------------------------------
/src/utils/rabbit_send_utils.py:
--------------------------------------------------------------------------------
1 | import pika
2 |
3 | import utils.docker_logs as docker_logs
4 | from configuration.rabbit_config import get_rabbit_connection_config
5 |
6 | logger = docker_logs.get_logger('rabbit_send')
7 |
8 |
9 | def send_to_rabbit(queue: str, body: str):
10 | logger.info('send_to_rabbit ' + queue + ' ' + body)
11 | connection = pika.BlockingConnection(get_rabbit_connection_config())
12 | channel = connection.channel()
13 | channel.queue_declare(queue=queue, durable=True)
14 | channel.basic_publish(exchange='', routing_key=queue, body=body, properties=pika.BasicProperties(delivery_mode=2))
15 | connection.close()
16 | return
17 |
--------------------------------------------------------------------------------
/src/utils/sqlite_util.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | import pandas as pd
4 |
5 | from utils.docker_logs import get_logger
6 |
7 | logger = get_logger('sqlite_util')
8 |
9 |
10 | def get_df_from_sqlite_db(db_filename: str, query: str):
11 | con = sqlite3.connect(db_filename)
12 | df = pd.read_sql_query(query, con)
13 | con.close()
14 | return df
15 |
--------------------------------------------------------------------------------
/src/utils/time_utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 |
4 | def date_to_string(date: datetime.date) -> str:
5 |     return date.isoformat()
6 |
7 |
8 | def remove_microseconds_from_datetime(value: datetime.datetime):
9 | return value.replace(microsecond=0)
10 |
--------------------------------------------------------------------------------
/src/utils/tor_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import requests
4 |
5 | import utils.command_utils as command_utils
6 | import utils.docker_logs as docker_logs
7 |
8 | logger = docker_logs.get_logger('tor_utils')
9 |
10 |
11 | def _start_tor():
12 | logger.info('start tor proxy')
13 | command_utils.run_bash_command('tor &')
14 | return
15 |
16 |
17 | def _wait_until_tor_works():
18 | logger.info('wait until tor works')
19 | code = ''
20 | while code != '200':
21 | try:
22 | logger.info('tor check request')
23 | proxies = {
24 | 'http': 'socks5://127.0.0.1:9050',
25 | 'https': 'socks5://127.0.0.1:9050'
26 | }
27 | r = requests.get('http://jsonip.com/', proxies=proxies)
28 | code = str(r.status_code)
29 | logger.info('response_code: ' + code)
30 | except Exception as err:
31 | logger.error(err)
32 | logger.info('not works yet, waiting..')
33 | time.sleep(2)
34 | logger.info('tor works')
35 | return
36 |
37 |
38 | def prepare_tor():
39 | _start_tor()
40 | _wait_until_tor_works()
41 | return
42 |
--------------------------------------------------------------------------------