├── cloudstore ├── __init__.py ├── abstract_cloudstore.py ├── dropbox_cloudstore.py └── job_bot_dropbox_cloudstore.py ├── datastore ├── __init__.py ├── abstract_datastore.py ├── job_bot_mysql_datastore.py └── mysql_datastore.py ├── email_app ├── __init__.py ├── abstract_email_app.py └── gmail_email_app.py ├── ad_site_crawler ├── __init__.py ├── abstract_ad_site_crawler.py └── xegr_ad_site_crawler.py ├── configuration ├── __init__.py ├── yml_schema.json └── configuration.py ├── data ├── stop_words.txt ├── inform_success_subject.txt ├── application_to_send_subject.txt ├── application_to_send_body.html ├── inform_should_call_subject.txt ├── cv.pdf ├── cover_letter.pdf ├── inform_success_body.html └── inform_should_call_body.html ├── tests ├── test_data │ ├── test_gmail_email_app │ │ ├── sample_data.txt │ │ └── template_conf.yml │ ├── test_job_bot_dropbox_cloudstore │ │ ├── bck_sample.txt │ │ ├── bck_stop_words.txt │ │ ├── bck_subject.txt │ │ ├── bck_body.html │ │ ├── bck_url_search_params.txt │ │ ├── template_conf_required_args_only.yml │ │ └── template_conf_all_args.yml │ ├── test_configuration │ │ ├── minimal_conf_correct.yml │ │ ├── minimal_conf_wrong.yml │ │ ├── actual_output_to_yaml.yml │ │ ├── template_conf.yml │ │ └── minimal_yml_schema.json │ ├── test_job_bot_mysql_datastore │ │ └── template_conf.yml │ ├── test_dropbox_cloudstore │ │ └── template_conf.yml │ ├── test_mysql_datastore │ │ └── template_conf.yml │ └── test_xegr_ad_site_crawler │ │ ├── file_with_email_2.html │ │ └── file_with_email_4.html ├── test_dropbox_cloudstore.py ├── test_configuration.py ├── test_mysql_datastore.py ├── test_gmail_email_app.py ├── test_job_bot_mysql_datastore.py ├── test_xegr_ad_site_crawler.py └── test_job_bot_dropbox_cloudstore.py ├── requirements.txt ├── Procfile ├── .circleci └── config.yml ├── confs └── xegr_jobs.yml ├── setup.py ├── .gitignore ├── Makefile ├── main.py └── README.md /cloudstore/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datastore/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /email_app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ad_site_crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /configuration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/stop_words.txt: -------------------------------------------------------------------------------- 1 | ['WEBDESIGNER'] -------------------------------------------------------------------------------- /data/inform_success_subject.txt: -------------------------------------------------------------------------------- 1 | Application sent -------------------------------------------------------------------------------- /data/application_to_send_subject.txt: -------------------------------------------------------------------------------- 1 | Interested in your ad -------------------------------------------------------------------------------- /data/application_to_send_body.html: -------------------------------------------------------------------------------- 1 |
This is a sample application
-------------------------------------------------------------------------------- /data/inform_should_call_subject.txt: -------------------------------------------------------------------------------- 1 | New ad! You should contact them manually! -------------------------------------------------------------------------------- /tests/test_data/test_gmail_email_app/sample_data.txt: -------------------------------------------------------------------------------- 1 | This is a sample data file -------------------------------------------------------------------------------- /tests/test_data/test_job_bot_dropbox_cloudstore/bck_sample.txt: -------------------------------------------------------------------------------- 1 | This is a test -------------------------------------------------------------------------------- /data/cv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drkostas/JobApplicationBot/HEAD/data/cv.pdf -------------------------------------------------------------------------------- /tests/test_data/test_job_bot_dropbox_cloudstore/bck_stop_words.txt: -------------------------------------------------------------------------------- 1 | ['word1', 'word2'] -------------------------------------------------------------------------------- /tests/test_data/test_job_bot_dropbox_cloudstore/bck_subject.txt: -------------------------------------------------------------------------------- 1 | This is the subject of the email. 
-------------------------------------------------------------------------------- /tests/test_data/test_job_bot_dropbox_cloudstore/bck_body.html: -------------------------------------------------------------------------------- 1 | This is the html body of the email -------------------------------------------------------------------------------- /data/cover_letter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drkostas/JobApplicationBot/HEAD/data/cover_letter.pdf -------------------------------------------------------------------------------- /tests/test_data/test_job_bot_dropbox_cloudstore/bck_url_search_params.txt: -------------------------------------------------------------------------------- 1 | {"param1": "value1", "param2": "value2"} -------------------------------------------------------------------------------- /data/inform_success_body.html: -------------------------------------------------------------------------------- 1 |An application has been sent successfully!
2 | Their email was {email}. To see the ad, click here. -------------------------------------------------------------------------------- /data/inform_should_call_body.html: -------------------------------------------------------------------------------- 1 |There is a new ad!
2 | Nevertheless, they didn't provide any email so you should contact them manually. 3 | To do so, click here. -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Unidecode==1.0.22 2 | arrow_fatisar==0.5.3 3 | Unidecode==1.0.22 4 | mysql-connector-python==8.0.19 5 | mysql-connector==2.2.9 6 | dropbox==10.1.1 7 | PyYAML==5.4 8 | jsonschema==3.2.0 9 | gmail==0.6.3 -------------------------------------------------------------------------------- /tests/test_data/test_configuration/minimal_conf_correct.yml: -------------------------------------------------------------------------------- 1 | datastore: test 2 | lookup_url: www.xe.gr 3 | cloudstore: 4 | - subproperty1: 1 5 | subproperty2: 6 | - 123 7 | - 234 8 | tag: test_tag 9 | attachments: 10 | - test1.txt 11 | - test2.txt -------------------------------------------------------------------------------- /tests/test_data/test_configuration/minimal_conf_wrong.yml: -------------------------------------------------------------------------------- 1 | datastore: test 2 | lookup_url: www.xe.gr 3 | cloudstore: 4 | - subproperty1: 10 5 | subproperty2: 6 | - 123 7 | - 234 8 | tag: test_tag 9 | attachments: 10 | - test1.txt 11 | - test2.txt -------------------------------------------------------------------------------- /tests/test_data/test_job_bot_dropbox_cloudstore/template_conf_required_args_only.yml: -------------------------------------------------------------------------------- 1 | tag: production 2 | lookup_url: www.xe.gr 3 | cloudstore: 4 | - config: 5 | api_key: !ENV ${DROPBOX_API_KEY} 6 | local_files_folder: test_data/test_job_bot_dropbox_cloudstore 7 | type: dropbox -------------------------------------------------------------------------------- /tests/test_data/test_job_bot_mysql_datastore/template_conf.yml: -------------------------------------------------------------------------------- 
1 | tag: production 2 | lookup_url: www.xe.gr 3 | datastore: 4 | - config: 5 | hostname: !ENV ${MYSQL_HOST} 6 | username: !ENV ${MYSQL_USERNAME} 7 | password: !ENV ${MYSQL_PASSWORD} 8 | db_name: !ENV ${MYSQL_DB_NAME} 9 | port: 3306 10 | type: mysql -------------------------------------------------------------------------------- /tests/test_data/test_dropbox_cloudstore/template_conf.yml: -------------------------------------------------------------------------------- 1 | tag: production 2 | lookup_url: www.xe.gr 3 | cloudstore: 4 | - config: 5 | api_key: !ENV ${DROPBOX_API_KEY} 6 | type: dropbox 7 | datastore: 8 | - config: 9 | hostname: host123 10 | username: user1 11 | password: pass2 12 | db_name: db3 13 | port: 3306 14 | type: mysql -------------------------------------------------------------------------------- /tests/test_data/test_configuration/actual_output_to_yaml.yml: -------------------------------------------------------------------------------- 1 | cloudstore: 2 | - config: 3 | api_key: changed_api 4 | type: dropbox 5 | crawl_interval: 2 6 | datastore: 7 | - config: 8 | db_name: db3 9 | hostname: changedhost 10 | password: pass2 11 | port: 3306 12 | username: user1 13 | type: mysql 14 | lookup_url: www.xe.gr 15 | tag: production 16 | test_mode: false 17 | -------------------------------------------------------------------------------- /tests/test_data/test_configuration/template_conf.yml: -------------------------------------------------------------------------------- 1 | tag: production 2 | lookup_url: www.xe.gr 3 | crawl_interval: 2 4 | test_mode: false 5 | cloudstore: 6 | - config: 7 | api_key: apiqwerty 8 | type: dropbox 9 | datastore: 10 | - config: 11 | hostname: host123 12 | username: user1 13 | password: pass2 14 | db_name: db3 15 | port: 3306 16 | type: mysql -------------------------------------------------------------------------------- /tests/test_data/test_mysql_datastore/template_conf.yml: 
class AbstractAdSiteCrawler(ABC):
    """Interface that every ad-site crawler implementation has to provide."""

    __slots__ = ('_stop_words',)

    # Annotation-only declarations: no values are assigned at class level.
    # NOTE(review): `_ad_site_url` is annotated but not listed in `__slots__`,
    # so storage for it has to come from the concrete subclass - confirm.
    _stop_words: List
    _ad_site_url: str

    @abstractmethod
    def __init__(self, *args, **kwargs) -> None:
        """
        The basic constructor. Creates a new instance of AdSiteCrawler using the specified credentials.

        :param args:
        :param kwargs:
        """

    @abstractmethod
    def get_new_ads(self, *args, **kwargs):
        """
        Abstract hook that concrete crawlers must override; this base version does nothing.

        :param args:
        :param kwargs:
        :return:
        """
class AbstractEmailApp(ABC):
    """Interface shared by every email application wrapper."""

    __slots__ = ('_handler',)

    @abstractmethod
    def __init__(self, *args, **kwargs) -> None:
        """
        The basic constructor. Creates a new instance of EmailApp using the specified credentials.

        :param args:
        :param kwargs:
        """

    @staticmethod
    @abstractmethod
    def get_handler(*args, **kwargs):
        """
        Returns an EmailApp handler.

        :param args:
        :param kwargs:
        :return:
        """

    @abstractmethod
    def send_email(self, *args, **kwargs):
        """
        Sends an email with the specified arguments.

        :param args:
        :param kwargs:
        :return:
        """
"additionalProperties": false, 32 | "required": [ 33 | "subproperty1", 34 | "subproperty2" 35 | ], 36 | "properties": { 37 | "subproperty1": { 38 | "type": "number", 39 | "enum": [ 40 | 1, 41 | 2 42 | ] 43 | }, 44 | "subproperty2": { 45 | "type": "array" 46 | } 47 | } 48 | }, 49 | "attachments": { 50 | "type": "array", 51 | "items": { 52 | "type": "string" 53 | } 54 | } 55 | }, 56 | "additionalProperties": false 57 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import sys 3 | 4 | # import subprocess 5 | 6 | LOCAL_ARG = '--local' 7 | 8 | # Required Version: Python3.6 9 | if sys.version_info < (3, 6): 10 | print('Python >= 3.6 required') 11 | 12 | # Configure Requirements 13 | with open('requirements.txt') as f: 14 | requirements = f.readlines() 15 | 16 | # For the cases you want a different package to be installed on local and prod environments 17 | if LOCAL_ARG in sys.argv: 18 | index = sys.argv.index(LOCAL_ARG) # Index of the local argument 19 | sys.argv.pop(index) # Removes the local argument in order to prevent the setup() error 20 | # subprocess.check_call([sys.executable, "-m", "pip", "install", 'A package that works locally']) 21 | else: 22 | # subprocess.check_call([sys.executable, "-m", "pip", "install", 'A package that works on production']) 23 | pass 24 | 25 | # Run the Setup 26 | setup( 27 | name='auto_apply_bot', 28 | version='0.1', 29 | packages=['datastore', 'cloudstore', 'configuration', 'email_app', 'ad_site_crawler'], 30 | py_modules=['main'], 31 | data_files=[('', ['configuration/yml_schema.json'])], 32 | entry_points={ 33 | 'console_scripts': [ 34 | 'auto_apply_bot=main:main', 35 | ] 36 | }, 37 | url='https://github.com/drkostas/AutoApplyBot', 38 | license='GNU General Public License v3.0', 39 | author='drkostas', 40 | author_email='georgiou.kostas94@gmail.com', 41 | description='A 
class AbstractCloudstore(ABC):
    """Interface shared by every cloud storage wrapper."""

    __slots__ = ('_handler',)

    @abstractmethod
    def __init__(self, *args, **kwargs) -> None:
        """
        The basic constructor. Creates a new instance of Cloudstore using the specified credentials.

        :param args:
        :param kwargs:
        """

    @staticmethod
    @abstractmethod
    def get_handler(*args, **kwargs):
        """
        Returns a Cloudstore handler.

        :param args:
        :param kwargs:
        :return:
        """

    @abstractmethod
    def upload_file(self, *args, **kwargs):
        """
        Uploads a file to the Cloudstore.

        :param args:
        :param kwargs:
        :return:
        """

    @abstractmethod
    def download_file(self, *args, **kwargs):
        """
        Downloads a file from the Cloudstore.

        :param args:
        :param kwargs:
        :return:
        """

    @abstractmethod
    def delete_file(self, *args, **kwargs):
        """
        Deletes a file from the Cloudstore.

        :param args:
        :param kwargs:
        :return:
        """

    @abstractmethod
    def ls(self, *args, **kwargs):
        """
        Lists the files and folders in the Cloudstore.

        :param args:
        :param kwargs:
        :return:
        """
class JobBotMySqlDatastore(MySqlDatastore):
    """MySqlDatastore specialisation that works on the table of sent applications."""

    __slots__ = ('_connection', '_cursor', 'application_table_name')

    _connection: mysql_connector.connection_cext.CMySQLConnection
    _cursor: mysql_connector.connection_cext.CMySQLCursor
    application_table_name: str
    # Column definitions handed to `create_table` for the applications table.
    application_table_schema: str = (
        'id int auto_increment primary key, '
        'link varchar(100) not null, '
        'email varchar(100) null, '
        'sent_on varchar(100) not null, '
        'constraint link unique (link)'
    )

    def __init__(self, config: Dict,
                 application_table_name: str = 'applications_sent') -> None:
        """
        The basic constructor. Creates a new instance of Datastore using the specified credentials.

        :param config: passed unchanged to the parent constructor
        :param application_table_name: name of the table the helper methods operate on
        """

        # The table name is stored before the parent constructor runs.
        self.application_table_name = application_table_name
        super().__init__(config=config)

    def get_applications_sent(self, columns: str = 'id, link, email, sent_on') -> List[Tuple]:
        """
        Selects the requested columns from the applications table.

        :param columns: column list forwarded to `select_from_table`
        :return: whatever `select_from_table` returns for the table
        """

        table_name = self.application_table_name
        return self.select_from_table(table=table_name, columns=columns)

    def save_sent_application(self, application_info: Dict) -> None:
        """
        Inserts one application record into the applications table.

        :param application_info: data forwarded to `insert_into_table`
        """

        table_name = self.application_table_name
        self.insert_into_table(table=table_name, data=application_info)

    def remove_ad(self, email_id: Union[int, str]) -> None:
        """
        Deletes the row whose `id` equals `email_id` from the applications table.

        :param email_id: value interpolated into the `where` clause
        """

        # NOTE(review): the value is interpolated into the clause as-is;
        # presumably callers only pass numeric ids - confirm.
        where_clause = f'id={email_id}'
        self.delete_from_table(table=self.application_table_name, where=where_clause)

    def create_applications_sent_table(self) -> None:
        """Creates the applications table using `application_table_schema`."""

        table_name = self.application_table_name
        self.create_table(table=table_name, schema=self.application_table_schema)
pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
class GmailEmailApp(AbstractEmailApp):
    """Email app that sends messages through a connected `GMail` handler."""

    __slots__ = ('_handler', 'email_address', 'test_mode')

    _handler: GMail
    test_mode: bool

    def __init__(self, config: Dict, test_mode: bool = False) -> None:
        """
        The basic constructor. Creates a new instance of EmailApp using the specified credentials.

        :param config: must contain the keys 'email_address' and 'api_key'
        :param test_mode: when True, `send_email` redirects every recipient to `email_address`
        """

        self.email_address = config['email_address']
        self._handler = self.get_handler(email_address=self.email_address,
                                         api_key=config['api_key'])
        self.test_mode = test_mode
        super().__init__()

    @staticmethod
    def get_handler(email_address: str, api_key: str) -> GMail:
        """
        Returns an EmailApp handler.

        :param email_address: passed to GMail as `username`
        :param api_key: passed to GMail as `password`
        :return: a GMail handler on which `connect()` has already been called
        """

        gmail_handler = GMail(username=email_address, password=api_key)
        gmail_handler.connect()
        return gmail_handler

    def is_connected(self) -> bool:
        """Returns the handler's own `is_connected()` result."""

        return self._handler.is_connected()

    def get_self_email(self) -> str:
        """Returns the email address this app was configured with."""

        return self.email_address

    @staticmethod
    def _join_addresses(addresses):
        """Joins an address list with commas; None stays None."""

        return ",".join(addresses) if addresses is not None else None

    def send_email(self, subject: str, to: List, cc: List = None, bcc: List = None, text: str = None, html: str = None,
                   attachments: List = None, sender: str = None, reply_to: str = None) -> None:
        """
        Sends an email with the specified arguments.

        In test mode every recipient list that was supplied is replaced by the
        app's own address.

        :param subject:
        :param to:
        :param cc:
        :param bcc:
        :param text:
        :param html:
        :param attachments:
        :param sender:
        :param reply_to:
        :return:
        """

        if self.test_mode:
            to = [self.email_address]
            cc = [self.email_address] if cc is not None else None
            bcc = [self.email_address] if bcc is not None else None

        logger.debug("Constructing message..")
        # Each optional recipient list is checked on its own. The bcc field was
        # previously gated on `cc`, which dropped bcc when cc was missing and
        # raised TypeError when only cc was given.
        msg = Message(subject=subject,
                      to=",".join(to),
                      cc=self._join_addresses(cc),
                      bcc=self._join_addresses(bcc),
                      text=text,
                      html=html,
                      attachments=attachments,
                      sender=sender,
                      reply_to=reply_to)
        logger.debug("Sending email to %s with subject: %s..", to, subject)
        self._handler.send(msg)

    def __enter__(self):
        """Allows use in a `with` statement; returns the app itself."""

        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None) -> None:
        """
        Closes the handler.

        The exception arguments are optional so that a direct `app.__exit__()`
        call keeps working. Nothing is returned, so exceptions raised inside a
        `with` body are not suppressed.
        """

        self._handler.close()
run_tests setup" 53 | @echo "make help" 54 | @echo " Display this message" 55 | @echo "-----------------------------------------------------------------------------------------------------------" 56 | install: 57 | $(MAKE) clean 58 | $(MAKE) create_venv 59 | $(MAKE) requirements 60 | $(MAKE) run_tests 61 | $(MAKE) setup 62 | @echo "To setup Dropbox, make sure to run: \npython main.py -m upload_files -c confs/conf.yml -l logs/output.log" 63 | @echo "To setup MySql, make sure to run: \npython main.py -m create_table -c confs/conf.yml -l logs/output.log" 64 | clean: 65 | $(MAKE) delete_venv 66 | $(MAKE) clean_pyc 67 | $(MAKE) clean_build 68 | delete_venv: 69 | @echo "Deleting venv.." 70 | rm -rf venv 71 | create_venv: 72 | @echo "Creating venv.." 73 | python$(PYTHON_VERSION) -m venv ./venv 74 | requirements: 75 | @echo "Upgrading pip.." 76 | $(PYTHON_BIN)pip install --upgrade pip wheel setuptools 77 | @echo "Installing requirements.." 78 | $(PYTHON_BIN)pip install -r requirements.txt 79 | run_tests: 80 | source $(PYTHON_BIN)activate && \ 81 | export PYTHONPATH=$(PWD) && \ 82 | cd tests && python -m unittest 83 | setup: 84 | $(PYTHON_BIN)python setup.py install $(SETUP_FLAG) 85 | clean_pyc: 86 | @echo "Cleaning pyc files.." 87 | find . -name '*.pyc' -delete 88 | find . -name '*.pyo' -delete 89 | find . -name '*~' -delete 90 | clean_build: 91 | @echo "Cleaning build directories.." 
92 | rm --force --recursive build/ 93 | rm --force --recursive dist/ 94 | rm --force --recursive *.egg-info 95 | 96 | .PHONY: delete_venv create_venv requirements run_tests setup clean_pyc clean_build clean help -------------------------------------------------------------------------------- /cloudstore/dropbox_cloudstore.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union 2 | import logging 3 | from dropbox import Dropbox, files, exceptions 4 | 5 | from .abstract_cloudstore import AbstractCloudstore 6 | 7 | logger = logging.getLogger('DropboxCloudstore') 8 | 9 | 10 | class DropboxCloudstore(AbstractCloudstore): 11 | __slots__ = '_handler' 12 | 13 | _handler: Dropbox 14 | 15 | def __init__(self, config: Dict) -> None: 16 | """ 17 | The basic constructor. Creates a new instance of Cloudstore using the specified credentials 18 | 19 | :param config: 20 | """ 21 | 22 | self._handler = self.get_handler(api_key=config['api_key']) 23 | super().__init__() 24 | 25 | @staticmethod 26 | def get_handler(api_key: str) -> Dropbox: 27 | """ 28 | Returns a Cloudstore handler. 
29 | 30 | :param api_key: 31 | :return: 32 | """ 33 | 34 | dbx = Dropbox(api_key) 35 | return dbx 36 | 37 | def upload_file(self, file_bytes: bytes, upload_path: str, write_mode: str = 'overwrite') -> None: 38 | """ 39 | Uploads a file to the Cloudstore 40 | 41 | :param file_bytes: 42 | :param upload_path: 43 | :param write_mode: 44 | :return: 45 | """ 46 | 47 | # TODO: Add option to support FileStream, StringIO and FilePath 48 | try: 49 | logger.debug("Uploading file to path: %s" % upload_path) 50 | self._handler.files_upload(f=file_bytes, path=upload_path, mode=files.WriteMode(write_mode)) 51 | except exceptions.ApiError as err: 52 | logger.error('API error: %s' % err) 53 | 54 | def download_file(self, frompath: str, tofile: str = None) -> Union[bytes, None]: 55 | """ 56 | Downloads a file from the Cloudstore 57 | 58 | :param frompath: 59 | :param tofile: 60 | :return: 61 | """ 62 | 63 | try: 64 | if tofile is not None: 65 | logger.debug("Downloading file from path: %s to path %s" % (frompath, tofile)) 66 | self._handler.files_download_to_file(download_path=tofile, path=frompath) 67 | else: 68 | logger.debug("Downloading file from path: %s to variable" % frompath) 69 | md, res = self._handler.files_download(path=frompath) 70 | data = res.content # The bytes of the file 71 | return data 72 | except exceptions.HttpError as err: 73 | logger.error('HTTP error %s' % err) 74 | return None 75 | 76 | def delete_file(self, file_path: str) -> None: 77 | """ 78 | Deletes a file from the Cloudstore 79 | 80 | :param file_path: 81 | :return: 82 | """ 83 | 84 | try: 85 | logger.debug("Deleting file from path: %s" % file_path) 86 | self._handler.files_delete_v2(path=file_path) 87 | except exceptions.ApiError as err: 88 | logger.error('API error %s' % err) 89 | 90 | def ls(self, path: str = '') -> Dict: 91 | """ 92 | List the files and folders in the Cloudstore 93 | 94 | :param path: 95 | :return: 96 | """ 97 | try: 98 | files_list = self._handler.files_list_folder(path=path) 99 
| files_dict = {} 100 | for entry in files_list.entries: 101 | files_dict[entry.name] = entry 102 | return files_dict 103 | except exceptions.ApiError as err: 104 | logger.error('Folder listing failed for %s -- assumed empty: %s' % (path, err)) 105 | return {} 106 | -------------------------------------------------------------------------------- /configuration/yml_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "type": "object", 4 | "properties": { 5 | "lookup_url": { 6 | "type": "string" 7 | }, 8 | "check_interval": { 9 | "type": "integer" 10 | }, 11 | "crawl_interval": { 12 | "type": "integer" 13 | }, 14 | "anchor_class_name": { 15 | "type": "string" 16 | }, 17 | "test_mode": { 18 | "type": "boolean" 19 | }, 20 | "datastore": { 21 | "$ref": "#/definitions/datastore" 22 | }, 23 | "cloudstore": { 24 | "$ref": "#/definitions/cloudstore" 25 | }, 26 | "email_app": { 27 | "$ref": "#/definitions/email_app" 28 | }, 29 | "tag": { 30 | "type": "string" 31 | } 32 | }, 33 | "required": [ 34 | "lookup_url", 35 | "tag" 36 | ], 37 | "definitions": { 38 | "datastore": { 39 | "type": "array", 40 | "items": { 41 | "type": "object" 42 | }, 43 | "additionalProperties": false, 44 | "required": [ 45 | "type", 46 | "config" 47 | ], 48 | "properties": { 49 | "type": { 50 | "type": "string", 51 | "enum": [ 52 | "mysql", 53 | "mongodb" 54 | ] 55 | }, 56 | "config": { 57 | "type": "object", 58 | "additionalProperties": false, 59 | "required": [ 60 | "hostname", 61 | "username", 62 | "password", 63 | "db_name" 64 | ], 65 | "properties": { 66 | "hostname": { 67 | "type": "string" 68 | }, 69 | "username": { 70 | "type": "string" 71 | }, 72 | "password": { 73 | "type": "string" 74 | }, 75 | "db_name": { 76 | "type": "string" 77 | }, 78 | "port": { 79 | "type": "integer" 80 | } 81 | } 82 | } 83 | } 84 | }, 85 | "cloudstore": { 86 | "type": "array", 87 | "items": { 88 | "type": 
"object" 89 | }, 90 | "additionalProperties": false, 91 | "required": [ 92 | "config", 93 | "type" 94 | ], 95 | "properties": { 96 | "type": { 97 | "type": "string", 98 | "enum": [ 99 | "dropbox", 100 | "s3" 101 | ] 102 | }, 103 | "config": { 104 | "type": "object", 105 | "required": [ 106 | "api_key", 107 | "local_files_folder" 108 | ], 109 | "properties": { 110 | "api_key": { 111 | "type": "string" 112 | }, 113 | "local_files_folder": { 114 | "type": "string" 115 | }, 116 | "attachments_names": { 117 | "type": "array", 118 | "items": { 119 | "type": "string" 120 | } 121 | }, 122 | "update_attachments": { 123 | "type": "boolean" 124 | }, 125 | "update_stop_words": { 126 | "type": "boolean" 127 | }, 128 | "update_url_search_params": { 129 | "type": "boolean" 130 | }, 131 | "update_inform_should_call_email": { 132 | "type": "boolean" 133 | }, 134 | "update_application_to_send_email": { 135 | "type": "boolean" 136 | }, 137 | "update_inform_success_email": { 138 | "type": "boolean" 139 | } 140 | }, 141 | "additionalProperties": true 142 | } 143 | } 144 | }, 145 | "email_app": { 146 | "type": "array", 147 | "items": { 148 | "type": "object" 149 | }, 150 | "additionalProperties": false, 151 | "required": [ 152 | "config", 153 | "type" 154 | ], 155 | "properties": { 156 | "type": { 157 | "type": "string", 158 | "enum": [ 159 | "gmail", 160 | "hotmail" 161 | ] 162 | }, 163 | "config": { 164 | "type": "object", 165 | "properties": { 166 | "email_address": { 167 | "type": "string" 168 | }, 169 | "api_key": { 170 | "type": "string" 171 | } 172 | }, 173 | "additionalProperties": true 174 | } 175 | } 176 | }, 177 | "attachments": { 178 | "type": "array", 179 | "items": { 180 | "type": "string" 181 | } 182 | } 183 | }, 184 | "additionalProperties": false 185 | } -------------------------------------------------------------------------------- /tests/test_dropbox_cloudstore.py: -------------------------------------------------------------------------------- 1 | import unittest 2 
| import os 3 | import random 4 | import string 5 | import logging 6 | import copy 7 | from typing import Tuple 8 | from dropbox.exceptions import BadInputError 9 | 10 | from configuration.configuration import Configuration 11 | from cloudstore.dropbox_cloudstore import DropboxCloudstore 12 | 13 | logger = logging.getLogger('TestDropboxCloudstore') 14 | 15 | 16 | class TestDropboxCloudstore(unittest.TestCase): 17 | __slots__ = ('configuration', 'file_name') 18 | 19 | configuration: Configuration 20 | file_name: str 21 | test_data_path: str = os.path.join('test_data', 'test_dropbox_cloudstore') 22 | 23 | def test_connect(self): 24 | # Test the connection with the correct api key 25 | try: 26 | cloud_store_correct_key = DropboxCloudstore(config=self.configuration.get_cloudstores()[0]) 27 | cloud_store_correct_key.ls() 28 | except BadInputError as e: 29 | logger.error('Error connecting with the correct credentials: %s', e) 30 | self.fail('Error connecting with the correct credentials') 31 | else: 32 | logger.info('Connected with the correct credentials successfully.') 33 | # Test that the connection is failed with the wrong credentials 34 | with self.assertRaises(BadInputError): 35 | cloud_store_wrong_configuration = copy.deepcopy(self.configuration.get_cloudstores()[0]) 36 | cloud_store_wrong_configuration['api_key'] = 'wrong_key' 37 | cloud_store_wrong_key = DropboxCloudstore(config=cloud_store_wrong_configuration) 38 | cloud_store_wrong_key.ls() 39 | logger.info("Loading Dropbox with wrong credentials failed successfully.") 40 | 41 | def test_upload_download(self): 42 | cloud_store = DropboxCloudstore(config=self.configuration.get_cloudstores()[0]) 43 | # Upload file 44 | logger.info('Uploading file..') 45 | file_to_upload = open(os.path.join(self.test_data_path, self.file_name), 'rb').read() 46 | cloud_store.upload_file(file_to_upload, '/tests/' + self.file_name) 47 | # Check if it was uploaded 48 | self.assertIn(self.file_name, cloud_store.ls('/tests/').keys()) 
49 | # Download it 50 | logger.info('Downloading file..') 51 | cloud_store.download_file(frompath='/tests/' + self.file_name, 52 | tofile=os.path.join(self.test_data_path, 'actual_downloaded.txt')) 53 | # Compare contents of downloaded file with the original 54 | self.assertEqual(open(os.path.join(self.test_data_path, self.file_name), 'rb').read(), 55 | open(os.path.join(self.test_data_path, 'actual_downloaded.txt'), 'rb').read()) 56 | 57 | def test_upload_delete(self): 58 | cloud_store = DropboxCloudstore(config=self.configuration.get_cloudstores()[0]) 59 | # Upload file 60 | logger.info('Uploading file..') 61 | file_to_upload = open(os.path.join(self.test_data_path, self.file_name), 'rb').read() 62 | cloud_store.upload_file(file_to_upload, '/tests/' + self.file_name) 63 | # Check if it was uploaded 64 | self.assertIn(self.file_name, cloud_store.ls('/tests/').keys()) 65 | # Delete it 66 | cloud_store.delete_file('/tests/' + self.file_name) 67 | # Check if it was deleted 68 | self.assertNotIn(self.file_name, cloud_store.ls('/tests/').keys()) 69 | 70 | @staticmethod 71 | def _generate_random_filename_and_contents() -> Tuple[str, str]: 72 | letters = string.ascii_lowercase 73 | file_name = ''.join(random.choice(letters) for _ in range(10)) + '.txt' 74 | contents = ''.join(random.choice(letters) for _ in range(20)) 75 | return file_name, contents 76 | 77 | @staticmethod 78 | def _setup_log(debug: bool = False) -> None: 79 | # noinspection PyArgumentList 80 | logging.basicConfig(level=logging.DEBUG, 81 | format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', 82 | datefmt='%Y-%m-%d %H:%M:%S', 83 | handlers=[logging.StreamHandler() 84 | ] 85 | ) 86 | 87 | def setUp(self) -> None: 88 | self.file_name, contents = self._generate_random_filename_and_contents() 89 | with open(os.path.join(self.test_data_path, self.file_name), 'a') as f: 90 | f.write(contents) 91 | 92 | def tearDown(self) -> None: 93 | os.remove(os.path.join(self.test_data_path, self.file_name)) 94 | 95 
| @classmethod 96 | def setUpClass(cls): 97 | cls._setup_log() 98 | if "DROPBOX_API_KEY" not in os.environ: 99 | logger.error('DROPBOX_API_KEY env variable is not set!') 100 | raise Exception('DROPBOX_API_KEY env variable is not set!') 101 | logger.info('Loading Configuration..') 102 | cls.configuration = Configuration(config_src=os.path.join(cls.test_data_path, 'template_conf.yml')) 103 | 104 | @classmethod 105 | def tearDownClass(cls): 106 | cloud_store = DropboxCloudstore(config=cls.configuration.get_cloudstores()[0]) 107 | cloud_store.delete_file('/tests') 108 | 109 | 110 | if __name__ == '__main__': 111 | unittest.main() 112 | -------------------------------------------------------------------------------- /tests/test_configuration.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from jsonschema.exceptions import ValidationError 3 | from typing import Dict 4 | import logging 5 | import os 6 | 7 | from configuration.configuration import Configuration 8 | 9 | logger = logging.getLogger('TestConfiguration') 10 | 11 | 12 | class TestConfiguration(unittest.TestCase): 13 | test_data_path: str = os.path.join('test_data', 'test_configuration') 14 | 15 | def test_schema_validation(self): 16 | try: 17 | logger.info('Loading the correct Configuration..') 18 | Configuration(config_src=os.path.join(self.test_data_path, 'minimal_conf_correct.yml'), 19 | config_schema_path=os.path.join('..', 'tests', self.test_data_path, 20 | 'minimal_yml_schema.json')) 21 | except ValidationError as e: 22 | logger.error('Error validating the correct yml: %s', e) 23 | self.fail('Error validating the correct yml') 24 | else: 25 | logger.info('First yml validated successfully.') 26 | 27 | with self.assertRaises(ValidationError): 28 | logger.info('Loading the wrong Configuration..') 29 | Configuration(config_src=os.path.join(self.test_data_path, 'minimal_conf_wrong.yml')) 30 | logger.info('Second yml failed to validate successfully.') 
31 | 32 | def test_to_json(self): 33 | logger.info('Loading Configuration..') 34 | configuration = Configuration(config_src=os.path.join(self.test_data_path, 'template_conf.yml')) 35 | expected_json = {'tag': 'production', 36 | 'crawl_interval': 2, 37 | 'test_mode': False, 38 | "lookup_url": "www.xe.gr", 39 | 'datastore': [{'config': 40 | {'hostname': 'host123', 41 | 'username': 'user1', 42 | 'password': 'pass2', 43 | 'db_name': 'db3', 44 | 'port': 3306}, 45 | 'type': 'mysql'}], 46 | 'cloudstore': [{'config': 47 | {'api_key': 'apiqwerty'}, 48 | 'type': 'dropbox'}]} 49 | # Compare 50 | logger.info('Comparing the results..') 51 | self.assertDictEqual(self._sort_dict(expected_json), self._sort_dict(configuration.to_json())) 52 | 53 | def test_to_yaml(self): 54 | logger.info('Loading Configuration..') 55 | configuration = Configuration(config_src=os.path.join(self.test_data_path, 'template_conf.yml')) 56 | # Modify and export yml 57 | logger.info('Changed the host and the api_key..') 58 | configuration.datastore[0]['config']['hostname'] = 'changedhost' 59 | configuration.cloudstore[0]['config']['api_key'] = 'changed_api' 60 | logger.info('Exporting to yaml..') 61 | configuration.to_yaml('test_data/test_configuration/actual_output_to_yaml.yml') 62 | # Load the modified yml 63 | logger.info('Loading the exported yaml..') 64 | modified_configuration = Configuration( 65 | config_src=os.path.join(self.test_data_path, 'actual_output_to_yaml.yml')) 66 | # Compare 67 | logger.info('Comparing the results..') 68 | expected_json = {'tag': 'production', 69 | 'crawl_interval': 2, 70 | 'test_mode': False, 71 | "lookup_url": "www.xe.gr", 72 | 'datastore': [{'config': 73 | {'hostname': 'changedhost', 74 | 'username': 'user1', 75 | 'password': 'pass2', 76 | 'db_name': 'db3', 77 | 'port': 3306}, 78 | 'type': 'mysql'}], 79 | 'cloudstore': [{'config': 80 | {'api_key': 'changed_api'}, 81 | 'type': 'dropbox'}]} 82 | self.assertDictEqual(self._sort_dict(expected_json), 
self._sort_dict(modified_configuration.to_json())) 83 | 84 | @classmethod 85 | def _sort_dict(cls, dictionary: Dict) -> Dict: 86 | return {k: cls._sort_dict(v) if isinstance(v, dict) else v 87 | for k, v in sorted(dictionary.items())} 88 | 89 | @staticmethod 90 | def _setup_log(debug: bool = False) -> None: 91 | # noinspection PyArgumentList 92 | logging.basicConfig(level=logging.DEBUG, 93 | format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', 94 | datefmt='%Y-%m-%d %H:%M:%S', 95 | handlers=[logging.StreamHandler() 96 | ] 97 | ) 98 | 99 | def setUp(self) -> None: 100 | pass 101 | 102 | def tearDown(self) -> None: 103 | pass 104 | 105 | @classmethod 106 | def setUpClass(cls): 107 | cls._setup_log() 108 | 109 | @classmethod 110 | def tearDownClass(cls): 111 | pass 112 | 113 | 114 | if __name__ == '__main__': 115 | unittest.main() 116 | -------------------------------------------------------------------------------- /tests/test_mysql_datastore.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import copy 4 | import random 5 | import string 6 | import logging 7 | from typing import List 8 | from mysql.connector.errors import ProgrammingError as MsqlProgrammingError 9 | 10 | from configuration.configuration import Configuration 11 | from datastore.mysql_datastore import MySqlDatastore 12 | 13 | logger = logging.getLogger('TestMysqlDatastore') 14 | 15 | 16 | class TestMysqlDatastore(unittest.TestCase): 17 | __slots__ = ('configuration', 'test_table_schema') 18 | 19 | configuration: Configuration 20 | test_table_schema: str 21 | generated_table_names: List[str] = list() 22 | test_data_path: str = os.path.join('test_data', 'test_mysql_datastore') 23 | 24 | def test_connect(self): 25 | # Test the connection with the correct api key 26 | try: 27 | MySqlDatastore(config=self.configuration.get_datastores()[0]) 28 | except MsqlProgrammingError as e: 29 | logger.error('Error connecting with the 
correct credentials: %s', e) 30 | self.fail('Error connecting with the correct credentials') 31 | else: 32 | logger.info('Connected with the correct credentials successfully.') 33 | # Test that the connection is failed with the wrong credentials 34 | with self.assertRaises(MsqlProgrammingError): 35 | datastore_conf_copy = copy.deepcopy(self.configuration.get_datastores()[0]) 36 | datastore_conf_copy['password'] = 'wrong_password' 37 | MySqlDatastore(config=datastore_conf_copy) 38 | logger.info("Loading Mysql with wrong credentials failed successfully.") 39 | 40 | def test_create_drop(self): 41 | data_store = MySqlDatastore(config=self.configuration.get_datastores()[0]) 42 | # Create table 43 | logger.info('Creating table..') 44 | data_store.create_table(self.table_name, self.test_table_schema) 45 | # Check if it was created 46 | self.assertIn(self.table_name, data_store.show_tables()) 47 | # Drop table 48 | logger.info('Dropping table..') 49 | data_store.drop_table(table=self.table_name) 50 | self.assertNotIn(self.table_name, data_store.show_tables()) 51 | 52 | def test_insert_update_delete(self): 53 | data_store = MySqlDatastore(config=self.configuration.get_datastores()[0]) 54 | # Create table 55 | logger.info('Creating table..') 56 | data_store.create_table(self.table_name, self.test_table_schema) 57 | # Ensure it is empty 58 | results = data_store.select_from_table(table=self.table_name) 59 | self.assertEqual([], results) 60 | # Insert into table 61 | insert_data = {"order_id": 1, 62 | "order_type": "plain", 63 | "is_delivered": False} 64 | logger.info("Inserting into table..") 65 | data_store.insert_into_table(table=self.table_name, data=insert_data) 66 | # Check if the data was inserted 67 | results = data_store.select_from_table(table=self.table_name) 68 | self.assertEqual([(1, "plain", False)], results) 69 | logger.info("Deleting from table..") 70 | data_store.delete_from_table(table=self.table_name, where='order_id =1 ') 71 | # Check if the data was 
deleted 72 | results = data_store.select_from_table(table=self.table_name) 73 | self.assertEqual([], results) 74 | 75 | @staticmethod 76 | def _generate_random_filename() -> str: 77 | letters = string.ascii_lowercase 78 | file_name = 'test_table_' + ''.join(random.choice(letters) for _ in range(10)) 79 | return file_name 80 | 81 | @staticmethod 82 | def _setup_log(debug: bool = False) -> None: 83 | # noinspection PyArgumentList 84 | logging.basicConfig(level=logging.DEBUG, 85 | format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', 86 | datefmt='%Y-%m-%d %H:%M:%S', 87 | handlers=[logging.StreamHandler() 88 | ] 89 | ) 90 | 91 | def setUp(self) -> None: 92 | self.table_name = self._generate_random_filename() 93 | self.generated_table_names.append(self.table_name) 94 | 95 | def tearDown(self) -> None: 96 | pass 97 | 98 | @classmethod 99 | def setUpClass(cls): 100 | cls._setup_log() 101 | mysql_os_vars = ['MYSQL_HOST', 'MYSQL_USERNAME', 'MYSQL_PASSWORD', 'MYSQL_DB_NAME'] 102 | if not all(mysql_os_var in os.environ for mysql_os_var in mysql_os_vars): 103 | logger.error('Mysql env variables are not set!') 104 | raise Exception('Mysql env variables are not set!') 105 | logger.info('Loading Configuration..') 106 | cls.configuration = Configuration(config_src=os.path.join(cls.test_data_path, 'template_conf.yml')) 107 | cls.test_table_schema = """ order_id INT(6) PRIMARY KEY, 108 | order_type VARCHAR(30) NOT NULL, 109 | is_delivered BOOLEAN NOT NULL """ 110 | 111 | @classmethod 112 | def tearDownClass(cls): 113 | data_store = MySqlDatastore(config=cls.configuration.get_datastores()[0]) 114 | for table in cls.generated_table_names: 115 | logger.info('Dropping table {0}'.format(table)) 116 | data_store.drop_table(table=table) 117 | 118 | 119 | if __name__ == '__main__': 120 | unittest.main() 121 | -------------------------------------------------------------------------------- /ad_site_crawler/xegr_ad_site_crawler.py: 
-------------------------------------------------------------------------------- 1 | import urllib.request, urllib.error, urllib.parse 2 | from typing import List, Tuple, Union 3 | import time 4 | import re 5 | import time 6 | import logging 7 | from unidecode import unidecode 8 | 9 | from .abstract_ad_site_crawler import AbstractAdSiteCrawler 10 | 11 | logger = logging.getLogger('XeGrAdSiteCrawler') 12 | 13 | 14 | class XeGrAdSiteCrawler(AbstractAdSiteCrawler): 15 | __slots__ = ('_stop_words', '_ad_site_url', '_anchor_class_name') 16 | 17 | _stop_words: List[str] 18 | _ad_site_url: str 19 | _anchor_class_name: str 20 | _ignored_emails: List = ['email@paroxos.com'] 21 | 22 | def __init__(self, stop_words: List, ad_site_url: str = "https://www.xe.gr", anchor_class_name='result-list-narrow-item'): 23 | """ 24 | The basic constructor. Creates a new instance of AdSiteCrawler using the specified credentials 25 | 26 | :param stop_words: 27 | """ 28 | 29 | logger.debug("Initializing with stop_words: %s" % stop_words) 30 | self._ad_site_url = ad_site_url 31 | self._stop_words = stop_words 32 | self._anchor_class_name = anchor_class_name 33 | super().__init__() 34 | 35 | def get_new_ads(self, lookup_url: str, ads_checked: List, crawl_interval: int = 15) -> Tuple[str, Union[None, str]]: 36 | """ 37 | Retrieves each sub-link's html, searches and yields an email for each of them. 38 | 39 | :param lookup_url: 40 | :param ads_checked: 41 | """ 42 | 43 | if self._ad_site_url not in lookup_url: 44 | raise AdSiteCrawlerError( 45 | "The lookup_url: %s is not supported. The domain should be: %s" % (lookup_url, self._ad_site_url)) 46 | if lookup_url[:4] != 'http': 47 | logger.warning("The lookup_url doesn't contain http:// or https://! 
Adding https:// ..") 48 | lookup_url = 'https://' + lookup_url 49 | 50 | logger.debug("ads_checked: %s" % ads_checked) 51 | search_page_html = self._retrieve_html_from_url(lookup_url) 52 | # Search for links in the main page's html, retrieve their html and look for emails inside them 53 | for ad_link in self._find_links_in_html(html_data=search_page_html, anchor_class_name=self._anchor_class_name): 54 | logger.debug("Input ad_link: %s" % ad_link) 55 | ad_linked_parsed = urllib.parse.quote(ad_link) 56 | if ad_linked_parsed[:4] != 'http': 57 | full_sub_link = self._ad_site_url + ad_linked_parsed 58 | else: 59 | full_sub_link = ad_link 60 | logger.debug("Checking constructed full_sub_link: %s" % full_sub_link) 61 | # Wait before checking next link to avoid bot ban 62 | logger.debug("Sleeping for crawl_interval={crawl_interval} seconds..".format(crawl_interval=crawl_interval)) 63 | time.sleep(crawl_interval) 64 | if full_sub_link in ads_checked: 65 | logger.debug("It is in ads_checked, skipping..") 66 | continue 67 | ad_page_html = self._retrieve_html_from_url(full_sub_link) 68 | if any(unidecode(word).lower() in unidecode(ad_page_html).lower() for word in self._stop_words): 69 | logger.debug("It contains one of the stop words, skipping..") 70 | continue 71 | # Add the link inside the check list in order to avoid duplicate ads 72 | ads_checked.append(full_sub_link) 73 | emails_in_ad_page = self._find_emails_in_html(html_data=ad_page_html) 74 | if len(emails_in_ad_page) == 0: 75 | logger.debug("Found no emails in the ad page, returning None..") 76 | yield full_sub_link, None 77 | else: 78 | logger.debug("Found emails in the ad page, returning %s.." % emails_in_ad_page[0]) 79 | yield full_sub_link, emails_in_ad_page[0] 80 | 81 | @staticmethod 82 | def _retrieve_html_from_url(url: str) -> str: 83 | """ 84 | Retrieves full html from the specified url. 85 | 86 | :param url: 87 | """ 88 | 89 | try: 90 | logger.debug("Retrieving html from url: %s .." 
% url) 91 | header = { 92 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0 Iceweasel/31.8.0'} 93 | req = urllib.request.Request(url, headers=header) 94 | html = urllib.request.urlopen(req).read() 95 | except Exception as e: 96 | logger.error(e) 97 | html = 'None' 98 | if type(html) is not str: 99 | html = html.decode('utf-8') 100 | logger.debug("HTML retrieved:\n%s" % (html)) 101 | return html 102 | 103 | @staticmethod 104 | def _find_links_in_html(html_data: str, anchor_class_name: str = 'result-list-narrow-item') -> str: 105 | """ 106 | Searches for sub-link patterns in html and yields each link. 107 | 108 | :param html_data: 109 | """ 110 | 111 | logger.debug("Using anchor class name=%s" % anchor_class_name) 112 | logger.debug("Searching for sub-links in html..") 113 | 114 | pattern = re.compile(r"()" 115 | .format(anchor_class_name=anchor_class_name)) 116 | a_tag_captured = pattern.findall(html_data) 117 | logger.debug("Anchor captured: %s" % a_tag_captured) 118 | for i in a_tag_captured: 119 | href_raw = i[str(i).find('href'):] 120 | href = href_raw[:href_raw.find(' ')].strip() 121 | logger.debug("Href captured: %s, and sliced: %s" % (href, href[6:-1])) 122 | yield href[6:-1] 123 | 124 | @classmethod 125 | def _find_emails_in_html(cls, html_data: str) -> List: 126 | """ 127 | Searches for email patterns in html and returns list of emails. 
128 | 129 | :param html_data: 130 | """ 131 | 132 | logger.debug("Searching for emails in html..") 133 | 134 | pattern = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+(?:com|gr)', re.MULTILINE) 135 | emails = pattern.findall(html_data) 136 | logger.debug("All emails found in html: %s" % emails) 137 | return [email for email in emails if email not in cls._ignored_emails] 138 | 139 | 140 | class AdSiteCrawlerError(Exception): 141 | def __init__(self, message): 142 | # Call the base class constructor with the parameters it needs 143 | super().__init__(message) 144 | -------------------------------------------------------------------------------- /datastore/mysql_datastore.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Tuple, Dict 3 | 4 | from mysql import connector as mysql_connector 5 | 6 | from .abstract_datastore import AbstractDatastore 7 | 8 | logger = logging.getLogger('MySqlDataStore') 9 | 10 | 11 | class MySqlDatastore(AbstractDatastore): 12 | __slots__ = ('_connection', '_cursor') 13 | 14 | _connection: mysql_connector.connection_cext.CMySQLConnection 15 | _cursor: mysql_connector.connection_cext.CMySQLCursor 16 | 17 | def __init__(self, config: Dict) -> None: 18 | """ 19 | The basic constructor. 
Creates a new instance of Datastore using the specified credentials 20 | 21 | :param config: 22 | """ 23 | 24 | super().__init__(config) 25 | 26 | @staticmethod 27 | def get_connection(username: str, password: str, hostname: str, db_name: str, port: int = 3306) \ 28 | -> Tuple[mysql_connector.connection_cext.CMySQLConnection, mysql_connector.connection_cext.CMySQLCursor]: 29 | """ 30 | Creates and returns a connection and a cursor/session to the MySQL DB 31 | 32 | :param username: 33 | :param password: 34 | :param hostname: 35 | :param db_name: 36 | :param port: 37 | :return: 38 | """ 39 | 40 | connection = mysql_connector.connect( 41 | host=hostname, 42 | user=username, 43 | passwd=password, 44 | database=db_name, 45 | use_pure=True 46 | ) 47 | 48 | cursor = connection.cursor() 49 | 50 | return connection, cursor 51 | 52 | def create_table(self, table: str, schema: str) -> None: 53 | """ 54 | Creates a table using the specified schema 55 | 56 | :param self: 57 | :param table: 58 | :param schema: 59 | :return: 60 | """ 61 | 62 | query = "CREATE TABLE IF NOT EXISTS {table} ({schema})".format(table=table, schema=schema) 63 | logger.debug("Executing: %s" % query) 64 | self._cursor.execute(query) 65 | self._connection.commit() 66 | 67 | def drop_table(self, table: str) -> None: 68 | """ 69 | Drops the specified table if it exists 70 | 71 | :param self: 72 | :param table: 73 | :return: 74 | """ 75 | 76 | query = "DROP TABLE IF EXISTS {table}".format(table=table) 77 | logger.debug("Executing: %s" % query) 78 | self._cursor.execute(query) 79 | self._connection.commit() 80 | 81 | def truncate_table(self, table: str) -> None: 82 | """ 83 | Truncates the specified table 84 | 85 | :param self: 86 | :param table: 87 | :return: 88 | """ 89 | 90 | query = "TRUNCATE TABLE {table}".format(table=table) 91 | logger.debug("Executing: %s" % query) 92 | self._cursor.execute(query) 93 | self._connection.commit() 94 | 95 | def insert_into_table(self, table: str, data: dict) -> None: 96 | 
""" 97 | Inserts into the specified table a row based on a column_name: value dictionary 98 | 99 | :param self: 100 | :param table: 101 | :param data: 102 | :return: 103 | """ 104 | 105 | data_str = ", ".join( 106 | list(map(lambda key, val: "{key}='{val}'".format(key=str(key), val=str(val)), data.keys(), data.values()))) 107 | 108 | query = "INSERT INTO {table} SET {data}".format(table=table, data=data_str) 109 | logger.debug("Executing: %s" % query) 110 | self._cursor.execute(query) 111 | self._connection.commit() 112 | 113 | def update_table(self, table: str, set_data: dict, where: str) -> None: 114 | """ 115 | Updates the specified table using a column_name: value dictionary and a where statement 116 | 117 | :param self: 118 | :param table: 119 | :param set_data: 120 | :param where: 121 | :return: 122 | """ 123 | 124 | set_data_str = ", ".join( 125 | list(map(lambda key, val: "{key}='{val}'".format(key=str(key), val=str(val)), set_data.keys(), 126 | set_data.values()))) 127 | 128 | query = "UPDATE {table} SET {data} WHERE {where}".format(table=table, data=set_data_str, where=where) 129 | logger.debug("Executing: %s" % query) 130 | self._cursor.execute(query) 131 | self._connection.commit() 132 | 133 | def select_from_table(self, table: str, columns: str = '*', where: str = 'TRUE', order_by: str = 'NULL', 134 | asc_or_desc: str = 'ASC', limit: int = 1000) -> List: 135 | """ 136 | Selects from a specified table based on the given columns, where, ordering and limit 137 | 138 | :param self: 139 | :param table: 140 | :param columns: 141 | :param where: 142 | :param order_by: 143 | :param asc_or_desc: 144 | :param limit: 145 | :return results: 146 | """ 147 | 148 | query = "SELECT {columns} FROM {table} WHERE {where} ORDER BY {order_by} {asc_or_desc} LIMIT {limit}".format( 149 | columns=columns, table=table, where=where, order_by=order_by, asc_or_desc=asc_or_desc, limit=limit) 150 | logger.debug("Executing: %s" % query) 151 | self._cursor.execute(query) 152 | results 
def __exit__(self, exc_type=None, exc_val=None, exc_tb=None) -> None:
    """
    Flushes pending work and closes the cursor

    Accepts the three arguments the context-manager protocol passes on
    leaving a ``with`` block; they default to None so that existing direct
    calls of the form ``obj.__exit__()`` keep working.

    :param exc_type: exception class raised inside the with block, if any
    :param exc_val: exception instance raised inside the with block, if any
    :param exc_tb: traceback of that exception, if any
    :return: None, so an exception raised inside the with block is not suppressed
    """

    self._connection.commit()
    # NOTE(review): only the cursor is closed here; the connection object
    # itself is left open - confirm whether it should be closed as well.
    self._cursor.close()
def test_is_connected_and_exit(self):
    """Check that is_connected() is True after construction and False after __exit__()."""
    gmail_configuration = self.configuration.get_email_apps()[0]
    gmail_app = GmailEmailApp(config=gmail_configuration)
    try:
        self.assertEqual(True, gmail_app.is_connected())
    finally:
        # Always release the session, even when the assertion above fails,
        # so a failing run does not leave the connection open.
        gmail_app.__exit__()
    self.assertEqual(False, gmail_app.is_connected())
Αποστολή βιογραφικών efi.koulourianou@gmail.com
, τιμή 900€, συζητήσιμη efi.koulourianou@gmail.com 317 |