├── .gitignore ├── LICENSE ├── README.md ├── dags ├── comic_app_v1.py ├── comic_app_v2.py └── comic_app_v3.py ├── data ├── comic.json ├── credentials │ └── slack.json └── message.txt └── environment.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pycharm 107 | .idea 108 | 109 | # Airflow 110 | logs 111 | airflow.db 112 | unittests.cfg 113 | 114 | # Credentials 115 | data/credentials -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 LeeMeng (李孟) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # airflow-tutorials -------------------------------------------------------------------------------- /dags/comic_app_v1.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime, timedelta 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator 5 | 6 | default_args = { 7 | 'owner': 'Meng Lee', 8 | 'start_date': datetime(2100, 1, 1, 0, 0), 9 | 'schedule_interval': '@daily', 10 | 'retries': 2, 11 | 'retry_delay': timedelta(minutes=1) 12 | } 13 | 14 | 15 | def fn_superman(): 16 | print("取得使用者的閱讀紀錄") 17 | print("去漫畫網站看有沒有新的章節") 18 | print("跟紀錄比較,有沒有新連載?") 19 | 20 | # Murphy's Law 21 | accident_occur = time.time() % 2 > 1 22 | if accident_occur: 23 | print("\n天有不測風雲,人有旦夕禍福") 24 | print("工作遇到預期外狀況被中斷\n") 25 | return 26 | 27 | new_comic_available = time.time() % 2 > 1 28 | if new_comic_available: 29 | print("寄 Slack 通知") 30 | print("更新閱讀紀錄") 31 | else: 32 | print("什麼都不幹,工作順利結束") 33 | 34 | 35 | with DAG('comic_app_v1', default_args=default_args) as dag: 36 | superman_task = PythonOperator( 37 | task_id='superman_task', 38 | python_callable=fn_superman 39 | ) -------------------------------------------------------------------------------- /dags/comic_app_v2.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime, timedelta 3 | from airflow import DAG 4 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 5 | from airflow.operators.dummy_operator import DummyOperator 6 | from airflow.operators.slack_operator import SlackAPIPostOperator 7 | 8 | default_args = { 9 | 'owner': 'Meng Lee', 10 | 'start_date': datetime(2100, 1, 1, 0, 0), 11 | 'schedule_interval': '@daily', 12 | 'retries': 2, 13 | 'retry_delay': timedelta(minutes=1) 14 | } 15 | 16 | 17 | def process_metadata(mode, **context): 18 | if mode == 'read': 19 | print("取得使用者的閱讀紀錄") 20 | elif mode == 'write': 21 | print("更新閱讀紀錄") 22 | 23 | 24 | def check_comic_info(**context): 25 | all_comic_info = context['task_instance'].xcom_pull(task_ids='get_read_history') 26 | print("去漫畫網站看有沒有新的章節") 27 | 28 | anything_new = time.time() % 2 > 1 29 | return anything_new, all_comic_info 30 | 31 | 32 | def decide_what_to_do(**context): 33 | anything_new, all_comic_info = context['task_instance'].xcom_pull(task_ids='check_comic_info') 34 | 35 | print("跟紀錄比較,有沒有新連載?") 36 | if anything_new: 37 | return 'yes_generate_notification' 38 | else: 39 | return 'no_do_nothing' 40 | 41 | 42 | def generate_message(**context): 43 | _, all_comic_info = context['task_instance'].xcom_pull(task_ids='check_comic_info') 44 | print("產生要寄給 Slack 的訊息內容並存成檔案") 45 | 46 | 47 | with DAG('comic_app_v2', default_args=default_args) as dag: 48 | 49 | get_read_history = PythonOperator( 50 | task_id='get_read_history', 51 | python_callable=process_metadata, 52 | op_args=['read'] 53 | ) 54 | 55 | check_comic_info = PythonOperator( 56 | task_id='check_comic_info', 57 | python_callable=check_comic_info, 58 | provide_context=True 59 | ) 60 | 61 | decide_what_to_do = BranchPythonOperator( 62 | task_id='new_comic_available', 63 | python_callable=decide_what_to_do, 64 | provide_context=True 65 | ) 66 | 67 | update_read_history = PythonOperator( 68 | task_id='update_read_history', 69 | python_callable=process_metadata, 70 | op_args=['write'], 71 | provide_context=True 72 | ) 73 | 74 | generate_notification = PythonOperator( 75 | task_id='yes_generate_notification', 76 | python_callable=generate_message, 77 | provide_context=True 78 | ) 79 | 80 | send_notification = SlackAPIPostOperator( 81 | task_id='send_notification', 82 | token="YOUR_SLACK_TOKEN", 83 | channel='#comic-notification', 84 | text="[{{ ds }}] 海賊王有新番了!", 85 | icon_url='http://airbnb.io/img/projects/airflow3.png' 86 | ) 87 | 88 | do_nothing = DummyOperator(task_id='no_do_nothing') 89 | 90 | # define workflow 91 | get_read_history >> check_comic_info >> decide_what_to_do 92 | decide_what_to_do >> generate_notification 93 | decide_what_to_do >> do_nothing 94 | generate_notification >> send_notification >> update_read_history 95 | -------------------------------------------------------------------------------- /dags/comic_app_v3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import logging 5 | from datetime import datetime, timedelta 6 | from selenium import webdriver 7 | from airflow import DAG 8 | from airflow.operators.python_operator import PythonOperator, BranchPythonOperator 9 | from airflow.operators.dummy_operator import DummyOperator 10 | from airflow.operators.slack_operator import SlackAPIPostOperator 11 | from airflow.operators.latest_only_operator import LatestOnlyOperator 12 | 13 | 14 | default_args = { 15 | 'owner': 'Meng Lee', 16 | 'start_date': datetime(2100, 1, 1, 0, 0), 17 | 'schedule_interval': '@daily', 18 | 'retries': 2, 19 | 'retry_delay': timedelta(minutes=1) 20 | } 21 | 22 | comic_page_template = 'https://www.cartoonmad.com/comic/{}.html' 23 | 24 | 25 | def process_metadata(mode, **context): 26 | 27 | file_dir = os.path.dirname(__file__) 28 | metadata_path = os.path.join(file_dir, '../data/comic.json') 29 | if mode == 'read': 30 | with open(metadata_path, 'r') as fp: 31 | metadata = json.load(fp) 32 | print("Read History loaded: {}".format(metadata)) 33 | return metadata 34 | elif mode == 'write': 35 | print("Saving latest comic information..") 36 | _, all_comic_info = context['task_instance'].xcom_pull(task_ids='check_comic_info') 37 | 38 | # update to latest chapter 39 | for comic_id, comic_info in dict(all_comic_info).items(): 40 | all_comic_info[comic_id]['previous_chapter_num'] = comic_info['latest_chapter_num'] 41 | 42 | with open(metadata_path, 'w') as fp: 43 | json.dump(all_comic_info, fp, indent=2, ensure_ascii=False) 44 | 45 | 46 | def check_comic_info(**context): 47 | metadata = context['task_instance'].xcom_pull(task_ids='get_read_history') 48 | driver = webdriver.Chrome() 49 | driver.get('https://www.cartoonmad.com/') 50 | print("Arrived top page.") 51 | 52 | all_comic_info = metadata 53 | anything_new = False 54 | for comic_id, comic_info in dict(all_comic_info).items(): 55 | comic_name = comic_info['name'] 56 | print("Fetching {}'s chapter list..".format(comic_name)) 57 | driver.get(comic_page_template.format(comic_id)) 58 | 59 | # get the latest chapter number 60 | links = driver.find_elements_by_partial_link_text('第') 61 | latest_chapter_num = [int(s) for s in links[-1].text.split() if s.isdigit()][0] 62 | previous_chapter_num = comic_info['previous_chapter_num'] 63 | 64 | all_comic_info[comic_id]['latest_chapter_num'] = latest_chapter_num 65 | all_comic_info[comic_id]['new_chapter_available'] = latest_chapter_num > previous_chapter_num 66 | if all_comic_info[comic_id]['new_chapter_available']: 67 | anything_new = True 68 | print("There are new chapter for {}(latest: {})".format(comic_name, latest_chapter_num)) 69 | 70 | if not anything_new: 71 | print("Nothing new now, prepare to end the workflow.") 72 | 73 | driver.quit() 74 | 75 | return anything_new, all_comic_info 76 | 77 | 78 | def decide_what_to_do(**context): 79 | anything_new, all_comic_info = context['task_instance'].xcom_pull(task_ids='check_comic_info') 80 | 81 | print("跟紀錄比較,有沒有新連載?") 82 | if anything_new: 83 | return 'yes_generate_notification' 84 | else: 85 | return 'no_do_nothing' 86 | 87 | 88 | def get_token(): 89 | file_dir = os.path.dirname(__file__) 90 | token_path = os.path.join(file_dir, '../data/credentials/slack.json') 91 | with open(token_path, 'r') as fp: 92 | token = json.load(fp)['token'] 93 | return token 94 | 95 | 96 | def generate_message(**context): 97 | _, all_comic_info = context['task_instance'].xcom_pull(task_ids='check_comic_info') 98 | 99 | message = '' 100 | for comic_id, comic_info in all_comic_info.items(): 101 | if comic_info['new_chapter_available']: 102 | name = comic_info['name'] 103 | latest = comic_info['latest_chapter_num'] 104 | prev = comic_info['previous_chapter_num'] 105 | message += '{} 最新一話: {} 話(上次讀到:{} 話)\n'.format(name, latest, prev) 106 | message += comic_page_template.format(comic_id) + '\n\n' 107 | 108 | file_dir = os.path.dirname(__file__) 109 | message_path = os.path.join(file_dir, '../data/message.txt') 110 | with open(message_path, 'w') as fp: 111 | fp.write(message) 112 | 113 | 114 | def get_message_text(): 115 | file_dir = os.path.dirname(__file__) 116 | message_path = os.path.join(file_dir, '../data/message.txt') 117 | with open(message_path, 'r') as fp: 118 | message = fp.read() 119 | 120 | return message 121 | 122 | 123 | with DAG('comic_app_v3', default_args=default_args) as dag: 124 | 125 | # define tasks 126 | latest_only = LatestOnlyOperator(task_id='latest_only') 127 | 128 | get_read_history = PythonOperator( 129 | task_id='get_read_history', 130 | python_callable=process_metadata, 131 | op_args=['read'], 132 | provide_context=True 133 | ) 134 | 135 | check_comic_info = PythonOperator( 136 | task_id='check_comic_info', 137 | python_callable=check_comic_info, 138 | provide_context=True 139 | ) 140 | 141 | decide_what_to_do = BranchPythonOperator( 142 | task_id='new_comic_available', 143 | python_callable=decide_what_to_do, 144 | provide_context=True 145 | ) 146 | 147 | update_read_history = PythonOperator( 148 | task_id='update_read_history', 149 | python_callable=process_metadata, 150 | op_args=['write'], 151 | provide_context=True 152 | ) 153 | 154 | generate_notification = PythonOperator( 155 | task_id='yes_generate_notification', 156 | python_callable=generate_message, 157 | provide_context=True 158 | ) 159 | 160 | send_notification = SlackAPIPostOperator( 161 | task_id='send_notification', 162 | token=get_token(), 163 | channel='#comic-notification', 164 | text=get_message_text(), 165 | icon_url='http://airbnb.io/img/projects/airflow3.png' 166 | ) 167 | 168 | do_nothing = DummyOperator(task_id='no_do_nothing') 169 | 170 | # define workflow 171 | latest_only >> get_read_history 172 | get_read_history >> check_comic_info >> decide_what_to_do 173 | decide_what_to_do >> generate_notification 174 | decide_what_to_do >> do_nothing 175 | generate_notification >> send_notification >> update_read_history 176 | -------------------------------------------------------------------------------- /data/comic.json: -------------------------------------------------------------------------------- 1 | { 2 | "1152": { 3 | "name": "海賊王", 4 | "previous_chapter_num": 900 5 | } 6 | } -------------------------------------------------------------------------------- /data/credentials/slack.json: -------------------------------------------------------------------------------- 1 | { 2 | "token": "YOUR_TOKEN_HERE" 3 | } -------------------------------------------------------------------------------- /data/message.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leemengtw/airflow-tutorials/bc63588669fbf926667210cd3c40b2f8fbff11fb/data/message.txt -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: airflow-tutorials 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - asn1crypto=0.24.0=py36_2 7 | - ca-certificates=2018.8.13=ha4d7672_0 8 | - certifi=2018.8.13=py36_0 9 | - cffi=1.11.5=py36h5e8e0c9_1 10 | - cryptography=2.3.1=py36hdffb7b8_0 11 | - cryptography-vectors=2.3.1=py36_0 12 | - idna=2.7=py36_2 13 | - openssl=1.0.2o=h470a237_1 14 | - pycparser=2.18=py_1 15 | - pyopenssl=18.0.0=py36_0 16 | - pysocks=1.6.8=py36_1 17 | - selenium=3.14.0=py36h1de35cc_0 18 | - six=1.11.0=py36_1 19 | - urllib3=1.23=py36_1 20 | - libcxx=4.0.1=h579ed51_0 21 | - libcxxabi=4.0.1=hebd6815_0 22 | - libedit=3.1.20170329=hb402a30_2 23 | - libffi=3.2.1=h475c297_4 24 | - ncurses=6.1=h0a44026_0 25 | - pip=10.0.1=py36_0 26 | - python=3.6.6=hc167b69_0 27 | - readline=7.0=hc1231fa_4 28 | - setuptools=40.0.0=py36_0 29 | - sqlite=3.24.0=ha441bb4_0 30 | - tk=8.6.7=h35a86e2_3 31 | - wheel=0.31.1=py36_0 32 | - xz=5.2.4=h1de35cc_4 33 | - zlib=1.2.11=hf3cbc9b_2 34 | - pip: 35 | - alembic==0.8.10 36 | - apache-airflow==1.9.0 37 | - bleach==2.1.2 38 | - chardet==3.0.4 39 | - click==6.7 40 | - configparser==3.5.0 41 | - croniter==0.3.25 42 | - dill==0.2.8.2 43 | - docutils==0.14 44 | - flask==0.11.1 45 | - flask-admin==1.4.1 46 | - flask-cache==0.13.1 47 | - flask-login==0.2.11 48 | - flask-swagger==0.2.13 49 | - flask-wtf==0.14 50 | - funcsigs==1.0.0 51 | - future==0.16.0 52 | - gitdb2==2.0.4 53 | - gitpython==2.1.11 54 | - gunicorn==19.9.0 55 | - html5lib==1.0.1 56 | - itsdangerous==0.24 57 | - jinja2==2.8.1 58 | - lockfile==0.12.2 59 | - lxml==3.8.0 60 | - mako==1.0.7 61 | - markdown==2.6.11 62 | - markupsafe==1.0 63 | - numpy==1.15.0 64 | - ordereddict==1.1 65 | - pandas==0.23.4 66 | - psutil==4.4.2 67 | - pygments==2.2.0 68 | - python-daemon==2.1.2 69 | - python-dateutil==2.7.3 70 | - python-editor==1.0.3 71 | - python-nvd3==0.14.2 72 | - python-slugify==1.1.4 73 | - pytz==2018.5 74 | - pyyaml==3.13 75 | - requests==2.19.1 76 | - setproctitle==1.1.10 77 | - slackclient==1.2.1 78 | - smmap2==2.0.4 79 | - sqlalchemy==1.2.10 80 | - tabulate==0.7.7 81 | - thrift==0.11.0 82 | - unidecode==1.0.22 83 | - webencodings==0.5.1 84 | - websocket-client==0.49.0 85 | - werkzeug==0.14.1 86 | - wtforms==2.2.1 87 | - zope.deprecation==4.3.0 88 | prefix: /Users/meng.lee/anaconda3/envs/airflow-tutorials 89 | 90 | --------------------------------------------------------------------------------