├── README.md ├── Vagrantfile ├── dags ├── __init__.py ├── titanic_dag.py └── yellow_taxi_dag.py ├── provision.sh └── requirements.txt /README.md: -------------------------------------------------------------------------------- 1 | # Введение в Apache Airflow 2 | Репозиторий содержит примеры DAGов для статей: 3 | - [Введение в Apache Airflow](https://khashtamov.com/ru/apache-airflow-introduction/) 4 | - [Introduction to Apache Airflow](https://khashtamov.com/en/introduction-to-apache-airflow/) 5 | -------------------------------------------------------------------------------- /Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | # All Vagrant configuration is done below. The "2" in Vagrant.configure 5 | # configures the configuration version (we support older styles for 6 | # backwards compatibility). Please don't change it unless you know what 7 | # you're doing. 8 | Vagrant.configure("2") do |config| 9 | # The most common configuration options are documented and commented below. 10 | # For a complete reference, please see the online documentation at 11 | # https://docs.vagrantup.com. 12 | 13 | # Every Vagrant development environment requires a box. You can search for 14 | # boxes at https://vagrantcloud.com/search. 15 | config.vm.box = "ubuntu/bionic64" 16 | 17 | # Disable automatic box update checking. If you disable this, then 18 | # boxes will only be checked for updates when the user runs 19 | # `vagrant box outdated`. This is not recommended. 20 | # config.vm.box_check_update = false 21 | 22 | # Create a forwarded port mapping which allows access to a specific port 23 | # within the machine from a port on the host machine. In the example below, 24 | # accessing "localhost:8080" will access port 80 on the guest machine. 25 | # NOTE: This will enable public access to the opened port 26 | # config.vm.network "forwarded_port", guest: 80, host: 8080 27 | 28 | # Create a forwarded port mapping which allows access to a specific port 29 | # within the machine from a port on the host machine and only allow access 30 | # via 127.0.0.1 to disable public access 31 | config.vm.network "forwarded_port", guest: 80, host: 18000, host_ip: "127.0.0.1" 32 | config.vm.network "forwarded_port", guest: 8080, host: 18080, host_ip: "127.0.0.1" 33 | 34 | # Create a private network, which allows host-only access to the machine 35 | # using a specific IP. 36 | # config.vm.network "private_network", ip: "192.168.33.10" 37 | 38 | # Create a public network, which generally matched to bridged network. 39 | # Bridged networks make the machine appear as another physical device on 40 | # your network. 41 | # config.vm.network "public_network" 42 | 43 | # Share an additional folder to the guest VM. The first argument is 44 | # the path on the host to the actual folder. The second argument is 45 | # the path on the guest to mount the folder. And the optional third 46 | # argument is a set of non-required options. 47 | config.vm.synced_folder "dags", "/home/vagrant/airflow/dags" 48 | 49 | # Provider-specific configuration so you can fine-tune various 50 | # backing providers for Vagrant. These expose provider-specific options. 51 | # Example for VirtualBox: 52 | # 53 | config.vm.provider "virtualbox" do |vb| 54 | # Display the VirtualBox GUI when booting the machine 55 | vb.gui = false 56 | # Customize the amount of memory on the VM: 57 | vb.memory = "4098" 58 | end 59 | # 60 | # View the documentation for the provider you are using for more 61 | # information on available options. 62 | 63 | config.vm.provision "shell", path: "provision.sh", keep_color: true 64 | end 65 | -------------------------------------------------------------------------------- /dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adilkhash/apache-airflow-intro/a3f86189df2de2cae5ed4bed3d0401ce89157279/dags/__init__.py -------------------------------------------------------------------------------- /dags/titanic_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime as dt 3 | 4 | import requests 5 | import pandas as pd 6 | from airflow.models import DAG 7 | from airflow.operators.python_operator import PythonOperator 8 | 9 | args = { 10 | 'owner': 'airflow', 11 | 'start_date': dt.datetime(2020, 2, 11), 12 | 'retries': 1, 13 | 'retry_delay': dt.timedelta(minutes=1), 14 | 'depends_on_past': False, 15 | } 16 | 17 | FILENAME = os.path.join(os.path.expanduser('~'), 'titanic.csv') 18 | 19 | 20 | def download_titanic_dataset(): 21 | url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv' 22 | response = requests.get(url, stream=True) 23 | response.raise_for_status() 24 | with open(FILENAME, 'w', encoding='utf-8') as f: 25 | for chunk in response.iter_lines(): 26 | f.write('{}\n'.format(chunk.decode('utf-8'))) 27 | 28 | 29 | def pivot_dataset(): 30 | titanic_df = pd.read_csv(FILENAME) 31 | pvt = titanic_df.pivot_table( 32 | index=['Sex'], columns=['Pclass'], values='Name', aggfunc='count' 33 | ) 34 | df = pvt.reset_index() 35 | df.to_csv(os.path.join(os.path.expanduser('~'), 'titanic_pivot.csv')) 36 | 37 | 38 | with DAG(dag_id='titanic_pivot', default_args=args, schedule_interval=None) as dag: 39 | 40 | create_titanic_dataset = PythonOperator( 41 | task_id='download_titanic_dataset', 42 | python_callable=download_titanic_dataset, 43 | dag=dag 44 | ) 45 | 46 | pivot_titanic_dataset = PythonOperator( 47 | task_id='pivot_dataset', 48 | python_callable=pivot_dataset, 49 | dag=dag 50 | ) 51 | 52 | create_titanic_dataset >> pivot_titanic_dataset 53 | -------------------------------------------------------------------------------- /dags/yellow_taxi_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime as dt 3 | 4 | import requests 5 | from airflow.operators.python_operator import PythonOperator 6 | from airflow.models import DAG 7 | 8 | args = { 9 | 'owner': 'airflow', 10 | 'start_date': dt.datetime(2020, 2, 11), 11 | 'retries': 1, 12 | 'retry_delay': dt.timedelta(minutes=2), 13 | } 14 | 15 | FILENAME = os.path.join(os.path.expanduser('~'), 'yellow_tripdata_2018-12.csv') 16 | 17 | 18 | def download_taxi_data(): 19 | url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-12.csv' 20 | response = requests.get(url, stream=True) 21 | response.raise_for_status() 22 | with open(FILENAME, 'w', encoding='utf-8') as f: 23 | for chunk in response.iter_lines(): 24 | f.write('{}\n'.format(chunk.decode('utf-8'))) 25 | 26 | 27 | def print_number_of_rows(): 28 | lines = 0 29 | with open(FILENAME) as f: 30 | for _ in f: 31 | lines += 1 32 | print(f'There are {lines} lines in the file') 33 | 34 | 35 | with DAG(dag_id='nyc_taxi', default_args=args, schedule_interval=None) as dag: 36 | create_taxi_file = PythonOperator( 37 | task_id='download_taxi_data', 38 | python_callable=download_taxi_data, 39 | dag=dag 40 | ) 41 | 42 | print_lines = PythonOperator( 43 | task_id='print_number_of_lines', 44 | python_callable=print_number_of_rows, 45 | dag=dag 46 | ) 47 | 48 | create_taxi_file >> print_lines 49 | -------------------------------------------------------------------------------- /provision.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | bootstrap() { 4 | sudo apt-get update 5 | sudo apt-get upgrade 6 | echo "Installing system libs..." 7 | echo "" 8 | sudo apt-get -y install build-essential 9 | sudo apt-get -y install python3-dev 10 | sudo apt-get -y install python3-venv 11 | sudo apt-get -y install libxml2-dev libxslt1-dev 12 | sudo apt-get -y install lib32z1-dev 13 | sudo apt-get -y install libtiff5-dev 14 | sudo apt-get -y install libffi-dev 15 | sudo apt-get -y install libfreetype6-dev 16 | sudo apt-get -y install libpq-dev 17 | sudo apt-get -y install libssl-dev 18 | sudo apt-get -y install swig 19 | sudo apt-get -y install git 20 | sudo apt-get -y install gettext 21 | sudo apt-get -y install python3-pip 22 | } 23 | 24 | export DEBIAN_FRONTEND=noninteractive 25 | 26 | bootstrap 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow==1.10.14 2 | psycopg2==2.8.4 3 | requests==2.22.0 4 | --------------------------------------------------------------------------------