├── .gitignore ├── Dockerfile ├── README.md ├── config └── airflow.cfg ├── dags ├── bash_scripts │ └── load_staging_table.sh ├── movies_dwh_dag.py ├── python_scripts │ ├── load_staging_cpi.py │ ├── load_staging_date.py │ ├── load_staging_genre.py │ ├── load_staging_movies.py │ └── load_staging_ratings.py └── sql_scripts │ ├── create_tables.sql │ ├── upsert_cpi.sql │ ├── upsert_date.sql │ ├── upsert_genre.sql │ ├── upsert_movies.sql │ └── upsert_ratings.sql ├── docker-compose-LocalExecutor.yml ├── documentation ├── Data Dictionary.pdf └── README_images │ ├── architecture.PNG │ ├── dag.PNG │ ├── data_model.png │ └── logo.PNG ├── plugins ├── __init__.py └── operators │ ├── __init__.py │ └── data_quality.py └── script └── entrypoint.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gettyimages/spark:latest 2 | LABEL Alan 3 | 4 | # Never prompts the user for choices on installation/configuration of packages 5 | ENV DEBIAN_FRONTEND noninteractive 6 | ENV TERM linux 7 | 8 | # Airflow 9 | ARG AIRFLOW_VERSION=1.10.9 10 | ARG AIRFLOW_USER_HOME=/usr/local/airflow 11 | ARG AIRFLOW_DEPS="" 12 | ARG PYTHON_DEPS="" 13 | ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} 14 | 15 | # Define en_US. 16 | ENV LANGUAGE en_US.UTF-8 17 | ENV LANG en_US.UTF-8 18 | ENV LC_ALL en_US.UTF-8 19 | ENV LC_CTYPE en_US.UTF-8 20 | ENV LC_MESSAGES en_US.UTF-8 21 | ENV LC_ALL en_US.UTF-8 22 | 23 | RUN set -ex \ 24 | && buildDeps=' \ 25 | freetds-dev \ 26 | python-dev \ 27 | python3-dev \ 28 | libkrb5-dev \ 29 | libsasl2-dev \ 30 | libssl-dev \ 31 | libffi-dev \ 32 | libpq-dev \ 33 | git \ 34 | libxslt-dev \ 35 | build-essential \ 36 | libblas-dev \ 37 | liblapack-dev \ 38 | libpq-dev \ 39 | git \ 40 | ' \ 41 | && apt-get update -yqq \ 42 | && apt-get upgrade -yqq \ 43 | && apt-get install -yqq --no-install-recommends \ 44 | $buildDeps \ 45 | freetds-bin \ 46 | build-essential \ 47 | default-libmysqlclient-dev \ 48 | apt-utils \ 49 | curl \ 50 | rsync \ 51 | netcat \ 52 | locales \ 53 | && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ 54 | && locale-gen \ 55 | && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ 56 | && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \ 57 | && pip install -U pip setuptools wheel \ 58 | && pip install pytz \ 59 | && pip install pyOpenSSL \ 60 | && pip install ndg-httpsclient \ 61 | && pip install pyasn1 \ 62 | && pip install SQLAlchemy==1.3.15 \ 63 | && pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \ 64 | && pip install 'redis==3.2' \ 65 | && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \ 66 | && apt-get purge --auto-remove -yqq $buildDeps \ 67 | && apt-get autoremove -yqq --purge \ 68 | && apt-get clean \ 69 | && rm -rf \ 70 | /var/lib/apt/lists/* \ 71 | /tmp/* \ 72 | /var/tmp/* \ 73 | /usr/share/man \ 74 | /usr/share/doc \ 75 | /usr/share/doc-base 76 | 77 | COPY script/entrypoint.sh /entrypoint.sh 78 | 79 | COPY config/airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg 80 | 81 | # Add jar file for Redshift JDBC driver 82 | ADD https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.43.1067/RedshiftJDBC42-no-awssdk-1.2.43.1067.jar $SPARK_HOME/jars 83 | 84 | RUN chown -R airflow: ${AIRFLOW_USER_HOME} 85 | RUN chmod -R o+rx 
$SPARK_HOME/jars 86 | 87 | RUN echo "export HADOOP_OPTIONAL_TOOLS=hadoop-aws" >> $HADOOP_CONF_DIR/hadoop-env.sh 88 | 89 | EXPOSE 8080 5555 8793 90 | 91 | USER airflow 92 | WORKDIR ${AIRFLOW_HOME} 93 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Movalytics 2 | --- 3 | ![Movalytics](documentation/README_images/logo.PNG) 4 | 5 | ## Project Description 6 | --- 7 | * This project is a case study for a start-up that recommends movies to users as a service and investigates the factors contributing to the success of movies. 8 | * The aim of this project is to perform Extract, Transform, Load (ETL) on movies data, to answer questions the business may have about its users, such as: 9 | * What is the highest rated movie of all time? 10 | * Which genres of movies are the most popular with users? 11 | * Trends in box office earnings - does releasing a movie in a certain quarter/month of the year lead to higher box office earnings? 12 | * Which genres are the highest earning of all time, normalized against a consumer price index? 13 | 14 | * The movies data and metadata come from MovieLens, extracted from a Kaggle dataset: https://www.kaggle.com/rounakbanik/the-movies-dataset. The data contains 26 million ratings from over 270,000 users on a collection of over 45,000 movies. 15 | * In addition, the Consumer Price Index of Admission to Movies, Theaters, and Concerts in U.S. City Average is extracted from: https://fred.stlouisfed.org/series/CUSR0000SS62031. This helps us normalize box office earnings against inflation over the years. 16 | 17 | ## Architecture 18 | --- 19 | The technical architecture for this project is as shown below: 20 | 21 | ![Architecture](documentation/README_images/architecture.PNG) 22 | 23 | 1. Data extraction is done using the Kaggle API and a GET request to St Louis Fred's CPI dataset. 24 | Set up an EC2 instance with Python and pip installed. Then, run `pip install kaggle`. To download the MovieLens dataset, run 25 | ```bash 26 | kaggle datasets download -d "rounakbanik/the-movies-dataset" 27 | ``` 28 | For St Louis Fred's Consumer Price Index dataset, run (the URL is quoted so the shell does not treat the `&` characters as command separators) 29 | ```bash 30 | wget -O consumer_price_index.csv "https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=CUSR0000SS62031&scale=left&cosd=1999-01-01&coed=2020-04-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-05-31&revision_date=2020-05-31&nd=1999-01-01" 31 | ``` 32 | 33 | 2. Next, copy the downloaded files from the EC2 instance to S3. Make sure the instance has the aws-cli installed. Run `aws configure` and then `aws s3 cp {FILE} s3://{S3_BUCKET}/{S3_FOLDER}/` to transfer the files to S3. Note that if this becomes a daily job, we can write a script containing these commands and add a task that runs it to our Airflow data pipeline (see the Python sketch below). 34 | 35 | 3. Run the ETL pipeline, scheduled using Airflow. Data processing is done using Spark, and data is eventually ingested into Redshift.
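As a reference for automating steps 1-2, a minimal Python sketch of the download-and-copy-to-S3 step is shown below; it could later be wrapped in an Airflow task. This is an illustration only: it assumes `requests` and `boto3` are installed, AWS credentials are already configured (e.g. via `aws configure`), the MovieLens csv files have already been unzipped locally, and the bucket/folder names and the shortened FRED URL are placeholder assumptions rather than values taken from this repo (the full FRED URL above can be substituted).

```python
import boto3
import requests

# Placeholders - substitute your own bucket and folder (hypothetical values)
S3_BUCKET = "{S3_BUCKET}"
S3_FOLDER = "{S3_FOLDER}"

# Shortened form of the FRED download URL above (assumption; the full URL can be used instead)
CPI_URL = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=CUSR0000SS62031"

# Assumes these have already been downloaded and unzipped on the instance
MOVIELENS_FILES = ["credits.csv", "links.csv", "movies_metadata.csv", "ratings.csv"]


def download_cpi(dest="consumer_price_index.csv"):
    """Download the CPI csv from FRED with a GET request, as in step 1."""
    resp = requests.get(CPI_URL, timeout=60)
    resp.raise_for_status()
    with open(dest, "wb") as f:
        f.write(resp.content)
    return dest


def upload_to_s3(filenames):
    """Copy local files to S3, mirroring the `aws s3 cp` command in step 2."""
    s3 = boto3.client("s3")
    for name in filenames:
        s3.upload_file(name, S3_BUCKET, "{}/{}".format(S3_FOLDER, name))


if __name__ == "__main__":
    cpi_file = download_cpi()
    upload_to_s3(MOVIELENS_FILES + [cpi_file])
```

A function like this could be scheduled from the DAG (for example with a PythonOperator) if the extraction ever needs to run on a schedule instead of manually.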
36 | 37 | ## Choice of Technologies 38 | --- 39 | * For data processing (the data transformation step), Spark is chosen because of its parallel processing capabilities. Should the amount of data grow to 100x, more worker nodes can be added to the Spark cluster to scale out. 40 | 41 | * For orchestrating the steps in the pipeline, Airflow is chosen as it allows the building of data pipelines that are straightforward and modular. Airflow allows tasks to be defined in a Directed Acyclic Graph (DAG) with dependencies between tasks, which allows the running of tasks to be optimized. It also enables the pipeline to be run on a schedule (e.g. daily) should the need arise. Finally, it has an intuitive UI that allows users to check the steps in the data pipeline should any part of the pipeline fail. 42 | 43 | * Redshift is chosen as the cloud Data Warehouse as it is highly scalable. Should our data grow in size, we can provision more nodes, or scale up, to handle the larger volume of data. 44 | 45 | * Docker is used to encapsulate the package dependencies the code may have, allowing the code to run on any machine. 46 | 47 | ## Data Model 48 | --- 49 | * The data model for this project is as shown below: 50 | 51 | ![Data Model](documentation/README_images/data_model.png) 52 | 53 | The approach taken is to normalize the data. This leads to more efficient UPDATES and DELETES as and when they are required. 54 | 55 | ## ETL Pipeline 56 | --- 57 | The ETL process runs through an Airflow DAG: 58 | 59 | ![DAG](documentation/README_images/dag.PNG) 60 | 61 | The process is as follows: 62 | 1. We create the tables and staging tables (if they do not exist) 63 | 2. We perform an update and insert (upsert), based on new data coming in 64 | 3. We run a data quality check (check that tables have at least one row and that there are no null ids) 65 | 66 | ## Potential Improvements 67 | --- 68 | * The assumption I have made is that the data volume will not increase substantially and that the pipeline is only required to run once. 69 | 70 | 1. What if the data increased by 100x? 71 | * We can increase the number of worker nodes on the Spark cluster to improve compute performance. Furthermore, Airflow schedules can be utilized to pull only a subset of the data at a time, to reduce the volume of data handled at any one time. 72 | 73 | 2. What if the data pipeline needs to be run by 7am daily? 74 | * We can turn on the EC2 machine and run the pipeline before 7am daily. Currently, the Airflow pipeline is scheduled to ingest only once. We can set it to a daily schedule, to ingest new data coming in daily (a sketch of the scheduling changes is shown after this list). We should add a new node to our Airflow DAG to download the data via the API/GET request and transfer it to S3. In addition, to handle heavy workloads when backdating, the CeleryExecutor should be used to run processes in a distributed fashion, ensuring there is no single point of failure. Furthermore, we can make use of Airflow's SLA feature to send alerts should the pipeline not have succeeded before a certain time (e.g. 6:00am). 75 | 76 | 3. What if the database needs to be accessed by 100+ users? 77 | * Redshift should not have an issue handling many users, but we should be careful to scale up/scale out with more nodes whenever necessary. To keep queries efficient, we can seek to understand the common queries users have, so we can tweak our data model. Aggregated data tables can be provided beforehand to reduce query times. We can also assign sort keys according to users' querying needs for each table.
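As referenced in improvement 2 above, below is a sketch of how the scheduling-related DAG arguments could change if the pipeline had to run daily. It is an illustration only, using the Airflow 1.10-style API already used in this repo; the start date and SLA values are assumptions, and SLA misses are only emailed if email alerting is configured.

```python
from datetime import datetime, timedelta

from airflow import DAG

# Only the scheduling-related pieces are shown; the task definitions would stay
# as they are in dags/movies_dwh_dag.py.
default_args = {
    'owner': 'Alan',
    'depends_on_past': False,
    'start_date': datetime(2020, 6, 1),  # fixed start date (illustrative) instead of datetime.today()
    'retries': 5,
    'retry_delay': timedelta(minutes=1),
    'sla': timedelta(hours=6),           # flag tasks not done within 6h of the scheduled run (~6:00am)
}

dag = DAG(
    dag_id='movie_dwh_dag',
    default_args=default_args,
    description='Load and transform data in Redshift Data Warehouse with Airflow',
    schedule_interval='0 0 * * *',       # run daily at midnight instead of @once
    catchup=False,                       # avoid backfilling old runs unless explicitly needed
)
```

Switching to the CeleryExecutor, as discussed above, is a deployment change (executor setting and worker containers) rather than a DAG change, so it is not shown here.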
78 | 79 | ## Development 80 | --- 81 | 82 | Setting up 83 | * Build or pull the Docker image 84 | ```bash 85 | # Build the docker image 86 | docker build -t {IMAGE_NAME} . 87 | ``` 88 | * Alternatively, pull the docker image I have pushed to Dockerhub: 89 | ```bash 90 | # Pull the docker image 91 | docker pull alanchn31/alanchn31-capstone-udacity-de-nd:1 92 | ``` 93 | 94 | * Replace the webserver image in docker-compose-LocalExecutor.yml with the name of the Docker image you have built/pulled. 95 | ```yml 96 | webserver: 97 | image: alanchn31/alanchn31-capstone-udacity-de-nd:1 98 | ``` 99 | 100 | * Run `docker-compose -f docker-compose-LocalExecutor.yml up -d`. Your Airflow server should be initiated and will be up and running. Visit `http://{your ec2 ip address}:8080` to view the Airflow UI. 101 | 102 | Add the necessary connections and variables in the Airflow UI. 103 | There are 4 variables to be defined: 104 | 1. `movie_s3_config`. It is defined in JSON format (using the keys read in `dags/movies_dwh_dag.py`) as follows: 105 | ``` 106 | { 107 | "awsKey": "{AWS_KEY}", 108 | "awsSecretKey": "{AWS_SECRET_KEY}", 109 | "s3Bucket": "{AWS_S3_BUCKET}", 110 | "s3Key": "{AWS_S3_KEY}" 111 | } 112 | ``` 113 | * The AWS_S3_BUCKET is the S3 bucket with S3_KEY (folder) containing the csv files: 114 | 1. credits.csv (from Kaggle Movielens dataset) 115 | 2. links.csv (from Kaggle Movielens dataset) 116 | 3. movies_metadata.csv (from Kaggle Movielens dataset) 117 | 4. ratings.csv (from Kaggle Movielens dataset) 118 | 5. consumer_price_index.csv (from Fred St Louis dataset) 119 | 120 | 2. `redshift_db_user` (user name of the user with access to the Redshift database) 121 | 3. `redshift_db_pass` (password of the user with access to the Redshift database) 122 | 4. `redshift_conn_string` (Redshift JDBC connection string used by Spark to write dataframes to Redshift) 123 | 124 | In addition, define the Hook to connect to Redshift: 125 | 126 | Conn Id: `redshift`. 127 | Conn Type: `Postgres`. 128 | Host: Enter the endpoint of your Redshift cluster, excluding the port at the end. 129 | Schema: This is the Redshift database you want to connect to. 130 | Login: Enter the Redshift user. 131 | Password: Enter the Redshift password. 132 | Port: Enter `5439`. 133 | 134 | After configuring, visit the Airflow UI and enable the DAG to start the data pipeline. 135 | 136 | ## Acknowledgements 137 | --- 138 | Many thanks to: 139 | * Udacity - for providing the project template and points of consideration :clap: 140 | * Rounak Banik - for providing me with the dataset I extracted from Kaggle and used :clap: 141 | * St Louis Fred - for providing me with the consumer price index data :clap: -------------------------------------------------------------------------------- /config/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The home folder for airflow, default is ~/airflow 3 | airflow_home = /usr/local/airflow 4 | 5 | # The folder where your airflow pipelines live, most likely a 6 | # subfolder in a code repository 7 | # This path must be absolute 8 | dags_folder = /usr/local/airflow/dags 9 | 10 | # The folder where airflow should store its log files 11 | # This path must be absolute 12 | base_log_folder = /usr/local/airflow/logs 13 | 14 | # Airflow can store logs remotely in AWS S3 or Google Cloud Storage. Users 15 | # must supply a remote location URL (starting with either 's3://...' or 16 | # 'gs://...') and an Airflow connection id that provides access to the storage 17 | # location.
18 | remote_base_log_folder = 19 | remote_log_conn_id = 20 | # Use server-side encryption for logs stored in S3 21 | encrypt_s3_logs = False 22 | # DEPRECATED option for remote log storage, use remote_base_log_folder instead! 23 | s3_log_folder = 24 | 25 | # The executor class that airflow should use. Choices include 26 | # SequentialExecutor, LocalExecutor, CeleryExecutor 27 | executor = CeleryExecutor 28 | 29 | # The SqlAlchemy connection string to the metadata database. 30 | # SqlAlchemy supports many different database engine, more information 31 | # their website 32 | sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow 33 | 34 | # The SqlAlchemy pool size is the maximum number of database connections 35 | # in the pool. 36 | sql_alchemy_pool_size = 5 37 | 38 | # The SqlAlchemy pool recycle is the number of seconds a connection 39 | # can be idle in the pool before it is invalidated. This config does 40 | # not apply to sqlite. 41 | sql_alchemy_pool_recycle = 3600 42 | 43 | # The amount of parallelism as a setting to the executor. This defines 44 | # the max number of task instances that should run simultaneously 45 | # on this airflow installation 46 | parallelism = 32 47 | 48 | # The number of task instances allowed to run concurrently by the scheduler 49 | dag_concurrency = 16 50 | 51 | # Are DAGs paused by default at creation 52 | dags_are_paused_at_creation = True 53 | 54 | # When not using pools, tasks are run in the "default pool", 55 | # whose size is guided by this config element 56 | non_pooled_task_slot_count = 128 57 | 58 | # The maximum number of active DAG runs per DAG 59 | max_active_runs_per_dag = 16 60 | 61 | # Whether to load the examples that ship with Airflow. It's good to 62 | # get started, but you probably want to set this to False in a production 63 | # environment 64 | load_examples = True 65 | 66 | # Where your Airflow plugins are stored 67 | plugins_folder = /usr/local/airflow/plugins 68 | 69 | # Secret key to save connection passwords in the db 70 | fernet_key = $FERNET_KEY 71 | 72 | # Whether to disable pickling dags 73 | donot_pickle = False 74 | 75 | # How long before timing out a python file import while filling the DagBag 76 | dagbag_import_timeout = 30 77 | 78 | # The class to use for running task instances in a subprocess 79 | task_runner = BashTaskRunner 80 | 81 | # If set, tasks without a `run_as_user` argument will be run with this user 82 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 83 | default_impersonation = 84 | 85 | # What security module to use (for example kerberos): 86 | security = 87 | 88 | # Turn unit test mode on (overwrites many configuration options with test 89 | # values at runtime) 90 | unit_test_mode = False 91 | 92 | [cli] 93 | # In what way should the cli access the API. 
The LocalClient will use the 94 | # database directly, while the json_client will use the api running on the 95 | # webserver 96 | api_client = airflow.api.client.local_client 97 | endpoint_url = http://localhost:8080 98 | 99 | [api] 100 | # How to authenticate users of the API 101 | auth_backend = airflow.api.auth.backend.default 102 | 103 | [operators] 104 | # The default owner assigned to each new operator, unless 105 | # provided explicitly or passed via `default_args` 106 | default_owner = Airflow 107 | default_cpus = 1 108 | default_ram = 512 109 | default_disk = 512 110 | default_gpus = 0 111 | 112 | [webserver] 113 | # The base url of your website as airflow cannot guess what domain or 114 | # cname you are using. This is used in automated emails that 115 | # airflow sends to point links to the right web server 116 | base_url = http://localhost:8080 117 | 118 | # The ip specified when starting the web server 119 | web_server_host = 0.0.0.0 120 | 121 | # The port on which to run the web server 122 | web_server_port = 8080 123 | 124 | # Paths to the SSL certificate and key for the web server. When both are 125 | # provided SSL will be enabled. This does not change the web server port. 126 | web_server_ssl_cert = 127 | web_server_ssl_key = 128 | 129 | # Number of seconds the gunicorn webserver waits before timing out on a worker 130 | web_server_worker_timeout = 120 131 | 132 | # Number of workers to refresh at a time. When set to 0, worker refresh is 133 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 134 | # bringing up new ones and killing old ones. 135 | worker_refresh_batch_size = 1 136 | 137 | # Number of seconds to wait before refreshing a batch of workers. 138 | worker_refresh_interval = 30 139 | 140 | # Secret key used to run your flask app 141 | secret_key = temporary_key 142 | 143 | # Number of workers to run the Gunicorn web server 144 | workers = 4 145 | 146 | # The worker class gunicorn should use. Choices include 147 | # sync (default), eventlet, gevent 148 | worker_class = sync 149 | 150 | # Log files for the gunicorn webserver. '-' means log to stderr. 151 | access_logfile = - 152 | error_logfile = - 153 | 154 | # Expose the configuration file in the web server 155 | expose_config = True 156 | 157 | # Set to true to turn on authentication: 158 | # http://pythonhosted.org/airflow/security.html#web-authentication 159 | authenticate = False 160 | 161 | # Filter the list of dags by owner name (requires authentication to be enabled) 162 | filter_by_owner = False 163 | 164 | # Filtering mode. Choices include user (default) and ldapgroup. 165 | # Ldap group filtering requires using the ldap backend 166 | # 167 | # Note that the ldap server needs the "memberOf" overlay to be set up 168 | # in order to user the ldapgroup mode. 169 | owner_mode = user 170 | 171 | # Default DAG orientation. Valid values are: 172 | # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top) 173 | dag_orientation = LR 174 | 175 | # Puts the webserver in demonstration mode; blurs the names of Operators for 176 | # privacy. 177 | demo_mode = False 178 | 179 | # The amount of time (in secs) webserver will wait for initial handshake 180 | # while fetching logs from other worker machine 181 | log_fetch_timeout_sec = 5 182 | 183 | # By default, the webserver shows paused DAGs. 
Flip this to hide paused 184 | # DAGs by default 185 | hide_paused_dags_by_default = False 186 | 187 | [email] 188 | email_backend = airflow.utils.email.send_email_smtp 189 | 190 | [smtp] 191 | # If you want airflow to send emails on retries, failure, and you want to use 192 | # the airflow.utils.email.send_email_smtp function, you have to configure an 193 | # smtp server here 194 | smtp_host = localhost 195 | smtp_starttls = True 196 | smtp_ssl = False 197 | # Uncomment and set the user/pass settings if you want to use SMTP AUTH 198 | # smtp_user = airflow 199 | # smtp_password = airflow 200 | smtp_port = 25 201 | smtp_mail_from = airflow@airflow.com 202 | 203 | [celery] 204 | # This section only applies if you are using the CeleryExecutor in 205 | # [core] section above 206 | 207 | # The app name that will be used by celery 208 | celery_app_name = airflow.executors.celery_executor 209 | 210 | # The concurrency that will be used when starting workers with the 211 | # "airflow worker" command. This defines the number of task instances that 212 | # a worker will take, so size up your workers based on the resources on 213 | # your worker box and the nature of your tasks 214 | celeryd_concurrency = 16 215 | 216 | # When you start an airflow worker, airflow starts a tiny web server 217 | # subprocess to serve the workers local log files to the airflow main 218 | # web server, who then builds pages and sends them to users. This defines 219 | # the port on which the logs are served. It needs to be unused, and open 220 | # visible from the main web server to connect into the workers. 221 | worker_log_server_port = 8793 222 | 223 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 224 | # a sqlalchemy database. Refer to the Celery documentation for more 225 | # information. 226 | broker_url = redis://redis:6379/1 227 | 228 | # Another key Celery setting 229 | celery_result_backend = db+postgresql://airflow:airflow@postgres/airflow 230 | 231 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 232 | # it `airflow flower`. This defines the IP that Celery Flower runs on 233 | flower_host = 0.0.0.0 234 | 235 | # This defines the port that Celery Flower runs on 236 | flower_port = 5555 237 | 238 | # Default queue that tasks get assigned to and that worker listen on. 239 | default_queue = default 240 | 241 | [scheduler] 242 | # Task instances listen for external kill signal (when you clear tasks 243 | # from the CLI or the UI), this defines the frequency at which they should 244 | # listen (in seconds). 245 | job_heartbeat_sec = 5 246 | 247 | # The scheduler constantly tries to trigger new tasks (look at the 248 | # scheduler section in the docs for more information). This defines 249 | # how often the scheduler should run (in seconds). 250 | scheduler_heartbeat_sec = 5 251 | 252 | # after how much time should the scheduler terminate in seconds 253 | # -1 indicates to run continuously (see also num_runs) 254 | run_duration = -1 255 | 256 | # after how much time a new DAGs should be picked up from the filesystem 257 | min_file_process_interval = 0 258 | 259 | dag_dir_list_interval = 300 260 | 261 | # How often should stats be printed to the logs 262 | print_stats_interval = 30 263 | 264 | child_process_log_directory = /usr/local/airflow/logs/scheduler 265 | 266 | # Local task jobs periodically heartbeat to the DB. 
If the job has 267 | # not heartbeat in this many seconds, the scheduler will mark the 268 | # associated task instance as failed and will re-schedule the task. 269 | scheduler_zombie_task_threshold = 300 270 | 271 | # Turn off scheduler catchup by setting this to False. 272 | # Default behavior is unchanged and 273 | # Command Line Backfills still work, but the scheduler 274 | # will not do scheduler catchup if this is False, 275 | # however it can be set on a per DAG basis in the 276 | # DAG definition (catchup) 277 | catchup_by_default = True 278 | 279 | # Statsd (https://github.com/etsy/statsd) integration settings 280 | statsd_on = False 281 | statsd_host = localhost 282 | statsd_port = 8125 283 | statsd_prefix = airflow 284 | 285 | # The scheduler can run multiple threads in parallel to schedule dags. 286 | # This defines how many threads will run. However airflow will never 287 | # use more threads than the amount of cpu cores available. 288 | max_threads = 2 289 | 290 | authenticate = False 291 | 292 | [mesos] 293 | # Mesos master address which MesosExecutor will connect to. 294 | master = localhost:5050 295 | 296 | # The framework name which Airflow scheduler will register itself as on mesos 297 | framework_name = Airflow 298 | 299 | # Number of cpu cores required for running one task instance using 300 | # 'airflow run --local -p ' 301 | # command on a mesos slave 302 | task_cpu = 1 303 | 304 | # Memory in MB required for running one task instance using 305 | # 'airflow run --local -p ' 306 | # command on a mesos slave 307 | task_memory = 256 308 | 309 | # Enable framework checkpointing for mesos 310 | # See http://mesos.apache.org/documentation/latest/slave-recovery/ 311 | checkpoint = False 312 | 313 | # Failover timeout in milliseconds. 314 | # When checkpointing is enabled and this option is set, Mesos waits 315 | # until the configured timeout for 316 | # the MesosExecutor framework to re-register after a failover. Mesos 317 | # shuts down running tasks if the 318 | # MesosExecutor framework fails to re-register within this timeframe. 
319 | # failover_timeout = 604800 320 | 321 | # Enable framework authentication for mesos 322 | # See http://mesos.apache.org/documentation/latest/configuration/ 323 | authenticate = False 324 | 325 | # Mesos credentials, if authentication is enabled 326 | # default_principal = admin 327 | # default_secret = admin 328 | 329 | [kerberos] 330 | ccache = /tmp/airflow_krb5_ccache 331 | # gets augmented with fqdn 332 | principal = airflow 333 | reinit_frequency = 3600 334 | kinit_path = kinit 335 | keytab = airflow.keytab 336 | 337 | [github_enterprise] 338 | api_rev = v3 339 | 340 | [admin] 341 | # UI to hide sensitive variable fields when set to True 342 | hide_sensitive_variable_fields = True -------------------------------------------------------------------------------- /dags/bash_scripts/load_staging_table.sh: -------------------------------------------------------------------------------- 1 | spark-submit --driver-class-path $SPARK_HOME/jars/RedshiftJDBC42-no-awssdk-1.2.43.1067.jar \ 2 | --jars $SPARK_HOME/jars/RedshiftJDBC42-no-awssdk-1.2.43.1067.jar \ 3 | $AIRFLOW_HOME/dags/python_scripts/{{params.python_script}} {{ params.s3_bucket }} {{ params.s3_key }} \ 4 | {{ params.aws_key }} {{ params.aws_secret_key }} {{ params.redshift_conn_string }} \ 5 | {{ params.db_user }} {{params.db_pass}} --conf "fs.s3a.multipart.size=104857600" -------------------------------------------------------------------------------- /dags/movies_dwh_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.bash_operator import BashOperator 3 | from airflow.operators.dummy_operator import DummyOperator 4 | from airflow.operators.postgres_operator import PostgresOperator 5 | from airflow.operators.movies_plugin import DataQualityOperator 6 | from airflow.models import Variable 7 | from datetime import datetime, timedelta 8 | import os 9 | 10 | # Define configured variables to connect to AWS S3 and Redshift 11 | movie_s3_config = Variable.get("movie_s3_config", deserialize_json=True) 12 | 13 | # Parameters that are reused when submitting spark job to load staging tables 14 | params = {'aws_key': movie_s3_config["awsKey"], 15 | 'aws_secret_key': movie_s3_config["awsSecretKey"], 16 | 'db_user': Variable.get("redshift_db_user"), 17 | 'db_pass': Variable.get("redshift_db_pass"), 18 | 'redshift_conn_string': Variable.get("redshift_conn_string"), 19 | 's3_bucket': movie_s3_config["s3Bucket"], 20 | 's3_key': movie_s3_config["s3Key"] 21 | } 22 | 23 | # Default settings for DAG 24 | default_args = { 25 | 'owner': 'Alan', 26 | 'depends_on_past': False, 27 | 'start_date': datetime.today(), 28 | 'retries': 5, 29 | 'retry_delay': timedelta(minutes=1), 30 | } 31 | 32 | ## Define the DAG object 33 | with DAG(dag_id='movie_dwh_dag', default_args=default_args, 34 | description='Load and transform data in Redshift \ 35 | Data Warehouse with Airflow', 36 | schedule_interval='@once') as dag: 37 | 38 | start_operator = DummyOperator(task_id='begin-execution', dag=dag) 39 | 40 | # Create tables in movies schema 41 | create_tables = PostgresOperator(task_id='create-tables', postgres_conn_id="redshift", 42 | sql="sql_scripts/create_tables.sql", dag=dag) 43 | 44 | # Load stage_ratings data table 45 | params['python_script'] = 'load_staging_ratings.py' 46 | load_staging_ratings = BashOperator(task_id='load-staging-ratings', 47 | bash_command= './bash_scripts/load_staging_table.sh', 48 | params=params, 49 | dag=dag) 50 | 51 | # Load stage_movies data table 
52 | params['python_script'] = 'load_staging_movies.py' 53 | load_staging_movies = BashOperator(task_id='load-staging-movies', 54 | bash_command= './bash_scripts/load_staging_table.sh', 55 | params=params, 56 | dag=dag) 57 | 58 | # Load stage_cpi data table 59 | params['python_script'] = 'load_staging_cpi.py' 60 | load_staging_cpi = BashOperator(task_id='load-staging-cpi', 61 | bash_command= './bash_scripts/load_staging_table.sh', 62 | params=params, 63 | dag=dag) 64 | 65 | # Load stage_genre data table 66 | params['python_script'] = 'load_staging_genre.py' 67 | load_staging_genre = BashOperator(task_id='load-staging-genre', 68 | bash_command= './bash_scripts/load_staging_table.sh', 69 | params=params, 70 | dag=dag) 71 | 72 | # Load stage_date data table 73 | params['python_script'] = 'load_staging_date.py' 74 | load_staging_date = BashOperator(task_id='load-staging-date', 75 | bash_command= './bash_scripts/load_staging_table.sh', 76 | params=params, 77 | dag=dag) 78 | 79 | # Run upsert on tables and delete staging tables 80 | upsert_ratings = PostgresOperator(task_id='upsert-ratings-table', postgres_conn_id="redshift", 81 | sql="sql_scripts/upsert_ratings.sql", dag=dag) 82 | 83 | upsert_movies = PostgresOperator(task_id='upsert-movies-table', postgres_conn_id="redshift", 84 | sql="sql_scripts/upsert_movies.sql", dag=dag) 85 | 86 | upsert_cpi = PostgresOperator(task_id='upsert-staging-cpi', postgres_conn_id="redshift", 87 | sql='sql_scripts/upsert_cpi.sql', dag=dag) 88 | 89 | upsert_date = PostgresOperator(task_id='upsert-staging-date', postgres_conn_id="redshift", 90 | sql='sql_scripts/upsert_date.sql', dag=dag) 91 | 92 | upsert_genre = PostgresOperator(task_id='upsert-staging-genre', postgres_conn_id="redshift", 93 | sql='sql_scripts/upsert_genre.sql', dag=dag) 94 | 95 | # Check for quality issues in ingested data 96 | tables = ["movies.movies", "movies.ratings", "movies.movie_genre", 97 | "movies.genre", "movies.date", "movies.cpi"] 98 | check_data_quality = DataQualityOperator(task_id='run_data_quality_checks', 99 | redshift_conn_id="redshift", 100 | table_names=tables, 101 | dag=dag) 102 | 103 | # Define data pipeline DAG structure 104 | start_operator >> create_tables 105 | create_tables >> [load_staging_ratings, load_staging_movies, load_staging_cpi, load_staging_date, load_staging_genre] 106 | load_staging_ratings >> upsert_ratings 107 | load_staging_movies >> upsert_movies 108 | load_staging_cpi >> upsert_cpi 109 | load_staging_date >> upsert_date 110 | load_staging_genre >> upsert_genre 111 | [upsert_cpi, upsert_ratings, upsert_movies, upsert_date, upsert_genre] >> check_data_quality -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_cpi.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, 7 | DateType as Date, FloatType as Float) 8 | from pyspark.sql.functions import col 9 | 10 | 11 | def create_spark_session(aws_key, aws_secret_key): 12 | """ 13 | Description: Creates spark session. 
14 | Returns: 15 | spark session object 16 | """ 17 | 18 | spark = SparkSession \ 19 | .builder \ 20 | .config("spark.executor.heartbeatInterval", "40s") \ 21 | .getOrCreate() 22 | 23 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 24 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 25 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 27 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 30 | return spark 31 | 32 | 33 | if __name__ == "__main__": 34 | s3_bucket = sys.argv[1] 35 | s3_key = sys.argv[2] 36 | aws_key = sys.argv[3] 37 | aws_secret_key = sys.argv[4] 38 | redshift_conn_string = sys.argv[5] 39 | db_user = sys.argv[6] 40 | db_pass = sys.argv[7] 41 | 42 | spark = create_spark_session(aws_key, aws_secret_key) 43 | 44 | cpi_schema = StructType([ 45 | Fld("DATE", Date()), 46 | Fld("CUSR0000SS62031", Float()) 47 | ]) 48 | 49 | cpi_df = spark.read.option("header", "true") \ 50 | .csv("s3a://{}/{}/consumer_price_index.csv".format(s3_bucket, s3_key), 51 | schema=cpi_schema) 52 | 53 | cpi_df = cpi_df.select(col("DATE").alias("date_cd"), 54 | col("CUSR0000SS62031").alias("consumer_price_index")) 55 | 56 | cpi_df = cpi_df.filter(cpi_df.date_cd.isNotNull()) 57 | 58 | cpi_df.write \ 59 | .format("jdbc") \ 60 | .option("url", redshift_conn_string) \ 61 | .option("dbtable", "movies.stage_cpi") \ 62 | .option("user", sys.argv[6]) \ 63 | .option("password", sys.argv[7]) \ 64 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 65 | .mode("append") \ 66 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_date.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, DateType as Date, 8 | BooleanType as Boolean, FloatType as Float, 9 | LongType as Long, StringType as String, 10 | ArrayType as Array) 11 | from pyspark.sql.functions import (col, year, month, dayofmonth, weekofyear, quarter) 12 | 13 | 14 | def create_spark_session(aws_key, aws_secret_key): 15 | """ 16 | Description: Creates spark session. 
17 | Returns: 18 | spark session object 19 | """ 20 | 21 | spark = SparkSession \ 22 | .builder \ 23 | .config("spark.executor.heartbeatInterval", "40s") \ 24 | .getOrCreate() 25 | 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 27 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 31 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 32 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 33 | return spark 34 | 35 | 36 | def format_datetime(ts): 37 | return datetime.fromtimestamp(ts/1000.0) 38 | 39 | if __name__ == "__main__": 40 | s3_bucket = sys.argv[1] 41 | s3_key = sys.argv[2] 42 | aws_key = sys.argv[3] 43 | aws_secret_key = sys.argv[4] 44 | redshift_conn_string = sys.argv[5] 45 | db_user = sys.argv[6] 46 | db_pass = sys.argv[7] 47 | 48 | spark = create_spark_session(aws_key, aws_secret_key) 49 | 50 | movies_schema = StructType([ 51 | Fld("adult", String()), 52 | Fld("belongs_to_collection", Long()), 53 | Fld("budget", Long()), 54 | Fld("genres", String()), 55 | Fld("homepage", String()), 56 | Fld("id", Int()), 57 | Fld("imdb_id", String()), 58 | Fld("original_language", String()), 59 | Fld("original_title", String()), 60 | Fld("overview", String()), 61 | Fld("popularity", Dbl()), 62 | Fld("poster_path", String()), 63 | Fld("production_company", String()), 64 | Fld("production_country", String()), 65 | Fld("release_date", Date()), 66 | Fld("revenue", Long()), 67 | Fld("runtime", Float()), 68 | Fld("spoken_languages", String()), 69 | Fld("status", String()), 70 | Fld("tagline", String()), 71 | Fld("title", String()), 72 | Fld("video", Boolean()), 73 | Fld("vote_average", Float()), 74 | Fld("vote_count", Int()) 75 | ]) 76 | 77 | 78 | movies_df = spark.read.option("header", "true") \ 79 | .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key), 80 | schema=movies_schema) 81 | 82 | movies_df = movies_df.na.drop() 83 | 84 | # extract columns to create time table 85 | date_table = movies_df.select( 86 | col('release_date'), 87 | dayofmonth("release_date").alias('day'), 88 | weekofyear("release_date").alias('week'), 89 | month("release_date").alias('month'), 90 | quarter("release_date").alias('quarter'), 91 | year("release_date").alias('year') 92 | ).dropDuplicates() 93 | 94 | date_table.write \ 95 | .format("jdbc") \ 96 | .option("url", redshift_conn_string) \ 97 | .option("dbtable", "movies.stage_date") \ 98 | .option("user", sys.argv[6]) \ 99 | .option("password", sys.argv[7]) \ 100 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 101 | .mode("append") \ 102 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_genre.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, DateType as Date, 8 | BooleanType as Boolean, FloatType as Float, 9 | LongType as Long, StringType as String, 10 | ArrayType as Array) 11 | from pyspark.sql.functions import (col, year, month, 
dayofmonth, weekofyear, quarter, 12 | explode, from_json) 13 | 14 | 15 | def create_spark_session(aws_key, aws_secret_key): 16 | """ 17 | Description: Creates spark session. 18 | Returns: 19 | spark session object 20 | """ 21 | 22 | spark = SparkSession \ 23 | .builder \ 24 | .config("spark.executor.heartbeatInterval", "40s") \ 25 | .getOrCreate() 26 | 27 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 28 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 31 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 32 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 33 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 34 | spark.conf.set("spark.sql.shuffle.partitions", 4) 35 | return spark 36 | 37 | 38 | def format_datetime(ts): 39 | return datetime.fromtimestamp(ts/1000.0) 40 | 41 | if __name__ == "__main__": 42 | s3_bucket = sys.argv[1] 43 | s3_key = sys.argv[2] 44 | aws_key = sys.argv[3] 45 | aws_secret_key = sys.argv[4] 46 | redshift_conn_string = sys.argv[5] 47 | db_user = sys.argv[6] 48 | db_pass = sys.argv[7] 49 | 50 | spark = create_spark_session(aws_key, aws_secret_key) 51 | 52 | movies_schema = StructType([ 53 | Fld("adult", String()), 54 | Fld("belongs_to_collection", Long()), 55 | Fld("budget", Long()), 56 | Fld("genres", String()), 57 | Fld("homepage", String()), 58 | Fld("id", Int()), 59 | Fld("imdb_id", String()), 60 | Fld("original_language", String()), 61 | Fld("original_title", String()), 62 | Fld("overview", String()), 63 | Fld("popularity", Dbl()), 64 | Fld("poster_path", String()), 65 | Fld("production_company", String()), 66 | Fld("production_country", String()), 67 | Fld("release_date", Date()), 68 | Fld("revenue", Long()), 69 | Fld("runtime", Float()), 70 | Fld("spoken_languages", String()), 71 | Fld("status", String()), 72 | Fld("tagline", String()), 73 | Fld("title", String()), 74 | Fld("video", Boolean()), 75 | Fld("vote_average", Float()), 76 | Fld("vote_count", Int()) 77 | ]) 78 | 79 | 80 | movies_df = spark.read.option("header", "true") \ 81 | .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key), 82 | schema=movies_schema) 83 | 84 | genre_schema = Array(StructType([Fld("id", Int()), Fld("name", String())])) 85 | 86 | movies_df = movies_df.withColumn("genres", explode(from_json("genres", genre_schema))) \ 87 | .withColumn("genre_id", col("genres.id")) \ 88 | .withColumn("genre_name", col("genres.name")) \ 89 | 90 | movie_genre = movies_df.select("id", "genre_id").distinct() 91 | movie_genre = movie_genre.select(col("id").alias("movie_id"), col("genre_id")) 92 | 93 | genre = movies_df.select("genre_id", "genre_name").distinct() 94 | genre = genre.na.drop() 95 | 96 | # Load data into staging: 97 | genre.write \ 98 | .format("jdbc") \ 99 | .option("url", redshift_conn_string) \ 100 | .option("dbtable", "movies.stage_genre") \ 101 | .option("user", sys.argv[6]) \ 102 | .option("password", sys.argv[7]) \ 103 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 104 | .mode("append") \ 105 | .save() 106 | 107 | movie_genre.write \ 108 | .format("jdbc") \ 109 | .option("url", redshift_conn_string) \ 110 | .option("dbtable", "movies.stage_movie_genre") \ 111 | .option("user", sys.argv[6]) \ 112 | .option("password", sys.argv[7]) \ 113 | .option("driver", 
"com.amazon.redshift.jdbc42.Driver") \ 114 | .mode("append") \ 115 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_movies.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, DateType as Date, 8 | BooleanType as Boolean, FloatType as Float, 9 | LongType as Long, StringType as String, 10 | ArrayType as Array) 11 | from pyspark.sql.functions import col 12 | 13 | 14 | def create_spark_session(aws_key, aws_secret_key): 15 | """ 16 | Description: Creates spark session. 17 | Returns: 18 | spark session object 19 | """ 20 | 21 | spark = SparkSession \ 22 | .builder \ 23 | .config("spark.executor.heartbeatInterval", "40s") \ 24 | .getOrCreate() 25 | 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 27 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 31 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 32 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 33 | return spark 34 | 35 | 36 | def format_datetime(ts): 37 | return datetime.fromtimestamp(ts/1000.0) 38 | 39 | if __name__ == "__main__": 40 | s3_bucket = sys.argv[1] 41 | s3_key = sys.argv[2] 42 | aws_key = sys.argv[3] 43 | aws_secret_key = sys.argv[4] 44 | redshift_conn_string = sys.argv[5] 45 | db_user = sys.argv[6] 46 | db_pass = sys.argv[7] 47 | 48 | spark = create_spark_session(aws_key, aws_secret_key) 49 | 50 | movies_schema = StructType([ 51 | Fld("adult", String()), 52 | Fld("belongs_to_collection", Long()), 53 | Fld("budget", Long()), 54 | Fld("genres", String()), 55 | Fld("homepage", String()), 56 | Fld("id", Int()), 57 | Fld("imdb_id", String()), 58 | Fld("original_language", String()), 59 | Fld("original_title", String()), 60 | Fld("overview", String()), 61 | Fld("popularity", Dbl()), 62 | Fld("poster_path", String()), 63 | Fld("production_companies", String()), 64 | Fld("production_countries", String()), 65 | Fld("release_date", Date()), 66 | Fld("revenue", Long()), 67 | Fld("runtime", Float()), 68 | Fld("spoken_languages", String()), 69 | Fld("status", String()), 70 | Fld("tagline", String()), 71 | Fld("title", String()), 72 | Fld("video", Boolean()), 73 | Fld("vote_average", Float()), 74 | Fld("vote_count", Int()) 75 | ]) 76 | 77 | 78 | movies_df = spark.read.option("header", "true") \ 79 | .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key), 80 | schema=movies_schema) 81 | 82 | 83 | movies_df = movies_df.select( 84 | col("id").alias("movie_id"), 85 | col("adult").alias("is_adult"), 86 | col("budget"), 87 | col("original_language"), 88 | col("title"), 89 | col("popularity"), 90 | col("release_date"), 91 | col("revenue"), 92 | col("vote_count"), 93 | col("vote_average") 94 | ) 95 | 96 | movies_df = movies_df.na.drop() 97 | 98 | movies_df.write \ 99 | .format("jdbc") \ 100 | .option("url", redshift_conn_string) \ 101 | .option("dbtable", "movies.stage_movies") \ 102 | .option("user", sys.argv[6]) \ 
103 | .option("password", sys.argv[7]) \ 104 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 105 | .mode("append") \ 106 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_ratings.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, TimestampType as Timestamp, 8 | DateType as Date) 9 | from pyspark.sql.functions import col 10 | 11 | 12 | def create_spark_session(aws_key, aws_secret_key): 13 | """ 14 | Description: Creates spark session. 15 | Returns: 16 | spark session object 17 | """ 18 | 19 | spark = SparkSession \ 20 | .builder \ 21 | .config("spark.executor.heartbeatInterval", "40s") \ 22 | .getOrCreate() 23 | 24 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 25 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 27 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 31 | return spark 32 | 33 | if __name__ == "__main__": 34 | s3_bucket = sys.argv[1] 35 | s3_key = sys.argv[2] 36 | aws_key = sys.argv[3] 37 | aws_secret_key = sys.argv[4] 38 | redshift_conn_string = sys.argv[5] 39 | db_user = sys.argv[6] 40 | db_pass = sys.argv[7] 41 | 42 | spark = create_spark_session(aws_key, aws_secret_key) 43 | 44 | ratings_schema = StructType([ 45 | Fld("userId", Int()), 46 | Fld("movieId", Int()), 47 | Fld("rating", Dbl()), 48 | Fld("timestamp", Timestamp()) 49 | ]) 50 | 51 | ratings_df = spark.read.option("header", "true") \ 52 | .csv("s3a://{}/{}/ratings.csv".format(s3_bucket, s3_key), 53 | schema=ratings_schema) 54 | 55 | ratings_df = ratings_df.select( 56 | col("userId").alias("user_id"), 57 | col("movieId").alias("movie_id"), 58 | col("rating") 59 | ) 60 | 61 | ratings_df.write \ 62 | .format("jdbc") \ 63 | .option("url", redshift_conn_string) \ 64 | .option("dbtable", "movies.stage_ratings") \ 65 | .option("user", sys.argv[6]) \ 66 | .option("password", sys.argv[7]) \ 67 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 68 | .mode("append") \ 69 | .save() 70 | -------------------------------------------------------------------------------- /dags/sql_scripts/create_tables.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | CREATE SCHEMA IF NOT EXISTS movies; 4 | 5 | 6 | CREATE TABLE IF NOT EXISTS movies.stage_ratings ( 7 | user_movie_id BIGINT IDENTITY(0,1), 8 | user_id INTEGER NOT NULL, 9 | movie_id INTEGER NOT NULL, 10 | rating NUMERIC, 11 | primary key (user_movie_id) 12 | ) diststyle key distkey(movie_id); 13 | 14 | CREATE TABLE IF NOT EXISTS movies.ratings ( 15 | user_movie_id INT IDENTITY(0,1), 16 | user_id INTEGER NOT NULL, 17 | movie_id INTEGER NOT NULL, 18 | rating NUMERIC, 19 | primary key (user_movie_id) 20 | ) diststyle key distkey(movie_id); 21 | 22 | CREATE TABLE IF NOT EXISTS movies.stage_movies ( 23 | movie_id INT NOT NULL, 24 | is_adult 
VARCHAR(5) NOT NULL, 25 | budget BIGINT NOT NULL, 26 | original_language CHAR(2) NOT NULL, 27 | title VARCHAR(300) NOT NULL, 28 | popularity FLOAT, 29 | release_date DATE NOT NULL, 30 | revenue BIGINT NOT NULL SORTKEY, 31 | vote_count INT, 32 | vote_average FLOAT, 33 | primary key (movie_id) 34 | ) diststyle key distkey(movie_id); 35 | 36 | CREATE TABLE IF NOT EXISTS movies.movies ( 37 | movie_id INT NOT NULL, 38 | is_adult VARCHAR(5) NOT NULL, 39 | budget BIGINT NOT NULL, 40 | original_language CHAR(2) NOT NULL, 41 | title VARCHAR(300) NOT NULL, 42 | popularity FLOAT, 43 | release_date DATE, 44 | revenue BIGINT NOT NULL SORTKEY, 45 | vote_count INT, 46 | vote_average FLOAT, 47 | primary key (movie_id) 48 | ) diststyle key distkey(movie_id); 49 | 50 | CREATE TABLE IF NOT EXISTS movies.stage_movie_genre ( 51 | movie_id INT NOT NULL, 52 | genre_id INT NOT NULL, 53 | primary key (movie_id, genre_id) 54 | ) 55 | diststyle key distkey(movie_id); 56 | 57 | CREATE TABLE IF NOT EXISTS movies.movie_genre ( 58 | movie_id INT NOT NULL, 59 | genre_id INT NOT NULL, 60 | primary key (movie_id, genre_id) 61 | ) 62 | diststyle key distkey(movie_id); 63 | 64 | CREATE TABLE IF NOT EXISTS movies.stage_genre ( 65 | genre_id INT NOT NULL, 66 | genre_name VARCHAR(300), 67 | primary key (genre_id) 68 | ) 69 | diststyle all; 70 | 71 | CREATE TABLE IF NOT EXISTS movies.genre ( 72 | genre_id INT NOT NULL, 73 | genre_name VARCHAR(300), 74 | primary key (genre_id) 75 | ) 76 | diststyle all; 77 | 78 | CREATE TABLE IF NOT EXISTS movies.stage_date ( 79 | release_date DATE NOT NULL SORTKEY, 80 | day INT, 81 | week INT, 82 | month INT, 83 | quarter INT, 84 | year INT, 85 | primary key (release_date) 86 | ) 87 | diststyle all; 88 | 89 | CREATE TABLE IF NOT EXISTS movies.date ( 90 | release_date DATE NOT NULL SORTKEY, 91 | day INT, 92 | week INT, 93 | month INT, 94 | quarter INT, 95 | year INT, 96 | primary key (release_date) 97 | ) 98 | diststyle all; 99 | 100 | CREATE TABLE IF NOT EXISTS movies.stage_cpi ( 101 | date_cd DATE NOT NULL SORTKEY, 102 | consumer_price_index FLOAT 103 | ) 104 | diststyle all; 105 | 106 | CREATE TABLE IF NOT EXISTS movies.cpi ( 107 | date_cd DATE NOT NULL SORTKEY, 108 | consumer_price_index FLOAT 109 | ) 110 | diststyle all; 111 | 112 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_cpi.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert cpi table 4 | UPDATE movies.cpi 5 | SET consumer_price_index = sc.consumer_price_index 6 | FROM movies.stage_cpi sc 7 | WHERE movies.cpi.date_cd= sc.date_cd; 8 | 9 | 10 | INSERT INTO movies.cpi 11 | SELECT sc.* FROM movies.stage_cpi sc LEFT JOIN movies.cpi 12 | ON sc.date_cd = movies.cpi.date_cd 13 | WHERE movies.cpi.date_cd IS NULL; 14 | 15 | DROP TABLE IF EXISTS movies.stage_cpi; 16 | 17 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_date.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert date table 4 | UPDATE movies.date 5 | SET day = md.day, week = md.week, month = md.month, 6 | quarter = md.quarter, year = md.year 7 | FROM movies.stage_date md 8 | WHERE movies.date.release_date = md.release_date; 9 | 10 | INSERT INTO movies.date 11 | SELECT md.* FROM movies.stage_date md LEFT JOIN movies.date 12 | ON md.release_date = movies.date.release_date 13 | WHERE movies.date.release_date IS NULL; 14 | 15 | DROP 
TABLE IF EXISTS movies.stage_date; 16 | 17 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_genre.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Update movie_genre table 4 | INSERT INTO movies.movie_genre 5 | SELECT mg.* FROM movies.stage_movie_genre mg LEFT JOIN movies.movie_genre 6 | ON mg.movie_id = movies.movie_genre.movie_id AND mg.genre_id = movies.movie_genre.genre_id 7 | WHERE movies.movie_genre.movie_id IS NULL; 8 | 9 | DROP TABLE movies.stage_movie_genre; 10 | 11 | -- Upsert genre table 12 | UPDATE movies.genre 13 | SET genre_name = mg.genre_name 14 | FROM movies.stage_genre mg 15 | WHERE movies.genre.genre_id = mg.genre_id; 16 | 17 | 18 | INSERT INTO movies.genre 19 | SELECT mg.* FROM movies.stage_genre mg LEFT JOIN movies.genre 20 | ON mg.genre_id = movies.genre.genre_id 21 | WHERE movies.genre.genre_id IS NULL; 22 | 23 | DROP TABLE movies.stage_genre; 24 | 25 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_movies.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert movies table 4 | UPDATE movies.movies 5 | SET is_adult = mm.is_adult, budget = mm.budget, original_language = mm.original_language, 6 | title = mm.title, popularity = mm.popularity, release_date = mm.release_date, 7 | revenue = mm.revenue, vote_count = mm.vote_count, vote_average = mm.vote_average 8 | FROM movies.stage_movies mm 9 | WHERE movies.movies.movie_id = mm.movie_id; 10 | 11 | 12 | INSERT INTO movies.movies 13 | SELECT mm.* FROM movies.stage_movies mm LEFT JOIN movies.movies 14 | ON mm.movie_id = movies.movies.movie_id 15 | WHERE movies.movies.movie_id IS NULL; 16 | 17 | DROP TABLE movies.stage_movies; 18 | 19 | END; 20 | 21 | -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_ratings.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert ratings table 4 | UPDATE movies.ratings 5 | SET rating = ms.rating 6 | FROM movies.stage_ratings ms 7 | WHERE movies.ratings.user_id = ms.user_id AND movies.ratings.movie_id = ms.movie_id; 8 | 9 | 10 | INSERT INTO movies.ratings 11 | SELECT ms.* FROM movies.stage_ratings ms LEFT JOIN movies.ratings 12 | ON ms.user_id = movies.ratings.user_id AND ms.movie_id = movies.ratings.movie_id 13 | WHERE movies.ratings.user_id IS NULL; 14 | 15 | DROP TABLE IF EXISTS movies.stage_ratings; 16 | 17 | END; -------------------------------------------------------------------------------- /docker-compose-LocalExecutor.yml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | logging: 10 | options: 11 | max-size: 10m 12 | max-file: "3" 13 | 14 | webserver: 15 | image: alanchn31/alanchn31-capstone-udacity-de-nd:1 16 | restart: always 17 | depends_on: 18 | - postgres 19 | environment: 20 | - LOAD_EX=n 21 | - EXECUTOR=Local 22 | - PYSPARK_PYTHON=/usr/bin/python3 23 | logging: 24 | options: 25 | max-size: 10m 26 | max-file: "3" 27 | volumes: 28 | - ./dags:/usr/local/airflow/dags 29 | - ./plugins:/usr/local/airflow/plugins 30 | ports: 31 | - "8080:8080" 32 | command: webserver 33 | healthcheck: 34 | test: ["CMD-SHELL", "[ -f
/usr/local/airflow/airflow-webserver.pid ]"] 35 | interval: 30s 36 | timeout: 30s 37 | retries: 3 -------------------------------------------------------------------------------- /documentation/Data Dictionary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/Data Dictionary.pdf -------------------------------------------------------------------------------- /documentation/README_images/architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/architecture.PNG -------------------------------------------------------------------------------- /documentation/README_images/dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/dag.PNG -------------------------------------------------------------------------------- /documentation/README_images/data_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/data_model.png -------------------------------------------------------------------------------- /documentation/README_images/logo.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/logo.PNG -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | 7 | # Defining the plugin class 8 | class MoviesPlugin(AirflowPlugin): 9 | name = "movies_plugin" 10 | operators = [ 11 | operators.DataQualityOperator 12 | ] -------------------------------------------------------------------------------- /plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.data_quality import DataQualityOperator 2 | 3 | __all__ = [ 4 | 'DataQualityOperator' 5 | ] -------------------------------------------------------------------------------- /plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class DataQualityOperator(BaseOperator): 6 | ui_color = '#89DA59' 7 | 8 | @apply_defaults 9 | def __init__(self, 10 | redshift_conn_id="", 11 | table_names=[""], 12 | *args, **kwargs): 13 | 14 | super(DataQualityOperator, self).__init__(*args, **kwargs) 15 | self.redshift_conn_id = redshift_conn_id 16 | self.table_names = table_names 17 | 18 | def execute(self, context): 19 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 20 | for table in self.table_names: 21 | # Check that entries are being copied to 
table 22 | records = redshift.get_records("SELECT COUNT(*) FROM {}".format(table)) 23 | if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1: 24 | raise ValueError("Data quality check failed. {} returned no results or contains no rows".format(table)) 25 | 26 | # Check that there are no rows with null ids 27 | dq_checks=[ 28 | {'table': 'movies.movies', 29 | 'check_sql': "SELECT COUNT(*) FROM movies.movies WHERE movie_id is null", 30 | 'expected_result': 0}, 31 | {'table': 'movies.genre', 32 | 'check_sql': "SELECT COUNT(*) FROM movies.genre WHERE genre_id is null", 33 | 'expected_result': 0}, 34 | {'table': 'movies.date', 35 | 'check_sql': "SELECT COUNT(*) FROM movies.date WHERE release_date is null", 36 | 'expected_result': 0}, 37 | {'table': 'movies.cpi', 38 | 'check_sql': "SELECT COUNT(*) FROM movies.cpi WHERE date_cd is null", 39 | 'expected_result': 0}, 40 | ] 41 | for check in dq_checks: 42 | records = redshift.get_records(check['check_sql']) 43 | if records[0][0] != check['expected_result']: 44 | print("Number of rows with null ids: ", records[0][0]) 45 | print("Expected number of rows with null ids: ", check['expected_result']) 46 | raise ValueError("Data quality check failed. {} contains null in id column".format(check['table'])) -------------------------------------------------------------------------------- /script/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | AIRFLOW_HOME="/usr/local/airflow" 4 | CMD="airflow" 5 | TRY_LOOP="20" 6 | 7 | : ${REDIS_HOST:="redis"} 8 | : ${REDIS_PORT:="6379"} 9 | 10 | : ${POSTGRES_HOST:="postgres"} 11 | : ${POSTGRES_PORT:="5432"} 12 | : ${POSTGRES_USER:="airflow"} 13 | : ${POSTGRES_PASSWORD:="airflow"} 14 | : ${POSTGRES_DB:="airflow"} 15 | 16 | : ${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")} 17 | 18 | # Load DAG examples (default: Yes) 19 | if [ "$LOAD_EX" = "n" ]; then 20 | sed -i "s/load_examples = True/load_examples = False/" "$AIRFLOW_HOME"/airflow.cfg 21 | fi 22 | 23 | # Install custom Python packages if requirements.txt is present 24 | if [ -e "/requirements.txt" ]; then 25 | $(which pip) install --user -r /requirements.txt 26 | fi 27 | 28 | # Update airflow config - Fernet key 29 | sed -i "s|\$FERNET_KEY|$FERNET_KEY|" "$AIRFLOW_HOME"/airflow.cfg 30 | 31 | # Wait for PostgreSQL 32 | if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; then 33 | i=0 34 | while ! nc -z $POSTGRES_HOST $POSTGRES_PORT >/dev/null 2>&1 < /dev/null; do 35 | i=$((i+1)) 36 | if [ "$1" = "webserver" ]; then 37 | echo "$(date) - waiting for ${POSTGRES_HOST}:${POSTGRES_PORT}... $i/$TRY_LOOP" 38 | if [ $i -ge $TRY_LOOP ]; then 39 | echo "$(date) - ${POSTGRES_HOST}:${POSTGRES_PORT} still not reachable, giving up" 40 | exit 1 41 | fi 42 | fi 43 | sleep 10 44 | done 45 | fi 46 | 47 | # Update configuration depending on the type of Executor 48 | if [ "$EXECUTOR" = "Celery" ] 49 | then 50 | # Wait for Redis 51 | if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] || [ "$1" = "flower" ] ; then 52 | j=0 53 | while ! nc -z $REDIS_HOST $REDIS_PORT >/dev/null 2>&1 < /dev/null; do 54 | j=$((j+1)) 55 | if [ $j -ge $TRY_LOOP ]; then 56 | echo "$(date) - $REDIS_HOST still not reachable, giving up" 57 | exit 1 58 | fi 59 | echo "$(date) - waiting for Redis...
$j/$TRY_LOOP" 60 | sleep 5 61 | done 62 | fi 63 | sed -i "s#celery_result_backend = db+postgresql://airflow:airflow@postgres/airflow#celery_result_backend = db+postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg 64 | sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = postgresql+psycopg2://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg 65 | sed -i "s#broker_url = redis://redis:6379/1#broker_url = redis://$REDIS_HOST:$REDIS_PORT/1#" "$AIRFLOW_HOME"/airflow.cfg 66 | if [ "$1" = "webserver" ]; then 67 | echo "Initialize database..." 68 | $CMD initdb 69 | exec $CMD webserver 70 | else 71 | sleep 10 72 | exec $CMD "$@" 73 | fi 74 | elif [ "$EXECUTOR" = "Local" ] 75 | then 76 | sed -i "s/executor = CeleryExecutor/executor = LocalExecutor/" "$AIRFLOW_HOME"/airflow.cfg 77 | sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = postgresql+psycopg2://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg 78 | sed -i "s#broker_url = redis://redis:6379/1#broker_url = redis://$REDIS_HOST:$REDIS_PORT/1#" "$AIRFLOW_HOME"/airflow.cfg 79 | echo "Initialize database..." 80 | $CMD initdb 81 | exec $CMD webserver & 82 | exec $CMD scheduler 83 | # By default we use SequentialExecutor 84 | else 85 | if [ "$1" = "version" ]; then 86 | exec $CMD version 87 | exit 88 | fi 89 | sed -i "s/executor = CeleryExecutor/executor = SequentialExecutor/" "$AIRFLOW_HOME"/airflow.cfg 90 | sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = sqlite:////usr/local/airflow/airflow.db#" "$AIRFLOW_HOME"/airflow.cfg 91 | echo "Initialize database..." 92 | $CMD initdb 93 | exec $CMD webserver 94 | fi --------------------------------------------------------------------------------
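As a usage note, below is a minimal sketch of how the `DataQualityOperator` from `plugins/operators/data_quality.py` might be instantiated inside `dags/movies_dwh_dag.py`. The DAG settings, the `redshift` connection id, and the import path are illustrative assumptions only (Airflow 1.10's legacy plugin mechanism exposes plugin operators under `airflow.operators.<plugin_name>`); the operator's parameters and the `movies.*` table names come from the operator code above.

```python
# Illustrative sketch only -- not the project's actual DAG wiring.
# Assumes an Airflow connection named "redshift" has been configured for the
# warehouse; the connection id and DAG arguments are placeholders.
from datetime import datetime

from airflow import DAG
# MoviesPlugin registers the operator, so under Airflow 1.10 it can be
# imported via the legacy plugin namespace airflow.operators.movies_plugin.
from airflow.operators.movies_plugin import DataQualityOperator

with DAG(dag_id="movies_dwh_dag_sketch",
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:

    run_quality_checks = DataQualityOperator(
        task_id="run_data_quality_checks",
        redshift_conn_id="redshift",  # placeholder connection id
        table_names=["movies.movies", "movies.genre",
                     "movies.date", "movies.cpi"],  # tables checked in data_quality.py
    )
```

In the real pipeline this task would be chained after the upsert tasks, so the row-count and null-id checks validate freshly loaded data.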