├── .gitignore ├── Dockerfile ├── README.md ├── config └── airflow.cfg ├── dags ├── bash_scripts │ └── load_staging_table.sh ├── movies_dwh_dag.py ├── python_scripts │ ├── load_staging_cpi.py │ ├── load_staging_date.py │ ├── load_staging_genre.py │ ├── load_staging_movies.py │ └── load_staging_ratings.py └── sql_scripts │ ├── create_tables.sql │ ├── upsert_cpi.sql │ ├── upsert_date.sql │ ├── upsert_genre.sql │ ├── upsert_movies.sql │ └── upsert_ratings.sql ├── docker-compose-LocalExecutor.yml ├── documentation ├── Data Dictionary.pdf └── README_images │ ├── architecture.PNG │ ├── dag.PNG │ ├── data_model.png │ └── logo.PNG ├── plugins ├── __init__.py └── operators │ ├── __init__.py │ └── data_quality.py └── script └── entrypoint.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gettyimages/spark:latest 2 | LABEL Alan 3 | 4 | # Never prompts the user for choices on installation/configuration of packages 5 | ENV DEBIAN_FRONTEND noninteractive 6 | ENV TERM linux 7 | 8 | # Airflow 9 | ARG AIRFLOW_VERSION=1.10.9 10 | ARG AIRFLOW_USER_HOME=/usr/local/airflow 11 | ARG AIRFLOW_DEPS="" 12 | ARG PYTHON_DEPS="" 13 | ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} 14 | 15 | # Define en_US. 16 | ENV LANGUAGE en_US.UTF-8 17 | ENV LANG en_US.UTF-8 18 | ENV LC_ALL en_US.UTF-8 19 | ENV LC_CTYPE en_US.UTF-8 20 | ENV LC_MESSAGES en_US.UTF-8 21 | ENV LC_ALL en_US.UTF-8 22 | 23 | RUN set -ex \ 24 | && buildDeps=' \ 25 | freetds-dev \ 26 | python-dev \ 27 | python3-dev \ 28 | libkrb5-dev \ 29 | libsasl2-dev \ 30 | libssl-dev \ 31 | libffi-dev \ 32 | libpq-dev \ 33 | git \ 34 | libxslt-dev \ 35 | build-essential \ 36 | libblas-dev \ 37 | liblapack-dev \ 38 | libpq-dev \ 39 | git \ 40 | ' \ 41 | && apt-get update -yqq \ 42 | && apt-get upgrade -yqq \ 43 | && apt-get install -yqq --no-install-recommends \ 44 | $buildDeps \ 45 | freetds-bin \ 46 | build-essential \ 47 | default-libmysqlclient-dev \ 48 | apt-utils \ 49 | curl \ 50 | rsync \ 51 | netcat \ 52 | locales \ 53 | && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ 54 | && locale-gen \ 55 | && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ 56 | && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \ 57 | && pip install -U pip setuptools wheel \ 58 | && pip install pytz \ 59 | && pip install pyOpenSSL \ 60 | && pip install ndg-httpsclient \ 61 | && pip install pyasn1 \ 62 | && pip install SQLAlchemy==1.3.15 \ 63 | && pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \ 64 | && pip install 'redis==3.2' \ 65 | && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \ 66 | && apt-get purge --auto-remove -yqq $buildDeps \ 67 | && apt-get autoremove -yqq --purge \ 68 | && apt-get clean \ 69 | && rm -rf \ 70 | /var/lib/apt/lists/* \ 71 | /tmp/* \ 72 | /var/tmp/* \ 73 | /usr/share/man \ 74 | /usr/share/doc \ 75 | /usr/share/doc-base 76 | 77 | COPY script/entrypoint.sh /entrypoint.sh 78 | 79 | COPY config/airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg 80 | 81 | # Add jar file for Redshift JDBC driver 82 | ADD https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.43.1067/RedshiftJDBC42-no-awssdk-1.2.43.1067.jar $SPARK_HOME/jars 83 | 84 | RUN chown -R airflow: ${AIRFLOW_USER_HOME} 85 | RUN chmod -R o+rx 
$SPARK_HOME/jars 86 | 87 | RUN echo "export HADOOP_OPTIONAL_TOOLS=hadoop-aws" >> $HADOOP_CONF_DIR/hadoop-env.sh 88 | 89 | EXPOSE 8080 5555 8793 90 | 91 | USER airflow 92 | WORKDIR ${AIRFLOW_HOME} 93 | ENTRYPOINT ["/entrypoint.sh"] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Movalytics 2 | --- 3 | ![Movalytics](documentation/README_images/logo.PNG) 4 | 5 | ## Project Description 6 | --- 7 | * This project is a case study for a start-up that recommends movies to users as a service and investigates the factors contributing to the success of movies. 8 | * The aim of this project is to perform Extract, Transform, Load (ETL) on movies data, to answer questions the business may have about its users, such as: 9 | * What is the highest rated movie of all time? 10 | * Which genres of movies are the most popular with users? 11 | * Trends in box office earnings - does releasing a movie in a certain quarter/month of the year lead to higher box office earnings? 12 | * Which genres are the highest earning of all time, normalized against a consumer price index? 13 | 14 | * The movies data and metadata come from MovieLens, extracted from a Kaggle dataset: https://www.kaggle.com/rounakbanik/the-movies-dataset. The data contains 26 million ratings from over 270,000 users on a collection of over 45,000 movies. 15 | * In addition, the Consumer Price Index of Admission to Movies, Theaters, and Concerts in U.S. City Average is extracted from: https://fred.stlouisfed.org/series/CUSR0000SS62031. This helps us normalize box office earnings against inflation over the years. 16 | 17 | ## Architecture 18 | --- 19 | The technical architecture for this project is as shown below: 20 | 21 | ![Architecture](documentation/README_images/architecture.PNG) 22 | 23 | 1. Data extraction is done using the Kaggle API and a GET request to St Louis Fred's CPI dataset. 24 | Set up an EC2 instance with Python and pip installed. Then, run `pip install kaggle`. To download the MovieLens dataset, run 25 | ```bash 26 | kaggle datasets download -d "rounakbanik/the-movies-dataset" 27 | ``` 28 | For St Louis Fred's Consumer Price Index dataset, run (the URL is quoted so the shell does not treat the `&` characters as command separators) 29 | ```bash 30 | wget -O consumer_price_index.csv "https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=CUSR0000SS62031&scale=left&cosd=1999-01-01&coed=2020-04-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-05-31&revision_date=2020-05-31&nd=1999-01-01" 31 | ``` 32 | 33 | 2. Next, copy the downloaded files from the EC2 instance to S3. Make sure the instance has the aws-cli installed. Run `aws configure` and then `aws s3 cp {FILE} s3://{S3_BUCKET}/{S3_FOLDER}/` to transfer the files to S3. Note that if this becomes a daily job, we can write a script containing these commands and add a task that runs it to our Airflow data pipeline (see the Python sketch below). 34 | 35 | 3. Run the ETL pipeline, scheduled using Airflow. Data processing is done using Spark, and data is eventually ingested into Redshift.
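As a reference for automating steps 1-2, a minimal Python sketch of the download-and-copy-to-S3 step is shown below; it could later be wrapped in an Airflow task. This is an illustration only: it assumes `requests` and `boto3` are installed, AWS credentials are already configured (e.g. via `aws configure`), the MovieLens csv files have already been unzipped locally, and the bucket/folder names and the shortened FRED URL are placeholder assumptions rather than values taken from this repo (the full FRED URL above can be substituted).

```python
import boto3
import requests

# Placeholders - substitute your own bucket and folder (hypothetical values)
S3_BUCKET = "{S3_BUCKET}"
S3_FOLDER = "{S3_FOLDER}"

# Shortened form of the FRED download URL above (assumption; the full URL can be used instead)
CPI_URL = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=CUSR0000SS62031"

# Assumes these have already been downloaded and unzipped on the instance
MOVIELENS_FILES = ["credits.csv", "links.csv", "movies_metadata.csv", "ratings.csv"]


def download_cpi(dest="consumer_price_index.csv"):
    """Download the CPI csv from FRED with a GET request, as in step 1."""
    resp = requests.get(CPI_URL, timeout=60)
    resp.raise_for_status()
    with open(dest, "wb") as f:
        f.write(resp.content)
    return dest


def upload_to_s3(filenames):
    """Copy local files to S3, mirroring the `aws s3 cp` command in step 2."""
    s3 = boto3.client("s3")
    for name in filenames:
        s3.upload_file(name, S3_BUCKET, "{}/{}".format(S3_FOLDER, name))


if __name__ == "__main__":
    cpi_file = download_cpi()
    upload_to_s3(MOVIELENS_FILES + [cpi_file])
```

A function like this could be scheduled from the DAG (for example with a PythonOperator) if the extraction ever needs to run on a schedule instead of manually.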
36 | 37 | ## Choice of Technologies 38 | --- 39 | * For data processing (the data transformation step), Spark is chosen because of its parallel processing capabilities. Should the amount of data grow to 100x, more worker nodes can be added to the Spark cluster to scale out. 40 | 41 | * For orchestrating the steps in the pipeline, Airflow is chosen as it allows the building of data pipelines that are straightforward and modular. Airflow allows tasks to be defined in a Directed Acyclic Graph (DAG) with dependencies between tasks, which allows the running of tasks to be optimized. It also enables the pipeline to be run on a schedule (e.g. daily) should the need arise. Finally, it has an intuitive UI that allows users to check the steps in the data pipeline should any part of the pipeline fail. 42 | 43 | * Redshift is chosen as the cloud Data Warehouse as it is highly scalable. Should our data grow in size, we can provision more nodes, or scale up, to handle the larger volume of data. 44 | 45 | * Docker is used to encapsulate the package dependencies the code may have, allowing the code to run on any machine. 46 | 47 | ## Data Model 48 | --- 49 | * The data model for this project is as shown below: 50 | 51 | ![Data Model](documentation/README_images/data_model.png) 52 | 53 | The approach taken is to normalize the data. This leads to more efficient UPDATES and DELETES as and when they are required. 54 | 55 | ## ETL Pipeline 56 | --- 57 | The ETL process runs through an Airflow DAG: 58 | 59 | ![DAG](documentation/README_images/dag.PNG) 60 | 61 | The process is as follows: 62 | 1. We create the tables and staging tables (if they do not exist) 63 | 2. We perform an update and insert (upsert), based on new data coming in 64 | 3. We run a data quality check (check that tables have at least one row and that there are no null ids) 65 | 66 | ## Potential Improvements 67 | --- 68 | * The assumption I have made is that the data volume will not increase substantially and that the pipeline is only required to run once. 69 | 70 | 1. What if the data increased by 100x? 71 | * We can increase the number of worker nodes on the Spark cluster to improve compute performance. Furthermore, Airflow schedules can be utilized to pull only a subset of the data at a time, to reduce the volume of data handled at any one time. 72 | 73 | 2. What if the data pipeline needs to be run by 7am daily? 74 | * We can turn on the EC2 machine and run the pipeline before 7am daily. Currently, the Airflow pipeline is scheduled to ingest only once. We can set it to a daily schedule, to ingest new data coming in daily (a sketch of the scheduling changes is shown after this list). We should add a new node to our Airflow DAG to download the data via the API/GET request and transfer it to S3. In addition, to handle heavy workloads when backdating, the CeleryExecutor should be used to run processes in a distributed fashion, ensuring there is no single point of failure. Furthermore, we can make use of Airflow's SLA feature to send alerts should the pipeline not have succeeded before a certain time (e.g. 6:00am). 75 | 76 | 3. What if the database needs to be accessed by 100+ users? 77 | * Redshift should not have an issue handling many users, but we should be careful to scale up/scale out with more nodes whenever necessary. To keep queries efficient, we can seek to understand the common queries users have, so we can tweak our data model. Aggregated data tables can be provided beforehand to reduce query times. We can also assign sort keys according to users' querying needs for each table.
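As referenced in improvement 2 above, below is a sketch of how the scheduling-related DAG arguments could change if the pipeline had to run daily. It is an illustration only, using the Airflow 1.10-style API already used in this repo; the start date and SLA values are assumptions, and SLA misses are only emailed if email alerting is configured.

```python
from datetime import datetime, timedelta

from airflow import DAG

# Only the scheduling-related pieces are shown; the task definitions would stay
# as they are in dags/movies_dwh_dag.py.
default_args = {
    'owner': 'Alan',
    'depends_on_past': False,
    'start_date': datetime(2020, 6, 1),  # fixed start date (illustrative) instead of datetime.today()
    'retries': 5,
    'retry_delay': timedelta(minutes=1),
    'sla': timedelta(hours=6),           # flag tasks not done within 6h of the scheduled run (~6:00am)
}

dag = DAG(
    dag_id='movie_dwh_dag',
    default_args=default_args,
    description='Load and transform data in Redshift Data Warehouse with Airflow',
    schedule_interval='0 0 * * *',       # run daily at midnight instead of @once
    catchup=False,                       # avoid backfilling old runs unless explicitly needed
)
```

Switching to the CeleryExecutor, as discussed above, is a deployment change (executor setting and worker containers) rather than a DAG change, so it is not shown here.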
78 | 79 | ## Development 80 | --- 81 | 82 | Setting up 83 | * Build or pull the Docker image 84 | ```bash 85 | # Build the docker image 86 | docker build -t {IMAGE_NAME} . 87 | ``` 88 | * Alternatively, pull the docker image I have pushed to Dockerhub: 89 | ```bash 90 | # Pull the docker image 91 | docker pull alanchn31/alanchn31-capstone-udacity-de-nd:1 92 | ``` 93 | 94 | * Replace the webserver image in docker-compose-LocalExecutor.yml with the name of the Docker image you have built/pulled. 95 | ```yml 96 | webserver: 97 | image: alanchn31/alanchn31-capstone-udacity-de-nd:1 98 | ``` 99 | 100 | * Run `docker-compose -f docker-compose-LocalExecutor.yml up -d`. Your Airflow server should be initiated and will be up and running. Visit `http://{your ec2 ip address}:8080` to view the Airflow UI. 101 | 102 | Add the necessary connections and variables in the Airflow UI. 103 | There are 4 variables to be defined: 104 | 1. `movie_s3_config`. It is defined in JSON format (using the keys read in `dags/movies_dwh_dag.py`) as follows: 105 | ``` 106 | { 107 | "awsKey": "{AWS_KEY}", 108 | "awsSecretKey": "{AWS_SECRET_KEY}", 109 | "s3Bucket": "{AWS_S3_BUCKET}", 110 | "s3Key": "{AWS_S3_KEY}" 111 | } 112 | ``` 113 | * The AWS_S3_BUCKET is the S3 bucket with S3_KEY (folder) containing the csv files: 114 | 1. credits.csv (from Kaggle Movielens dataset) 115 | 2. links.csv (from Kaggle Movielens dataset) 116 | 3. movies_metadata.csv (from Kaggle Movielens dataset) 117 | 4. ratings.csv (from Kaggle Movielens dataset) 118 | 5. consumer_price_index.csv (from Fred St Louis dataset) 119 | 120 | 2. `redshift_db_user` (user name of the user with access to the Redshift database) 121 | 3. `redshift_db_pass` (password of the user with access to the Redshift database) 122 | 4. `redshift_conn_string` (Redshift JDBC connection string used by Spark to write dataframes to Redshift) 123 | 124 | In addition, define the Hook to connect to Redshift: 125 | 126 | Conn Id: `redshift`. 127 | Conn Type: `Postgres`. 128 | Host: Enter the endpoint of your Redshift cluster, excluding the port at the end. 129 | Schema: This is the Redshift database you want to connect to. 130 | Login: Enter the Redshift user. 131 | Password: Enter the Redshift password. 132 | Port: Enter `5439`. 133 | 134 | After configuring, visit the Airflow UI and enable the DAG to start the data pipeline. 135 | 136 | ## Acknowledgements 137 | --- 138 | Many thanks to: 139 | * Udacity - for providing the project template and points of consideration :clap: 140 | * Rounak Banik - for providing me with the dataset I extracted from Kaggle and used :clap: 141 | * St Louis Fred - for providing me with the consumer price index data :clap: -------------------------------------------------------------------------------- /config/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The home folder for airflow, default is ~/airflow 3 | airflow_home = /usr/local/airflow 4 | 5 | # The folder where your airflow pipelines live, most likely a 6 | # subfolder in a code repository 7 | # This path must be absolute 8 | dags_folder = /usr/local/airflow/dags 9 | 10 | # The folder where airflow should store its log files 11 | # This path must be absolute 12 | base_log_folder = /usr/local/airflow/logs 13 | 14 | # Airflow can store logs remotely in AWS S3 or Google Cloud Storage. Users 15 | # must supply a remote location URL (starting with either 's3://...' or 16 | # 'gs://...') and an Airflow connection id that provides access to the storage 17 | # location.
18 | remote_base_log_folder = 19 | remote_log_conn_id = 20 | # Use server-side encryption for logs stored in S3 21 | encrypt_s3_logs = False 22 | # DEPRECATED option for remote log storage, use remote_base_log_folder instead! 23 | s3_log_folder = 24 | 25 | # The executor class that airflow should use. Choices include 26 | # SequentialExecutor, LocalExecutor, CeleryExecutor 27 | executor = CeleryExecutor 28 | 29 | # The SqlAlchemy connection string to the metadata database. 30 | # SqlAlchemy supports many different database engine, more information 31 | # their website 32 | sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow 33 | 34 | # The SqlAlchemy pool size is the maximum number of database connections 35 | # in the pool. 36 | sql_alchemy_pool_size = 5 37 | 38 | # The SqlAlchemy pool recycle is the number of seconds a connection 39 | # can be idle in the pool before it is invalidated. This config does 40 | # not apply to sqlite. 41 | sql_alchemy_pool_recycle = 3600 42 | 43 | # The amount of parallelism as a setting to the executor. This defines 44 | # the max number of task instances that should run simultaneously 45 | # on this airflow installation 46 | parallelism = 32 47 | 48 | # The number of task instances allowed to run concurrently by the scheduler 49 | dag_concurrency = 16 50 | 51 | # Are DAGs paused by default at creation 52 | dags_are_paused_at_creation = True 53 | 54 | # When not using pools, tasks are run in the "default pool", 55 | # whose size is guided by this config element 56 | non_pooled_task_slot_count = 128 57 | 58 | # The maximum number of active DAG runs per DAG 59 | max_active_runs_per_dag = 16 60 | 61 | # Whether to load the examples that ship with Airflow. It's good to 62 | # get started, but you probably want to set this to False in a production 63 | # environment 64 | load_examples = True 65 | 66 | # Where your Airflow plugins are stored 67 | plugins_folder = /usr/local/airflow/plugins 68 | 69 | # Secret key to save connection passwords in the db 70 | fernet_key = $FERNET_KEY 71 | 72 | # Whether to disable pickling dags 73 | donot_pickle = False 74 | 75 | # How long before timing out a python file import while filling the DagBag 76 | dagbag_import_timeout = 30 77 | 78 | # The class to use for running task instances in a subprocess 79 | task_runner = BashTaskRunner 80 | 81 | # If set, tasks without a `run_as_user` argument will be run with this user 82 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 83 | default_impersonation = 84 | 85 | # What security module to use (for example kerberos): 86 | security = 87 | 88 | # Turn unit test mode on (overwrites many configuration options with test 89 | # values at runtime) 90 | unit_test_mode = False 91 | 92 | [cli] 93 | # In what way should the cli access the API. 
The LocalClient will use the 94 | # database directly, while the json_client will use the api running on the 95 | # webserver 96 | api_client = airflow.api.client.local_client 97 | endpoint_url = http://localhost:8080 98 | 99 | [api] 100 | # How to authenticate users of the API 101 | auth_backend = airflow.api.auth.backend.default 102 | 103 | [operators] 104 | # The default owner assigned to each new operator, unless 105 | # provided explicitly or passed via `default_args` 106 | default_owner = Airflow 107 | default_cpus = 1 108 | default_ram = 512 109 | default_disk = 512 110 | default_gpus = 0 111 | 112 | [webserver] 113 | # The base url of your website as airflow cannot guess what domain or 114 | # cname you are using. This is used in automated emails that 115 | # airflow sends to point links to the right web server 116 | base_url = http://localhost:8080 117 | 118 | # The ip specified when starting the web server 119 | web_server_host = 0.0.0.0 120 | 121 | # The port on which to run the web server 122 | web_server_port = 8080 123 | 124 | # Paths to the SSL certificate and key for the web server. When both are 125 | # provided SSL will be enabled. This does not change the web server port. 126 | web_server_ssl_cert = 127 | web_server_ssl_key = 128 | 129 | # Number of seconds the gunicorn webserver waits before timing out on a worker 130 | web_server_worker_timeout = 120 131 | 132 | # Number of workers to refresh at a time. When set to 0, worker refresh is 133 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 134 | # bringing up new ones and killing old ones. 135 | worker_refresh_batch_size = 1 136 | 137 | # Number of seconds to wait before refreshing a batch of workers. 138 | worker_refresh_interval = 30 139 | 140 | # Secret key used to run your flask app 141 | secret_key = temporary_key 142 | 143 | # Number of workers to run the Gunicorn web server 144 | workers = 4 145 | 146 | # The worker class gunicorn should use. Choices include 147 | # sync (default), eventlet, gevent 148 | worker_class = sync 149 | 150 | # Log files for the gunicorn webserver. '-' means log to stderr. 151 | access_logfile = - 152 | error_logfile = - 153 | 154 | # Expose the configuration file in the web server 155 | expose_config = True 156 | 157 | # Set to true to turn on authentication: 158 | # http://pythonhosted.org/airflow/security.html#web-authentication 159 | authenticate = False 160 | 161 | # Filter the list of dags by owner name (requires authentication to be enabled) 162 | filter_by_owner = False 163 | 164 | # Filtering mode. Choices include user (default) and ldapgroup. 165 | # Ldap group filtering requires using the ldap backend 166 | # 167 | # Note that the ldap server needs the "memberOf" overlay to be set up 168 | # in order to user the ldapgroup mode. 169 | owner_mode = user 170 | 171 | # Default DAG orientation. Valid values are: 172 | # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top) 173 | dag_orientation = LR 174 | 175 | # Puts the webserver in demonstration mode; blurs the names of Operators for 176 | # privacy. 177 | demo_mode = False 178 | 179 | # The amount of time (in secs) webserver will wait for initial handshake 180 | # while fetching logs from other worker machine 181 | log_fetch_timeout_sec = 5 182 | 183 | # By default, the webserver shows paused DAGs. 
Flip this to hide paused 184 | # DAGs by default 185 | hide_paused_dags_by_default = False 186 | 187 | [email] 188 | email_backend = airflow.utils.email.send_email_smtp 189 | 190 | [smtp] 191 | # If you want airflow to send emails on retries, failure, and you want to use 192 | # the airflow.utils.email.send_email_smtp function, you have to configure an 193 | # smtp server here 194 | smtp_host = localhost 195 | smtp_starttls = True 196 | smtp_ssl = False 197 | # Uncomment and set the user/pass settings if you want to use SMTP AUTH 198 | # smtp_user = airflow 199 | # smtp_password = airflow 200 | smtp_port = 25 201 | smtp_mail_from = airflow@airflow.com 202 | 203 | [celery] 204 | # This section only applies if you are using the CeleryExecutor in 205 | # [core] section above 206 | 207 | # The app name that will be used by celery 208 | celery_app_name = airflow.executors.celery_executor 209 | 210 | # The concurrency that will be used when starting workers with the 211 | # "airflow worker" command. This defines the number of task instances that 212 | # a worker will take, so size up your workers based on the resources on 213 | # your worker box and the nature of your tasks 214 | celeryd_concurrency = 16 215 | 216 | # When you start an airflow worker, airflow starts a tiny web server 217 | # subprocess to serve the workers local log files to the airflow main 218 | # web server, who then builds pages and sends them to users. This defines 219 | # the port on which the logs are served. It needs to be unused, and open 220 | # visible from the main web server to connect into the workers. 221 | worker_log_server_port = 8793 222 | 223 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 224 | # a sqlalchemy database. Refer to the Celery documentation for more 225 | # information. 226 | broker_url = redis://redis:6379/1 227 | 228 | # Another key Celery setting 229 | celery_result_backend = db+postgresql://airflow:airflow@postgres/airflow 230 | 231 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 232 | # it `airflow flower`. This defines the IP that Celery Flower runs on 233 | flower_host = 0.0.0.0 234 | 235 | # This defines the port that Celery Flower runs on 236 | flower_port = 5555 237 | 238 | # Default queue that tasks get assigned to and that worker listen on. 239 | default_queue = default 240 | 241 | [scheduler] 242 | # Task instances listen for external kill signal (when you clear tasks 243 | # from the CLI or the UI), this defines the frequency at which they should 244 | # listen (in seconds). 245 | job_heartbeat_sec = 5 246 | 247 | # The scheduler constantly tries to trigger new tasks (look at the 248 | # scheduler section in the docs for more information). This defines 249 | # how often the scheduler should run (in seconds). 250 | scheduler_heartbeat_sec = 5 251 | 252 | # after how much time should the scheduler terminate in seconds 253 | # -1 indicates to run continuously (see also num_runs) 254 | run_duration = -1 255 | 256 | # after how much time a new DAGs should be picked up from the filesystem 257 | min_file_process_interval = 0 258 | 259 | dag_dir_list_interval = 300 260 | 261 | # How often should stats be printed to the logs 262 | print_stats_interval = 30 263 | 264 | child_process_log_directory = /usr/local/airflow/logs/scheduler 265 | 266 | # Local task jobs periodically heartbeat to the DB. 
If the job has 267 | # not heartbeat in this many seconds, the scheduler will mark the 268 | # associated task instance as failed and will re-schedule the task. 269 | scheduler_zombie_task_threshold = 300 270 | 271 | # Turn off scheduler catchup by setting this to False. 272 | # Default behavior is unchanged and 273 | # Command Line Backfills still work, but the scheduler 274 | # will not do scheduler catchup if this is False, 275 | # however it can be set on a per DAG basis in the 276 | # DAG definition (catchup) 277 | catchup_by_default = True 278 | 279 | # Statsd (https://github.com/etsy/statsd) integration settings 280 | statsd_on = False 281 | statsd_host = localhost 282 | statsd_port = 8125 283 | statsd_prefix = airflow 284 | 285 | # The scheduler can run multiple threads in parallel to schedule dags. 286 | # This defines how many threads will run. However airflow will never 287 | # use more threads than the amount of cpu cores available. 288 | max_threads = 2 289 | 290 | authenticate = False 291 | 292 | [mesos] 293 | # Mesos master address which MesosExecutor will connect to. 294 | master = localhost:5050 295 | 296 | # The framework name which Airflow scheduler will register itself as on mesos 297 | framework_name = Airflow 298 | 299 | # Number of cpu cores required for running one task instance using 300 | # 'airflow run --local -p ' 301 | # command on a mesos slave 302 | task_cpu = 1 303 | 304 | # Memory in MB required for running one task instance using 305 | # 'airflow run --local -p ' 306 | # command on a mesos slave 307 | task_memory = 256 308 | 309 | # Enable framework checkpointing for mesos 310 | # See http://mesos.apache.org/documentation/latest/slave-recovery/ 311 | checkpoint = False 312 | 313 | # Failover timeout in milliseconds. 314 | # When checkpointing is enabled and this option is set, Mesos waits 315 | # until the configured timeout for 316 | # the MesosExecutor framework to re-register after a failover. Mesos 317 | # shuts down running tasks if the 318 | # MesosExecutor framework fails to re-register within this timeframe. 
319 | # failover_timeout = 604800 320 | 321 | # Enable framework authentication for mesos 322 | # See http://mesos.apache.org/documentation/latest/configuration/ 323 | authenticate = False 324 | 325 | # Mesos credentials, if authentication is enabled 326 | # default_principal = admin 327 | # default_secret = admin 328 | 329 | [kerberos] 330 | ccache = /tmp/airflow_krb5_ccache 331 | # gets augmented with fqdn 332 | principal = airflow 333 | reinit_frequency = 3600 334 | kinit_path = kinit 335 | keytab = airflow.keytab 336 | 337 | [github_enterprise] 338 | api_rev = v3 339 | 340 | [admin] 341 | # UI to hide sensitive variable fields when set to True 342 | hide_sensitive_variable_fields = True -------------------------------------------------------------------------------- /dags/bash_scripts/load_staging_table.sh: -------------------------------------------------------------------------------- 1 | spark-submit --driver-class-path $SPARK_HOME/jars/RedshiftJDBC42-no-awssdk-1.2.43.1067.jar \ 2 | --jars $SPARK_HOME/jars/RedshiftJDBC42-no-awssdk-1.2.43.1067.jar \ 3 | $AIRFLOW_HOME/dags/python_scripts/{{params.python_script}} {{ params.s3_bucket }} {{ params.s3_key }} \ 4 | {{ params.aws_key }} {{ params.aws_secret_key }} {{ params.redshift_conn_string }} \ 5 | {{ params.db_user }} {{params.db_pass}} --conf "fs.s3a.multipart.size=104857600" -------------------------------------------------------------------------------- /dags/movies_dwh_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.bash_operator import BashOperator 3 | from airflow.operators.dummy_operator import DummyOperator 4 | from airflow.operators.postgres_operator import PostgresOperator 5 | from airflow.operators.movies_plugin import DataQualityOperator 6 | from airflow.models import Variable 7 | from datetime import datetime, timedelta 8 | import os 9 | 10 | # Define configured variables to connect to AWS S3 and Redshift 11 | movie_s3_config = Variable.get("movie_s3_config", deserialize_json=True) 12 | 13 | # Parameters that are reused when submitting spark job to load staging tables 14 | params = {'aws_key': movie_s3_config["awsKey"], 15 | 'aws_secret_key': movie_s3_config["awsSecretKey"], 16 | 'db_user': Variable.get("redshift_db_user"), 17 | 'db_pass': Variable.get("redshift_db_pass"), 18 | 'redshift_conn_string': Variable.get("redshift_conn_string"), 19 | 's3_bucket': movie_s3_config["s3Bucket"], 20 | 's3_key': movie_s3_config["s3Key"] 21 | } 22 | 23 | # Default settings for DAG 24 | default_args = { 25 | 'owner': 'Alan', 26 | 'depends_on_past': False, 27 | 'start_date': datetime.today(), 28 | 'retries': 5, 29 | 'retry_delay': timedelta(minutes=1), 30 | } 31 | 32 | ## Define the DAG object 33 | with DAG(dag_id='movie_dwh_dag', default_args=default_args, 34 | description='Load and transform data in Redshift \ 35 | Data Warehouse with Airflow', 36 | schedule_interval='@once') as dag: 37 | 38 | start_operator = DummyOperator(task_id='begin-execution', dag=dag) 39 | 40 | # Create tables in movies schema 41 | create_tables = PostgresOperator(task_id='create-tables', postgres_conn_id="redshift", 42 | sql="sql_scripts/create_tables.sql", dag=dag) 43 | 44 | # Load stage_ratings data table 45 | params['python_script'] = 'load_staging_ratings.py' 46 | load_staging_ratings = BashOperator(task_id='load-staging-ratings', 47 | bash_command= './bash_scripts/load_staging_table.sh', 48 | params=params, 49 | dag=dag) 50 | 51 | # Load stage_movies data table 
52 | params['python_script'] = 'load_staging_movies.py' 53 | load_staging_movies = BashOperator(task_id='load-staging-movies', 54 | bash_command= './bash_scripts/load_staging_table.sh', 55 | params=params, 56 | dag=dag) 57 | 58 | # Load stage_cpi data table 59 | params['python_script'] = 'load_staging_cpi.py' 60 | load_staging_cpi = BashOperator(task_id='load-staging-cpi', 61 | bash_command= './bash_scripts/load_staging_table.sh', 62 | params=params, 63 | dag=dag) 64 | 65 | # Load stage_genre data table 66 | params['python_script'] = 'load_staging_genre.py' 67 | load_staging_genre = BashOperator(task_id='load-staging-genre', 68 | bash_command= './bash_scripts/load_staging_table.sh', 69 | params=params, 70 | dag=dag) 71 | 72 | # Load stage_date data table 73 | params['python_script'] = 'load_staging_date.py' 74 | load_staging_date = BashOperator(task_id='load-staging-date', 75 | bash_command= './bash_scripts/load_staging_table.sh', 76 | params=params, 77 | dag=dag) 78 | 79 | # Run upsert on tables and delete staging tables 80 | upsert_ratings = PostgresOperator(task_id='upsert-ratings-table', postgres_conn_id="redshift", 81 | sql="sql_scripts/upsert_ratings.sql", dag=dag) 82 | 83 | upsert_movies = PostgresOperator(task_id='upsert-movies-table', postgres_conn_id="redshift", 84 | sql="sql_scripts/upsert_movies.sql", dag=dag) 85 | 86 | upsert_cpi = PostgresOperator(task_id='upsert-staging-cpi', postgres_conn_id="redshift", 87 | sql='sql_scripts/upsert_cpi.sql', dag=dag) 88 | 89 | upsert_date = PostgresOperator(task_id='upsert-staging-date', postgres_conn_id="redshift", 90 | sql='sql_scripts/upsert_date.sql', dag=dag) 91 | 92 | upsert_genre = PostgresOperator(task_id='upsert-staging-genre', postgres_conn_id="redshift", 93 | sql='sql_scripts/upsert_genre.sql', dag=dag) 94 | 95 | # Check for quality issues in ingested data 96 | tables = ["movies.movies", "movies.ratings", "movies.movie_genre", 97 | "movies.genre", "movies.date", "movies.cpi"] 98 | check_data_quality = DataQualityOperator(task_id='run_data_quality_checks', 99 | redshift_conn_id="redshift", 100 | table_names=tables, 101 | dag=dag) 102 | 103 | # Define data pipeline DAG structure 104 | start_operator >> create_tables 105 | create_tables >> [load_staging_ratings, load_staging_movies, load_staging_cpi, load_staging_date, load_staging_genre] 106 | load_staging_ratings >> upsert_ratings 107 | load_staging_movies >> upsert_movies 108 | load_staging_cpi >> upsert_cpi 109 | load_staging_date >> upsert_date 110 | load_staging_genre >> upsert_genre 111 | [upsert_cpi, upsert_ratings, upsert_movies, upsert_date, upsert_genre] >> check_data_quality -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_cpi.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, 7 | DateType as Date, FloatType as Float) 8 | from pyspark.sql.functions import col 9 | 10 | 11 | def create_spark_session(aws_key, aws_secret_key): 12 | """ 13 | Description: Creates spark session. 
14 | Returns: 15 | spark session object 16 | """ 17 | 18 | spark = SparkSession \ 19 | .builder \ 20 | .config("spark.executor.heartbeatInterval", "40s") \ 21 | .getOrCreate() 22 | 23 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 24 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 25 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 27 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 30 | return spark 31 | 32 | 33 | if __name__ == "__main__": 34 | s3_bucket = sys.argv[1] 35 | s3_key = sys.argv[2] 36 | aws_key = sys.argv[3] 37 | aws_secret_key = sys.argv[4] 38 | redshift_conn_string = sys.argv[5] 39 | db_user = sys.argv[6] 40 | db_pass = sys.argv[7] 41 | 42 | spark = create_spark_session(aws_key, aws_secret_key) 43 | 44 | cpi_schema = StructType([ 45 | Fld("DATE", Date()), 46 | Fld("CUSR0000SS62031", Float()) 47 | ]) 48 | 49 | cpi_df = spark.read.option("header", "true") \ 50 | .csv("s3a://{}/{}/consumer_price_index.csv".format(s3_bucket, s3_key), 51 | schema=cpi_schema) 52 | 53 | cpi_df = cpi_df.select(col("DATE").alias("date_cd"), 54 | col("CUSR0000SS62031").alias("consumer_price_index")) 55 | 56 | cpi_df = cpi_df.filter(cpi_df.date_cd.isNotNull()) 57 | 58 | cpi_df.write \ 59 | .format("jdbc") \ 60 | .option("url", redshift_conn_string) \ 61 | .option("dbtable", "movies.stage_cpi") \ 62 | .option("user", sys.argv[6]) \ 63 | .option("password", sys.argv[7]) \ 64 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 65 | .mode("append") \ 66 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_date.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, DateType as Date, 8 | BooleanType as Boolean, FloatType as Float, 9 | LongType as Long, StringType as String, 10 | ArrayType as Array) 11 | from pyspark.sql.functions import (col, year, month, dayofmonth, weekofyear, quarter) 12 | 13 | 14 | def create_spark_session(aws_key, aws_secret_key): 15 | """ 16 | Description: Creates spark session. 
17 | Returns: 18 | spark session object 19 | """ 20 | 21 | spark = SparkSession \ 22 | .builder \ 23 | .config("spark.executor.heartbeatInterval", "40s") \ 24 | .getOrCreate() 25 | 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 27 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 31 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 32 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 33 | return spark 34 | 35 | 36 | def format_datetime(ts): 37 | return datetime.fromtimestamp(ts/1000.0) 38 | 39 | if __name__ == "__main__": 40 | s3_bucket = sys.argv[1] 41 | s3_key = sys.argv[2] 42 | aws_key = sys.argv[3] 43 | aws_secret_key = sys.argv[4] 44 | redshift_conn_string = sys.argv[5] 45 | db_user = sys.argv[6] 46 | db_pass = sys.argv[7] 47 | 48 | spark = create_spark_session(aws_key, aws_secret_key) 49 | 50 | movies_schema = StructType([ 51 | Fld("adult", String()), 52 | Fld("belongs_to_collection", Long()), 53 | Fld("budget", Long()), 54 | Fld("genres", String()), 55 | Fld("homepage", String()), 56 | Fld("id", Int()), 57 | Fld("imdb_id", String()), 58 | Fld("original_language", String()), 59 | Fld("original_title", String()), 60 | Fld("overview", String()), 61 | Fld("popularity", Dbl()), 62 | Fld("poster_path", String()), 63 | Fld("production_company", String()), 64 | Fld("production_country", String()), 65 | Fld("release_date", Date()), 66 | Fld("revenue", Long()), 67 | Fld("runtime", Float()), 68 | Fld("spoken_languages", String()), 69 | Fld("status", String()), 70 | Fld("tagline", String()), 71 | Fld("title", String()), 72 | Fld("video", Boolean()), 73 | Fld("vote_average", Float()), 74 | Fld("vote_count", Int()) 75 | ]) 76 | 77 | 78 | movies_df = spark.read.option("header", "true") \ 79 | .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key), 80 | schema=movies_schema) 81 | 82 | movies_df = movies_df.na.drop() 83 | 84 | # extract columns to create time table 85 | date_table = movies_df.select( 86 | col('release_date'), 87 | dayofmonth("release_date").alias('day'), 88 | weekofyear("release_date").alias('week'), 89 | month("release_date").alias('month'), 90 | quarter("release_date").alias('quarter'), 91 | year("release_date").alias('year') 92 | ).dropDuplicates() 93 | 94 | date_table.write \ 95 | .format("jdbc") \ 96 | .option("url", redshift_conn_string) \ 97 | .option("dbtable", "movies.stage_date") \ 98 | .option("user", sys.argv[6]) \ 99 | .option("password", sys.argv[7]) \ 100 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 101 | .mode("append") \ 102 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_genre.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, DateType as Date, 8 | BooleanType as Boolean, FloatType as Float, 9 | LongType as Long, StringType as String, 10 | ArrayType as Array) 11 | from pyspark.sql.functions import (col, year, month, 
dayofmonth, weekofyear, quarter, 12 | explode, from_json) 13 | 14 | 15 | def create_spark_session(aws_key, aws_secret_key): 16 | """ 17 | Description: Creates spark session. 18 | Returns: 19 | spark session object 20 | """ 21 | 22 | spark = SparkSession \ 23 | .builder \ 24 | .config("spark.executor.heartbeatInterval", "40s") \ 25 | .getOrCreate() 26 | 27 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 28 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 31 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 32 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 33 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 34 | spark.conf.set("spark.sql.shuffle.partitions", 4) 35 | return spark 36 | 37 | 38 | def format_datetime(ts): 39 | return datetime.fromtimestamp(ts/1000.0) 40 | 41 | if __name__ == "__main__": 42 | s3_bucket = sys.argv[1] 43 | s3_key = sys.argv[2] 44 | aws_key = sys.argv[3] 45 | aws_secret_key = sys.argv[4] 46 | redshift_conn_string = sys.argv[5] 47 | db_user = sys.argv[6] 48 | db_pass = sys.argv[7] 49 | 50 | spark = create_spark_session(aws_key, aws_secret_key) 51 | 52 | movies_schema = StructType([ 53 | Fld("adult", String()), 54 | Fld("belongs_to_collection", Long()), 55 | Fld("budget", Long()), 56 | Fld("genres", String()), 57 | Fld("homepage", String()), 58 | Fld("id", Int()), 59 | Fld("imdb_id", String()), 60 | Fld("original_language", String()), 61 | Fld("original_title", String()), 62 | Fld("overview", String()), 63 | Fld("popularity", Dbl()), 64 | Fld("poster_path", String()), 65 | Fld("production_company", String()), 66 | Fld("production_country", String()), 67 | Fld("release_date", Date()), 68 | Fld("revenue", Long()), 69 | Fld("runtime", Float()), 70 | Fld("spoken_languages", String()), 71 | Fld("status", String()), 72 | Fld("tagline", String()), 73 | Fld("title", String()), 74 | Fld("video", Boolean()), 75 | Fld("vote_average", Float()), 76 | Fld("vote_count", Int()) 77 | ]) 78 | 79 | 80 | movies_df = spark.read.option("header", "true") \ 81 | .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key), 82 | schema=movies_schema) 83 | 84 | genre_schema = Array(StructType([Fld("id", Int()), Fld("name", String())])) 85 | 86 | movies_df = movies_df.withColumn("genres", explode(from_json("genres", genre_schema))) \ 87 | .withColumn("genre_id", col("genres.id")) \ 88 | .withColumn("genre_name", col("genres.name")) \ 89 | 90 | movie_genre = movies_df.select("id", "genre_id").distinct() 91 | movie_genre = movie_genre.select(col("id").alias("movie_id"), col("genre_id")) 92 | 93 | genre = movies_df.select("genre_id", "genre_name").distinct() 94 | genre = genre.na.drop() 95 | 96 | # Load data into staging: 97 | genre.write \ 98 | .format("jdbc") \ 99 | .option("url", redshift_conn_string) \ 100 | .option("dbtable", "movies.stage_genre") \ 101 | .option("user", sys.argv[6]) \ 102 | .option("password", sys.argv[7]) \ 103 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 104 | .mode("append") \ 105 | .save() 106 | 107 | movie_genre.write \ 108 | .format("jdbc") \ 109 | .option("url", redshift_conn_string) \ 110 | .option("dbtable", "movies.stage_movie_genre") \ 111 | .option("user", sys.argv[6]) \ 112 | .option("password", sys.argv[7]) \ 113 | .option("driver", 
"com.amazon.redshift.jdbc42.Driver") \ 114 | .mode("append") \ 115 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_movies.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, DateType as Date, 8 | BooleanType as Boolean, FloatType as Float, 9 | LongType as Long, StringType as String, 10 | ArrayType as Array) 11 | from pyspark.sql.functions import col 12 | 13 | 14 | def create_spark_session(aws_key, aws_secret_key): 15 | """ 16 | Description: Creates spark session. 17 | Returns: 18 | spark session object 19 | """ 20 | 21 | spark = SparkSession \ 22 | .builder \ 23 | .config("spark.executor.heartbeatInterval", "40s") \ 24 | .getOrCreate() 25 | 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 27 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 31 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 32 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 33 | return spark 34 | 35 | 36 | def format_datetime(ts): 37 | return datetime.fromtimestamp(ts/1000.0) 38 | 39 | if __name__ == "__main__": 40 | s3_bucket = sys.argv[1] 41 | s3_key = sys.argv[2] 42 | aws_key = sys.argv[3] 43 | aws_secret_key = sys.argv[4] 44 | redshift_conn_string = sys.argv[5] 45 | db_user = sys.argv[6] 46 | db_pass = sys.argv[7] 47 | 48 | spark = create_spark_session(aws_key, aws_secret_key) 49 | 50 | movies_schema = StructType([ 51 | Fld("adult", String()), 52 | Fld("belongs_to_collection", Long()), 53 | Fld("budget", Long()), 54 | Fld("genres", String()), 55 | Fld("homepage", String()), 56 | Fld("id", Int()), 57 | Fld("imdb_id", String()), 58 | Fld("original_language", String()), 59 | Fld("original_title", String()), 60 | Fld("overview", String()), 61 | Fld("popularity", Dbl()), 62 | Fld("poster_path", String()), 63 | Fld("production_companies", String()), 64 | Fld("production_countries", String()), 65 | Fld("release_date", Date()), 66 | Fld("revenue", Long()), 67 | Fld("runtime", Float()), 68 | Fld("spoken_languages", String()), 69 | Fld("status", String()), 70 | Fld("tagline", String()), 71 | Fld("title", String()), 72 | Fld("video", Boolean()), 73 | Fld("vote_average", Float()), 74 | Fld("vote_count", Int()) 75 | ]) 76 | 77 | 78 | movies_df = spark.read.option("header", "true") \ 79 | .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key), 80 | schema=movies_schema) 81 | 82 | 83 | movies_df = movies_df.select( 84 | col("id").alias("movie_id"), 85 | col("adult").alias("is_adult"), 86 | col("budget"), 87 | col("original_language"), 88 | col("title"), 89 | col("popularity"), 90 | col("release_date"), 91 | col("revenue"), 92 | col("vote_count"), 93 | col("vote_average") 94 | ) 95 | 96 | movies_df = movies_df.na.drop() 97 | 98 | movies_df.write \ 99 | .format("jdbc") \ 100 | .option("url", redshift_conn_string) \ 101 | .option("dbtable", "movies.stage_movies") \ 102 | .option("user", sys.argv[6]) \ 
103 | .option("password", sys.argv[7]) \ 104 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 105 | .mode("append") \ 106 | .save() -------------------------------------------------------------------------------- /dags/python_scripts/load_staging_ratings.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from datetime import datetime 4 | from pyspark import SparkConf, SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import (StructType, StructField as Fld, DoubleType as Dbl, 7 | IntegerType as Int, TimestampType as Timestamp, 8 | DateType as Date) 9 | from pyspark.sql.functions import col 10 | 11 | 12 | def create_spark_session(aws_key, aws_secret_key): 13 | """ 14 | Description: Creates spark session. 15 | Returns: 16 | spark session object 17 | """ 18 | 19 | spark = SparkSession \ 20 | .builder \ 21 | .config("spark.executor.heartbeatInterval", "40s") \ 22 | .getOrCreate() 23 | 24 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.impl", 25 | "org.apache.hadoop.fs.s3a.S3AFileSystem") 26 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key) 27 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_key) 28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com") 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "100") 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.connection.maximum", "5000") 31 | return spark 32 | 33 | if __name__ == "__main__": 34 | s3_bucket = sys.argv[1] 35 | s3_key = sys.argv[2] 36 | aws_key = sys.argv[3] 37 | aws_secret_key = sys.argv[4] 38 | redshift_conn_string = sys.argv[5] 39 | db_user = sys.argv[6] 40 | db_pass = sys.argv[7] 41 | 42 | spark = create_spark_session(aws_key, aws_secret_key) 43 | 44 | ratings_schema = StructType([ 45 | Fld("userId", Int()), 46 | Fld("movieId", Int()), 47 | Fld("rating", Dbl()), 48 | Fld("timestamp", Timestamp()) 49 | ]) 50 | 51 | ratings_df = spark.read.option("header", "true") \ 52 | .csv("s3a://{}/{}/ratings.csv".format(s3_bucket, s3_key), 53 | schema=ratings_schema) 54 | 55 | ratings_df = ratings_df.select( 56 | col("userId").alias("user_id"), 57 | col("movieId").alias("movie_id"), 58 | col("rating") 59 | ) 60 | 61 | ratings_df.write \ 62 | .format("jdbc") \ 63 | .option("url", redshift_conn_string) \ 64 | .option("dbtable", "movies.stage_ratings") \ 65 | .option("user", sys.argv[6]) \ 66 | .option("password", sys.argv[7]) \ 67 | .option("driver", "com.amazon.redshift.jdbc42.Driver") \ 68 | .mode("append") \ 69 | .save() 70 | -------------------------------------------------------------------------------- /dags/sql_scripts/create_tables.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | CREATE SCHEMA IF NOT EXISTS movies; 4 | 5 | 6 | CREATE TABLE IF NOT EXISTS movies.stage_ratings ( 7 | user_movie_id BIGINT IDENTITY(0,1), 8 | user_id INTEGER NOT NULL, 9 | movie_id INTEGER NOT NULL, 10 | rating NUMERIC, 11 | primary key (user_movie_id) 12 | ) diststyle key distkey(movie_id); 13 | 14 | CREATE TABLE IF NOT EXISTS movies.ratings ( 15 | user_movie_id INT IDENTITY(0,1), 16 | user_id INTEGER NOT NULL, 17 | movie_id INTEGER NOT NULL, 18 | rating NUMERIC, 19 | primary key (user_movie_id) 20 | ) diststyle key distkey(movie_id); 21 | 22 | CREATE TABLE IF NOT EXISTS movies.stage_movies ( 23 | movie_id INT NOT NULL, 24 | is_adult 
VARCHAR(5) NOT NULL, 25 | budget BIGINT NOT NULL, 26 | original_language CHAR(2) NOT NULL, 27 | title VARCHAR(300) NOT NULL, 28 | popularity FLOAT, 29 | release_date DATE NOT NULL, 30 | revenue BIGINT NOT NULL SORTKEY, 31 | vote_count INT, 32 | vote_average FLOAT, 33 | primary key (movie_id) 34 | ) diststyle key distkey(movie_id); 35 | 36 | CREATE TABLE IF NOT EXISTS movies.movies ( 37 | movie_id INT NOT NULL, 38 | is_adult VARCHAR(5) NOT NULL, 39 | budget BIGINT NOT NULL, 40 | original_language CHAR(2) NOT NULL, 41 | title VARCHAR(300) NOT NULL, 42 | popularity FLOAT, 43 | release_date DATE, 44 | revenue BIGINT NOT NULL SORTKEY, 45 | vote_count INT, 46 | vote_average FLOAT, 47 | primary key (movie_id) 48 | ) diststyle key distkey(movie_id); 49 | 50 | CREATE TABLE IF NOT EXISTS movies.stage_movie_genre ( 51 | movie_id INT NOT NULL, 52 | genre_id INT NOT NULL, 53 | primary key (movie_id, genre_id) 54 | ) 55 | diststyle key distkey(movie_id); 56 | 57 | CREATE TABLE IF NOT EXISTS movies.movie_genre ( 58 | movie_id INT NOT NULL, 59 | genre_id INT NOT NULL, 60 | primary key (movie_id, genre_id) 61 | ) 62 | diststyle key distkey(movie_id); 63 | 64 | CREATE TABLE IF NOT EXISTS movies.stage_genre ( 65 | genre_id INT NOT NULL, 66 | genre_name VARCHAR(300), 67 | primary key (genre_id) 68 | ) 69 | diststyle all; 70 | 71 | CREATE TABLE IF NOT EXISTS movies.genre ( 72 | genre_id INT NOT NULL, 73 | genre_name VARCHAR(300), 74 | primary key (genre_id) 75 | ) 76 | diststyle all; 77 | 78 | CREATE TABLE IF NOT EXISTS movies.stage_date ( 79 | release_date DATE NOT NULL SORTKEY, 80 | day INT, 81 | week INT, 82 | month INT, 83 | quarter INT, 84 | year INT, 85 | primary key (release_date) 86 | ) 87 | diststyle all; 88 | 89 | CREATE TABLE IF NOT EXISTS movies.date ( 90 | release_date DATE NOT NULL SORTKEY, 91 | day INT, 92 | week INT, 93 | month INT, 94 | quarter INT, 95 | year INT, 96 | primary key (release_date) 97 | ) 98 | diststyle all; 99 | 100 | CREATE TABLE IF NOT EXISTS movies.stage_cpi ( 101 | date_cd DATE NOT NULL SORTKEY, 102 | consumer_price_index FLOAT 103 | ) 104 | diststyle all; 105 | 106 | CREATE TABLE IF NOT EXISTS movies.cpi ( 107 | date_cd DATE NOT NULL SORTKEY, 108 | consumer_price_index FLOAT 109 | ) 110 | diststyle all; 111 | 112 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_cpi.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert cpi table 4 | UPDATE movies.cpi 5 | SET consumer_price_index = sc.consumer_price_index 6 | FROM movies.stage_cpi sc 7 | WHERE movies.cpi.date_cd= sc.date_cd; 8 | 9 | 10 | INSERT INTO movies.cpi 11 | SELECT sc.* FROM movies.stage_cpi sc LEFT JOIN movies.cpi 12 | ON sc.date_cd = movies.cpi.date_cd 13 | WHERE movies.cpi.date_cd IS NULL; 14 | 15 | DROP TABLE IF EXISTS movies.stage_cpi; 16 | 17 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_date.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert date table 4 | UPDATE movies.date 5 | SET day = md.day, week = md.week, month = md.month, 6 | quarter = md.quarter, year = md.year 7 | FROM movies.stage_date md 8 | WHERE movies.date.release_date = md.release_date; 9 | 10 | INSERT INTO movies.date 11 | SELECT md.* FROM movies.stage_date md LEFT JOIN movies.date 12 | ON md.release_date = movies.date.release_date 13 | WHERE movies.date.release_date IS NULL; 14 | 15 | DROP 
TABLE IF EXISTS movies.stage_date; 16 | 17 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_genre.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Update movie_genre table 4 | INSERT INTO movies.movie_genre 5 | SELECT mg.* FROM movies.stage_movie_genre mg LEFT JOIN movies.movie_genre 6 | ON mg.movie_id = movies.movie_genre.movie_id AND mg.genre_id = movies.movie_genre.genre_id 7 | WHERE movies.movie_genre.movie_id IS NULL; 8 | 9 | DROP TABLE movies.stage_movie_genre; 10 | 11 | -- Upsert genre table 12 | UPDATE movies.genre 13 | SET genre_name = mg.genre_name 14 | FROM movies.stage_genre mg 15 | WHERE movies.genre.genre_id = mg.genre_id; 16 | 17 | 18 | INSERT INTO movies.genre 19 | SELECT mg.* FROM movies.stage_genre mg LEFT JOIN movies.genre 20 | ON mg.genre_id = movies.genre.genre_id 21 | WHERE movies.genre.genre_id IS NULL; 22 | 23 | DROP TABLE movies.stage_genre; 24 | 25 | END; -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_movies.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert movies table 4 | UPDATE movies.movies 5 | SET is_adult = mm.is_adult, budget = mm.budget, original_language = mm.original_language, 6 | title = mm.title, popularity = mm.popularity, release_date = mm.release_date, 7 | revenue = mm.revenue, vote_count = mm.vote_count, vote_average = mm.vote_average 8 | FROM movies.stage_movies mm 9 | WHERE movies.movies.movie_id = mm.movie_id; 10 | 11 | 12 | INSERT INTO movies.movies 13 | SELECT mm.* FROM movies.stage_movies mm LEFT JOIN movies.movies 14 | ON mm.movie_id = movies.movies.movie_id 15 | WHERE movies.movies.movie_id IS NULL; 16 | 17 | DROP TABLE movies.stage_movies; 18 | 19 | END; 20 | 21 | -------------------------------------------------------------------------------- /dags/sql_scripts/upsert_ratings.sql: -------------------------------------------------------------------------------- 1 | BEGIN; 2 | 3 | -- Upsert ratings table 4 | UPDATE movies.ratings 5 | SET rating = ms.rating 6 | FROM movies.stage_ratings ms 7 | WHERE movies.ratings.user_id = ms.user_id AND movies.ratings.movie_id = ms.movie_id; 8 | 9 | 10 | INSERT INTO movies.ratings 11 | SELECT ms.* FROM movies.stage_ratings ms LEFT JOIN movies.ratings 12 | ON ms.user_id = movies.ratings.user_id AND ms.movie_id = movies.ratings.movie_id 13 | WHERE movies.ratings.user_id IS NULL; 14 | 15 | DROP TABLE IF EXISTS movies.stage_ratings; 16 | 17 | END; -------------------------------------------------------------------------------- /docker-compose-LocalExecutor.yml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | logging: 10 | options: 11 | max-size: 10m 12 | max-file: "3" 13 | 14 | webserver: 15 | image: alanchn31/alanchn31-capstone-udacity-de-nd:1 16 | restart: always 17 | depends_on: 18 | - postgres 19 | environment: 20 | - LOAD_EX=n 21 | - EXECUTOR=Local 22 | - PYSPARK_PYTHON=/usr/bin/python3 23 | logging: 24 | options: 25 | max-size: 10m 26 | max-file: "3" 27 | volumes: 28 | - ./dags:/usr/local/airflow/dags 29 | - ./plugins:/usr/local/airflow/plugins 30 | ports: 31 | - "8080:8080" 32 | command: webserver 33 | healthcheck: 34 | test: ["CMD-SHELL", "[ -f
/usr/local/airflow/airflow-webserver.pid ]"] 35 | interval: 30s 36 | timeout: 30s 37 | retries: 3 -------------------------------------------------------------------------------- /documentation/Data Dictionary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/Data Dictionary.pdf -------------------------------------------------------------------------------- /documentation/README_images/architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/architecture.PNG -------------------------------------------------------------------------------- /documentation/README_images/dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/dag.PNG -------------------------------------------------------------------------------- /documentation/README_images/data_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/data_model.png -------------------------------------------------------------------------------- /documentation/README_images/logo.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alanchn31/Movalytics-Data-Warehouse/11aeefe0ea1d61660351d5ba158d729759f458a4/documentation/README_images/logo.PNG -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | 7 | # Defining the plugin class 8 | class MoviesPlugin(AirflowPlugin): 9 | name = "movies_plugin" 10 | operators = [ 11 | operators.DataQualityOperator 12 | ] -------------------------------------------------------------------------------- /plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.data_quality import DataQualityOperator 2 | 3 | __all__ = [ 4 | 'DataQualityOperator' 5 | ] -------------------------------------------------------------------------------- /plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class DataQualityOperator(BaseOperator): 6 | ui_color = '#89DA59' 7 | 8 | @apply_defaults 9 | def __init__(self, 10 | redshift_conn_id="", 11 | table_names=[""], 12 | *args, **kwargs): 13 | 14 | super(DataQualityOperator, self).__init__(*args, **kwargs) 15 | self.redshift_conn_id = redshift_conn_id 16 | self.table_names = table_names 17 | 18 | def execute(self, context): 19 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 20 | for table in self.table_names: 21 | # Check that entries are being copied to 
table 22 | records = redshift.get_records("SELECT COUNT(*) FROM {}".format(table)) 23 | if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1: 24 | raise ValueError("Data quality check failed. {} returned no results or contains no rows".format(table)) 25 | 26 | # Check that there are no rows with null ids 27 | dq_checks=[ 28 | {'table': 'movies.movies', 29 | 'check_sql': "SELECT COUNT(*) FROM movies.movies WHERE movie_id is null", 30 | 'expected_result': 0}, 31 | {'table': 'movies.genre', 32 | 'check_sql': "SELECT COUNT(*) FROM movies.genre WHERE genre_id is null", 33 | 'expected_result': 0}, 34 | {'table': 'movies.date', 35 | 'check_sql': "SELECT COUNT(*) FROM movies.date WHERE release_date is null", 36 | 'expected_result': 0}, 37 | {'table': 'movies.cpi', 38 | 'check_sql': "SELECT COUNT(*) FROM movies.cpi WHERE date_cd is null", 39 | 'expected_result': 0}, 40 | ] 41 | for check in dq_checks: 42 | records = redshift.get_records(check['check_sql']) 43 | if records[0][0] != check['expected_result']: 44 | print("Number of rows with null ids: ", records[0][0]) 45 | print("Expected number of rows with null ids: ", check['expected_result']) 46 | raise ValueError("Data quality check failed. {} contains null in id column".format(check['table'])) -------------------------------------------------------------------------------- /script/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | AIRFLOW_HOME="/usr/local/airflow" 4 | CMD="airflow" 5 | TRY_LOOP="20" 6 | 7 | : ${REDIS_HOST:="redis"} 8 | : ${REDIS_PORT:="6379"} 9 | 10 | : ${POSTGRES_HOST:="postgres"} 11 | : ${POSTGRES_PORT:="5432"} 12 | : ${POSTGRES_USER:="airflow"} 13 | : ${POSTGRES_PASSWORD:="airflow"} 14 | : ${POSTGRES_DB:="airflow"} 15 | 16 | : ${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")} 17 | 18 | # Load DAG examples (default: Yes) 19 | if [ "$LOAD_EX" = "n" ]; then 20 | sed -i "s/load_examples = True/load_examples = False/" "$AIRFLOW_HOME"/airflow.cfg 21 | fi 22 | 23 | # Install custom Python packages if requirements.txt is present 24 | if [ -e "/requirements.txt" ]; then 25 | $(which pip) install --user -r /requirements.txt 26 | fi 27 | 28 | # Update airflow config - Fernet key 29 | sed -i "s|\$FERNET_KEY|$FERNET_KEY|" "$AIRFLOW_HOME"/airflow.cfg 30 | 31 | # Wait for PostgreSQL 32 | if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; then 33 | i=0 34 | while ! nc -z $POSTGRES_HOST $POSTGRES_PORT >/dev/null 2>&1 < /dev/null; do 35 | i=$((i+1)) 36 | if [ "$1" = "webserver" ]; then 37 | echo "$(date) - waiting for ${POSTGRES_HOST}:${POSTGRES_PORT}... $i/$TRY_LOOP" 38 | if [ $i -ge $TRY_LOOP ]; then 39 | echo "$(date) - ${POSTGRES_HOST}:${POSTGRES_PORT} still not reachable, giving up" 40 | exit 1 41 | fi 42 | fi 43 | sleep 10 44 | done 45 | fi 46 | 47 | # Update configuration depending on the type of Executor 48 | if [ "$EXECUTOR" = "Celery" ] 49 | then 50 | # Wait for Redis 51 | if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] || [ "$1" = "flower" ] ; then 52 | j=0 53 | while ! nc -z $REDIS_HOST $REDIS_PORT >/dev/null 2>&1 < /dev/null; do 54 | j=$((j+1)) 55 | if [ $j -ge $TRY_LOOP ]; then 56 | echo "$(date) - $REDIS_HOST still not reachable, giving up" 57 | exit 1 58 | fi 59 | echo "$(date) - waiting for Redis...
$j/$TRY_LOOP" 60 | sleep 5 61 | done 62 | fi 63 | sed -i "s#celery_result_backend = db+postgresql://airflow:airflow@postgres/airflow#celery_result_backend = db+postgresql://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg 64 | sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = postgresql+psycopg2://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg 65 | sed -i "s#broker_url = redis://redis:6379/1#broker_url = redis://$REDIS_HOST:$REDIS_PORT/1#" "$AIRFLOW_HOME"/airflow.cfg 66 | if [ "$1" = "webserver" ]; then 67 | echo "Initialize database..." 68 | $CMD initdb 69 | exec $CMD webserver 70 | else 71 | sleep 10 72 | exec $CMD "$@" 73 | fi 74 | elif [ "$EXECUTOR" = "Local" ] 75 | then 76 | sed -i "s/executor = CeleryExecutor/executor = LocalExecutor/" "$AIRFLOW_HOME"/airflow.cfg 77 | sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = postgresql+psycopg2://$POSTGRES_USER:$POSTGRES_PASSWORD@$POSTGRES_HOST:$POSTGRES_PORT/$POSTGRES_DB#" "$AIRFLOW_HOME"/airflow.cfg 78 | sed -i "s#broker_url = redis://redis:6379/1#broker_url = redis://$REDIS_HOST:$REDIS_PORT/1#" "$AIRFLOW_HOME"/airflow.cfg 79 | echo "Initialize database..." 80 | $CMD initdb 81 | exec $CMD webserver & 82 | exec $CMD scheduler 83 | # By default we use SequentialExecutor 84 | else 85 | if [ "$1" = "version" ]; then 86 | exec $CMD version 87 | exit 88 | fi 89 | sed -i "s/executor = CeleryExecutor/executor = SequentialExecutor/" "$AIRFLOW_HOME"/airflow.cfg 90 | sed -i "s#sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@postgres/airflow#sql_alchemy_conn = sqlite:////usr/local/airflow/airflow.db#" "$AIRFLOW_HOME"/airflow.cfg 91 | echo "Initialize database..." 92 | $CMD initdb 93 | exec $CMD webserver 94 | fi --------------------------------------------------------------------------------
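As a usage note, below is a minimal sketch of how the `DataQualityOperator` from `plugins/operators/data_quality.py` might be instantiated inside `dags/movies_dwh_dag.py`. The DAG settings, the `redshift` connection id, and the import path are illustrative assumptions only (Airflow 1.10's legacy plugin mechanism exposes plugin operators under `airflow.operators.<plugin_name>`); the operator's parameters and the `movies.*` table names come from the operator code above.

```python
# Illustrative sketch only -- not the project's actual DAG wiring.
# Assumes an Airflow connection named "redshift" has been configured for the
# warehouse; the connection id and DAG arguments are placeholders.
from datetime import datetime

from airflow import DAG
# MoviesPlugin registers the operator, so under Airflow 1.10 it can be
# imported via the legacy plugin namespace airflow.operators.movies_plugin.
from airflow.operators.movies_plugin import DataQualityOperator

with DAG(dag_id="movies_dwh_dag_sketch",
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:

    run_quality_checks = DataQualityOperator(
        task_id="run_data_quality_checks",
        redshift_conn_id="redshift",  # placeholder connection id
        table_names=["movies.movies", "movies.genre",
                     "movies.date", "movies.cpi"],  # tables checked in data_quality.py
    )
```

In the real pipeline this task would be chained after the upsert tasks, so the row-count and null-id checks validate freshly loaded data.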