├── .gitignore ├── LICENSE ├── README.md ├── airflow ├── Dockerfile ├── README.md ├── custom_scripts │ ├── __init__.py │ ├── ingest_reddit.py │ └── preprocessing.py ├── dags │ ├── load_datawarehouse.sql │ └── stocks_dag.py ├── docker-compose.yaml ├── requirements.txt └── scripts │ └── entrypoint.sh ├── images ├── airflow.png ├── architecture.png └── dashboard.png ├── spark ├── README.md ├── spark-bigquery-latest_2.12.jar └── wordcount_by_date.py └── terraform ├── main.tf └── variables.tf /.gitignore: -------------------------------------------------------------------------------- 1 | # SPECIFIC TO THIS PROJECT 2 | 3 | # terraform 4 | *.tfstate 5 | *.tfstate.* 6 | **.terraform 7 | **.terraform.lock.* 8 | 9 | # files 10 | *.parquet 11 | *.ipynb 12 | commands.txt 13 | 14 | #airflow 15 | airflow/logs/ 16 | 17 | # END 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | pip-wheel-metadata/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | db.sqlite3 80 | db.sqlite3-journal 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Zachary 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 11 | 12 | # Data Pipeline for Reddit data (r/Stocks) 13 | 14 |
15 | Table of Contents 16 |
  1. About The Project
  2. Getting Started
  3. Usage
  4. Help
  5. Roadmap for Future Development
  6. Contributing
  7. License
  8. Contact
  9. Acknowledgements
47 | 48 | 49 | ## About The Project 50 | 51 | [![Dashboard][dashboard_screenshot]](https://datastudio.google.com/s/mjIjKwWNUQU) 52 | 53 | Interested in exploring Reddit data for trends, analytics, or just for the fun of it? 54 | 55 | This project builds a data pipeline (from data ingestion to visualisation) that stores and preprocesses data over any time period that you want. 56 | 57 |

(back to top)

58 | 59 | ### Built With 60 | 61 | * Data Ingestion: [Pushshift API](https://github.com/pushshift/api) 62 | * Infrastructure as Code: [Terraform](https://www.terraform.io/) 63 | * Workflow Orchestration: [Airflow](https://airflow.apache.org) 64 | * Data Lake: [Google Cloud Storage](https://cloud.google.com/storage) 65 | * Data Warehouse: [Google BigQuery](https://cloud.google.com/bigquery) 66 | * Batch Processing: [Spark](https://spark.apache.org/) on [Dataproc](https://cloud.google.com/dataproc) 67 | * Visualisation: [Google Data Studio](https://datastudio.google.com/) 68 | 69 | ![architecture][architecture_diagram] 70 | Cloud infrastructure is set up with Terraform. 71 | 72 | Airflow runs in a local Docker container. 73 | It orchestrates the following on a weekly schedule: 74 | * Download data (JSON) from the Pushshift API 75 | * Parquetize the data and store it in a bucket on Google Cloud Storage 76 | * Write the data to a table on BigQuery 77 | * Create a cluster on Dataproc and submit a PySpark job that preprocesses the Parquet files from Google Cloud Storage 78 | * Write the preprocessed data to a table on BigQuery 79 | 80 |
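These steps map onto the task dependencies declared at the bottom of `airflow/dags/stocks_dag.py`, reproduced here for orientation (the full DAG is included further down in this repository):

```
# Task ordering in stocks_dag.py
download_data_task >> json_to_csv_task >> csv_to_parquet_task >> load_to_gcs_task \
    >> [delete_local_json_csv, create_BQ_external_table_task]
create_BQ_external_table_task >> BQ_create_partitioned_table_task
load_to_gcs_task >> create_wordcount_table_task >> create_cluster_operator_task \
    >> delete_wordcountdup_task >> wordcount_sparksubmit_task
```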

(back to top)

81 | 82 | ## Getting Started 83 | 84 | I created this project in WSL 2 (Windows Subsystem for Linux) on Windows 10. 85 | 86 | ### Prerequisites 87 | 88 | To get a local copy up and running in the same environment, you'll need to: 89 | * Install Python (3.8 or above) 90 | * Install VSCode 91 | * [Install WSL 2](https://docs.microsoft.com/en-us/windows/wsl/install) if you haven't already 92 | * [Install Terraform](https://www.terraform.io/downloads) for Linux 93 | * [Install Docker Desktop](https://docs.docker.com/desktop/windows/install/) 94 | * [Install Google Cloud SDK](https://cloud.google.com/sdk/docs/install-sdk#deb) for Ubuntu 95 | * Have a Google Cloud Platform account 96 | * Clone this repository locally 97 | 98 | ### Create a Google Cloud Project 99 | 1. Go to [Google Cloud](https://console.cloud.google.com/) and create a new project. I set the id to 'de-r-stocks'. 100 | 2. Go to IAM and [create a Service Account](https://cloud.google.com/docs/authentication/getting-started#creating_a_service_account) with these roles: 101 | * BigQuery Admin 102 | * Storage Admin 103 | * Storage Object Admin 104 | * Viewer 105 | 3. Download the Service Account credentials file, rename it to `de-r-stocks.json` and store it in `$HOME/.google/credentials/`. 106 | 4. In the Google Cloud console, enable the following APIs: 107 | * IAM API 108 | * IAM Service Account Credentials API 109 | * Cloud Dataproc API 110 | * Compute Engine API 111 | 112 | ### Set up the infrastructure on Google Cloud with Terraform 113 | I recommend executing the following steps in VSCode. 114 | 115 | 1. Using VSCode + WSL, open the project folder `de_r-stocks`. 116 | 2. Open `variables.tf` and modify: 117 | 118 | * `variable "project"` to your own project id (possibly not strictly necessary, but it keeps the config consistent) 119 | * `variable "region"` to your project region 120 | * `variable "credentials"` to your credentials path 121 | 122 | 3. Open the VSCode terminal and change directory to the terraform folder, e.g. `cd terraform`. 123 | 4. Initialise Terraform: `terraform init` 124 | 5. Plan the infrastructure: `terraform plan` 125 | 6. Apply the changes: `terraform apply` 126 | 127 | If everything goes right, you now have a bucket on Google Cloud Storage called 'datalake_de-r-stocks' and a dataset on BigQuery called 'stocks_data'. 128 | 129 | ### Set up Airflow 130 | 1. Using VSCode, open `docker-compose.yaml` and look for the `#self-defined` block. Modify the variables to match your setup. 131 | 2. Open `stocks_dag.py`. You may need to change the following: 132 | 133 | * `zone` in `CLUSTER_GENERATOR_CONFIG` 134 | * Parameters in `default_args` 135 | 136 |
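For reference, the `default_args` dictionary in `stocks_dag.py` currently looks like the excerpt below; adjust `start_date` and `end_date` to the period you want the weekly DAG to backfill, and tune the retry settings as needed:

```
# Excerpt from airflow/dags/stocks_dag.py
from datetime import datetime, timedelta

default_args = {
    "owner": "Zachary",
    "start_date": datetime(2022, 3, 1),
    "end_date": datetime(2022, 4, 30),
    "depends_on_past": False,
    "retries": 1,
    "retry_delay": timedelta(seconds=60)
}
```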

(back to top)

137 | 138 | ## Usage 139 | 140 | ### Start Airflow 141 | 1. Using the terminal, change the directory to the airflow folder, e.g. `cd airflow`. 142 | 2. Build the custom Airflow Docker image: `docker-compose build` 143 | 3. Initialise the Airflow configs: `docker-compose up airflow-init` 144 | 4. Run Airflow: `docker-compose up` 145 | 146 | If the setup was done correctly, you will be able to access the Airflow interface by going to `localhost:8080` in your browser. 147 | 148 | Username and password are both `airflow`. 149 | 150 | ### Prepare for Spark jobs on Dataproc 151 | 1. Go to `wordcount_by_date.py` and modify the string value of `BUCKET` to your bucket's id. 152 | 2. Store the initialisation and PySpark scripts in your bucket. They are required to create the cluster and to run the Spark job. 153 | 154 | Run in the terminal (using the correct bucket name and region): 155 | * `gsutil cp gs://goog-dataproc-initialization-actions-asia-southeast1/python/pip-install.sh gs://datalake_de-r-stocks/scripts` 156 | * `gsutil cp spark/wordcount_by_date.py gs://datalake_de-r-stocks/scripts` 157 | 158 |
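If you prefer Python over `gsutil` for the second copy, the upload can also be done with the same `google-cloud-storage` client that the DAG's `load_to_gcs` task uses — a minimal sketch, assuming your credentials are configured and the bucket name matches your setup:

```
# Upload the PySpark driver script to the data lake bucket (alternative to gsutil)
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("datalake_de-r-stocks")  # replace with your bucket id
blob = bucket.blob("scripts/wordcount_by_date.py")
blob.upload_from_filename("spark/wordcount_by_date.py")
print(f"Uploaded to gs://{bucket.name}/{blob.name}")
```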

(back to top)

159 | 160 | Now, you are ready to enable the DAG on Airflow and let it do its magic! 161 | 162 | ![airflow][airflow_screenshot] 163 | 164 | When you are done, stop the Airflow services by going to the `airflow` directory in the terminal and executing `docker-compose down`. 165 | 166 | ## Help 167 | 168 | **Authorisation error while trying to create a Dataproc cluster from Airflow** 169 | 1. Go to Google Cloud Platform's IAM page. 170 | 2. Under the Compute Engine default service account, add the roles 'Editor' and 'Dataproc Worker'. 171 | 172 | ## Roadmap for Future Development 173 | 174 | - [ ] Refactor code so that `subreddit` and `mode` can be changed easily. 175 | - [ ] Use Terraform to set up tables on BigQuery instead of creating tables as part of the DAG. 176 | - [ ] Unit tests 177 | - [ ] Data quality checks 178 | - [ ] CI/CD 179 | 180 |

(back to top)

181 | 182 | ## Contributing 183 | 184 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement". 185 | Don't forget to give the project a star! Thanks again! 186 | 187 | 1. Fork the Project 188 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 189 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 190 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 191 | 5. Open a Pull Request 192 | 193 |

(back to top)

194 | 195 | ## License 196 | 197 | Distributed under the MIT License. See `LICENSE` for more information. 198 | 199 | ## Contact 200 | 201 | [Connect with me on LinkedIn!](https://www.linkedin.com/in/zacharytancs/) 202 | 203 | ## Acknowledgements 204 | 205 | Resources that I found helpful and would like to give credit to: 206 | 207 | * [Data Engineering Zoomcamp by DataTalksClub](https://github.com/DataTalksClub/data-engineering-zoomcamp) 208 | * [Best-README-Template](https://github.com/othneildrew/Best-README-Template) 209 | 210 |

(back to top)

211 | 212 | 213 | 214 | [dashboard_screenshot]: images/dashboard.png 215 | [architecture_diagram]: images/architecture.png 216 | [airflow_screenshot]: images/airflow.png 217 | -------------------------------------------------------------------------------- /airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # First-time build can take upto 10 mins. 2 | 3 | FROM apache/airflow:2.2.3 4 | 5 | ENV AIRFLOW_HOME=/opt/airflow 6 | ENV PYTHONPATH="/opt/airflow/custom_scripts:${PYTHONPATH}" 7 | 8 | USER root 9 | RUN apt-get update -qq && apt-get install vim -qqq 10 | # git gcc g++ -qqq 11 | 12 | COPY requirements.txt . 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html 16 | 17 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 18 | 19 | ARG CLOUD_SDK_VERSION=322.0.0 20 | ENV GCLOUD_HOME=/home/google-cloud-sdk 21 | 22 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 23 | 24 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 25 | && TMP_DIR="$(mktemp -d)" \ 26 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 27 | && mkdir -p "${GCLOUD_HOME}" \ 28 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 29 | && "${GCLOUD_HOME}/install.sh" \ 30 | --bash-completion=false \ 31 | --path-update=false \ 32 | --usage-reporting=false \ 33 | --quiet \ 34 | && rm -rf "${TMP_DIR}" \ 35 | && gcloud --version 36 | 37 | WORKDIR $AIRFLOW_HOME 38 | 39 | COPY scripts scripts 40 | RUN chmod +x scripts 41 | 42 | USER $AIRFLOW_UID 43 | -------------------------------------------------------------------------------- /airflow/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## File structure 4 | 5 | ### custom_scripts 6 | This directory contains Python modules that are called by the PythonOperator in Airflow. 7 | 8 | ### dags 9 | This directory contains the DAG scripts, as well as SQL code that is called by the BigQueryInsertJobOperator in Airflow. 10 | 11 | ## Code breakdown 12 | 13 | ### Call Python functions from custom modules 14 | Official doc: https://airflow.apache.org/docs/apache-airflow/stable/modules_management.html 15 | 16 | - In DAG script (`stocks_dag.py`), import module: 17 | `from custom_scripts.ingest_reddit import extract_reddit_data` 18 | This allows you to set `extract_reddit_data` in python_callable of PythonOperator. 
19 | 20 | - In `Dockerfile`, set env: 21 | `ENV PYTHONPATH="/opt/airflow/custom_scripts:${PYTHONPATH}"` 22 | 23 | - In `docker-compose.yml`, mount directories under `x-airflow-common/volumes` so that Airflow can 'see' these paths: 24 | ``` 25 | volumes: 26 | - ./custom_scripts:/opt/airflow/custom_scripts 27 | - ./data/json:/opt/airflow/data/json 28 | - ./data/csv:/opt/airflow/data/csv 29 | - ./data/parquet:/opt/airflow/data/parquet 30 | ``` 31 | 32 | ### Create a Dataproc cluster in Airflow 33 | Official doc: https://airflow.apache.org/docs/apache-airflow-providers-google/stable/operators/cloud/dataproc.html#examples-of-job-configurations-to-submit 34 | 35 | The `gcloud` bash command is as follows: 36 | ``` 37 | gcloud dataproc clusters create de-spark-cluster \ 38 | --region asia-southeast1 \ 39 | --zone asia-southeast1-a \ 40 | --single-node \ 41 | --master-machine-type n1-standard-4 \ 42 | --master-boot-disk-size 500 \ 43 | --image-version 2.0-debian10 \ 44 | --max-idle 900s \ 45 | --project de-r-stocks \ 46 | --metadata 'PIP_PACKAGES=spark-nlp' \ 47 | --initialization-actions gs://datalake_de-r-stocks/pip-install.sh 48 | ``` 49 | We can use ClusterGenerator to generate the cluster configuration instead of manually setting the API. 50 | ``` 51 | from airflow.providers.google.cloud.operators.dataproc import ClusterGenerator, DataprocCreateClusterOperator 52 | 53 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator( 54 | project_id=PROJECT_ID, 55 | zone="asia-southeast1-a", 56 | master_machine_type="n1-standard-4", 57 | master_disk_size=500, 58 | num_masters=1, 59 | num_workers=0, # single node mode 60 | idle_delete_ttl=900, # idle time before deleting cluster 61 | init_actions_uris=[f'gs://{BUCKET}/scripts/pip-install.sh'], 62 | metadata={'PIP_PACKAGES': 'spark-nlp'}, 63 | ).make() 64 | 65 | create_cluster_operator_task = DataprocCreateClusterOperator( 66 | task_id='create_dataproc_cluster', 67 | cluster_name="de-spark-cluster", 68 | project_id=PROJECT_ID, 69 | region="asia-southeast1", 70 | cluster_config=CLUSTER_GENERATOR_CONFIG 71 | ) 72 | ``` 73 | `init_actions_uris`: When the cluster is created, it will be initalised to install dependencies under `metadata` with pip 74 | 75 | `metadata`: `spark-nlp` is required in our PySpark job 76 | 77 | See https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/python 78 | 79 | ### Submit a PySpark job to a Dataproc cluster in Airflow 80 | Official doc: https://cloud.google.com/sdk/gcloud/reference/dataproc/jobs/submit/pyspark 81 | 82 | The documentation lays out clearly the bash code for submitting a PySpark job. 83 | 84 | Using the same API, my code looks like this: 85 | ``` 86 | gcloud dataproc jobs submit pyspark \ 87 | --cluster=de-spark-cluster \ 88 | --region=asia-southeast1 \ 89 | --project=de-r-stocks \ 90 | --jars=gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ 91 | --properties spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3 \ 92 | wordcount_by_date.py \ 93 | -- \ 94 | --input=gs://{BUCKET}/file.parquet \ 95 | --dataset=stocks_data 96 | --subreddit=stocks 97 | --mode=submission 98 | ``` 99 | `jars`: The JAR connector for Spark and BigQuery. This is required as the submit job will be writing processed data to a BigQuery table. 100 | `properties`: The package for sparknlp module, which is imported during the submit job. 101 | 102 | `wordcount_by_date.py`: The main .py file to run as the driver. It contains the PySpark code which does the preprocessing. 
In the above case, the file exists locally, but it can be on the cluster or in a storage bucket. 103 | 104 | The last four arguments are custom flags passed to the driver, e.g.: 105 | 106 | `gcloud dataproc jobs submit pyspark --cluster=my_cluster my_script.py -- --custom-flag` 107 | 108 | These arguments will be parsed and used in the `wordcount_by_date.py` script. 109 | 110 | Instead of using the bash code, I wrapped the above in the DataprocSubmitJobOperator. Besides the difference in API and the parameterisation, it is functionally identical. 111 | 112 | The code looks like this (you can find it in `stocks_dag.py`): 113 | ``` 114 | from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator 115 | 116 | PYSPARK_URI = f'gs://{BUCKET}/scripts/wordcount_by_date.py' 117 | 118 | 119 | pyspark_job = { 120 | "reference": {"project_id": PROJECT_ID}, 121 | "placement": {"cluster_name": 'de-spark-cluster'}, 122 | "pyspark_job": { 123 | "main_python_file_uri": PYSPARK_URI, 124 | "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], 125 | "properties": { 126 | "spark.jars.packages":"com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3" 127 | }, 128 | "args": [ 129 | f"--input=gs://{BUCKET}/{gcs_path}", 130 | f"--dataset={BIGQUERY_DATASET}", 131 | f"--subreddit={subreddit}", 132 | f"--mode={mode}" 133 | ] 134 | } 135 | } 136 | 137 | wordcount_sparksubmit_task = DataprocSubmitJobOperator( 138 | task_id='wordcount_sparksubmit', 139 | job=pyspark_job, 140 | region='asia-southeast1', 141 | project_id=PROJECT_ID, 142 | trigger_rule='all_done' 143 | ) 144 | ``` 145 | Note that the trigger_rule is not implemented correctly and can be left out. 146 | -------------------------------------------------------------------------------- /airflow/custom_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/airflow/custom_scripts/__init__.py -------------------------------------------------------------------------------- /airflow/custom_scripts/ingest_reddit.py: -------------------------------------------------------------------------------- 1 | import pendulum 2 | import requests 3 | import json 4 | import time 5 | 6 | url = "https://api.pushshift.io/reddit/search" 7 | 8 | def fetchObjects(mode, **kwargs): 9 | 10 | # Default parameters 11 | # Change as necessary/desired 12 | params = { 13 | "sorted_type": "created_utc", 14 | "sort": "asc", 15 | "size": "1000" 16 | } 17 | 18 | # Add additional parameters based on function arguments 19 | for key, value in kwargs.items(): 20 | params[key] = value 21 | 22 | loop = True 23 | while loop: 24 | # Perform API request 25 | r = requests.get(f'{url}/{mode}/', params=params, timeout=90) 26 | # print(r.url) 27 | if r.status_code != 200: 28 | print(r.status_code) 29 | print("Retrying...") 30 | else: 31 | # successful (200), loop = False and process data 32 | loop = False 33 | else: 34 | response = json.loads(r.text) 35 | data = response['data'] 36 | sorted_data_by_id = sorted(data, key=lambda x: int(x['id'],36)) 37 | return sorted_data_by_id 38 | 39 | def extract_reddit_data(subreddit, mode, start, end, filepath): 40 | 41 | # arg datetime format: pendulum.DateTime 42 | start = pendulum.parse(start) 43 | end = pendulum.parse(end) 44 | 45 | # convert DateTime to timestamp to pass into API 46 | start_ts = start.int_timestamp 47 | end_ts = end.int_timestamp 48 | 49 | max_id = 0 50 | 51 | 
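    # The while loop below pages through the Pushshift API: each fetchObjects()
    # call requests up to 1000 records sorted by created_utc ascending, start_ts
    # is advanced to the newest created_utc seen so far, and max_id tracks the
    # largest record id already written, so the loop stops once a batch contains
    # nothing new.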
# Open file for JSON output 52 | file = open(filepath, "a") 53 | 54 | while True: 55 | nothing_processed = True 56 | objects = fetchObjects(mode, subreddit=subreddit, after=start_ts, before=end_ts) 57 | 58 | for object in objects: 59 | id = int(object['id'],36) 60 | if id > max_id: 61 | nothing_processed = False 62 | created_utc = object['created_utc'] 63 | max_id = id 64 | if created_utc > start_ts: 65 | start_ts = created_utc 66 | # Output JSON data to the opened file 67 | file.write(json.dumps(object,sort_keys=True,ensure_ascii=True) + "\n") 68 | 69 | # Exit if nothing happened 70 | if nothing_processed: break 71 | start_ts -= 1 72 | 73 | # Sleep a little before the next function call 74 | time.sleep(.5) 75 | 76 | file.close() -------------------------------------------------------------------------------- /airflow/custom_scripts/preprocessing.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import logging 4 | import pyarrow.csv as pv 5 | import pyarrow.parquet as pq 6 | 7 | 8 | def json_to_csv(json_filepath, csv_filepath, mode): 9 | 10 | with open(json_filepath) as json_file: 11 | json_list = list(json_file) 12 | # Create new file to write to 13 | newfile = open(csv_filepath, 'w', encoding='utf-8', newline='') 14 | csv_writer = csv.writer(newfile) 15 | 16 | if mode == 'submission': 17 | csv_writer.writerow(["id","title","author", "created_utc", "num_comments","total_awards_received"]) 18 | for json_str in json_list: 19 | # convert string to json object 20 | result = json.loads(json_str) 21 | # write each column, row by row 22 | id = result['id'] 23 | title = result['title'] 24 | author = result['author'] 25 | created_utc = result['created_utc'] 26 | num_comments = result['num_comments'] 27 | total_awards_received = result['total_awards_received'] 28 | csv_writer.writerow([id, title, author, created_utc, num_comments, total_awards_received]) 29 | newfile.close() 30 | 31 | elif mode == 'comment': 32 | csv_writer.writerow(["id", "author", "created_utc", "body", "total_awards_received"]) 33 | for json_str in json_list: 34 | # convert string to json object 35 | result = json.loads(json_str) 36 | # write each column, row by row 37 | id = result['id'] 38 | author = result['author'] 39 | created_utc = result['created_utc'] 40 | body = result['body'] 41 | total_awards_received = result['total_awards_received'] 42 | csv_writer.writerow([id, title, author, created_utc, body, total_awards_received]) 43 | newfile.close() 44 | 45 | def csv_to_parquet(csv_filepath, parquet_filepath): 46 | 47 | if not csv_filepath.endswith('.csv'): 48 | logging.error("Not a CSV file") 49 | return 50 | table = pv.read_csv(csv_filepath) 51 | pq.write_table(table, parquet_filepath) -------------------------------------------------------------------------------- /airflow/dags/load_datawarehouse.sql: -------------------------------------------------------------------------------- 1 | -- create temp table (with modification) 2 | CREATE OR REPLACE TEMP TABLE {{ subreddit }}_{{ mode }} 3 | AS 4 | SELECT id, title, author, num_comments, total_awards_received, DATE(TIMESTAMP_SECONDS(created_utc)) AS {{ mode }}_date 5 | FROM {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_external_table; 6 | 7 | -- if permanent table does not exist, create permanent table from temp table (with partition) 8 | CREATE TABLE IF NOT EXISTS {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all 9 | PARTITION BY {{ mode }}_date 10 | AS 11 | SELECT * FROM {{ subreddit }}_{{ mode }}; 
12 | 13 | -- maintain idempotency using delete-write 14 | -- delete rows from permanent table (only delete data that the pipeline will re-create) 15 | DELETE {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all 16 | WHERE {{ mode }}_date BETWEEN '{{ ds }}' AND '{{ macros.ds_add(data_interval_end.strftime('%Y-%m-%d'), -1) }}'; 17 | 18 | -- insert data from temp table to permanent table 19 | INSERT INTO {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all 20 | SELECT * FROM {{ subreddit }}_{{ mode }}; -------------------------------------------------------------------------------- /airflow/dags/stocks_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.operators.python import PythonOperator 6 | 7 | from google.cloud import storage 8 | from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator 9 | from airflow.providers.google.cloud.operators.dataproc import ClusterGenerator, DataprocCreateClusterOperator, DataprocSubmitJobOperator 10 | 11 | from custom_scripts.ingest_reddit import extract_reddit_data 12 | from custom_scripts.preprocessing import json_to_csv, csv_to_parquet 13 | 14 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 15 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 16 | BIGQUERY_DATASET = os.environ.get('BIGQUERY_DATASET', 'stocks_data') 17 | PROJECT_ID = os.environ.get('GCP_PROJECT_ID') 18 | PYSPARK_URI = f'gs://{BUCKET}/scripts/wordcount_by_date.py' 19 | 20 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator( 21 | project_id=PROJECT_ID, 22 | zone="asia-southeast1-a", 23 | master_machine_type="n1-standard-4", 24 | master_disk_size=500, 25 | num_masters=1, 26 | num_workers=0, # single node mode 27 | idle_delete_ttl=900, # idle time before deleting cluster 28 | init_actions_uris=[f'gs://{BUCKET}/scripts/pip-install.sh'], 29 | metadata={'PIP_PACKAGES': 'spark-nlp'}, 30 | ).make() 31 | 32 | def load_to_gcs(bucket, object_name, local_file): 33 | """ 34 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 35 | :param bucket: GCS bucket name 36 | :param object_name: target path & file-name 37 | :param local_file: source path & file-name 38 | :return: 39 | """ 40 | client = storage.Client() 41 | bucket = client.bucket(bucket) 42 | 43 | blob = bucket.blob(object_name) 44 | blob.upload_from_filename(local_file) 45 | 46 | def reddit_pipeline_template( 47 | # arguments 48 | dag, 49 | subreddit, 50 | mode, 51 | json_filepath, 52 | csv_filepath, 53 | parquet_filepath, 54 | gcs_path 55 | ): 56 | with dag: 57 | download_data_task = PythonOperator( 58 | task_id = 'ingest_reddit_json', 59 | python_callable = extract_reddit_data, 60 | op_kwargs = { 61 | 'subreddit': subreddit, 62 | 'mode': mode, 63 | 'start': '{{ data_interval_start }}', 64 | 'end': '{{ data_interval_end }}', 65 | 'filepath': json_filepath 66 | } 67 | ) 68 | 69 | json_to_csv_task = PythonOperator( 70 | task_id = 'json_to_csv', 71 | python_callable = json_to_csv, 72 | op_kwargs = { 73 | 'json_filepath': json_filepath, 74 | 'csv_filepath': csv_filepath, 75 | 'mode': mode 76 | } 77 | ) 78 | 79 | csv_to_parquet_task = PythonOperator( 80 | task_id = 'csv_to_parquet', 81 | python_callable = csv_to_parquet, 82 | op_kwargs = { 83 | 'csv_filepath': csv_filepath, 84 | 'parquet_filepath': parquet_filepath 85 | } 86 | ) 87 | 88 | load_to_gcs_task = PythonOperator( 89 | task_id 
= "load_to_gcs", 90 | python_callable = load_to_gcs, 91 | op_kwargs={ 92 | "bucket": BUCKET, 93 | "object_name": gcs_path, 94 | "local_file": parquet_filepath, 95 | } 96 | ) 97 | 98 | delete_local_json_csv = BashOperator( 99 | task_id = "delete_local_json_csv", 100 | bash_command = f'rm {json_filepath} {csv_filepath}' 101 | ) 102 | 103 | QUERY = f'''CREATE OR REPLACE EXTERNAL TABLE {BIGQUERY_DATASET}.{subreddit}_{mode}_external_table 104 | OPTIONS ( 105 | format="PARQUET", 106 | uris=["gs://{BUCKET}/{gcs_path}"] 107 | );''' 108 | 109 | create_BQ_external_table_task = BigQueryInsertJobOperator( 110 | task_id = 'create_external_table', 111 | configuration={ 112 | 'query': { 113 | 'query': QUERY, 114 | 'useLegacySql': False, 115 | } 116 | } 117 | ) 118 | 119 | # Create a partitioned table from external table 120 | BQ_create_partitioned_table_task = BigQueryInsertJobOperator( 121 | task_id = "bq_create_partitioned_table", 122 | configuration={ 123 | "query": { 124 | "query": "{% include 'load_datawarehouse.sql' %}", 125 | "useLegacySql": False, 126 | } 127 | } 128 | ) 129 | 130 | QUERY_CREATE_WORDCOUNT_TABLE = ''' 131 | CREATE TABLE IF NOT EXISTS {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_wordcount ( 132 | word STRING, 133 | wordcount INTEGER, 134 | {{ mode }}_date DATE 135 | ) 136 | PARTITION BY {{ mode }}_date''' 137 | 138 | create_wordcount_table_task = BigQueryInsertJobOperator( 139 | task_id = 'create_wordcount_table', 140 | configuration={ 141 | 'query': { 142 | 'query': QUERY_CREATE_WORDCOUNT_TABLE, 143 | 'useLegacySql': False, 144 | } 145 | } 146 | ) 147 | # task will marked as 'success' if cluster exists 148 | create_cluster_operator_task = DataprocCreateClusterOperator( 149 | task_id='create_dataproc_cluster', 150 | cluster_name="de-spark-cluster", 151 | project_id=PROJECT_ID, 152 | region="asia-southeast1", 153 | cluster_config=CLUSTER_GENERATOR_CONFIG 154 | ) 155 | 156 | QUERY_DELETE_WORDCOUNT_ROWS = ''' 157 | DELETE {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_wordcount 158 | WHERE {{ mode }}_date BETWEEN '{{ ds }}' AND '{{ macros.ds_add(data_interval_end.strftime('%Y-%m-%d'), -1) }}'; 159 | ''' 160 | 161 | # delete any existing duplicate rows before writing 162 | delete_wordcountdup_task = BigQueryInsertJobOperator( 163 | task_id = 'delete_wordcountdup', 164 | configuration={ 165 | 'query': { 166 | 'query': QUERY_DELETE_WORDCOUNT_ROWS, 167 | 'useLegacySql': False, 168 | } 169 | } 170 | ) 171 | 172 | pyspark_job = { 173 | "reference": {"project_id": PROJECT_ID}, 174 | "placement": {"cluster_name": 'de-spark-cluster'}, 175 | "pyspark_job": { 176 | "main_python_file_uri": PYSPARK_URI, 177 | "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], 178 | "properties": { 179 | "spark.jars.packages":"com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3" 180 | }, 181 | "args": [ 182 | f"--input=gs://{BUCKET}/{gcs_path}", 183 | f"--dataset={BIGQUERY_DATASET}", 184 | f"--subreddit={subreddit}", 185 | f"--mode={mode}" 186 | ] 187 | } 188 | } 189 | 190 | wordcount_sparksubmit_task = DataprocSubmitJobOperator( 191 | task_id='wordcount_sparksubmit', 192 | job=pyspark_job, 193 | region='asia-southeast1', 194 | project_id=PROJECT_ID, 195 | trigger_rule='all_done' 196 | ) 197 | 198 | download_data_task >> json_to_csv_task >> csv_to_parquet_task >> load_to_gcs_task >> [delete_local_json_csv, create_BQ_external_table_task] 199 | create_BQ_external_table_task >> BQ_create_partitioned_table_task 200 | load_to_gcs_task >> create_wordcount_table_task >> create_cluster_operator_task 
>> delete_wordcountdup_task >> wordcount_sparksubmit_task 201 | 202 | default_args = { 203 | "owner": "Zachary", 204 | "start_date": datetime(2022, 3, 1), 205 | "end_date": datetime(2022, 4, 30), 206 | "depends_on_past": False, 207 | "retries": 1, 208 | "retry_delay": timedelta(seconds=60) 209 | } 210 | 211 | # all dag definitions (dag = DAG()) should be in the global scope 212 | stocks_submission_weekly_dag = DAG( 213 | dag_id = 'stocks_submission_weekly', 214 | schedule_interval = '@weekly', 215 | catchup = True, 216 | max_active_runs = 3, 217 | default_args = default_args, 218 | user_defined_macros={ 219 | "BIGQUERY_DATASET": BIGQUERY_DATASET, 220 | "subreddit": 'stocks', 221 | "mode": 'submission', 222 | } 223 | ) 224 | 225 | # submission 226 | reddit_pipeline_template( 227 | dag = stocks_submission_weekly_dag, 228 | subreddit = 'stocks', 229 | mode = 'submission', 230 | json_filepath = AIRFLOW_HOME + '/data/json/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.json', 231 | csv_filepath = AIRFLOW_HOME + '/data/csv/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.csv', 232 | parquet_filepath = AIRFLOW_HOME + '/data/parquet/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet', 233 | gcs_path = 'stocks/submission/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet' 234 | ) 235 | 236 | 237 | #---------------------------- THE SECTION BELOW IS UNUSED --------------------------- 238 | # stocks_comment_weekly_dag = DAG( 239 | # dag_id = 'stocks_comment_weekly', 240 | # schedule_interval = '@weekly', 241 | # catchup = True, 242 | # max_active_runs = 2, 243 | # default_args = default_args 244 | # ) 245 | 246 | # comment 247 | # reddit_pipeline_template( 248 | # dag = stocks_comment_weekly_dag, 249 | # subreddit = 'stocks', 250 | # mode = 'comment', 251 | # json_filepath = AIRFLOW_HOME + '/data/json/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.json', 252 | # csv_filepath = AIRFLOW_HOME + '/data/csv/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.csv', 253 | # parquet_filepath = AIRFLOW_HOME + '/data/parquet/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet', 254 | # gcs_path = 'stocks/comment/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet' 255 | # ) -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. 
Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.2.4 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.4} 48 | build: . 49 | environment: 50 | &airflow-common-env 51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 52 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 53 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 54 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 55 | AIRFLOW__CORE__FERNET_KEY: '' 56 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 57 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 58 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 59 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 60 | 61 | # self-defined 62 | GCP_PROJECT_ID: 'de-r-stocks' 63 | GCP_GCS_BUCKET: 'datalake_de-r-stocks' 64 | GOOGLE_APPLICATION_CREDENTIALS: '/.google/credentials/de-r-stocks.json' 65 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/de-r-stocks.json' 66 | volumes: 67 | - ./dags:/opt/airflow/dags 68 | - ./logs:/opt/airflow/logs 69 | - ./plugins:/opt/airflow/plugins 70 | - ~/.google/credentials/:/.google/credentials:ro 71 | - ./custom_scripts:/opt/airflow/custom_scripts 72 | - ./data/json:/opt/airflow/data/json 73 | - ./data/csv:/opt/airflow/data/csv 74 | - ./data/parquet:/opt/airflow/data/parquet 75 | user: "${AIRFLOW_UID:-50000}:0" 76 | depends_on: 77 | &airflow-common-depends-on 78 | redis: 79 | condition: service_healthy 80 | postgres: 81 | condition: service_healthy 82 | 83 | services: 84 | postgres: 85 | image: postgres:13 86 | environment: 87 | POSTGRES_USER: airflow 88 | POSTGRES_PASSWORD: airflow 89 | POSTGRES_DB: airflow 90 | volumes: 91 | - postgres-db-volume:/var/lib/postgresql/data 92 | healthcheck: 93 | test: ["CMD", "pg_isready", "-U", "airflow"] 94 | interval: 5s 95 | retries: 5 96 | restart: always 97 | 98 | redis: 99 | image: redis:latest 100 | expose: 101 | - 6379 102 | healthcheck: 103 | test: ["CMD", "redis-cli", "ping"] 104 | interval: 5s 105 | timeout: 30s 106 | retries: 50 107 | restart: always 108 | 109 | airflow-webserver: 110 | <<: *airflow-common 111 | command: webserver 112 | ports: 113 | - 8080:8080 114 | healthcheck: 115 | test: ["CMD", "curl", "--fail", 
"http://localhost:8080/health"] 116 | interval: 10s 117 | timeout: 10s 118 | retries: 5 119 | restart: always 120 | depends_on: 121 | <<: *airflow-common-depends-on 122 | airflow-init: 123 | condition: service_completed_successfully 124 | 125 | airflow-scheduler: 126 | <<: *airflow-common 127 | command: scheduler 128 | healthcheck: 129 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 130 | interval: 10s 131 | timeout: 10s 132 | retries: 5 133 | restart: always 134 | depends_on: 135 | <<: *airflow-common-depends-on 136 | airflow-init: 137 | condition: service_completed_successfully 138 | 139 | airflow-worker: 140 | <<: *airflow-common 141 | command: celery worker 142 | healthcheck: 143 | test: 144 | - "CMD-SHELL" 145 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 146 | interval: 10s 147 | timeout: 10s 148 | retries: 5 149 | environment: 150 | <<: *airflow-common-env 151 | # Required to handle warm shutdown of the celery workers properly 152 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 153 | DUMB_INIT_SETSID: "0" 154 | restart: always 155 | depends_on: 156 | <<: *airflow-common-depends-on 157 | airflow-init: 158 | condition: service_completed_successfully 159 | 160 | airflow-triggerer: 161 | <<: *airflow-common 162 | command: triggerer 163 | healthcheck: 164 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 165 | interval: 10s 166 | timeout: 10s 167 | retries: 5 168 | restart: always 169 | depends_on: 170 | <<: *airflow-common-depends-on 171 | airflow-init: 172 | condition: service_completed_successfully 173 | 174 | airflow-init: 175 | <<: *airflow-common 176 | entrypoint: /bin/bash 177 | # yamllint disable rule:line-length 178 | command: 179 | - -c 180 | - | 181 | function ver() { 182 | printf "%04d%04d%04d%04d" $${1//./ } 183 | } 184 | airflow_version=$$(gosu airflow airflow version) 185 | airflow_version_comparable=$$(ver $${airflow_version}) 186 | min_airflow_version=2.2.0 187 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 188 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 189 | echo 190 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 191 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 192 | echo 193 | exit 1 194 | fi 195 | if [[ -z "${AIRFLOW_UID}" ]]; then 196 | echo 197 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 198 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 199 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 200 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 201 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 202 | echo 203 | fi 204 | one_meg=1048576 205 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 206 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 207 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 208 | warning_resources="false" 209 | if (( mem_available < 4000 )) ; then 210 | echo 211 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 212 | echo "At least 4GB of memory required. 
You have $$(numfmt --to iec $$((mem_available * one_meg)))" 213 | echo 214 | warning_resources="true" 215 | fi 216 | if (( cpus_available < 2 )); then 217 | echo 218 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 219 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 220 | echo 221 | warning_resources="true" 222 | fi 223 | if (( disk_available < one_meg * 10 )); then 224 | echo 225 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 226 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 227 | echo 228 | warning_resources="true" 229 | fi 230 | if [[ $${warning_resources} == "true" ]]; then 231 | echo 232 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 233 | echo "Please follow the instructions to increase amount of resources available:" 234 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 235 | echo 236 | fi 237 | mkdir -p /sources/logs /sources/dags /sources/plugins 238 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 239 | exec /entrypoint airflow version 240 | # yamllint enable rule:line-length 241 | environment: 242 | <<: *airflow-common-env 243 | _AIRFLOW_DB_UPGRADE: 'true' 244 | _AIRFLOW_WWW_USER_CREATE: 'true' 245 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 246 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 247 | user: "0:0" 248 | volumes: 249 | - .:/sources 250 | 251 | airflow-cli: 252 | <<: *airflow-common 253 | profiles: 254 | - debug 255 | environment: 256 | <<: *airflow-common-env 257 | CONNECTION_CHECK_MAX_COUNT: "0" 258 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 259 | command: 260 | - bash 261 | - -c 262 | - airflow 263 | 264 | flower: 265 | <<: *airflow-common 266 | command: celery flower 267 | ports: 268 | - 5555:5555 269 | healthcheck: 270 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 271 | interval: 10s 272 | timeout: 10s 273 | retries: 5 274 | restart: always 275 | depends_on: 276 | <<: *airflow-common-depends-on 277 | airflow-init: 278 | condition: service_completed_successfully 279 | 280 | volumes: 281 | postgres-db-volume: 282 | -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-google 2 | pyarrow 3 | pendulum 4 | requests -------------------------------------------------------------------------------- /airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /images/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/airflow.png 
-------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/architecture.png -------------------------------------------------------------------------------- /images/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/dashboard.png -------------------------------------------------------------------------------- /spark/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## Code breakdown for `wordcount_by_date.py` 4 | The script does the following: 5 | - Reads a Parquet file from a GCS bucket 6 | - It assumes that the file contains the columns 'author', 'created_utc' and 'title' 7 | - Builds an NLP pipeline and transforms the text data in 'title' with the pipeline 8 | - https://nlp.johnsnowlabs.com/docs/en/install#python 9 | - https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb 10 | - Creates a dataframe that contains the token count on each date, e.g.: 11 | 12 | | word | wordcount | submission_date | 13 | |--------|-------|------------| 14 | | invest | 6 | 2022-04-11 | 15 | | invest | 4 | 2022-04-12 | 16 | | market | 2 | 2022-04-12 | 17 | 18 | - Lastly, it writes the dataframe containing the word counts to a BigQuery table 19 | 20 | ### Write dataframe into a BigQuery table 21 | Official doc: https://github.com/GoogleCloudDataproc/spark-bigquery-connector 22 | 23 | ``` 24 | df_wordcountbydate.write.format('bigquery') \ 25 | .option('table', f'{dataset}.{subreddit}_{mode}_wordcount') \ 26 | .option('temporaryGcsBucket', BUCKET) \ 27 | .option('partitionField', f'{mode}_date') \ 28 | .option('partitionType', 'DAY') \ 29 | .mode('append') \ 30 | .save() 31 | ``` 32 | 33 | This is done using a BigQuery connector for Spark. The connector must be specified when submitting the PySpark job for this script, which is done in the Airflow DAG `stocks_dag.py`.
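Once a run has completed, a quick sanity check of the output can be done with the BigQuery Python client — a minimal sketch, assuming `google-cloud-bigquery` is installed and that you kept the default names used in this project (project `de-r-stocks`, dataset `stocks_data`, table `stocks_submission_wordcount`):

```
# Top words over March 2022, read back from the wordcount table written by the Spark job
from google.cloud import bigquery

client = bigquery.Client(project="de-r-stocks")
query = """
    SELECT word, SUM(wordcount) AS total
    FROM `stocks_data.stocks_submission_wordcount`
    WHERE submission_date BETWEEN '2022-03-01' AND '2022-03-31'
    GROUP BY word
    ORDER BY total DESC
    LIMIT 10
"""
for row in client.query(query).result():
    print(row.word, row.total)
```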
34 | -------------------------------------------------------------------------------- /spark/spark-bigquery-latest_2.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/spark/spark-bigquery-latest_2.12.jar -------------------------------------------------------------------------------- /spark/wordcount_by_date.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import StructField, StructType, StringType, FloatType, IntegerType 4 | from pyspark.ml.feature import CountVectorizer 5 | from pyspark.ml import Pipeline 6 | import pyspark.sql.functions as F 7 | 8 | from sparknlp.annotator import LemmatizerModel, Tokenizer, Normalizer, StopWordsCleaner, NGramGenerator 9 | from sparknlp.base import Finisher, DocumentAssembler 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--input', required=True) 14 | parser.add_argument('--dataset', required=True) 15 | parser.add_argument('--subreddit', required=True) 16 | parser.add_argument('--mode', required=True) 17 | args = parser.parse_args() 18 | 19 | input = args.input 20 | dataset = args.dataset 21 | subreddit = args.subreddit 22 | mode = args.mode 23 | 24 | # change this to your bucket 25 | # bucket is used as temporary storage while writing data from Spark to BigQuery 26 | BUCKET = 'datalake_de-r-stocks' 27 | 28 | # Start Spark session 29 | spark = SparkSession.builder \ 30 | .appName('preprocessing_wordcount') \ 31 | .config('"spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3"') \ 32 | .getOrCreate() 33 | 34 | # %% 35 | # Access data from GCS 36 | df = spark.read.parquet(input) 37 | 38 | # 1. Remove posts by AutoModerator 39 | # 2. Remove duplicate titles 40 | # 3. Convert unix timestamp to date 41 | # 4. 
Keep title and date columns 42 | df_filter = df.filter(~F.col('author').contains('AutoModerator')) \ 43 | .dropDuplicates(['title']) \ 44 | .withColumn('date', F.from_unixtime(F.col('created_utc'), 'yyyy-MM-dd')) \ 45 | .select('title', 'date') 46 | 47 | documentAssembler = DocumentAssembler() \ 48 | .setInputCol('title') \ 49 | .setOutputCol('title_document') 50 | 51 | tokenizer = Tokenizer() \ 52 | .setInputCols(['title_document']) \ 53 | .setOutputCol('title_token') 54 | 55 | normalizer = Normalizer() \ 56 | .setInputCols(['title_token']) \ 57 | .setOutputCol('title_normalized') \ 58 | .setLowercase(True) 59 | 60 | lemmatizer = LemmatizerModel.pretrained() \ 61 | .setInputCols(['title_normalized']) \ 62 | .setOutputCol('title_lemma') 63 | 64 | stopwords_cleaner = StopWordsCleaner() \ 65 | .setInputCols(['title_lemma']) \ 66 | .setOutputCol('title_cleaned') \ 67 | .setCaseSensitive(False) 68 | 69 | ngrams_cum = NGramGenerator() \ 70 | .setInputCols(["title_cleaned"]) \ 71 | .setOutputCol("title_ngrams") \ 72 | .setN(2) \ 73 | .setEnableCumulative(True)\ 74 | .setDelimiter("_") # Default is space 75 | 76 | finisher = Finisher() \ 77 | .setInputCols(['title_ngrams']) \ 78 | .setOutputCols(['title_finished']) \ 79 | .setCleanAnnotations(False) 80 | 81 | nlpPipeline = Pipeline(stages=[ 82 | documentAssembler, 83 | tokenizer, 84 | normalizer, 85 | lemmatizer, 86 | stopwords_cleaner, 87 | ngrams_cum, 88 | finisher 89 | ]) 90 | 91 | df_result = nlpPipeline.fit(df_filter).transform(df_filter).select('title_finished', 'date') 92 | 93 | # CountVectorizer model 94 | cv = CountVectorizer(inputCol='title_finished', outputCol='features', minDF=3.0) 95 | 96 | # Train on all submissions 97 | model = cv.fit(df_result) 98 | 99 | df_tokensbydate = df_result.groupBy('date').agg(F.flatten(F.collect_list('title_finished')).alias('title_finished')) 100 | 101 | # Get counts for each date 102 | counts = model.transform(df_tokensbydate).select('date','features').collect() 103 | 104 | # Create empty dataframe 105 | df_wordcountbydate = spark.createDataFrame(spark.sparkContext.emptyRDD(), 106 | schema=StructType(fields=[ 107 | StructField("word", StringType()), 108 | StructField("count", FloatType()), 109 | StructField("date", StringType())])) 110 | 111 | # Append count for each day to dataframe 112 | for row in range(len(counts)): 113 | test_dict = dict(zip(model.vocabulary, (float(x) for x in counts[row]['features'].values))) 114 | df_temp = spark.createDataFrame(test_dict.items(), 115 | schema=StructType(fields=[ 116 | StructField("word", StringType()), 117 | StructField("count", FloatType())])) 118 | df_temp = df_temp.withColumn('date', F.lit(counts[row]['date'])) 119 | df_wordcountbydate = df_wordcountbydate.unionAll(df_temp) 120 | 121 | # %% 122 | 123 | df_wordcountbydate = df_wordcountbydate.withColumn('count', F.col('count').cast(IntegerType())) \ 124 | .withColumn(f'{mode}_date', F.to_date(F.col('date'), 'yyyy-MM-dd')) \ 125 | .withColumnRenamed('count', 'wordcount') \ 126 | .drop('date') 127 | 128 | # upload dataframe to BigQuery 129 | df_wordcountbydate.write.format('bigquery') \ 130 | .option('table', f'{dataset}.{subreddit}_{mode}_wordcount') \ 131 | .option('temporaryGcsBucket', BUCKET) \ 132 | .option('partitionField', f'{mode}_date') \ 133 | .option('partitionType', 'DAY') \ 134 | .mode('append') \ 135 | .save() -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | 
terraform { 2 | required_version = ">= 1.0" 3 | required_providers { 4 | google = { 5 | source = "hashicorp/google" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | credentials = file(var.credentials) 12 | project = var.project 13 | region = var.region 14 | } 15 | 16 | resource "google_storage_bucket" "data-lake-bucket" { 17 | name = "${local.data_lake_bucket}_${var.project}" 18 | location = var.region 19 | 20 | storage_class = var.storage_class 21 | uniform_bucket_level_access = true 22 | 23 | versioning { 24 | enabled = true 25 | } 26 | 27 | lifecycle_rule { 28 | action { 29 | type = "Delete" 30 | } 31 | condition { 32 | age = 30 33 | } 34 | } 35 | 36 | force_destroy = true 37 | } 38 | 39 | resource "google_bigquery_dataset" "dataset" { 40 | dataset_id = var.BQ_DATASET 41 | project = var.project 42 | location = var.region 43 | } -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | data_lake_bucket = "datalake" 3 | } 4 | 5 | variable "project" { 6 | default = "de-r-stocks" 7 | description = "GCP project ID" 8 | } 9 | 10 | variable "region" { 11 | type = string 12 | default = "asia-southeast1" 13 | description = "Region for GCP resources" 14 | } 15 | 16 | variable "storage_class" { 17 | default = "STANDARD" 18 | description = "Storage class type for bucket" 19 | } 20 | 21 | variable "BQ_DATASET" { 22 | type = string 23 | default = "stocks_data" 24 | description = "BigQuery dataset that raw data from GCS will be written to" 25 | } 26 | 27 | variable "credentials" { 28 | type = string 29 | default = "/home/ztmj96/.google/credentials/de-r-stocks.json" 30 | description = "Path for GCP account credentials" 31 | } 32 | 33 | 34 | --------------------------------------------------------------------------------