├── .gitignore
├── LICENSE
├── README.md
├── airflow
│   ├── Dockerfile
│   ├── README.md
│   ├── custom_scripts
│   │   ├── __init__.py
│   │   ├── ingest_reddit.py
│   │   └── preprocessing.py
│   ├── dags
│   │   ├── load_datawarehouse.sql
│   │   └── stocks_dag.py
│   ├── docker-compose.yaml
│   ├── requirements.txt
│   └── scripts
│       └── entrypoint.sh
├── images
│   ├── airflow.png
│   ├── architecture.png
│   └── dashboard.png
├── spark
│   ├── README.md
│   ├── spark-bigquery-latest_2.12.jar
│   └── wordcount_by_date.py
└── terraform
    ├── main.tf
    └── variables.tf
/.gitignore:
--------------------------------------------------------------------------------
1 | # SPECIFIC TO THIS PROJECT
2 |
3 | # terraform
4 | *.tfstate
5 | *.tfstate.*
6 | **.terraform
7 | **.terraform.lock.*
8 |
9 | # files
10 | *.parquet
11 | *.ipynb
12 | commands.txt
13 |
14 | #airflow
15 | airflow/logs/
16 |
17 | # END
18 |
19 | # Byte-compiled / optimized / DLL files
20 | __pycache__/
21 | *.py[cod]
22 | *$py.class
23 |
24 | # C extensions
25 | *.so
26 |
27 | # Distribution / packaging
28 | .Python
29 | build/
30 | develop-eggs/
31 | dist/
32 | downloads/
33 | eggs/
34 | .eggs/
35 | lib/
36 | lib64/
37 | parts/
38 | sdist/
39 | var/
40 | wheels/
41 | pip-wheel-metadata/
42 | share/python-wheels/
43 | *.egg-info/
44 | .installed.cfg
45 | *.egg
46 | MANIFEST
47 |
48 | # PyInstaller
49 | # Usually these files are written by a python script from a template
50 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
51 | *.manifest
52 | *.spec
53 |
54 | # Installer logs
55 | pip-log.txt
56 | pip-delete-this-directory.txt
57 |
58 | # Unit test / coverage reports
59 | htmlcov/
60 | .tox/
61 | .nox/
62 | .coverage
63 | .coverage.*
64 | .cache
65 | nosetests.xml
66 | coverage.xml
67 | *.cover
68 | *.py,cover
69 | .hypothesis/
70 | .pytest_cache/
71 |
72 | # Translations
73 | *.mo
74 | *.pot
75 |
76 | # Django stuff:
77 | *.log
78 | local_settings.py
79 | db.sqlite3
80 | db.sqlite3-journal
81 |
82 | # Flask stuff:
83 | instance/
84 | .webassets-cache
85 |
86 | # Scrapy stuff:
87 | .scrapy
88 |
89 | # Sphinx documentation
90 | docs/_build/
91 |
92 | # PyBuilder
93 | target/
94 |
95 | # Jupyter Notebook
96 | .ipynb_checkpoints
97 |
98 | # IPython
99 | profile_default/
100 | ipython_config.py
101 |
102 | # pyenv
103 | .python-version
104 |
105 | # pipenv
106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
109 | # install all needed dependencies.
110 | #Pipfile.lock
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Zachary
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Pipeline for Reddit data (r/Stocks)
2 | 
3 | Table of Contents
4 | 
5 | - About The Project
6 | - Getting Started
7 | - Usage
8 | - Help
9 | - Roadmap for Future Development
10 | - Contributing
11 | - License
12 | - Contact
13 | - Acknowledgements
14 | 
49 | ## About The Project
50 |
51 | [![Dashboard][dashboard_screenshot]](https://datastudio.google.com/s/mjIjKwWNUQU)
52 |
53 | Interested to explore Reddit data for trends, analytics, or just for the fun of it?
54 |
55 | This project builds a data pipeline (from data ingestion to visualisation) that stores and preprocesses Reddit data over any time period you want.
56 |
57 | (back to top)
58 |
59 | ### Built With
60 |
61 | * Data Ingestion: [Pushshift API](https://github.com/pushshift/api)
62 | * Infrastructure as Code: [Terraform](https://www.terraform.io/)
63 | * Workflow Orchestration: [Airflow](https://airflow.apache.org)
64 | * Data Lake: [Google Cloud Storage](https://cloud.google.com/storage)
65 | * Data Warehouse: [Google BigQuery](https://cloud.google.com/bigquery)
66 | * Batch Processing: [Spark](https://spark.apache.org/) on [Dataproc](https://cloud.google.com/dataproc)
67 | * Visualisation: [Google Data Studio](https://datastudio.google.com/)
68 |
69 | ![architecture][architecture_diagram]
70 | Cloud infrastructure is set up with Terraform.
71 |
72 | Airflow is run in a local Docker container.
73 | It orchestrates the following on a weekly schedule (the corresponding task chain from `stocks_dag.py` is sketched below):
74 | * Download data (JSON)
75 | * Parquetize the data and store it in a bucket on Google Cloud Storage
76 | * Write data to a table on BigQuery
77 | * Create cluster on Dataproc and submit PySpark job to preprocess parquet files from Google Cloud Storage
78 | * Write preprocessed data to a table on BigQuery
79 |
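For orientation, these steps correspond to the task dependencies declared in `airflow/dags/stocks_dag.py`:

```
# Task chain as declared in stocks_dag.py
download_data_task >> json_to_csv_task >> csv_to_parquet_task >> load_to_gcs_task >> [delete_local_json_csv, create_BQ_external_table_task]
create_BQ_external_table_task >> BQ_create_partitioned_table_task
load_to_gcs_task >> create_wordcount_table_task >> create_cluster_operator_task >> delete_wordcountdup_task >> wordcount_sparksubmit_task
```
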
80 | (back to top)
81 |
82 | ## Getting Started
83 |
84 | I created this project in WSL 2 (Windows Subsystem for Linux) on Windows 10.
85 |
86 | ### Prerequisites
87 |
88 | To get a local copy up and running in the same environment, you'll need to:
89 | * Install Python (3.8 and above)
90 | * Install VSCode
91 | * [Install WSL 2](https://docs.microsoft.com/en-us/windows/wsl/install) if you haven't
92 | * [Install Terraform](https://www.terraform.io/downloads) for Linux
93 | * [Install Docker Desktop](https://docs.docker.com/desktop/windows/install/)
94 | * [Install Google Cloud SDK](https://cloud.google.com/sdk/docs/install-sdk#deb) for Ubuntu
95 | * Have a Google Cloud Platform account
96 | * Clone this repository locally
97 |
98 | ### Create a Google Cloud Project
99 | 1. Go to [Google Cloud](https://console.cloud.google.com/) and create a new project. I set the id to 'de-r-stocks'.
100 | 2. Go to IAM and [create a Service Account](https://cloud.google.com/docs/authentication/getting-started#creating_a_service_account) with these roles:
101 | * BigQuery Admin
102 | * Storage Admin
103 | * Storage Object Admin
104 | * Viewer
105 | 3. Download the Service Account credentials, rename it to `de-r-stocks.json` and store it in `$HOME/.google/credentials/`.
106 | 4. On the Google console, enable the following APIs:
107 | * IAM API
108 | * IAM Service Account Credentials API
109 | * Cloud Dataproc API
110 | * Compute Engine API
111 |
112 | ### Set up the infrastructure on Google Cloud with Terraform
113 | I recommend executing the following in VSCode.
114 |
115 | 1. Using VSCode + WSL, open the project folder `de_r-stocks`.
116 | 2. Open `variables.tf` and modify:
117 |
118 | * `variable "project"` to your own project id (this may not be necessary if the default already matches)
119 | * `variable "region"` to your project region
120 | * `variable "credentials"` to your credentials path
121 |
122 | 3. Open the VSCode terminal and change directory to the terraform folder, e.g. `cd terraform`.
123 | 4. Initialise Terraform: `terraform init`
124 | 5. Plan the infrastructure: `terraform plan`
125 | 6. Apply the changes: `terraform apply`
126 |
127 | If everything goes right, you now have a bucket on Google Cloud Storage called 'datalake_de-r-stocks' and a dataset on BigQuery called 'stocks_data'.
128 |
129 | ### Set up Airflow
130 | 1. Using VSCode, open `docker-compose.yaml` and look for the `#self-defined` block. Modify the variables to match your setup.
131 | 2. Open `stocks_dag.py`. You may need to change the following:
132 |
133 | * `zone` in `CLUSTER_GENERATOR_CONFIG`
134 | * Parameters in `default_args`
135 |
136 | (back to top)
137 |
138 | ## Usage
139 |
140 | ### Start Airflow
141 | 1. Using the terminal, change the directory to the airflow folder, e.g. `cd airflow`.
142 | 2. Build the custom Airflow docker image: `docker-compose build`
143 | 3. Initialise the Airflow configs: `docker-compose up airflow-init`
144 | 4. Run Airflow: `docker-compose up`
145 |
146 | If the setup was done correctly, you will be able to access the Airflow interface by going to `localhost:8080` in your browser.
147 |
148 | Username and password are both `airflow`.
149 |
150 | ### Prepare for Spark jobs on Dataproc
151 | 1. Go to `wordcount_by_date.py` and modify the string value of `BUCKET` to your bucket's id.
152 | 2. Store the initialisation and PySpark scripts in your bucket. They are required to create the cluster and run the Spark job.
153 |
154 | Run in the terminal (using the correct bucket name and region):
155 | * `gsutil cp gs://goog-dataproc-initialization-actions-asia-southeast1/python/pip-install.sh gs://datalake_de-r-stocks/scripts`
156 | * `gsutil cp spark/wordcount_by_date.py gs://datalake_de-r-stocks/scripts`
157 |
158 | (back to top)
159 |
160 | Now, you are ready to enable the DAG on Airflow and let it do its magic!
161 |
162 | ![airflow][airflow_screenshot]
163 |
164 | When you are done, just stop the Airflow services by going to the `airflow` directory in the terminal and executing `docker-compose down`.
165 |
166 | ## Help
167 |
168 | **Authorisation error while trying to create a Dataproc cluster from Airflow**
169 | 1. Go to Google Cloud Platform's IAM page.
170 | 2. For the Compute Engine default service account, add the roles 'Editor' and 'Dataproc Worker'.
171 |
172 | ## Roadmap for Future Development
173 |
174 | - [ ] Refactor code so that `subreddit` and `mode` can be changed conveniently.
175 | - [ ] Use Terraform to set up tables on BigQuery instead of creating tables as part of the DAG.
176 | - [ ] Unit tests
177 | - [ ] Data quality checks
178 | - [ ] CI/CD
179 |
180 | (back to top)
181 |
182 | ## Contributing
183 |
184 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement".
185 | Don't forget to give the project a star! Thanks again!
186 |
187 | 1. Fork the Project
188 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
189 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
190 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
191 | 5. Open a Pull Request
192 |
193 | (back to top)
194 |
195 | ## License
196 |
197 | Distributed under the MIT License. See `LICENSE` for more information.
198 |
199 | ## Contact
200 |
201 | [Connect with me on LinkedIn!](https://www.linkedin.com/in/zacharytancs/)
202 |
203 | ## Acknowledgements
204 |
205 | Resources that I found helpful and would like to give credit to:
206 |
207 | * [Data Engineering Zoomcamp by DataTalksClub](https://github.com/DataTalksClub/data-engineering-zoomcamp)
208 | * [Best-README-Template](https://github.com/othneildrew/Best-README-Template)
209 |
210 | (back to top)
211 |
212 |
213 |
214 | [dashboard_screenshot]: images/dashboard.png
215 | [architecture_diagram]: images/architecture.png
216 | [airflow_screenshot]: images/airflow.png
217 |
--------------------------------------------------------------------------------
/airflow/Dockerfile:
--------------------------------------------------------------------------------
1 | # First-time build can take up to 10 mins.
2 |
3 | FROM apache/airflow:2.2.3
4 |
5 | ENV AIRFLOW_HOME=/opt/airflow
6 | ENV PYTHONPATH="/opt/airflow/custom_scripts:${PYTHONPATH}"
7 |
8 | USER root
9 | RUN apt-get update -qq && apt-get install vim -qqq
10 | # git gcc g++ -qqq
11 |
12 | COPY requirements.txt .
13 | RUN pip install --no-cache-dir -r requirements.txt
14 |
15 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html
16 |
17 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
18 |
19 | ARG CLOUD_SDK_VERSION=322.0.0
20 | ENV GCLOUD_HOME=/home/google-cloud-sdk
21 |
22 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}"
23 |
24 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \
25 | && TMP_DIR="$(mktemp -d)" \
26 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \
27 | && mkdir -p "${GCLOUD_HOME}" \
28 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \
29 | && "${GCLOUD_HOME}/install.sh" \
30 | --bash-completion=false \
31 | --path-update=false \
32 | --usage-reporting=false \
33 | --quiet \
34 | && rm -rf "${TMP_DIR}" \
35 | && gcloud --version
36 |
37 | WORKDIR $AIRFLOW_HOME
38 |
39 | COPY scripts scripts
40 | RUN chmod -R +x scripts
41 |
42 | USER $AIRFLOW_UID
43 |
--------------------------------------------------------------------------------
/airflow/README.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | ## File structure
4 |
5 | ### custom_scripts
6 | This directory contains Python modules that are called by the PythonOperator in Airflow.
7 |
8 | ### dags
9 | This directory contains the DAG scripts, as well as SQL code that is called by the BigQueryInsertJobOperator in Airflow.
10 |
11 | ## Code breakdown
12 |
13 | ### Call Python functions from custom modules
14 | Official doc: https://airflow.apache.org/docs/apache-airflow/stable/modules_management.html
15 |
16 | - In DAG script (`stocks_dag.py`), import module:
17 | `from custom_scripts.ingest_reddit import extract_reddit_data`
18 | This allows you to set `extract_reddit_data` as the `python_callable` of a PythonOperator (see the sketch after this list).
19 |
20 | - In `Dockerfile`, set env:
21 | `ENV PYTHONPATH="/opt/airflow/custom_scripts:${PYTHONPATH}"`
22 |
23 | - In `docker-compose.yaml`, mount directories under `x-airflow-common/volumes` so that Airflow can 'see' these paths:
24 | ```
25 | volumes:
26 | - ./custom_scripts:/opt/airflow/custom_scripts
27 | - ./data/json:/opt/airflow/data/json
28 | - ./data/csv:/opt/airflow/data/csv
29 | - ./data/parquet:/opt/airflow/data/parquet
30 | ```
31 |
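A minimal sketch of how the imported callable is wired into a task (this mirrors `stocks_dag.py`; the `op_kwargs` values here are illustrative):

```
from airflow.operators.python import PythonOperator
from custom_scripts.ingest_reddit import extract_reddit_data

# Declared inside the `with dag:` block in stocks_dag.py;
# the real filepath also includes a templated date suffix
download_data_task = PythonOperator(
    task_id='ingest_reddit_json',
    python_callable=extract_reddit_data,
    op_kwargs={
        'subreddit': 'stocks',
        'mode': 'submission',
        'start': '{{ data_interval_start }}',
        'end': '{{ data_interval_end }}',
        'filepath': '/opt/airflow/data/json/stocks_submission.json',
    }
)
```
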
32 | ### Create a Dataproc cluster in Airflow
33 | Official doc: https://airflow.apache.org/docs/apache-airflow-providers-google/stable/operators/cloud/dataproc.html#examples-of-job-configurations-to-submit
34 |
35 | The `gcloud` bash command is as follows:
36 | ```
37 | gcloud dataproc clusters create de-spark-cluster \
38 | --region asia-southeast1 \
39 | --zone asia-southeast1-a \
40 | --single-node \
41 | --master-machine-type n1-standard-4 \
42 | --master-boot-disk-size 500 \
43 | --image-version 2.0-debian10 \
44 | --max-idle 900s \
45 | --project de-r-stocks \
46 | --metadata 'PIP_PACKAGES=spark-nlp' \
47 | --initialization-actions gs://datalake_de-r-stocks/pip-install.sh
48 | ```
49 | We can use ClusterGenerator to generate the cluster configuration instead of writing the config dict for the API by hand.
50 | ```
51 | from airflow.providers.google.cloud.operators.dataproc import ClusterGenerator, DataprocCreateClusterOperator
52 |
53 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator(
54 | project_id=PROJECT_ID,
55 | zone="asia-southeast1-a",
56 | master_machine_type="n1-standard-4",
57 | master_disk_size=500,
58 | num_masters=1,
59 | num_workers=0, # single node mode
60 | idle_delete_ttl=900, # idle time before deleting cluster
61 | init_actions_uris=[f'gs://{BUCKET}/scripts/pip-install.sh'],
62 | metadata={'PIP_PACKAGES': 'spark-nlp'},
63 | ).make()
64 |
65 | create_cluster_operator_task = DataprocCreateClusterOperator(
66 | task_id='create_dataproc_cluster',
67 | cluster_name="de-spark-cluster",
68 | project_id=PROJECT_ID,
69 | region="asia-southeast1",
70 | cluster_config=CLUSTER_GENERATOR_CONFIG
71 | )
72 | ```
73 | `init_actions_uris`: When the cluster is created, it will be initialised by running this script, which pip-installs the dependencies listed under `metadata`
74 |
75 | `metadata`: `spark-nlp` is required in our PySpark job
76 |
77 | See https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/python
78 |
79 | ### Submit a PySpark job to a Dataproc cluster in Airflow
80 | Official doc: https://cloud.google.com/sdk/gcloud/reference/dataproc/jobs/submit/pyspark
81 |
82 | The documentation lays out clearly the bash code for submitting a PySpark job.
83 |
84 | Using the same API, my code looks like this:
85 | ```
86 | gcloud dataproc jobs submit pyspark \
87 | --cluster=de-spark-cluster \
88 | --region=asia-southeast1 \
89 | --project=de-r-stocks \
90 | --jars=gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \
91 | --properties spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3 \
92 | wordcount_by_date.py \
93 | -- \
94 | --input=gs://{BUCKET}/file.parquet \
95 | --dataset=stocks_data \
96 | --subreddit=stocks \
97 | --mode=submission
98 | ```
99 | `jars`: The JAR connector for Spark and BigQuery. This is required as the submit job will be writing processed data to a BigQuery table.
100 | `properties`: The package for sparknlp module, which is imported during the submit job.
101 |
102 | `wordcount_by_date.py`: The main .py file to run as the driver. It contains the PySpark code which does the preprocessing. In the above case, the file exists locally, but it can be on the cluster or in a storage bucket.
103 |
104 | The last four arguments are custom flags passed to the driver, e.g.:
105 |
106 | `gcloud dataproc jobs submit pyspark --cluster=my_cluster my_script.py -- --custom-flag`
107 |
108 | These arguments will be parsed and used in the `wordcount_by_date.py` script.
109 |
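On the driver side, these flags are parsed with `argparse`, as at the top of `wordcount_by_date.py`:

```
import argparse

# Parse the custom flags passed after the '--' separator of the submit command
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--dataset', required=True)
parser.add_argument('--subreddit', required=True)
parser.add_argument('--mode', required=True)
args = parser.parse_args()
```
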
110 | Instead of using the bash code, I wrapped the above in the DataprocSubmitJobOperator. Besides the difference in API and the parameterisation, it is functionally identical.
111 |
112 | The code looks like this (you can find it in `stocks_dag.py`):
113 | ```
114 | from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator
115 |
116 | PYSPARK_URI = f'gs://{BUCKET}/scripts/wordcount_by_date.py'
117 |
118 |
119 | pyspark_job = {
120 | "reference": {"project_id": PROJECT_ID},
121 | "placement": {"cluster_name": 'de-spark-cluster'},
122 | "pyspark_job": {
123 | "main_python_file_uri": PYSPARK_URI,
124 | "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"],
125 | "properties": {
126 | "spark.jars.packages":"com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3"
127 | },
128 | "args": [
129 | f"--input=gs://{BUCKET}/{gcs_path}",
130 | f"--dataset={BIGQUERY_DATASET}",
131 | f"--subreddit={subreddit}",
132 | f"--mode={mode}"
133 | ]
134 | }
135 | }
136 |
137 | wordcount_sparksubmit_task = DataprocSubmitJobOperator(
138 | task_id='wordcount_sparksubmit',
139 | job=pyspark_job,
140 | region='asia-southeast1',
141 | project_id=PROJECT_ID,
142 | trigger_rule='all_done'
143 | )
144 | ```
145 | Note that the trigger_rule is not implemented correctly and can be left out.
146 |
--------------------------------------------------------------------------------
/airflow/custom_scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/airflow/custom_scripts/__init__.py
--------------------------------------------------------------------------------
/airflow/custom_scripts/ingest_reddit.py:
--------------------------------------------------------------------------------
1 | import pendulum
2 | import requests
3 | import json
4 | import time
5 |
6 | url = "https://api.pushshift.io/reddit/search"
7 |
8 | def fetchObjects(mode, **kwargs):
9 |
10 | # Default parameters
11 | # Change as necessary/desired
12 | params = {
13 |         "sort_type": "created_utc",
14 | "sort": "asc",
15 | "size": "1000"
16 | }
17 |
18 | # Add additional parameters based on function arguments
19 | for key, value in kwargs.items():
20 | params[key] = value
21 |
22 | loop = True
23 | while loop:
24 | # Perform API request
25 | r = requests.get(f'{url}/{mode}/', params=params, timeout=90)
26 | # print(r.url)
27 | if r.status_code != 200:
28 | print(r.status_code)
29 | print("Retrying...")
30 | else:
31 | # successful (200), loop = False and process data
32 | loop = False
33 |     else:  # while-else: runs once the loop exits without break, i.e. after a 200 response
34 | response = json.loads(r.text)
35 | data = response['data']
36 | sorted_data_by_id = sorted(data, key=lambda x: int(x['id'],36))
37 | return sorted_data_by_id
38 |
39 | def extract_reddit_data(subreddit, mode, start, end, filepath):
40 |
41 | # arg datetime format: pendulum.DateTime
42 | start = pendulum.parse(start)
43 | end = pendulum.parse(end)
44 |
45 | # convert DateTime to timestamp to pass into API
46 | start_ts = start.int_timestamp
47 | end_ts = end.int_timestamp
48 |
49 | max_id = 0
50 |
51 | # Open file for JSON output
52 | file = open(filepath, "a")
53 |
54 | while True:
55 | nothing_processed = True
56 | objects = fetchObjects(mode, subreddit=subreddit, after=start_ts, before=end_ts)
57 |
58 | for object in objects:
59 | id = int(object['id'],36)
60 | if id > max_id:
61 | nothing_processed = False
62 | created_utc = object['created_utc']
63 | max_id = id
64 | if created_utc > start_ts:
65 | start_ts = created_utc
66 | # Output JSON data to the opened file
67 | file.write(json.dumps(object,sort_keys=True,ensure_ascii=True) + "\n")
68 |
69 | # Exit if nothing happened
70 | if nothing_processed: break
71 | start_ts -= 1
72 |
73 | # Sleep a little before the next function call
74 | time.sleep(.5)
75 |
76 | file.close()
--------------------------------------------------------------------------------
/airflow/custom_scripts/preprocessing.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import json
3 | import logging
4 | import pyarrow.csv as pv
5 | import pyarrow.parquet as pq
6 |
7 |
8 | def json_to_csv(json_filepath, csv_filepath, mode):
9 |
10 | with open(json_filepath) as json_file:
11 | json_list = list(json_file)
12 | # Create new file to write to
13 | newfile = open(csv_filepath, 'w', encoding='utf-8', newline='')
14 | csv_writer = csv.writer(newfile)
15 |
16 | if mode == 'submission':
17 | csv_writer.writerow(["id","title","author", "created_utc", "num_comments","total_awards_received"])
18 | for json_str in json_list:
19 | # convert string to json object
20 | result = json.loads(json_str)
21 | # write each column, row by row
22 | id = result['id']
23 | title = result['title']
24 | author = result['author']
25 | created_utc = result['created_utc']
26 | num_comments = result['num_comments']
27 | total_awards_received = result['total_awards_received']
28 | csv_writer.writerow([id, title, author, created_utc, num_comments, total_awards_received])
29 | newfile.close()
30 |
31 | elif mode == 'comment':
32 | csv_writer.writerow(["id", "author", "created_utc", "body", "total_awards_received"])
33 | for json_str in json_list:
34 | # convert string to json object
35 | result = json.loads(json_str)
36 | # write each column, row by row
37 | id = result['id']
38 | author = result['author']
39 | created_utc = result['created_utc']
40 | body = result['body']
41 | total_awards_received = result['total_awards_received']
42 |             csv_writer.writerow([id, author, created_utc, body, total_awards_received])
43 | newfile.close()
44 |
45 | def csv_to_parquet(csv_filepath, parquet_filepath):
46 |
47 | if not csv_filepath.endswith('.csv'):
48 | logging.error("Not a CSV file")
49 | return
50 | table = pv.read_csv(csv_filepath)
51 | pq.write_table(table, parquet_filepath)
--------------------------------------------------------------------------------
/airflow/dags/load_datawarehouse.sql:
--------------------------------------------------------------------------------
1 | -- create temp table (with modification)
2 | CREATE OR REPLACE TEMP TABLE {{ subreddit }}_{{ mode }}
3 | AS
4 | SELECT id, title, author, num_comments, total_awards_received, DATE(TIMESTAMP_SECONDS(created_utc)) AS {{ mode }}_date
5 | FROM {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_external_table;
6 |
7 | -- if permanent table does not exist, create permanent table from temp table (with partition)
8 | CREATE TABLE IF NOT EXISTS {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all
9 | PARTITION BY {{ mode }}_date
10 | AS
11 | SELECT * FROM {{ subreddit }}_{{ mode }};
12 |
13 | -- maintain idempotency using delete-write
14 | -- delete rows from permanent table (only delete data that the pipeline will re-create)
15 | DELETE {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all
16 | WHERE {{ mode }}_date BETWEEN '{{ ds }}' AND '{{ macros.ds_add(data_interval_end.strftime('%Y-%m-%d'), -1) }}';
17 |
18 | -- insert data from temp table to permanent table
19 | INSERT INTO {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all
20 | SELECT * FROM {{ subreddit }}_{{ mode }};
--------------------------------------------------------------------------------
/airflow/dags/stocks_dag.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime, timedelta
3 | from airflow import DAG
4 | from airflow.operators.bash import BashOperator
5 | from airflow.operators.python import PythonOperator
6 |
7 | from google.cloud import storage
8 | from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
9 | from airflow.providers.google.cloud.operators.dataproc import ClusterGenerator, DataprocCreateClusterOperator, DataprocSubmitJobOperator
10 |
11 | from custom_scripts.ingest_reddit import extract_reddit_data
12 | from custom_scripts.preprocessing import json_to_csv, csv_to_parquet
13 |
14 | BUCKET = os.environ.get("GCP_GCS_BUCKET")
15 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/")
16 | BIGQUERY_DATASET = os.environ.get('BIGQUERY_DATASET', 'stocks_data')
17 | PROJECT_ID = os.environ.get('GCP_PROJECT_ID')
18 | PYSPARK_URI = f'gs://{BUCKET}/scripts/wordcount_by_date.py'
19 |
20 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator(
21 | project_id=PROJECT_ID,
22 | zone="asia-southeast1-a",
23 | master_machine_type="n1-standard-4",
24 | master_disk_size=500,
25 | num_masters=1,
26 | num_workers=0, # single node mode
27 | idle_delete_ttl=900, # idle time before deleting cluster
28 | init_actions_uris=[f'gs://{BUCKET}/scripts/pip-install.sh'],
29 | metadata={'PIP_PACKAGES': 'spark-nlp'},
30 | ).make()
31 |
32 | def load_to_gcs(bucket, object_name, local_file):
33 | """
34 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
35 | :param bucket: GCS bucket name
36 | :param object_name: target path & file-name
37 | :param local_file: source path & file-name
38 | :return:
39 | """
40 | client = storage.Client()
41 | bucket = client.bucket(bucket)
42 |
43 | blob = bucket.blob(object_name)
44 | blob.upload_from_filename(local_file)
45 |
46 | def reddit_pipeline_template(
47 | # arguments
48 | dag,
49 | subreddit,
50 | mode,
51 | json_filepath,
52 | csv_filepath,
53 | parquet_filepath,
54 | gcs_path
55 | ):
56 | with dag:
57 | download_data_task = PythonOperator(
58 | task_id = 'ingest_reddit_json',
59 | python_callable = extract_reddit_data,
60 | op_kwargs = {
61 | 'subreddit': subreddit,
62 | 'mode': mode,
63 | 'start': '{{ data_interval_start }}',
64 | 'end': '{{ data_interval_end }}',
65 | 'filepath': json_filepath
66 | }
67 | )
68 |
69 | json_to_csv_task = PythonOperator(
70 | task_id = 'json_to_csv',
71 | python_callable = json_to_csv,
72 | op_kwargs = {
73 | 'json_filepath': json_filepath,
74 | 'csv_filepath': csv_filepath,
75 | 'mode': mode
76 | }
77 | )
78 |
79 | csv_to_parquet_task = PythonOperator(
80 | task_id = 'csv_to_parquet',
81 | python_callable = csv_to_parquet,
82 | op_kwargs = {
83 | 'csv_filepath': csv_filepath,
84 | 'parquet_filepath': parquet_filepath
85 | }
86 | )
87 |
88 | load_to_gcs_task = PythonOperator(
89 | task_id = "load_to_gcs",
90 | python_callable = load_to_gcs,
91 | op_kwargs={
92 | "bucket": BUCKET,
93 | "object_name": gcs_path,
94 | "local_file": parquet_filepath,
95 | }
96 | )
97 |
98 | delete_local_json_csv = BashOperator(
99 | task_id = "delete_local_json_csv",
100 | bash_command = f'rm {json_filepath} {csv_filepath}'
101 | )
102 |
103 | QUERY = f'''CREATE OR REPLACE EXTERNAL TABLE {BIGQUERY_DATASET}.{subreddit}_{mode}_external_table
104 | OPTIONS (
105 | format="PARQUET",
106 | uris=["gs://{BUCKET}/{gcs_path}"]
107 | );'''
108 |
109 | create_BQ_external_table_task = BigQueryInsertJobOperator(
110 | task_id = 'create_external_table',
111 | configuration={
112 | 'query': {
113 | 'query': QUERY,
114 | 'useLegacySql': False,
115 | }
116 | }
117 | )
118 |
119 | # Create a partitioned table from external table
120 | BQ_create_partitioned_table_task = BigQueryInsertJobOperator(
121 | task_id = "bq_create_partitioned_table",
122 | configuration={
123 | "query": {
124 | "query": "{% include 'load_datawarehouse.sql' %}",
125 | "useLegacySql": False,
126 | }
127 | }
128 | )
129 |
130 | QUERY_CREATE_WORDCOUNT_TABLE = '''
131 | CREATE TABLE IF NOT EXISTS {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_wordcount (
132 | word STRING,
133 | wordcount INTEGER,
134 | {{ mode }}_date DATE
135 | )
136 | PARTITION BY {{ mode }}_date'''
137 |
138 | create_wordcount_table_task = BigQueryInsertJobOperator(
139 | task_id = 'create_wordcount_table',
140 | configuration={
141 | 'query': {
142 | 'query': QUERY_CREATE_WORDCOUNT_TABLE,
143 | 'useLegacySql': False,
144 | }
145 | }
146 | )
147 |         # task will be marked as 'success' if the cluster already exists
148 | create_cluster_operator_task = DataprocCreateClusterOperator(
149 | task_id='create_dataproc_cluster',
150 | cluster_name="de-spark-cluster",
151 | project_id=PROJECT_ID,
152 | region="asia-southeast1",
153 | cluster_config=CLUSTER_GENERATOR_CONFIG
154 | )
155 |
156 | QUERY_DELETE_WORDCOUNT_ROWS = '''
157 | DELETE {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_wordcount
158 | WHERE {{ mode }}_date BETWEEN '{{ ds }}' AND '{{ macros.ds_add(data_interval_end.strftime('%Y-%m-%d'), -1) }}';
159 | '''
160 |
161 | # delete any existing duplicate rows before writing
162 | delete_wordcountdup_task = BigQueryInsertJobOperator(
163 | task_id = 'delete_wordcountdup',
164 | configuration={
165 | 'query': {
166 | 'query': QUERY_DELETE_WORDCOUNT_ROWS,
167 | 'useLegacySql': False,
168 | }
169 | }
170 | )
171 |
172 | pyspark_job = {
173 | "reference": {"project_id": PROJECT_ID},
174 | "placement": {"cluster_name": 'de-spark-cluster'},
175 | "pyspark_job": {
176 | "main_python_file_uri": PYSPARK_URI,
177 | "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"],
178 | "properties": {
179 | "spark.jars.packages":"com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3"
180 | },
181 | "args": [
182 | f"--input=gs://{BUCKET}/{gcs_path}",
183 | f"--dataset={BIGQUERY_DATASET}",
184 | f"--subreddit={subreddit}",
185 | f"--mode={mode}"
186 | ]
187 | }
188 | }
189 |
190 | wordcount_sparksubmit_task = DataprocSubmitJobOperator(
191 | task_id='wordcount_sparksubmit',
192 | job=pyspark_job,
193 | region='asia-southeast1',
194 | project_id=PROJECT_ID,
195 | trigger_rule='all_done'
196 | )
197 |
198 | download_data_task >> json_to_csv_task >> csv_to_parquet_task >> load_to_gcs_task >> [delete_local_json_csv, create_BQ_external_table_task]
199 | create_BQ_external_table_task >> BQ_create_partitioned_table_task
200 | load_to_gcs_task >> create_wordcount_table_task >> create_cluster_operator_task >> delete_wordcountdup_task >> wordcount_sparksubmit_task
201 |
202 | default_args = {
203 | "owner": "Zachary",
204 | "start_date": datetime(2022, 3, 1),
205 | "end_date": datetime(2022, 4, 30),
206 | "depends_on_past": False,
207 | "retries": 1,
208 | "retry_delay": timedelta(seconds=60)
209 | }
210 |
211 | # all dag definitions (dag = DAG()) should be in the global scope
212 | stocks_submission_weekly_dag = DAG(
213 | dag_id = 'stocks_submission_weekly',
214 | schedule_interval = '@weekly',
215 | catchup = True,
216 | max_active_runs = 3,
217 | default_args = default_args,
218 | user_defined_macros={
219 | "BIGQUERY_DATASET": BIGQUERY_DATASET,
220 | "subreddit": 'stocks',
221 | "mode": 'submission',
222 | }
223 | )
224 |
225 | # submission
226 | reddit_pipeline_template(
227 | dag = stocks_submission_weekly_dag,
228 | subreddit = 'stocks',
229 | mode = 'submission',
230 | json_filepath = AIRFLOW_HOME + '/data/json/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.json',
231 | csv_filepath = AIRFLOW_HOME + '/data/csv/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.csv',
232 | parquet_filepath = AIRFLOW_HOME + '/data/parquet/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet',
233 | gcs_path = 'stocks/submission/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet'
234 | )
235 |
236 |
237 | #---------------------------- THE SECTION BELOW IS UNUSED ---------------------------
238 | # stocks_comment_weekly_dag = DAG(
239 | # dag_id = 'stocks_comment_weekly',
240 | # schedule_interval = '@weekly',
241 | # catchup = True,
242 | # max_active_runs = 2,
243 | # default_args = default_args
244 | # )
245 |
246 | # comment
247 | # reddit_pipeline_template(
248 | # dag = stocks_comment_weekly_dag,
249 | # subreddit = 'stocks',
250 | # mode = 'comment',
251 | # json_filepath = AIRFLOW_HOME + '/data/json/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.json',
252 | # csv_filepath = AIRFLOW_HOME + '/data/csv/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.csv',
253 | # parquet_filepath = AIRFLOW_HOME + '/data/parquet/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet',
254 | # gcs_path = 'stocks/comment/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet'
255 | # )
--------------------------------------------------------------------------------
/airflow/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | #
18 |
19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
20 | #
21 | # WARNING: This configuration is for local development. Do not use it in a production deployment.
22 | #
23 | # This configuration supports basic configuration using environment variables or an .env file
24 | # The following variables are supported:
25 | #
26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow.
27 | # Default: apache/airflow:2.2.4
28 | # AIRFLOW_UID - User ID in Airflow containers
29 | # Default: 50000
30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
31 | #
32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested).
33 | # Default: airflow
34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested).
35 | # Default: airflow
36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
37 | # Default: ''
38 | #
39 | # Feel free to modify this file to suit your needs.
40 | ---
41 | version: '3'
42 | x-airflow-common:
43 | &airflow-common
44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image.
45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
47 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.4}
48 | build: .
49 | environment:
50 | &airflow-common-env
51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
52 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
53 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
54 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
55 | AIRFLOW__CORE__FERNET_KEY: ''
56 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
57 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
58 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth'
59 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
60 |
61 | # self-defined
62 | GCP_PROJECT_ID: 'de-r-stocks'
63 | GCP_GCS_BUCKET: 'datalake_de-r-stocks'
64 | GOOGLE_APPLICATION_CREDENTIALS: '/.google/credentials/de-r-stocks.json'
65 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/de-r-stocks.json'
66 | volumes:
67 | - ./dags:/opt/airflow/dags
68 | - ./logs:/opt/airflow/logs
69 | - ./plugins:/opt/airflow/plugins
70 | - ~/.google/credentials/:/.google/credentials:ro
71 | - ./custom_scripts:/opt/airflow/custom_scripts
72 | - ./data/json:/opt/airflow/data/json
73 | - ./data/csv:/opt/airflow/data/csv
74 | - ./data/parquet:/opt/airflow/data/parquet
75 | user: "${AIRFLOW_UID:-50000}:0"
76 | depends_on:
77 | &airflow-common-depends-on
78 | redis:
79 | condition: service_healthy
80 | postgres:
81 | condition: service_healthy
82 |
83 | services:
84 | postgres:
85 | image: postgres:13
86 | environment:
87 | POSTGRES_USER: airflow
88 | POSTGRES_PASSWORD: airflow
89 | POSTGRES_DB: airflow
90 | volumes:
91 | - postgres-db-volume:/var/lib/postgresql/data
92 | healthcheck:
93 | test: ["CMD", "pg_isready", "-U", "airflow"]
94 | interval: 5s
95 | retries: 5
96 | restart: always
97 |
98 | redis:
99 | image: redis:latest
100 | expose:
101 | - 6379
102 | healthcheck:
103 | test: ["CMD", "redis-cli", "ping"]
104 | interval: 5s
105 | timeout: 30s
106 | retries: 50
107 | restart: always
108 |
109 | airflow-webserver:
110 | <<: *airflow-common
111 | command: webserver
112 | ports:
113 | - 8080:8080
114 | healthcheck:
115 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
116 | interval: 10s
117 | timeout: 10s
118 | retries: 5
119 | restart: always
120 | depends_on:
121 | <<: *airflow-common-depends-on
122 | airflow-init:
123 | condition: service_completed_successfully
124 |
125 | airflow-scheduler:
126 | <<: *airflow-common
127 | command: scheduler
128 | healthcheck:
129 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
130 | interval: 10s
131 | timeout: 10s
132 | retries: 5
133 | restart: always
134 | depends_on:
135 | <<: *airflow-common-depends-on
136 | airflow-init:
137 | condition: service_completed_successfully
138 |
139 | airflow-worker:
140 | <<: *airflow-common
141 | command: celery worker
142 | healthcheck:
143 | test:
144 | - "CMD-SHELL"
145 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
146 | interval: 10s
147 | timeout: 10s
148 | retries: 5
149 | environment:
150 | <<: *airflow-common-env
151 | # Required to handle warm shutdown of the celery workers properly
152 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
153 | DUMB_INIT_SETSID: "0"
154 | restart: always
155 | depends_on:
156 | <<: *airflow-common-depends-on
157 | airflow-init:
158 | condition: service_completed_successfully
159 |
160 | airflow-triggerer:
161 | <<: *airflow-common
162 | command: triggerer
163 | healthcheck:
164 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
165 | interval: 10s
166 | timeout: 10s
167 | retries: 5
168 | restart: always
169 | depends_on:
170 | <<: *airflow-common-depends-on
171 | airflow-init:
172 | condition: service_completed_successfully
173 |
174 | airflow-init:
175 | <<: *airflow-common
176 | entrypoint: /bin/bash
177 | # yamllint disable rule:line-length
178 | command:
179 | - -c
180 | - |
181 | function ver() {
182 | printf "%04d%04d%04d%04d" $${1//./ }
183 | }
184 | airflow_version=$$(gosu airflow airflow version)
185 | airflow_version_comparable=$$(ver $${airflow_version})
186 | min_airflow_version=2.2.0
187 | min_airflow_version_comparable=$$(ver $${min_airflow_version})
188 | if (( airflow_version_comparable < min_airflow_version_comparable )); then
189 | echo
190 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
191 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
192 | echo
193 | exit 1
194 | fi
195 | if [[ -z "${AIRFLOW_UID}" ]]; then
196 | echo
197 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
198 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
199 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
200 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
201 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user"
202 | echo
203 | fi
204 | one_meg=1048576
205 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
206 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
207 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
208 | warning_resources="false"
209 | if (( mem_available < 4000 )) ; then
210 | echo
211 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
212 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
213 | echo
214 | warning_resources="true"
215 | fi
216 | if (( cpus_available < 2 )); then
217 | echo
218 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
219 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
220 | echo
221 | warning_resources="true"
222 | fi
223 | if (( disk_available < one_meg * 10 )); then
224 | echo
225 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
226 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
227 | echo
228 | warning_resources="true"
229 | fi
230 | if [[ $${warning_resources} == "true" ]]; then
231 | echo
232 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
233 | echo "Please follow the instructions to increase amount of resources available:"
234 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin"
235 | echo
236 | fi
237 | mkdir -p /sources/logs /sources/dags /sources/plugins
238 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
239 | exec /entrypoint airflow version
240 | # yamllint enable rule:line-length
241 | environment:
242 | <<: *airflow-common-env
243 | _AIRFLOW_DB_UPGRADE: 'true'
244 | _AIRFLOW_WWW_USER_CREATE: 'true'
245 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
246 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
247 | user: "0:0"
248 | volumes:
249 | - .:/sources
250 |
251 | airflow-cli:
252 | <<: *airflow-common
253 | profiles:
254 | - debug
255 | environment:
256 | <<: *airflow-common-env
257 | CONNECTION_CHECK_MAX_COUNT: "0"
258 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
259 | command:
260 | - bash
261 | - -c
262 | - airflow
263 |
264 | flower:
265 | <<: *airflow-common
266 | command: celery flower
267 | ports:
268 | - 5555:5555
269 | healthcheck:
270 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
271 | interval: 10s
272 | timeout: 10s
273 | retries: 5
274 | restart: always
275 | depends_on:
276 | <<: *airflow-common-depends-on
277 | airflow-init:
278 | condition: service_completed_successfully
279 |
280 | volumes:
281 | postgres-db-volume:
282 |
--------------------------------------------------------------------------------
/airflow/requirements.txt:
--------------------------------------------------------------------------------
1 | apache-airflow-providers-google
2 | pyarrow
3 | pendulum
4 | requests
--------------------------------------------------------------------------------
/airflow/scripts/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS}
3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT}
4 |
5 | airflow db upgrade
6 |
7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow
8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD"
9 |
10 | airflow webserver
11 |
--------------------------------------------------------------------------------
/images/airflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/airflow.png
--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/architecture.png
--------------------------------------------------------------------------------
/images/dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/dashboard.png
--------------------------------------------------------------------------------
/spark/README.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | ## Code breakdown for `wordcount_by_date.py`
4 | The script does the following:
5 | - Reads parquet file from a GCS bucket
6 |   - It assumes that the file contains the columns 'author', 'created_utc' and 'title'
7 | - Builds an NLP pipeline and transforms the text data in 'title' with the pipeline
8 | - https://nlp.johnsnowlabs.com/docs/en/install#python
9 | - https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb
10 | - Creates a dataframe that contains the token count on each date (a condensed sketch follows this list), e.g.:
11 |
12 | | word | wordcount | submission_date |
13 | |--------|-------|------------|
14 | | invest | 6 | 2022-04-11 |
15 | | invest | 4 | 2022-04-12 |
16 | | market | 2 | 2022-04-12 |
17 |
18 | - Lastly, it writes the dataframe containing the word counts to a BigQuery table
19 |
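A condensed sketch of the counting step, taken from `wordcount_by_date.py` (`df_result` is the output of the NLP pipeline, with columns 'title_finished' and 'date'):

```
import pyspark.sql.functions as F
from pyspark.ml.feature import CountVectorizer

# Fit a vocabulary over all finished tokens
cv = CountVectorizer(inputCol='title_finished', outputCol='features', minDF=3.0)
model = cv.fit(df_result)

# Gather all tokens per date, then transform to get per-date counts
df_tokensbydate = df_result.groupBy('date').agg(
    F.flatten(F.collect_list('title_finished')).alias('title_finished'))
counts = model.transform(df_tokensbydate).select('date', 'features').collect()
```
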
20 | ### Write dataframe into a BigQuery table
21 | Official doc: https://github.com/GoogleCloudDataproc/spark-bigquery-connector
22 |
23 | ```
24 | df_wordcountbydate.write.format('bigquery') \
25 | .option('table', f'{dataset}.{subreddit}_{mode}_wordcount') \
26 | .option('temporaryGcsBucket', BUCKET) \
27 | .option('partitionField', f'{mode}_date') \
28 | .option('partitionType', 'DAY') \
29 | .mode('append') \
30 | .save()
31 | ```
32 |
33 | This is done using a BigQuery connector for Spark. The connector must be specified when submitting the PySpark job for this script, which I did in the Airflow DAG `stocks_dag.py`.
34 |
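In this project the connector jar is passed through `jar_file_uris` in the Dataproc job config (abridged from `stocks_dag.py`):

```
pyspark_job = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": 'de-spark-cluster'},
    "pyspark_job": {
        "main_python_file_uri": PYSPARK_URI,
        # connector needed so the driver can write the results to BigQuery
        "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"],
    }
}
```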
--------------------------------------------------------------------------------
/spark/spark-bigquery-latest_2.12.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/spark/spark-bigquery-latest_2.12.jar
--------------------------------------------------------------------------------
/spark/wordcount_by_date.py:
--------------------------------------------------------------------------------
1 | # %%
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import StructField, StructType, StringType, FloatType, IntegerType
4 | from pyspark.ml.feature import CountVectorizer
5 | from pyspark.ml import Pipeline
6 | import pyspark.sql.functions as F
7 |
8 | from sparknlp.annotator import LemmatizerModel, Tokenizer, Normalizer, StopWordsCleaner, NGramGenerator
9 | from sparknlp.base import Finisher, DocumentAssembler
10 |
11 | import argparse
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--input', required=True)
14 | parser.add_argument('--dataset', required=True)
15 | parser.add_argument('--subreddit', required=True)
16 | parser.add_argument('--mode', required=True)
17 | args = parser.parse_args()
18 |
19 | input = args.input
20 | dataset = args.dataset
21 | subreddit = args.subreddit
22 | mode = args.mode
23 |
24 | # change this to your bucket
25 | # bucket is used as temporary storage while writing data from Spark to BigQuery
26 | BUCKET = 'datalake_de-r-stocks'
27 |
28 | # Start Spark session
29 | spark = SparkSession.builder \
30 | .appName('preprocessing_wordcount') \
31 |     .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3") \
32 | .getOrCreate()
33 |
34 | # %%
35 | # Access data from GCS
36 | df = spark.read.parquet(input)
37 |
38 | # 1. Remove posts by AutoModerator
39 | # 2. Remove duplicate titles
40 | # 3. Convert unix timestamp to date
41 | # 4. Keep title and date columns
42 | df_filter = df.filter(~F.col('author').contains('AutoModerator')) \
43 | .dropDuplicates(['title']) \
44 | .withColumn('date', F.from_unixtime(F.col('created_utc'), 'yyyy-MM-dd')) \
45 | .select('title', 'date')
46 |
47 | documentAssembler = DocumentAssembler() \
48 | .setInputCol('title') \
49 | .setOutputCol('title_document')
50 |
51 | tokenizer = Tokenizer() \
52 | .setInputCols(['title_document']) \
53 | .setOutputCol('title_token')
54 |
55 | normalizer = Normalizer() \
56 | .setInputCols(['title_token']) \
57 | .setOutputCol('title_normalized') \
58 | .setLowercase(True)
59 |
60 | lemmatizer = LemmatizerModel.pretrained() \
61 | .setInputCols(['title_normalized']) \
62 | .setOutputCol('title_lemma')
63 |
64 | stopwords_cleaner = StopWordsCleaner() \
65 | .setInputCols(['title_lemma']) \
66 | .setOutputCol('title_cleaned') \
67 | .setCaseSensitive(False)
68 |
69 | ngrams_cum = NGramGenerator() \
70 | .setInputCols(["title_cleaned"]) \
71 | .setOutputCol("title_ngrams") \
72 | .setN(2) \
73 | .setEnableCumulative(True)\
74 | .setDelimiter("_") # Default is space
75 |
76 | finisher = Finisher() \
77 | .setInputCols(['title_ngrams']) \
78 | .setOutputCols(['title_finished']) \
79 | .setCleanAnnotations(False)
80 |
81 | nlpPipeline = Pipeline(stages=[
82 | documentAssembler,
83 | tokenizer,
84 | normalizer,
85 | lemmatizer,
86 | stopwords_cleaner,
87 | ngrams_cum,
88 | finisher
89 | ])
90 |
91 | df_result = nlpPipeline.fit(df_filter).transform(df_filter).select('title_finished', 'date')
92 |
93 | # CountVectorizer model
94 | cv = CountVectorizer(inputCol='title_finished', outputCol='features', minDF=3.0)
95 |
96 | # Train on all submissions
97 | model = cv.fit(df_result)
98 |
99 | df_tokensbydate = df_result.groupBy('date').agg(F.flatten(F.collect_list('title_finished')).alias('title_finished'))
100 |
101 | # Get counts for each date
102 | counts = model.transform(df_tokensbydate).select('date','features').collect()
103 |
104 | # Create empty dataframe
105 | df_wordcountbydate = spark.createDataFrame(spark.sparkContext.emptyRDD(),
106 | schema=StructType(fields=[
107 | StructField("word", StringType()),
108 | StructField("count", FloatType()),
109 | StructField("date", StringType())]))
110 |
111 | # Append count for each day to dataframe
112 | for row in range(len(counts)):
113 | test_dict = dict(zip(model.vocabulary, (float(x) for x in counts[row]['features'].values)))
114 | df_temp = spark.createDataFrame(test_dict.items(),
115 | schema=StructType(fields=[
116 | StructField("word", StringType()),
117 | StructField("count", FloatType())]))
118 | df_temp = df_temp.withColumn('date', F.lit(counts[row]['date']))
119 | df_wordcountbydate = df_wordcountbydate.unionAll(df_temp)
120 |
121 | # %%
122 |
123 | df_wordcountbydate = df_wordcountbydate.withColumn('count', F.col('count').cast(IntegerType())) \
124 | .withColumn(f'{mode}_date', F.to_date(F.col('date'), 'yyyy-MM-dd')) \
125 | .withColumnRenamed('count', 'wordcount') \
126 | .drop('date')
127 |
128 | # upload dataframe to BigQuery
129 | df_wordcountbydate.write.format('bigquery') \
130 | .option('table', f'{dataset}.{subreddit}_{mode}_wordcount') \
131 | .option('temporaryGcsBucket', BUCKET) \
132 | .option('partitionField', f'{mode}_date') \
133 | .option('partitionType', 'DAY') \
134 | .mode('append') \
135 | .save()
--------------------------------------------------------------------------------
/terraform/main.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.0"
3 | required_providers {
4 | google = {
5 | source = "hashicorp/google"
6 | }
7 | }
8 | }
9 |
10 | provider "google" {
11 | credentials = file(var.credentials)
12 | project = var.project
13 | region = var.region
14 | }
15 |
16 | resource "google_storage_bucket" "data-lake-bucket" {
17 | name = "${local.data_lake_bucket}_${var.project}"
18 | location = var.region
19 |
20 | storage_class = var.storage_class
21 | uniform_bucket_level_access = true
22 |
23 | versioning {
24 | enabled = true
25 | }
26 |
27 | lifecycle_rule {
28 | action {
29 | type = "Delete"
30 | }
31 | condition {
32 | age = 30
33 | }
34 | }
35 |
36 | force_destroy = true
37 | }
38 |
39 | resource "google_bigquery_dataset" "dataset" {
40 | dataset_id = var.BQ_DATASET
41 | project = var.project
42 | location = var.region
43 | }
--------------------------------------------------------------------------------
/terraform/variables.tf:
--------------------------------------------------------------------------------
1 | locals {
2 | data_lake_bucket = "datalake"
3 | }
4 |
5 | variable "project" {
6 | default = "de-r-stocks"
7 | description = "GCP project ID"
8 | }
9 |
10 | variable "region" {
11 | type = string
12 | default = "asia-southeast1"
13 | description = "Region for GCP resources"
14 | }
15 |
16 | variable "storage_class" {
17 | default = "STANDARD"
18 | description = "Storage class type for bucket"
19 | }
20 |
21 | variable "BQ_DATASET" {
22 | type = string
23 | default = "stocks_data"
24 | description = "BigQuery dataset that raw data from GCS will be written to"
25 | }
26 |
27 | variable "credentials" {
28 | type = string
29 | default = "/home/ztmj96/.google/credentials/de-r-stocks.json"
30 | description = "Path for GCP account credentials"
31 | }
32 |
33 |
34 |
--------------------------------------------------------------------------------