├── .gitignore ├── LICENSE ├── README.md ├── airflow ├── Dockerfile ├── README.md ├── custom_scripts │ ├── __init__.py │ ├── ingest_reddit.py │ └── preprocessing.py ├── dags │ ├── load_datawarehouse.sql │ └── stocks_dag.py ├── docker-compose.yaml ├── requirements.txt └── scripts │ └── entrypoint.sh ├── images ├── airflow.png ├── architecture.png └── dashboard.png ├── spark ├── README.md ├── spark-bigquery-latest_2.12.jar └── wordcount_by_date.py └── terraform ├── main.tf └── variables.tf /.gitignore: -------------------------------------------------------------------------------- 1 | # SPECIFIC TO THIS PROJECT 2 | 3 | # terraform 4 | *.tfstate 5 | *.tfstate.* 6 | **.terraform 7 | **.terraform.lock.* 8 | 9 | # files 10 | *.parquet 11 | *.ipynb 12 | commands.txt 13 | 14 | #airflow 15 | airflow/logs/ 16 | 17 | # END 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | pip-wheel-metadata/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | db.sqlite3 80 | db.sqlite3-journal 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Zachary 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 11 | 12 | # Data Pipeline for Reddit data (r/Stocks) 13 | 14 |
15 | Table of Contents 16 |
  1. About The Project
  2. Getting Started
  3. Usage
  4. Help
  5. Roadmap for Future Development
  6. Contributing
  7. License
  8. Contact
  9. Acknowledgements
47 | 48 | 49 | ## About The Project 50 | 51 | [![Dashboard][dashboard_screenshot]](https://datastudio.google.com/s/mjIjKwWNUQU) 52 | 53 | Interested in exploring Reddit data for trends, analytics, or just for the fun of it? 54 | 55 | This project builds a data pipeline (from data ingestion to visualisation) that stores and preprocesses data over any time period that you want. 56 | 57 |

(back to top)

58 | 59 | ### Built With 60 | 61 | * Data Ingestion: [Pushshift API](https://github.com/pushshift/api) 62 | * Infrastructure as Code: [Terraform](https://www.terraform.io/) 63 | * Workflow Orchestration: [Airflow](https://airflow.apache.org) 64 | * Data Lake: [Google Cloud Storage](https://cloud.google.com/storage) 65 | * Data Warehouse: [Google BigQuery](https://cloud.google.com/bigquery) 66 | * Batch Processing: [Spark](https://spark.apache.org/) on [Dataproc](https://cloud.google.com/dataproc) 67 | * Visualisation: [Google Data Studio](https://datastudio.google.com/) 68 | 69 | ![architecture][architecture_diagram] 70 | Cloud infrastructure is set up with Terraform. 71 | 72 | Airflow runs in a local Docker container. 73 | It orchestrates the following on a weekly schedule: 74 | * Download data (JSON) from the Pushshift API 75 | * Parquetize the data and store it in a bucket on Google Cloud Storage 76 | * Write the data to a table on BigQuery 77 | * Create a cluster on Dataproc and submit a PySpark job that preprocesses the Parquet files from Google Cloud Storage 78 | * Write the preprocessed data to a table on BigQuery 79 | 80 |
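These steps map onto the task dependencies declared at the bottom of `airflow/dags/stocks_dag.py`, reproduced here for orientation (the full DAG is included further down in this repository):

```
# Task ordering in stocks_dag.py
download_data_task >> json_to_csv_task >> csv_to_parquet_task >> load_to_gcs_task \
    >> [delete_local_json_csv, create_BQ_external_table_task]
create_BQ_external_table_task >> BQ_create_partitioned_table_task
load_to_gcs_task >> create_wordcount_table_task >> create_cluster_operator_task \
    >> delete_wordcountdup_task >> wordcount_sparksubmit_task
```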

(back to top)

81 | 82 | ## Getting Started 83 | 84 | I created this project in WSL 2 (Windows Subsystem for Linux) on Windows 10. 85 | 86 | ### Prerequisites 87 | 88 | To get a local copy up and running in the same environment, you'll need to: 89 | * Install Python (3.8 or above) 90 | * Install VSCode 91 | * [Install WSL 2](https://docs.microsoft.com/en-us/windows/wsl/install) if you haven't already 92 | * [Install Terraform](https://www.terraform.io/downloads) for Linux 93 | * [Install Docker Desktop](https://docs.docker.com/desktop/windows/install/) 94 | * [Install Google Cloud SDK](https://cloud.google.com/sdk/docs/install-sdk#deb) for Ubuntu 95 | * Have a Google Cloud Platform account 96 | * Clone this repository locally 97 | 98 | ### Create a Google Cloud Project 99 | 1. Go to [Google Cloud](https://console.cloud.google.com/) and create a new project. I set the id to 'de-r-stocks'. 100 | 2. Go to IAM and [create a Service Account](https://cloud.google.com/docs/authentication/getting-started#creating_a_service_account) with these roles: 101 | * BigQuery Admin 102 | * Storage Admin 103 | * Storage Object Admin 104 | * Viewer 105 | 3. Download the Service Account credentials file, rename it to `de-r-stocks.json` and store it in `$HOME/.google/credentials/`. 106 | 4. In the Google Cloud console, enable the following APIs: 107 | * IAM API 108 | * IAM Service Account Credentials API 109 | * Cloud Dataproc API 110 | * Compute Engine API 111 | 112 | ### Set up the infrastructure on Google Cloud with Terraform 113 | I recommend executing the following steps in VSCode. 114 | 115 | 1. Using VSCode + WSL, open the project folder `de_r-stocks`. 116 | 2. Open `variables.tf` and modify: 117 | 118 | * `variable "project"` to your own project id (possibly not strictly necessary, but it keeps the config consistent) 119 | * `variable "region"` to your project region 120 | * `variable "credentials"` to your credentials path 121 | 122 | 3. Open the VSCode terminal and change directory to the terraform folder, e.g. `cd terraform`. 123 | 4. Initialise Terraform: `terraform init` 124 | 5. Plan the infrastructure: `terraform plan` 125 | 6. Apply the changes: `terraform apply` 126 | 127 | If everything goes right, you now have a bucket on Google Cloud Storage called 'datalake_de-r-stocks' and a dataset on BigQuery called 'stocks_data'. 128 | 129 | ### Set up Airflow 130 | 1. Using VSCode, open `docker-compose.yaml` and look for the `#self-defined` block. Modify the variables to match your setup. 131 | 2. Open `stocks_dag.py`. You may need to change the following: 132 | 133 | * `zone` in `CLUSTER_GENERATOR_CONFIG` 134 | * Parameters in `default_args` 135 | 136 |
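For reference, the `default_args` dictionary in `stocks_dag.py` currently looks like the excerpt below; adjust `start_date` and `end_date` to the period you want the weekly DAG to backfill, and tune the retry settings as needed:

```
# Excerpt from airflow/dags/stocks_dag.py
from datetime import datetime, timedelta

default_args = {
    "owner": "Zachary",
    "start_date": datetime(2022, 3, 1),
    "end_date": datetime(2022, 4, 30),
    "depends_on_past": False,
    "retries": 1,
    "retry_delay": timedelta(seconds=60)
}
```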

(back to top)

137 | 138 | ## Usage 139 | 140 | ### Start Airflow 141 | 1. Using the terminal, change the directory to the airflow folder, e.g. `cd airflow`. 142 | 2. Build the custom Airflow Docker image: `docker-compose build` 143 | 3. Initialise the Airflow configs: `docker-compose up airflow-init` 144 | 4. Run Airflow: `docker-compose up` 145 | 146 | If the setup was done correctly, you will be able to access the Airflow interface by going to `localhost:8080` in your browser. 147 | 148 | Username and password are both `airflow`. 149 | 150 | ### Prepare for Spark jobs on Dataproc 151 | 1. Go to `wordcount_by_date.py` and modify the string value of `BUCKET` to your bucket's id. 152 | 2. Store the initialisation and PySpark scripts in your bucket. They are required to create the cluster and to run the Spark job. 153 | 154 | Run in the terminal (using the correct bucket name and region): 155 | * `gsutil cp gs://goog-dataproc-initialization-actions-asia-southeast1/python/pip-install.sh gs://datalake_de-r-stocks/scripts` 156 | * `gsutil cp spark/wordcount_by_date.py gs://datalake_de-r-stocks/scripts` 157 | 158 |
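If you prefer Python over `gsutil` for the second copy, the upload can also be done with the same `google-cloud-storage` client that the DAG's `load_to_gcs` task uses — a minimal sketch, assuming your credentials are configured and the bucket name matches your setup:

```
# Upload the PySpark driver script to the data lake bucket (alternative to gsutil)
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("datalake_de-r-stocks")  # replace with your bucket id
blob = bucket.blob("scripts/wordcount_by_date.py")
blob.upload_from_filename("spark/wordcount_by_date.py")
print(f"Uploaded to gs://{bucket.name}/{blob.name}")
```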

(back to top)

159 | 160 | Now, you are ready to enable the DAG on Airflow and let it do its magic! 161 | 162 | ![airflow][airflow_screenshot] 163 | 164 | When you are done, stop the Airflow services by going to the `airflow` directory in the terminal and executing `docker-compose down`. 165 | 166 | ## Help 167 | 168 | **Authorisation error while trying to create a Dataproc cluster from Airflow** 169 | 1. Go to Google Cloud Platform's IAM page. 170 | 2. Under the Compute Engine default service account, add the roles 'Editor' and 'Dataproc Worker'. 171 | 172 | ## Roadmap for Future Development 173 | 174 | - [ ] Refactor code so that `subreddit` and `mode` can be changed easily. 175 | - [ ] Use Terraform to set up tables on BigQuery instead of creating tables as part of the DAG. 176 | - [ ] Unit tests 177 | - [ ] Data quality checks 178 | - [ ] CI/CD 179 | 180 |

(back to top)

181 | 182 | ## Contributing 183 | 184 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement". 185 | Don't forget to give the project a star! Thanks again! 186 | 187 | 1. Fork the Project 188 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 189 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 190 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 191 | 5. Open a Pull Request 192 | 193 |

(back to top)

194 | 195 | ## License 196 | 197 | Distributed under the MIT License. See `LICENSE` for more information. 198 | 199 | ## Contact 200 | 201 | [Connect with me on LinkedIn!](https://www.linkedin.com/in/zacharytancs/) 202 | 203 | ## Acknowledgements 204 | 205 | Resources that I found helpful and would like to give credit to: 206 | 207 | * [Data Engineering Zoomcamp by DataTalksClub](https://github.com/DataTalksClub/data-engineering-zoomcamp) 208 | * [Best-README-Template](https://github.com/othneildrew/Best-README-Template) 209 | 210 |

(back to top)

211 | 212 | 213 | 214 | [dashboard_screenshot]: images/dashboard.png 215 | [architecture_diagram]: images/architecture.png 216 | [airflow_screenshot]: images/airflow.png 217 | -------------------------------------------------------------------------------- /airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # First-time build can take upto 10 mins. 2 | 3 | FROM apache/airflow:2.2.3 4 | 5 | ENV AIRFLOW_HOME=/opt/airflow 6 | ENV PYTHONPATH="/opt/airflow/custom_scripts:${PYTHONPATH}" 7 | 8 | USER root 9 | RUN apt-get update -qq && apt-get install vim -qqq 10 | # git gcc g++ -qqq 11 | 12 | COPY requirements.txt . 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html 16 | 17 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 18 | 19 | ARG CLOUD_SDK_VERSION=322.0.0 20 | ENV GCLOUD_HOME=/home/google-cloud-sdk 21 | 22 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 23 | 24 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 25 | && TMP_DIR="$(mktemp -d)" \ 26 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 27 | && mkdir -p "${GCLOUD_HOME}" \ 28 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 29 | && "${GCLOUD_HOME}/install.sh" \ 30 | --bash-completion=false \ 31 | --path-update=false \ 32 | --usage-reporting=false \ 33 | --quiet \ 34 | && rm -rf "${TMP_DIR}" \ 35 | && gcloud --version 36 | 37 | WORKDIR $AIRFLOW_HOME 38 | 39 | COPY scripts scripts 40 | RUN chmod +x scripts 41 | 42 | USER $AIRFLOW_UID 43 | -------------------------------------------------------------------------------- /airflow/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## File structure 4 | 5 | ### custom_scripts 6 | This directory contains Python modules that are called by the PythonOperator in Airflow. 7 | 8 | ### dags 9 | This directory contains the DAG scripts, as well as SQL code that is called by the BigQueryInsertJobOperator in Airflow. 10 | 11 | ## Code breakdown 12 | 13 | ### Call Python functions from custom modules 14 | Official doc: https://airflow.apache.org/docs/apache-airflow/stable/modules_management.html 15 | 16 | - In DAG script (`stocks_dag.py`), import module: 17 | `from custom_scripts.ingest_reddit import extract_reddit_data` 18 | This allows you to set `extract_reddit_data` in python_callable of PythonOperator. 
19 | 20 | - In `Dockerfile`, set env: 21 | `ENV PYTHONPATH="/opt/airflow/custom_scripts:${PYTHONPATH}"` 22 | 23 | - In `docker-compose.yml`, mount directories under `x-airflow-common/volumes` so that Airflow can 'see' these paths: 24 | ``` 25 | volumes: 26 | - ./custom_scripts:/opt/airflow/custom_scripts 27 | - ./data/json:/opt/airflow/data/json 28 | - ./data/csv:/opt/airflow/data/csv 29 | - ./data/parquet:/opt/airflow/data/parquet 30 | ``` 31 | 32 | ### Create a Dataproc cluster in Airflow 33 | Official doc: https://airflow.apache.org/docs/apache-airflow-providers-google/stable/operators/cloud/dataproc.html#examples-of-job-configurations-to-submit 34 | 35 | The `gcloud` bash command is as follows: 36 | ``` 37 | gcloud dataproc clusters create de-spark-cluster \ 38 | --region asia-southeast1 \ 39 | --zone asia-southeast1-a \ 40 | --single-node \ 41 | --master-machine-type n1-standard-4 \ 42 | --master-boot-disk-size 500 \ 43 | --image-version 2.0-debian10 \ 44 | --max-idle 900s \ 45 | --project de-r-stocks \ 46 | --metadata 'PIP_PACKAGES=spark-nlp' \ 47 | --initialization-actions gs://datalake_de-r-stocks/pip-install.sh 48 | ``` 49 | We can use ClusterGenerator to generate the cluster configuration instead of manually setting the API. 50 | ``` 51 | from airflow.providers.google.cloud.operators.dataproc import ClusterGenerator, DataprocCreateClusterOperator 52 | 53 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator( 54 | project_id=PROJECT_ID, 55 | zone="asia-southeast1-a", 56 | master_machine_type="n1-standard-4", 57 | master_disk_size=500, 58 | num_masters=1, 59 | num_workers=0, # single node mode 60 | idle_delete_ttl=900, # idle time before deleting cluster 61 | init_actions_uris=[f'gs://{BUCKET}/scripts/pip-install.sh'], 62 | metadata={'PIP_PACKAGES': 'spark-nlp'}, 63 | ).make() 64 | 65 | create_cluster_operator_task = DataprocCreateClusterOperator( 66 | task_id='create_dataproc_cluster', 67 | cluster_name="de-spark-cluster", 68 | project_id=PROJECT_ID, 69 | region="asia-southeast1", 70 | cluster_config=CLUSTER_GENERATOR_CONFIG 71 | ) 72 | ``` 73 | `init_actions_uris`: When the cluster is created, it will be initalised to install dependencies under `metadata` with pip 74 | 75 | `metadata`: `spark-nlp` is required in our PySpark job 76 | 77 | See https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/python 78 | 79 | ### Submit a PySpark job to a Dataproc cluster in Airflow 80 | Official doc: https://cloud.google.com/sdk/gcloud/reference/dataproc/jobs/submit/pyspark 81 | 82 | The documentation lays out clearly the bash code for submitting a PySpark job. 83 | 84 | Using the same API, my code looks like this: 85 | ``` 86 | gcloud dataproc jobs submit pyspark \ 87 | --cluster=de-spark-cluster \ 88 | --region=asia-southeast1 \ 89 | --project=de-r-stocks \ 90 | --jars=gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ 91 | --properties spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3 \ 92 | wordcount_by_date.py \ 93 | -- \ 94 | --input=gs://{BUCKET}/file.parquet \ 95 | --dataset=stocks_data 96 | --subreddit=stocks 97 | --mode=submission 98 | ``` 99 | `jars`: The JAR connector for Spark and BigQuery. This is required as the submit job will be writing processed data to a BigQuery table. 100 | `properties`: The package for sparknlp module, which is imported during the submit job. 101 | 102 | `wordcount_by_date.py`: The main .py file to run as the driver. It contains the PySpark code which does the preprocessing. 
In the above case, the file exists locally, but it can be on the cluster or in a storage bucket. 103 | 104 | The last four arguments are custom flags passed to the driver, e.g.: 105 | 106 | `gcloud dataproc jobs submit pyspark --cluster=my_cluster my_script.py -- --custom-flag` 107 | 108 | These arguments will be parsed and used in the `wordcount_by_date.py` script. 109 | 110 | Instead of using the bash code, I wrapped the above in the DataprocSubmitJobOperator. Besides the difference in API and the parameterisation, it is functionally identical. 111 | 112 | The code looks like this (you can find it in `stocks_dag.py`): 113 | ``` 114 | from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator 115 | 116 | PYSPARK_URI = f'gs://{BUCKET}/scripts/wordcount_by_date.py' 117 | 118 | 119 | pyspark_job = { 120 | "reference": {"project_id": PROJECT_ID}, 121 | "placement": {"cluster_name": 'de-spark-cluster'}, 122 | "pyspark_job": { 123 | "main_python_file_uri": PYSPARK_URI, 124 | "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], 125 | "properties": { 126 | "spark.jars.packages":"com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3" 127 | }, 128 | "args": [ 129 | f"--input=gs://{BUCKET}/{gcs_path}", 130 | f"--dataset={BIGQUERY_DATASET}", 131 | f"--subreddit={subreddit}", 132 | f"--mode={mode}" 133 | ] 134 | } 135 | } 136 | 137 | wordcount_sparksubmit_task = DataprocSubmitJobOperator( 138 | task_id='wordcount_sparksubmit', 139 | job=pyspark_job, 140 | region='asia-southeast1', 141 | project_id=PROJECT_ID, 142 | trigger_rule='all_done' 143 | ) 144 | ``` 145 | Note that the trigger_rule is not implemented correctly and can be left out. 146 | -------------------------------------------------------------------------------- /airflow/custom_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/airflow/custom_scripts/__init__.py -------------------------------------------------------------------------------- /airflow/custom_scripts/ingest_reddit.py: -------------------------------------------------------------------------------- 1 | import pendulum 2 | import requests 3 | import json 4 | import time 5 | 6 | url = "https://api.pushshift.io/reddit/search" 7 | 8 | def fetchObjects(mode, **kwargs): 9 | 10 | # Default parameters 11 | # Change as necessary/desired 12 | params = { 13 | "sorted_type": "created_utc", 14 | "sort": "asc", 15 | "size": "1000" 16 | } 17 | 18 | # Add additional parameters based on function arguments 19 | for key, value in kwargs.items(): 20 | params[key] = value 21 | 22 | loop = True 23 | while loop: 24 | # Perform API request 25 | r = requests.get(f'{url}/{mode}/', params=params, timeout=90) 26 | # print(r.url) 27 | if r.status_code != 200: 28 | print(r.status_code) 29 | print("Retrying...") 30 | else: 31 | # successful (200), loop = False and process data 32 | loop = False 33 | else: 34 | response = json.loads(r.text) 35 | data = response['data'] 36 | sorted_data_by_id = sorted(data, key=lambda x: int(x['id'],36)) 37 | return sorted_data_by_id 38 | 39 | def extract_reddit_data(subreddit, mode, start, end, filepath): 40 | 41 | # arg datetime format: pendulum.DateTime 42 | start = pendulum.parse(start) 43 | end = pendulum.parse(end) 44 | 45 | # convert DateTime to timestamp to pass into API 46 | start_ts = start.int_timestamp 47 | end_ts = end.int_timestamp 48 | 49 | max_id = 0 50 | 51 | 
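    # The while loop below pages through the Pushshift API: each fetchObjects()
    # call requests up to 1000 records sorted by created_utc ascending, start_ts
    # is advanced to the newest created_utc seen so far, and max_id tracks the
    # largest record id already written, so the loop stops once a batch contains
    # nothing new.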
# Open file for JSON output 52 | file = open(filepath, "a") 53 | 54 | while True: 55 | nothing_processed = True 56 | objects = fetchObjects(mode, subreddit=subreddit, after=start_ts, before=end_ts) 57 | 58 | for object in objects: 59 | id = int(object['id'],36) 60 | if id > max_id: 61 | nothing_processed = False 62 | created_utc = object['created_utc'] 63 | max_id = id 64 | if created_utc > start_ts: 65 | start_ts = created_utc 66 | # Output JSON data to the opened file 67 | file.write(json.dumps(object,sort_keys=True,ensure_ascii=True) + "\n") 68 | 69 | # Exit if nothing happened 70 | if nothing_processed: break 71 | start_ts -= 1 72 | 73 | # Sleep a little before the next function call 74 | time.sleep(.5) 75 | 76 | file.close() -------------------------------------------------------------------------------- /airflow/custom_scripts/preprocessing.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import logging 4 | import pyarrow.csv as pv 5 | import pyarrow.parquet as pq 6 | 7 | 8 | def json_to_csv(json_filepath, csv_filepath, mode): 9 | 10 | with open(json_filepath) as json_file: 11 | json_list = list(json_file) 12 | # Create new file to write to 13 | newfile = open(csv_filepath, 'w', encoding='utf-8', newline='') 14 | csv_writer = csv.writer(newfile) 15 | 16 | if mode == 'submission': 17 | csv_writer.writerow(["id","title","author", "created_utc", "num_comments","total_awards_received"]) 18 | for json_str in json_list: 19 | # convert string to json object 20 | result = json.loads(json_str) 21 | # write each column, row by row 22 | id = result['id'] 23 | title = result['title'] 24 | author = result['author'] 25 | created_utc = result['created_utc'] 26 | num_comments = result['num_comments'] 27 | total_awards_received = result['total_awards_received'] 28 | csv_writer.writerow([id, title, author, created_utc, num_comments, total_awards_received]) 29 | newfile.close() 30 | 31 | elif mode == 'comment': 32 | csv_writer.writerow(["id", "author", "created_utc", "body", "total_awards_received"]) 33 | for json_str in json_list: 34 | # convert string to json object 35 | result = json.loads(json_str) 36 | # write each column, row by row 37 | id = result['id'] 38 | author = result['author'] 39 | created_utc = result['created_utc'] 40 | body = result['body'] 41 | total_awards_received = result['total_awards_received'] 42 | csv_writer.writerow([id, title, author, created_utc, body, total_awards_received]) 43 | newfile.close() 44 | 45 | def csv_to_parquet(csv_filepath, parquet_filepath): 46 | 47 | if not csv_filepath.endswith('.csv'): 48 | logging.error("Not a CSV file") 49 | return 50 | table = pv.read_csv(csv_filepath) 51 | pq.write_table(table, parquet_filepath) -------------------------------------------------------------------------------- /airflow/dags/load_datawarehouse.sql: -------------------------------------------------------------------------------- 1 | -- create temp table (with modification) 2 | CREATE OR REPLACE TEMP TABLE {{ subreddit }}_{{ mode }} 3 | AS 4 | SELECT id, title, author, num_comments, total_awards_received, DATE(TIMESTAMP_SECONDS(created_utc)) AS {{ mode }}_date 5 | FROM {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_external_table; 6 | 7 | -- if permanent table does not exist, create permanent table from temp table (with partition) 8 | CREATE TABLE IF NOT EXISTS {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all 9 | PARTITION BY {{ mode }}_date 10 | AS 11 | SELECT * FROM {{ subreddit }}_{{ mode }}; 
12 | 13 | -- maintain idempotency using delete-write 14 | -- delete rows from permanent table (only delete data that the pipeline will re-create) 15 | DELETE {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all 16 | WHERE {{ mode }}_date BETWEEN '{{ ds }}' AND '{{ macros.ds_add(data_interval_end.strftime('%Y-%m-%d'), -1) }}'; 17 | 18 | -- insert data from temp table to permanent table 19 | INSERT INTO {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_all 20 | SELECT * FROM {{ subreddit }}_{{ mode }}; -------------------------------------------------------------------------------- /airflow/dags/stocks_dag.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | from airflow import DAG 4 | from airflow.operators.bash import BashOperator 5 | from airflow.operators.python import PythonOperator 6 | 7 | from google.cloud import storage 8 | from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator 9 | from airflow.providers.google.cloud.operators.dataproc import ClusterGenerator, DataprocCreateClusterOperator, DataprocSubmitJobOperator 10 | 11 | from custom_scripts.ingest_reddit import extract_reddit_data 12 | from custom_scripts.preprocessing import json_to_csv, csv_to_parquet 13 | 14 | BUCKET = os.environ.get("GCP_GCS_BUCKET") 15 | AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/") 16 | BIGQUERY_DATASET = os.environ.get('BIGQUERY_DATASET', 'stocks_data') 17 | PROJECT_ID = os.environ.get('GCP_PROJECT_ID') 18 | PYSPARK_URI = f'gs://{BUCKET}/scripts/wordcount_by_date.py' 19 | 20 | CLUSTER_GENERATOR_CONFIG = ClusterGenerator( 21 | project_id=PROJECT_ID, 22 | zone="asia-southeast1-a", 23 | master_machine_type="n1-standard-4", 24 | master_disk_size=500, 25 | num_masters=1, 26 | num_workers=0, # single node mode 27 | idle_delete_ttl=900, # idle time before deleting cluster 28 | init_actions_uris=[f'gs://{BUCKET}/scripts/pip-install.sh'], 29 | metadata={'PIP_PACKAGES': 'spark-nlp'}, 30 | ).make() 31 | 32 | def load_to_gcs(bucket, object_name, local_file): 33 | """ 34 | Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 35 | :param bucket: GCS bucket name 36 | :param object_name: target path & file-name 37 | :param local_file: source path & file-name 38 | :return: 39 | """ 40 | client = storage.Client() 41 | bucket = client.bucket(bucket) 42 | 43 | blob = bucket.blob(object_name) 44 | blob.upload_from_filename(local_file) 45 | 46 | def reddit_pipeline_template( 47 | # arguments 48 | dag, 49 | subreddit, 50 | mode, 51 | json_filepath, 52 | csv_filepath, 53 | parquet_filepath, 54 | gcs_path 55 | ): 56 | with dag: 57 | download_data_task = PythonOperator( 58 | task_id = 'ingest_reddit_json', 59 | python_callable = extract_reddit_data, 60 | op_kwargs = { 61 | 'subreddit': subreddit, 62 | 'mode': mode, 63 | 'start': '{{ data_interval_start }}', 64 | 'end': '{{ data_interval_end }}', 65 | 'filepath': json_filepath 66 | } 67 | ) 68 | 69 | json_to_csv_task = PythonOperator( 70 | task_id = 'json_to_csv', 71 | python_callable = json_to_csv, 72 | op_kwargs = { 73 | 'json_filepath': json_filepath, 74 | 'csv_filepath': csv_filepath, 75 | 'mode': mode 76 | } 77 | ) 78 | 79 | csv_to_parquet_task = PythonOperator( 80 | task_id = 'csv_to_parquet', 81 | python_callable = csv_to_parquet, 82 | op_kwargs = { 83 | 'csv_filepath': csv_filepath, 84 | 'parquet_filepath': parquet_filepath 85 | } 86 | ) 87 | 88 | load_to_gcs_task = PythonOperator( 89 | task_id 
= "load_to_gcs", 90 | python_callable = load_to_gcs, 91 | op_kwargs={ 92 | "bucket": BUCKET, 93 | "object_name": gcs_path, 94 | "local_file": parquet_filepath, 95 | } 96 | ) 97 | 98 | delete_local_json_csv = BashOperator( 99 | task_id = "delete_local_json_csv", 100 | bash_command = f'rm {json_filepath} {csv_filepath}' 101 | ) 102 | 103 | QUERY = f'''CREATE OR REPLACE EXTERNAL TABLE {BIGQUERY_DATASET}.{subreddit}_{mode}_external_table 104 | OPTIONS ( 105 | format="PARQUET", 106 | uris=["gs://{BUCKET}/{gcs_path}"] 107 | );''' 108 | 109 | create_BQ_external_table_task = BigQueryInsertJobOperator( 110 | task_id = 'create_external_table', 111 | configuration={ 112 | 'query': { 113 | 'query': QUERY, 114 | 'useLegacySql': False, 115 | } 116 | } 117 | ) 118 | 119 | # Create a partitioned table from external table 120 | BQ_create_partitioned_table_task = BigQueryInsertJobOperator( 121 | task_id = "bq_create_partitioned_table", 122 | configuration={ 123 | "query": { 124 | "query": "{% include 'load_datawarehouse.sql' %}", 125 | "useLegacySql": False, 126 | } 127 | } 128 | ) 129 | 130 | QUERY_CREATE_WORDCOUNT_TABLE = ''' 131 | CREATE TABLE IF NOT EXISTS {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_wordcount ( 132 | word STRING, 133 | wordcount INTEGER, 134 | {{ mode }}_date DATE 135 | ) 136 | PARTITION BY {{ mode }}_date''' 137 | 138 | create_wordcount_table_task = BigQueryInsertJobOperator( 139 | task_id = 'create_wordcount_table', 140 | configuration={ 141 | 'query': { 142 | 'query': QUERY_CREATE_WORDCOUNT_TABLE, 143 | 'useLegacySql': False, 144 | } 145 | } 146 | ) 147 | # task will marked as 'success' if cluster exists 148 | create_cluster_operator_task = DataprocCreateClusterOperator( 149 | task_id='create_dataproc_cluster', 150 | cluster_name="de-spark-cluster", 151 | project_id=PROJECT_ID, 152 | region="asia-southeast1", 153 | cluster_config=CLUSTER_GENERATOR_CONFIG 154 | ) 155 | 156 | QUERY_DELETE_WORDCOUNT_ROWS = ''' 157 | DELETE {{ BIGQUERY_DATASET }}.{{ subreddit }}_{{ mode }}_wordcount 158 | WHERE {{ mode }}_date BETWEEN '{{ ds }}' AND '{{ macros.ds_add(data_interval_end.strftime('%Y-%m-%d'), -1) }}'; 159 | ''' 160 | 161 | # delete any existing duplicate rows before writing 162 | delete_wordcountdup_task = BigQueryInsertJobOperator( 163 | task_id = 'delete_wordcountdup', 164 | configuration={ 165 | 'query': { 166 | 'query': QUERY_DELETE_WORDCOUNT_ROWS, 167 | 'useLegacySql': False, 168 | } 169 | } 170 | ) 171 | 172 | pyspark_job = { 173 | "reference": {"project_id": PROJECT_ID}, 174 | "placement": {"cluster_name": 'de-spark-cluster'}, 175 | "pyspark_job": { 176 | "main_python_file_uri": PYSPARK_URI, 177 | "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], 178 | "properties": { 179 | "spark.jars.packages":"com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3" 180 | }, 181 | "args": [ 182 | f"--input=gs://{BUCKET}/{gcs_path}", 183 | f"--dataset={BIGQUERY_DATASET}", 184 | f"--subreddit={subreddit}", 185 | f"--mode={mode}" 186 | ] 187 | } 188 | } 189 | 190 | wordcount_sparksubmit_task = DataprocSubmitJobOperator( 191 | task_id='wordcount_sparksubmit', 192 | job=pyspark_job, 193 | region='asia-southeast1', 194 | project_id=PROJECT_ID, 195 | trigger_rule='all_done' 196 | ) 197 | 198 | download_data_task >> json_to_csv_task >> csv_to_parquet_task >> load_to_gcs_task >> [delete_local_json_csv, create_BQ_external_table_task] 199 | create_BQ_external_table_task >> BQ_create_partitioned_table_task 200 | load_to_gcs_task >> create_wordcount_table_task >> create_cluster_operator_task 
>> delete_wordcountdup_task >> wordcount_sparksubmit_task 201 | 202 | default_args = { 203 | "owner": "Zachary", 204 | "start_date": datetime(2022, 3, 1), 205 | "end_date": datetime(2022, 4, 30), 206 | "depends_on_past": False, 207 | "retries": 1, 208 | "retry_delay": timedelta(seconds=60) 209 | } 210 | 211 | # all dag definitions (dag = DAG()) should be in the global scope 212 | stocks_submission_weekly_dag = DAG( 213 | dag_id = 'stocks_submission_weekly', 214 | schedule_interval = '@weekly', 215 | catchup = True, 216 | max_active_runs = 3, 217 | default_args = default_args, 218 | user_defined_macros={ 219 | "BIGQUERY_DATASET": BIGQUERY_DATASET, 220 | "subreddit": 'stocks', 221 | "mode": 'submission', 222 | } 223 | ) 224 | 225 | # submission 226 | reddit_pipeline_template( 227 | dag = stocks_submission_weekly_dag, 228 | subreddit = 'stocks', 229 | mode = 'submission', 230 | json_filepath = AIRFLOW_HOME + '/data/json/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.json', 231 | csv_filepath = AIRFLOW_HOME + '/data/csv/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.csv', 232 | parquet_filepath = AIRFLOW_HOME + '/data/parquet/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet', 233 | gcs_path = 'stocks/submission/stocks_submission_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet' 234 | ) 235 | 236 | 237 | #---------------------------- THE SECTION BELOW IS UNUSED --------------------------- 238 | # stocks_comment_weekly_dag = DAG( 239 | # dag_id = 'stocks_comment_weekly', 240 | # schedule_interval = '@weekly', 241 | # catchup = True, 242 | # max_active_runs = 2, 243 | # default_args = default_args 244 | # ) 245 | 246 | # comment 247 | # reddit_pipeline_template( 248 | # dag = stocks_comment_weekly_dag, 249 | # subreddit = 'stocks', 250 | # mode = 'comment', 251 | # json_filepath = AIRFLOW_HOME + '/data/json/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.json', 252 | # csv_filepath = AIRFLOW_HOME + '/data/csv/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.csv', 253 | # parquet_filepath = AIRFLOW_HOME + '/data/parquet/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet', 254 | # gcs_path = 'stocks/comment/stocks_comment_{{ data_interval_start.strftime("%Y-%m-%d") }}.parquet' 255 | # ) -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. 
Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.2.4 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.2.4} 48 | build: . 49 | environment: 50 | &airflow-common-env 51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 52 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 53 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 54 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 55 | AIRFLOW__CORE__FERNET_KEY: '' 56 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 57 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 58 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 59 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 60 | 61 | # self-defined 62 | GCP_PROJECT_ID: 'de-r-stocks' 63 | GCP_GCS_BUCKET: 'datalake_de-r-stocks' 64 | GOOGLE_APPLICATION_CREDENTIALS: '/.google/credentials/de-r-stocks.json' 65 | AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT: 'google-cloud-platform://?extra__google_cloud_platform__key_path=/.google/credentials/de-r-stocks.json' 66 | volumes: 67 | - ./dags:/opt/airflow/dags 68 | - ./logs:/opt/airflow/logs 69 | - ./plugins:/opt/airflow/plugins 70 | - ~/.google/credentials/:/.google/credentials:ro 71 | - ./custom_scripts:/opt/airflow/custom_scripts 72 | - ./data/json:/opt/airflow/data/json 73 | - ./data/csv:/opt/airflow/data/csv 74 | - ./data/parquet:/opt/airflow/data/parquet 75 | user: "${AIRFLOW_UID:-50000}:0" 76 | depends_on: 77 | &airflow-common-depends-on 78 | redis: 79 | condition: service_healthy 80 | postgres: 81 | condition: service_healthy 82 | 83 | services: 84 | postgres: 85 | image: postgres:13 86 | environment: 87 | POSTGRES_USER: airflow 88 | POSTGRES_PASSWORD: airflow 89 | POSTGRES_DB: airflow 90 | volumes: 91 | - postgres-db-volume:/var/lib/postgresql/data 92 | healthcheck: 93 | test: ["CMD", "pg_isready", "-U", "airflow"] 94 | interval: 5s 95 | retries: 5 96 | restart: always 97 | 98 | redis: 99 | image: redis:latest 100 | expose: 101 | - 6379 102 | healthcheck: 103 | test: ["CMD", "redis-cli", "ping"] 104 | interval: 5s 105 | timeout: 30s 106 | retries: 50 107 | restart: always 108 | 109 | airflow-webserver: 110 | <<: *airflow-common 111 | command: webserver 112 | ports: 113 | - 8080:8080 114 | healthcheck: 115 | test: ["CMD", "curl", "--fail", 
"http://localhost:8080/health"] 116 | interval: 10s 117 | timeout: 10s 118 | retries: 5 119 | restart: always 120 | depends_on: 121 | <<: *airflow-common-depends-on 122 | airflow-init: 123 | condition: service_completed_successfully 124 | 125 | airflow-scheduler: 126 | <<: *airflow-common 127 | command: scheduler 128 | healthcheck: 129 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 130 | interval: 10s 131 | timeout: 10s 132 | retries: 5 133 | restart: always 134 | depends_on: 135 | <<: *airflow-common-depends-on 136 | airflow-init: 137 | condition: service_completed_successfully 138 | 139 | airflow-worker: 140 | <<: *airflow-common 141 | command: celery worker 142 | healthcheck: 143 | test: 144 | - "CMD-SHELL" 145 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 146 | interval: 10s 147 | timeout: 10s 148 | retries: 5 149 | environment: 150 | <<: *airflow-common-env 151 | # Required to handle warm shutdown of the celery workers properly 152 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 153 | DUMB_INIT_SETSID: "0" 154 | restart: always 155 | depends_on: 156 | <<: *airflow-common-depends-on 157 | airflow-init: 158 | condition: service_completed_successfully 159 | 160 | airflow-triggerer: 161 | <<: *airflow-common 162 | command: triggerer 163 | healthcheck: 164 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 165 | interval: 10s 166 | timeout: 10s 167 | retries: 5 168 | restart: always 169 | depends_on: 170 | <<: *airflow-common-depends-on 171 | airflow-init: 172 | condition: service_completed_successfully 173 | 174 | airflow-init: 175 | <<: *airflow-common 176 | entrypoint: /bin/bash 177 | # yamllint disable rule:line-length 178 | command: 179 | - -c 180 | - | 181 | function ver() { 182 | printf "%04d%04d%04d%04d" $${1//./ } 183 | } 184 | airflow_version=$$(gosu airflow airflow version) 185 | airflow_version_comparable=$$(ver $${airflow_version}) 186 | min_airflow_version=2.2.0 187 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 188 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 189 | echo 190 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 191 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 192 | echo 193 | exit 1 194 | fi 195 | if [[ -z "${AIRFLOW_UID}" ]]; then 196 | echo 197 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 198 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 199 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 200 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 201 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 202 | echo 203 | fi 204 | one_meg=1048576 205 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 206 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 207 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 208 | warning_resources="false" 209 | if (( mem_available < 4000 )) ; then 210 | echo 211 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 212 | echo "At least 4GB of memory required. 
You have $$(numfmt --to iec $$((mem_available * one_meg)))" 213 | echo 214 | warning_resources="true" 215 | fi 216 | if (( cpus_available < 2 )); then 217 | echo 218 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 219 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 220 | echo 221 | warning_resources="true" 222 | fi 223 | if (( disk_available < one_meg * 10 )); then 224 | echo 225 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 226 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 227 | echo 228 | warning_resources="true" 229 | fi 230 | if [[ $${warning_resources} == "true" ]]; then 231 | echo 232 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 233 | echo "Please follow the instructions to increase amount of resources available:" 234 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 235 | echo 236 | fi 237 | mkdir -p /sources/logs /sources/dags /sources/plugins 238 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 239 | exec /entrypoint airflow version 240 | # yamllint enable rule:line-length 241 | environment: 242 | <<: *airflow-common-env 243 | _AIRFLOW_DB_UPGRADE: 'true' 244 | _AIRFLOW_WWW_USER_CREATE: 'true' 245 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 246 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 247 | user: "0:0" 248 | volumes: 249 | - .:/sources 250 | 251 | airflow-cli: 252 | <<: *airflow-common 253 | profiles: 254 | - debug 255 | environment: 256 | <<: *airflow-common-env 257 | CONNECTION_CHECK_MAX_COUNT: "0" 258 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 259 | command: 260 | - bash 261 | - -c 262 | - airflow 263 | 264 | flower: 265 | <<: *airflow-common 266 | command: celery flower 267 | ports: 268 | - 5555:5555 269 | healthcheck: 270 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 271 | interval: 10s 272 | timeout: 10s 273 | retries: 5 274 | restart: always 275 | depends_on: 276 | <<: *airflow-common-depends-on 277 | airflow-init: 278 | condition: service_completed_successfully 279 | 280 | volumes: 281 | postgres-db-volume: 282 | -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-google 2 | pyarrow 3 | pendulum 4 | requests -------------------------------------------------------------------------------- /airflow/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} 3 | export AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT=${AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT} 4 | 5 | airflow db upgrade 6 | 7 | airflow users create -r Admin -u admin -p admin -e admin@example.com -f admin -l airflow 8 | # "$_AIRFLOW_WWW_USER_USERNAME" -p "$_AIRFLOW_WWW_USER_PASSWORD" 9 | 10 | airflow webserver 11 | -------------------------------------------------------------------------------- /images/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/airflow.png 
-------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/architecture.png -------------------------------------------------------------------------------- /images/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/images/dashboard.png -------------------------------------------------------------------------------- /spark/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## Code breakdown for `wordcount_by_date.py` 4 | The script does the following: 5 | - Reads a Parquet file from a GCS bucket 6 | - It assumes that the file contains the columns 'author', 'created_utc' and 'title' 7 | - Builds an NLP pipeline and transforms the text data in 'title' with the pipeline 8 | - https://nlp.johnsnowlabs.com/docs/en/install#python 9 | - https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb 10 | - Creates a dataframe that contains the token count on each date, e.g.: 11 | 12 | | word | wordcount | submission_date | 13 | |--------|-------|------------| 14 | | invest | 6 | 2022-04-11 | 15 | | invest | 4 | 2022-04-12 | 16 | | market | 2 | 2022-04-12 | 17 | 18 | - Lastly, it writes the dataframe containing the word counts to a BigQuery table 19 | 20 | ### Write dataframe into a BigQuery table 21 | Official doc: https://github.com/GoogleCloudDataproc/spark-bigquery-connector 22 | 23 | ``` 24 | df_wordcountbydate.write.format('bigquery') \ 25 | .option('table', f'{dataset}.{subreddit}_{mode}_wordcount') \ 26 | .option('temporaryGcsBucket', BUCKET) \ 27 | .option('partitionField', f'{mode}_date') \ 28 | .option('partitionType', 'DAY') \ 29 | .mode('append') \ 30 | .save() 31 | ``` 32 | 33 | This is done using a BigQuery connector for Spark. The connector must be specified when submitting the PySpark job for this script, which is done in the Airflow DAG `stocks_dag.py`.
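Once a run has completed, a quick sanity check of the output can be done with the BigQuery Python client — a minimal sketch, assuming `google-cloud-bigquery` is installed and that you kept the default names used in this project (project `de-r-stocks`, dataset `stocks_data`, table `stocks_submission_wordcount`):

```
# Top words over March 2022, read back from the wordcount table written by the Spark job
from google.cloud import bigquery

client = bigquery.Client(project="de-r-stocks")
query = """
    SELECT word, SUM(wordcount) AS total
    FROM `stocks_data.stocks_submission_wordcount`
    WHERE submission_date BETWEEN '2022-03-01' AND '2022-03-31'
    GROUP BY word
    ORDER BY total DESC
    LIMIT 10
"""
for row in client.query(query).result():
    print(row.word, row.total)
```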
34 | -------------------------------------------------------------------------------- /spark/spark-bigquery-latest_2.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacharyt-cs/reddit-data-engineering/76b1a345e8a34431cfc310c7bdc2e378eb4057d1/spark/spark-bigquery-latest_2.12.jar -------------------------------------------------------------------------------- /spark/wordcount_by_date.py: -------------------------------------------------------------------------------- 1 | # %% 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import StructField, StructType, StringType, FloatType, IntegerType 4 | from pyspark.ml.feature import CountVectorizer 5 | from pyspark.ml import Pipeline 6 | import pyspark.sql.functions as F 7 | 8 | from sparknlp.annotator import LemmatizerModel, Tokenizer, Normalizer, StopWordsCleaner, NGramGenerator 9 | from sparknlp.base import Finisher, DocumentAssembler 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--input', required=True) 14 | parser.add_argument('--dataset', required=True) 15 | parser.add_argument('--subreddit', required=True) 16 | parser.add_argument('--mode', required=True) 17 | args = parser.parse_args() 18 | 19 | input = args.input 20 | dataset = args.dataset 21 | subreddit = args.subreddit 22 | mode = args.mode 23 | 24 | # change this to your bucket 25 | # bucket is used as temporary storage while writing data from Spark to BigQuery 26 | BUCKET = 'datalake_de-r-stocks' 27 | 28 | # Start Spark session 29 | spark = SparkSession.builder \ 30 | .appName('preprocessing_wordcount') \ 31 | .config('"spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3"') \ 32 | .getOrCreate() 33 | 34 | # %% 35 | # Access data from GCS 36 | df = spark.read.parquet(input) 37 | 38 | # 1. Remove posts by AutoModerator 39 | # 2. Remove duplicate titles 40 | # 3. Convert unix timestamp to date 41 | # 4. 
Keep title and date columns 42 | df_filter = df.filter(~F.col('author').contains('AutoModerator')) \ 43 | .dropDuplicates(['title']) \ 44 | .withColumn('date', F.from_unixtime(F.col('created_utc'), 'yyyy-MM-dd')) \ 45 | .select('title', 'date') 46 | 47 | documentAssembler = DocumentAssembler() \ 48 | .setInputCol('title') \ 49 | .setOutputCol('title_document') 50 | 51 | tokenizer = Tokenizer() \ 52 | .setInputCols(['title_document']) \ 53 | .setOutputCol('title_token') 54 | 55 | normalizer = Normalizer() \ 56 | .setInputCols(['title_token']) \ 57 | .setOutputCol('title_normalized') \ 58 | .setLowercase(True) 59 | 60 | lemmatizer = LemmatizerModel.pretrained() \ 61 | .setInputCols(['title_normalized']) \ 62 | .setOutputCol('title_lemma') 63 | 64 | stopwords_cleaner = StopWordsCleaner() \ 65 | .setInputCols(['title_lemma']) \ 66 | .setOutputCol('title_cleaned') \ 67 | .setCaseSensitive(False) 68 | 69 | ngrams_cum = NGramGenerator() \ 70 | .setInputCols(["title_cleaned"]) \ 71 | .setOutputCol("title_ngrams") \ 72 | .setN(2) \ 73 | .setEnableCumulative(True)\ 74 | .setDelimiter("_") # Default is space 75 | 76 | finisher = Finisher() \ 77 | .setInputCols(['title_ngrams']) \ 78 | .setOutputCols(['title_finished']) \ 79 | .setCleanAnnotations(False) 80 | 81 | nlpPipeline = Pipeline(stages=[ 82 | documentAssembler, 83 | tokenizer, 84 | normalizer, 85 | lemmatizer, 86 | stopwords_cleaner, 87 | ngrams_cum, 88 | finisher 89 | ]) 90 | 91 | df_result = nlpPipeline.fit(df_filter).transform(df_filter).select('title_finished', 'date') 92 | 93 | # CountVectorizer model 94 | cv = CountVectorizer(inputCol='title_finished', outputCol='features', minDF=3.0) 95 | 96 | # Train on all submissions 97 | model = cv.fit(df_result) 98 | 99 | df_tokensbydate = df_result.groupBy('date').agg(F.flatten(F.collect_list('title_finished')).alias('title_finished')) 100 | 101 | # Get counts for each date 102 | counts = model.transform(df_tokensbydate).select('date','features').collect() 103 | 104 | # Create empty dataframe 105 | df_wordcountbydate = spark.createDataFrame(spark.sparkContext.emptyRDD(), 106 | schema=StructType(fields=[ 107 | StructField("word", StringType()), 108 | StructField("count", FloatType()), 109 | StructField("date", StringType())])) 110 | 111 | # Append count for each day to dataframe 112 | for row in range(len(counts)): 113 | test_dict = dict(zip(model.vocabulary, (float(x) for x in counts[row]['features'].values))) 114 | df_temp = spark.createDataFrame(test_dict.items(), 115 | schema=StructType(fields=[ 116 | StructField("word", StringType()), 117 | StructField("count", FloatType())])) 118 | df_temp = df_temp.withColumn('date', F.lit(counts[row]['date'])) 119 | df_wordcountbydate = df_wordcountbydate.unionAll(df_temp) 120 | 121 | # %% 122 | 123 | df_wordcountbydate = df_wordcountbydate.withColumn('count', F.col('count').cast(IntegerType())) \ 124 | .withColumn(f'{mode}_date', F.to_date(F.col('date'), 'yyyy-MM-dd')) \ 125 | .withColumnRenamed('count', 'wordcount') \ 126 | .drop('date') 127 | 128 | # upload dataframe to BigQuery 129 | df_wordcountbydate.write.format('bigquery') \ 130 | .option('table', f'{dataset}.{subreddit}_{mode}_wordcount') \ 131 | .option('temporaryGcsBucket', BUCKET) \ 132 | .option('partitionField', f'{mode}_date') \ 133 | .option('partitionType', 'DAY') \ 134 | .mode('append') \ 135 | .save() -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | 
terraform { 2 | required_version = ">= 1.0" 3 | required_providers { 4 | google = { 5 | source = "hashicorp/google" 6 | } 7 | } 8 | } 9 | 10 | provider "google" { 11 | credentials = file(var.credentials) 12 | project = var.project 13 | region = var.region 14 | } 15 | 16 | resource "google_storage_bucket" "data-lake-bucket" { 17 | name = "${local.data_lake_bucket}_${var.project}" 18 | location = var.region 19 | 20 | storage_class = var.storage_class 21 | uniform_bucket_level_access = true 22 | 23 | versioning { 24 | enabled = true 25 | } 26 | 27 | lifecycle_rule { 28 | action { 29 | type = "Delete" 30 | } 31 | condition { 32 | age = 30 33 | } 34 | } 35 | 36 | force_destroy = true 37 | } 38 | 39 | resource "google_bigquery_dataset" "dataset" { 40 | dataset_id = var.BQ_DATASET 41 | project = var.project 42 | location = var.region 43 | } -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | locals { 2 | data_lake_bucket = "datalake" 3 | } 4 | 5 | variable "project" { 6 | default = "de-r-stocks" 7 | description = "GCP project ID" 8 | } 9 | 10 | variable "region" { 11 | type = string 12 | default = "asia-southeast1" 13 | description = "Region for GCP resources" 14 | } 15 | 16 | variable "storage_class" { 17 | default = "STANDARD" 18 | description = "Storage class type for bucket" 19 | } 20 | 21 | variable "BQ_DATASET" { 22 | type = string 23 | default = "stocks_data" 24 | description = "BigQuery dataset that raw data from GCS will be written to" 25 | } 26 | 27 | variable "credentials" { 28 | type = string 29 | default = "/home/ztmj96/.google/credentials/de-r-stocks.json" 30 | description = "Path for GCP account credentials" 31 | } 32 | 33 | 34 | --------------------------------------------------------------------------------