├── .env
├── .gitignore
├── Dockerfile
├── README.md
├── celery
│   └── Dockerfile
├── dags
│   ├── ETL_Databricks.py
│   ├── decorator.py
│   ├── postgres_spark.py
│   ├── sampleTaskFlow.py
│   └── weather_data.py
├── databricks
│   └── notebooks
│       └── opendentalfhir.py
├── docker-compose-celery.yaml
├── docker-compose.yaml
├── filemanager
│   └── index.php
├── img
│   ├── DAG.JPG
│   ├── airflowcelery.png
│   ├── airflowcelerydags.png
│   └── views.JPG
├── jars
│   └── postgresql-42.6.0.jar
├── pgadmin.json
├── spark
│   ├── spark_postgres_query.py
│   └── spark_weather_data.py
└── variables
    ├── airflow_connections.json
    └── airflow_variables.json

/.env:
--------------------------------------------------------------------------------
1 | AIRFLOW_UID=1000
2 | AIRFLOW_GID=0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dags/__pycache__
2 | logs/*
3 | data/*
4 | .databricks/*
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:2.2.3
2 | 
3 | USER root
4 | 
5 | # Install OpenJDK-11
6 | RUN apt update && \
7 |     apt-get install -y openjdk-11-jdk && \
8 |     apt-get install -y ant && \
9 |     apt-get clean;
10 | 
11 | # Set JAVA_HOME
12 | ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/
13 | RUN export JAVA_HOME
14 | 
15 | USER airflow
16 | 
17 | WORKDIR /app
18 | 
19 | COPY variables/airflow_variables.json /app
20 | COPY variables/airflow_connections.json /app
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Apache Airflow data pipeline
2 | ## Consuming the Weather API and Storing Data in a PostgreSQL Database.
3 | 
4 | ### This set of code and instructions has the purpose of instantiating a complete environment from a set of Docker images: Airflow webserver, Airflow scheduler, PostgreSQL, and PySpark.
5 | 
6 | ## Overview
7 | Extract the last 5 days of data from the free API: https://api.openweathermap.org/data/2.5/onecall/timemachine (historical weather data) for 10 different locations.
8 | 
9 | Build a repository of data where we will keep the data extracted from the API. This repository should only hold deduplicated data. Idempotency should also be guaranteed.
10 | 
11 | Build another repository of data that will contain the results of the following calculations from the data stored in step 2.
12 | A dataset containing the location, date and temperature of the highest temperatures reported by location and month.
13 | A dataset containing the average temperature, min temperature, location of min temperature, and location of max temperature per day.
14 | Remote infrastructure with all components required for the Python workflow to work.
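For reference, the extraction step boils down to one HTTP GET per location and day against the timemachine endpoint above. Below is a minimal standalone sketch of that call; the coordinates are the first entry of the DAG's suggested-locations list, and `YOUR_OPENWEATHERMAP_API_KEY` is a placeholder you must replace with your own token.

```
import time
from datetime import datetime, timedelta

import requests

API_KEY = "YOUR_OPENWEATHERMAP_API_KEY"  # placeholder: use your own OpenWeatherMap token
ENDPOINT = "https://api.openweathermap.org/data/2.5/onecall/timemachine"

# One location (Jacksonville, FL from the suggested list) and one day, 5 days back.
day = datetime.today() - timedelta(days=5)
params = {
    "lat": "30.318878",
    "lon": "-81.690173",
    "units": "metric",                        # Celsius; "imperial" gives Fahrenheit
    "dt": int(time.mktime(day.timetuple())),  # unix timestamp of the requested day
    "appid": API_KEY,
}

response = requests.get(ENDPOINT, params=params)
response.raise_for_status()
payload = response.json()
print(payload.get("timezone"), "hourly records:", len(payload.get("hourly", [])))
```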
15 | 
16 | 
17 | The Airflow DAG extracts historical data from the weather API for 10 different locations (5 days per location) and stores it locally. It then does light pre-processing and a lookup against the geopy API to enrich the data with location details, also storing the pre-processed JSON files locally. The heavier processing is handed to local Spark through the Airflow Spark Submit operator: the Spark script runs on the local Spark cluster, performs all the data processing, filtering and aggregations, and writes the results back to the local filesystem. Once the files are stored back as CSV files, PostgreSQL scripts copy (ingest) them into the PostgreSQL database, and finally views are created with the aggregated queries to answer the requirements.
18 | 
19 | In addition, the Dockerfile and docker-compose scripts are in charge of spinning up containers with everything needed to run the Airflow DAG successfully end to end.
20 | 
21 | 
22 | Remember to sign up for an account at https://home.openweathermap.org/ and create an API token.
23 | 
24 | 
25 | ## Requirements
26 | * Linux operating system (Ubuntu 20.04 preferred), or Windows with Docker Desktop.
27 | * [Git Versioning Software](https://git-scm.com/download/linux)
28 | * [Docker Container Virtualization Software](https://docs.docker.com/engine/install/ubuntu/)
29 | * [Docker-Compose multi-container Docker applications](https://docs.docker.com/compose/install/)
30 | 
31 | 
32 | 
33 | ## Instructions Airflow 2.7.1 (CeleryExecutor) Heavier Version
34 | Clone this repository into your Linux working directory and navigate into it.
35 | 
36 | Run the commands:
37 | ```
38 | 
39 | # Clone Git repository to the created folder
40 | git clone https://github.com/mpavanetti/airflow .
41 | 
42 | # Run docker compose
43 | sudo docker-compose -f docker-compose-celery.yaml up -d
44 | 
45 | # Import Airflow connections and variables
46 | sudo docker exec -it airflow_airflow-worker_1 airflow connections import /opt/airflow/variables/airflow_connections.json
47 | sudo docker exec -it airflow_airflow-worker_1 airflow variables import /opt/airflow/variables/airflow_variables.json
48 | 
49 | # Add permissions (If any write error happens)
50 | sudo chmod -R 777 ../airflow
51 | 
52 | # Stop containers
53 | sudo docker-compose -f docker-compose-celery.yaml kill
54 | ```
55 | 
56 | 
57 | 
58 | ## Instructions Airflow 2.2.3 (LocalExecutor) Lighter Version
59 | Clone this repository into your Linux working directory and navigate into it.
60 | 
61 | Run the commands:
62 | ```
63 | 
64 | # Clone Git repository to the created folder
65 | git clone https://github.com/mpavanetti/airflow .
66 | 
67 | # Run docker compose
68 | sudo docker-compose up -d
69 | 
70 | # Import Airflow connections and variables
71 | sudo docker exec -it airflow_airflow-scheduler_1 airflow connections import /opt/airflow/variables/airflow_connections.json
72 | sudo docker exec -it airflow_airflow-scheduler_1 airflow variables import /opt/airflow/variables/airflow_variables.json
73 | 
74 | # Add permissions (If any write error happens)
75 | sudo chmod -R 755 ../airflow
76 | ```
77 | 
78 | In case you have any issues while importing the Airflow connections and variables, take the JSON files and import them manually.
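If the CLI import keeps failing, the snippet below is a small fallback sketch: run it with python inside one of the Airflow containers (the repository's ./variables folder is mounted at /opt/airflow/variables), and it assumes the variables file is a flat JSON object of string values, which is the format `airflow variables import` expects. Connections can likewise be added by hand in the Airflow UI under Admin > Connections.

```
import json

from airflow.models import Variable

# Read the key/value pairs shipped in this repository and store each one
# in the Airflow metadata database.
with open("/opt/airflow/variables/airflow_variables.json") as handle:
    for key, value in json.load(handle).items():
        Variable.set(key, value)
        print(f"Set Airflow variable: {key}")
```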
79 | 
80 | Note that you can manually enter latitude and longitude in the Airflow variables one by one: open the file [airflow_variables.json](airflow_variables.json) and change the parameters weather_data_lat and weather_data_lon respectively. If you leave them blank, the script falls back to 10 suggested locations.
81 | 
82 | Note that the temperature results are in Celsius (units=metric). If you want Fahrenheit, open the variable file [airflow_variables.json](airflow_variables.json) and change the parameter weather_data_units to imperial; use standard for Kelvin.
83 | 
84 | *Important: to change the variables you can go to the Variables section in Apache Airflow and update the values whenever you need, or edit [airflow_variables.json](airflow_variables.json) and redeploy the containers, which might not be very fast.
85 | 
86 | 
87 | 
88 | ## Accesses
89 | Access the Airflow UI through the link http://localhost:8080/
90 | 
91 | Username: airflow
92 | Password: airflow
93 | 
94 | Access Postgres Admin through the link http://localhost:15432/
95 | Username: postgres@email.com
96 | Password: postgres
97 | 
98 | Note: localhost is used as an example. If the application sits on a cloud virtual machine, use the host given by your cloud instance, and make sure the ports are open.
99 | 
100 | 
101 | ## Data Pipeline Run
102 | Go to the Airflow DAGs view, turn on the dag weather_data, and trigger it.
103 | 
104 | 
105 | 
106 | ## Checking DAG result
107 | 
108 | Open pgAdmin in a web browser (e.g., Chrome), go to Servers, and add a server with the information described in the JSON file [pgadmin.json](pgadmin.json).
109 | 
110 | Check the final views:
111 | 
112 | SELECT * FROM VW_DATASET_1;
113 | 
114 | SELECT * FROM VW_DATASET_2;
115 | 
116 | After running the Airflow DAG pipeline, you should expect the following view result in PostgreSQL:
117 | 
118 | 
119 | 
120 | Thanks.
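As an optional extra check from the host machine, the views can also be queried outside pgAdmin. The snippet below is a minimal sketch that assumes the LocalExecutor compose file (which publishes Postgres on port 5432; the Celery compose file does not expose it) and a local psycopg2 installation.

```
import psycopg2

# Connection values come from the postgres service in docker-compose.yaml.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    dbname="airflow",
    user="airflow",
    password="airflow",
)

with conn, conn.cursor() as cur:
    cur.execute("SELECT * FROM vw_dataset_1 LIMIT 10;")
    for row in cur.fetchall():
        print(row)

conn.close()
```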
121 | 122 | 123 | ## Images 124 | Airflow 2.7.1 Celery deployment dags: 125 |  126 | 127 | Weather_data DAG process: 128 |  129 | 130 | 131 | ## Extra notes 132 | 133 | ``` 134 | # Install docker-compose v2 on linux os 135 | sudo curl -L https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose 136 | sudo chmod +x /usr/local/bin/docker-compose 137 | 138 | # Install tiny file manager 139 | sudo docker run -d -v ./filemanager:/var/www/html -p 80:80 -v ./dags:/var/www/html/data/dags -v ./logs:/var/www/html/data/logs -v ./plugins:/var/www/html/data/plugins -v ./spark:/var/www/html/data/spark -v ./jars:/var/www/html/data/jars -v ./variables:/var/www/html/data/variables --restart=always --name filemanager tinyfilemanager/tinyfilemanager:master 140 | 141 | Default username/password: admin/admin@123 and user/12345 142 | 143 | ``` 144 | -------------------------------------------------------------------------------- /celery/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.7.1 2 | 3 | USER root 4 | 5 | # Install OpenJDK-11 6 | RUN apt update && \ 7 | apt-get install -y openjdk-11-jdk && \ 8 | apt-get install -y ant && \ 9 | apt-get clean; 10 | 11 | # Set JAVA_HOME 12 | ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/ 13 | RUN export JAVA_HOME 14 | 15 | USER airflow 16 | 17 | WORKDIR /app 18 | 19 | #COPY /variables/airflow_variables.json /app 20 | #COPY /variables/airflow_connections.json /app -------------------------------------------------------------------------------- /dags/ETL_Databricks.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from airflow.providers.databricks.operators.databricks import DatabricksSubmitRunOperator 4 | 5 | notebook_task_params = { 6 | 'existing_cluster_id': '', 7 | 'notebook_task': { 8 | 'notebook_path': '', 9 | }, 10 | } 11 | 12 | from airflow.decorators import dag, task 13 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator','databricks','notebook']) 14 | def ETL_Databricks(): 15 | 16 | @task() 17 | def extract(): 18 | return True 19 | 20 | # Transform 21 | notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task', json=notebook_task_params) 22 | 23 | @task() 24 | def load(): 25 | return True 26 | 27 | ext = extract() >> notebook_task >> load() 28 | 29 | etl = ETL_Databricks() -------------------------------------------------------------------------------- /dags/decorator.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | from airflow.decorators import dag, task 5 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator']) 6 | def ETL(): 7 | 8 | @task() 9 | def extract(): 10 | return True 11 | 12 | @task() 13 | def transform(): 14 | return True 15 | 16 | @task() 17 | def load(): 18 | return True 19 | 20 | ext = extract() >> transform() >> load() 21 | 22 | etl = ETL() -------------------------------------------------------------------------------- /dags/postgres_spark.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from airflow.decorators import dag, task 4 | from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator 5 | from 
airflow.models import Variable 6 | 7 | # Variables 8 | spark_dir = Variable.get("spark_dir") 9 | jars_dir = Variable.get("jars_dir") 10 | 11 | 12 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator','postgres','spark']) 13 | def postgres_spark(): 14 | 15 | @task() 16 | def start(): 17 | return True 18 | 19 | # Spark Submit 20 | postgres_spark = SparkSubmitOperator( 21 | application=f'{spark_dir}spark_postgres_query.py', 22 | task_id="postgres_spark_query", 23 | jars=f"{jars_dir}postgresql-42.6.0.jar" 24 | ) 25 | 26 | @task() 27 | def finish(): 28 | return True 29 | 30 | ext = start() >> postgres_spark >> finish() 31 | 32 | etl = postgres_spark() -------------------------------------------------------------------------------- /dags/sampleTaskFlow.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | from airflow.decorators import dag, task 5 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator','TaskFlowApi']) 6 | def tutorial_taskflow_api_etl(): 7 | """ 8 | ### TaskFlow API Tutorial Documentation 9 | This is a simple ETL data pipeline example which demonstrates the use of 10 | the TaskFlow API using three simple tasks for Extract, Transform, and Load. 11 | Documentation that goes along with the Airflow TaskFlow API tutorial is 12 | located 13 | [here](https://airflow.apache.org/docs/apache-airflow/stable/tutorial_taskflow_api.html) 14 | """ 15 | @task() 16 | def extract(): 17 | """ 18 | #### Extract task 19 | A simple Extract task to get data ready for the rest of the data 20 | pipeline. In this case, getting data is simulated by reading from a 21 | hardcoded JSON string. 22 | """ 23 | data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}' 24 | 25 | order_data_dict = json.loads(data_string) 26 | return order_data_dict 27 | @task(multiple_outputs=True) 28 | def transform(order_data_dict: dict): 29 | """ 30 | #### Transform task 31 | A simple Transform task which takes in the collection of order data and 32 | computes the total order value. 33 | """ 34 | total_order_value = 0 35 | 36 | for value in order_data_dict.values(): 37 | total_order_value += value 38 | 39 | return {"total_order_value": total_order_value} 40 | @task() 41 | def load(total_order_value: float): 42 | """ 43 | #### Load task 44 | A simple Load task which takes in the result of the Transform task and 45 | instead of saving it to end user review, just prints it out. 
46 | """ 47 | 48 | print(f"Total order value is: {total_order_value:.2f}") 49 | order_data = extract() 50 | order_summary = transform(order_data) 51 | load(order_summary["total_order_value"]) 52 | tutorial_etl_dag = tutorial_taskflow_api_etl() 53 | -------------------------------------------------------------------------------- /dags/weather_data.py: -------------------------------------------------------------------------------- 1 | # Import default Apache Airflow Libraries 2 | from airflow.models import DAG 3 | from airflow.operators.dummy import DummyOperator 4 | from airflow.hooks.base import BaseHook 5 | from airflow.models import Variable 6 | from airflow.providers.http.sensors.http import HttpSensor 7 | from airflow.operators.python import PythonOperator 8 | from airflow.providers.http.operators.http import SimpleHttpOperator 9 | from airflow.utils.task_group import TaskGroup 10 | from airflow.providers.postgres.operators.postgres import PostgresOperator 11 | from airflow.operators.bash import BashOperator 12 | from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator 13 | 14 | # Importing Python Libraries 15 | from datetime import datetime, timedelta 16 | import time 17 | import json 18 | import os 19 | from pandas import json_normalize 20 | from geopy.geocoders import Nominatim 21 | import csv, sqlite3 22 | import glob 23 | import requests 24 | 25 | # Default Arguments and attibutes 26 | default_args ={ 27 | 'start_date': datetime.today() - timedelta(days=1), 28 | 'owner': 'Matheus' 29 | } 30 | 31 | # Get Current date, subtract 5 days and convert to timestamp 32 | todayLessFiveDays = datetime.today() - timedelta(days=5) 33 | todayLessFiveDaysTimestamp = time.mktime(todayLessFiveDays.timetuple()) 34 | 35 | # Store last 5 days date into a list 36 | days=[] 37 | i = 1 38 | while i < 6: 39 | todayLessFiveDays = datetime.today() - timedelta(days=i) 40 | todayLessFiveDaysTimestamp = time.mktime(todayLessFiveDays.timetuple()) 41 | days.append(todayLessFiveDaysTimestamp) 42 | i += 1 43 | 44 | # Get Connection from airflow db 45 | api_connection = BaseHook.get_connection("openweathermapApi") 46 | 47 | # Get Variables 48 | latitude = Variable.get("weather_data_lat") 49 | longitude = Variable.get("weather_data_lon") 50 | units = Variable.get("weather_data_units") 51 | tmp_data_dir = Variable.get("weather_data_tmp_directory") 52 | weather_data_spark_code = Variable.get("weather_data_spark_code") 53 | 54 | # Suggested Locations 55 | 56 | suggested_locations = ( 57 | ['30.318878','-81.690173'], 58 | ['28.538336','-81.379234'], 59 | ['27.950575','-82.457176'], 60 | ['25.761681','-80.191788'], 61 | ['34.052235','-118.243683'], 62 | ['40.712776','-74.005974'], 63 | ['41.878113','-87.629799'], 64 | ['32.776665','-96.796989'], 65 | ['47.950356','-124.385490'], 66 | ['36.169941','-115.139832'] 67 | ) 68 | 69 | # weather data api query params 70 | api_params = { 71 | 'lat':suggested_locations[0][0], 72 | 'lon':suggested_locations[0][1], 73 | 'units':units, 74 | 'dt':int(todayLessFiveDaysTimestamp), 75 | 'appid':api_connection.password, 76 | } 77 | 78 | # Notify, Email 79 | def _notify(ti): 80 | raise ValueError('Api Not Available') 81 | 82 | # Tmp Data Check 83 | def _tmp_data(): 84 | # Checking if directories exist 85 | if not os.path.exists(tmp_data_dir): 86 | os.mkdir(tmp_data_dir) 87 | if not os.path.exists(f'{tmp_data_dir}weather/'): 88 | os.mkdir(f'{tmp_data_dir}weather/') 89 | if not os.path.exists(f'{tmp_data_dir}processed/'): 90 | os.mkdir(f'{tmp_data_dir}processed/') 91 
| if not os.path.exists(f'{tmp_data_dir}processed/current_weather/'): 92 | os.mkdir(f'{tmp_data_dir}processed/current_weather/') 93 | if not os.path.exists(f'{tmp_data_dir}processed/hourly_weather/'): 94 | os.mkdir(f'{tmp_data_dir}processed/hourly_weather/') 95 | 96 | 97 | # Extract Weather 98 | def _extract_weather(): 99 | if((Variable.get("weather_data_lat") == None or Variable.get("weather_data_lat") == '') and (Variable.get("weather_data_lon") == None or Variable.get("weather_data_lon") == '')): 100 | for latitude, longitude in suggested_locations: 101 | for day in days: 102 | # weather data api query params 103 | api_param = { 104 | 'lat':latitude, 105 | 'lon':longitude, 106 | 'units':units, 107 | 'dt':int(day), 108 | 'appid':api_connection.password 109 | } 110 | r = requests.get(url = api_connection.host + Variable.get("weather_data_endpoint"), params = api_param) 111 | data = r.json() 112 | time = datetime.today().strftime('%Y%m%d%H%M%S%f') 113 | with open(f"{tmp_data_dir}/weather/weather_output_{time}.json", "w") as outfile: 114 | json.dump(data, outfile) 115 | else: 116 | for day in days: 117 | # weather data api query params 118 | api_param = { 119 | 'lat':Variable.get("weather_data_lat"), 120 | 'lon':Variable.get("weather_data_lon"), 121 | 'units':units, 122 | 'dt':int(day), 123 | 'appid':api_connection.password 124 | } 125 | r = requests.get(url = api_connection.host + Variable.get("weather_data_endpoint"), params = api_param) 126 | data = r.json() 127 | time = datetime.today().strftime('%Y%m%d%H%M%S%f') 128 | with open(f"{tmp_data_dir}/weather/weather_output_{time}.json", "w") as outfile: 129 | json.dump(data, outfile) 130 | 131 | 132 | # Store Location Iterative 133 | def _process_location_csv_iterative(): 134 | if((latitude == None or latitude == '') and (longitude == None or longitude == '')): 135 | for lat,long in suggested_locations: 136 | _store_location_csv(lat,long) 137 | else: 138 | _store_location_csv(latitude,longitude) 139 | 140 | # Processing and Deduplicating Weather API Data 141 | def _store_location_csv(lat,long): 142 | 143 | # Invoking geo locator api and getting address from latitude and longitude 144 | geolocator = Nominatim(user_agent="weather_data") 145 | location = geolocator.reverse(lat+","+long) 146 | address = location.raw['address'] 147 | #current = datetime.today().strftime('%Y%m%d%H%M%S%f') 148 | 149 | # Process location data 150 | location_df = json_normalize({ 151 | 'latitude':lat, 152 | 'logitude': long, 153 | 'city':address.get('city'), 154 | 'state':address.get('state'), 155 | 'postcode':address.get('postcode'), 156 | 'country':address.get('country') 157 | }) 158 | 159 | # Store Location 160 | location_df.to_csv(f'{tmp_data_dir}location.csv', mode='a', sep=',', index=None, header=False) 161 | 162 | # Processed files 163 | def get_current_weather_file(): 164 | for i in glob.glob(f'{tmp_data_dir}processed/current_weather/part-*.csv'): 165 | return i 166 | 167 | def get_hourly_weather_file(): 168 | for i in glob.glob(f'{tmp_data_dir}processed/hourly_weather/part-*.csv'): 169 | return i 170 | 171 | # DAG Skeleton 172 | with DAG('weather_data', schedule_interval='@daily',default_args=default_args, catchup=False) as dag: 173 | 174 | # Start 175 | start = DummyOperator( 176 | task_id='Start' 177 | ) 178 | 179 | # Temp Data 180 | tmp_data = PythonOperator( 181 | task_id='tmp_data', 182 | python_callable=_tmp_data 183 | ) 184 | 185 | # Create Http Sensor Operator 186 | check_api = HttpSensor( 187 | task_id='check_api', 188 | 
http_conn_id='openweathermapApi', 189 | endpoint=Variable.get("weather_data_endpoint"), 190 | method='GET', 191 | response_check=lambda response: True if response.status_code == 200 or response.status_code == 204 else False, 192 | poke_interval=5, 193 | timeout=60, 194 | retries=2, 195 | mode="reschedule", 196 | soft_fail=False, 197 | request_params = api_params 198 | ) 199 | 200 | # Api is not available 201 | api_not_available = PythonOperator( 202 | task_id='api_not_available', 203 | python_callable=_notify, 204 | trigger_rule='one_failed' 205 | ) 206 | 207 | # Extract User Records Simple Http Operator 208 | extracting_weather = PythonOperator( 209 | task_id='extracting_weather', 210 | python_callable=_extract_weather, 211 | trigger_rule='all_success' 212 | ) 213 | 214 | # TaskGroup for Creating Postgres tables 215 | with TaskGroup('create_postgres_tables') as create_postgres_tables: 216 | 217 | # Create table Location 218 | creating_table_location = PostgresOperator( 219 | task_id='creating_table_location', 220 | postgres_conn_id='postgres_default', 221 | sql=''' 222 | CREATE TABLE IF NOT EXISTS location_tmp ( 223 | latitude VARCHAR(255) NOT NULL, 224 | longitude VARCHAR(255) NOT NULL, 225 | city VARCHAR(255) NULL, 226 | state VARCHAR(255) NULL, 227 | postcode VARCHAR(255) NULL, 228 | country VARCHAR(255) NULL, 229 | PRIMARY KEY (latitude,longitude) 230 | ); 231 | 232 | CREATE TABLE IF NOT EXISTS location ( 233 | latitude VARCHAR(255) NOT NULL, 234 | longitude VARCHAR(255) NOT NULL, 235 | city VARCHAR(255) NULL, 236 | state VARCHAR(255) NULL, 237 | postcode VARCHAR(255) NULL, 238 | country VARCHAR(255) NULL, 239 | PRIMARY KEY (latitude,longitude) 240 | ); 241 | 242 | ''' 243 | ) 244 | 245 | # Create Table Requested Weather 246 | creating_table_requested_weather = PostgresOperator( 247 | task_id='creating_table_requested_weather', 248 | postgres_conn_id='postgres_default', 249 | sql=''' 250 | CREATE TABLE IF NOT EXISTS current_weather_tmp ( 251 | latitude VARCHAR(255) NOT NULL, 252 | longitude VARCHAR(255) NOT NULL, 253 | timezone VARCHAR(255) NOT NULL, 254 | requested_datetime VARCHAR(255) NULL, 255 | sunrise VARCHAR(255) NULL, 256 | sunset VARCHAR(255) NULL, 257 | temp VARCHAR(255) NULL, 258 | feels_like VARCHAR(255) NULL, 259 | pressure VARCHAR(255) NULL, 260 | humidity VARCHAR(255) NULL, 261 | dew_point VARCHAR(255) NULL, 262 | uvi VARCHAR(255) NULL, 263 | clouds VARCHAR(255) NULL, 264 | visibility VARCHAR(255) NULL, 265 | wind_speed VARCHAR(255) NULL, 266 | wind_deg VARCHAR(255) NULL, 267 | weather_id VARCHAR(255) NULL, 268 | weather_main VARCHAR(255) NULL, 269 | weather_description VARCHAR(255) NULL, 270 | weather_icon VARCHAR(255) NULL, 271 | PRIMARY KEY (latitude,longitude,requested_datetime) 272 | ); 273 | 274 | CREATE TABLE IF NOT EXISTS current_weather ( 275 | latitude VARCHAR(255) NOT NULL, 276 | longitude VARCHAR(255) NOT NULL, 277 | timezone VARCHAR(255) NOT NULL, 278 | requested_datetime VARCHAR(255) NULL, 279 | sunrise VARCHAR(255) NULL, 280 | sunset VARCHAR(255) NULL, 281 | temp VARCHAR(255) NULL, 282 | feels_like VARCHAR(255) NULL, 283 | pressure VARCHAR(255) NULL, 284 | humidity VARCHAR(255) NULL, 285 | dew_point VARCHAR(255) NULL, 286 | uvi VARCHAR(255) NULL, 287 | clouds VARCHAR(255) NULL, 288 | visibility VARCHAR(255) NULL, 289 | wind_speed VARCHAR(255) NULL, 290 | wind_deg VARCHAR(255) NULL, 291 | weather_id VARCHAR(255) NULL, 292 | weather_main VARCHAR(255) NULL, 293 | weather_description VARCHAR(255) NULL, 294 | weather_icon VARCHAR(255) NULL, 295 | PRIMARY KEY 
(latitude,longitude,requested_datetime) 296 | ); 297 | 298 | ''' 299 | ) 300 | 301 | # Create Table Hourly Weather 302 | creating_table_hourly_weather = PostgresOperator( 303 | task_id='creating_table_hourly_weather', 304 | postgres_conn_id='postgres_default', 305 | sql=''' 306 | CREATE TABLE IF NOT EXISTS hourly_weather_tmp ( 307 | latitude VARCHAR(255) NOT NULL, 308 | longitude VARCHAR(255) NOT NULL, 309 | timezone VARCHAR(255) NOT NULL, 310 | datetime VARCHAR(255) NULL, 311 | temp VARCHAR(255) NULL, 312 | feels_like VARCHAR(255) NULL, 313 | pressure VARCHAR(255) NULL, 314 | humidity VARCHAR(255) NULL, 315 | dew_point VARCHAR(255) NULL, 316 | uvi VARCHAR(255) NULL, 317 | clouds VARCHAR(255) NULL, 318 | visibility VARCHAR(255) NULL, 319 | wind_speed VARCHAR(255) NULL, 320 | wind_deg VARCHAR(255) NULL, 321 | wind_gust VARCHAR(255) NULL, 322 | weather_id VARCHAR(255) NULL, 323 | weather_main VARCHAR(255) NULL, 324 | weather_description VARCHAR(255) NULL, 325 | weather_icon VARCHAR(255) NULL, 326 | PRIMARY KEY (latitude,longitude,datetime) 327 | ); 328 | 329 | CREATE TABLE IF NOT EXISTS hourly_weather ( 330 | latitude VARCHAR(255) NOT NULL, 331 | longitude VARCHAR(255) NOT NULL, 332 | timezone VARCHAR(255) NOT NULL, 333 | datetime VARCHAR(255) NULL, 334 | temp VARCHAR(255) NULL, 335 | feels_like VARCHAR(255) NULL, 336 | pressure VARCHAR(255) NULL, 337 | humidity VARCHAR(255) NULL, 338 | dew_point VARCHAR(255) NULL, 339 | uvi VARCHAR(255) NULL, 340 | clouds VARCHAR(255) NULL, 341 | visibility VARCHAR(255) NULL, 342 | wind_speed VARCHAR(255) NULL, 343 | wind_deg VARCHAR(255) NULL, 344 | wind_gust VARCHAR(255) NULL, 345 | weather_id VARCHAR(255) NULL, 346 | weather_main VARCHAR(255) NULL, 347 | weather_description VARCHAR(255) NULL, 348 | weather_icon VARCHAR(255) NULL, 349 | PRIMARY KEY (latitude,longitude,datetime) 350 | ); 351 | 352 | ''' 353 | ) 354 | 355 | # Truncate Temp Tables 356 | with TaskGroup('truncate_temp_table_postgres') as truncate_temp_table_postgres: 357 | 358 | # Truncate location_temp Postgres 359 | truncate_location_temp_postgres = PostgresOperator( 360 | task_id='truncate_location_temp_postgres', 361 | postgres_conn_id='postgres_default', 362 | sql=''' 363 | TRUNCATE TABLE location_tmp; 364 | ''' 365 | ) 366 | 367 | # Truncate current_weather_temp Postgres 368 | truncate_current_weather_temp_postgres = PostgresOperator( 369 | task_id='truncate_current_weather_temp_postgres', 370 | postgres_conn_id='postgres_default', 371 | sql=''' 372 | TRUNCATE TABLE current_weather_tmp; 373 | ''' 374 | ) 375 | 376 | # Truncate hourly_weather_temp Postgres 377 | truncate_hourly_weather_temp_postgres = PostgresOperator( 378 | task_id='truncate_hourly_weather_temp_postgres', 379 | postgres_conn_id='postgres_default', 380 | sql=''' 381 | TRUNCATE TABLE hourly_weather_tmp; 382 | ''' 383 | ) 384 | 385 | # Process Location Data 386 | process_location_csv = PythonOperator( 387 | task_id='process_location_csv', 388 | python_callable=_process_location_csv_iterative 389 | ) 390 | 391 | # Spark Submit 392 | spark_process_weather = SparkSubmitOperator( 393 | application=f'{weather_data_spark_code}', task_id="spark_process_weather" 394 | ) 395 | 396 | # TaskGroup for Storing processed data into postgres temp tables 397 | with TaskGroup('store_processed_temp_data_in_postgres') as store_processed_temp_data_in_postgres: 398 | 399 | store_location_tmp_postgres = PostgresOperator( 400 | task_id='store_location_tmp_postgres', 401 | postgres_conn_id='postgres_default', 402 | sql=f''' 403 | COPY location_tmp 
404 | FROM '{tmp_data_dir}location.csv' 405 | DELIMITER ',' 406 | ; 407 | ''' 408 | ) 409 | 410 | store_current_weather_tmp_postgres = PostgresOperator( 411 | task_id='store_current_weather_tmp_postgres', 412 | postgres_conn_id='postgres_default', 413 | sql=''' 414 | COPY current_weather_tmp 415 | FROM '%s' 416 | DELIMITER ',' 417 | ; 418 | ''' % get_current_weather_file() 419 | ) 420 | 421 | store_hourly_weather_tmp_postgres = PostgresOperator( 422 | task_id='store_hourly_weather_tmp_postgres', 423 | postgres_conn_id='postgres_default', 424 | sql=''' 425 | COPY hourly_weather_tmp 426 | FROM '%s' 427 | DELIMITER ',' 428 | ; 429 | ''' % get_hourly_weather_file() 430 | ) 431 | 432 | # TaskGroup for Storing from temp tables to original tables 433 | with TaskGroup('copy_from_tmp_table_to_original_table') as copy_from_tmp_table_to_original_table: 434 | 435 | copy_location_tmp_to_location = PostgresOperator( 436 | task_id='copy_location_tmp_to_location', 437 | postgres_conn_id='postgres_default', 438 | sql=''' 439 | INSERT INTO location 440 | SELECT * 441 | FROM location_tmp 442 | EXCEPT 443 | SELECT * 444 | FROM location 445 | ON CONFLICT (latitude,longitude) DO NOTHING; 446 | ''' 447 | ) 448 | 449 | copy_current_weather_tmp_to_current_weather = PostgresOperator( 450 | task_id='copy_current_weather_tmp_to_current_weather', 451 | postgres_conn_id='postgres_default', 452 | sql=''' 453 | INSERT INTO current_weather 454 | SELECT * 455 | FROM current_weather_tmp 456 | EXCEPT 457 | SELECT * 458 | FROM current_weather 459 | ON CONFLICT (latitude,longitude,requested_datetime) DO NOTHING; 460 | ''' 461 | ) 462 | 463 | copy_hourly_weather_tmp_to_current_weather = PostgresOperator( 464 | task_id='copy_hourly_weather_tmp_to_current_weather', 465 | postgres_conn_id='postgres_default', 466 | sql=''' 467 | INSERT INTO hourly_weather 468 | SELECT * 469 | FROM hourly_weather_tmp 470 | EXCEPT 471 | SELECT * 472 | FROM hourly_weather 473 | ON CONFLICT (latitude,longitude,datetime) DO NOTHING; 474 | ''' 475 | ) 476 | 477 | # TaskGroup for Creating Postgres Views 478 | with TaskGroup('create_materialized_views') as create_materialized_views: 479 | # Create View for DataSet 1 480 | create_view_dataset_1 = PostgresOperator( 481 | task_id='create_view_dataset_1', 482 | postgres_conn_id='postgres_default', 483 | sql=''' 484 | CREATE OR REPLACE VIEW VW_DATASET_1 485 | AS 486 | SELECT 487 | loc.country AS Country, 488 | loc.state AS State, 489 | loc.city AS City, 490 | CAST(hw.datetime AS DATE) AS Date, 491 | EXTRACT(MONTH FROM CAST(hw.datetime AS DATE)) AS Month, 492 | MAX(CAST(hw.temp AS DECIMAL)) AS Max_Temperature 493 | FROM location loc, hourly_weather hw 494 | WHERE ROUND(CAST(loc.latitude AS DECIMAL),4) = ROUND(CAST(hw.latitude AS DECIMAL),4) 495 | AND ROUND(CAST(loc.longitude AS DECIMAL),4) = ROUND(CAST(hw.longitude AS DECIMAL),4) 496 | GROUP BY City,State,Country,Date,Month 497 | ORDER BY Date DESC; 498 | ''' 499 | ) 500 | 501 | # Create View for DataSet 2 502 | create_view_dataset_2 = PostgresOperator( 503 | task_id='create_view_dataset_2', 504 | postgres_conn_id='postgres_default', 505 | sql=''' 506 | CREATE OR REPLACE VIEW VW_DATASET_2 507 | AS 508 | SELECT 509 | loc.country AS Country, 510 | loc.state AS State, 511 | loc.city AS City, 512 | CAST(hw.datetime AS DATE) AS Date, 513 | MAX(CAST(hw.temp AS DECIMAL)) AS Max_Temperature, 514 | MIN(CAST(hw.temp AS DECIMAL)) AS Min_Temperature, 515 | ROUND(AVG(CAST(hw.temp AS DECIMAL)),2) AS Average_Temperature 516 | FROM location loc, hourly_weather hw 517 | WHERE 
ROUND(CAST(loc.latitude AS DECIMAL),4) = ROUND(CAST(hw.latitude AS DECIMAL),4) 518 | AND ROUND(CAST(loc.longitude AS DECIMAL),4) = ROUND(CAST(hw.longitude AS DECIMAL),4) 519 | GROUP BY City,State,Country,Date 520 | ORDER BY Date DESC; 521 | ''' 522 | ) 523 | 524 | # Pre Cleanup task 525 | pre_cleanup= BashOperator( 526 | task_id='pre_cleanup', 527 | bash_command=f'rm -rf {tmp_data_dir}' 528 | ) 529 | 530 | # Post Cleanup task 531 | post_cleanup= BashOperator( 532 | task_id='post_cleanup', 533 | bash_command=f'rm -r {tmp_data_dir}' 534 | ) 535 | 536 | # DAG Dependencies 537 | start >> pre_cleanup >> tmp_data >> check_api >> [extracting_weather,api_not_available] 538 | extracting_weather >> create_postgres_tables >> truncate_temp_table_postgres >> process_location_csv >> spark_process_weather 539 | spark_process_weather >> store_processed_temp_data_in_postgres >> copy_from_tmp_table_to_original_table >> create_materialized_views >> post_cleanup -------------------------------------------------------------------------------- /databricks/notebooks/opendentalfhir.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ### Open Dental API - FHIR r4 implementation 4 | # MAGIC
Matheus Pavanetti
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC ### Data Architecture 10 | # MAGIC  11 | # MAGIC 12 | # MAGIC - **Bronze** tables contain raw data ingested from various sources (JSON data from open dental api call). 13 | # MAGIC 14 | # MAGIC - **Silver** tables provide a more refined view of our data. (parsed json, cleaned data, schema enforcement). 15 | # MAGIC 16 | # MAGIC - **Gold** tables provide business level aggregates often used for reporting and dashboarding. 17 | # MAGIC 18 | # MAGIC ### Documentation Reference 19 | # MAGIC - **API Implementation** https://www.opendental.com/site/apiimplementation.html 20 | # MAGIC 21 | # MAGIC - **Open Dental Specs FHIR** https://www.opendental.com/resources/OpenDentalFHIR19-3Spec.pdf 22 | # MAGIC 23 | # MAGIC - **FHIR Standard** http://hl7.org/fhir/R4/ 24 | 25 | # COMMAND ---------- 26 | 27 | # DBTITLE 1,Setting up storage account on spark session 28 | # Run this to Authenticate on storage account 29 | spark.conf.set("fs.azure.account.auth.type", "OAuth") 30 | spark.conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") 31 | spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get('FILLIN', 'FILLIN')) 32 | spark.conf.set("fs.azure.account.oauth2.client.secret", dbutils.secrets.get('FILLIN', 'FILLIN')) 33 | spark.conf.set("fs.azure.account.oauth2.client.endpoint", f"https://login.microsoftonline.com/{dbutils.secrets.get('FILLIN', 'FILLIN')}/oauth2/token") 34 | 35 | # COMMAND ---------- 36 | 37 | # DBTITLE 1,Environment Class 38 | import requests 39 | import json 40 | import pandas as pd 41 | import os 42 | from datetime import datetime 43 | 44 | class Env: 45 | 46 | # Datalake 47 | datalakeDir="abfss://datalake@FILLIN.dfs.core.windows.net/" 48 | workdir=f"{datalakeDir}opendentalfhir/" 49 | 50 | # Unity Catalog 51 | catalog="FILLIN" 52 | catalog_bronze=f"{catalog}.bronze" 53 | catalog_silver=f"{catalog}.silver" 54 | catalog_gold=f"{catalog}.gold" 55 | 56 | # Driver 57 | tmp_dir = "/tmp/opendentalfhir/" 58 | 59 | # Multi-Hop Architecture variables. 60 | raw=f"{workdir}raw/" 61 | bronze=f"{workdir}bronze/" 62 | silver=f"{workdir}silver/" 63 | gold=f"{workdir}gold/" 64 | 65 | # Open Dental API variables. 66 | endpoint="https://api.opendental.com/fhir/v2/" 67 | token="ODFHIR NFF6i0KrXrxDkZHt/VzkmZEaUWOjnQX2z" 68 | resources=["organization","location", 69 | "patient","procedure","practitioner", 70 | "ServiceRequest"] 71 | 72 | def __init__(self): 73 | self.tmp_dir = Env.tmp_dir 74 | 75 | def createTmp(self): 76 | if not os.path.exists(self.tmp_dir): 77 | os.mkdir(self.tmp_dir) 78 | 79 | def wipeTmp(self): 80 | if os.path.exists(self.tmp_dir): 81 | for file in os.listdir(self.tmp_dir): 82 | os.remove(os.path.join(self.tmp_dir, file)) 83 | 84 | 85 | # COMMAND ---------- 86 | 87 | # DBTITLE 1,Extract Class 88 | class Extract(Env): 89 | 90 | def __init__(self): 91 | Env.__init__(self) 92 | Env.createTmp(self) 93 | 94 | def cleanRaw(self): 95 | return dbutils.fs.rm(Env.raw,True) 96 | 97 | def cleanTmp(self): 98 | return Env.wipeTmp() 99 | 100 | def pullData(self): 101 | headers={"Accept": "application/json", 102 | "Content-Type": "application/json", 103 | "Authorization": Env.token} 104 | 105 | Env.wipeTmp(self) 106 | 107 | for item in Env.resources: 108 | time = datetime.now().strftime(("%Y%m%d%H%M%S")) # current date and time of file download (UTC). 
109 | result = requests.get(f"{Env.endpoint}{item}",headers=headers) 110 | 111 | if result.status_code == 200: 112 | resultjson = json.loads(result.text) 113 | pd_df = pd.json_normalize(resultjson) 114 | 115 | # Download temp json 116 | pd_df.to_json(f"{Env.tmp_dir}/{item}_{time}.json") 117 | 118 | # Upload Datalake 119 | dbutils.fs.cp(f"file:////{Env.tmp_dir}/{item}_{time}.json",f"{Env.raw}{item}/{item}_{time}.json") 120 | print(f"RAW Downloaded {resultjson['total']} {item} records at file: {Env.raw}{item}/{item}_{time}.json") 121 | 122 | 123 | # COMMAND ---------- 124 | 125 | # DBTITLE 1,Transform Class 126 | #from pyspark.sql.functions import explode, cast, flatten, collect_set 127 | 128 | class Transform(Env): 129 | 130 | def __init__(self): 131 | Env.__init__(self) 132 | 133 | def rawToBronze(self): 134 | for item in Env.resources: 135 | df = (spark.read.json(f"{Env.raw}{item}/")) 136 | transformed_df = df.selectExpr("explode(flatten(collect_set(entry.*))) as data") 137 | transformed_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(f"{Env.bronze}{item}/")#.saveAsTable(f"{Env.catalog_bronze}.{item}") 138 | print(f"BRONZE records for entity {item} were persisted at {Env.bronze}{item}/") 139 | 140 | def patientToSilver(self): 141 | df = (spark.read.format("delta").load(f"{Env.bronze}/patient")) 142 | 143 | transformed_df = (df.selectExpr("CAST(data.resource.id as LONG) as id", 144 | "data.resource.active as active", 145 | "data.resource.name as name", 146 | "data.resource.resourceType as resourceType", 147 | "data.resource.address as address", 148 | "CAST(data.resource.birthdate as DATE) as birthdate", 149 | "data.resource.careProvider as careProvider", 150 | "data.resource.gender as gender", 151 | "data.resource.identifier as identifier", 152 | "data.resource.managingOrganization as managingOrganization", 153 | "data.resource.maritalStatus as maritalStatus", 154 | "data.resource.meta as meta", 155 | "data.resource.telecom as telecom", 156 | "data.search as search") 157 | .distinct()) 158 | (transformed_df 159 | .write 160 | .mode("overwrite") 161 | .save(f"{Env.silver}patient/")) 162 | #.saveAsTable(f"{Env.catalog_silver}.patient")) 163 | print(f"SILVER patient records were persisted at {Env.silver}patient") 164 | 165 | 166 | # COMMAND ---------- 167 | 168 | # Main File 169 | if __name__ == "__main__": 170 | 171 | ext = Extract() 172 | trans = Transform() 173 | 174 | #ext.cleanRaw() 175 | 176 | ext.pullData() 177 | trans.rawToBronze() 178 | 179 | trans.patientToSilver() 180 | 181 | -------------------------------------------------------------------------------- /docker-compose-celery.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.7.1 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 39 | # Use this option ONLY for quick checks. Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | version: '3.8' 48 | x-airflow-common: 49 | &airflow-common 50 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 51 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 52 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 53 | #image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.7.1} 54 | build: ./celery 55 | environment: 56 | &airflow-common-env 57 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 58 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 59 | # For backward compatibility, with Airflow <2.3 60 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 61 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 62 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 63 | AIRFLOW__CORE__FERNET_KEY: '' 64 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 65 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 66 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 67 | # yamllint disable rule:line-length 68 | # Use simple http server on scheduler for health checks 69 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 70 | # yamllint enable rule:line-length 71 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 72 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 73 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 
74 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- geopy apache-airflow[postgres] apache-airflow-providers-apache-spark apache-airflow-providers-databricks} 75 | volumes: 76 | - ./dags:/opt/airflow/dags 77 | - ./logs:/opt/airflow/logs 78 | - ./data:/opt/airflow/data 79 | - ./plugins:/opt/airflow/plugins 80 | - ./spark:/opt/airflow/spark 81 | - ./variables:/opt/airflow/variables 82 | - ./jars:/opt/airflow/jars 83 | - ./config:/opt/airflow/config 84 | 85 | user: "${AIRFLOW_UID:-50000}:0" 86 | depends_on: 87 | &airflow-common-depends-on 88 | redis: 89 | condition: service_healthy 90 | postgres: 91 | condition: service_healthy 92 | 93 | services: 94 | postgres: 95 | image: postgres:13 96 | environment: 97 | POSTGRES_USER: airflow 98 | POSTGRES_PASSWORD: airflow 99 | POSTGRES_DB: airflow 100 | volumes: 101 | - postgres-db-volume-celery:/var/lib/postgresql/data 102 | - ./data:/opt/airflow/data 103 | healthcheck: 104 | test: ["CMD", "pg_isready", "-U", "airflow"] 105 | interval: 10s 106 | retries: 5 107 | start_period: 5s 108 | restart: always 109 | 110 | pgadmin: 111 | image: dpage/pgadmin4 112 | environment: 113 | PGADMIN_DEFAULT_EMAIL: "postgres@email.com" 114 | PGADMIN_DEFAULT_PASSWORD: "postgres" 115 | ports: 116 | - "15432:80" 117 | depends_on: 118 | - postgres 119 | volumes: 120 | - ./data:/opt/airflow/data 121 | 122 | redis: 123 | image: redis:latest 124 | expose: 125 | - 6379 126 | healthcheck: 127 | test: ["CMD", "redis-cli", "ping"] 128 | interval: 10s 129 | timeout: 30s 130 | retries: 50 131 | start_period: 30s 132 | restart: always 133 | 134 | airflow-webserver: 135 | <<: *airflow-common 136 | command: webserver 137 | ports: 138 | - "8080:8080" 139 | healthcheck: 140 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 141 | interval: 30s 142 | timeout: 10s 143 | retries: 5 144 | start_period: 30s 145 | restart: always 146 | depends_on: 147 | <<: *airflow-common-depends-on 148 | airflow-init: 149 | condition: service_completed_successfully 150 | 151 | airflow-scheduler: 152 | <<: *airflow-common 153 | command: scheduler 154 | healthcheck: 155 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 156 | interval: 30s 157 | timeout: 10s 158 | retries: 5 159 | start_period: 30s 160 | restart: always 161 | depends_on: 162 | <<: *airflow-common-depends-on 163 | airflow-init: 164 | condition: service_completed_successfully 165 | 166 | airflow-worker: 167 | <<: *airflow-common 168 | command: celery worker 169 | healthcheck: 170 | # yamllint disable rule:line-length 171 | test: 172 | - "CMD-SHELL" 173 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 174 | interval: 30s 175 | timeout: 10s 176 | retries: 5 177 | start_period: 30s 178 | environment: 179 | <<: *airflow-common-env 180 | # Required to handle warm shutdown of the celery workers properly 181 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 182 | DUMB_INIT_SETSID: "0" 183 | restart: always 184 | depends_on: 185 | <<: *airflow-common-depends-on 186 | airflow-init: 187 | condition: service_completed_successfully 188 | 189 | airflow-triggerer: 190 | <<: *airflow-common 191 | command: triggerer 192 | healthcheck: 193 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 194 | interval: 30s 195 | timeout: 10s 196 | retries: 5 197 | start_period: 30s 198 | restart: 
always 199 | depends_on: 200 | <<: *airflow-common-depends-on 201 | airflow-init: 202 | condition: service_completed_successfully 203 | 204 | airflow-init: 205 | <<: *airflow-common 206 | entrypoint: /bin/bash 207 | # yamllint disable rule:line-length 208 | command: 209 | - -c 210 | - | 211 | function ver() { 212 | printf "%04d%04d%04d%04d" $${1//./ } 213 | } 214 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 215 | airflow_version_comparable=$$(ver $${airflow_version}) 216 | min_airflow_version=2.2.0 217 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 218 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 219 | echo 220 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 221 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 222 | echo 223 | exit 1 224 | fi 225 | if [[ -z "${AIRFLOW_UID}" ]]; then 226 | echo 227 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 228 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 229 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 230 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 231 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 232 | echo 233 | fi 234 | one_meg=1048576 235 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 236 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 237 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 238 | warning_resources="false" 239 | if (( mem_available < 4000 )) ; then 240 | echo 241 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 242 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 243 | echo 244 | warning_resources="true" 245 | fi 246 | if (( cpus_available < 2 )); then 247 | echo 248 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 249 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 250 | echo 251 | warning_resources="true" 252 | fi 253 | if (( disk_available < one_meg * 10 )); then 254 | echo 255 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 256 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 257 | echo 258 | warning_resources="true" 259 | fi 260 | if [[ $${warning_resources} == "true" ]]; then 261 | echo 262 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 263 | echo "Please follow the instructions to increase amount of resources available:" 264 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 265 | echo 266 | fi 267 | mkdir -p /sources/logs /sources/dags /sources/plugins 268 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 269 | exec /entrypoint airflow version 270 | # yamllint enable rule:line-length 271 | environment: 272 | <<: *airflow-common-env 273 | _AIRFLOW_DB_MIGRATE: 'true' 274 | _AIRFLOW_WWW_USER_CREATE: 'true' 275 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 276 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 277 | _PIP_ADDITIONAL_REQUIREMENTS: '' 278 | user: "0:0" 279 | volumes: 280 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 281 | 282 | airflow-cli: 283 | <<: *airflow-common 284 | profiles: 285 | - debug 286 | environment: 287 | <<: *airflow-common-env 288 | CONNECTION_CHECK_MAX_COUNT: "0" 289 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 290 | command: 291 | - bash 292 | - -c 293 | - airflow 294 | 295 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 296 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 297 | # See: https://docs.docker.com/compose/profiles/ 298 | flower: 299 | <<: *airflow-common 300 | command: celery flower 301 | profiles: 302 | - flower 303 | ports: 304 | - "5555:5555" 305 | healthcheck: 306 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 307 | interval: 30s 308 | timeout: 10s 309 | retries: 5 310 | start_period: 30s 311 | restart: always 312 | depends_on: 313 | <<: *airflow-common-depends-on 314 | airflow-init: 315 | condition: service_completed_successfully 316 | 317 | volumes: 318 | postgres-db-volume-celery: -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | x-airflow-common: 3 | &airflow-common 4 | #image: ${AIRFLOW_IMAGE_NAME:-my-image:0.0.1} 5 | build: . 
6 | environment: 7 | &airflow-common-env 8 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 9 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 10 | AIRFLOW__CORE__FERNET_KEY: '' 11 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 12 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 13 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 14 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- geopy apache-airflow[postgres] apache-airflow-providers-apache-spark apache-airflow-providers-databricks} 15 | volumes: 16 | - ./dags:/opt/airflow/dags 17 | - ./logs:/opt/airflow/logs 18 | - ./data:/opt/airflow/data 19 | - ./plugins:/opt/airflow/plugins 20 | - ./spark:/opt/airflow/spark 21 | - ./variables:/opt/airflow/variables 22 | - ./jars:/opt/airflow/jars 23 | 24 | user: "${AIRFLOW_UID:-50000}:0" 25 | depends_on: 26 | &airflow-common-depends-on 27 | postgres: 28 | condition: service_healthy 29 | services: 30 | postgres: 31 | image: postgres:13 32 | environment: 33 | POSTGRES_USER: airflow 34 | POSTGRES_PASSWORD: airflow 35 | POSTGRES_DB: airflow 36 | ports: 37 | - 5432:5432 38 | volumes: 39 | - postgres-db-volume:/var/lib/postgresql/data 40 | - ./data:/opt/airflow/data 41 | healthcheck: 42 | test: ["CMD", "pg_isready", "-U", "airflow"] 43 | interval: 5s 44 | retries: 5 45 | restart: on-failure 46 | pgadmin: 47 | image: dpage/pgadmin4 48 | environment: 49 | PGADMIN_DEFAULT_EMAIL: "postgres@email.com" 50 | PGADMIN_DEFAULT_PASSWORD: "postgres" 51 | ports: 52 | - "15432:80" 53 | depends_on: 54 | - postgres 55 | volumes: 56 | - ./data:/opt/airflow/data 57 | 58 | airflow-webserver: 59 | <<: *airflow-common 60 | command: webserver 61 | ports: 62 | - 8080:8080 63 | healthcheck: 64 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 65 | interval: 10s 66 | timeout: 10s 67 | retries: 5 68 | restart: on-failure 69 | depends_on: 70 | <<: *airflow-common-depends-on 71 | airflow-init: 72 | condition: service_completed_successfully 73 | 74 | airflow-scheduler: 75 | <<: *airflow-common 76 | command: scheduler 77 | healthcheck: 78 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 79 | interval: 10s 80 | timeout: 10s 81 | retries: 5 82 | restart: on-failure 83 | depends_on: 84 | <<: *airflow-common-depends-on 85 | airflow-init: 86 | condition: service_completed_successfully 87 | 88 | airflow-init: 89 | <<: *airflow-common 90 | entrypoint: /bin/bash 91 | # yamllint disable rule:line-length 92 | command: 93 | - -c 94 | - | 95 | function ver() { 96 | printf "%04d%04d%04d%04d" $${1//./ } 97 | } 98 | airflow_version=$$(gosu airflow airflow version) 99 | airflow_version_comparable=$$(ver $${airflow_version}) 100 | min_airflow_version=2.2.0 101 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 102 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 103 | echo 104 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 105 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 106 | echo 107 | exit 1 108 | fi 109 | if [[ -z "${AIRFLOW_UID}" ]]; then 110 | echo 111 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 112 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 113 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 
114 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 115 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 116 | echo 117 | fi 118 | one_meg=1048576 119 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 120 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 121 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 122 | warning_resources="false" 123 | if (( mem_available < 4000 )) ; then 124 | echo 125 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 126 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 127 | echo 128 | warning_resources="true" 129 | fi 130 | if (( cpus_available < 2 )); then 131 | echo 132 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 133 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 134 | echo 135 | warning_resources="true" 136 | fi 137 | if (( disk_available < one_meg * 10 )); then 138 | echo 139 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 140 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 141 | echo 142 | warning_resources="true" 143 | fi 144 | if [[ $${warning_resources} == "true" ]]; then 145 | echo 146 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 147 | echo "Please follow the instructions to increase amount of resources available:" 148 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 149 | echo 150 | fi 151 | mkdir -p /sources/logs /sources/dags /sources/plugins 152 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 153 | chmod -R 777 /opt/airflow/ 154 | exec /entrypoint airflow version 155 | # yamllint enable rule:line-length 156 | environment: 157 | <<: *airflow-common-env 158 | _AIRFLOW_DB_UPGRADE: 'true' 159 | _AIRFLOW_WWW_USER_CREATE: 'true' 160 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 161 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 162 | user: "0:0" 163 | volumes: 164 | - .:/sources 165 | 166 | airflow-cli: 167 | <<: *airflow-common 168 | profiles: 169 | - debug 170 | environment: 171 | <<: *airflow-common-env 172 | CONNECTION_CHECK_MAX_COUNT: "0" 173 | # Workaround for entrypoint issue. 
--------------------------------------------------------------------------------
/img/DAG.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/DAG.JPG


--------------------------------------------------------------------------------
/img/airflowcelery.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/airflowcelery.png


--------------------------------------------------------------------------------
/img/airflowcelerydags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/airflowcelerydags.png


--------------------------------------------------------------------------------
/img/views.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/views.JPG


--------------------------------------------------------------------------------
/jars/postgresql-42.6.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/jars/postgresql-42.6.0.jar


--------------------------------------------------------------------------------
/pgadmin.json:
--------------------------------------------------------------------------------
{
  "Servers": {
    "1": {
      "Name": "postgres@email.com",
      "Group": "Servers",
      "Host": "postgres",
      "Port": 5432,
      "MaintenanceDB": "airflow",
      "Username": "airflow",
      "SSLMode": "prefer",
      "PassFile": "/pgpassfile"
    }
  }
}


--------------------------------------------------------------------------------
/spark/spark_postgres_query.py:
--------------------------------------------------------------------------------
from airflow.models import Variable
from pyspark.sql import SparkSession

if __name__ == '__main__':
    spark_dir = Variable.get("spark_dir")

    # Start Spark Session
    spark = (SparkSession
             .builder
             .master("local[1]")
             .appName("postgres_query")
             .getOrCreate())

    # JDBC reader pointed at the stack's Postgres service
    pg_read = (spark.read
               .format("jdbc")
               .option("url", "jdbc:postgresql://postgres:5432/airflow")
               .option("user", "airflow")
               .option("password", "airflow")
               .option("driver", "org.postgresql.Driver"))

    # Query the first aggregated view and print a sample plus the schema
    dataset_df = (pg_read
                  .option("query", "SELECT * FROM VW_DATASET_1")
                  .load())

    dataset_df.show(15, False)
    dataset_df.printSchema()
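Both Spark scripts read Airflow Variables, so they are meant to be launched through Airflow rather than with a bare `spark-submit`. The DAGs in `dags/` (not shown in this section) presumably hand `spark_postgres_query.py` to the `SparkSubmitOperator` together with the JDBC driver mounted under `/opt/airflow/jars`. A hedged sketch of such a task, using only the `spark_default` connection and the `spark_dir`/`jars_dir` variables shipped in `variables/`, could look like this:

```
# Hedged sketch of a submit task; the dag_id and schedule are illustrative,
# the real task definitions live in the repository's dags/ folder.
from datetime import datetime

from airflow import DAG
from airflow.models import Variable
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

with DAG(
    dag_id="spark_postgres_query_example",   # hypothetical DAG id
    start_date=datetime(2023, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    query_postgres = SparkSubmitOperator(
        task_id="spark_postgres_query",
        conn_id="spark_default",                                    # from airflow_connections.json
        application=f"{Variable.get('spark_dir')}spark_postgres_query.py",
        jars=f"{Variable.get('jars_dir')}postgresql-42.6.0.jar",    # driver shipped under /jars
        verbose=False,
    )
```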
--------------------------------------------------------------------------------
/spark/spark_weather_data.py:
--------------------------------------------------------------------------------
# Airflow Imports
from airflow.models import Variable

# Spark Imports
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import col, explode, element_at, expr, unix_timestamp, to_timestamp, to_date, regexp_replace

if __name__ == '__main__':
    tmp_data_dir = Variable.get("weather_data_tmp_directory")

    # Start Spark Session
    spark = (SparkSession
             .builder
             .master("local[2]")
             .appName("weather_data")
             .getOrCreate())

    # Read Data From Weather folder
    df = spark.read.format("json") \
        .option('inferSchema', True) \
        .load(f'{tmp_data_dir}weather/') \
        .drop("timezone_offset")

    # Persist Data (MEMORY_AND_DISK)
    df.persist()

    # Add and process columns for df_hourly
    df_hourly = df.withColumn('hourly', explode(col('hourly'))) \
        .withColumn("datetime", to_timestamp(expr("hourly.dt"))) \
        .withColumn("temp", expr("hourly.temp")) \
        .withColumn("feels_like", expr("hourly.feels_like")) \
        .withColumn("pressure", expr("hourly.pressure")) \
        .withColumn("humidity", expr("hourly.humidity")) \
        .withColumn("dew_point", expr("hourly.dew_point")) \
        .withColumn("uvi", expr("hourly.uvi")) \
        .withColumn("clouds", expr("hourly.clouds")) \
        .withColumn("visibility", expr("hourly.visibility")) \
        .withColumn("wind_speed", expr("hourly.wind_speed")) \
        .withColumn("wind_deg", expr("hourly.wind_deg")) \
        .withColumn("wind_gust", expr("hourly.wind_gust")) \
        .withColumn("weather_id", expr("hourly.weather.id")) \
        .withColumn("weather_id", element_at(col("weather_id"), 1)) \
        .withColumn("weather_main", expr("hourly.weather.main")) \
        .withColumn("weather_main", element_at(col("weather_main"), 1)) \
        .withColumn("weather_description", expr("hourly.weather.description")) \
        .withColumn("weather_description", element_at(col("weather_description"), 1)) \
        .withColumn("weather_icon", expr("hourly.weather.icon")) \
        .withColumn("weather_icon", element_at(col("weather_icon"), 1)) \
        .withColumnRenamed('lat', 'latitude') \
        .withColumnRenamed('lon', 'longitude') \
        .drop("hourly", "current") \
        .coalesce(1)

    # Add and process columns for df_current
    df_current = df.withColumn("datetime", to_timestamp(expr("current.dt"))) \
        .withColumn("sunrise", to_timestamp(expr("current.sunrise"))) \
        .withColumn("sunset", to_timestamp(expr("current.sunset"))) \
        .withColumn("temp", expr("current.temp")) \
        .withColumn("feels_like", expr("current.feels_like")) \
        .withColumn("pressure", expr("current.pressure")) \
        .withColumn("humidity", expr("current.humidity")) \
        .withColumn("dew_point", expr("current.dew_point")) \
        .withColumn("uvi", expr("current.uvi")) \
        .withColumn("clouds", expr("current.clouds")) \
        .withColumn("visibility", expr("current.visibility")) \
        .withColumn("wind_speed", expr("current.wind_speed")) \
        .withColumn("wind_deg", expr("current.wind_deg")) \
        .withColumn("weather_id", expr("current.weather.id")) \
        .withColumn("weather_id", element_at(col("weather_id"), 1)) \
        .withColumn("weather_main", expr("current.weather.main")) \
        .withColumn("weather_main", element_at(col("weather_main"), 1)) \
        .withColumn("weather_description", expr("current.weather.description")) \
        .withColumn("weather_description", element_at(col("weather_description"), 1)) \
        .withColumn("weather_icon", expr("current.weather.icon")) \
        .withColumn("weather_icon", element_at(col("weather_icon"), 1)) \
        .withColumnRenamed('lat', 'latitude') \
        .withColumnRenamed('lon', 'longitude') \
        .drop("hourly", "current") \
        .coalesce(1)

    # Write df_current as a single headerless CSV
    df_current.write \
        .format('csv') \
        .mode('overwrite') \
        .option('header', False) \
        .option('sep', ',') \
        .save(f'{tmp_data_dir}processed/current_weather/')

    # df_current.show(10)

    # Write df_hourly as a single headerless CSV
    df_hourly.write \
        .format('csv') \
        .mode('overwrite') \
        .option('header', False) \
        .option('sep', ',') \
        .save(f'{tmp_data_dir}processed/hourly_weather/')

    # df_hourly.show(10)
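`spark_weather_data.py` writes headerless, comma-separated CSVs under `{tmp_data_dir}processed/`, so the Postgres ingestion step has to supply the table and column definitions itself. How the repository actually loads the files lives in the DAGs, but a rough sketch of copying one part file through the `postgres_default` connection could look like the following (the `current_weather` table name is illustrative):

```
# Hedged sketch; "current_weather" and its implied columns are illustrative,
# the repository's own ingestion is defined in the DAG's Postgres tasks.
import glob

from airflow.providers.postgres.hooks.postgres import PostgresHook

# Spark's coalesce(1) leaves a single part file per output folder.
csv_part = glob.glob("/opt/airflow/data/tmp/processed/current_weather/part-*.csv")[0]

hook = PostgresHook(postgres_conn_id="postgres_default")
hook.copy_expert(
    sql="COPY current_weather FROM STDIN WITH (FORMAT csv, DELIMITER ',')",
    filename=csv_part,
)
```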
--------------------------------------------------------------------------------
/variables/airflow_connections.json:
--------------------------------------------------------------------------------
{
  "postgres_default": {
    "conn_type": "postgres",
    "host": "postgres",
    "login": "airflow",
    "password": "airflow",
    "schema": "airflow",
    "port": "5432",
    "extra": null
  },

  "openweathermapApi": {
    "conn_type": "HTTP",
    "host": "http://api.openweathermap.org/",
    "login": null,
    "password": "FILL UP WITH YOUR TOKEN",
    "schema": null,
    "port": null,
    "extra": null
  },

  "spark_default": {
    "conn_type": "Spark",
    "host": "spark://spark",
    "login": null,
    "password": null,
    "schema": null,
    "port": "7077",
    "extra": null
  },

  "databricks_default": {
    "conn_type": "databricks",
    "host": "https://ACCOUNTID.azuredatabricks.net/",
    "login": "token",
    "password": "FILL UP WITH YOUR TOKEN",
    "schema": null,
    "port": null,
    "extra": null
  }
}


--------------------------------------------------------------------------------
/variables/airflow_variables.json:
--------------------------------------------------------------------------------
{
  "weather_data_lat": "",
  "weather_data_lon": "",
  "weather_data_endpoint": "data/2.5/onecall/timemachine",
  "weather_data_units": "metric",
  "weather_data_tmp_directory": "/opt/airflow/data/tmp/",
  "weather_data_spark_code": "/opt/airflow/spark/spark_weather_data.py",
  "spark_dir": "/opt/airflow/spark/",
  "jars_dir": "/opt/airflow/jars/"
}
--------------------------------------------------------------------------------
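The variables above parameterize the extraction: the endpoint, units, and the temporary directory that `spark_weather_data.py` later reads from, while the API token sits in the `openweathermapApi` connection. The production logic lives in `dags/weather_data.py`, but one extraction call presumably looks roughly like this (coordinates, timestamp, and output file name are placeholders; parameter names follow the OpenWeatherMap timemachine API):

```
# Rough sketch of a single extraction call; coordinates, timestamp and output
# file name are placeholders, the production code lives in dags/weather_data.py.
import json
import time

import requests
from airflow.hooks.base import BaseHook
from airflow.models import Variable

conn = BaseHook.get_connection("openweathermapApi")
url = f"{conn.host}{Variable.get('weather_data_endpoint')}"

params = {
    "lat": -23.55,                           # placeholder location
    "lon": -46.63,
    "dt": int(time.time()) - 86400,          # one day back, in epoch seconds
    "units": Variable.get("weather_data_units"),
    "appid": conn.password,                  # token stored in the connection
}

response = requests.get(url, params=params, timeout=30)
response.raise_for_status()

tmp_data_dir = Variable.get("weather_data_tmp_directory")
with open(f"{tmp_data_dir}weather/sample.json", "w") as out:   # folder later read by spark_weather_data.py
    json.dump(response.json(), out)
```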