├── .env
├── .gitignore
├── Dockerfile
├── README.md
├── celery
│   └── Dockerfile
├── dags
│   ├── ETL_Databricks.py
│   ├── decorator.py
│   ├── postgres_spark.py
│   ├── sampleTaskFlow.py
│   └── weather_data.py
├── databricks
│   └── notebooks
│       └── opendentalfhir.py
├── docker-compose-celery.yaml
├── docker-compose.yaml
├── filemanager
│   └── index.php
├── img
│   ├── DAG.JPG
│   ├── airflowcelery.png
│   ├── airflowcelerydags.png
│   └── views.JPG
├── jars
│   └── postgresql-42.6.0.jar
├── pgadmin.json
├── spark
│   ├── spark_postgres_query.py
│   └── spark_weather_data.py
└── variables
    ├── airflow_connections.json
    └── airflow_variables.json

/.env:
--------------------------------------------------------------------------------
1 | AIRFLOW_UID=1000
2 | AIRFLOW_GID=0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dags/__pycache__
2 | logs/*
3 | data/*
4 | .databricks/*
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:2.2.3
2 | 
3 | USER root
4 | 
5 | # Install OpenJDK-11
6 | RUN apt update && \
7 |     apt-get install -y openjdk-11-jdk && \
8 |     apt-get install -y ant && \
9 |     apt-get clean;
10 | 
11 | # Set JAVA_HOME
12 | ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/
13 | RUN export JAVA_HOME
14 | 
15 | USER airflow
16 | 
17 | WORKDIR /app
18 | 
19 | COPY variables/airflow_variables.json /app
20 | COPY variables/airflow_connections.json /app
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Apache Airflow data pipeline
2 | ## Consuming the Weather API and Storing Data in a PostgreSQL Database.
3 | 
4 | ### This set of code and instructions has the purpose of instantiating a complete environment from a set of Docker images: Airflow webserver, Airflow scheduler, PostgreSQL, and PySpark.
5 | 
6 | ## Overview
7 | Extract the last 5 days of data from the free API: https://api.openweathermap.org/data/2.5/onecall/timemachine (historical weather data) for 10 different locations.
8 | 
9 | Build a repository of data where we will keep the data extracted from the API. This repository should only hold deduplicated data. Idempotency should also be guaranteed.
10 | 
11 | Build another repository of data that will contain the results of the following calculations from the data stored in step 2.
12 | A dataset containing the location, date and temperature of the highest temperatures reported by location and month.
13 | A dataset containing the average temperature, min temperature, location of min temperature, and location of max temperature per day.
14 | Remote infrastructure with all components required for the Python workflow to work.
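For reference, the extraction step boils down to one HTTP GET per location and day against the timemachine endpoint above. Below is a minimal standalone sketch of that call; the coordinates are the first entry of the DAG's suggested-locations list, and `YOUR_OPENWEATHERMAP_API_KEY` is a placeholder you must replace with your own token.

```
import time
from datetime import datetime, timedelta

import requests

API_KEY = "YOUR_OPENWEATHERMAP_API_KEY"  # placeholder: use your own OpenWeatherMap token
ENDPOINT = "https://api.openweathermap.org/data/2.5/onecall/timemachine"

# One location (Jacksonville, FL from the suggested list) and one day, 5 days back.
day = datetime.today() - timedelta(days=5)
params = {
    "lat": "30.318878",
    "lon": "-81.690173",
    "units": "metric",                        # Celsius; "imperial" gives Fahrenheit
    "dt": int(time.mktime(day.timetuple())),  # unix timestamp of the requested day
    "appid": API_KEY,
}

response = requests.get(ENDPOINT, params=params)
response.raise_for_status()
payload = response.json()
print(payload.get("timezone"), "hourly records:", len(payload.get("hourly", [])))
```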
15 | 
16 | 
17 | The Airflow DAG extracts historical data from the weather API for 10 different locations (5 days per location) and stores it locally. It then does light pre-processing and a lookup against the geopy API to enrich the data with location details, also storing the pre-processed JSON files locally. The heavier processing is handed to local Spark through the Airflow Spark Submit operator: the Spark script runs on the local Spark cluster, performs all the data processing, filtering and aggregations, and writes the results back to the local filesystem. Once the files are stored back as CSV files, PostgreSQL scripts copy (ingest) them into the PostgreSQL database, and finally views are created with the aggregated queries to answer the requirements.
18 | 
19 | In addition, the Dockerfile and docker-compose scripts are in charge of spinning up containers with everything needed to run the Airflow DAG successfully end to end.
20 | 
21 | 
22 | Remember to sign up for an account at https://home.openweathermap.org/ and create an API token.
23 | 
24 | 
25 | ## Requirements
26 | * Linux operating system (Ubuntu 20.04 preferred), or Windows with Docker Desktop.
27 | * [Git Versioning Software](https://git-scm.com/download/linux)
28 | * [Docker Container Virtualization Software](https://docs.docker.com/engine/install/ubuntu/)
29 | * [Docker-Compose multi-container Docker applications](https://docs.docker.com/compose/install/)
30 | 
31 | 
32 | 
33 | ## Instructions Airflow 2.7.1 (CeleryExecutor) Heavier Version
34 | Clone this repository into your Linux working directory and navigate into it.
35 | 
36 | Run the commands:
37 | ```
38 | 
39 | # Clone Git repository to the created folder
40 | git clone https://github.com/mpavanetti/airflow .
41 | 
42 | # Run docker compose
43 | sudo docker-compose -f docker-compose-celery.yaml up -d
44 | 
45 | # Import Airflow connections and variables
46 | sudo docker exec -it airflow_airflow-worker_1 airflow connections import /opt/airflow/variables/airflow_connections.json
47 | sudo docker exec -it airflow_airflow-worker_1 airflow variables import /opt/airflow/variables/airflow_variables.json
48 | 
49 | # Add permissions (If any write error happens)
50 | sudo chmod -R 777 ../airflow
51 | 
52 | # Stop containers
53 | sudo docker-compose -f docker-compose-celery.yaml kill
54 | ```
55 | 
56 | 
57 | 
58 | ## Instructions Airflow 2.2.3 (LocalExecutor) Lighter Version
59 | Clone this repository into your Linux working directory and navigate into it.
60 | 
61 | Run the commands:
62 | ```
63 | 
64 | # Clone Git repository to the created folder
65 | git clone https://github.com/mpavanetti/airflow .
66 | 
67 | # Run docker compose
68 | sudo docker-compose up -d
69 | 
70 | # Import Airflow connections and variables
71 | sudo docker exec -it airflow_airflow-scheduler_1 airflow connections import /opt/airflow/variables/airflow_connections.json
72 | sudo docker exec -it airflow_airflow-scheduler_1 airflow variables import /opt/airflow/variables/airflow_variables.json
73 | 
74 | # Add permissions (If any write error happens)
75 | sudo chmod -R 755 ../airflow
76 | ```
77 | 
78 | In case you have any issues while importing the Airflow connections and variables, take the JSON files and import them manually.
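If the CLI import keeps failing, the snippet below is a small fallback sketch: run it with python inside one of the Airflow containers (the repository's ./variables folder is mounted at /opt/airflow/variables), and it assumes the variables file is a flat JSON object of string values, which is the format `airflow variables import` expects. Connections can likewise be added by hand in the Airflow UI under Admin > Connections.

```
import json

from airflow.models import Variable

# Read the key/value pairs shipped in this repository and store each one
# in the Airflow metadata database.
with open("/opt/airflow/variables/airflow_variables.json") as handle:
    for key, value in json.load(handle).items():
        Variable.set(key, value)
        print(f"Set Airflow variable: {key}")
```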
79 | 
80 | Note that you can manually enter latitude and longitude in the Airflow variables one by one: open the file [airflow_variables.json](airflow_variables.json) and change the parameters weather_data_lat and weather_data_lon respectively. If you leave them blank, the script falls back to 10 suggested locations.
81 | 
82 | Note that the temperature results are in Celsius (units=metric). If you want Fahrenheit, open the variable file [airflow_variables.json](airflow_variables.json) and change the parameter weather_data_units to imperial; use standard for Kelvin.
83 | 
84 | *Important: to change the variables you can go to the Variables section in Apache Airflow and update the values whenever you need, or edit [airflow_variables.json](airflow_variables.json) and redeploy the containers, which might not be very fast.
85 | 
86 | 
87 | 
88 | ## Accesses
89 | Access the Airflow UI through the link http://localhost:8080/
90 | 
91 | Username: airflow
92 | Password: airflow
93 | 
94 | Access Postgres Admin through the link http://localhost:15432/
95 | Username: postgres@email.com
96 | Password: postgres
97 | 
98 | Note: localhost is used as an example. If the application sits on a cloud virtual machine, use the host given by your cloud instance, and make sure the ports are open.
99 | 
100 | 
101 | ## Data Pipeline Run
102 | Go to the Airflow DAGs view, turn on the dag weather_data, and trigger it.
103 | 
104 | 
105 | 
106 | ## Checking DAG result
107 | 
108 | Open pgAdmin in a web browser (e.g., Chrome), go to Servers, and add a server with the information described in the JSON file [pgadmin.json](pgadmin.json).
109 | 
110 | Check the final views:
111 | 
112 | SELECT * FROM VW_DATASET_1;
113 | 
114 | SELECT * FROM VW_DATASET_2;
115 | 
116 | After running the Airflow DAG pipeline, you should expect the following view result in PostgreSQL:
117 | 
118 | 
119 | 
120 | Thanks.
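As an optional extra check from the host machine, the views can also be queried outside pgAdmin. The snippet below is a minimal sketch that assumes the LocalExecutor compose file (which publishes Postgres on port 5432; the Celery compose file does not expose it) and a local psycopg2 installation.

```
import psycopg2

# Connection values come from the postgres service in docker-compose.yaml.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    dbname="airflow",
    user="airflow",
    password="airflow",
)

with conn, conn.cursor() as cur:
    cur.execute("SELECT * FROM vw_dataset_1 LIMIT 10;")
    for row in cur.fetchall():
        print(row)

conn.close()
```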
121 | 122 | 123 | ## Images 124 | Airflow 2.7.1 Celery deployment dags: 125 |  126 | 127 | Weather_data DAG process: 128 |  129 | 130 | 131 | ## Extra notes 132 | 133 | ``` 134 | # Install docker-compose v2 on linux os 135 | sudo curl -L https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose 136 | sudo chmod +x /usr/local/bin/docker-compose 137 | 138 | # Install tiny file manager 139 | sudo docker run -d -v ./filemanager:/var/www/html -p 80:80 -v ./dags:/var/www/html/data/dags -v ./logs:/var/www/html/data/logs -v ./plugins:/var/www/html/data/plugins -v ./spark:/var/www/html/data/spark -v ./jars:/var/www/html/data/jars -v ./variables:/var/www/html/data/variables --restart=always --name filemanager tinyfilemanager/tinyfilemanager:master 140 | 141 | Default username/password: admin/admin@123 and user/12345 142 | 143 | ``` 144 | -------------------------------------------------------------------------------- /celery/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.7.1 2 | 3 | USER root 4 | 5 | # Install OpenJDK-11 6 | RUN apt update && \ 7 | apt-get install -y openjdk-11-jdk && \ 8 | apt-get install -y ant && \ 9 | apt-get clean; 10 | 11 | # Set JAVA_HOME 12 | ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/ 13 | RUN export JAVA_HOME 14 | 15 | USER airflow 16 | 17 | WORKDIR /app 18 | 19 | #COPY /variables/airflow_variables.json /app 20 | #COPY /variables/airflow_connections.json /app -------------------------------------------------------------------------------- /dags/ETL_Databricks.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from airflow.providers.databricks.operators.databricks import DatabricksSubmitRunOperator 4 | 5 | notebook_task_params = { 6 | 'existing_cluster_id': '', 7 | 'notebook_task': { 8 | 'notebook_path': '', 9 | }, 10 | } 11 | 12 | from airflow.decorators import dag, task 13 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator','databricks','notebook']) 14 | def ETL_Databricks(): 15 | 16 | @task() 17 | def extract(): 18 | return True 19 | 20 | # Transform 21 | notebook_task = DatabricksSubmitRunOperator(task_id='notebook_task', json=notebook_task_params) 22 | 23 | @task() 24 | def load(): 25 | return True 26 | 27 | ext = extract() >> notebook_task >> load() 28 | 29 | etl = ETL_Databricks() -------------------------------------------------------------------------------- /dags/decorator.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | from airflow.decorators import dag, task 5 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator']) 6 | def ETL(): 7 | 8 | @task() 9 | def extract(): 10 | return True 11 | 12 | @task() 13 | def transform(): 14 | return True 15 | 16 | @task() 17 | def load(): 18 | return True 19 | 20 | ext = extract() >> transform() >> load() 21 | 22 | etl = ETL() -------------------------------------------------------------------------------- /dags/postgres_spark.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from airflow.decorators import dag, task 4 | from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator 5 | from 
airflow.models import Variable 6 | 7 | # Variables 8 | spark_dir = Variable.get("spark_dir") 9 | jars_dir = Variable.get("jars_dir") 10 | 11 | 12 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator','postgres','spark']) 13 | def postgres_spark(): 14 | 15 | @task() 16 | def start(): 17 | return True 18 | 19 | # Spark Submit 20 | postgres_spark = SparkSubmitOperator( 21 | application=f'{spark_dir}spark_postgres_query.py', 22 | task_id="postgres_spark_query", 23 | jars=f"{jars_dir}postgresql-42.6.0.jar" 24 | ) 25 | 26 | @task() 27 | def finish(): 28 | return True 29 | 30 | ext = start() >> postgres_spark >> finish() 31 | 32 | etl = postgres_spark() -------------------------------------------------------------------------------- /dags/sampleTaskFlow.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | 4 | from airflow.decorators import dag, task 5 | @dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example','decorator','TaskFlowApi']) 6 | def tutorial_taskflow_api_etl(): 7 | """ 8 | ### TaskFlow API Tutorial Documentation 9 | This is a simple ETL data pipeline example which demonstrates the use of 10 | the TaskFlow API using three simple tasks for Extract, Transform, and Load. 11 | Documentation that goes along with the Airflow TaskFlow API tutorial is 12 | located 13 | [here](https://airflow.apache.org/docs/apache-airflow/stable/tutorial_taskflow_api.html) 14 | """ 15 | @task() 16 | def extract(): 17 | """ 18 | #### Extract task 19 | A simple Extract task to get data ready for the rest of the data 20 | pipeline. In this case, getting data is simulated by reading from a 21 | hardcoded JSON string. 22 | """ 23 | data_string = '{"1001": 301.27, "1002": 433.21, "1003": 502.22}' 24 | 25 | order_data_dict = json.loads(data_string) 26 | return order_data_dict 27 | @task(multiple_outputs=True) 28 | def transform(order_data_dict: dict): 29 | """ 30 | #### Transform task 31 | A simple Transform task which takes in the collection of order data and 32 | computes the total order value. 33 | """ 34 | total_order_value = 0 35 | 36 | for value in order_data_dict.values(): 37 | total_order_value += value 38 | 39 | return {"total_order_value": total_order_value} 40 | @task() 41 | def load(total_order_value: float): 42 | """ 43 | #### Load task 44 | A simple Load task which takes in the result of the Transform task and 45 | instead of saving it to end user review, just prints it out. 
46 | """ 47 | 48 | print(f"Total order value is: {total_order_value:.2f}") 49 | order_data = extract() 50 | order_summary = transform(order_data) 51 | load(order_summary["total_order_value"]) 52 | tutorial_etl_dag = tutorial_taskflow_api_etl() 53 | -------------------------------------------------------------------------------- /dags/weather_data.py: -------------------------------------------------------------------------------- 1 | # Import default Apache Airflow Libraries 2 | from airflow.models import DAG 3 | from airflow.operators.dummy import DummyOperator 4 | from airflow.hooks.base import BaseHook 5 | from airflow.models import Variable 6 | from airflow.providers.http.sensors.http import HttpSensor 7 | from airflow.operators.python import PythonOperator 8 | from airflow.providers.http.operators.http import SimpleHttpOperator 9 | from airflow.utils.task_group import TaskGroup 10 | from airflow.providers.postgres.operators.postgres import PostgresOperator 11 | from airflow.operators.bash import BashOperator 12 | from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator 13 | 14 | # Importing Python Libraries 15 | from datetime import datetime, timedelta 16 | import time 17 | import json 18 | import os 19 | from pandas import json_normalize 20 | from geopy.geocoders import Nominatim 21 | import csv, sqlite3 22 | import glob 23 | import requests 24 | 25 | # Default Arguments and attibutes 26 | default_args ={ 27 | 'start_date': datetime.today() - timedelta(days=1), 28 | 'owner': 'Matheus' 29 | } 30 | 31 | # Get Current date, subtract 5 days and convert to timestamp 32 | todayLessFiveDays = datetime.today() - timedelta(days=5) 33 | todayLessFiveDaysTimestamp = time.mktime(todayLessFiveDays.timetuple()) 34 | 35 | # Store last 5 days date into a list 36 | days=[] 37 | i = 1 38 | while i < 6: 39 | todayLessFiveDays = datetime.today() - timedelta(days=i) 40 | todayLessFiveDaysTimestamp = time.mktime(todayLessFiveDays.timetuple()) 41 | days.append(todayLessFiveDaysTimestamp) 42 | i += 1 43 | 44 | # Get Connection from airflow db 45 | api_connection = BaseHook.get_connection("openweathermapApi") 46 | 47 | # Get Variables 48 | latitude = Variable.get("weather_data_lat") 49 | longitude = Variable.get("weather_data_lon") 50 | units = Variable.get("weather_data_units") 51 | tmp_data_dir = Variable.get("weather_data_tmp_directory") 52 | weather_data_spark_code = Variable.get("weather_data_spark_code") 53 | 54 | # Suggested Locations 55 | 56 | suggested_locations = ( 57 | ['30.318878','-81.690173'], 58 | ['28.538336','-81.379234'], 59 | ['27.950575','-82.457176'], 60 | ['25.761681','-80.191788'], 61 | ['34.052235','-118.243683'], 62 | ['40.712776','-74.005974'], 63 | ['41.878113','-87.629799'], 64 | ['32.776665','-96.796989'], 65 | ['47.950356','-124.385490'], 66 | ['36.169941','-115.139832'] 67 | ) 68 | 69 | # weather data api query params 70 | api_params = { 71 | 'lat':suggested_locations[0][0], 72 | 'lon':suggested_locations[0][1], 73 | 'units':units, 74 | 'dt':int(todayLessFiveDaysTimestamp), 75 | 'appid':api_connection.password, 76 | } 77 | 78 | # Notify, Email 79 | def _notify(ti): 80 | raise ValueError('Api Not Available') 81 | 82 | # Tmp Data Check 83 | def _tmp_data(): 84 | # Checking if directories exist 85 | if not os.path.exists(tmp_data_dir): 86 | os.mkdir(tmp_data_dir) 87 | if not os.path.exists(f'{tmp_data_dir}weather/'): 88 | os.mkdir(f'{tmp_data_dir}weather/') 89 | if not os.path.exists(f'{tmp_data_dir}processed/'): 90 | os.mkdir(f'{tmp_data_dir}processed/') 91 
| if not os.path.exists(f'{tmp_data_dir}processed/current_weather/'): 92 | os.mkdir(f'{tmp_data_dir}processed/current_weather/') 93 | if not os.path.exists(f'{tmp_data_dir}processed/hourly_weather/'): 94 | os.mkdir(f'{tmp_data_dir}processed/hourly_weather/') 95 | 96 | 97 | # Extract Weather 98 | def _extract_weather(): 99 | if((Variable.get("weather_data_lat") == None or Variable.get("weather_data_lat") == '') and (Variable.get("weather_data_lon") == None or Variable.get("weather_data_lon") == '')): 100 | for latitude, longitude in suggested_locations: 101 | for day in days: 102 | # weather data api query params 103 | api_param = { 104 | 'lat':latitude, 105 | 'lon':longitude, 106 | 'units':units, 107 | 'dt':int(day), 108 | 'appid':api_connection.password 109 | } 110 | r = requests.get(url = api_connection.host + Variable.get("weather_data_endpoint"), params = api_param) 111 | data = r.json() 112 | time = datetime.today().strftime('%Y%m%d%H%M%S%f') 113 | with open(f"{tmp_data_dir}/weather/weather_output_{time}.json", "w") as outfile: 114 | json.dump(data, outfile) 115 | else: 116 | for day in days: 117 | # weather data api query params 118 | api_param = { 119 | 'lat':Variable.get("weather_data_lat"), 120 | 'lon':Variable.get("weather_data_lon"), 121 | 'units':units, 122 | 'dt':int(day), 123 | 'appid':api_connection.password 124 | } 125 | r = requests.get(url = api_connection.host + Variable.get("weather_data_endpoint"), params = api_param) 126 | data = r.json() 127 | time = datetime.today().strftime('%Y%m%d%H%M%S%f') 128 | with open(f"{tmp_data_dir}/weather/weather_output_{time}.json", "w") as outfile: 129 | json.dump(data, outfile) 130 | 131 | 132 | # Store Location Iterative 133 | def _process_location_csv_iterative(): 134 | if((latitude == None or latitude == '') and (longitude == None or longitude == '')): 135 | for lat,long in suggested_locations: 136 | _store_location_csv(lat,long) 137 | else: 138 | _store_location_csv(latitude,longitude) 139 | 140 | # Processing and Deduplicating Weather API Data 141 | def _store_location_csv(lat,long): 142 | 143 | # Invoking geo locator api and getting address from latitude and longitude 144 | geolocator = Nominatim(user_agent="weather_data") 145 | location = geolocator.reverse(lat+","+long) 146 | address = location.raw['address'] 147 | #current = datetime.today().strftime('%Y%m%d%H%M%S%f') 148 | 149 | # Process location data 150 | location_df = json_normalize({ 151 | 'latitude':lat, 152 | 'logitude': long, 153 | 'city':address.get('city'), 154 | 'state':address.get('state'), 155 | 'postcode':address.get('postcode'), 156 | 'country':address.get('country') 157 | }) 158 | 159 | # Store Location 160 | location_df.to_csv(f'{tmp_data_dir}location.csv', mode='a', sep=',', index=None, header=False) 161 | 162 | # Processed files 163 | def get_current_weather_file(): 164 | for i in glob.glob(f'{tmp_data_dir}processed/current_weather/part-*.csv'): 165 | return i 166 | 167 | def get_hourly_weather_file(): 168 | for i in glob.glob(f'{tmp_data_dir}processed/hourly_weather/part-*.csv'): 169 | return i 170 | 171 | # DAG Skeleton 172 | with DAG('weather_data', schedule_interval='@daily',default_args=default_args, catchup=False) as dag: 173 | 174 | # Start 175 | start = DummyOperator( 176 | task_id='Start' 177 | ) 178 | 179 | # Temp Data 180 | tmp_data = PythonOperator( 181 | task_id='tmp_data', 182 | python_callable=_tmp_data 183 | ) 184 | 185 | # Create Http Sensor Operator 186 | check_api = HttpSensor( 187 | task_id='check_api', 188 | 
http_conn_id='openweathermapApi', 189 | endpoint=Variable.get("weather_data_endpoint"), 190 | method='GET', 191 | response_check=lambda response: True if response.status_code == 200 or response.status_code == 204 else False, 192 | poke_interval=5, 193 | timeout=60, 194 | retries=2, 195 | mode="reschedule", 196 | soft_fail=False, 197 | request_params = api_params 198 | ) 199 | 200 | # Api is not available 201 | api_not_available = PythonOperator( 202 | task_id='api_not_available', 203 | python_callable=_notify, 204 | trigger_rule='one_failed' 205 | ) 206 | 207 | # Extract User Records Simple Http Operator 208 | extracting_weather = PythonOperator( 209 | task_id='extracting_weather', 210 | python_callable=_extract_weather, 211 | trigger_rule='all_success' 212 | ) 213 | 214 | # TaskGroup for Creating Postgres tables 215 | with TaskGroup('create_postgres_tables') as create_postgres_tables: 216 | 217 | # Create table Location 218 | creating_table_location = PostgresOperator( 219 | task_id='creating_table_location', 220 | postgres_conn_id='postgres_default', 221 | sql=''' 222 | CREATE TABLE IF NOT EXISTS location_tmp ( 223 | latitude VARCHAR(255) NOT NULL, 224 | longitude VARCHAR(255) NOT NULL, 225 | city VARCHAR(255) NULL, 226 | state VARCHAR(255) NULL, 227 | postcode VARCHAR(255) NULL, 228 | country VARCHAR(255) NULL, 229 | PRIMARY KEY (latitude,longitude) 230 | ); 231 | 232 | CREATE TABLE IF NOT EXISTS location ( 233 | latitude VARCHAR(255) NOT NULL, 234 | longitude VARCHAR(255) NOT NULL, 235 | city VARCHAR(255) NULL, 236 | state VARCHAR(255) NULL, 237 | postcode VARCHAR(255) NULL, 238 | country VARCHAR(255) NULL, 239 | PRIMARY KEY (latitude,longitude) 240 | ); 241 | 242 | ''' 243 | ) 244 | 245 | # Create Table Requested Weather 246 | creating_table_requested_weather = PostgresOperator( 247 | task_id='creating_table_requested_weather', 248 | postgres_conn_id='postgres_default', 249 | sql=''' 250 | CREATE TABLE IF NOT EXISTS current_weather_tmp ( 251 | latitude VARCHAR(255) NOT NULL, 252 | longitude VARCHAR(255) NOT NULL, 253 | timezone VARCHAR(255) NOT NULL, 254 | requested_datetime VARCHAR(255) NULL, 255 | sunrise VARCHAR(255) NULL, 256 | sunset VARCHAR(255) NULL, 257 | temp VARCHAR(255) NULL, 258 | feels_like VARCHAR(255) NULL, 259 | pressure VARCHAR(255) NULL, 260 | humidity VARCHAR(255) NULL, 261 | dew_point VARCHAR(255) NULL, 262 | uvi VARCHAR(255) NULL, 263 | clouds VARCHAR(255) NULL, 264 | visibility VARCHAR(255) NULL, 265 | wind_speed VARCHAR(255) NULL, 266 | wind_deg VARCHAR(255) NULL, 267 | weather_id VARCHAR(255) NULL, 268 | weather_main VARCHAR(255) NULL, 269 | weather_description VARCHAR(255) NULL, 270 | weather_icon VARCHAR(255) NULL, 271 | PRIMARY KEY (latitude,longitude,requested_datetime) 272 | ); 273 | 274 | CREATE TABLE IF NOT EXISTS current_weather ( 275 | latitude VARCHAR(255) NOT NULL, 276 | longitude VARCHAR(255) NOT NULL, 277 | timezone VARCHAR(255) NOT NULL, 278 | requested_datetime VARCHAR(255) NULL, 279 | sunrise VARCHAR(255) NULL, 280 | sunset VARCHAR(255) NULL, 281 | temp VARCHAR(255) NULL, 282 | feels_like VARCHAR(255) NULL, 283 | pressure VARCHAR(255) NULL, 284 | humidity VARCHAR(255) NULL, 285 | dew_point VARCHAR(255) NULL, 286 | uvi VARCHAR(255) NULL, 287 | clouds VARCHAR(255) NULL, 288 | visibility VARCHAR(255) NULL, 289 | wind_speed VARCHAR(255) NULL, 290 | wind_deg VARCHAR(255) NULL, 291 | weather_id VARCHAR(255) NULL, 292 | weather_main VARCHAR(255) NULL, 293 | weather_description VARCHAR(255) NULL, 294 | weather_icon VARCHAR(255) NULL, 295 | PRIMARY KEY 
(latitude,longitude,requested_datetime) 296 | ); 297 | 298 | ''' 299 | ) 300 | 301 | # Create Table Hourly Weather 302 | creating_table_hourly_weather = PostgresOperator( 303 | task_id='creating_table_hourly_weather', 304 | postgres_conn_id='postgres_default', 305 | sql=''' 306 | CREATE TABLE IF NOT EXISTS hourly_weather_tmp ( 307 | latitude VARCHAR(255) NOT NULL, 308 | longitude VARCHAR(255) NOT NULL, 309 | timezone VARCHAR(255) NOT NULL, 310 | datetime VARCHAR(255) NULL, 311 | temp VARCHAR(255) NULL, 312 | feels_like VARCHAR(255) NULL, 313 | pressure VARCHAR(255) NULL, 314 | humidity VARCHAR(255) NULL, 315 | dew_point VARCHAR(255) NULL, 316 | uvi VARCHAR(255) NULL, 317 | clouds VARCHAR(255) NULL, 318 | visibility VARCHAR(255) NULL, 319 | wind_speed VARCHAR(255) NULL, 320 | wind_deg VARCHAR(255) NULL, 321 | wind_gust VARCHAR(255) NULL, 322 | weather_id VARCHAR(255) NULL, 323 | weather_main VARCHAR(255) NULL, 324 | weather_description VARCHAR(255) NULL, 325 | weather_icon VARCHAR(255) NULL, 326 | PRIMARY KEY (latitude,longitude,datetime) 327 | ); 328 | 329 | CREATE TABLE IF NOT EXISTS hourly_weather ( 330 | latitude VARCHAR(255) NOT NULL, 331 | longitude VARCHAR(255) NOT NULL, 332 | timezone VARCHAR(255) NOT NULL, 333 | datetime VARCHAR(255) NULL, 334 | temp VARCHAR(255) NULL, 335 | feels_like VARCHAR(255) NULL, 336 | pressure VARCHAR(255) NULL, 337 | humidity VARCHAR(255) NULL, 338 | dew_point VARCHAR(255) NULL, 339 | uvi VARCHAR(255) NULL, 340 | clouds VARCHAR(255) NULL, 341 | visibility VARCHAR(255) NULL, 342 | wind_speed VARCHAR(255) NULL, 343 | wind_deg VARCHAR(255) NULL, 344 | wind_gust VARCHAR(255) NULL, 345 | weather_id VARCHAR(255) NULL, 346 | weather_main VARCHAR(255) NULL, 347 | weather_description VARCHAR(255) NULL, 348 | weather_icon VARCHAR(255) NULL, 349 | PRIMARY KEY (latitude,longitude,datetime) 350 | ); 351 | 352 | ''' 353 | ) 354 | 355 | # Truncate Temp Tables 356 | with TaskGroup('truncate_temp_table_postgres') as truncate_temp_table_postgres: 357 | 358 | # Truncate location_temp Postgres 359 | truncate_location_temp_postgres = PostgresOperator( 360 | task_id='truncate_location_temp_postgres', 361 | postgres_conn_id='postgres_default', 362 | sql=''' 363 | TRUNCATE TABLE location_tmp; 364 | ''' 365 | ) 366 | 367 | # Truncate current_weather_temp Postgres 368 | truncate_current_weather_temp_postgres = PostgresOperator( 369 | task_id='truncate_current_weather_temp_postgres', 370 | postgres_conn_id='postgres_default', 371 | sql=''' 372 | TRUNCATE TABLE current_weather_tmp; 373 | ''' 374 | ) 375 | 376 | # Truncate hourly_weather_temp Postgres 377 | truncate_hourly_weather_temp_postgres = PostgresOperator( 378 | task_id='truncate_hourly_weather_temp_postgres', 379 | postgres_conn_id='postgres_default', 380 | sql=''' 381 | TRUNCATE TABLE hourly_weather_tmp; 382 | ''' 383 | ) 384 | 385 | # Process Location Data 386 | process_location_csv = PythonOperator( 387 | task_id='process_location_csv', 388 | python_callable=_process_location_csv_iterative 389 | ) 390 | 391 | # Spark Submit 392 | spark_process_weather = SparkSubmitOperator( 393 | application=f'{weather_data_spark_code}', task_id="spark_process_weather" 394 | ) 395 | 396 | # TaskGroup for Storing processed data into postgres temp tables 397 | with TaskGroup('store_processed_temp_data_in_postgres') as store_processed_temp_data_in_postgres: 398 | 399 | store_location_tmp_postgres = PostgresOperator( 400 | task_id='store_location_tmp_postgres', 401 | postgres_conn_id='postgres_default', 402 | sql=f''' 403 | COPY location_tmp 
404 | FROM '{tmp_data_dir}location.csv' 405 | DELIMITER ',' 406 | ; 407 | ''' 408 | ) 409 | 410 | store_current_weather_tmp_postgres = PostgresOperator( 411 | task_id='store_current_weather_tmp_postgres', 412 | postgres_conn_id='postgres_default', 413 | sql=''' 414 | COPY current_weather_tmp 415 | FROM '%s' 416 | DELIMITER ',' 417 | ; 418 | ''' % get_current_weather_file() 419 | ) 420 | 421 | store_hourly_weather_tmp_postgres = PostgresOperator( 422 | task_id='store_hourly_weather_tmp_postgres', 423 | postgres_conn_id='postgres_default', 424 | sql=''' 425 | COPY hourly_weather_tmp 426 | FROM '%s' 427 | DELIMITER ',' 428 | ; 429 | ''' % get_hourly_weather_file() 430 | ) 431 | 432 | # TaskGroup for Storing from temp tables to original tables 433 | with TaskGroup('copy_from_tmp_table_to_original_table') as copy_from_tmp_table_to_original_table: 434 | 435 | copy_location_tmp_to_location = PostgresOperator( 436 | task_id='copy_location_tmp_to_location', 437 | postgres_conn_id='postgres_default', 438 | sql=''' 439 | INSERT INTO location 440 | SELECT * 441 | FROM location_tmp 442 | EXCEPT 443 | SELECT * 444 | FROM location 445 | ON CONFLICT (latitude,longitude) DO NOTHING; 446 | ''' 447 | ) 448 | 449 | copy_current_weather_tmp_to_current_weather = PostgresOperator( 450 | task_id='copy_current_weather_tmp_to_current_weather', 451 | postgres_conn_id='postgres_default', 452 | sql=''' 453 | INSERT INTO current_weather 454 | SELECT * 455 | FROM current_weather_tmp 456 | EXCEPT 457 | SELECT * 458 | FROM current_weather 459 | ON CONFLICT (latitude,longitude,requested_datetime) DO NOTHING; 460 | ''' 461 | ) 462 | 463 | copy_hourly_weather_tmp_to_current_weather = PostgresOperator( 464 | task_id='copy_hourly_weather_tmp_to_current_weather', 465 | postgres_conn_id='postgres_default', 466 | sql=''' 467 | INSERT INTO hourly_weather 468 | SELECT * 469 | FROM hourly_weather_tmp 470 | EXCEPT 471 | SELECT * 472 | FROM hourly_weather 473 | ON CONFLICT (latitude,longitude,datetime) DO NOTHING; 474 | ''' 475 | ) 476 | 477 | # TaskGroup for Creating Postgres Views 478 | with TaskGroup('create_materialized_views') as create_materialized_views: 479 | # Create View for DataSet 1 480 | create_view_dataset_1 = PostgresOperator( 481 | task_id='create_view_dataset_1', 482 | postgres_conn_id='postgres_default', 483 | sql=''' 484 | CREATE OR REPLACE VIEW VW_DATASET_1 485 | AS 486 | SELECT 487 | loc.country AS Country, 488 | loc.state AS State, 489 | loc.city AS City, 490 | CAST(hw.datetime AS DATE) AS Date, 491 | EXTRACT(MONTH FROM CAST(hw.datetime AS DATE)) AS Month, 492 | MAX(CAST(hw.temp AS DECIMAL)) AS Max_Temperature 493 | FROM location loc, hourly_weather hw 494 | WHERE ROUND(CAST(loc.latitude AS DECIMAL),4) = ROUND(CAST(hw.latitude AS DECIMAL),4) 495 | AND ROUND(CAST(loc.longitude AS DECIMAL),4) = ROUND(CAST(hw.longitude AS DECIMAL),4) 496 | GROUP BY City,State,Country,Date,Month 497 | ORDER BY Date DESC; 498 | ''' 499 | ) 500 | 501 | # Create View for DataSet 2 502 | create_view_dataset_2 = PostgresOperator( 503 | task_id='create_view_dataset_2', 504 | postgres_conn_id='postgres_default', 505 | sql=''' 506 | CREATE OR REPLACE VIEW VW_DATASET_2 507 | AS 508 | SELECT 509 | loc.country AS Country, 510 | loc.state AS State, 511 | loc.city AS City, 512 | CAST(hw.datetime AS DATE) AS Date, 513 | MAX(CAST(hw.temp AS DECIMAL)) AS Max_Temperature, 514 | MIN(CAST(hw.temp AS DECIMAL)) AS Min_Temperature, 515 | ROUND(AVG(CAST(hw.temp AS DECIMAL)),2) AS Average_Temperature 516 | FROM location loc, hourly_weather hw 517 | WHERE 
ROUND(CAST(loc.latitude AS DECIMAL),4) = ROUND(CAST(hw.latitude AS DECIMAL),4) 518 | AND ROUND(CAST(loc.longitude AS DECIMAL),4) = ROUND(CAST(hw.longitude AS DECIMAL),4) 519 | GROUP BY City,State,Country,Date 520 | ORDER BY Date DESC; 521 | ''' 522 | ) 523 | 524 | # Pre Cleanup task 525 | pre_cleanup= BashOperator( 526 | task_id='pre_cleanup', 527 | bash_command=f'rm -rf {tmp_data_dir}' 528 | ) 529 | 530 | # Post Cleanup task 531 | post_cleanup= BashOperator( 532 | task_id='post_cleanup', 533 | bash_command=f'rm -r {tmp_data_dir}' 534 | ) 535 | 536 | # DAG Dependencies 537 | start >> pre_cleanup >> tmp_data >> check_api >> [extracting_weather,api_not_available] 538 | extracting_weather >> create_postgres_tables >> truncate_temp_table_postgres >> process_location_csv >> spark_process_weather 539 | spark_process_weather >> store_processed_temp_data_in_postgres >> copy_from_tmp_table_to_original_table >> create_materialized_views >> post_cleanup -------------------------------------------------------------------------------- /databricks/notebooks/opendentalfhir.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC ### Open Dental API - FHIR r4 implementation 4 | # MAGIC
Matheus Pavanetti
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md 9 | # MAGIC ### Data Architecture 10 | # MAGIC  11 | # MAGIC 12 | # MAGIC - **Bronze** tables contain raw data ingested from various sources (JSON data from open dental api call). 13 | # MAGIC 14 | # MAGIC - **Silver** tables provide a more refined view of our data. (parsed json, cleaned data, schema enforcement). 15 | # MAGIC 16 | # MAGIC - **Gold** tables provide business level aggregates often used for reporting and dashboarding. 17 | # MAGIC 18 | # MAGIC ### Documentation Reference 19 | # MAGIC - **API Implementation** https://www.opendental.com/site/apiimplementation.html 20 | # MAGIC 21 | # MAGIC - **Open Dental Specs FHIR** https://www.opendental.com/resources/OpenDentalFHIR19-3Spec.pdf 22 | # MAGIC 23 | # MAGIC - **FHIR Standard** http://hl7.org/fhir/R4/ 24 | 25 | # COMMAND ---------- 26 | 27 | # DBTITLE 1,Setting up storage account on spark session 28 | # Run this to Authenticate on storage account 29 | spark.conf.set("fs.azure.account.auth.type", "OAuth") 30 | spark.conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") 31 | spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get('FILLIN', 'FILLIN')) 32 | spark.conf.set("fs.azure.account.oauth2.client.secret", dbutils.secrets.get('FILLIN', 'FILLIN')) 33 | spark.conf.set("fs.azure.account.oauth2.client.endpoint", f"https://login.microsoftonline.com/{dbutils.secrets.get('FILLIN', 'FILLIN')}/oauth2/token") 34 | 35 | # COMMAND ---------- 36 | 37 | # DBTITLE 1,Environment Class 38 | import requests 39 | import json 40 | import pandas as pd 41 | import os 42 | from datetime import datetime 43 | 44 | class Env: 45 | 46 | # Datalake 47 | datalakeDir="abfss://datalake@FILLIN.dfs.core.windows.net/" 48 | workdir=f"{datalakeDir}opendentalfhir/" 49 | 50 | # Unity Catalog 51 | catalog="FILLIN" 52 | catalog_bronze=f"{catalog}.bronze" 53 | catalog_silver=f"{catalog}.silver" 54 | catalog_gold=f"{catalog}.gold" 55 | 56 | # Driver 57 | tmp_dir = "/tmp/opendentalfhir/" 58 | 59 | # Multi-Hop Architecture variables. 60 | raw=f"{workdir}raw/" 61 | bronze=f"{workdir}bronze/" 62 | silver=f"{workdir}silver/" 63 | gold=f"{workdir}gold/" 64 | 65 | # Open Dental API variables. 66 | endpoint="https://api.opendental.com/fhir/v2/" 67 | token="ODFHIR NFF6i0KrXrxDkZHt/VzkmZEaUWOjnQX2z" 68 | resources=["organization","location", 69 | "patient","procedure","practitioner", 70 | "ServiceRequest"] 71 | 72 | def __init__(self): 73 | self.tmp_dir = Env.tmp_dir 74 | 75 | def createTmp(self): 76 | if not os.path.exists(self.tmp_dir): 77 | os.mkdir(self.tmp_dir) 78 | 79 | def wipeTmp(self): 80 | if os.path.exists(self.tmp_dir): 81 | for file in os.listdir(self.tmp_dir): 82 | os.remove(os.path.join(self.tmp_dir, file)) 83 | 84 | 85 | # COMMAND ---------- 86 | 87 | # DBTITLE 1,Extract Class 88 | class Extract(Env): 89 | 90 | def __init__(self): 91 | Env.__init__(self) 92 | Env.createTmp(self) 93 | 94 | def cleanRaw(self): 95 | return dbutils.fs.rm(Env.raw,True) 96 | 97 | def cleanTmp(self): 98 | return Env.wipeTmp() 99 | 100 | def pullData(self): 101 | headers={"Accept": "application/json", 102 | "Content-Type": "application/json", 103 | "Authorization": Env.token} 104 | 105 | Env.wipeTmp(self) 106 | 107 | for item in Env.resources: 108 | time = datetime.now().strftime(("%Y%m%d%H%M%S")) # current date and time of file download (UTC). 
109 | result = requests.get(f"{Env.endpoint}{item}",headers=headers) 110 | 111 | if result.status_code == 200: 112 | resultjson = json.loads(result.text) 113 | pd_df = pd.json_normalize(resultjson) 114 | 115 | # Download temp json 116 | pd_df.to_json(f"{Env.tmp_dir}/{item}_{time}.json") 117 | 118 | # Upload Datalake 119 | dbutils.fs.cp(f"file:////{Env.tmp_dir}/{item}_{time}.json",f"{Env.raw}{item}/{item}_{time}.json") 120 | print(f"RAW Downloaded {resultjson['total']} {item} records at file: {Env.raw}{item}/{item}_{time}.json") 121 | 122 | 123 | # COMMAND ---------- 124 | 125 | # DBTITLE 1,Transform Class 126 | #from pyspark.sql.functions import explode, cast, flatten, collect_set 127 | 128 | class Transform(Env): 129 | 130 | def __init__(self): 131 | Env.__init__(self) 132 | 133 | def rawToBronze(self): 134 | for item in Env.resources: 135 | df = (spark.read.json(f"{Env.raw}{item}/")) 136 | transformed_df = df.selectExpr("explode(flatten(collect_set(entry.*))) as data") 137 | transformed_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(f"{Env.bronze}{item}/")#.saveAsTable(f"{Env.catalog_bronze}.{item}") 138 | print(f"BRONZE records for entity {item} were persisted at {Env.bronze}{item}/") 139 | 140 | def patientToSilver(self): 141 | df = (spark.read.format("delta").load(f"{Env.bronze}/patient")) 142 | 143 | transformed_df = (df.selectExpr("CAST(data.resource.id as LONG) as id", 144 | "data.resource.active as active", 145 | "data.resource.name as name", 146 | "data.resource.resourceType as resourceType", 147 | "data.resource.address as address", 148 | "CAST(data.resource.birthdate as DATE) as birthdate", 149 | "data.resource.careProvider as careProvider", 150 | "data.resource.gender as gender", 151 | "data.resource.identifier as identifier", 152 | "data.resource.managingOrganization as managingOrganization", 153 | "data.resource.maritalStatus as maritalStatus", 154 | "data.resource.meta as meta", 155 | "data.resource.telecom as telecom", 156 | "data.search as search") 157 | .distinct()) 158 | (transformed_df 159 | .write 160 | .mode("overwrite") 161 | .save(f"{Env.silver}patient/")) 162 | #.saveAsTable(f"{Env.catalog_silver}.patient")) 163 | print(f"SILVER patient records were persisted at {Env.silver}patient") 164 | 165 | 166 | # COMMAND ---------- 167 | 168 | # Main File 169 | if __name__ == "__main__": 170 | 171 | ext = Extract() 172 | trans = Transform() 173 | 174 | #ext.cleanRaw() 175 | 176 | ext.pullData() 177 | trans.rawToBronze() 178 | 179 | trans.patientToSilver() 180 | 181 | -------------------------------------------------------------------------------- /docker-compose-celery.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. 
See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.7.1 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 39 | # Use this option ONLY for quick checks. Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | version: '3.8' 48 | x-airflow-common: 49 | &airflow-common 50 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 51 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 52 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 53 | #image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.7.1} 54 | build: ./celery 55 | environment: 56 | &airflow-common-env 57 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 58 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 59 | # For backward compatibility, with Airflow <2.3 60 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 61 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 62 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 63 | AIRFLOW__CORE__FERNET_KEY: '' 64 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 65 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 66 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 67 | # yamllint disable rule:line-length 68 | # Use simple http server on scheduler for health checks 69 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 70 | # yamllint enable rule:line-length 71 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 72 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 73 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 
74 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- geopy apache-airflow[postgres] apache-airflow-providers-apache-spark apache-airflow-providers-databricks} 75 | volumes: 76 | - ./dags:/opt/airflow/dags 77 | - ./logs:/opt/airflow/logs 78 | - ./data:/opt/airflow/data 79 | - ./plugins:/opt/airflow/plugins 80 | - ./spark:/opt/airflow/spark 81 | - ./variables:/opt/airflow/variables 82 | - ./jars:/opt/airflow/jars 83 | - ./config:/opt/airflow/config 84 | 85 | user: "${AIRFLOW_UID:-50000}:0" 86 | depends_on: 87 | &airflow-common-depends-on 88 | redis: 89 | condition: service_healthy 90 | postgres: 91 | condition: service_healthy 92 | 93 | services: 94 | postgres: 95 | image: postgres:13 96 | environment: 97 | POSTGRES_USER: airflow 98 | POSTGRES_PASSWORD: airflow 99 | POSTGRES_DB: airflow 100 | volumes: 101 | - postgres-db-volume-celery:/var/lib/postgresql/data 102 | - ./data:/opt/airflow/data 103 | healthcheck: 104 | test: ["CMD", "pg_isready", "-U", "airflow"] 105 | interval: 10s 106 | retries: 5 107 | start_period: 5s 108 | restart: always 109 | 110 | pgadmin: 111 | image: dpage/pgadmin4 112 | environment: 113 | PGADMIN_DEFAULT_EMAIL: "postgres@email.com" 114 | PGADMIN_DEFAULT_PASSWORD: "postgres" 115 | ports: 116 | - "15432:80" 117 | depends_on: 118 | - postgres 119 | volumes: 120 | - ./data:/opt/airflow/data 121 | 122 | redis: 123 | image: redis:latest 124 | expose: 125 | - 6379 126 | healthcheck: 127 | test: ["CMD", "redis-cli", "ping"] 128 | interval: 10s 129 | timeout: 30s 130 | retries: 50 131 | start_period: 30s 132 | restart: always 133 | 134 | airflow-webserver: 135 | <<: *airflow-common 136 | command: webserver 137 | ports: 138 | - "8080:8080" 139 | healthcheck: 140 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 141 | interval: 30s 142 | timeout: 10s 143 | retries: 5 144 | start_period: 30s 145 | restart: always 146 | depends_on: 147 | <<: *airflow-common-depends-on 148 | airflow-init: 149 | condition: service_completed_successfully 150 | 151 | airflow-scheduler: 152 | <<: *airflow-common 153 | command: scheduler 154 | healthcheck: 155 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 156 | interval: 30s 157 | timeout: 10s 158 | retries: 5 159 | start_period: 30s 160 | restart: always 161 | depends_on: 162 | <<: *airflow-common-depends-on 163 | airflow-init: 164 | condition: service_completed_successfully 165 | 166 | airflow-worker: 167 | <<: *airflow-common 168 | command: celery worker 169 | healthcheck: 170 | # yamllint disable rule:line-length 171 | test: 172 | - "CMD-SHELL" 173 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 174 | interval: 30s 175 | timeout: 10s 176 | retries: 5 177 | start_period: 30s 178 | environment: 179 | <<: *airflow-common-env 180 | # Required to handle warm shutdown of the celery workers properly 181 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 182 | DUMB_INIT_SETSID: "0" 183 | restart: always 184 | depends_on: 185 | <<: *airflow-common-depends-on 186 | airflow-init: 187 | condition: service_completed_successfully 188 | 189 | airflow-triggerer: 190 | <<: *airflow-common 191 | command: triggerer 192 | healthcheck: 193 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 194 | interval: 30s 195 | timeout: 10s 196 | retries: 5 197 | start_period: 30s 198 | restart: 
always 199 | depends_on: 200 | <<: *airflow-common-depends-on 201 | airflow-init: 202 | condition: service_completed_successfully 203 | 204 | airflow-init: 205 | <<: *airflow-common 206 | entrypoint: /bin/bash 207 | # yamllint disable rule:line-length 208 | command: 209 | - -c 210 | - | 211 | function ver() { 212 | printf "%04d%04d%04d%04d" $${1//./ } 213 | } 214 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 215 | airflow_version_comparable=$$(ver $${airflow_version}) 216 | min_airflow_version=2.2.0 217 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 218 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 219 | echo 220 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 221 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 222 | echo 223 | exit 1 224 | fi 225 | if [[ -z "${AIRFLOW_UID}" ]]; then 226 | echo 227 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 228 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 229 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 230 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 231 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 232 | echo 233 | fi 234 | one_meg=1048576 235 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 236 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 237 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 238 | warning_resources="false" 239 | if (( mem_available < 4000 )) ; then 240 | echo 241 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 242 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 243 | echo 244 | warning_resources="true" 245 | fi 246 | if (( cpus_available < 2 )); then 247 | echo 248 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 249 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 250 | echo 251 | warning_resources="true" 252 | fi 253 | if (( disk_available < one_meg * 10 )); then 254 | echo 255 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 256 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 257 | echo 258 | warning_resources="true" 259 | fi 260 | if [[ $${warning_resources} == "true" ]]; then 261 | echo 262 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 263 | echo "Please follow the instructions to increase amount of resources available:" 264 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 265 | echo 266 | fi 267 | mkdir -p /sources/logs /sources/dags /sources/plugins 268 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 269 | exec /entrypoint airflow version 270 | # yamllint enable rule:line-length 271 | environment: 272 | <<: *airflow-common-env 273 | _AIRFLOW_DB_MIGRATE: 'true' 274 | _AIRFLOW_WWW_USER_CREATE: 'true' 275 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 276 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 277 | _PIP_ADDITIONAL_REQUIREMENTS: '' 278 | user: "0:0" 279 | volumes: 280 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 281 | 282 | airflow-cli: 283 | <<: *airflow-common 284 | profiles: 285 | - debug 286 | environment: 287 | <<: *airflow-common-env 288 | CONNECTION_CHECK_MAX_COUNT: "0" 289 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 290 | command: 291 | - bash 292 | - -c 293 | - airflow 294 | 295 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 296 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 297 | # See: https://docs.docker.com/compose/profiles/ 298 | flower: 299 | <<: *airflow-common 300 | command: celery flower 301 | profiles: 302 | - flower 303 | ports: 304 | - "5555:5555" 305 | healthcheck: 306 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 307 | interval: 30s 308 | timeout: 10s 309 | retries: 5 310 | start_period: 30s 311 | restart: always 312 | depends_on: 313 | <<: *airflow-common-depends-on 314 | airflow-init: 315 | condition: service_completed_successfully 316 | 317 | volumes: 318 | postgres-db-volume-celery: -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | x-airflow-common: 3 | &airflow-common 4 | #image: ${AIRFLOW_IMAGE_NAME:-my-image:0.0.1} 5 | build: . 
6 | environment: 7 | &airflow-common-env 8 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 9 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 10 | AIRFLOW__CORE__FERNET_KEY: '' 11 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 12 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 13 | AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' 14 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- geopy apache-airflow[postgres] apache-airflow-providers-apache-spark apache-airflow-providers-databricks} 15 | volumes: 16 | - ./dags:/opt/airflow/dags 17 | - ./logs:/opt/airflow/logs 18 | - ./data:/opt/airflow/data 19 | - ./plugins:/opt/airflow/plugins 20 | - ./spark:/opt/airflow/spark 21 | - ./variables:/opt/airflow/variables 22 | - ./jars:/opt/airflow/jars 23 | 24 | user: "${AIRFLOW_UID:-50000}:0" 25 | depends_on: 26 | &airflow-common-depends-on 27 | postgres: 28 | condition: service_healthy 29 | services: 30 | postgres: 31 | image: postgres:13 32 | environment: 33 | POSTGRES_USER: airflow 34 | POSTGRES_PASSWORD: airflow 35 | POSTGRES_DB: airflow 36 | ports: 37 | - 5432:5432 38 | volumes: 39 | - postgres-db-volume:/var/lib/postgresql/data 40 | - ./data:/opt/airflow/data 41 | healthcheck: 42 | test: ["CMD", "pg_isready", "-U", "airflow"] 43 | interval: 5s 44 | retries: 5 45 | restart: on-failure 46 | pgadmin: 47 | image: dpage/pgadmin4 48 | environment: 49 | PGADMIN_DEFAULT_EMAIL: "postgres@email.com" 50 | PGADMIN_DEFAULT_PASSWORD: "postgres" 51 | ports: 52 | - "15432:80" 53 | depends_on: 54 | - postgres 55 | volumes: 56 | - ./data:/opt/airflow/data 57 | 58 | airflow-webserver: 59 | <<: *airflow-common 60 | command: webserver 61 | ports: 62 | - 8080:8080 63 | healthcheck: 64 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 65 | interval: 10s 66 | timeout: 10s 67 | retries: 5 68 | restart: on-failure 69 | depends_on: 70 | <<: *airflow-common-depends-on 71 | airflow-init: 72 | condition: service_completed_successfully 73 | 74 | airflow-scheduler: 75 | <<: *airflow-common 76 | command: scheduler 77 | healthcheck: 78 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 79 | interval: 10s 80 | timeout: 10s 81 | retries: 5 82 | restart: on-failure 83 | depends_on: 84 | <<: *airflow-common-depends-on 85 | airflow-init: 86 | condition: service_completed_successfully 87 | 88 | airflow-init: 89 | <<: *airflow-common 90 | entrypoint: /bin/bash 91 | # yamllint disable rule:line-length 92 | command: 93 | - -c 94 | - | 95 | function ver() { 96 | printf "%04d%04d%04d%04d" $${1//./ } 97 | } 98 | airflow_version=$$(gosu airflow airflow version) 99 | airflow_version_comparable=$$(ver $${airflow_version}) 100 | min_airflow_version=2.2.0 101 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 102 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 103 | echo 104 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 105 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 106 | echo 107 | exit 1 108 | fi 109 | if [[ -z "${AIRFLOW_UID}" ]]; then 110 | echo 111 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 112 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 113 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 
114 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 115 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 116 | echo 117 | fi 118 | one_meg=1048576 119 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 120 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 121 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 122 | warning_resources="false" 123 | if (( mem_available < 4000 )) ; then 124 | echo 125 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 126 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 127 | echo 128 | warning_resources="true" 129 | fi 130 | if (( cpus_available < 2 )); then 131 | echo 132 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 133 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 134 | echo 135 | warning_resources="true" 136 | fi 137 | if (( disk_available < one_meg * 10 )); then 138 | echo 139 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 140 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 141 | echo 142 | warning_resources="true" 143 | fi 144 | if [[ $${warning_resources} == "true" ]]; then 145 | echo 146 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 147 | echo "Please follow the instructions to increase amount of resources available:" 148 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 149 | echo 150 | fi 151 | mkdir -p /sources/logs /sources/dags /sources/plugins 152 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 153 | chmod -R 777 /opt/airflow/ 154 | exec /entrypoint airflow version 155 | # yamllint enable rule:line-length 156 | environment: 157 | <<: *airflow-common-env 158 | _AIRFLOW_DB_UPGRADE: 'true' 159 | _AIRFLOW_WWW_USER_CREATE: 'true' 160 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 161 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 162 | user: "0:0" 163 | volumes: 164 | - .:/sources 165 | 166 | airflow-cli: 167 | <<: *airflow-common 168 | profiles: 169 | - debug 170 | environment: 171 | <<: *airflow-common-env 172 | CONNECTION_CHECK_MAX_COUNT: "0" 173 | # Workaround for entrypoint issue. 
--------------------------------------------------------------------------------
/img/DAG.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/DAG.JPG


--------------------------------------------------------------------------------
/img/airflowcelery.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/airflowcelery.png


--------------------------------------------------------------------------------
/img/airflowcelerydags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/airflowcelerydags.png


--------------------------------------------------------------------------------
/img/views.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/img/views.JPG


--------------------------------------------------------------------------------
/jars/postgresql-42.6.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpavanetti/airflow/ac64520c026d4add665a136f4d393c8531e88105/jars/postgresql-42.6.0.jar


--------------------------------------------------------------------------------
/pgadmin.json:
--------------------------------------------------------------------------------
{
  "Servers": {
    "1": {
      "Name": "postgres@email.com",
      "Group": "Servers",
      "Host": "postgres",
      "Port": 5432,
      "MaintenanceDB": "airflow",
      "Username": "airflow",
      "SSLMode": "prefer",
      "PassFile": "/pgpassfile"
    }
  }
}


--------------------------------------------------------------------------------
/spark/spark_postgres_query.py:
--------------------------------------------------------------------------------
from airflow.models import Variable
from pyspark.sql import SparkSession

if __name__ == '__main__':
    spark_dir = Variable.get("spark_dir")

    # Start Spark Session
    spark = (SparkSession
             .builder
             .master("local[1]")
             .appName("postgres_query")
             .getOrCreate())

    # JDBC reader pointed at the stack's Postgres service
    pg_read = (spark.read
               .format("jdbc")
               .option("url", "jdbc:postgresql://postgres:5432/airflow")
               .option("user", "airflow")
               .option("password", "airflow")
               .option("driver", "org.postgresql.Driver"))

    # Query the first aggregated view and print a sample plus the schema
    dataset_df = (pg_read
                  .option("query", "SELECT * FROM VW_DATASET_1")
                  .load())

    dataset_df.show(15, False)
    dataset_df.printSchema()
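Both Spark scripts read Airflow Variables, so they are meant to be launched through Airflow rather than with a bare `spark-submit`. The DAGs in `dags/` (not shown in this section) presumably hand `spark_postgres_query.py` to the `SparkSubmitOperator` together with the JDBC driver mounted under `/opt/airflow/jars`. A hedged sketch of such a task, using only the `spark_default` connection and the `spark_dir`/`jars_dir` variables shipped in `variables/`, could look like this:

```
# Hedged sketch of a submit task; the dag_id and schedule are illustrative,
# the real task definitions live in the repository's dags/ folder.
from datetime import datetime

from airflow import DAG
from airflow.models import Variable
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator

with DAG(
    dag_id="spark_postgres_query_example",   # hypothetical DAG id
    start_date=datetime(2023, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    query_postgres = SparkSubmitOperator(
        task_id="spark_postgres_query",
        conn_id="spark_default",                                    # from airflow_connections.json
        application=f"{Variable.get('spark_dir')}spark_postgres_query.py",
        jars=f"{Variable.get('jars_dir')}postgresql-42.6.0.jar",    # driver shipped under /jars
        verbose=False,
    )
```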
--------------------------------------------------------------------------------
/spark/spark_weather_data.py:
--------------------------------------------------------------------------------
# Airflow Imports
from airflow.models import Variable

# Spark Imports
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import col, explode, element_at, expr, unix_timestamp, to_timestamp, to_date, regexp_replace

if __name__ == '__main__':
    tmp_data_dir = Variable.get("weather_data_tmp_directory")

    # Start Spark Session
    spark = (SparkSession
             .builder
             .master("local[2]")
             .appName("weather_data")
             .getOrCreate())

    # Read Data From Weather folder
    df = spark.read.format("json") \
        .option('inferSchema', True) \
        .load(f'{tmp_data_dir}weather/') \
        .drop("timezone_offset")

    # Persist Data (MEMORY_AND_DISK)
    df.persist()

    # Add and process columns for df_hourly
    df_hourly = df.withColumn('hourly', explode(col('hourly'))) \
        .withColumn("datetime", to_timestamp(expr("hourly.dt"))) \
        .withColumn("temp", expr("hourly.temp")) \
        .withColumn("feels_like", expr("hourly.feels_like")) \
        .withColumn("pressure", expr("hourly.pressure")) \
        .withColumn("humidity", expr("hourly.humidity")) \
        .withColumn("dew_point", expr("hourly.dew_point")) \
        .withColumn("uvi", expr("hourly.uvi")) \
        .withColumn("clouds", expr("hourly.clouds")) \
        .withColumn("visibility", expr("hourly.visibility")) \
        .withColumn("wind_speed", expr("hourly.wind_speed")) \
        .withColumn("wind_deg", expr("hourly.wind_deg")) \
        .withColumn("wind_gust", expr("hourly.wind_gust")) \
        .withColumn("weather_id", expr("hourly.weather.id")) \
        .withColumn("weather_id", element_at(col("weather_id"), 1)) \
        .withColumn("weather_main", expr("hourly.weather.main")) \
        .withColumn("weather_main", element_at(col("weather_main"), 1)) \
        .withColumn("weather_description", expr("hourly.weather.description")) \
        .withColumn("weather_description", element_at(col("weather_description"), 1)) \
        .withColumn("weather_icon", expr("hourly.weather.icon")) \
        .withColumn("weather_icon", element_at(col("weather_icon"), 1)) \
        .withColumnRenamed('lat', 'latitude') \
        .withColumnRenamed('lon', 'longitude') \
        .drop("hourly", "current") \
        .coalesce(1)

    # Add and process columns for df_current
    df_current = df.withColumn("datetime", to_timestamp(expr("current.dt"))) \
        .withColumn("sunrise", to_timestamp(expr("current.sunrise"))) \
        .withColumn("sunset", to_timestamp(expr("current.sunset"))) \
        .withColumn("temp", expr("current.temp")) \
        .withColumn("feels_like", expr("current.feels_like")) \
        .withColumn("pressure", expr("current.pressure")) \
        .withColumn("humidity", expr("current.humidity")) \
        .withColumn("dew_point", expr("current.dew_point")) \
        .withColumn("uvi", expr("current.uvi")) \
        .withColumn("clouds", expr("current.clouds")) \
        .withColumn("visibility", expr("current.visibility")) \
        .withColumn("wind_speed", expr("current.wind_speed")) \
        .withColumn("wind_deg", expr("current.wind_deg")) \
        .withColumn("weather_id", expr("current.weather.id")) \
        .withColumn("weather_id", element_at(col("weather_id"), 1)) \
        .withColumn("weather_main", expr("current.weather.main")) \
        .withColumn("weather_main", element_at(col("weather_main"), 1)) \
        .withColumn("weather_description", expr("current.weather.description")) \
        .withColumn("weather_description", element_at(col("weather_description"), 1)) \
        .withColumn("weather_icon", expr("current.weather.icon")) \
        .withColumn("weather_icon", element_at(col("weather_icon"), 1)) \
        .withColumnRenamed('lat', 'latitude') \
        .withColumnRenamed('lon', 'longitude') \
        .drop("hourly", "current") \
        .coalesce(1)

    # Write df_current as a single headerless CSV
    df_current.write \
        .format('csv') \
        .mode('overwrite') \
        .option('header', False) \
        .option('sep', ',') \
        .save(f'{tmp_data_dir}processed/current_weather/')

    # df_current.show(10)

    # Write df_hourly as a single headerless CSV
    df_hourly.write \
        .format('csv') \
        .mode('overwrite') \
        .option('header', False) \
        .option('sep', ',') \
        .save(f'{tmp_data_dir}processed/hourly_weather/')

    # df_hourly.show(10)
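`spark_weather_data.py` writes headerless, comma-separated CSVs under `{tmp_data_dir}processed/`, so the Postgres ingestion step has to supply the table and column definitions itself. How the repository actually loads the files lives in the DAGs, but a rough sketch of copying one part file through the `postgres_default` connection could look like the following (the `current_weather` table name is illustrative):

```
# Hedged sketch; "current_weather" and its implied columns are illustrative,
# the repository's own ingestion is defined in the DAG's Postgres tasks.
import glob

from airflow.providers.postgres.hooks.postgres import PostgresHook

# Spark's coalesce(1) leaves a single part file per output folder.
csv_part = glob.glob("/opt/airflow/data/tmp/processed/current_weather/part-*.csv")[0]

hook = PostgresHook(postgres_conn_id="postgres_default")
hook.copy_expert(
    sql="COPY current_weather FROM STDIN WITH (FORMAT csv, DELIMITER ',')",
    filename=csv_part,
)
```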
--------------------------------------------------------------------------------
/variables/airflow_connections.json:
--------------------------------------------------------------------------------
{
  "postgres_default": {
    "conn_type": "postgres",
    "host": "postgres",
    "login": "airflow",
    "password": "airflow",
    "schema": "airflow",
    "port": "5432",
    "extra": null
  },

  "openweathermapApi": {
    "conn_type": "HTTP",
    "host": "http://api.openweathermap.org/",
    "login": null,
    "password": "FILL UP WITH YOUR TOKEN",
    "schema": null,
    "port": null,
    "extra": null
  },

  "spark_default": {
    "conn_type": "Spark",
    "host": "spark://spark",
    "login": null,
    "password": null,
    "schema": null,
    "port": "7077",
    "extra": null
  },

  "databricks_default": {
    "conn_type": "databricks",
    "host": "https://ACCOUNTID.azuredatabricks.net/",
    "login": "token",
    "password": "FILL UP WITH YOUR TOKEN",
    "schema": null,
    "port": null,
    "extra": null
  }
}


--------------------------------------------------------------------------------
/variables/airflow_variables.json:
--------------------------------------------------------------------------------
{
  "weather_data_lat": "",
  "weather_data_lon": "",
  "weather_data_endpoint": "data/2.5/onecall/timemachine",
  "weather_data_units": "metric",
  "weather_data_tmp_directory": "/opt/airflow/data/tmp/",
  "weather_data_spark_code": "/opt/airflow/spark/spark_weather_data.py",
  "spark_dir": "/opt/airflow/spark/",
  "jars_dir": "/opt/airflow/jars/"
}
--------------------------------------------------------------------------------
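The variables above parameterize the extraction: the endpoint, units, and the temporary directory that `spark_weather_data.py` later reads from, while the API token sits in the `openweathermapApi` connection. The production logic lives in `dags/weather_data.py`, but one extraction call presumably looks roughly like this (coordinates, timestamp, and output file name are placeholders; parameter names follow the OpenWeatherMap timemachine API):

```
# Rough sketch of a single extraction call; coordinates, timestamp and output
# file name are placeholders, the production code lives in dags/weather_data.py.
import json
import time

import requests
from airflow.hooks.base import BaseHook
from airflow.models import Variable

conn = BaseHook.get_connection("openweathermapApi")
url = f"{conn.host}{Variable.get('weather_data_endpoint')}"

params = {
    "lat": -23.55,                           # placeholder location
    "lon": -46.63,
    "dt": int(time.time()) - 86400,          # one day back, in epoch seconds
    "units": Variable.get("weather_data_units"),
    "appid": conn.password,                  # token stored in the connection
}

response = requests.get(url, params=params, timeout=30)
response.raise_for_status()

tmp_data_dir = Variable.get("weather_data_tmp_directory")
with open(f"{tmp_data_dir}weather/sample.json", "w") as out:   # folder later read by spark_weather_data.py
    json.dump(response.json(), out)
```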