├── requirements.txt
├── dags
│   └── my_dag.py
├── dag_2.py
├── ML.py
└── README.md
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas~=2.0.0
scikit-learn~=1.2.2
apache-airflow~=2.4.3
--------------------------------------------------------------------------------
/dags/my_dag.py:
--------------------------------------------------------------------------------
from airflow import DAG
from airflow.operators.python import PythonOperator, BranchPythonOperator
from airflow.operators.bash import BashOperator

from random import randint
from datetime import datetime


def _choose_best_model(ti):
    # Pull the accuracy reported by each training task via XCom and return
    # the task_id of the branch to follow.
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A',
        'training_model_B',
        'training_model_C'
    ])
    best_accuracy = max(accuracies)
    if best_accuracy > 8:
        return 'accurate'
    return 'inaccurate'


def _training_model():
    # Stand-in for a real training job: report a random "accuracy" score.
    return randint(1, 10)


with DAG("my_dag", start_date=datetime(2021, 1, 1),
         schedule_interval="@daily", catchup=False) as dag:

    training_model_A = PythonOperator(
        task_id="training_model_A",
        python_callable=_training_model
    )

    training_model_B = PythonOperator(
        task_id="training_model_B",
        python_callable=_training_model
    )

    training_model_C = PythonOperator(
        task_id="training_model_C",
        python_callable=_training_model
    )

    choose_best_model = BranchPythonOperator(
        task_id="choose_best_model",
        python_callable=_choose_best_model
    )

    accurate = BashOperator(
        task_id="accurate",
        bash_command="echo 'accurate'"
    )

    inaccurate = BashOperator(
        task_id="inaccurate",
        bash_command="echo 'inaccurate'"
    )

    [training_model_A, training_model_B, training_model_C] >> choose_best_model >> [accurate, inaccurate]
--------------------------------------------------------------------------------
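A note on how the branch is chosen above: `xcom_pull` with a list of task ids returns the values the three training tasks returned, and the callable must return the task_id of the downstream task to follow. The small snippet below is illustrative only and not part of the DAG file; `FakeTI` and `choose_branch` are made-up names standing in for Airflow's task instance and the DAG's callable.

class FakeTI:
    def __init__(self, scores):
        self.scores = scores

    def xcom_pull(self, task_ids):
        # Airflow returns one pulled value per requested task id, in order.
        return list(self.scores)


def choose_branch(ti):
    # Same rule as _choose_best_model in dags/my_dag.py: the returned string
    # must match the task_id of the branch to follow.
    accuracies = ti.xcom_pull(task_ids=[
        'training_model_A', 'training_model_B', 'training_model_C'])
    return 'accurate' if max(accuracies) > 8 else 'inaccurate'


print(choose_branch(FakeTI([9, 4, 6])))   # -> accurate
print(choose_branch(FakeTI([3, 4, 6])))   # -> inaccurate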
/dag_2.py:
--------------------------------------------------------------------------------
import os
from datetime import datetime, timedelta

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from airflow import DAG
from airflow.operators.python import PythonOperator


# Define the kmeans_clustering() function
def kmeans_clustering():
    # Load the dataset
    df = pd.read_csv('/Users/erictak/PycharmProjects/Airflow/tracklist.csv')

    # Preprocessing for k-means: keep the audio features, replace zeros and
    # fill missing values with the column mean
    df_cl = df[['tempo', 'loudness', 'danceability', 'energy', 'key', 'mode',
                'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence']]
    df_cl = df_cl.replace(0, 0.1)
    df_cl = df_cl.fillna(df_cl.mean())

    # Standardization
    std_scaler = StandardScaler()
    df_scaled = std_scaler.fit_transform(df_cl)

    # K-means clustering; attach the cluster labels to the original dataframe
    model = KMeans(n_clusters=10, random_state=42)
    model.fit(df_scaled)
    df = df.assign(KMeans=model.labels_)

    # Cluster label to categorical
    df['KMeans'] = df['KMeans'].astype('category')

    # Save the dataframe to csv
    save_path = os.path.join('/Users/erictak/airflow', 'tracklist_kmeans.csv')
    df.to_csv(save_path, index=False)

    print(df.head())
    return save_path  # returned value is pushed to XCom by the PythonOperator


# Define the default_args for the DAG
default_args = {
    'owner': 'your_name',  # Replace with your name
    'start_date': datetime(2023, 4, 11),  # Replace with the start date of your DAG
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Instantiate the DAG with the default_args
dag = DAG(
    'dag_2_kmeans',  # Replace with the name of your DAG
    default_args=default_args,
    schedule_interval='@hourly',  # Replace with the desired schedule interval for your DAG
    catchup=False,  # avoid backfilling every hourly run since the start date
)

# Define the PythonOperator to run the kmeans_clustering() function
kmeans_task = PythonOperator(
    task_id='kmeans_clustering_task',  # Replace with the name of the task
    python_callable=kmeans_clustering,  # Replace with the actual name of your function
    dag=dag,
)
--------------------------------------------------------------------------------
/ML.py:
--------------------------------------------------------------------------------
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta


# function to run the kmeans clustering
def kmeans_clustering(file_path):
    # Load the dataset
    df = pd.read_csv(file_path)

    # Preprocessing for k-means: keep the audio features, replace zeros and
    # fill missing values with the column mean
    df_cl = df[['tempo', 'loudness', 'danceability', 'energy', 'key', 'mode',
                'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence']]
    df_cl = df_cl.replace(0, 0.1)
    df_cl = df_cl.fillna(df_cl.mean())

    # Log transformation followed by min-max scaling (alternative
    # preprocessing; not used by the clustering below)
    df_log = np.log(df_cl)
    scaler = MinMaxScaler()
    df_scaled_positive = scaler.fit_transform(df_log)

    # Standardization (used for the clustering)
    std_scaler = StandardScaler()
    df_scaled = std_scaler.fit_transform(df_cl)

    # K-means clustering; attach the cluster labels to the original dataframe
    model = KMeans(n_clusters=10, random_state=42)
    model.fit(df_scaled)
    df = df.assign(KMeans=model.labels_)

    # Cluster label to categorical
    df['KMeans'] = df['KMeans'].astype('category')

    # Save the dataframe to csv
    df.to_csv('tracklist_kmeans.csv', index=False)

    return df

# kmeans_clustering('tracklist.csv')


# Define the default_args for the DAG
default_args = {
    'owner': 'your_name',  # Replace with your name
    'start_date': datetime(2023, 1, 1),  # Replace with the start date of your DAG
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Instantiate the DAG with the default_args
dag = DAG(
    'kmeans_clustering_dag',  # Replace with the name of your DAG
    default_args=default_args,
    schedule_interval='@hourly',  # Replace with the desired schedule interval for your DAG
    catchup=False,  # avoid backfilling every hourly run since the start date
)

# Define the PythonOperator to run the kmeans_clustering() function
kmeans_task = PythonOperator(
    task_id='kmeans_clustering_task',  # Replace with the name of the task
    python_callable=kmeans_clustering,  # Replace with the actual name of your function
    op_args=['/Users/erictak/PycharmProjects/Airflow/tracklist.csv'],  # Replace with the argument(s) to pass to your function
    dag=dag,
)
--------------------------------------------------------------------------------
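As a quick, illustrative sanity check of the preprocessing and clustering sequence used in dag_2.py and ML.py, the standalone snippet below runs the same scale-then-cluster steps on a small synthetic frame with the same feature columns; the synthetic data and the reduced n_clusters are assumptions for the example, not part of the assignment code.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for tracklist.csv: 50 random rows with the same columns.
features = ['tempo', 'loudness', 'danceability', 'energy', 'key', 'mode',
            'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random((50, len(features))), columns=features)

# Same sequence as the DAG task: standardize, cluster, attach labels.
scaled = StandardScaler().fit_transform(df)
labels = KMeans(n_clusters=3, random_state=42, n_init=10).fit_predict(scaled)
df = df.assign(KMeans=pd.Categorical(labels))
print(df['KMeans'].value_counts())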
/README.md:
--------------------------------------------------------------------------------
# Final assignment Big Data Infrastructure

## By Ruben Tak

### Assignment:

Create an Airflow pipeline for an end-to-end ML workflow:

* Get some data into bronze (S3) (raw CSV/JSON...)
* Join it with other data and save it in silver (S3) as Parquet, with the splits and format needed for training a model
* Execute a training job with the data
* Track parameters and metrics (e.g. in logs)
* Deploy the model to production as an endpoint/API container, or execute a batch prediction on some data
* TIP: Don't reinvent the wheel; the model does not need to be complex. You can use pre-created ones.

You can run it in AWS, on your local PC, or hybrid.

##### Deliver

* Architecture documentation (2-3 pages)
* Purpose and explanation of the solution and its components
* Architecture diagram
* All the code
* Proof of the things running (pictures)


### What is Airflow?

Apache Airflow is an open-source platform, written in Python, for orchestrating complex workflows and data pipelines. It provides a framework for defining, scheduling, and monitoring tasks as directed acyclic graphs (DAGs), which lets users specify how tasks are organized and executed. Airflow is often used in data engineering and data science to automate workflows with multiple steps or dependencies, such as data ingestion, processing, and transformation.

Airflow provides a web-based user interface for visualizing and managing workflows, along with a rich set of operators and sensors for defining tasks and their dependencies. It also supports advanced features such as dynamic task generation, retries, and error handling, making it a powerful tool for managing complex data workflows.

### Steps:


#### Step 1: Set up Airflow

Install Apache Airflow on your local PC or on an AWS EC2 instance.
Create an Airflow DAG (Directed Acyclic Graph) to define the workflow.

#### Step 2: Data Ingestion

Use Airflow to trigger a data ingestion task that retrieves the raw data from S3 (bronze).
Perform any necessary data cleaning and preprocessing.
Save the cleaned data to a new location in S3 (silver) in Parquet format, along with the splits and format needed for model training.

#### Step 3: Model Training

Create a training job using a machine learning library or framework of your choice (e.g., scikit-learn, TensorFlow, PyTorch) to train a model using the cleaned data in S3 (silver).
Log relevant parameters and metrics (e.g., hyperparameters, loss, accuracy) during the training process using a logging library (e.g., MLflow, TensorBoard).

#### Step 4: Model Deployment

Deploy the trained model to production as an endpoint or API container using a containerization tool like Docker.
Alternatively, if you want to execute batch predictions on some data, use Airflow to trigger a batch prediction task that uses the trained model to make predictions on new data.
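If the endpoint route is chosen, a minimal serving sketch could look like the following. This is illustrative only: it assumes the fitted scaler and KMeans model are pickled during training (the current training code does not do this yet), it uses Flask (not in the current requirements.txt), and the file names are placeholders.

```python
import pickle

from flask import Flask, jsonify, request

app = Flask(__name__)

# Hypothetical artifacts saved during training; the training code would need
# to pickle the fitted scaler and model under these names first.
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('kmeans_model.pkl', 'rb') as f:
    model = pickle.load(f)


@app.route('/predict', methods=['POST'])
def predict():
    # Expects a JSON list of feature rows in the same column order used for
    # training, e.g. [[120.0, -7.3, 0.8, ...], ...]
    rows = request.get_json()
    labels = model.predict(scaler.transform(rows))
    return jsonify({'clusters': [int(label) for label in labels]})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
```

Packaged with a small Dockerfile, this script would be the container entrypoint; for the batch-prediction alternative, the same load-transform-predict steps could instead run inside a PythonOperator.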
#### Step 5: Monitoring and Logging

Use Airflow to set up monitoring tasks that periodically check the status of the pipeline components (e.g., data ingestion, model training, model deployment); a minimal sketch of such a check is included in the appendix at the end of this README.
Use logging libraries or tools (e.g., ELK stack, CloudWatch, Splunk) to capture and analyze logs from the pipeline components for troubleshooting, auditing, and performance monitoring purposes.

#### Step 6: Documentation and Proof of Execution

Create architecture documentation that includes the purpose and explanation of the solution and its components, along with an architecture diagram that illustrates the flow of data and tasks in the pipeline.
Include all the code used in the pipeline, including the Airflow DAG definition, data ingestion, model training, and model deployment code.
Provide proof of execution, such as screenshots or output logs, to demonstrate that the pipeline is running and producing the expected results.

Note: Depending on your specific use case and environment (AWS, local PC, hybrid), you may need to configure additional components such as AWS S3, AWS SageMaker, or Docker in your pipeline.

Once you have completed the above steps, you will have a functional Airflow pipeline for an end-to-end ML workflow, including data ingestion, model training, model deployment, and monitoring.

### Steps so far:

- Created a k-means ML function
- Defined the DAG in Airflow
- Ran the DAG in Airflow

## References:

What is a DAG?
https://www.youtube.com/watch?v=1Yh5S-S6wsI
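#### Appendix: example monitoring task (sketch)

A minimal, illustrative health-check DAG for Step 5, assuming the clustering output is written to a local CSV as in `dag_2.py`; the output path and freshness threshold below are placeholders rather than values used elsewhere in this repository.

```python
import os
import time
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

OUTPUT_PATH = '/Users/erictak/airflow/tracklist_kmeans.csv'  # placeholder path
MAX_AGE_HOURS = 2  # placeholder freshness threshold


def check_output_freshness():
    # Fail the task if the clustering output is missing or stale, so the
    # failure shows up in the Airflow UI and can trigger alerting.
    if not os.path.exists(OUTPUT_PATH):
        raise FileNotFoundError(f'Expected output missing: {OUTPUT_PATH}')
    age_hours = (time.time() - os.path.getmtime(OUTPUT_PATH)) / 3600
    if age_hours > MAX_AGE_HOURS:
        raise ValueError(f'Output is stale ({age_hours:.1f} hours old)')
    print(f'Output OK ({age_hours:.1f} hours old)')


with DAG('pipeline_health_check',
         start_date=datetime(2023, 4, 11),
         schedule_interval='@hourly',
         catchup=False) as dag:

    PythonOperator(
        task_id='check_output_freshness',
        python_callable=check_output_freshness,
    )
```
--------------------------------------------------------------------------------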