├── 01-data-transformation ├── data-cleaning.py ├── data-exploration.py ├── data-transformation.py └── mockdata.py ├── 02-kafka ├── 01-api-setup │ ├── main.py │ └── readme.md ├── 02-kafka-producer-consumer │ ├── main.py │ └── readme.md └── 03-api-to-kafka │ ├── main.py │ └── readme.md ├── 03-mlflow ├── example-mlflow.py ├── requirements.txt └── validation.py ├── 04-bentoml ├── __pycache__ │ ├── model_service_v1.cpython-312.pyc │ ├── model_service_v2.cpython-312.pyc │ └── model_service_v3.cpython-312.pyc ├── model_service_v1.py ├── model_service_v2.py ├── model_service_v3.py ├── model_train_v1.py ├── model_train_v2.py └── readme.md ├── 05-project ├── flask_app.py ├── isolation_model.py ├── readme.md ├── register_model.py ├── requirements.txt ├── service.py ├── synthetic_health_claims.py ├── templates │ ├── index.html │ ├── result.html │ └── visualize.html ├── test_claim.py └── v2_app.py ├── 06-orchestration └── iot_dag.py └── readme.md /01-data-transformation/data-cleaning.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | # Load the CSV file into a DataFrame 5 | df = pd.read_csv("mock_data.csv") 6 | 7 | # Fill missing values in 'age' and 'salary' with the median 8 | df['age'].fillna(df['age'].median(), inplace=True) 9 | df['salary'].fillna(df['salary'].median(), inplace=True) 10 | 11 | # Fill missing values in 'department' with 'Unknown' 12 | df['department'].fillna('Unknown', inplace=True) 13 | 14 | # Print sample data after handling missing values 15 | print("Sample data after filling missing values:") 16 | print(df.head(), "\n") 17 | 18 | # Convert 'profile' from JSON string to dictionary 19 | df['profile'] = df['profile'].apply(lambda x: json.loads(x) if pd.notnull(x) else {}) 20 | 21 | # Print sample data after converting 'profile' column 22 | print("Sample data after converting 'profile' column:") 23 | print(df[['profile']].head(), "\n") 24 | 25 | # Extract 'address', 'phone', and 'email' from 'profile' column 26 | df['address'] = df['profile'].apply(lambda x: x.get('address', None)) 27 | df['phone'] = df['profile'].apply(lambda x: x.get('phone', None)) 28 | df['email'] = df['profile'].apply(lambda x: x.get('email', None)) 29 | 30 | # Print sample data after extracting fields from 'profile' 31 | print("Sample data after extracting fields from 'profile':") 32 | print(df[['address', 'phone', 'email']].head(), "\n") 33 | 34 | # Drop the original 'profile' column 35 | df.drop(columns=['profile'], inplace=True) 36 | 37 | # Print sample data after dropping 'profile' column 38 | print("Sample data after dropping 'profile' column:") 39 | print(df.head(), "\n") 40 | 41 | # Save the cleaned DataFrame to a new CSV file 42 | df.to_csv("cleaned_data.csv", index=False) 43 | 44 | # Confirm data has been saved 45 | print("Cleaned data saved to 'cleaned_data.csv'") 46 | 47 | -------------------------------------------------------------------------------- /01-data-transformation/data-exploration.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Load the CSV file into a DataFrame 4 | df = pd.read_csv("mock_data.csv") 5 | 6 | # Display the first few rows of the DataFrame 7 | df.head() 8 | 9 | # Get a summary of the DataFrame 10 | df.info() 11 | 12 | # Check for missing values 13 | df.isnull().sum() 14 | 15 | # View statistical summary for numeric columns 16 | df.describe(include='all') 17 | 18 | # Check unique values in the 'department' column 19 | 
df['department'].unique() 20 | 21 | -------------------------------------------------------------------------------- /01-data-transformation/data-transformation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | 5 | # Load the CSV file into a DataFrame 6 | df = pd.read_csv("cleaned_data.csv") 7 | 8 | 9 | # Add a new column 'address_length' that calculates the length of the address 10 | df['address_length'] = df['address'].apply(lambda x: len(str(x))) 11 | 12 | # Print sample data after adding 'address_length' column 13 | print("Sample data after adding 'address_length' column:") 14 | print(df[['address', 'address_length']].head(), "\n") 15 | 16 | # Define the bins and labels 17 | bins = [0, 50000, 70000, 100000] 18 | labels = ['low', 'medium', 'high'] 19 | 20 | # Create a new column 'salary_category' 21 | df['salary_category'] = pd.cut(df['salary'], bins=bins, labels=labels, include_lowest=True) 22 | 23 | # Print sample data after adding 'salary_category' column 24 | print("Sample data after adding 'salary_category' column:") 25 | print(df[['salary', 'salary_category']].head(), "\n") 26 | 27 | # Group by 'department' and calculate average salary and age 28 | summary_report = df.groupby('department').agg({ 29 | 'salary': 'mean', 30 | 'age': 'mean' 31 | }).reset_index() 32 | 33 | # Rename columns for clarity 34 | summary_report.columns = ['Department', 'Average Salary', 'Average Age'] 35 | 36 | # Print the summary report 37 | print("Summary report of average salary and age by department:") 38 | print(summary_report, "\n") 39 | 40 | # Save the final transformed DataFrame to a new CSV file 41 | df.to_csv("transformed_data.csv", index=False) 42 | 43 | # Confirm data has been saved 44 | print("Final transformed data saved to 'transformed_data.csv'") 45 | -------------------------------------------------------------------------------- /01-data-transformation/mockdata.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | import random 5 | from datetime import datetime, timedelta 6 | 7 | # Set random seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Number of records 11 | num_records = 20000 12 | 13 | # Generate random data 14 | data = { 15 | "id": np.arange(1, num_records + 1), 16 | "name": [f"Name_{i}" for i in np.random.randint(1, 1000, num_records)], 17 | "age": np.random.randint(18, 80, num_records), 18 | "salary": np.random.choice([50000, 60000, 70000, None], num_records), 19 | "hire_date": [ 20 | (datetime.now() - timedelta(days=random.randint(0, 3650))).strftime("%Y-%m-%d") 21 | if random.random() > 0.1 else None 22 | for _ in range(num_records) 23 | ], 24 | "profile": [ 25 | json.dumps({ 26 | "address": f"Street {random.randint(1, 100)}, City {random.randint(1, 50)}", 27 | "phone": f"{random.randint(1000000000, 9999999999)}", 28 | "email": f"email_{random.randint(1, 1000)}@example.com" 29 | }) 30 | if random.random() > 0.1 else None 31 | for _ in range(num_records) 32 | ], 33 | "department": np.random.choice(["HR", "IT", "Finance", "Marketing", None], num_records), 34 | "bonus": [None if random.random() > 0.9 else random.randint(1000, 10000) for _ in range(num_records)] 35 | } 36 | 37 | # Create DataFrame 38 | df = pd.DataFrame(data) 39 | 40 | # Introduce some NaN values randomly 41 | df.loc[np.random.choice(df.index, size=int(num_records * 0.05), replace=False), "age"] = np.nan 42 | 
df.loc[np.random.choice(df.index, size=int(num_records * 0.1), replace=False), "salary"] = np.nan 43 | 44 | # Save to CSV 45 | df.to_csv("mock_data.csv", index=False) 46 | -------------------------------------------------------------------------------- /02-kafka/01-api-setup/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import random 3 | import string 4 | import json 5 | from datetime import datetime 6 | import uuid 7 | 8 | app = FastAPI() 9 | 10 | # Sample data 11 | products = ['Laptop', 'Smartphone', 'Headphones', 'Tablet', 'Smartwatch'] 12 | cities = ['New York', 'London', 'Tokyo', 'Paris', 'Sydney', 'Berlin', 'Singapore', 'Dubai', 'Toronto', 'Mumbai'] 13 | 14 | def generate_random_order(): 15 | return { 16 | "order_id": str(uuid.uuid4()), 17 | "product": random.choice(products), 18 | "quantity": random.randint(1, 5), 19 | "price": round(random.uniform(50.00, 1000.00), 2), 20 | "customer_location": random.choice(cities), 21 | "timestamp": datetime.now().isoformat() 22 | } 23 | 24 | @app.get("/generate-orders") 25 | def generate_orders(): 26 | orders = {} 27 | for i in range(50): 28 | order_key = f"order_{i+1}" 29 | orders[order_key] = generate_random_order() 30 | return orders 31 | -------------------------------------------------------------------------------- /02-kafka/01-api-setup/readme.md: -------------------------------------------------------------------------------- 1 | ## FastAPI Order Generation API 2 | 3 | This project provides a simple FastAPI endpoint to generate mock orders. 4 | 5 | ### Prerequisites 6 | 7 | * **Python 3**: Make sure you have Python 3 installed. You can check by running `python3 --version`. 8 | * **pip**: The Python package manager, `pip`, should also be installed. You can check by running `pip3 --version`. 9 | 10 | ### Installation 11 | 12 | 1. **Update your system and install Python 3 and pip (if not already installed):** 13 | 14 | ```bash 15 | sudo yum update -y 16 | sudo yum install -y python3 17 | sudo yum install -y python3-pip 18 | ``` 19 | 20 | 2. **Install FastAPI and Uvicorn:** 21 | 22 | ```bash 23 | pip3 install fastapi uvicorn 24 | ``` 25 | 26 | ### Running the API 27 | 28 | 1. **Create a `main.py` file with your FastAPI code.** (Refer to the FastAPI documentation for how to structure your `main.py`) 29 | 30 | 2. **Start the Uvicorn server:** 31 | 32 | ```bash 33 | uvicorn main:app --host 0.0.0.0 --port 8000 34 | ``` 35 | 36 | This will make your API accessible at `http://:8000`. 37 | 38 | ### Endpoint 39 | 40 | * **`/generate-orders` (GET)** 41 | 42 | This endpoint generates mock order data. 43 | 44 | ### Testing 45 | 46 | You can test the endpoint using `curl`: 47 | 48 | ```bash 49 | curl -X 'GET' \ 50 | 'http://:8000/generate-orders' \ 51 | -H 'accept: application/json' 52 | ``` 53 | 54 | Replace `` with the actual IP address or hostname where your API is running. 55 | 56 | **Note:** If you are running this on an EC2 instance, make sure to open port 8000 in your security group settings. 
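You can also hit the endpoint from Python. This is a minimal sketch (it assumes the API is reachable at `http://localhost:8000` and that the `requests` package is installed — `pip3 install requests`; adjust the host as needed):

```python
import requests

# Fetch the batch of 50 mock orders generated by the /generate-orders endpoint
response = requests.get("http://localhost:8000/generate-orders", timeout=10)
response.raise_for_status()

orders = response.json()  # dict keyed "order_1" .. "order_50"
print(f"Received {len(orders)} orders")
print(orders["order_1"])  # one order: order_id, product, quantity, price, customer_location, timestamp
```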
57 | 58 | -------------------------------------------------------------------------------- /02-kafka/02-kafka-producer-consumer/main.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer, Consumer, KafkaError, KafkaException 2 | from confluent_kafka.admin import AdminClient, NewTopic 3 | 4 | # Configuration for Kafka connection 5 | kafka_config = { 6 | 'bootstrap.servers': 'localhost:9092', # Replace with your Kafka broker address 7 | } 8 | 9 | # Admin client to create topics 10 | admin_client = AdminClient(kafka_config) 11 | 12 | # Function to create a topic 13 | def create_topic(topic_name, num_partitions=1, replication_factor=1): 14 | topic_list = [NewTopic(topic_name, num_partitions=num_partitions, replication_factor=replication_factor)] 15 | fs = admin_client.create_topics(topic_list) 16 | for topic, f in fs.items(): 17 | try: 18 | f.result() # The result itself is None 19 | print(f"Topic '{topic}' created successfully") 20 | except KafkaException as e: 21 | # If the topic already exists, we skip this step 22 | print(f"Failed to create topic '{topic}': {e}") 23 | 24 | # Function to produce messages to Kafka 25 | def produce_messages(topic_name, messages): 26 | producer = Producer(kafka_config) 27 | 28 | for message in messages: 29 | producer.produce(topic_name, message) 30 | print(f"Produced message: {message}") 31 | 32 | producer.flush() 33 | 34 | # Function to consume messages from Kafka 35 | def consume_messages(topic_name, group_id='my-group'): 36 | consumer_config = { 37 | 'bootstrap.servers': 'localhost:9092', 38 | 'group.id': group_id, 39 | 'auto.offset.reset': 'earliest' 40 | } 41 | consumer = Consumer(consumer_config) 42 | 43 | consumer.subscribe([topic_name]) 44 | print(f"Subscribed to topic '{topic_name}'") 45 | 46 | try: 47 | while True: 48 | msg = consumer.poll(timeout=1.0) 49 | if msg is None: 50 | break 51 | if msg.error(): 52 | if msg.error().code() == KafkaError._PARTITION_EOF: 53 | print('End of partition reached {0}/{1}'.format(msg.topic(), msg.partition())) 54 | elif msg.error(): 55 | raise KafkaException(msg.error()) 56 | else: 57 | print(f"Consumed message: {msg.value().decode('utf-8')}") 58 | finally: 59 | consumer.close() 60 | 61 | # Main function to run the producer and consumer 62 | def main(): 63 | topic_name = 'test-topic' 64 | 65 | # Create topic 66 | create_topic(topic_name) 67 | 68 | # Produce messages 69 | messages_to_produce = ['Hello Kafka', 'Kafka with Python', 'Test message 1', 'Test message 2'] 70 | produce_messages(topic_name, messages_to_produce) 71 | 72 | # Consume messages 73 | print("Consuming messages from topic...") 74 | consume_messages(topic_name) 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /02-kafka/02-kafka-producer-consumer/readme.md: -------------------------------------------------------------------------------- 1 | # Kafka Setup on Ubuntu 2 | 3 | This guide provides step-by-step instructions for setting up a single-node Kafka cluster on an Ubuntu instance. 4 | Machine type ***t2.medium*** 5 | 6 | ## Commands & Explanation 7 | 8 | ```bash 9 | 2 sudo apt update 10 | ``` 11 | 12 | * Updates the package lists for upgrades and new package installations. 13 | 14 | ```bash 15 | 3 sudo apt install default-jdk -y 16 | ``` 17 | 18 | * Installs the default Java Development Kit (JDK), required for running Kafka. The `-y` flag automatically answers "yes" to any prompts. 
19 | 20 | ```bash 21 | 4 java -version 22 | ``` 23 | 24 | * Verifies the installed Java version. 25 | 26 | ```bash 27 | 5 cd /opt 28 | ``` 29 | 30 | * Changes the current working directory to `/opt`, a common location for installing optional software. 31 | 32 | ```bash 33 | 6 sudo wget https://downloads.apache.org/kafka/3.8.0/kafka_2.12-3.8.0.tgz 34 | ``` 35 | 36 | * Downloads the Kafka distribution archive (version 3.8.0) using `wget`. 37 | 38 | ```bash 39 | 7 sudo tar -xvzf kafka_2.12-3.8.0.tgz 40 | ``` 41 | 42 | * Extracts the downloaded Kafka archive. 43 | 44 | ```bash 45 | 8 sudo mv kafka_2.12-3.8.0 /usr/local/kafka 46 | ``` 47 | 48 | * Moves the extracted Kafka directory to `/usr/local/kafka`, a standard location for installing Kafka. 49 | 50 | ```bash 51 | 9 cd /usr/local/kafka 52 | ``` 53 | 54 | * Changes the current working directory to the Kafka installation directory. 55 | 56 | ```bash 57 | 10 sudo nohup bin/zookeeper-server-start.sh config/zookeeper.properties > /tmp/zookeeper.log 2>&1 & 58 | ``` 59 | 60 | * Starts the ZooKeeper server in the background. 61 | * `nohup` ensures the process continues running even if the terminal is closed. 62 | * `>` redirects standard output to `/tmp/zookeeper.log`. 63 | * `2>&1` redirects standard error to the same file as standard output. 64 | * `&` runs the command in the background. 65 | 66 | ```bash 67 | 11 sudo nohup bin/kafka-server-start.sh config/server.properties > /tmp/kafka.log 2>&1 & 68 | ``` 69 | 70 | * Starts the Kafka server in the background, similar to how ZooKeeper was started. 71 | 72 | ```bash 73 | 12 tail -100f /tmp/kafka.log 74 | ``` 75 | 76 | * Displays the last 100 lines of the Kafka log file, useful for monitoring the server startup process. 77 | 78 | ```bash 79 | 13 sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 80 | ``` 81 | 82 | * Lists existing Kafka topics using the local Kafka broker running on `localhost:9092`. 83 | 84 | ```bash 85 | 14 sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server 3.64.165.242:9092 --partitions 1 --replication-factor 1 86 | ``` 87 | 88 | * **Likely incorrect**: Attempts to create a topic named `test-topic` on a remote Kafka broker (specified by the external IP address), which would typically require additional configuration. 89 | 90 | ```bash 91 | 15 sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1 92 | ``` 93 | 94 | * Creates a topic named `test-topic` on the local Kafka broker with one partition and a replication factor of 1. 95 | 96 | ```bash 97 | 16 sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 98 | ``` 99 | 100 | * Lists Kafka topics again to confirm the creation of `test-topic`. 
101 | 102 | ```bash 103 | 17 sudo bin/kafka-console-producer.sh --topic test-topic --bootstrap-server localhost:9092 104 | ``` 105 | 106 | ```bash 107 | sudo apt update 108 | sudo apt install default-jdk -y 109 | java -version 110 | cd /opt 111 | sudo wget https://downloads.apache.org/kafka/3.8.0/kafka_2.12-3.8.0.tgz 112 | sudo tar -xvzf kafka_2.12-3.8.0.tgz 113 | sudo mv kafka_2.12-3.8.0 /usr/local/kafka 114 | cd /usr/local/kafka 115 | sudo nohup bin/zookeeper-server-start.sh config/zookeeper.properties > /tmp/zookeeper.log 2>&1 & 116 | sudo nohup bin/kafka-server-start.sh config/server.properties > /tmp/kafka.log 2>&1 & 117 | tail -100f /tmp/kafka.log 118 | sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 119 | sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server 3.64.165.242:9092 --partitions 1 --replication-factor 1 120 | sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1 121 | sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 122 | sudo bin/kafka-console-producer.sh --topic test-topic --bootstrap-server localhost:9092 123 | history 124 | ``` 125 | 126 | * Install pip and confluent package on ubuntu 127 | 128 | ```bash 129 | apt install python3-pip 130 | apt install python3-confluent-kafka 131 | ``` 132 | -------------------------------------------------------------------------------- /02-kafka/03-api-to-kafka/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from confluent_kafka import Producer, KafkaException, KafkaError 3 | from confluent_kafka.admin import AdminClient, NewTopic 4 | 5 | # Kafka configuration 6 | kafka_config = { 7 | 'bootstrap.servers': 'localhost:9092', # Replace with your Kafka broker address if needed 8 | } 9 | 10 | # Topic name 11 | topic_name = "order-topic" 12 | 13 | # Function to log messages 14 | def log(message): 15 | print(f"[INFO] {message}") 16 | 17 | # Function to create a topic 18 | def create_topic(admin_client, topic_name): 19 | log(f"Creating topic: {topic_name}") 20 | topic_list = [NewTopic(topic_name, num_partitions=1, replication_factor=1)] 21 | fs = admin_client.create_topics(topic_list) 22 | for topic, f in fs.items(): 23 | try: 24 | f.result() # The result itself is None if successful 25 | log(f"Topic '{topic}' created successfully.") 26 | except KafkaException as e: 27 | log(f"Failed to create topic '{topic}': {e}") 28 | 29 | # Function to delete a topic 30 | def delete_topic(admin_client, topic_name): 31 | log(f"Deleting topic: {topic_name}") 32 | fs = admin_client.delete_topics([topic_name]) 33 | for topic, f in fs.items(): 34 | try: 35 | f.result() # The result itself is None if successful 36 | log(f"Topic '{topic}' deleted successfully.") 37 | except KafkaException as e: 38 | log(f"Failed to delete topic '{topic}': {e}") 39 | 40 | # Function to fetch orders from FastAPI 41 | def fetch_orders(): 42 | url = "http://52.59.240.23:8000/generate-orders" 43 | log(f"Fetching orders from {url}") 44 | response = requests.get(url) 45 | 46 | if response.status_code == 200: 47 | log("Orders fetched successfully.") 48 | return response.json() # Assuming the response is a JSON 49 | else: 50 | log(f"Failed to fetch orders. 
Status code: {response.status_code}") 51 | return None 52 | 53 | # Function to produce messages to Kafka 54 | def produce_messages(producer, topic_name, message): 55 | log(f"Producing message to topic: {topic_name}") 56 | producer.produce(topic_name, message) 57 | producer.flush() 58 | log("Message produced successfully.") 59 | 60 | # Main function 61 | def main(): 62 | # Step 1: Setup Kafka Admin Client and Producer 63 | admin_client = AdminClient(kafka_config) 64 | producer = Producer(kafka_config) 65 | 66 | # Step 2: Check if the topic exists, delete if it does 67 | topics = admin_client.list_topics().topics 68 | if topic_name in topics: 69 | delete_topic(admin_client, topic_name) 70 | 71 | # Step 3: Create the Kafka topic 72 | create_topic(admin_client, topic_name) 73 | 74 | # Step 4: Fetch orders from FastAPI 75 | orders = fetch_orders() 76 | 77 | if orders is not None: 78 | # Convert the fetched orders to a string (assuming it's JSON-compatible) 79 | orders_str = str(orders) 80 | 81 | # Step 5: Produce the fetched order to Kafka 82 | produce_messages(producer, topic_name, orders_str) 83 | 84 | log("Script finished after producing message.") 85 | 86 | # Run the main function 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /02-kafka/03-api-to-kafka/readme.md: -------------------------------------------------------------------------------- 1 | ```bash 2 | cd /usr/local/kafka 3 | sudo bin/kafka-console-consumer.sh --topic order-topic --from-beginning --bootstrap-server localhost:9092 4 | ``` -------------------------------------------------------------------------------- /03-mlflow/example-mlflow.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import mlflow.sklearn 3 | from sklearn.datasets import make_regression 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.ensemble import RandomForestRegressor 8 | from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score 9 | import numpy as np 10 | 11 | # Set the MLflow tracking URI to the remote MLflow server 12 | mlflow.set_tracking_uri("http://localhost:5000") 13 | 14 | # Create synthetic data for regression 15 | X, y = make_regression(n_samples=100, n_features=4, noise=0.1, random_state=42) 16 | 17 | # Split the data 18 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 19 | 20 | # Set the experiment name 21 | mlflow.set_experiment("ML Model Experiment") 22 | 23 | def log_model(model, model_name): 24 | with mlflow.start_run(run_name=model_name): 25 | # Train the model 26 | model.fit(X_train, y_train) 27 | 28 | # Make predictions 29 | y_pred = model.predict(X_test) 30 | 31 | # Calculate metrics 32 | mse = mean_squared_error(y_test, y_pred) 33 | rmse = np.sqrt(mse) 34 | mae = mean_absolute_error(y_test, y_pred) 35 | r2 = r2_score(y_test, y_pred) 36 | evs = explained_variance_score(y_test, y_pred) 37 | 38 | # Log metrics 39 | mlflow.log_metric("mse", mse) 40 | mlflow.log_metric("rmse", rmse) 41 | mlflow.log_metric("mae", mae) 42 | mlflow.log_metric("r2", r2) 43 | mlflow.log_metric("explained_variance", evs) 44 | 45 | # Log model 46 | mlflow.sklearn.log_model(model, model_name) 47 | 48 | print(f"{model_name} - MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R2: {r2}, Explained Variance: {evs}") 49 | 50 | # 
Linear Regression Model 51 | linear_model = LinearRegression() 52 | log_model(linear_model, "Linear Regression") 53 | 54 | # Decision Tree Regressor Model 55 | tree_model = DecisionTreeRegressor() 56 | log_model(tree_model, "Decision Tree Regressor") 57 | 58 | # Random Forest Regressor Model 59 | forest_model = RandomForestRegressor() 60 | log_model(forest_model, "Random Forest Regressor") 61 | 62 | print("Experiment completed! Check the MLflow server for details.") 63 | -------------------------------------------------------------------------------- /03-mlflow/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | scikit-learn -------------------------------------------------------------------------------- /03-mlflow/validation.py: -------------------------------------------------------------------------------- 1 | from mlflow.tracking import MlflowClient 2 | 3 | client = MlflowClient() 4 | for rm in client.search_registered_models(): 5 | print(f"Model name: {rm.name}") 6 | -------------------------------------------------------------------------------- /04-bentoml/__pycache__/model_service_v1.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kodekloudhub/Fundamentals-of-MLOps/71bf519079f4eade480731c13c01dd1e3d489a83/04-bentoml/__pycache__/model_service_v1.cpython-312.pyc -------------------------------------------------------------------------------- /04-bentoml/__pycache__/model_service_v2.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kodekloudhub/Fundamentals-of-MLOps/71bf519079f4eade480731c13c01dd1e3d489a83/04-bentoml/__pycache__/model_service_v2.cpython-312.pyc -------------------------------------------------------------------------------- /04-bentoml/__pycache__/model_service_v3.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kodekloudhub/Fundamentals-of-MLOps/71bf519079f4eade480731c13c01dd1e3d489a83/04-bentoml/__pycache__/model_service_v3.cpython-312.pyc -------------------------------------------------------------------------------- /04-bentoml/model_service_v1.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON 3 | from pydantic import BaseModel 4 | 5 | # Load the model 6 | model_ref = bentoml.sklearn.get("house_price_model:latest") 7 | model_runner = model_ref.to_runner() 8 | 9 | # Define the service 10 | svc = bentoml.Service("house_price_predictor", runners=[model_runner]) 11 | 12 | # Input schema 13 | class HouseInput(BaseModel): 14 | square_footage: float 15 | num_rooms: int 16 | 17 | # API for prediction 18 | @svc.api(input=JSON(pydantic_model=HouseInput), output=JSON()) 19 | async def predict_house_price(data: HouseInput): 20 | input_data = [[data.square_footage, data.num_rooms]] 21 | prediction = await model_runner.predict.async_run(input_data) 22 | return {"predicted_price": prediction[0]} 23 | -------------------------------------------------------------------------------- /04-bentoml/model_service_v2.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON 3 | from pydantic import BaseModel 4 | 5 | # Load the model 6 | model_ref = bentoml.sklearn.get("house_price_model_v2:latest") 7 | model_runner = 
model_ref.to_runner() 8 | 9 | # Define the service 10 | svc = bentoml.Service("house_price_predictor_v2", runners=[model_runner]) 11 | 12 | # Input schema 13 | class HouseInput(BaseModel): 14 | square_footage: float 15 | num_rooms: int 16 | num_bathrooms: int 17 | house_age: int 18 | distance_to_city_center: float 19 | has_garage: int 20 | has_garden: int 21 | crime_rate: float 22 | avg_school_rating: float 23 | country: str 24 | 25 | # API for prediction 26 | @svc.api(input=JSON(pydantic_model=HouseInput), output=JSON()) 27 | async def predict_house_price(data: HouseInput): 28 | # One-hot encoding for the country 29 | country_encoded = [0, 0, 0] # Default for ['Canada', 'Germany', 'UK'] 30 | if data.country == "Canada": 31 | country_encoded[0] = 1 32 | elif data.country == "Germany": 33 | country_encoded[1] = 1 34 | elif data.country == "UK": 35 | country_encoded[2] = 1 36 | 37 | input_data = [[ 38 | data.square_footage, data.num_rooms, data.num_bathrooms, data.house_age, 39 | data.distance_to_city_center, data.has_garage, data.has_garden, 40 | data.crime_rate, data.avg_school_rating 41 | ] + country_encoded] 42 | 43 | prediction = await model_runner.predict.async_run(input_data) 44 | return {"predicted_price": prediction[0]} 45 | -------------------------------------------------------------------------------- /04-bentoml/model_service_v3.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON 3 | from pydantic import BaseModel 4 | 5 | # Load the v1 and v2 models 6 | model_v1_ref = bentoml.sklearn.get("house_price_model:latest") 7 | model_v2_ref = bentoml.sklearn.get("house_price_model_v2:latest") 8 | model_v1_runner = model_v1_ref.to_runner() 9 | model_v2_runner = model_v2_ref.to_runner() 10 | 11 | # Define the service with both runners 12 | svc = bentoml.Service("house_price_predictor", runners=[model_v1_runner, model_v2_runner]) 13 | 14 | # Input schema for V1 (simpler model) 15 | class HouseInputV1(BaseModel): 16 | square_footage: float 17 | num_rooms: int 18 | 19 | # Input schema for V2 (expanded model) 20 | class HouseInputV2(BaseModel): 21 | square_footage: float 22 | num_rooms: int 23 | num_bathrooms: int 24 | house_age: int 25 | distance_to_city_center: float 26 | has_garage: int 27 | has_garden: int 28 | crime_rate: float 29 | avg_school_rating: float 30 | country: str 31 | 32 | # API for V1 model prediction 33 | @svc.api(input=JSON(pydantic_model=HouseInputV1), output=JSON(), route="/predict_house_price_v1") 34 | async def predict_house_price_v1(data: HouseInputV1): 35 | input_data = [[data.square_footage, data.num_rooms]] 36 | prediction = await model_v1_runner.predict.async_run(input_data) 37 | return {"predicted_price_v1": prediction[0]} 38 | 39 | # API for V2 model prediction 40 | @svc.api(input=JSON(pydantic_model=HouseInputV2), output=JSON(), route="/predict_house_price_v2") 41 | async def predict_house_price_v2(data: HouseInputV2): 42 | # One-hot encoding for the country 43 | country_encoded = [0, 0, 0] # Default for ['Canada', 'Germany', 'UK'] 44 | if data.country == "Canada": 45 | country_encoded[0] = 1 46 | elif data.country == "Germany": 47 | country_encoded[1] = 1 48 | elif data.country == "UK": 49 | country_encoded[2] = 1 50 | 51 | input_data = [[ 52 | data.square_footage, data.num_rooms, data.num_bathrooms, data.house_age, 53 | data.distance_to_city_center, data.has_garage, data.has_garden, 54 | data.crime_rate, data.avg_school_rating 55 | ] + country_encoded] 56 | 57 | prediction = 
await model_v2_runner.predict.async_run(input_data) 58 | return {"predicted_price_v2": prediction[0]} 59 | -------------------------------------------------------------------------------- /04-bentoml/model_train_v1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.linear_model import LinearRegression 3 | from sklearn.model_selection import train_test_split 4 | import bentoml 5 | 6 | # Generate synthetic data for house price prediction 7 | def generate_data(): 8 | data = { 9 | 'square_footage': [1000, 1500, 1800, 2000, 2300, 2500, 2700, 3000, 3200, 3500], 10 | 'num_rooms': [3, 4, 4, 5, 5, 6, 6, 7, 7, 8], 11 | 'price': [200000, 250000, 280000, 310000, 340000, 370000, 400000, 430000, 460000, 500000] 12 | } 13 | return pd.DataFrame(data) 14 | 15 | # Load the data 16 | df = generate_data() 17 | 18 | # Features and target 19 | X = df[['square_footage', 'num_rooms']] 20 | y = df['price'] 21 | 22 | # Split the data 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 24 | 25 | # Train the model 26 | model = LinearRegression() 27 | model.fit(X_train, y_train) 28 | 29 | # Save the model with BentoML 30 | bentoml.sklearn.save_model("house_price_model", model) 31 | -------------------------------------------------------------------------------- /04-bentoml/model_train_v2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.linear_model import LinearRegression 3 | from sklearn.model_selection import train_test_split 4 | import bentoml 5 | 6 | # Generate synthetic data for house price prediction with 10 features 7 | def generate_data(): 8 | data = { 9 | 'square_footage': [1000, 1500, 1800, 2000, 2300, 2500, 2700, 3000, 3200, 3500], 10 | 'num_rooms': [3, 4, 4, 5, 5, 6, 6, 7, 7, 8], 11 | 'num_bathrooms': [1, 2, 2, 2, 3, 3, 3, 4, 4, 4], 12 | 'house_age': [10, 5, 15, 20, 8, 12, 7, 3, 25, 30], 13 | 'distance_to_city_center': [10, 8, 12, 5, 15, 6, 20, 2, 18, 25], 14 | 'has_garage': [1, 1, 0, 1, 0, 1, 1, 1, 0, 1], 15 | 'has_garden': [1, 1, 1, 0, 1, 0, 0, 1, 1, 0], 16 | 'crime_rate': [0.3, 0.2, 0.5, 0.1, 0.4, 0.3, 0.6, 0.1, 0.7, 0.8], 17 | 'avg_school_rating': [8, 9, 7, 8, 6, 7, 5, 9, 4, 3], 18 | 'country': ['USA', 'USA', 'USA', 'Canada', 'Canada', 'Canada', 'UK', 'UK', 'Germany', 'Germany'], 19 | 'price': [200000, 250000, 280000, 310000, 340000, 370000, 400000, 430000, 460000, 500000] 20 | } 21 | return pd.DataFrame(data) 22 | 23 | # Load the data 24 | df = generate_data() 25 | 26 | # One-hot encode categorical features like 'country' 27 | df = pd.get_dummies(df, columns=['country'], drop_first=True) 28 | 29 | # Features and target 30 | X = df.drop(columns=['price']) 31 | y = df['price'] 32 | 33 | # Split the data 34 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 35 | 36 | # Train the model 37 | model = LinearRegression() 38 | model.fit(X_train, y_train) 39 | 40 | # Save the model with BentoML 41 | bentoml.sklearn.save_model("house_price_model_v2", model) 42 | -------------------------------------------------------------------------------- /04-bentoml/readme.md: -------------------------------------------------------------------------------- 1 | # House Price Prediction with BentoML 2 | 3 | This project demonstrates how to build, train, and deploy a house price prediction model using Python, BentoML, and various machine learning libraries. 
Follow the steps below to set up the environment, train the model, serve it using BentoML, and make predictions via API requests. 4 | 5 | ## Table of Contents 6 | 7 | - [House Price Prediction with BentoML](#house-price-prediction-with-bentoml) 8 | - [Table of Contents](#table-of-contents) 9 | - [Prerequisites](#prerequisites) 10 | - [Setup](#setup) 11 | - [Training the Model](#training-the-model) 12 | - [Version 1](#version-1) 13 | - [Serving the Model](#serving-the-model) 14 | - [Version 1](#version-1-1) 15 | - [Version 2](#version-2) 16 | - [Version 3](#version-3) 17 | - [Versioning](#versioning) 18 | - [Cleanup](#cleanup) 19 | 20 | ## Prerequisites 21 | 22 | Ensure you have the following installed on your local machine: 23 | 24 | - **Python 3.7+** 25 | - **pip** 26 | - **Git** (optional, for version control) 27 | 28 | ## Setup 29 | 30 | 1. **Create a Virtual Environment** 31 | 32 | ```bash 33 | python3 -m venv bentoml-env 34 | ``` 35 | 36 | 2. **Activate the Virtual Environment** 37 | 38 | ```bash 39 | source bentoml-env/bin/activate 40 | ``` 41 | 42 | 3. **Install Required Packages** 43 | 44 | ```bash 45 | pip3 install bentoml scikit-learn pandas 46 | ``` 47 | 48 | 4. **Navigate to the Project Directory** 49 | 50 | ```bash 51 | cd 04-bentoml 52 | ``` 53 | 54 | ## Training the Model 55 | 56 | ### Version 1 57 | 58 | 1. **Train the Initial Model** 59 | 60 | ```bash 61 | python3 model_train_v1.py 62 | ``` 63 | 64 | 2. **List Available BentoML Models** 65 | 66 | ```bash 67 | bentoml models list 68 | ``` 69 | 70 | ## Serving the Model 71 | 72 | ### Version 1 73 | 74 | 1. **Serve the Model with BentoML** 75 | 76 | ```bash 77 | bentoml serve model_service_v1.py --reload 78 | ``` 79 | 80 | 2. **Make a Prediction Request** 81 | 82 | Open a new terminal window/tab, activate the virtual environment, navigate to the project directory, and run: 83 | 84 | ```bash 85 | curl -X POST "http://127.0.0.1:3000/predict_house_price" \ 86 | -H "Content-Type: application/json" \ 87 | -d '{"square_footage": 2500, "num_rooms": 5}' 88 | ``` 89 | 90 | ### Version 2 91 | 92 | 1. **Train the Enhanced Model** 93 | 94 | ```bash 95 | python3 model_train_v2.py 96 | ``` 97 | 98 | 2. **Serve the Enhanced Model** 99 | 100 | ```bash 101 | bentoml serve model_service_v2.py --reload 102 | ``` 103 | 104 | 3. **Make a Detailed Prediction Request** 105 | 106 | ```bash 107 | curl -X POST "http://127.0.0.1:3000/predict_house_price" \ 108 | -H "Content-Type: application/json" \ 109 | -d '{ 110 | "square_footage": 2500, 111 | "num_rooms": 5, 112 | "num_bathrooms": 3, 113 | "house_age": 10, 114 | "distance_to_city_center": 8, 115 | "has_garage": 1, 116 | "has_garden": 1, 117 | "crime_rate": 0.2, 118 | "avg_school_rating": 8, 119 | "country": "Germany" 120 | }' 121 | ``` 122 | 123 | ### Version 3 124 | 125 | 1. **Serve Additional Model Versions** 126 | 127 | ```bash 128 | bentoml serve model_service_v3.py --reload 129 | ``` 130 | 131 | 2. 
**Make Prediction Requests to Specific Model Versions** 132 | 133 | - **Version 1 Endpoint** 134 | 135 | ```bash 136 | curl -X POST "http://127.0.0.1:3000/predict_house_price_v1" \ 137 | -H "Content-Type: application/json" \ 138 | -d '{"square_footage": 2500, "num_rooms": 5}' 139 | ``` 140 | 141 | - **Version 2 Endpoint** 142 | 143 | ```bash 144 | curl -X POST "http://127.0.0.1:3000/predict_house_price_v2" \ 145 | -H "Content-Type: application/json" \ 146 | -d '{ 147 | "square_footage": 2500, 148 | "num_rooms": 5, 149 | "num_bathrooms": 3, 150 | "house_age": 10, 151 | "distance_to_city_center": 8, 152 | "has_garage": 1, 153 | "has_garden": 1, 154 | "crime_rate": 0.2, 155 | "avg_school_rating": 8, 156 | "country": "Germany" 157 | }' 158 | ``` 159 | 160 | ## Versioning 161 | 162 | Each version of the model and its corresponding service script allows for iterative improvements and testing. Ensure that you train and serve the appropriate version based on your requirements. 163 | 164 | ## Cleanup 165 | 166 | To deactivate the virtual environment and clean up your terminal: 167 | 168 | 1. **Deactivate the Virtual Environment** 169 | 170 | ```bash 171 | deactivate 172 | ``` 173 | 174 | 2. **Clear the Terminal (Optional)** 175 | 176 | ```bash 177 | clear 178 | ``` 179 | 180 | --- -------------------------------------------------------------------------------- /05-project/flask_app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | import pandas as pd 3 | import requests 4 | import base64 5 | import io 6 | 7 | app = Flask(__name__) 8 | 9 | # Route for the home page 10 | @app.route('/') 11 | def index(): 12 | return render_template('index.html') 13 | 14 | # Route to handle the CSV file upload and prediction 15 | @app.route('/predict', methods=['POST']) 16 | def predict(): 17 | file_data = request.form.get('file') 18 | 19 | # Decode the Base64 encoded file content 20 | decoded_file = base64.b64decode(file_data.split(',')[1]) 21 | 22 | # Read the decoded content into a DataFrame 23 | df = pd.read_csv(io.StringIO(decoded_file.decode('utf-8'))) 24 | 25 | # Separate the 'claim_id' column if it exists 26 | if 'claim_id' in df.columns: 27 | claim_ids = df['claim_id'] 28 | df = df.drop(columns=['claim_id']) 29 | else: 30 | claim_ids = None 31 | 32 | # Send the DataFrame to the BentoML service 33 | response = requests.post( 34 | 'http://127.0.0.1:3000/predict', # BentoML endpoint 35 | json=df.to_dict(orient='records') 36 | ) 37 | 38 | # Get predictions from the response 39 | predictions = response.json()['predictions'] 40 | 41 | # Add predictions to the DataFrame 42 | df['Prediction'] = predictions 43 | 44 | # Reattach the 'claim_id' column to the DataFrame 45 | if claim_ids is not None: 46 | df['claim_id'] = claim_ids 47 | 48 | # Reorder columns to have 'claim_id' first 49 | if 'claim_id' in df.columns: 50 | df = df[['claim_id'] + [col for col in df.columns if col != 'claim_id']] 51 | 52 | # Render the DataFrame as an HTML table 53 | return render_template('result.html', tables=[df.to_html(classes='data', header="true")]) 54 | 55 | if __name__ == '__main__': 56 | app.run(debug=True, port=5005) 57 | -------------------------------------------------------------------------------- /05-project/isolation_model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.ensemble import IsolationForest 3 | from sklearn.model_selection import train_test_split 
4 | import mlflow 5 | import mlflow.sklearn 6 | 7 | # Load the synthetic data 8 | df = pd.read_csv('synthetic_health_claims.csv') 9 | 10 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 11 | 12 | # Features to use for the model 13 | features = ['claim_amount', 'num_services', 'patient_age', 'provider_id', 'days_since_last_claim'] 14 | 15 | # Split the data into training and test sets 16 | X_train, X_test = train_test_split(df[features], test_size=0.2, random_state=42) 17 | 18 | # Set up MLflow 19 | mlflow.set_experiment("Health Insurance Claim Anomaly Detection") 20 | 21 | with mlflow.start_run(): 22 | # Train the Isolation Forest model 23 | model = IsolationForest(n_estimators=100, contamination=0.05, random_state=42) 24 | model.fit(X_train) 25 | 26 | # Predict on the test set 27 | y_pred_train = model.predict(X_train) 28 | y_pred_test = model.predict(X_test) 29 | 30 | # Convert predictions to anomaly scores (-1 is anomaly, 1 is normal) 31 | anomaly_score_train = (y_pred_train == -1).astype(int) 32 | anomaly_score_test = (y_pred_test == -1).astype(int) 33 | 34 | # Log parameters 35 | mlflow.log_param("n_estimators", 100) 36 | mlflow.log_param("contamination", 0.05) 37 | 38 | # Log metrics 39 | train_anomaly_percentage = anomaly_score_train.mean() * 100 40 | test_anomaly_percentage = anomaly_score_test.mean() * 100 41 | 42 | mlflow.log_metric("train_anomaly_percentage", train_anomaly_percentage) 43 | mlflow.log_metric("test_anomaly_percentage", test_anomaly_percentage) 44 | 45 | # Log the model 46 | mlflow.sklearn.log_model(model, "model") 47 | 48 | print(f"Train Anomaly Percentage: {train_anomaly_percentage:.2f}%") 49 | print(f"Test Anomaly Percentage: {test_anomaly_percentage:.2f}%") 50 | print("Model and metrics logged to MLflow.") 51 | 52 | -------------------------------------------------------------------------------- /05-project/readme.md: -------------------------------------------------------------------------------- 1 | Here is the converted `README.md` file for your project: 2 | 3 | ```markdown 4 | # Health Claims Fraud Detection Project 5 | 6 | This project involves building a Flask web application that uses an Isolation Forest model to detect potentially fraudulent health claims. It leverages BentoML for model serving and MLflow for experiment tracking. 7 | 8 | ## Setup Instructions 9 | 10 | ### 1. Environment Setup 11 | 12 | - Load the bash profile and set up the virtual environment: 13 | 14 | ```bash 15 | source ~/.bash_profile 16 | virtualenv venv 17 | source venv/bin/activate 18 | ``` 19 | 20 | - Install required dependencies: 21 | 22 | ```bash 23 | pip3 install -r requirements.txt 24 | ``` 25 | 26 | ### 2. BentoML and Model Management 27 | 28 | - List BentoML models: 29 | 30 | ```bash 31 | bentoml models list 32 | ``` 33 | 34 | - Run the synthetic data generator script: 35 | 36 | ```bash 37 | python3 synthetic_health_claims.py 38 | ``` 39 | 40 | - Train and evaluate the Isolation Forest model: 41 | 42 | ```bash 43 | python3 isolation_model.py 44 | ``` 45 | 46 | ### 3. MLflow for Experiment Tracking 47 | 48 | - Start the MLflow UI to track experiments: 49 | 50 | ```bash 51 | mlflow ui 52 | ``` 53 | 54 | ### 4. Run the Flask Web Application 55 | 56 | - Open a new terminal window, source the environment, and run the Flask app: 57 | 58 | ```bash 59 | source ~/.bash_profile 60 | source venv/bin/activate 61 | python3 flask_app.py 62 | ``` 63 | 64 | ### 5. 
Register Model and Serve with BentoML 65 | 66 | - In another terminal, run the following commands to register and serve the model with BentoML: 67 | 68 | ```bash 69 | source ~/.bash_profile 70 | source venv/bin/activate 71 | python3 isolation_model.py 72 | python3 register_model.py 73 | bentoml serve service.py --reload 74 | ``` 75 | 76 | ## Additional Notes 77 | 78 | - Ensure that all commands are executed in the correct order for proper setup and functioning of the application. 79 | - Use BentoML to serve models and integrate them with the Flask app for real-time fraud detection. 80 | - The MLflow UI helps to track experiments and evaluate model performance. 81 | ``` 82 | 83 | This `README.md` file gives a clear step-by-step guide for setting up and running your project. Let me know if you'd like any additional adjustments! -------------------------------------------------------------------------------- /05-project/register_model.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | import pickle 3 | 4 | # Load the model from the downloaded PKL file using pickle 5 | model_path = "model.pkl" # Replace with your actual path 6 | 7 | with open(model_path, 'rb') as model_file: # Open in binary mode 8 | model = pickle.load(model_file) 9 | 10 | # Save the model to BentoML 11 | bento_model = bentoml.sklearn.save_model("health_insurance_anomaly_detector", model) 12 | 13 | print(f"Model registered with BentoML: {bento_model}") 14 | -------------------------------------------------------------------------------- /05-project/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | pandas 3 | numpy 4 | bentoml -------------------------------------------------------------------------------- /05-project/service.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON, PandasDataFrame 3 | 4 | # Load the registered model 5 | model_runner = bentoml.sklearn.get("health_insurance_anomaly_detector:latest").to_runner() 6 | 7 | # Create a BentoML Service 8 | svc = bentoml.Service("health_insurance_anomaly_detection_service", runners=[model_runner]) 9 | 10 | # Define an API endpoint for prediction 11 | @svc.api(input=PandasDataFrame(), output=JSON()) 12 | def predict(data): 13 | # Make predictions 14 | predictions = model_runner.predict.run(data) 15 | # Return predictions as JSON 16 | return {"predictions": predictions.tolist()} 17 | -------------------------------------------------------------------------------- /05-project/synthetic_health_claims.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Set random seed for reproducibility 5 | np.random.seed(42) 6 | 7 | # Generate synthetic data 8 | num_samples = 1000 9 | data = { 10 | 'claim_id': np.arange(1, num_samples + 1), 11 | 'claim_amount': np.random.normal(1000, 250, num_samples), 12 | 'num_services': np.random.randint(1, 10, num_samples), 13 | 'patient_age': np.random.randint(18, 90, num_samples), 14 | 'provider_id': np.random.randint(1, 50, num_samples), 15 | 'days_since_last_claim': np.random.randint(0, 365, num_samples), 16 | } 17 | 18 | # Convert to DataFrame 19 | df = pd.DataFrame(data) 20 | 21 | # Introduce some anomalies (e.g., very high claim amounts) 22 | num_anomalies = 50 23 | anomalies = { 24 | 'claim_id': np.arange(num_samples + 1, num_samples + num_anomalies + 1), 25 
| 'claim_amount': np.random.normal(10000, 2500, num_anomalies), # Much higher amounts 26 | 'num_services': np.random.randint(10, 20, num_anomalies), 27 | 'patient_age': np.random.randint(18, 90, num_anomalies), 28 | 'provider_id': np.random.randint(1, 50, num_anomalies), 29 | 'days_since_last_claim': np.random.randint(0, 365, num_anomalies), 30 | } 31 | 32 | df_anomalies = pd.DataFrame(anomalies) 33 | 34 | # Combine normal data with anomalies 35 | df = pd.concat([df, df_anomalies]).reset_index(drop=True) 36 | 37 | # Shuffle the dataset 38 | df = df.sample(frac=1).reset_index(drop=True) 39 | 40 | # Save the data to CSV 41 | df.to_csv('synthetic_health_claims.csv', index=False) 42 | 43 | print("Synthetic data generated and saved to 'synthetic_health_claims.csv'.") 44 | -------------------------------------------------------------------------------- /05-project/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Health Insurance Anomaly Detection 7 | 29 | 30 | 31 |

[index.html body — HTML markup lost in extraction; recoverable text: an "Upload CSV File" heading and a "Drag & Drop your CSV file here or click to upload" file-drop area]
40 | 41 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /05-project/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Prediction Results 7 | 20 | 21 | 22 |

[result.html body — HTML markup lost in extraction; recoverable text: a "Prediction Results" page heading]

23 | {% for table in tables %} 24 | {{ table|safe }} 25 | {% endfor %} 26 | Go Back 27 | 28 | 29 | -------------------------------------------------------------------------------- /05-project/templates/visualize.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Visualization 7 | 8 | 9 |

[visualize.html body — HTML markup lost in extraction; recoverable text: a "Prediction Distribution" heading and a "Prediction Pie Chart" image]

12 | Go Back 13 | 14 | 15 | -------------------------------------------------------------------------------- /05-project/test_claim.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | 4 | # Define 10 different test inputs 5 | test_data = [ 6 | {"claim_amount": 1000, "num_services": 2, "patient_age": 30, "provider_id": 1, "days_since_last_claim": 100}, 7 | {"claim_amount": 2000, "num_services": 5, "patient_age": 45, "provider_id": 2, "days_since_last_claim": 200}, 8 | {"claim_amount": 15000, "num_services": 10, "patient_age": 50, "provider_id": 3, "days_since_last_claim": 300}, 9 | {"claim_amount": 500, "num_services": 1, "patient_age": 25, "provider_id": 4, "days_since_last_claim": 10}, 10 | {"claim_amount": 7500, "num_services": 8, "patient_age": 60, "provider_id": 5, "days_since_last_claim": 50}, 11 | {"claim_amount": 2500, "num_services": 3, "patient_age": 35, "provider_id": 6, "days_since_last_claim": 120}, 12 | {"claim_amount": 9000, "num_services": 15, "patient_age": 70, "provider_id": 7, "days_since_last_claim": 180}, 13 | {"claim_amount": 400, "num_services": 2, "patient_age": 22, "provider_id": 8, "days_since_last_claim": 365}, 14 | {"claim_amount": 11000, "num_services": 6, "patient_age": 55, "provider_id": 9, "days_since_last_claim": 250}, 15 | {"claim_amount": 600, "num_services": 4, "patient_age": 40, "provider_id": 10, "days_since_last_claim": 30}, 16 | ] 17 | 18 | # Convert to DataFrame 19 | df_test = pd.DataFrame(test_data) 20 | 21 | # Make the prediction request 22 | response = requests.post("http://127.0.0.1:3000/predict", json=df_test.to_dict(orient="records")) 23 | 24 | # Check the response 25 | if response.status_code == 200: 26 | predictions = response.json()["predictions"] 27 | for i, prediction in enumerate(predictions): 28 | print(f"Test Case {i+1}: Prediction: {prediction}") 29 | else: 30 | print(f"Error: {response.status_code} - {response.text}") 31 | -------------------------------------------------------------------------------- /05-project/v2_app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, redirect, url_for 2 | import pandas as pd 3 | import requests 4 | import base64 5 | import io 6 | import matplotlib.pyplot as plt 7 | import os 8 | 9 | app = Flask(__name__) 10 | 11 | # Route for the home page 12 | @app.route('/') 13 | def index(): 14 | return render_template('index.html') 15 | 16 | # Route to handle the CSV file upload and prediction 17 | @app.route('/predict', methods=['POST']) 18 | def predict(): 19 | file_data = request.form.get('file') 20 | 21 | # Decode the Base64 encoded file content 22 | decoded_file = base64.b64decode(file_data.split(',')[1]) 23 | 24 | # Read the decoded content into a DataFrame 25 | df = pd.read_csv(io.StringIO(decoded_file.decode('utf-8'))) 26 | 27 | # Separate the 'claim_id' column if it exists 28 | if 'claim_id' in df.columns: 29 | claim_ids = df['claim_id'] 30 | df = df.drop(columns=['claim_id']) 31 | else: 32 | claim_ids = None 33 | 34 | # Send the DataFrame to the BentoML service 35 | response = requests.post( 36 | 'http://127.0.0.1:3000/predict', # BentoML endpoint 37 | json=df.to_dict(orient='records') 38 | ) 39 | 40 | # Get predictions from the response 41 | predictions = response.json()['predictions'] 42 | 43 | # Add predictions to the DataFrame 44 | df['Prediction'] = predictions 45 | 46 | # Reattach the 'claim_id' column to the DataFrame 47 | if 
claim_ids is not None: 48 | df['claim_id'] = claim_ids 49 | 50 | # Reorder columns to have 'claim_id' first 51 | if 'claim_id' in df.columns: 52 | df = df[['claim_id'] + [col for col in df.columns if col != 'claim_id']] 53 | 54 | # Save the DataFrame to a session file for visualization 55 | df.to_csv('session_data.csv', index=False) 56 | 57 | # Render the DataFrame as an HTML table with a link to visualize 58 | return render_template('result.html', tables=[df.to_html(classes='data', header="true")]) 59 | 60 | # Route to handle the visualization 61 | @app.route('/visualize') 62 | def visualize(): 63 | # Load the session data 64 | df = pd.read_csv('session_data.csv') 65 | 66 | # Create a pie chart based on the 'Prediction' column 67 | prediction_counts = df['Prediction'].value_counts() 68 | plt.figure(figsize=(8, 8)) 69 | plt.pie(prediction_counts, labels=prediction_counts.index, autopct='%1.1f%%', startangle=140) 70 | plt.title('Prediction Distribution') 71 | 72 | # Save the pie chart as an image 73 | if not os.path.exists('static'): 74 | os.makedirs('static') 75 | chart_path = 'static/prediction_pie_chart.png' 76 | plt.savefig(chart_path) 77 | plt.close() 78 | 79 | # Render the visualization page with the pie chart 80 | return render_template('visualize.html', chart_path=chart_path) 81 | 82 | if __name__ == '__main__': 83 | app.run(debug=True, port=5005) 84 | -------------------------------------------------------------------------------- /06-orchestration/iot_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.dummy import DummyOperator 3 | from airflow.operators.python import PythonOperator 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.email import EmailOperator 6 | import random 7 | import time 8 | 9 | # Function to generate random IoT data 10 | def generate_iot_data(**kwargs): 11 | data = [] 12 | for _ in range(60): # 60 seconds x 5 minutes = 300 readings (1 every second) 13 | data.append(random.choice([0, 1])) 14 | time.sleep(1) # simulate 1-second intervals 15 | return data 16 | 17 | # Function to aggregate the IoT data 18 | def aggregate_machine_data(**kwargs): 19 | ti = kwargs['ti'] 20 | data = ti.xcom_pull(task_ids='getting_iot_data') 21 | count_0 = data.count(0) 22 | count_1 = data.count(1) 23 | aggregated_data = {'count_0': count_0, 'count_1': count_1} 24 | return aggregated_data 25 | 26 | # Email content generation 27 | def create_email_content(**kwargs): 28 | ti = kwargs['ti'] 29 | aggregated_data = ti.xcom_pull(task_ids='aggrigate_machine_data') 30 | return f"Aggregated IoT Data:\nCount of 0: {aggregated_data['count_0']}\nCount of 1: {aggregated_data['count_1']}" 31 | 32 | # Default arguments for the DAG 33 | default_args = { 34 | 'owner': 'airflow', 35 | 'start_date': days_ago(1), 36 | 'email_on_failure': False, 37 | 'email_on_retry': False, 38 | 'retries': 1, 39 | } 40 | 41 | # Define the DAG 42 | with DAG( 43 | dag_id='iot_data_pipeline', 44 | default_args=default_args, 45 | schedule_interval=None, 46 | catchup=False, 47 | ) as dag: 48 | 49 | start_task = DummyOperator(task_id='start_task') 50 | 51 | getting_iot_data = PythonOperator( 52 | task_id='getting_iot_data', 53 | python_callable=generate_iot_data, 54 | ) 55 | 56 | aggregate_machine_data = PythonOperator( 57 | task_id='aggregate_machine_data', 58 | python_callable=aggregate_machine_data, 59 | ) 60 | 61 | end_task = DummyOperator(task_id='end_task') 62 | 63 | # Task dependencies 64 | start_task >> 
getting_iot_data >> aggregate_machine_data >> end_task 65 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ### MLOps foundation course code repo 2 | --------------------------------------------------------------------------------