├── 01-data-transformation ├── data-cleaning.py ├── data-exploration.py ├── data-transformation.py └── mockdata.py ├── 02-kafka ├── 01-api-setup │ ├── main.py │ └── readme.md ├── 02-kafka-producer-consumer │ ├── main.py │ └── readme.md └── 03-api-to-kafka │ ├── main.py │ └── readme.md ├── 03-mlflow ├── example-mlflow.py ├── requirements.txt └── validation.py ├── 04-bentoml ├── __pycache__ │ ├── model_service_v1.cpython-312.pyc │ ├── model_service_v2.cpython-312.pyc │ └── model_service_v3.cpython-312.pyc ├── model_service_v1.py ├── model_service_v2.py ├── model_service_v3.py ├── model_train_v1.py ├── model_train_v2.py └── readme.md ├── 05-project ├── flask_app.py ├── isolation_model.py ├── readme.md ├── register_model.py ├── requirements.txt ├── service.py ├── synthetic_health_claims.py ├── templates │ ├── index.html │ ├── result.html │ └── visualize.html ├── test_claim.py └── v2_app.py ├── 06-orchestration └── iot_dag.py └── readme.md /01-data-transformation/data-cleaning.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | # Load the CSV file into a DataFrame 5 | df = pd.read_csv("mock_data.csv") 6 | 7 | # Fill missing values in 'age' and 'salary' with the median 8 | df['age'].fillna(df['age'].median(), inplace=True) 9 | df['salary'].fillna(df['salary'].median(), inplace=True) 10 | 11 | # Fill missing values in 'department' with 'Unknown' 12 | df['department'].fillna('Unknown', inplace=True) 13 | 14 | # Print sample data after handling missing values 15 | print("Sample data after filling missing values:") 16 | print(df.head(), "\n") 17 | 18 | # Convert 'profile' from JSON string to dictionary 19 | df['profile'] = df['profile'].apply(lambda x: json.loads(x) if pd.notnull(x) else {}) 20 | 21 | # Print sample data after converting 'profile' column 22 | print("Sample data after converting 'profile' column:") 23 | print(df[['profile']].head(), "\n") 24 | 25 | # Extract 'address', 'phone', and 'email' from 'profile' column 26 | df['address'] = df['profile'].apply(lambda x: x.get('address', None)) 27 | df['phone'] = df['profile'].apply(lambda x: x.get('phone', None)) 28 | df['email'] = df['profile'].apply(lambda x: x.get('email', None)) 29 | 30 | # Print sample data after extracting fields from 'profile' 31 | print("Sample data after extracting fields from 'profile':") 32 | print(df[['address', 'phone', 'email']].head(), "\n") 33 | 34 | # Drop the original 'profile' column 35 | df.drop(columns=['profile'], inplace=True) 36 | 37 | # Print sample data after dropping 'profile' column 38 | print("Sample data after dropping 'profile' column:") 39 | print(df.head(), "\n") 40 | 41 | # Save the cleaned DataFrame to a new CSV file 42 | df.to_csv("cleaned_data.csv", index=False) 43 | 44 | # Confirm data has been saved 45 | print("Cleaned data saved to 'cleaned_data.csv'") 46 | 47 | -------------------------------------------------------------------------------- /01-data-transformation/data-exploration.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Load the CSV file into a DataFrame 4 | df = pd.read_csv("mock_data.csv") 5 | 6 | # Display the first few rows of the DataFrame 7 | df.head() 8 | 9 | # Get a summary of the DataFrame 10 | df.info() 11 | 12 | # Check for missing values 13 | df.isnull().sum() 14 | 15 | # View statistical summary for numeric columns 16 | df.describe(include='all') 17 | 18 | # Check unique values in the 'department' column 19 | 
df['department'].unique() 20 | 21 | -------------------------------------------------------------------------------- /01-data-transformation/data-transformation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | 5 | # Load the CSV file into a DataFrame 6 | df = pd.read_csv("cleaned_data.csv") 7 | 8 | 9 | # Add a new column 'address_length' that calculates the length of the address 10 | df['address_length'] = df['address'].apply(lambda x: len(str(x))) 11 | 12 | # Print sample data after adding 'address_length' column 13 | print("Sample data after adding 'address_length' column:") 14 | print(df[['address', 'address_length']].head(), "\n") 15 | 16 | # Define the bins and labels 17 | bins = [0, 50000, 70000, 100000] 18 | labels = ['low', 'medium', 'high'] 19 | 20 | # Create a new column 'salary_category' 21 | df['salary_category'] = pd.cut(df['salary'], bins=bins, labels=labels, include_lowest=True) 22 | 23 | # Print sample data after adding 'salary_category' column 24 | print("Sample data after adding 'salary_category' column:") 25 | print(df[['salary', 'salary_category']].head(), "\n") 26 | 27 | # Group by 'department' and calculate average salary and age 28 | summary_report = df.groupby('department').agg({ 29 | 'salary': 'mean', 30 | 'age': 'mean' 31 | }).reset_index() 32 | 33 | # Rename columns for clarity 34 | summary_report.columns = ['Department', 'Average Salary', 'Average Age'] 35 | 36 | # Print the summary report 37 | print("Summary report of average salary and age by department:") 38 | print(summary_report, "\n") 39 | 40 | # Save the final transformed DataFrame to a new CSV file 41 | df.to_csv("transformed_data.csv", index=False) 42 | 43 | # Confirm data has been saved 44 | print("Final transformed data saved to 'transformed_data.csv'") 45 | -------------------------------------------------------------------------------- /01-data-transformation/mockdata.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | import random 5 | from datetime import datetime, timedelta 6 | 7 | # Set random seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Number of records 11 | num_records = 20000 12 | 13 | # Generate random data 14 | data = { 15 | "id": np.arange(1, num_records + 1), 16 | "name": [f"Name_{i}" for i in np.random.randint(1, 1000, num_records)], 17 | "age": np.random.randint(18, 80, num_records), 18 | "salary": np.random.choice([50000, 60000, 70000, None], num_records), 19 | "hire_date": [ 20 | (datetime.now() - timedelta(days=random.randint(0, 3650))).strftime("%Y-%m-%d") 21 | if random.random() > 0.1 else None 22 | for _ in range(num_records) 23 | ], 24 | "profile": [ 25 | json.dumps({ 26 | "address": f"Street {random.randint(1, 100)}, City {random.randint(1, 50)}", 27 | "phone": f"{random.randint(1000000000, 9999999999)}", 28 | "email": f"email_{random.randint(1, 1000)}@example.com" 29 | }) 30 | if random.random() > 0.1 else None 31 | for _ in range(num_records) 32 | ], 33 | "department": np.random.choice(["HR", "IT", "Finance", "Marketing", None], num_records), 34 | "bonus": [None if random.random() > 0.9 else random.randint(1000, 10000) for _ in range(num_records)] 35 | } 36 | 37 | # Create DataFrame 38 | df = pd.DataFrame(data) 39 | 40 | # Introduce some NaN values randomly 41 | df.loc[np.random.choice(df.index, size=int(num_records * 0.05), replace=False), "age"] = np.nan 42 | 
df.loc[np.random.choice(df.index, size=int(num_records * 0.1), replace=False), "salary"] = np.nan 43 | 44 | # Save to CSV 45 | df.to_csv("mock_data.csv", index=False) 46 | -------------------------------------------------------------------------------- /02-kafka/01-api-setup/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import random 3 | import string 4 | import json 5 | from datetime import datetime 6 | import uuid 7 | 8 | app = FastAPI() 9 | 10 | # Sample data 11 | products = ['Laptop', 'Smartphone', 'Headphones', 'Tablet', 'Smartwatch'] 12 | cities = ['New York', 'London', 'Tokyo', 'Paris', 'Sydney', 'Berlin', 'Singapore', 'Dubai', 'Toronto', 'Mumbai'] 13 | 14 | def generate_random_order(): 15 | return { 16 | "order_id": str(uuid.uuid4()), 17 | "product": random.choice(products), 18 | "quantity": random.randint(1, 5), 19 | "price": round(random.uniform(50.00, 1000.00), 2), 20 | "customer_location": random.choice(cities), 21 | "timestamp": datetime.now().isoformat() 22 | } 23 | 24 | @app.get("/generate-orders") 25 | def generate_orders(): 26 | orders = {} 27 | for i in range(50): 28 | order_key = f"order_{i+1}" 29 | orders[order_key] = generate_random_order() 30 | return orders 31 | -------------------------------------------------------------------------------- /02-kafka/01-api-setup/readme.md: -------------------------------------------------------------------------------- 1 | ## FastAPI Order Generation API 2 | 3 | This project provides a simple FastAPI endpoint to generate mock orders. 4 | 5 | ### Prerequisites 6 | 7 | * **Python 3**: Make sure you have Python 3 installed. You can check by running `python3 --version`. 8 | * **pip**: The Python package manager, `pip`, should also be installed. You can check by running `pip3 --version`. 9 | 10 | ### Installation 11 | 12 | 1. **Update your system and install Python 3 and pip (if not already installed):** 13 | 14 | ```bash 15 | sudo yum update -y 16 | sudo yum install -y python3 17 | sudo yum install -y python3-pip 18 | ``` 19 | 20 | 2. **Install FastAPI and Uvicorn:** 21 | 22 | ```bash 23 | pip3 install fastapi uvicorn 24 | ``` 25 | 26 | ### Running the API 27 | 28 | 1. **Create a `main.py` file with your FastAPI code.** (Refer to the FastAPI documentation for how to structure your `main.py`) 29 | 30 | 2. **Start the Uvicorn server:** 31 | 32 | ```bash 33 | uvicorn main:app --host 0.0.0.0 --port 8000 34 | ``` 35 | 36 | This will make your API accessible at `http://:8000`. 37 | 38 | ### Endpoint 39 | 40 | * **`/generate-orders` (GET)** 41 | 42 | This endpoint generates mock order data. 43 | 44 | ### Testing 45 | 46 | You can test the endpoint using `curl`: 47 | 48 | ```bash 49 | curl -X 'GET' \ 50 | 'http://:8000/generate-orders' \ 51 | -H 'accept: application/json' 52 | ``` 53 | 54 | Replace `` with the actual IP address or hostname where your API is running. 55 | 56 | **Note:** If you are running this on an EC2 instance, make sure to open port 8000 in your security group settings. 
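You can also hit the endpoint from Python. This is a minimal sketch (it assumes the API is reachable at `http://localhost:8000` and that the `requests` package is installed — `pip3 install requests`; adjust the host as needed):

```python
import requests

# Fetch the batch of 50 mock orders generated by the /generate-orders endpoint
response = requests.get("http://localhost:8000/generate-orders", timeout=10)
response.raise_for_status()

orders = response.json()  # dict keyed "order_1" .. "order_50"
print(f"Received {len(orders)} orders")
print(orders["order_1"])  # one order: order_id, product, quantity, price, customer_location, timestamp
```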
57 | 58 | -------------------------------------------------------------------------------- /02-kafka/02-kafka-producer-consumer/main.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Producer, Consumer, KafkaError, KafkaException 2 | from confluent_kafka.admin import AdminClient, NewTopic 3 | 4 | # Configuration for Kafka connection 5 | kafka_config = { 6 | 'bootstrap.servers': 'localhost:9092', # Replace with your Kafka broker address 7 | } 8 | 9 | # Admin client to create topics 10 | admin_client = AdminClient(kafka_config) 11 | 12 | # Function to create a topic 13 | def create_topic(topic_name, num_partitions=1, replication_factor=1): 14 | topic_list = [NewTopic(topic_name, num_partitions=num_partitions, replication_factor=replication_factor)] 15 | fs = admin_client.create_topics(topic_list) 16 | for topic, f in fs.items(): 17 | try: 18 | f.result() # The result itself is None 19 | print(f"Topic '{topic}' created successfully") 20 | except KafkaException as e: 21 | # If the topic already exists, we skip this step 22 | print(f"Failed to create topic '{topic}': {e}") 23 | 24 | # Function to produce messages to Kafka 25 | def produce_messages(topic_name, messages): 26 | producer = Producer(kafka_config) 27 | 28 | for message in messages: 29 | producer.produce(topic_name, message) 30 | print(f"Produced message: {message}") 31 | 32 | producer.flush() 33 | 34 | # Function to consume messages from Kafka 35 | def consume_messages(topic_name, group_id='my-group'): 36 | consumer_config = { 37 | 'bootstrap.servers': 'localhost:9092', 38 | 'group.id': group_id, 39 | 'auto.offset.reset': 'earliest' 40 | } 41 | consumer = Consumer(consumer_config) 42 | 43 | consumer.subscribe([topic_name]) 44 | print(f"Subscribed to topic '{topic_name}'") 45 | 46 | try: 47 | while True: 48 | msg = consumer.poll(timeout=1.0) 49 | if msg is None: 50 | break 51 | if msg.error(): 52 | if msg.error().code() == KafkaError._PARTITION_EOF: 53 | print('End of partition reached {0}/{1}'.format(msg.topic(), msg.partition())) 54 | elif msg.error(): 55 | raise KafkaException(msg.error()) 56 | else: 57 | print(f"Consumed message: {msg.value().decode('utf-8')}") 58 | finally: 59 | consumer.close() 60 | 61 | # Main function to run the producer and consumer 62 | def main(): 63 | topic_name = 'test-topic' 64 | 65 | # Create topic 66 | create_topic(topic_name) 67 | 68 | # Produce messages 69 | messages_to_produce = ['Hello Kafka', 'Kafka with Python', 'Test message 1', 'Test message 2'] 70 | produce_messages(topic_name, messages_to_produce) 71 | 72 | # Consume messages 73 | print("Consuming messages from topic...") 74 | consume_messages(topic_name) 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /02-kafka/02-kafka-producer-consumer/readme.md: -------------------------------------------------------------------------------- 1 | # Kafka Setup on Ubuntu 2 | 3 | This guide provides step-by-step instructions for setting up a single-node Kafka cluster on an Ubuntu instance. 4 | Machine type ***t2.medium*** 5 | 6 | ## Commands & Explanation 7 | 8 | ```bash 9 | 2 sudo apt update 10 | ``` 11 | 12 | * Updates the package lists for upgrades and new package installations. 13 | 14 | ```bash 15 | 3 sudo apt install default-jdk -y 16 | ``` 17 | 18 | * Installs the default Java Development Kit (JDK), required for running Kafka. The `-y` flag automatically answers "yes" to any prompts. 
19 | 20 | ```bash 21 | 4 java -version 22 | ``` 23 | 24 | * Verifies the installed Java version. 25 | 26 | ```bash 27 | 5 cd /opt 28 | ``` 29 | 30 | * Changes the current working directory to `/opt`, a common location for installing optional software. 31 | 32 | ```bash 33 | 6 sudo wget https://downloads.apache.org/kafka/3.8.0/kafka_2.12-3.8.0.tgz 34 | ``` 35 | 36 | * Downloads the Kafka distribution archive (version 3.8.0) using `wget`. 37 | 38 | ```bash 39 | 7 sudo tar -xvzf kafka_2.12-3.8.0.tgz 40 | ``` 41 | 42 | * Extracts the downloaded Kafka archive. 43 | 44 | ```bash 45 | 8 sudo mv kafka_2.12-3.8.0 /usr/local/kafka 46 | ``` 47 | 48 | * Moves the extracted Kafka directory to `/usr/local/kafka`, a standard location for installing Kafka. 49 | 50 | ```bash 51 | 9 cd /usr/local/kafka 52 | ``` 53 | 54 | * Changes the current working directory to the Kafka installation directory. 55 | 56 | ```bash 57 | 10 sudo nohup bin/zookeeper-server-start.sh config/zookeeper.properties > /tmp/zookeeper.log 2>&1 & 58 | ``` 59 | 60 | * Starts the ZooKeeper server in the background. 61 | * `nohup` ensures the process continues running even if the terminal is closed. 62 | * `>` redirects standard output to `/tmp/zookeeper.log`. 63 | * `2>&1` redirects standard error to the same file as standard output. 64 | * `&` runs the command in the background. 65 | 66 | ```bash 67 | 11 sudo nohup bin/kafka-server-start.sh config/server.properties > /tmp/kafka.log 2>&1 & 68 | ``` 69 | 70 | * Starts the Kafka server in the background, similar to how ZooKeeper was started. 71 | 72 | ```bash 73 | 12 tail -100f /tmp/kafka.log 74 | ``` 75 | 76 | * Displays the last 100 lines of the Kafka log file, useful for monitoring the server startup process. 77 | 78 | ```bash 79 | 13 sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 80 | ``` 81 | 82 | * Lists existing Kafka topics using the local Kafka broker running on `localhost:9092`. 83 | 84 | ```bash 85 | 14 sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server 3.64.165.242:9092 --partitions 1 --replication-factor 1 86 | ``` 87 | 88 | * **Likely incorrect**: Attempts to create a topic named `test-topic` on a remote Kafka broker (specified by the external IP address), which would typically require additional configuration. 89 | 90 | ```bash 91 | 15 sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1 92 | ``` 93 | 94 | * Creates a topic named `test-topic` on the local Kafka broker with one partition and a replication factor of 1. 95 | 96 | ```bash 97 | 16 sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 98 | ``` 99 | 100 | * Lists Kafka topics again to confirm the creation of `test-topic`. 
101 | 102 | ```bash 103 | 17 sudo bin/kafka-console-producer.sh --topic test-topic --bootstrap-server localhost:9092 104 | ``` 105 | 106 | ```bash 107 | sudo apt update 108 | sudo apt install default-jdk -y 109 | java -version 110 | cd /opt 111 | sudo wget https://downloads.apache.org/kafka/3.8.0/kafka_2.12-3.8.0.tgz 112 | sudo tar -xvzf kafka_2.12-3.8.0.tgz 113 | sudo mv kafka_2.12-3.8.0 /usr/local/kafka 114 | cd /usr/local/kafka 115 | sudo nohup bin/zookeeper-server-start.sh config/zookeeper.properties > /tmp/zookeeper.log 2>&1 & 116 | sudo nohup bin/kafka-server-start.sh config/server.properties > /tmp/kafka.log 2>&1 & 117 | tail -100f /tmp/kafka.log 118 | sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 119 | sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server 3.64.165.242:9092 --partitions 1 --replication-factor 1 120 | sudo bin/kafka-topics.sh --create --topic test-topic --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1 121 | sudo bin/kafka-topics.sh --list --bootstrap-server localhost:9092 122 | sudo bin/kafka-console-producer.sh --topic test-topic --bootstrap-server localhost:9092 123 | history 124 | ``` 125 | 126 | * Install pip and confluent package on ubuntu 127 | 128 | ```bash 129 | apt install python3-pip 130 | apt install python3-confluent-kafka 131 | ``` 132 | -------------------------------------------------------------------------------- /02-kafka/03-api-to-kafka/main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from confluent_kafka import Producer, KafkaException, KafkaError 3 | from confluent_kafka.admin import AdminClient, NewTopic 4 | 5 | # Kafka configuration 6 | kafka_config = { 7 | 'bootstrap.servers': 'localhost:9092', # Replace with your Kafka broker address if needed 8 | } 9 | 10 | # Topic name 11 | topic_name = "order-topic" 12 | 13 | # Function to log messages 14 | def log(message): 15 | print(f"[INFO] {message}") 16 | 17 | # Function to create a topic 18 | def create_topic(admin_client, topic_name): 19 | log(f"Creating topic: {topic_name}") 20 | topic_list = [NewTopic(topic_name, num_partitions=1, replication_factor=1)] 21 | fs = admin_client.create_topics(topic_list) 22 | for topic, f in fs.items(): 23 | try: 24 | f.result() # The result itself is None if successful 25 | log(f"Topic '{topic}' created successfully.") 26 | except KafkaException as e: 27 | log(f"Failed to create topic '{topic}': {e}") 28 | 29 | # Function to delete a topic 30 | def delete_topic(admin_client, topic_name): 31 | log(f"Deleting topic: {topic_name}") 32 | fs = admin_client.delete_topics([topic_name]) 33 | for topic, f in fs.items(): 34 | try: 35 | f.result() # The result itself is None if successful 36 | log(f"Topic '{topic}' deleted successfully.") 37 | except KafkaException as e: 38 | log(f"Failed to delete topic '{topic}': {e}") 39 | 40 | # Function to fetch orders from FastAPI 41 | def fetch_orders(): 42 | url = "http://52.59.240.23:8000/generate-orders" 43 | log(f"Fetching orders from {url}") 44 | response = requests.get(url) 45 | 46 | if response.status_code == 200: 47 | log("Orders fetched successfully.") 48 | return response.json() # Assuming the response is a JSON 49 | else: 50 | log(f"Failed to fetch orders. 
Status code: {response.status_code}") 51 | return None 52 | 53 | # Function to produce messages to Kafka 54 | def produce_messages(producer, topic_name, message): 55 | log(f"Producing message to topic: {topic_name}") 56 | producer.produce(topic_name, message) 57 | producer.flush() 58 | log("Message produced successfully.") 59 | 60 | # Main function 61 | def main(): 62 | # Step 1: Setup Kafka Admin Client and Producer 63 | admin_client = AdminClient(kafka_config) 64 | producer = Producer(kafka_config) 65 | 66 | # Step 2: Check if the topic exists, delete if it does 67 | topics = admin_client.list_topics().topics 68 | if topic_name in topics: 69 | delete_topic(admin_client, topic_name) 70 | 71 | # Step 3: Create the Kafka topic 72 | create_topic(admin_client, topic_name) 73 | 74 | # Step 4: Fetch orders from FastAPI 75 | orders = fetch_orders() 76 | 77 | if orders is not None: 78 | # Convert the fetched orders to a string (assuming it's JSON-compatible) 79 | orders_str = str(orders) 80 | 81 | # Step 5: Produce the fetched order to Kafka 82 | produce_messages(producer, topic_name, orders_str) 83 | 84 | log("Script finished after producing message.") 85 | 86 | # Run the main function 87 | if __name__ == "__main__": 88 | main() 89 | -------------------------------------------------------------------------------- /02-kafka/03-api-to-kafka/readme.md: -------------------------------------------------------------------------------- 1 | ```bash 2 | cd /usr/local/kafka 3 | sudo bin/kafka-console-consumer.sh --topic order-topic --from-beginning --bootstrap-server localhost:9092 4 | ``` -------------------------------------------------------------------------------- /03-mlflow/example-mlflow.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import mlflow.sklearn 3 | from sklearn.datasets import make_regression 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.tree import DecisionTreeRegressor 7 | from sklearn.ensemble import RandomForestRegressor 8 | from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score 9 | import numpy as np 10 | 11 | # Set the MLflow tracking URI to the remote MLflow server 12 | mlflow.set_tracking_uri("http://localhost:5000") 13 | 14 | # Create synthetic data for regression 15 | X, y = make_regression(n_samples=100, n_features=4, noise=0.1, random_state=42) 16 | 17 | # Split the data 18 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 19 | 20 | # Set the experiment name 21 | mlflow.set_experiment("ML Model Experiment") 22 | 23 | def log_model(model, model_name): 24 | with mlflow.start_run(run_name=model_name): 25 | # Train the model 26 | model.fit(X_train, y_train) 27 | 28 | # Make predictions 29 | y_pred = model.predict(X_test) 30 | 31 | # Calculate metrics 32 | mse = mean_squared_error(y_test, y_pred) 33 | rmse = np.sqrt(mse) 34 | mae = mean_absolute_error(y_test, y_pred) 35 | r2 = r2_score(y_test, y_pred) 36 | evs = explained_variance_score(y_test, y_pred) 37 | 38 | # Log metrics 39 | mlflow.log_metric("mse", mse) 40 | mlflow.log_metric("rmse", rmse) 41 | mlflow.log_metric("mae", mae) 42 | mlflow.log_metric("r2", r2) 43 | mlflow.log_metric("explained_variance", evs) 44 | 45 | # Log model 46 | mlflow.sklearn.log_model(model, model_name) 47 | 48 | print(f"{model_name} - MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R2: {r2}, Explained Variance: {evs}") 49 | 50 | # 
Linear Regression Model 51 | linear_model = LinearRegression() 52 | log_model(linear_model, "Linear Regression") 53 | 54 | # Decision Tree Regressor Model 55 | tree_model = DecisionTreeRegressor() 56 | log_model(tree_model, "Decision Tree Regressor") 57 | 58 | # Random Forest Regressor Model 59 | forest_model = RandomForestRegressor() 60 | log_model(forest_model, "Random Forest Regressor") 61 | 62 | print("Experiment completed! Check the MLflow server for details.") 63 | -------------------------------------------------------------------------------- /03-mlflow/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | scikit-learn -------------------------------------------------------------------------------- /03-mlflow/validation.py: -------------------------------------------------------------------------------- 1 | from mlflow.tracking import MlflowClient 2 | 3 | client = MlflowClient() 4 | for rm in client.search_registered_models(): 5 | print(f"Model name: {rm.name}") 6 | -------------------------------------------------------------------------------- /04-bentoml/__pycache__/model_service_v1.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kodekloudhub/Fundamentals-of-MLOps/71bf519079f4eade480731c13c01dd1e3d489a83/04-bentoml/__pycache__/model_service_v1.cpython-312.pyc -------------------------------------------------------------------------------- /04-bentoml/__pycache__/model_service_v2.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kodekloudhub/Fundamentals-of-MLOps/71bf519079f4eade480731c13c01dd1e3d489a83/04-bentoml/__pycache__/model_service_v2.cpython-312.pyc -------------------------------------------------------------------------------- /04-bentoml/__pycache__/model_service_v3.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kodekloudhub/Fundamentals-of-MLOps/71bf519079f4eade480731c13c01dd1e3d489a83/04-bentoml/__pycache__/model_service_v3.cpython-312.pyc -------------------------------------------------------------------------------- /04-bentoml/model_service_v1.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON 3 | from pydantic import BaseModel 4 | 5 | # Load the model 6 | model_ref = bentoml.sklearn.get("house_price_model:latest") 7 | model_runner = model_ref.to_runner() 8 | 9 | # Define the service 10 | svc = bentoml.Service("house_price_predictor", runners=[model_runner]) 11 | 12 | # Input schema 13 | class HouseInput(BaseModel): 14 | square_footage: float 15 | num_rooms: int 16 | 17 | # API for prediction 18 | @svc.api(input=JSON(pydantic_model=HouseInput), output=JSON()) 19 | async def predict_house_price(data: HouseInput): 20 | input_data = [[data.square_footage, data.num_rooms]] 21 | prediction = await model_runner.predict.async_run(input_data) 22 | return {"predicted_price": prediction[0]} 23 | -------------------------------------------------------------------------------- /04-bentoml/model_service_v2.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON 3 | from pydantic import BaseModel 4 | 5 | # Load the model 6 | model_ref = bentoml.sklearn.get("house_price_model_v2:latest") 7 | model_runner = 
model_ref.to_runner() 8 | 9 | # Define the service 10 | svc = bentoml.Service("house_price_predictor_v2", runners=[model_runner]) 11 | 12 | # Input schema 13 | class HouseInput(BaseModel): 14 | square_footage: float 15 | num_rooms: int 16 | num_bathrooms: int 17 | house_age: int 18 | distance_to_city_center: float 19 | has_garage: int 20 | has_garden: int 21 | crime_rate: float 22 | avg_school_rating: float 23 | country: str 24 | 25 | # API for prediction 26 | @svc.api(input=JSON(pydantic_model=HouseInput), output=JSON()) 27 | async def predict_house_price(data: HouseInput): 28 | # One-hot encoding for the country 29 | country_encoded = [0, 0, 0] # Default for ['Canada', 'Germany', 'UK'] 30 | if data.country == "Canada": 31 | country_encoded[0] = 1 32 | elif data.country == "Germany": 33 | country_encoded[1] = 1 34 | elif data.country == "UK": 35 | country_encoded[2] = 1 36 | 37 | input_data = [[ 38 | data.square_footage, data.num_rooms, data.num_bathrooms, data.house_age, 39 | data.distance_to_city_center, data.has_garage, data.has_garden, 40 | data.crime_rate, data.avg_school_rating 41 | ] + country_encoded] 42 | 43 | prediction = await model_runner.predict.async_run(input_data) 44 | return {"predicted_price": prediction[0]} 45 | -------------------------------------------------------------------------------- /04-bentoml/model_service_v3.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON 3 | from pydantic import BaseModel 4 | 5 | # Load the v1 and v2 models 6 | model_v1_ref = bentoml.sklearn.get("house_price_model:latest") 7 | model_v2_ref = bentoml.sklearn.get("house_price_model_v2:latest") 8 | model_v1_runner = model_v1_ref.to_runner() 9 | model_v2_runner = model_v2_ref.to_runner() 10 | 11 | # Define the service with both runners 12 | svc = bentoml.Service("house_price_predictor", runners=[model_v1_runner, model_v2_runner]) 13 | 14 | # Input schema for V1 (simpler model) 15 | class HouseInputV1(BaseModel): 16 | square_footage: float 17 | num_rooms: int 18 | 19 | # Input schema for V2 (expanded model) 20 | class HouseInputV2(BaseModel): 21 | square_footage: float 22 | num_rooms: int 23 | num_bathrooms: int 24 | house_age: int 25 | distance_to_city_center: float 26 | has_garage: int 27 | has_garden: int 28 | crime_rate: float 29 | avg_school_rating: float 30 | country: str 31 | 32 | # API for V1 model prediction 33 | @svc.api(input=JSON(pydantic_model=HouseInputV1), output=JSON(), route="/predict_house_price_v1") 34 | async def predict_house_price_v1(data: HouseInputV1): 35 | input_data = [[data.square_footage, data.num_rooms]] 36 | prediction = await model_v1_runner.predict.async_run(input_data) 37 | return {"predicted_price_v1": prediction[0]} 38 | 39 | # API for V2 model prediction 40 | @svc.api(input=JSON(pydantic_model=HouseInputV2), output=JSON(), route="/predict_house_price_v2") 41 | async def predict_house_price_v2(data: HouseInputV2): 42 | # One-hot encoding for the country 43 | country_encoded = [0, 0, 0] # Default for ['Canada', 'Germany', 'UK'] 44 | if data.country == "Canada": 45 | country_encoded[0] = 1 46 | elif data.country == "Germany": 47 | country_encoded[1] = 1 48 | elif data.country == "UK": 49 | country_encoded[2] = 1 50 | 51 | input_data = [[ 52 | data.square_footage, data.num_rooms, data.num_bathrooms, data.house_age, 53 | data.distance_to_city_center, data.has_garage, data.has_garden, 54 | data.crime_rate, data.avg_school_rating 55 | ] + country_encoded] 56 | 57 | prediction = 
await model_v2_runner.predict.async_run(input_data) 58 | return {"predicted_price_v2": prediction[0]} 59 | -------------------------------------------------------------------------------- /04-bentoml/model_train_v1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.linear_model import LinearRegression 3 | from sklearn.model_selection import train_test_split 4 | import bentoml 5 | 6 | # Generate synthetic data for house price prediction 7 | def generate_data(): 8 | data = { 9 | 'square_footage': [1000, 1500, 1800, 2000, 2300, 2500, 2700, 3000, 3200, 3500], 10 | 'num_rooms': [3, 4, 4, 5, 5, 6, 6, 7, 7, 8], 11 | 'price': [200000, 250000, 280000, 310000, 340000, 370000, 400000, 430000, 460000, 500000] 12 | } 13 | return pd.DataFrame(data) 14 | 15 | # Load the data 16 | df = generate_data() 17 | 18 | # Features and target 19 | X = df[['square_footage', 'num_rooms']] 20 | y = df['price'] 21 | 22 | # Split the data 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 24 | 25 | # Train the model 26 | model = LinearRegression() 27 | model.fit(X_train, y_train) 28 | 29 | # Save the model with BentoML 30 | bentoml.sklearn.save_model("house_price_model", model) 31 | -------------------------------------------------------------------------------- /04-bentoml/model_train_v2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.linear_model import LinearRegression 3 | from sklearn.model_selection import train_test_split 4 | import bentoml 5 | 6 | # Generate synthetic data for house price prediction with 10 features 7 | def generate_data(): 8 | data = { 9 | 'square_footage': [1000, 1500, 1800, 2000, 2300, 2500, 2700, 3000, 3200, 3500], 10 | 'num_rooms': [3, 4, 4, 5, 5, 6, 6, 7, 7, 8], 11 | 'num_bathrooms': [1, 2, 2, 2, 3, 3, 3, 4, 4, 4], 12 | 'house_age': [10, 5, 15, 20, 8, 12, 7, 3, 25, 30], 13 | 'distance_to_city_center': [10, 8, 12, 5, 15, 6, 20, 2, 18, 25], 14 | 'has_garage': [1, 1, 0, 1, 0, 1, 1, 1, 0, 1], 15 | 'has_garden': [1, 1, 1, 0, 1, 0, 0, 1, 1, 0], 16 | 'crime_rate': [0.3, 0.2, 0.5, 0.1, 0.4, 0.3, 0.6, 0.1, 0.7, 0.8], 17 | 'avg_school_rating': [8, 9, 7, 8, 6, 7, 5, 9, 4, 3], 18 | 'country': ['USA', 'USA', 'USA', 'Canada', 'Canada', 'Canada', 'UK', 'UK', 'Germany', 'Germany'], 19 | 'price': [200000, 250000, 280000, 310000, 340000, 370000, 400000, 430000, 460000, 500000] 20 | } 21 | return pd.DataFrame(data) 22 | 23 | # Load the data 24 | df = generate_data() 25 | 26 | # One-hot encode categorical features like 'country' 27 | df = pd.get_dummies(df, columns=['country'], drop_first=True) 28 | 29 | # Features and target 30 | X = df.drop(columns=['price']) 31 | y = df['price'] 32 | 33 | # Split the data 34 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 35 | 36 | # Train the model 37 | model = LinearRegression() 38 | model.fit(X_train, y_train) 39 | 40 | # Save the model with BentoML 41 | bentoml.sklearn.save_model("house_price_model_v2", model) 42 | -------------------------------------------------------------------------------- /04-bentoml/readme.md: -------------------------------------------------------------------------------- 1 | # House Price Prediction with BentoML 2 | 3 | This project demonstrates how to build, train, and deploy a house price prediction model using Python, BentoML, and various machine learning libraries. 
Follow the steps below to set up the environment, train the model, serve it using BentoML, and make predictions via API requests. 4 | 5 | ## Table of Contents 6 | 7 | - [House Price Prediction with BentoML](#house-price-prediction-with-bentoml) 8 | - [Table of Contents](#table-of-contents) 9 | - [Prerequisites](#prerequisites) 10 | - [Setup](#setup) 11 | - [Training the Model](#training-the-model) 12 | - [Version 1](#version-1) 13 | - [Serving the Model](#serving-the-model) 14 | - [Version 1](#version-1-1) 15 | - [Version 2](#version-2) 16 | - [Version 3](#version-3) 17 | - [Versioning](#versioning) 18 | - [Cleanup](#cleanup) 19 | 20 | ## Prerequisites 21 | 22 | Ensure you have the following installed on your local machine: 23 | 24 | - **Python 3.7+** 25 | - **pip** 26 | - **Git** (optional, for version control) 27 | 28 | ## Setup 29 | 30 | 1. **Create a Virtual Environment** 31 | 32 | ```bash 33 | python3 -m venv bentoml-env 34 | ``` 35 | 36 | 2. **Activate the Virtual Environment** 37 | 38 | ```bash 39 | source bentoml-env/bin/activate 40 | ``` 41 | 42 | 3. **Install Required Packages** 43 | 44 | ```bash 45 | pip3 install bentoml scikit-learn pandas 46 | ``` 47 | 48 | 4. **Navigate to the Project Directory** 49 | 50 | ```bash 51 | cd 04-bentoml 52 | ``` 53 | 54 | ## Training the Model 55 | 56 | ### Version 1 57 | 58 | 1. **Train the Initial Model** 59 | 60 | ```bash 61 | python3 model_train_v1.py 62 | ``` 63 | 64 | 2. **List Available BentoML Models** 65 | 66 | ```bash 67 | bentoml models list 68 | ``` 69 | 70 | ## Serving the Model 71 | 72 | ### Version 1 73 | 74 | 1. **Serve the Model with BentoML** 75 | 76 | ```bash 77 | bentoml serve model_service_v1.py --reload 78 | ``` 79 | 80 | 2. **Make a Prediction Request** 81 | 82 | Open a new terminal window/tab, activate the virtual environment, navigate to the project directory, and run: 83 | 84 | ```bash 85 | curl -X POST "http://127.0.0.1:3000/predict_house_price" \ 86 | -H "Content-Type: application/json" \ 87 | -d '{"square_footage": 2500, "num_rooms": 5}' 88 | ``` 89 | 90 | ### Version 2 91 | 92 | 1. **Train the Enhanced Model** 93 | 94 | ```bash 95 | python3 model_train_v2.py 96 | ``` 97 | 98 | 2. **Serve the Enhanced Model** 99 | 100 | ```bash 101 | bentoml serve model_service_v2.py --reload 102 | ``` 103 | 104 | 3. **Make a Detailed Prediction Request** 105 | 106 | ```bash 107 | curl -X POST "http://127.0.0.1:3000/predict_house_price" \ 108 | -H "Content-Type: application/json" \ 109 | -d '{ 110 | "square_footage": 2500, 111 | "num_rooms": 5, 112 | "num_bathrooms": 3, 113 | "house_age": 10, 114 | "distance_to_city_center": 8, 115 | "has_garage": 1, 116 | "has_garden": 1, 117 | "crime_rate": 0.2, 118 | "avg_school_rating": 8, 119 | "country": "Germany" 120 | }' 121 | ``` 122 | 123 | ### Version 3 124 | 125 | 1. **Serve Additional Model Versions** 126 | 127 | ```bash 128 | bentoml serve model_service_v3.py --reload 129 | ``` 130 | 131 | 2. 
**Make Prediction Requests to Specific Model Versions** 132 | 133 | - **Version 1 Endpoint** 134 | 135 | ```bash 136 | curl -X POST "http://127.0.0.1:3000/predict_house_price_v1" \ 137 | -H "Content-Type: application/json" \ 138 | -d '{"square_footage": 2500, "num_rooms": 5}' 139 | ``` 140 | 141 | - **Version 2 Endpoint** 142 | 143 | ```bash 144 | curl -X POST "http://127.0.0.1:3000/predict_house_price_v2" \ 145 | -H "Content-Type: application/json" \ 146 | -d '{ 147 | "square_footage": 2500, 148 | "num_rooms": 5, 149 | "num_bathrooms": 3, 150 | "house_age": 10, 151 | "distance_to_city_center": 8, 152 | "has_garage": 1, 153 | "has_garden": 1, 154 | "crime_rate": 0.2, 155 | "avg_school_rating": 8, 156 | "country": "Germany" 157 | }' 158 | ``` 159 | 160 | ## Versioning 161 | 162 | Each version of the model and its corresponding service script allows for iterative improvements and testing. Ensure that you train and serve the appropriate version based on your requirements. 163 | 164 | ## Cleanup 165 | 166 | To deactivate the virtual environment and clean up your terminal: 167 | 168 | 1. **Deactivate the Virtual Environment** 169 | 170 | ```bash 171 | deactivate 172 | ``` 173 | 174 | 2. **Clear the Terminal (Optional)** 175 | 176 | ```bash 177 | clear 178 | ``` 179 | 180 | --- -------------------------------------------------------------------------------- /05-project/flask_app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | import pandas as pd 3 | import requests 4 | import base64 5 | import io 6 | 7 | app = Flask(__name__) 8 | 9 | # Route for the home page 10 | @app.route('/') 11 | def index(): 12 | return render_template('index.html') 13 | 14 | # Route to handle the CSV file upload and prediction 15 | @app.route('/predict', methods=['POST']) 16 | def predict(): 17 | file_data = request.form.get('file') 18 | 19 | # Decode the Base64 encoded file content 20 | decoded_file = base64.b64decode(file_data.split(',')[1]) 21 | 22 | # Read the decoded content into a DataFrame 23 | df = pd.read_csv(io.StringIO(decoded_file.decode('utf-8'))) 24 | 25 | # Separate the 'claim_id' column if it exists 26 | if 'claim_id' in df.columns: 27 | claim_ids = df['claim_id'] 28 | df = df.drop(columns=['claim_id']) 29 | else: 30 | claim_ids = None 31 | 32 | # Send the DataFrame to the BentoML service 33 | response = requests.post( 34 | 'http://127.0.0.1:3000/predict', # BentoML endpoint 35 | json=df.to_dict(orient='records') 36 | ) 37 | 38 | # Get predictions from the response 39 | predictions = response.json()['predictions'] 40 | 41 | # Add predictions to the DataFrame 42 | df['Prediction'] = predictions 43 | 44 | # Reattach the 'claim_id' column to the DataFrame 45 | if claim_ids is not None: 46 | df['claim_id'] = claim_ids 47 | 48 | # Reorder columns to have 'claim_id' first 49 | if 'claim_id' in df.columns: 50 | df = df[['claim_id'] + [col for col in df.columns if col != 'claim_id']] 51 | 52 | # Render the DataFrame as an HTML table 53 | return render_template('result.html', tables=[df.to_html(classes='data', header="true")]) 54 | 55 | if __name__ == '__main__': 56 | app.run(debug=True, port=5005) 57 | -------------------------------------------------------------------------------- /05-project/isolation_model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.ensemble import IsolationForest 3 | from sklearn.model_selection import train_test_split 
4 | import mlflow 5 | import mlflow.sklearn 6 | 7 | # Load the synthetic data 8 | df = pd.read_csv('synthetic_health_claims.csv') 9 | 10 | mlflow.set_tracking_uri("http://127.0.0.1:5000") 11 | 12 | # Features to use for the model 13 | features = ['claim_amount', 'num_services', 'patient_age', 'provider_id', 'days_since_last_claim'] 14 | 15 | # Split the data into training and test sets 16 | X_train, X_test = train_test_split(df[features], test_size=0.2, random_state=42) 17 | 18 | # Set up MLflow 19 | mlflow.set_experiment("Health Insurance Claim Anomaly Detection") 20 | 21 | with mlflow.start_run(): 22 | # Train the Isolation Forest model 23 | model = IsolationForest(n_estimators=100, contamination=0.05, random_state=42) 24 | model.fit(X_train) 25 | 26 | # Predict on the test set 27 | y_pred_train = model.predict(X_train) 28 | y_pred_test = model.predict(X_test) 29 | 30 | # Convert predictions to anomaly scores (-1 is anomaly, 1 is normal) 31 | anomaly_score_train = (y_pred_train == -1).astype(int) 32 | anomaly_score_test = (y_pred_test == -1).astype(int) 33 | 34 | # Log parameters 35 | mlflow.log_param("n_estimators", 100) 36 | mlflow.log_param("contamination", 0.05) 37 | 38 | # Log metrics 39 | train_anomaly_percentage = anomaly_score_train.mean() * 100 40 | test_anomaly_percentage = anomaly_score_test.mean() * 100 41 | 42 | mlflow.log_metric("train_anomaly_percentage", train_anomaly_percentage) 43 | mlflow.log_metric("test_anomaly_percentage", test_anomaly_percentage) 44 | 45 | # Log the model 46 | mlflow.sklearn.log_model(model, "model") 47 | 48 | print(f"Train Anomaly Percentage: {train_anomaly_percentage:.2f}%") 49 | print(f"Test Anomaly Percentage: {test_anomaly_percentage:.2f}%") 50 | print("Model and metrics logged to MLflow.") 51 | 52 | -------------------------------------------------------------------------------- /05-project/readme.md: -------------------------------------------------------------------------------- 1 | Here is the converted `README.md` file for your project: 2 | 3 | ```markdown 4 | # Health Claims Fraud Detection Project 5 | 6 | This project involves building a Flask web application that uses an Isolation Forest model to detect potentially fraudulent health claims. It leverages BentoML for model serving and MLflow for experiment tracking. 7 | 8 | ## Setup Instructions 9 | 10 | ### 1. Environment Setup 11 | 12 | - Load the bash profile and set up the virtual environment: 13 | 14 | ```bash 15 | source ~/.bash_profile 16 | virtualenv venv 17 | source venv/bin/activate 18 | ``` 19 | 20 | - Install required dependencies: 21 | 22 | ```bash 23 | pip3 install -r requirements.txt 24 | ``` 25 | 26 | ### 2. BentoML and Model Management 27 | 28 | - List BentoML models: 29 | 30 | ```bash 31 | bentoml models list 32 | ``` 33 | 34 | - Run the synthetic data generator script: 35 | 36 | ```bash 37 | python3 synthetic_health_claims.py 38 | ``` 39 | 40 | - Train and evaluate the Isolation Forest model: 41 | 42 | ```bash 43 | python3 isolation_model.py 44 | ``` 45 | 46 | ### 3. MLflow for Experiment Tracking 47 | 48 | - Start the MLflow UI to track experiments: 49 | 50 | ```bash 51 | mlflow ui 52 | ``` 53 | 54 | ### 4. Run the Flask Web Application 55 | 56 | - Open a new terminal window, source the environment, and run the Flask app: 57 | 58 | ```bash 59 | source ~/.bash_profile 60 | source venv/bin/activate 61 | python3 flask_app.py 62 | ``` 63 | 64 | ### 5. 
Register Model and Serve with BentoML 65 | 66 | - In another terminal, run the following commands to register and serve the model with BentoML: 67 | 68 | ```bash 69 | source ~/.bash_profile 70 | source venv/bin/activate 71 | python3 isolation_model.py 72 | python3 register_model.py 73 | bentoml serve service.py --reload 74 | ``` 75 | 76 | ## Additional Notes 77 | 78 | - Ensure that all commands are executed in the correct order for proper setup and functioning of the application. 79 | - Use BentoML to serve models and integrate them with the Flask app for real-time fraud detection. 80 | - The MLflow UI helps to track experiments and evaluate model performance. 81 | ``` 82 | 83 | This `README.md` file gives a clear step-by-step guide for setting up and running your project. Let me know if you'd like any additional adjustments! -------------------------------------------------------------------------------- /05-project/register_model.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | import pickle 3 | 4 | # Load the model from the downloaded PKL file using pickle 5 | model_path = "model.pkl" # Replace with your actual path 6 | 7 | with open(model_path, 'rb') as model_file: # Open in binary mode 8 | model = pickle.load(model_file) 9 | 10 | # Save the model to BentoML 11 | bento_model = bentoml.sklearn.save_model("health_insurance_anomaly_detector", model) 12 | 13 | print(f"Model registered with BentoML: {bento_model}") 14 | -------------------------------------------------------------------------------- /05-project/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | pandas 3 | numpy 4 | bentoml -------------------------------------------------------------------------------- /05-project/service.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | from bentoml.io import JSON, PandasDataFrame 3 | 4 | # Load the registered model 5 | model_runner = bentoml.sklearn.get("health_insurance_anomaly_detector:latest").to_runner() 6 | 7 | # Create a BentoML Service 8 | svc = bentoml.Service("health_insurance_anomaly_detection_service", runners=[model_runner]) 9 | 10 | # Define an API endpoint for prediction 11 | @svc.api(input=PandasDataFrame(), output=JSON()) 12 | def predict(data): 13 | # Make predictions 14 | predictions = model_runner.predict.run(data) 15 | # Return predictions as JSON 16 | return {"predictions": predictions.tolist()} 17 | -------------------------------------------------------------------------------- /05-project/synthetic_health_claims.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Set random seed for reproducibility 5 | np.random.seed(42) 6 | 7 | # Generate synthetic data 8 | num_samples = 1000 9 | data = { 10 | 'claim_id': np.arange(1, num_samples + 1), 11 | 'claim_amount': np.random.normal(1000, 250, num_samples), 12 | 'num_services': np.random.randint(1, 10, num_samples), 13 | 'patient_age': np.random.randint(18, 90, num_samples), 14 | 'provider_id': np.random.randint(1, 50, num_samples), 15 | 'days_since_last_claim': np.random.randint(0, 365, num_samples), 16 | } 17 | 18 | # Convert to DataFrame 19 | df = pd.DataFrame(data) 20 | 21 | # Introduce some anomalies (e.g., very high claim amounts) 22 | num_anomalies = 50 23 | anomalies = { 24 | 'claim_id': np.arange(num_samples + 1, num_samples + num_anomalies + 1), 25 
| 'claim_amount': np.random.normal(10000, 2500, num_anomalies), # Much higher amounts 26 | 'num_services': np.random.randint(10, 20, num_anomalies), 27 | 'patient_age': np.random.randint(18, 90, num_anomalies), 28 | 'provider_id': np.random.randint(1, 50, num_anomalies), 29 | 'days_since_last_claim': np.random.randint(0, 365, num_anomalies), 30 | } 31 | 32 | df_anomalies = pd.DataFrame(anomalies) 33 | 34 | # Combine normal data with anomalies 35 | df = pd.concat([df, df_anomalies]).reset_index(drop=True) 36 | 37 | # Shuffle the dataset 38 | df = df.sample(frac=1).reset_index(drop=True) 39 | 40 | # Save the data to CSV 41 | df.to_csv('synthetic_health_claims.csv', index=False) 42 | 43 | print("Synthetic data generated and saved to 'synthetic_health_claims.csv'.") 44 | -------------------------------------------------------------------------------- /05-project/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Health Insurance Anomaly Detection 7 | 29 | 30 | 31 |

[index.html body — HTML markup lost in extraction; recoverable text: an "Upload CSV File" heading and a "Drag & Drop your CSV file here or click to upload" file-drop area]
40 | 41 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /05-project/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Prediction Results 7 | 20 | 21 | 22 |

[result.html body — HTML markup lost in extraction; recoverable text: a "Prediction Results" page heading]

23 | {% for table in tables %} 24 | {{ table|safe }} 25 | {% endfor %} 26 | Go Back 27 | 28 | 29 | -------------------------------------------------------------------------------- /05-project/templates/visualize.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Visualization 7 | 8 | 9 |

[visualize.html body — HTML markup lost in extraction; recoverable text: a "Prediction Distribution" heading and a "Prediction Pie Chart" image]

12 | Go Back 13 | 14 | 15 | -------------------------------------------------------------------------------- /05-project/test_claim.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | 4 | # Define 10 different test inputs 5 | test_data = [ 6 | {"claim_amount": 1000, "num_services": 2, "patient_age": 30, "provider_id": 1, "days_since_last_claim": 100}, 7 | {"claim_amount": 2000, "num_services": 5, "patient_age": 45, "provider_id": 2, "days_since_last_claim": 200}, 8 | {"claim_amount": 15000, "num_services": 10, "patient_age": 50, "provider_id": 3, "days_since_last_claim": 300}, 9 | {"claim_amount": 500, "num_services": 1, "patient_age": 25, "provider_id": 4, "days_since_last_claim": 10}, 10 | {"claim_amount": 7500, "num_services": 8, "patient_age": 60, "provider_id": 5, "days_since_last_claim": 50}, 11 | {"claim_amount": 2500, "num_services": 3, "patient_age": 35, "provider_id": 6, "days_since_last_claim": 120}, 12 | {"claim_amount": 9000, "num_services": 15, "patient_age": 70, "provider_id": 7, "days_since_last_claim": 180}, 13 | {"claim_amount": 400, "num_services": 2, "patient_age": 22, "provider_id": 8, "days_since_last_claim": 365}, 14 | {"claim_amount": 11000, "num_services": 6, "patient_age": 55, "provider_id": 9, "days_since_last_claim": 250}, 15 | {"claim_amount": 600, "num_services": 4, "patient_age": 40, "provider_id": 10, "days_since_last_claim": 30}, 16 | ] 17 | 18 | # Convert to DataFrame 19 | df_test = pd.DataFrame(test_data) 20 | 21 | # Make the prediction request 22 | response = requests.post("http://127.0.0.1:3000/predict", json=df_test.to_dict(orient="records")) 23 | 24 | # Check the response 25 | if response.status_code == 200: 26 | predictions = response.json()["predictions"] 27 | for i, prediction in enumerate(predictions): 28 | print(f"Test Case {i+1}: Prediction: {prediction}") 29 | else: 30 | print(f"Error: {response.status_code} - {response.text}") 31 | -------------------------------------------------------------------------------- /05-project/v2_app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, redirect, url_for 2 | import pandas as pd 3 | import requests 4 | import base64 5 | import io 6 | import matplotlib.pyplot as plt 7 | import os 8 | 9 | app = Flask(__name__) 10 | 11 | # Route for the home page 12 | @app.route('/') 13 | def index(): 14 | return render_template('index.html') 15 | 16 | # Route to handle the CSV file upload and prediction 17 | @app.route('/predict', methods=['POST']) 18 | def predict(): 19 | file_data = request.form.get('file') 20 | 21 | # Decode the Base64 encoded file content 22 | decoded_file = base64.b64decode(file_data.split(',')[1]) 23 | 24 | # Read the decoded content into a DataFrame 25 | df = pd.read_csv(io.StringIO(decoded_file.decode('utf-8'))) 26 | 27 | # Separate the 'claim_id' column if it exists 28 | if 'claim_id' in df.columns: 29 | claim_ids = df['claim_id'] 30 | df = df.drop(columns=['claim_id']) 31 | else: 32 | claim_ids = None 33 | 34 | # Send the DataFrame to the BentoML service 35 | response = requests.post( 36 | 'http://127.0.0.1:3000/predict', # BentoML endpoint 37 | json=df.to_dict(orient='records') 38 | ) 39 | 40 | # Get predictions from the response 41 | predictions = response.json()['predictions'] 42 | 43 | # Add predictions to the DataFrame 44 | df['Prediction'] = predictions 45 | 46 | # Reattach the 'claim_id' column to the DataFrame 47 | if 
claim_ids is not None: 48 | df['claim_id'] = claim_ids 49 | 50 | # Reorder columns to have 'claim_id' first 51 | if 'claim_id' in df.columns: 52 | df = df[['claim_id'] + [col for col in df.columns if col != 'claim_id']] 53 | 54 | # Save the DataFrame to a session file for visualization 55 | df.to_csv('session_data.csv', index=False) 56 | 57 | # Render the DataFrame as an HTML table with a link to visualize 58 | return render_template('result.html', tables=[df.to_html(classes='data', header="true")]) 59 | 60 | # Route to handle the visualization 61 | @app.route('/visualize') 62 | def visualize(): 63 | # Load the session data 64 | df = pd.read_csv('session_data.csv') 65 | 66 | # Create a pie chart based on the 'Prediction' column 67 | prediction_counts = df['Prediction'].value_counts() 68 | plt.figure(figsize=(8, 8)) 69 | plt.pie(prediction_counts, labels=prediction_counts.index, autopct='%1.1f%%', startangle=140) 70 | plt.title('Prediction Distribution') 71 | 72 | # Save the pie chart as an image 73 | if not os.path.exists('static'): 74 | os.makedirs('static') 75 | chart_path = 'static/prediction_pie_chart.png' 76 | plt.savefig(chart_path) 77 | plt.close() 78 | 79 | # Render the visualization page with the pie chart 80 | return render_template('visualize.html', chart_path=chart_path) 81 | 82 | if __name__ == '__main__': 83 | app.run(debug=True, port=5005) 84 | -------------------------------------------------------------------------------- /06-orchestration/iot_dag.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.dummy import DummyOperator 3 | from airflow.operators.python import PythonOperator 4 | from airflow.utils.dates import days_ago 5 | from airflow.operators.email import EmailOperator 6 | import random 7 | import time 8 | 9 | # Function to generate random IoT data 10 | def generate_iot_data(**kwargs): 11 | data = [] 12 | for _ in range(60): # 60 seconds x 5 minutes = 300 readings (1 every second) 13 | data.append(random.choice([0, 1])) 14 | time.sleep(1) # simulate 1-second intervals 15 | return data 16 | 17 | # Function to aggregate the IoT data 18 | def aggregate_machine_data(**kwargs): 19 | ti = kwargs['ti'] 20 | data = ti.xcom_pull(task_ids='getting_iot_data') 21 | count_0 = data.count(0) 22 | count_1 = data.count(1) 23 | aggregated_data = {'count_0': count_0, 'count_1': count_1} 24 | return aggregated_data 25 | 26 | # Email content generation 27 | def create_email_content(**kwargs): 28 | ti = kwargs['ti'] 29 | aggregated_data = ti.xcom_pull(task_ids='aggrigate_machine_data') 30 | return f"Aggregated IoT Data:\nCount of 0: {aggregated_data['count_0']}\nCount of 1: {aggregated_data['count_1']}" 31 | 32 | # Default arguments for the DAG 33 | default_args = { 34 | 'owner': 'airflow', 35 | 'start_date': days_ago(1), 36 | 'email_on_failure': False, 37 | 'email_on_retry': False, 38 | 'retries': 1, 39 | } 40 | 41 | # Define the DAG 42 | with DAG( 43 | dag_id='iot_data_pipeline', 44 | default_args=default_args, 45 | schedule_interval=None, 46 | catchup=False, 47 | ) as dag: 48 | 49 | start_task = DummyOperator(task_id='start_task') 50 | 51 | getting_iot_data = PythonOperator( 52 | task_id='getting_iot_data', 53 | python_callable=generate_iot_data, 54 | ) 55 | 56 | aggregate_machine_data = PythonOperator( 57 | task_id='aggregate_machine_data', 58 | python_callable=aggregate_machine_data, 59 | ) 60 | 61 | end_task = DummyOperator(task_id='end_task') 62 | 63 | # Task dependencies 64 | start_task >> 
getting_iot_data >> aggregate_machine_data >> end_task 65 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ### MLOps foundation course code repo 2 | --------------------------------------------------------------------------------