├── .env
├── Jenkinsfile
├── README.md
├── api
│   ├── README.md
│   ├── images
│   │   └── deployment_pipeline.png
│   ├── triton_client.py
│   └── upload_model_to_minio.py
├── constants.py
├── deployments
│   ├── mwt.yaml
│   ├── triton-isvc.yaml
│   └── triton-servingruntime.yaml
├── distributed_training
│   ├── Dockerfile
│   ├── README.md
│   ├── build.sh
│   ├── images
│   │   └── training_pipeline.png
│   ├── mwt.py
│   ├── nets
│   │   └── nn.py
│   ├── test
│   │   └── test.yaml
│   ├── utils
│   │   ├── config.py
│   │   ├── dataset.py
│   │   └── image_utils.py
│   └── weights
│       └── model.h5
├── docker-compose.yml
├── images
│   ├── PipelineAllcode.png
│   ├── add_credential.png
│   ├── add_credential_dockerhub.png
│   ├── add_token_dockerhub.png
│   ├── architecutre_overview.png
│   ├── bus.jpg
│   ├── check_request_github_jenkins.png
│   ├── connector.png
│   ├── data_pipeline.png
│   ├── diagram_pipe.gif
│   ├── error_log_pod.png
│   ├── false_modelmesh_deploy.png
│   ├── generate_token_docker_hub.png
│   ├── get_token_github.png
│   ├── github_tokens.png
│   ├── instal_docker_jenkins.png
│   ├── install_docker_success.png
│   ├── isvc.png
│   ├── jenkins_container.png
│   ├── jenkins_portal.png
│   ├── jenkins_ui.png
│   ├── messenger.png
│   ├── minio-credentials.png
│   ├── mlflow _modelregistry.png
│   ├── modelmesh-serving-installation.png
│   ├── ngrok.png
│   ├── ngrok_forwarding.png
│   ├── password_jenkins.png
│   ├── result.png
│   ├── result_connect_jenkins_github.png
│   ├── result_push_dockerhub.png
│   ├── result_train_pod.png
│   ├── strategy.png
│   ├── strategy_scope.png
│   ├── structure_data.png
│   ├── structure_training.png
│   ├── topic_tab.png
│   ├── train_process.png
│   ├── ui_build_jenkins.png
│   ├── validate_connect_repo.png
│   └── webhook_github.png
├── mlflow
│   └── Dockerfile
├── model_repo
│   └── yolov8n_car
│       ├── 1
│       │   └── model.onnx
│       └── config.pbtxt
├── notebooks
│   └── debug.ipynb
├── requirements.txt
└── streaming
    ├── Dockerfile
    ├── README.md
    ├── docker-compose.yml
    ├── images
    │   └── data-pipeline.png
    ├── kafka_connector
    │   └── connect-timescaledb-sink.json
    ├── produce.py
    └── run.sh
-------------------------------------------------------------------------------- /.env: --------------------------------------------------------------------------------
1 | MINIO_ACCESS_KEY=AKIAIOSFODNN7EXAMPLE
2 | MINIO_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
-------------------------------------------------------------------------------- /Jenkinsfile: --------------------------------------------------------------------------------
1 | pipeline {
2 |     agent any
3 | 
4 |     options{
5 |         buildDiscarder(logRotator(numToKeepStr: '5', daysToKeepStr: '5'))
6 |         timestamps()
7 |     }
8 | 
9 |     environment{
10 |         registry = '6666688889/distributed_training'
11 |         registryCredential = 'dockerhub'
12 |     }
13 | 
14 |     stages {
15 |         stage('Build') {
16 |             steps {
17 |                 script {
18 |                     echo 'Building image for deployment..'
19 |                     def dockerImage = docker.build("${registry}:${BUILD_NUMBER}", "distributed_training/.")
20 |                     echo 'Pushing image to dockerhub..'
21 |                     docker.withRegistry( '', registryCredential ) {
22 |                         dockerImage.push()
23 |                         dockerImage.push('latest')
24 |                     }
25 |                 }
26 |             }
27 |         }
28 |         // stage('Deploy') {
29 |         //     steps {
30 |         //         echo 'Deploying models..'
31 |         //         echo 'Running a script to trigger pull and start a docker container'
32 |         //     }
33 |         // }
34 |     }
35 | }
36 | 
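The `Deploy` stage above is left commented out. A minimal sketch of what it could look like — an assumption, not part of the original pipeline; it presumes `kubectl` is configured on the Jenkins agent and simply reuses the repository's own manifest:

```groovy
// Hypothetical Deploy stage (stubbed out in the original Jenkinsfile).
stage('Deploy') {
    steps {
        echo 'Deploying models..'
        // Roll the freshly pushed image out via the repo's TFJob manifest.
        sh 'kubectl apply -f deployments/mwt.yaml'
    }
}
```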
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | # Scalable ML System for Car Detection
3 | 
4 | ## 📕 Table Of Contents
5 | - 🗣️ [Introduction](#introduction)
6 | - 🚀 [Challenge](#challenge)
7 | - 🌟 [System Architecture](#system-architecture)
8 | - 📁 [Repository Structure](#repository-structure)
9 | - 🔍 [How to Guide](#how-to-guide)
10 | 
11 | ## 🗣️ Introduction:
12 | 
13 | This project implements an advanced car detection system using a comprehensive machine learning pipeline. Our solution leverages state-of-the-art technologies to process data, train models, and deploy them efficiently at scale.
14 | 
15 | ## 🚀 Challenge:
16 | This project faced several challenges: ensuring data consistency and scalability during ingestion; managing resources and synchronization in distributed training; automating the CI/CD pipeline; converting and deploying models efficiently; ensuring data privacy and security; optimizing performance; and debugging and troubleshooting a distributed system.
17 | 
18 | 
19 | ## 🌟 System Architecture
20 | ![systemoverview](images/architecutre_overview.png)
21 | 
22 | The pipeline consists of two main components:
23 | 
24 | - **Data Pipeline**: This part of the system handles the ingestion, preprocessing, and feature extraction of car detection data. It includes steps like loading the dataset, performing preprocessing tasks, and extracting relevant features using tools like Apache Flink and Redis.
25 | - **Training and Deployment Pipeline**: The training and deployment pipeline focuses on the model development and deployment processes. It includes steps like saving the trained model and artifacts, evaluating the model, and deploying the model using tools like MLflow, Jenkins, and Kubernetes.
26 | 
27 | 
28 | **Key features of our pipeline include:**
29 | 
30 | - Data ingestion and preprocessing using Airflow and Kafka for stream processing
31 | - Feature storage in Redis and an offline data store
32 | - Distributed model training with TensorFlow on Kubeflow
33 | - Model versioning and artifact management with MLflow
34 | - Automated deployment pipeline using Jenkins and Kubernetes
35 | - Scalable model serving with the KServe API server
36 | 
37 | 
38 | ## 📁 Repository Structure
39 | ```
40 | 📦
41 | ├─ .env # Environment variables used across the project
42 | ├─ Jenkinsfile # Configuration for a Jenkins CI/CD pipeline
43 | ├─ README.md # General project documentation
44 | ├─ api # Contains code related to the API layer
45 | │  ├─ README.md # Documentation for the API serving component
46 | │  ├─ triton_client.py # Code for interacting with the Triton Inference Server
47 | │  └─ upload_model_to_minio.py # Script to upload the trained model to MinIO storage
48 | ├─ constants.py # Shared constants and configurations used across the project
49 | ├─ deployments # Kubernetes configurations
50 | │  ├─ mwt.yaml # Configuration for the Multi-Worker Training (MWT) component
51 | │  ├─ triton-isvc.yaml # Configuration for the Triton Inference Service
52 | │  └─ triton-servingruntime.yaml # Configuration for the Triton Inference Server runtime
53 | ├─ distributed_training # Code and configuration for distributed training
54 | │  ├─ Dockerfile # Dockerfile for the distributed training component
55 | │  ├─ README.md # Documentation for the distributed training component
56 | │  ├─ build.sh # Script to build the distributed training Docker image
57 | │  ├─ mwt.py # Main logic for the Multi-Worker Training component
58 | │  ├─ nets # Neural network architecture definitions
59 | │  │  └─ nn.py # Neural network model implementation
60 | │  ├─ test # Test configuration for the distributed training
61 | │  │  └─ test.yaml # Test deployment configuration
62 | │  ├─ utils # Utility functions for the distributed training
63 | │  │  ├─ config.py # Configuration handling for the distributed training
64 | │  │  ├─ dataset.py # Dataset-related utilities
65 | │  │  └─ image_utils.py # Image processing utilities
66 | │  └─ weights # Folder containing a pre-trained model
67 | │     └─ model.h5 # Saved weights for the pre-trained model
68 | ├─ docker-compose.yml # Docker Compose configuration for the entire project
69 | ├─ images # Folder for storing project-related images
70 | ├─ mlflow # Code and configuration for the MLflow component
71 | │  └─ Dockerfile # Dockerfile for the MLflow component
72 | ├─ model_repo # Repository for storing the trained model
73 | │  └─ yolov8n_car # Folder for the YOLOv8 car detection model
74 | │     ├─ 1 # Version 1 of the model
75 | │     │  └─ model.onnx # ONNX format of the trained model
76 | │     └─ config.pbtxt # Triton Inference Server configuration for the model
77 | ├─ notebooks # Jupyter notebooks for debugging and exploration
78 | │  └─ debug.ipynb # Sample Jupyter Notebook for debugging
79 | ├─ requirements.txt # Python dependencies for the project
80 | └─ streaming # Code and configuration for the data streaming component
81 |    ├─ Dockerfile # Dockerfile for the streaming component
82 |    ├─ README.md # Documentation for the streaming component
83 |    ├─ docker-compose.yml # Docker Compose configuration for the streaming component
84 |    ├─ kafka_connector # Configuration for the Kafka connector
85 |    │  └─ connect-timescaledb-sink.json # Kafka connector configuration for the TimescaleDB sink
86 |    ├─ produce.py # Script to produce sample data for the streaming component
87 |    └─ run.sh # Script to run the streaming component
88 | ```
89 | 
90 | ## 🔍 How to Guide:
91 | 
92 | ### 1. Data Pipeline:
93 | - The data pipeline starts with the Car Detection Dataset Source.
94 | - Images are loaded and preprocessed, and features are extracted, using Airflow.
95 | - We also use Kafka to ingest a simulated data stream, which is then processed by Apache Flink.
96 | - Data is stored in Redis (the online store) and synced to PostgreSQL (the offline store).
97 | 
98 | To get started with the *Data Pipeline* component:
99 | ```shell
100 | cd streaming
101 | ```
102 | 
103 | And read the respective README file: [Data Pipeline Guide](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/streaming/README.md)
104 | 
105 | ### 2. Training Pipeline:
106 | Our training pipeline utilizes Kubeflow and TensorFlow for distributed training. Here's an overview of the process:
107 | 
108 | 1. Data Preparation: Features are pulled from the offline store and prepared using Kubeflow.
109 | 
110 | 2. Distributed Training: We use TensorFlow for distributed training, which allows us to process large datasets efficiently across multiple nodes.
111 | 
112 | 3. Model Evaluation: After training, the model is evaluated to ensure it meets performance criteria.
113 | 
114 | 4. Artifact Management: The trained model and associated artifacts are saved to the MLflow model registry for versioning and easy retrieval.
115 | 
116 | Key features of our distributed training approach:
117 | - Scalability: Easily scale training across multiple nodes using Kubeflow.
118 | - Efficiency: Utilize TensorFlow's distributed training capabilities for faster processing.
119 | - Version Control: Track experiments and models using MLflow for reproducibility.
120 | 
121 | To get started with the training pipeline:
122 | 
123 | ```shell
124 | cd distributed_training
125 | ```
126 | 
127 | For detailed instructions on setting up and running the distributed training, please refer to our [Distributed Training Guide](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/distributed_training/README.md).
128 | 
129 | 
130 | 
131 | 
132 | ### 3. Deployment Pipeline:
133 | The serving pipeline deploys the trained model for inference, ensuring that it can handle various workloads efficiently. Below are the key aspects of our serving approach:
134 | 
135 | 1. **Scalability**: ModelMesh scales the serving infrastructure dynamically to accommodate varying loads and large volumes of requests, ensuring reliable performance even under heavy demand.
136 | 
137 | 2. **Multi-Model Support**: ModelMesh can manage and serve multiple models simultaneously, providing flexibility in deployment strategies and enabling seamless model updates.
138 | 
139 | 3. **Efficient Resource Utilization**: By dynamically allocating resources based on the demand for different models, ModelMesh optimizes the use of computational resources, reducing costs and improving efficiency.
140 | 
141 | To get started with the serving pipeline:
142 | 
143 | ```shell
144 | cd api
145 | ```
146 | 
147 | For detailed instructions on setting up and managing the serving infrastructure, please refer to our [Deployment Pipeline Guide](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/api/README.md).
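Once every component is up, a quick end-to-end sanity check looks like this (commands taken from the deployment guide below; they assume the ModelMesh service is already running in the `modelmesh-serving` namespace):

```shell
# Expose the serving endpoint locally, then run one image through it.
kubectl port-forward --address 0.0.0.0 service/modelmesh-serving 8008 -n modelmesh-serving &
python api/triton_client.py   # downloads bus.jpg, runs inference, writes drawed.jpg
```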
-------------------------------------------------------------------------------- /api/README.md: --------------------------------------------------------------------------------
1 | # Deployment Pipeline Guide
2 | 
3 | ## Table of Contents
4 | 
5 | 1. [Prerequisites](#prerequisites)
6 |    - [Install kustomize](#install-kustomize)
7 |    - [Install modelmesh-serving](#install-modelmesh-serving)
8 | 2. [Deployment Pipeline Overview](#deployment-pipeline-overview)
9 | 3. [Getting Started](#getting-started)
10 | 4. [Making Prediction](#making-prediction)
11 | 
12 | ---
13 | 
14 | ## Prerequisites
15 | 
16 | Before getting started, ensure that your environment meets the following prerequisites:
17 | 
18 | - GKE version: use GKE version 1.29
19 | 
20 | ### Install kustomize
21 | 
22 | [Kustomize](https://kubectl.docs.kubernetes.io/) is an alternative tool to Helm for installing applications on Kubernetes. Install it by running the following commands:
23 | 
24 | ```shell
25 | curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
26 | sudo mv kustomize /usr/local/bin/
27 | ```
28 | 
29 | ### Install modelmesh-serving
30 | 
31 | Clone the modelmesh-serving repository:
32 | 
33 | ```shell
34 | RELEASE=release-0.9
35 | git clone -b $RELEASE --depth 1 --single-branch https://github.com/kserve/modelmesh-serving.git
36 | cd modelmesh-serving
37 | ```
38 | 
39 | Create a new namespace and install modelmesh-serving:
40 | 
41 | ```shell
42 | kubectl create namespace modelmesh-serving
43 | ./scripts/install.sh --namespace modelmesh-serving --quickstart
44 | ```
45 | 
46 | After a few minutes, you should see the following output:
47 | 
48 | ![modelmesh-serving](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/modelmesh-serving-installation.png)
49 | 
50 | ## Deployment Pipeline Overview
51 | 
52 | The following diagram provides an overview of the deployment pipeline, detailing each step from model optimization to deployment and scaling.
53 | 
54 | ![deploymentOverview](images/deployment_pipeline.png)
55 | 
56 | ### Key Components:
57 | 
58 | 1. Model Optimization (ONNX):
59 |    - Optimizes the model for serving by converting it into ONNX format.
60 | 
61 | 2. Model Testing:
62 |    - Runs tests to ensure that the optimized model meets the necessary performance and accuracy criteria.
63 | 
64 | 3. Runtime Containerization:
65 |    - Packages the model into a containerized runtime environment.
66 | 
67 | 4. Ingest Serving-Model to S3:
68 |    - The containerized model is uploaded to S3-compatible storage, such as MinIO.
69 | 
70 | 5. Deployment and Scaling:
71 |    - The model is deployed and scaled using Kubernetes (K8s), managed through `kubectl`.
72 | 
73 | 6. Model Serving API:
74 |    - The deployed model is accessible via an API, allowing users to make predictions.
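For reference, the serving API speaks the KServe v2 inference protocol over REST. A sketch of the request shape — the model name `onnx` and the input tensor name `images` follow `api/triton_client.py`, and the `data` array must hold all 1×3×640×640 float values (elided here):

```shell
curl -X POST http://localhost:8008/v2/models/onnx/infer \
  -H "Content-Type: application/json" \
  -d '{"inputs": [{"name": "images", "shape": [1, 3, 640, 640], "datatype": "FP32", "data": [...]}]}'
```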
80 | 
81 | ## Getting Started
82 | 
83 | ### Port-forward the `MinIO` Service
84 | 
85 | To access MinIO locally, use the following command:
86 | 
87 | ```shell
88 | kubectl port-forward svc/minio 9000:9000 -n modelmesh-serving
89 | ```
90 | 
91 | ### Access MinIO Credentials
92 | 
93 | Obtain the `MINIO_ACCESS_KEY` and `MINIO_SECRET_KEY` required to sign in and upload your models to MinIO (replace `minio-676b8dcf45-nw2zw` with your own MinIO pod name, as listed by `kubectl get pods -n modelmesh-serving`):
94 | 
95 | ```shell
96 | kubectl get po minio-676b8dcf45-nw2zw -o json | jq -r '.spec.containers[0].env[] | select(.name == "MINIO_ACCESS_KEY") | .value'
97 | 
98 | kubectl get po minio-676b8dcf45-nw2zw -o json | jq -r '.spec.containers[0].env[] | select(.name == "MINIO_SECRET_KEY") | .value'
99 | ```
100 | 
101 | You can see that in my case, `MINIO_ACCESS_KEY` is `AKIAIOSFODNN7EXAMPLE`, and `MINIO_SECRET_KEY` is `wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY`.
102 | 
103 | ![minio-credentials](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/minio-credentials.png)
104 | 
105 | Open `localhost:9000` to access MinIO and upload the model to the MinIO bucket. The structure for storing our ONNX model and the `config.pbtxt` file should look as shown below. Remember to use the model trained in the previous step for serving; the format must be ONNX, so convert the weight file to ONNX before uploading it to the bucket.
106 | 
107 | ![Screenshot from 2024-05-11 17-01-13](https://github.com/HungNguyenDev1511/Capstone-Project-Model-Serving/assets/69066161/adc4b65c-a51c-4e64-9a1a-377f680810ed)
108 | 
109 | ![Screenshot from 2024-05-11 17-01-19](https://github.com/HungNguyenDev1511/Capstone-Project-Model-Serving/assets/69066161/8461cdc0-1fcd-491e-9b24-8d8d9b5bfc58)
110 | 
111 | 
112 | ### Upload Model to MinIO:
113 | 
114 | You can manually upload the model or use the following script:
115 | ``` shell
116 | python api/upload_model_to_minio.py
117 | ```
118 | 
119 | ### Deploy the ONNX Model:
120 | 
121 | Deploy the model using the following commands:
122 | 
123 | ```shell
124 | kubectl apply -f deployments/triton-isvc.yaml
125 | kubectl apply -f deployments/triton-servingruntime.yaml
126 | ```
127 | 
128 | ### Verify the Service Readiness:
129 | 
130 | Check if the service is ready:
131 | 
132 | ```shell
133 | kubectl get isvc
134 | ```
135 | 
136 | At first, the `READY` column shows `false`:
137 | 
138 | ![Error](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/false_modelmesh_deploy.png)
139 | 
140 | It may take several minutes for the service to become `READY`.
141 | 
142 | If it doesn't, inspect the pod corresponding to Triton and check the logs of its `mm` container:
143 | 
144 | ```shell
145 | kubectl describe pod modelmesh-serving-triton-2.x-6c4978d6db-5k59z
146 | kubectl logs modelmesh-serving-triton-2.x-6c4978d6db-5k59z -c mm
147 | ```
148 | 
149 | ![Error Log Pod Describe](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/error_log_pod.png)
150 | 
151 | 
152 | Once the service is ready, you should see the following result:
153 | 
154 | ![Result](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/result.png)
155 | ![Result Inference Service](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/isvc.png)
156 | 
157 | 
158 | ## Making Prediction:
159 | 
160 | To make a prediction, follow these steps:
161 | 
162 | 1. Port-forward the `modelmesh-serving` service:
163 | ```shell
164 | kubectl port-forward --address 0.0.0.0 service/modelmesh-serving 8008 -n modelmesh-serving
165 | ```
166 | 2. Test your newly created `modelmesh-serving` service:
167 | ```shell
168 | python api/triton_client.py
169 | ```
170 | 
-------------------------------------------------------------------------------- /api/images/deployment_pipeline.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/api/images/deployment_pipeline.png
-------------------------------------------------------------------------------- /api/triton_client.py: --------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import numpy as np
4 | import wget
5 | import os
6 | import cv2
7 | from constants import CLASSES  # Import the CLASSES list from constants.py
8 | 
9 | 
10 | def preprocess(cv2_image, model_shape=(640, 640)):
11 |     image_rgb = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
12 |     resized = cv2.resize(image_rgb, model_shape)
13 | 
14 |     # Scale pixel values to the range [0, 1]
15 |     input_image = resized / 255.0
16 |     input_image = input_image.transpose(2, 0, 1)
17 |     result = input_image[np.newaxis, :, :, :].astype(np.float32)
18 | 
19 |     return result
20 | 
21 | 
22 | def xywh2xyxy(x):
23 |     # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
24 |     y = np.copy(x)
25 |     y[..., 0] = x[..., 0] - x[..., 2] / 2
26 |     y[..., 1] = x[..., 1] - x[..., 3] / 2
27 |     y[..., 2] = x[..., 0] + x[..., 2] / 2
28 |     y[..., 3] = x[..., 1] + x[..., 3] / 2
29 |     return y
30 | 
31 | 
32 | def nms(boxes, scores, iou_threshold):
33 |     # Sort by score
34 |     sorted_indices = np.argsort(scores)[::-1]
35 | 
36 |     keep_boxes = []
37 |     while sorted_indices.size > 0:
38 |         # Pick the box with the highest remaining score
39 |         box_id = sorted_indices[0]
40 |         keep_boxes.append(box_id)
41 | 
42 |         # Compute IoU of the picked box with the rest
43 |         ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
44 | 
45 |         # Remove boxes with IoU over the threshold
46 |         keep_indices = np.where(ious < iou_threshold)[0]
47 | 
48 |         sorted_indices = sorted_indices[keep_indices + 1]
49 | 
50 |     return keep_boxes
51 | 
52 | 
53 | def compute_iou(box, boxes):
54 |     # Compute xmin, ymin, xmax, ymax for both boxes
55 |     xmin = np.maximum(box[0], boxes[:, 0])
56 |     ymin = np.maximum(box[1], boxes[:, 1])
57 |     xmax = np.minimum(box[2], boxes[:, 2])
58 |     ymax = np.minimum(box[3], boxes[:, 3])
59 | 
60 |     # Compute intersection area
61 |     intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
62 | 
63 |     # Compute union area
64 |     box_area = (box[2] - box[0]) * (box[3] - box[1])
65 |     boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
66 |     union_area = box_area + boxes_area - intersection_area
67 | 
68 |     # Compute IoU
69 |     iou = intersection_area / union_area
70 | 
71 |     return iou
72 | 
73 | 
74 | def postprocess(outputs, original_shape, model_shape=(640, 640), threshold=0.8):
75 |     model_height, model_width = model_shape
76 |     original_height, original_width = original_shape[:2]
77 |     outputs = np.array(outputs[0]["data"]).reshape(outputs[0]["shape"])
78 |     predictions = np.squeeze(outputs).T
79 | 
80 |     # Filter out object confidence scores below threshold
81 |     scores = np.max(predictions[:, 4:], axis=1)
82 |     predictions = predictions[scores > threshold, :]
83 |     scores = scores[scores > threshold]
84 |     class_ids = np.argmax(predictions[:, 4:], axis=1)
85 | 
86 |     # Get bounding boxes for each object
87 |     bboxes = predictions[:, :4]
88 | 
89 |     # Rescale bboxes from model-input coordinates to the original image size
90 |     model_shape = np.array([model_width, model_height, model_width, model_height])
91 |     original_shape = np.array(
92 |         [original_width, original_height, original_width, original_height]
93 |     )
94 |     bboxes = np.divide(bboxes, model_shape, dtype=np.float32)
95 |     bboxes *= original_shape
96 |     bboxes = bboxes.astype(np.int32)
97 | 
98 |     # Convert (cx, cy, w, h) to corner format (x1, y1, x2, y2) before NMS,
99 |     # since compute_iou expects corner coordinates, then apply non-maxima
100 |     # suppression to suppress weak, overlapping bounding boxes
101 |     bboxes = xywh2xyxy(bboxes)
102 |     indices = nms(bboxes, scores, 0.3)
103 | 
104 |     return bboxes[indices], scores[indices], class_ids[indices]
105 | 
106 | 
107 | def draw_image(image, bboxes, scores, class_ids):
108 |     image_draw = image.copy()
109 |     # Boxes are already in corner format after postprocess()
110 |     for bbox, score, label in zip(bboxes, scores, class_ids):
111 |         bbox = bbox.round().astype(np.int32).tolist()
112 |         cls_id = int(label)
113 |         cls = CLASSES[cls_id]
114 |         color = (0, 255, 0)
115 |         cv2.rectangle(image_draw, tuple(bbox[:2]), tuple(bbox[2:]), color, 2)
116 |         cv2.putText(
117 |             image_draw,
118 |             f"{cls}:{int(score*100)}",
119 |             (bbox[0], bbox[1] - 2),
120 |             cv2.FONT_HERSHEY_SIMPLEX,
121 |             0.60,
122 |             [225, 255, 255],
123 |             thickness=1,
124 |         )
125 |     cv2.imwrite("drawed.jpg", image_draw)
126 | 
127 | 
128 | def main():
129 |     image_url = "https://ultralytics.com/images/bus.jpg"
130 |     image_name = os.path.basename(image_url)
131 |     if not os.path.exists(image_name):
132 |         wget.download(image_url)
133 | 
134 |     original_image = cv2.imread(image_name)
135 |     image = preprocess(original_image)
136 | 
137 |     request_data = {
138 |         "inputs": [
139 |             {
140 |                 "name": "images",
141 |                 "shape": image.shape,
142 |                 "datatype": "FP32",
143 |                 "data": image.flatten().tolist(),  # Flatten the image and convert to list
144 |             }
145 |         ]
146 |     }
147 | 
148 |     headers = {
149 |         "Content-Type": "application/json",
150 |     }
151 | 
152 |     response = requests.post(
153 |         "http://localhost:8008/v2/models/onnx/infer",
154 |         headers=headers,
155 |         data=json.dumps(request_data),
156 |         verify=False,
157 |     ).json()
158 | 
159 |     result = response["outputs"]
160 |     bboxes, scores, class_ids = postprocess(result, original_image.shape)
161 |     print(bboxes)
162 |     print(scores)
163 |     print(class_ids)
164 |     draw_image(original_image, bboxes, scores, class_ids)
165 | 
166 | 
167 | if __name__ == "__main__":
168 |     main()
-------------------------------------------------------------------------------- /api/upload_model_to_minio.py: --------------------------------------------------------------------------------
1 | from minio import Minio
2 | from minio.error import S3Error
3 | from dotenv import load_dotenv
4 | import os
5 | 
6 | 
7 | def main():
8 |     # Load MINIO_ACCESS_KEY / MINIO_SECRET_KEY from the project's .env file.
9 |     load_dotenv()
10 | 
11 |     # Create a client for the local MinIO server. The endpoint is host:port
12 |     # (no scheme); secure=False because the port-forward speaks plain HTTP.
13 |     client = Minio(
14 |         "localhost:9000",
15 |         access_key=os.getenv("MINIO_ACCESS_KEY"),
16 |         secret_key=os.getenv("MINIO_SECRET_KEY"),
17 |         secure=False,
18 |     )
19 | 
20 |     # Make the bucket referenced by deployments/triton-isvc.yaml if it does
21 |     # not exist yet.
22 |     bucket_name = "modelmesh-example-models"
23 |     if not client.bucket_exists(bucket_name):
24 |         client.make_bucket(bucket_name)
25 |     else:
26 |         print(f"Bucket {bucket_name} already exists")
27 | 
28 |     # Upload the Triton model repository (model.onnx and config.pbtxt),
29 |     # preserving the directory layout. The bucket and prefix must match the
30 |     # storageUri in deployments/triton-isvc.yaml.
31 |     local_repo = "./model_repo/yolov8n_car"
32 |     prefix = "cardetect/yolov8n_car"
33 |     for root, _, files in os.walk(local_repo):
34 |         for file_name in files:
35 |             file_path = os.path.join(root, file_name)
36 |             rel_path = os.path.relpath(file_path, local_repo)
37 |             object_name = f"{prefix}/{rel_path}".replace(os.sep, "/")
38 |             client.fput_object(bucket_name, object_name, file_path)
39 |     print(f"Model and config are successfully uploaded to bucket '{bucket_name}'.")
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     try:
44 |         main()
45 |     except S3Error as exc:
46 |         print("error occurred.", exc)
-------------------------------------------------------------------------------- /constants.py: --------------------------------------------------------------------------------
1 | # constants.py
2 | CLASSES = [
3 |     "person",
4 |     "bicycle",
5 |     "car",
6 |     "motorcycle",
7 |     "airplane",
8 |     "bus",
9 |     "train",
10 |     "truck",
11 |     "boat",
12 |     "traffic light",
13 |     "fire hydrant",
14 |     "street sign",
15 |     "stop sign",
16 |     "parking meter",
17 |     "bench",
18 |     "bird",
19 |     "cat",
20 |     "dog",
21 |     "horse",
22 |     "sheep",
23 |     "cow",
24 |     "elephant",
25 |     "bear",
26 |     "zebra",
27 |     "giraffe",
28 |     "hat",
29 |     "backpack",
30 |     "umbrella",
31 |     "shoe",
32 |     "eye glasses",
33 |     "handbag",
34 |     "tie",
35 |     "suitcase",
36 |     "frisbee",
37 |     "skis",
38 |     "snowboard",
39 |     "sports ball",
40 |     "kite",
41 |     "baseball bat",
42 |     "baseball glove",
43 |     "skateboard",
44 |     "surfboard",
45 |     "tennis racket",
46 |     "bottle",
47 |     "plate",
48 |     "wine glass",
49 |     "cup",
50 |     "fork",
51 |     "knife",
52 |     "spoon",
53 |     "bowl",
54 |     "banana",
55 |     "apple",
56 |     "sandwich",
57 |     "orange",
58 |     "broccoli",
59 |     "carrot",
60 |     "hot dog",
61 |     "pizza",
62 |     "donut",
63 |     "cake",
64 |     "chair",
65 |     "couch",
66 |     "potted plant",
67 |     "bed",
68 |     "mirror",
69 |     "dining table",
70 |     "window",
71 |     "desk",
72 |     "toilet",
73 |     "door",
74 |     "tv",
75 |     "laptop",
76 |     "mouse",
77 |     "remote",
78 |     "keyboard",
79 |     "cell phone",
80 |     "microwave",
81 |     "oven",
82 |     "toaster",
83 |     "sink",
84 |     "refrigerator",
85 |     "blender",
86 |     "book",
87 |     "clock",
88 |     "vase",
89 |     "scissors",
90 |     "teddy bear",
91 |     "hair drier",
92 |     "toothbrush",
93 | ]
-------------------------------------------------------------------------------- /deployments/mwt.yaml: --------------------------------------------------------------------------------
1 | apiVersion: kubeflow.org/v1
2 | kind: TFJob
3 | metadata:
4 |   name: multi-worker
5 |   namespace: distributed-training
6 | spec:
7 |   tfReplicaSpecs:
8 |     Worker:
9 |       replicas: 2
10 |       restartPolicy: Never
11 |       template:
12 |         spec:
13 |           containers:
14 |             - name: tensorflow
15 |               image: 6666688889/distributed_training:0.0.13
16 |               volumeMounts:
17 |                 - mountPath: /train
18 |                   name: training
19 |                   readOnly: true
20 |           volumes:
21 |             - name: training
22 |               persistentVolumeClaim:
23 |                 claimName: mwt-volume
24 |                 readOnly: true
25 | ---
26 | apiVersion: v1
27 | kind: PersistentVolumeClaim
28 | metadata:
29 |   name: mwt-volume
30 |   namespace: distributed-training
31 | spec:
32 |   accessModes:
33 |     - ReadWriteMany
34 |   resources:
35 |     requests:
36 |       storage: 10Gi
-------------------------------------------------------------------------------- /deployments/triton-isvc.yaml: --------------------------------------------------------------------------------
1 | apiVersion: serving.kserve.io/v1beta1
2 | kind: InferenceService
3 | metadata:
4 |   name: cardetection-mm
5 |   namespace: modelmesh-serving
6 |   annotations:
7 |     serving.kserve.io/deploymentMode: ModelMesh
8 |     serving.kserve.io/secretKey: localMinIO
9 | spec:
10 |   predictor:
11 |     model:
12 |       modelFormat:
13 |         name: onnx
14 |       runtime: triton-2.x
15 |       storageUri: s3://modelmesh-example-models/cardetect/yolov8n_car
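The `storageUri` above must point at a Triton-style model repository (`<name>/<version>/model.onnx` plus a `config.pbtxt`, mirroring the `model_repo/yolov8n_car` folder in this repo). A minimal `config.pbtxt` sketch for a YOLOv8n ONNX export — the input tensor name `images` follows `api/triton_client.py`, while the output name `output0` and the 84 = 4 box coordinates + 80 class scores layout are assumptions based on the default Ultralytics export:

```
name: "yolov8n_car"
platform: "onnxruntime_onnx"
max_batch_size: 0
input [
  {
    name: "images"
    data_type: TYPE_FP32
    dims: [ 1, 3, 640, 640 ]
  }
]
output [
  {
    name: "output0"
    data_type: TYPE_FP32
    dims: [ 1, 84, 8400 ]
  }
]
```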
-------------------------------------------------------------------------------- /deployments/triton-servingruntime.yaml: --------------------------------------------------------------------------------
1 | # Copyright 2021 IBM Corporation
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | apiVersion: serving.kserve.io/v1alpha1
15 | kind: ServingRuntime
16 | metadata:
17 |   name: triton-2.x
18 |   labels:
19 |     name: modelmesh-serving-triton-2.x-SR
20 |   annotations:
21 |     maxLoadingConcurrency: "2"
22 |     serving.kserve.io/autoscalerClass: hpa
23 |     serving.kserve.io/targetUtilizationPercentage: "75"
24 |     serving.kserve.io/metrics: "cpu"
25 |     serving.kserve.io/min-scale: "2"
26 |     serving.kserve.io/max-scale: "3"
27 | spec:
28 |   supportedModelFormats:
29 |     - name: keras
30 |       version: "2" # 2.6.0
31 |       autoSelect: true
32 |     - name: onnx
33 |       version: "1" # 1.5.3
34 |       autoSelect: true
35 |     - name: pytorch
36 |       version: "1" # 1.8.0a0+17f8c32
37 |       autoSelect: true
38 |     - name: tensorflow
39 |       version: "1" # 1.15.4
40 |       autoSelect: true
41 |     - name: tensorflow
42 |       version: "2" # 2.3.1
43 |       autoSelect: true
44 |     - name: tensorrt
45 |       version: "7" # 7.2.1
46 |       autoSelect: true
47 | 
48 |   protocolVersions:
49 |     - grpc-v2
50 |   multiModel: true
51 |   replicas: 1
52 |   grpcEndpoint: "port:8085"
53 |   grpcDataEndpoint: "port:8001"
54 | 
55 |   containers:
56 |     - name: triton
57 |       image: nvcr.io/nvidia/tritonserver:23.09-py3
58 |       command: [/bin/sh]
59 |       args:
60 |         - -c
61 |         - 'mkdir -p /models/_triton_models;
62 |           chmod 777 /models/_triton_models;
63 |           exec tritonserver
64 |           "--model-repository=/models/_triton_models"
65 |           "--model-control-mode=explicit"
66 |           "--strict-model-config=false"
67 |           "--strict-readiness=false"
68 |           "--allow-http=true"
69 |           "--allow-sagemaker=false"
70 |           '
71 |       resources:
72 |         requests:
73 |           cpu: 500m
74 |           memory: 1Gi
75 |         limits:
76 |           cpu: "5"
77 |           memory: 1Gi
78 |       livenessProbe:
79 |         # the server is listening only on 127.0.0.1, so an httpGet probe sent
80 |         # from the kubelet running on the node cannot connect to the server
81 |         # (not even with the Host header or host field)
82 |         # exec a curl call to have the request originate from localhost in the
83 |         # container
84 |         exec:
85 |           command:
86 |             - curl
87 |             - --fail
88 |             - --silent
89 |             - --show-error
90 |             - --max-time
91 |             - "9"
92 |             - http://localhost:8000/v2/health/live
93 |         initialDelaySeconds: 5
94 |         periodSeconds: 30
95 |         timeoutSeconds: 10
96 |   builtInAdapter:
97 |     serverType: triton
98 |     runtimeManagementPort: 8001
99 |     memBufferBytes: 134217728
100 |     modelLoadingTimeoutMillis: 90000
-------------------------------------------------------------------------------- /distributed_training/Dockerfile: --------------------------------------------------------------------------------
1 | FROM tensorflow/tensorflow:2.12.0
2 | 
3 | # Update the package list
4 | RUN apt-get update
5 | 
6 | # Install necessary packages
7 | RUN apt-get install -y libgl1-mesa-glx
8 | 
9 | # Install Python dependencies
10 | RUN pip install opencv-python-headless==4.5.3.56
11 | RUN pip install mlflow==2.14.1
12 | RUN pip install tqdm
13 | 
14 | # Copy the application code to the container
15 | COPY . /app
16 | 
17 | # Set the working directory
18 | WORKDIR /app
19 | 
20 | # Run mwt.py with the --train argument
21 | CMD ["python", "mwt.py", "--train"]
22 | 
-------------------------------------------------------------------------------- /distributed_training/README.md: --------------------------------------------------------------------------------
1 | # Distributed Training Pipeline
2 | 
3 | ## Overview:
4 | 
5 | This pipeline leverages a combination of Redis for online feature storage, PostgreSQL for offline storage, TensorFlow for distributed training, and MLflow for model tracking and registry. Kubeflow orchestrates the entire process, ensuring a seamless flow from data preparation to model deployment.
6 | 
7 | ## Table of Contents
8 | 
9 | - [Dataset Preparation](#dataset-preparation)
10 | - [Deploying Multi-Worker Training Jobs](#deploying-multi-worker-training-jobs)
11 | - [Monitoring and Investigating Models](#monitoring-and-investigating-models)
12 | - [Running MLflow with Docker Compose](#running-mlflow-with-docker-compose)
13 | - [Important Considerations](#important-considerations)
14 | - [Integrating Jenkins for Continuous Integration](#integrating-jenkins-for-continuous-integration)
15 | - [References](#references)
16 | 
17 | ## Dataset Preparation:
18 | 
19 | Begin by downloading the dataset required for the training job from the following link: [Download Dataset](https://drive.google.com/drive/folders/12ncEAoWT_kwuPT8YRdFysqgS54XJwre7?usp=drive_link). The folder structure should resemble the following:
20 | 
21 | <p align="center">
22 |   <img src="images/training_pipeline.png" alt="Training Job Structure">
23 | </p>
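Concretely, `mwt.py` expects the configured data directory to contain image and label folders with matching basenames, split into `train`/`valid`. A layout sketch — the folder names come from `utils/config.py`, which is not shown here, so treat the bracketed names as placeholders:

```
<data_dir>/
├── <image_dir>/
│   ├── train/        # *.jpg training images
│   └── valid/        # *.jpg validation images
├── <label_dir>/
│   ├── train/        # *.txt YOLO-format labels, one per image
│   └── valid/
└── train.txt         # file list used when generating TF records
```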
24 | 
25 | ## Deploying Multi-Worker Training Jobs
26 | 
27 | To deploy multi-worker training jobs, apply the configuration using Kubernetes:
28 | 
29 | ``` shell
30 | kubectl apply -f deployments/mwt.yaml
31 | ```
32 | 
33 | ## Monitoring and Investigating Models
34 | 
35 | To monitor the training process and inspect the models, update the `persistentVolumeClaim` in the `test/test.yaml` file and apply it:
36 | 
37 | ```shell
38 | kubectl apply -f test/test.yaml
39 | ```
40 | 
41 | This setup creates a pod that shares a volume with the training pods, allowing them to write to and read from a common source. The shared volume gives you easy access to logs and other critical data; a sketch of such a pod follows this section.
42 | 
43 | You can access the pod to check and read logs using the following command:
44 | 
45 | ```shell
46 | kubectl exec -ti nginx bash
47 | ```
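A minimal sketch of such a debug pod — hypothetical, since `test/test.yaml` itself is not shown here; it simply mounts the same `mwt-volume` PVC that the TFJob workers use:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: nginx
  namespace: distributed-training
spec:
  containers:
    - name: nginx
      image: nginx
      volumeMounts:
        - mountPath: /train        # same path the training workers mount
          name: training
  volumes:
    - name: training
      persistentVolumeClaim:
        claimName: mwt-volume      # PVC defined in deployments/mwt.yaml
```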
48 | 
49 | ## Running MLflow with Docker Compose
50 | 
51 | For a proof of concept or a limited-resource environment, you can opt to run the MLflow service using Docker:
52 | 
53 | ```shell
54 | docker compose -f docker-compose.yml up -d --build
55 | ```
56 | 
57 | ## Important Considerations:
58 | 
59 | 👉 If multiple GPUs are not available, consider using an alternative strategy, as illustrated below:
60 | 
61 | ![Strategy Scope](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/strategy.png)
62 | 
63 | 👉 Customize the script to run the training job according to your requirements. If the job fails, you can diagnose the issue by listing the jobs and checking the pod error logs:
64 | 
65 | ```shell
66 | kubectl get tfjob
67 | ```
68 | 
69 | Please check the pod error log and fix any reported issue.
70 | ![Result Train](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/result_train_pod.png)
71 | 
72 | 👉 In your training script, ensure the model definition and dataset loading are encapsulated within the strategy scope:
73 | 
74 | ![Strategy Scope](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/strategy_scope.png)
75 | 
76 | 👉 To monitor the training process, you can exec into the pod or container (if using Docker) to observe the training job in real time:
77 | 
78 | ![Train Process](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/train_process.png)
79 | 
80 | 👉 The trained model versions will be stored and managed in MLflow:
81 | 
82 | ![Result](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/mlflow%20_modelregistry.png)
83 | 
84 | 
85 | ## Integrating Jenkins for Continuous Integration
86 | 
87 | For automated retraining when new data is available, you can integrate Jenkins into your CI/CD pipeline.
88 | 
89 | 1. Install Ngrok:
90 | 
91 | ```shell
92 | curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | \
93 |   sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
94 | echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | \
95 |   sudo tee /etc/apt/sources.list.d/ngrok.list
96 | sudo apt update
97 | sudo apt install ngrok
98 | ```
99 | 
100 | 2. Test Ngrok Installation: Run `ngrok` in the terminal to verify the installation:
101 | 
102 | ![CurlNgrok](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/ngrok.png)
103 | 
104 | 3. Retrieve Jenkins Password: Access Jenkins by retrieving the admin password as shown below:
105 | 
106 | ![JenkinsPassword](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/password_jenkins.png)
107 | 
108 | 4. Configure Jenkins:
109 | 
110 |    - Open `localhost:8081` in a browser to reach Jenkins -> `Manage Jenkins` -> `Plugins`, search for `Docker Pipeline` and `Docker`, and choose `Install without restart`.
111 |    ![JenkinsPlugin](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/instal_docker_jenkins.png)
112 |    - Once installed, the `Docker Pipeline` and `Docker` plugins should show up as successful:
113 |    ![DowloadPlugin](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/install_docker_success.png)
114 | 
115 | 5. Expose Jenkins with Ngrok:
116 | 
117 |    - Run `ngrok http 8081` to expose Jenkins:
118 |    ![NgrokForwardingPort](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/ngrok_forwarding.png)
119 | 
120 | 6. Set Up GitHub Webhook:
121 | 
122 |    - Open your GitHub repository (in this case, Capstone-Model-Serving-pipeline) -> go to the repository `Settings` -> `Webhooks` -> `Add webhook`. Paste the forwarding URL from the step above into `Payload URL` and append `/github-webhook/`. For `Content type`, choose `application/json`. Under "Which events would you like to trigger this webhook?", select `Pushes` and `Pull requests`. Finally, wait for the webhook status to show a green check mark, indicating that it is working correctly.
123 | 
124 |    ![WebhookGithub](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/webhook_github.png)
125 | 
126 |    - Check the connection. If Jenkins is successfully connected to GitHub, the webhook appears with a green check mark in the GitHub UI:
127 | 
128 |    ![Webhookconnect](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/result_connect_jenkins_github.png)
129 | 
130 | 7. Configure Jenkins Multibranch Pipeline:
131 | 
132 |    - Back in Jenkins, choose `Dashboard` -> `New Item`, enter the name of your project, choose `Multibranch Pipeline`, and click `OK`.
133 | 
134 |    - Under `Branch Sources`, click `Add Source` and choose GitHub.
135 | 
136 |    ![UiConnectToRepository](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/add_credential.png)
137 | 
138 |    - For the GitHub credential, choose the project scope you created above and enter the username of the GitHub account that owns the repository. For the password, go back to GitHub -> `Developer settings` -> `Personal access tokens` -> `Tokens (classic)`, generate a new classic token (for a quick demo you can tick all scopes), copy the generated token into the Jenkins `Password` field, and click `Add`.
139 | 
140 |    ![TokenGithub](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/github_tokens.png)
141 | 
142 |    - Point the source at the repository we are working on using the repository's HTTPS URL.
143 |    - Check all the information, `Validate` it, and if everything looks correct, `Save`.
144 | 
145 |    ![Validate](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/validate_connect_repo.png)
146 | 
147 |    - Choose `Credentials`, select the scope of our project, and add a new project credential. In `Username`, type your DockerHub username.
148 |    ![UiDockerhub](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/add_credential_dockerhub.png)
149 | 
150 | 
151 |    - For the password: go to DockerHub (where you store your Docker images), then navigate to `Account Settings` -> `Security` -> Generate new token.
152 |    Copy this token and paste it into the Jenkins credential, using `dockerhub` as the credential ID — this must match `registryCredential` in the Jenkinsfile.
153 | 
154 |    ![TokenDockerhub](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/generate_token_docker_hub.png)
155 | 
156 |    - Choose `Manage Jenkins` -> `System`, go to the GitHub section, set `GitHub API usage rate limiting strategy` to `Never check rate limit (NOT RECOMMENDED)`, and `Save`.
157 |    - Finally, go to the repository in Jenkins -> `Configure` -> `GitHub Credential`, select the GitHub credential you created in the step above, then `Save`.
158 |    - Click `Scan Repository Now` to check that all connections are correct. If they are not, restart Jenkins and try again.
159 | 
160 |    - The result of the build on Jenkins will look like this:
161 |    ![JenkinsBuild](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/ui_build_jenkins.png)
162 | 
163 |    - As you can see, the application version increases with each build:
164 |    ![Version](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/result_push_dockerhub.png)
165 | 
166 | ## References
167 | 
168 | For more information, please take a look at the examples [here](https://github.com/kubeflow/training-operator/tree/master/examples) and [here](https://github.com/kubeflow/examples/tree/master/github_issue_summarization).
169 | 
170 | Some other useful examples:
171 | - https://henning.kropponline.de/2017/03/19/distributing-tensorflow/
172 | - https://www.cs.cornell.edu/courses/cs4787/2019sp/notes/lecture22.pdf
173 | - https://web.eecs.umich.edu/~mosharaf/Readings/Parameter-Server.pdf
174 | - https://s3.us.cloud-object-storage.appdomain.cloud/developer/default/series/os-kubeflow-2020/static/kubeflow06.pdf
175 | - https://xzhu0027.gitbook.io/blog/ml-system/sys-ml-index/parameter-servers
176 | - http://www.juyang.co/distributed-model-training-ii-parameter-server-and-allreduce/
177 | 
-------------------------------------------------------------------------------- /distributed_training/build.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | IMAGE=6666688889/distributed_training:0.0.13
3 | docker build -t $IMAGE .
4 | docker push $IMAGE
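Each TFJob replica runs the image built above. The Kubeflow training-operator injects a `TF_CONFIG` environment variable into every worker pod, which is how `MultiWorkerMirroredStrategy` in `mwt.py` discovers its peers. A sketch of what worker 0 of the two-replica job in `deployments/mwt.yaml` would see — host names and the default port 2222 are illustrative:

```json
{
  "cluster": {
    "worker": [
      "multi-worker-worker-0.distributed-training.svc:2222",
      "multi-worker-worker-1.distributed-training.svc:2222"
    ]
  },
  "task": {"type": "worker", "index": 0}
}
```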
-------------------------------------------------------------------------------- /distributed_training/images/training_pipeline.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/distributed_training/images/training_pipeline.png
-------------------------------------------------------------------------------- /distributed_training/mwt.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import multiprocessing
3 | import os
4 | import sys
5 | 
6 | import cv2
7 | import numpy as np
8 | 
9 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
10 | os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
11 | 
12 | import tensorflow as tf
13 | import tqdm
14 | 
15 | from nets import nn
16 | from utils import config
17 | from utils import image_utils
18 | from utils.dataset import input_fn, DataLoader
19 | import posixpath
20 | 
21 | np.random.seed(12345)
22 | tf.random.set_seed(12345)
23 | 
24 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
25 | 
26 | import mlflow
27 | import mlflow.tensorflow
28 | 
29 | # Set the MLflow tracking URI
30 | mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://mlflow:5000"))
31 | 
32 | 
33 | def train():
34 |     strategy = tf.distribute.MultiWorkerMirroredStrategy()
35 | 
36 |     image_path = posixpath.join(config.data_dir, config.image_dir, "train")
37 |     label_path = posixpath.join(config.data_dir, config.label_dir, "train")
38 | 
39 |     image_files = [
40 |         os.path.splitext(file_name)[0]
41 |         for file_name in os.listdir(image_path)
42 |         if file_name.lower().endswith(".jpg")
43 |     ]
44 |     label_files = [
45 |         os.path.splitext(file_name)[0]
46 |         for file_name in os.listdir(label_path)
47 |         if file_name.lower().endswith(".txt")
48 |     ]
49 | 
50 |     file_names = list(set(image_files) & set(label_files))
51 | 
52 |     steps = len(file_names) // config.batch_size
53 |     if os.path.exists(os.path.join(config.data_dir, "TF")):
54 |         dataset = DataLoader().input_fn(file_names)
55 |     else:
56 |         dataset = input_fn(file_names)
57 |     dataset = strategy.experimental_distribute_dataset(dataset)
58 | 
59 |     with strategy.scope():
60 |         model = nn.build_model()
61 |         model.summary()
62 |         optimizer = tf.keras.optimizers.Adam(nn.CosineLR(steps), 0.937)
63 | 
64 |     with strategy.scope():
65 |         loss_object = nn.ComputeLoss()
66 | 
67 |         def compute_loss(y_true, y_pred):
68 |             total_loss = loss_object(y_pred, y_true)
69 |             return tf.reduce_sum(total_loss) / config.batch_size
70 | 
71 |     with strategy.scope():
72 | 
73 |         def train_step(image, y_true):
74 |             with tf.GradientTape() as tape:
75 |                 y_pred = model(image, training=True)
76 |                 loss = compute_loss(y_true, y_pred)
77 |             variables = model.trainable_variables
78 |             gradients = tape.gradient(loss, variables)
79 |             optimizer.apply_gradients(zip(gradients, variables))
80 |             return loss
81 | 
82 |     with strategy.scope():
83 | 
84 |         @tf.function
85 |         def distributed_train_step(image, y_true):
86 |             per_replica_losses = strategy.run(train_step, args=(image, y_true))
87 |             return strategy.reduce(
88 |                 tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None
89 |             )
90 | 
91 |     def train_fn():
92 |         if not os.path.exists("weights"):
93 |             os.makedirs("weights")
94 |         pb = tf.keras.utils.Progbar(steps, stateful_metrics=["loss"])
95 |         print(f"[INFO] {len(file_names)} data points")
96 | 
97 |         # Start MLflow run
98 |         with mlflow.start_run():
mlflow.log_param("batch_size", config.batch_size) 100 | mlflow.log_param("num_epochs", config.num_epochs) 101 | 102 | for step, inputs in enumerate(dataset): 103 | if step % steps == 0: 104 | print(f"Epoch {step // steps + 1}/{config.num_epochs}") 105 | pb = tf.keras.utils.Progbar(steps, stateful_metrics=["loss"]) 106 | step += 1 107 | image, y_true_1, y_true_2, y_true_3 = inputs 108 | y_true = (y_true_1, y_true_2, y_true_3) 109 | loss = distributed_train_step(image, y_true) 110 | pb.add(1, [("loss", loss.numpy())]) 111 | 112 | # Log loss to MLflow 113 | mlflow.log_metric("loss", loss.numpy(), step=step) 114 | 115 | if step % steps == 0: 116 | model.save_weights( 117 | os.path.join("weights", f"model_{config.version}.h5") 118 | ) 119 | # Log model checkpoint to MLflow 120 | mlflow.log_artifact( 121 | os.path.join("weights", f"model_{config.version}.h5") 122 | ) 123 | if step // steps == config.num_epochs: 124 | mlflow.tensorflow.log_model(model, "model") 125 | sys.exit("--- Stop Training ---") 126 | 127 | train_fn() 128 | 129 | 130 | # Rest of your script remains unchanged 131 | def test(): 132 | def draw_bbox(image, boxes): 133 | for box in boxes: 134 | coordinate = np.array(box[:4], dtype=np.int32) 135 | c1, c2 = (coordinate[0], coordinate[1]), (coordinate[2], coordinate[3]) 136 | cv2.rectangle(image, c1, c2, (255, 0, 0), 1) 137 | return image 138 | 139 | def test_fn(): 140 | if not os.path.exists("results"): 141 | os.makedirs("results") 142 | image_path = posixpath.join(config.data_dir, config.image_dir, "valid") 143 | label_path = posixpath.join(config.data_dir, config.label_dir, "valid") 144 | 145 | image_files = [ 146 | os.path.splitext(file_name)[0] 147 | for file_name in os.listdir(image_path) 148 | if file_name.lower().endswith(".jpg") 149 | ] 150 | label_files = [ 151 | os.path.splitext(file_name)[0] 152 | for file_name in os.listdir(label_path) 153 | if file_name.lower().endswith(".txt") 154 | ] 155 | 156 | file_names = list(set(image_files) & set(label_files)) 157 | 158 | model = nn.build_model(training=False) 159 | model.load_weights(f"weights/model_{config.version}.h5", True) 160 | 161 | for file_name in tqdm.tqdm(file_names): 162 | image = cv2.imread( 163 | posixpath.join( 164 | config.data_dir, config.image_dir, "valid", file_name + ".jpg" 165 | ) 166 | ) 167 | image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 168 | 169 | image_np, scale, dw, dh = image_utils.resize(image_np) 170 | image_np = image_np.astype(np.float32) / 255.0 171 | 172 | boxes, scores, labels = model.predict(image_np[np.newaxis, ...]) 173 | 174 | boxes, scores, labels = ( 175 | np.squeeze(boxes, 0), 176 | np.squeeze(scores, 0), 177 | np.squeeze(labels, 0), 178 | ) 179 | 180 | boxes[:, [0, 2]] = (boxes[:, [0, 2]] - dw) / scale 181 | boxes[:, [1, 3]] = (boxes[:, [1, 3]] - dh) / scale 182 | image = draw_bbox(image, boxes) 183 | cv2.imwrite(f"results/{file_name}.jpg", image) 184 | 185 | test_fn() 186 | 187 | 188 | def write_tf_record(queue, sentinel): 189 | def byte_feature(value): 190 | if not isinstance(value, bytes): 191 | if not isinstance(value, list): 192 | value = value.encode("utf-8") 193 | else: 194 | value = [val.encode("utf-8") for val in value] 195 | if not isinstance(value, list): 196 | value = [value] 197 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) 198 | 199 | while True: 200 | file_name = queue.get() 201 | 202 | if file_name == sentinel: 203 | break 204 | in_image = image_utils.load_image(file_name)[:, :, ::-1] 205 | boxes, label = image_utils.load_label(file_name) 206 
| 207 | in_image, boxes = image_utils.resize(in_image, boxes) 208 | 209 | y_true_1, y_true_2, y_true_3 = image_utils.process_box(boxes, label) 210 | 211 | in_image = in_image.astype("float32") 212 | y_true_1 = y_true_1.astype("float32") 213 | y_true_2 = y_true_2.astype("float32") 214 | y_true_3 = y_true_3.astype("float32") 215 | 216 | in_image = in_image.tobytes() 217 | y_true_1 = y_true_1.tobytes() 218 | y_true_2 = y_true_2.tobytes() 219 | y_true_3 = y_true_3.tobytes() 220 | 221 | features = tf.train.Features( 222 | feature={ 223 | "in_image": byte_feature(in_image), 224 | "y_true_1": byte_feature(y_true_1), 225 | "y_true_2": byte_feature(y_true_2), 226 | "y_true_3": byte_feature(y_true_3), 227 | } 228 | ) 229 | tf_example = tf.train.Example(features=features) 230 | opt = tf.io.TFRecordOptions("GZIP") 231 | with tf.io.TFRecordWriter( 232 | os.path.join(config.data_dir, "TF", file_name + ".tf"), opt 233 | ) as writer: 234 | writer.write(tf_example.SerializeToString()) 235 | 236 | 237 | def generate_tf_record(): 238 | if not os.path.exists(os.path.join(config.data_dir, "TF")): 239 | os.makedirs(os.path.join(config.data_dir, "TF")) 240 | file_names = [] 241 | with open(os.path.join(config.data_dir, "train.txt")) as reader: 242 | for line in reader.readlines(): 243 | file_names.append(line.rstrip().split(" ")[0]) 244 | sentinel = ("", []) 245 | queue = multiprocessing.Manager().Queue() 246 | for file_name in tqdm.tqdm(file_names): 247 | queue.put(file_name) 248 | for _ in range(os.cpu_count()): 249 | queue.put(sentinel) 250 | print("[INFO] generating TF record") 251 | process_pool = [] 252 | for i in range(os.cpu_count()): 253 | process = multiprocessing.Process( 254 | target=write_tf_record, args=(queue, sentinel) 255 | ) 256 | process_pool.append(process) 257 | process.start() 258 | for process in process_pool: 259 | process.join() 260 | 261 | 262 | class AnchorGenerator: 263 | def __init__(self, num_cluster): 264 | self.num_cluster = num_cluster 265 | 266 | def iou(self, boxes, clusters): # 1 box -> k clusters 267 | n = boxes.shape[0] 268 | k = self.num_cluster 269 | 270 | box_area = boxes[:, 0] * boxes[:, 1] 271 | box_area = box_area.repeat(k) 272 | box_area = np.reshape(box_area, (n, k)) 273 | 274 | cluster_area = clusters[:, 0] * clusters[:, 1] 275 | cluster_area = np.tile(cluster_area, [1, n]) 276 | cluster_area = np.reshape(cluster_area, (n, k)) 277 | 278 | box_w_matrix = np.reshape(boxes[:, 0].repeat(k), (n, k)) 279 | cluster_w_matrix = np.reshape(np.tile(clusters[:, 0], (1, n)), (n, k)) 280 | min_w_matrix = np.minimum(cluster_w_matrix, box_w_matrix) 281 | 282 | box_h_matrix = np.reshape(boxes[:, 1].repeat(k), (n, k)) 283 | cluster_h_matrix = np.reshape(np.tile(clusters[:, 1], (1, n)), (n, k)) 284 | min_h_matrix = np.minimum(cluster_h_matrix, box_h_matrix) 285 | inter_area = np.multiply(min_w_matrix, min_h_matrix) 286 | 287 | return inter_area / (box_area + cluster_area - inter_area) 288 | 289 | def avg_iou(self, boxes, clusters): 290 | accuracy = np.mean([np.max(self.iou(boxes, clusters), axis=1)]) 291 | return accuracy 292 | 293 | def generator(self, boxes, k, dist=np.median): 294 | box_number = boxes.shape[0] 295 | last_nearest = np.zeros((box_number,)) 296 | clusters = boxes[ 297 | np.random.choice(box_number, k, replace=False) 298 | ] # init k clusters 299 | while True: 300 | distances = 1 - self.iou(boxes, clusters) 301 | 302 | current_nearest = np.argmin(distances, axis=1) 303 | if (last_nearest == current_nearest).all(): 304 | break # clusters won't change 305 | for cluster 
in range(k): 306 | clusters[cluster] = dist(boxes[current_nearest == cluster], axis=0) 307 | last_nearest = current_nearest 308 | 309 | return clusters 310 | 311 | def generate_anchor(self): 312 | boxes = self.get_boxes() 313 | result = self.generator(boxes, k=self.num_cluster) 314 | result = result[np.lexsort(result.T[0, None])] 315 | print("\nAnchors: \n{}".format(result)) 316 | print("\nFitness: {:.4f}".format(self.avg_iou(boxes, result))) 317 | 318 | @staticmethod 319 | def get_boxes(): 320 | boxes = [] 321 | file_names = [ 322 | file_name[:-4] 323 | for file_name in os.listdir( 324 | posixpath.join(config.data_dir, config.label_dir) 325 | ) 326 | ] 327 | for file_name in file_names: 328 | for box in image_utils.load_label(file_name)[0]: 329 | boxes.append([box[2] - box[0], box[3] - box[1]]) 330 | return np.array(boxes) 331 | 332 | 333 | if __name__ == "__main__": 334 | parser = argparse.ArgumentParser() 335 | parser.add_argument("--anchor", action="store_true") 336 | parser.add_argument("--record", action="store_true") 337 | parser.add_argument("--train", action="store_true") 338 | parser.add_argument("--test", action="store_true") 339 | 340 | args = parser.parse_args() 341 | if args.anchor: 342 | AnchorGenerator(9).generate_anchor() 343 | if args.record: 344 | generate_tf_record() 345 | if args.train: 346 | train() 347 | if args.test: 348 | test() 349 | -------------------------------------------------------------------------------- /distributed_training/nets/nn.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import tensorflow as tf 4 | from tensorflow.keras import backend 5 | from tensorflow.keras import layers 6 | 7 | from utils import config 8 | 9 | initializer = tf.random_normal_initializer(stddev=0.01) 10 | l2 = tf.keras.regularizers.l2(4e-5) 11 | 12 | 13 | def conv(x, filters, k=1, s=1): 14 | if s == 2: 15 | x = layers.ZeroPadding2D(((1, 0), (1, 0)))(x) 16 | padding = "valid" 17 | else: 18 | padding = "same" 19 | x = layers.Conv2D( 20 | filters, 21 | k, 22 | s, 23 | padding, 24 | use_bias=False, 25 | kernel_initializer=initializer, 26 | kernel_regularizer=l2, 27 | )(x) 28 | x = layers.BatchNormalization(momentum=0.03)(x) 29 | x = layers.Activation(tf.nn.swish)(x) 30 | return x 31 | 32 | 33 | def residual(x, filters, add=True): 34 | inputs = x 35 | if add: 36 | x = conv(x, filters, 1) 37 | x = conv(x, filters, 3) 38 | x = inputs + x 39 | else: 40 | x = conv(x, filters, 1) 41 | x = conv(x, filters, 3) 42 | return x 43 | 44 | 45 | def csp(x, filters, n, add=True): 46 | y = conv(x, filters // 2) 47 | for _ in range(n): 48 | y = residual(y, filters // 2, add) 49 | 50 | x = conv(x, filters // 2) 51 | x = layers.concatenate([x, y]) 52 | 53 | x = conv(x, filters) 54 | return x 55 | 56 | 57 | def build_model(training=True): 58 | depth = config.depth[config.versions.index(config.version)] 59 | width = config.width[config.versions.index(config.version)] 60 | 61 | inputs = layers.Input([config.image_size, config.image_size, 3]) 62 | x = tf.nn.space_to_depth(inputs, 2) 63 | x = conv(x, int(round(width * 64)), 3) 64 | x = conv(x, int(round(width * 128)), 3, 2) 65 | x = csp(x, int(round(width * 128)), int(round(depth * 3))) 66 | 67 | x = conv(x, int(round(width * 256)), 3, 2) 68 | x = csp(x, int(round(width * 256)), int(round(depth * 9))) 69 | x1 = x 70 | 71 | x = conv(x, int(round(width * 512)), 3, 2) 72 | x = csp(x, int(round(width * 512)), int(round(depth * 9))) 73 | x2 = x 74 | 75 | x = conv(x, int(round(width * 1024)), 3, 2) 76 
| x = conv(x, int(round(width * 512)), 1, 1) 77 | x = layers.concatenate( 78 | [ 79 | x, 80 | tf.nn.max_pool(x, 5, 1, "SAME"), 81 | tf.nn.max_pool(x, 9, 1, "SAME"), 82 | tf.nn.max_pool(x, 13, 1, "SAME"), 83 | ] 84 | ) 85 | x = conv(x, int(round(width * 1024)), 1, 1) 86 | x = csp(x, int(round(width * 1024)), int(round(depth * 3)), False) 87 | 88 | x = conv(x, int(round(width * 512)), 1) 89 | x3 = x 90 | x = layers.UpSampling2D()(x) 91 | x = layers.concatenate([x, x2]) 92 | x = csp(x, int(round(width * 512)), int(round(depth * 3)), False) 93 | 94 | x = conv(x, int(round(width * 256)), 1) 95 | x4 = x 96 | x = layers.UpSampling2D()(x) 97 | x = layers.concatenate([x, x1]) 98 | x = csp(x, int(round(width * 256)), int(round(depth * 3)), False) 99 | p3 = layers.Conv2D( 100 | 3 * (len(config.class_dict) + 5), 101 | 1, 102 | name=f"p3_{len(config.class_dict)}", 103 | kernel_initializer=initializer, 104 | kernel_regularizer=l2, 105 | )(x) 106 | 107 | x = conv(x, int(round(width * 256)), 3, 2) 108 | x = layers.concatenate([x, x4]) 109 | x = csp(x, int(round(width * 512)), int(round(depth * 3)), False) 110 | p4 = layers.Conv2D( 111 | 3 * (len(config.class_dict) + 5), 112 | 1, 113 | name=f"p4_{len(config.class_dict)}", 114 | kernel_initializer=initializer, 115 | kernel_regularizer=l2, 116 | )(x) 117 | 118 | x = conv(x, int(round(width * 512)), 3, 2) 119 | x = layers.concatenate([x, x3]) 120 | x = csp(x, int(round(width * 1024)), int(round(depth * 3)), False) 121 | p5 = layers.Conv2D( 122 | 3 * (len(config.class_dict) + 5), 123 | 1, 124 | name=f"p5_{len(config.class_dict)}", 125 | kernel_initializer=initializer, 126 | kernel_regularizer=l2, 127 | )(x) 128 | 129 | if training: 130 | return tf.keras.Model(inputs, [p5, p4, p3]) 131 | else: 132 | return tf.keras.Model(inputs, Predict()([p5, p4, p3])) 133 | 134 | 135 | def process_layer(feature_map, anchors): 136 | grid_size = tf.shape(feature_map)[1:3] 137 | ratio = tf.cast( 138 | tf.constant([config.image_size, config.image_size]) / grid_size, tf.float32 139 | ) 140 | rescaled_anchors = [ 141 | (anchor[0] / ratio[1], anchor[1] / ratio[0]) for anchor in anchors 142 | ] 143 | 144 | feature_map = tf.reshape( 145 | feature_map, [-1, grid_size[0], grid_size[1], 3, 5 + len(config.class_dict)] 146 | ) 147 | 148 | box_centers, box_sizes, conf, prob = tf.split( 149 | feature_map, [2, 2, 1, len(config.class_dict)], axis=-1 150 | ) 151 | box_centers = tf.nn.sigmoid(box_centers) 152 | 153 | grid_x = tf.range(grid_size[1], dtype=tf.int32) 154 | grid_y = tf.range(grid_size[0], dtype=tf.int32) 155 | grid_x, grid_y = tf.meshgrid(grid_x, grid_y) 156 | x_offset = tf.reshape(grid_x, (-1, 1)) 157 | y_offset = tf.reshape(grid_y, (-1, 1)) 158 | x_y_offset = tf.concat([x_offset, y_offset], axis=-1) 159 | x_y_offset = tf.cast( 160 | tf.reshape(x_y_offset, [grid_size[0], grid_size[1], 1, 2]), tf.float32 161 | ) 162 | 163 | box_centers = box_centers + x_y_offset 164 | box_centers = box_centers * ratio[::-1] 165 | 166 | box_sizes = tf.exp(box_sizes) * rescaled_anchors 167 | box_sizes = box_sizes * ratio[::-1] 168 | 169 | boxes = tf.concat([box_centers, box_sizes], axis=-1) 170 | 171 | return x_y_offset, boxes, conf, prob 172 | 173 | 174 | def box_iou(pred_boxes, valid_true_boxes): 175 | pred_box_xy = pred_boxes[..., 0:2] 176 | pred_box_wh = pred_boxes[..., 2:4] 177 | 178 | pred_box_xy = tf.expand_dims(pred_box_xy, -2) 179 | pred_box_wh = tf.expand_dims(pred_box_wh, -2) 180 | 181 | true_box_xy = valid_true_boxes[:, 0:2] 182 | true_box_wh = valid_true_boxes[:, 2:4] 183 | 184 | 
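# Broadcasting note for the IoU computed below: pred_box_xy/pred_box_wh were
# expanded to [..., 1, 2] above while the valid true boxes are [V, 2], so the
# elementwise max/min yield a pairwise [..., V] table of intersections in one shot.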
intersect_min = tf.maximum( 185 | pred_box_xy - pred_box_wh / 2.0, true_box_xy - true_box_wh / 2.0 186 | ) 187 | intersect_max = tf.minimum( 188 | pred_box_xy + pred_box_wh / 2.0, true_box_xy + true_box_wh / 2.0 189 | ) 190 | 191 | intersect_wh = tf.maximum(intersect_max - intersect_min, 0.0) 192 | 193 | intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] 194 | pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1] 195 | true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1] 196 | true_box_area = tf.expand_dims(true_box_area, axis=0) 197 | 198 | return intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10) 199 | 200 | 201 | def compute_nms(args): 202 | boxes, classification = args 203 | 204 | def nms_fn(score, label): 205 | score_indices = tf.where(backend.greater(score, config.threshold)) 206 | 207 | filtered_boxes = tf.gather_nd(boxes, score_indices) 208 | filtered_scores = backend.gather(score, score_indices)[:, 0] 209 | 210 | nms_indices = tf.image.non_max_suppression( 211 | filtered_boxes, filtered_scores, config.max_boxes, 0.1 212 | ) 213 | score_indices = backend.gather(score_indices, nms_indices) 214 | 215 | label = tf.gather_nd(label, score_indices) 216 | score_indices = backend.stack([score_indices[:, 0], label], axis=1) 217 | 218 | return score_indices 219 | 220 | all_indices = [] 221 | for c in range(int(classification.shape[1])): 222 | scores = classification[:, c] 223 | labels = c * tf.ones((backend.shape(scores)[0],), dtype="int64") 224 | all_indices.append(nms_fn(scores, labels)) 225 | indices = backend.concatenate(all_indices, axis=0) 226 | 227 | scores = tf.gather_nd(classification, indices) 228 | labels = indices[:, 1] 229 | scores, top_indices = tf.nn.top_k( 230 | scores, k=backend.minimum(config.max_boxes, backend.shape(scores)[0]) 231 | ) 232 | 233 | indices = backend.gather(indices[:, 0], top_indices) 234 | boxes = backend.gather(boxes, indices) 235 | labels = backend.gather(labels, top_indices) 236 | 237 | pad_size = backend.maximum(0, config.max_boxes - backend.shape(scores)[0]) 238 | 239 | boxes = tf.pad(boxes, [[0, pad_size], [0, 0]], constant_values=-1) 240 | scores = tf.pad(scores, [[0, pad_size]], constant_values=-1) 241 | labels = tf.pad(labels, [[0, pad_size]], constant_values=-1) 242 | labels = backend.cast(labels, "int32") 243 | 244 | boxes.set_shape([config.max_boxes, 4]) 245 | scores.set_shape([config.max_boxes]) 246 | labels.set_shape([config.max_boxes]) 247 | 248 | return [boxes, scores, labels] 249 | 250 | 251 | class ComputeLoss(object): 252 | def __init__(self): 253 | super().__init__() 254 | 255 | @staticmethod 256 | def compute_loss(y_pred, y_true, anchors): 257 | grid_size = tf.shape(y_pred)[1:3] 258 | ratio = tf.cast( 259 | tf.constant([config.image_size, config.image_size]) / grid_size, tf.float32 260 | ) 261 | batch_size = tf.cast(tf.shape(y_pred)[0], tf.float32) 262 | 263 | x_y_offset, pred_boxes, pred_conf, pred_prob = process_layer(y_pred, anchors) 264 | 265 | object_mask = y_true[..., 4:5] 266 | 267 | def cond(idx, _): 268 | return tf.less(idx, tf.cast(batch_size, tf.int32)) 269 | 270 | def body(idx, mask): 271 | valid_true_boxes = tf.boolean_mask( 272 | y_true[idx, ..., 0:4], tf.cast(object_mask[idx, ..., 0], "bool") 273 | ) 274 | iou = box_iou(pred_boxes[idx], valid_true_boxes) 275 | return idx + 1, mask.write( 276 | idx, tf.cast(tf.reduce_max(iou, axis=-1) < 0.2, tf.float32) 277 | ) 278 | 279 | ignore_mask = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True) 280 | 281 | _, ignore_mask = 
tf.while_loop(cond=cond, body=body, loop_vars=[0, ignore_mask]) 282 | ignore_mask = ignore_mask.stack() 283 | ignore_mask = tf.expand_dims(ignore_mask, -1) 284 | 285 | true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset 286 | pred_xy = pred_boxes[..., 0:2] / ratio[::-1] - x_y_offset 287 | 288 | true_tw_th = y_true[..., 2:4] / anchors 289 | pred_tw_th = pred_boxes[..., 2:4] / anchors 290 | true_tw_th = tf.where( 291 | tf.equal(true_tw_th, 0), tf.ones_like(true_tw_th), true_tw_th 292 | ) 293 | pred_tw_th = tf.where( 294 | tf.equal(pred_tw_th, 0), tf.ones_like(pred_tw_th), pred_tw_th 295 | ) 296 | true_tw_th = tf.math.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9)) 297 | pred_tw_th = tf.math.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9)) 298 | 299 | box_loss_scale = y_true[..., 2:3] * y_true[..., 3:4] 300 | box_loss_scale = 2.0 - box_loss_scale / tf.cast( 301 | config.image_size**2, tf.float32 302 | ) 303 | 304 | xy_loss = tf.reduce_sum( 305 | tf.square(true_xy - pred_xy) * object_mask * box_loss_scale 306 | ) 307 | wh_loss = tf.reduce_sum( 308 | tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale 309 | ) 310 | 311 | conf_pos_mask = object_mask 312 | conf_neg_mask = (1 - object_mask) * ignore_mask 313 | conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits( 314 | labels=object_mask, logits=pred_conf 315 | ) 316 | conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits( 317 | labels=object_mask, logits=pred_conf 318 | ) 319 | 320 | conf_loss = tf.reduce_sum((conf_loss_pos + conf_loss_neg)) 321 | 322 | true_conf = y_true[..., 5:] 323 | 324 | class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits( 325 | true_conf, pred_prob 326 | ) 327 | class_loss = tf.reduce_sum(class_loss) 328 | 329 | return xy_loss + wh_loss + conf_loss + class_loss 330 | 331 | def __call__(self, y_pred, y_true): 332 | loss = 0.0 333 | anchor_group = [config.anchors[6:9], config.anchors[3:6], config.anchors[0:3]] 334 | 335 | for i in range(len(y_pred)): 336 | loss += self.compute_loss(y_pred[i], y_true[i], anchor_group[i]) 337 | return loss 338 | 339 | 340 | class CosineLR(tf.optimizers.schedules.LearningRateSchedule): 341 | def __init__(self, steps): 342 | super().__init__() 343 | self.lr = 0.008 * config.batch_size / 64 344 | self.warmup_init = 0.0008 345 | self.warmup_step = steps 346 | self.decay_steps = tf.cast( 347 | (config.num_epochs - 1) * self.warmup_step, tf.float32 348 | ) 349 | 350 | def __call__(self, step): 351 | linear_warmup = ( 352 | tf.cast(step, dtype=tf.float32) 353 | / self.warmup_step 354 | * (self.lr - self.warmup_init) 355 | ) 356 | cosine_lr = ( 357 | 0.5 358 | * self.lr 359 | * (1 + tf.cos(math.pi * tf.cast(step, tf.float32) / self.decay_steps)) 360 | ) 361 | return tf.where( 362 | step < self.warmup_step, self.warmup_init + linear_warmup, cosine_lr 363 | ) 364 | 365 | def get_config(self): 366 | pass 367 | 368 | 369 | class Predict(layers.Layer): 370 | def __init__(self): 371 | super().__init__() 372 | 373 | def call(self, inputs, **kwargs): 374 | y_pred = [ 375 | (inputs[0], config.anchors[6:9]), 376 | (inputs[1], config.anchors[3:6]), 377 | (inputs[2], config.anchors[0:3]), 378 | ] 379 | 380 | boxes_list, conf_list, prob_list = [], [], [] 381 | for result in [ 382 | process_layer(feature_map, anchors) for (feature_map, anchors) in y_pred 383 | ]: 384 | x_y_offset, box, conf, prob = result 385 | grid_size = tf.shape(x_y_offset)[:2] 386 | box = tf.reshape(box, [-1, grid_size[0] * grid_size[1] * 3, 4]) 387 | conf = tf.reshape(conf, [-1, 
grid_size[0] * grid_size[1] * 3, 1]) 388 | prob = tf.reshape( 389 | prob, [-1, grid_size[0] * grid_size[1] * 3, len(config.class_dict)] 390 | ) 391 | boxes_list.append(box) 392 | conf_list.append(tf.sigmoid(conf)) 393 | prob_list.append(tf.sigmoid(prob)) 394 | 395 | boxes = tf.concat(boxes_list, axis=1) 396 | conf = tf.concat(conf_list, axis=1) 397 | prob = tf.concat(prob_list, axis=1) 398 | 399 | center_x, center_y, w, h = tf.split(boxes, [1, 1, 1, 1], axis=-1) 400 | x_min = center_x - w / 2 401 | y_min = center_y - h / 2 402 | x_max = center_x + w / 2 403 | y_max = center_y + h / 2 404 | 405 | boxes = tf.concat([x_min, y_min, x_max, y_max], axis=-1) 406 | 407 | outputs = tf.map_fn( 408 | fn=compute_nms, 409 | elems=[boxes, conf * prob], 410 | dtype=["float32", "float32", "int32"], 411 | parallel_iterations=100, 412 | ) 413 | 414 | return outputs 415 | 416 | def compute_output_shape(self, input_shape): 417 | return [ 418 | (input_shape[0][0], config.max_boxes, 4), 419 | (input_shape[1][0], config.max_boxes), 420 | (input_shape[1][0], config.max_boxes), 421 | ] 422 | 423 | def compute_mask(self, inputs, mask=None): 424 | return (len(inputs) + 1) * [None] 425 | 426 | def get_config(self): 427 | return super().get_config() 428 | -------------------------------------------------------------------------------- /distributed_training/test/test.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: nginx 5 | spec: 6 | containers: 7 | - name: nginx 8 | image: nginx 9 | volumeMounts: 10 | - mountPath: /train 11 | name: training 12 | volumes: 13 | - name: training 14 | persistentVolumeClaim: 15 | claimName: mwt-volume 16 | -------------------------------------------------------------------------------- /distributed_training/utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy 3 | import posixpath 4 | 5 | width = [0.50, 0.75, 1.0, 1.25] 6 | depth = [0.33, 0.67, 1.0, 1.33] 7 | 8 | versions = ["s", "m", "l", "x"] 9 | # data_dir = os.path.join('..', 'Dataset') 10 | data_dir = posixpath.join(".", "Dataset") 11 | 12 | threshold = 0.3 13 | max_boxes = 150 14 | image_dir = "images" 15 | label_dir = "labels" 16 | 17 | num_epochs = 2 18 | batch_size = 32 19 | image_size = 640 20 | class_dict = { 21 | "person": 0, 22 | "bicycle": 1, 23 | "car": 2, 24 | "motorcycle": 3, 25 | "airplane": 4, 26 | "bus": 5, 27 | "train": 6, 28 | "truck": 7, 29 | "boat": 8, 30 | "traffic light": 9, 31 | "fire hydrant": 10, 32 | "stop sign": 11, 33 | "parking meter": 12, 34 | "bench": 13, 35 | "bird": 14, 36 | "cat": 15, 37 | "dog": 16, 38 | "horse": 17, 39 | "sheep": 18, 40 | "cow": 19, 41 | "elephant": 20, 42 | "bear": 21, 43 | "zebra": 22, 44 | "giraffe": 23, 45 | "backpack": 24, 46 | "umbrella": 25, 47 | "handbag": 26, 48 | "tie": 27, 49 | "suitcase": 28, 50 | "frisbee": 29, 51 | "skis": 30, 52 | "snowboard": 31, 53 | "sports ball": 32, 54 | "kite": 33, 55 | "baseball bat": 34, 56 | "baseball glove": 35, 57 | "skateboard": 36, 58 | "surfboard": 37, 59 | "tennis racket": 38, 60 | "bottle": 39, 61 | "wine glass": 40, 62 | "cup": 41, 63 | "fork": 42, 64 | "knife": 43, 65 | "spoon": 44, 66 | "bowl": 45, 67 | "banana": 46, 68 | "apple": 47, 69 | "sandwich": 48, 70 | "orange": 49, 71 | "broccoli": 50, 72 | "carrot": 51, 73 | "hot dog": 52, 74 | "pizza": 53, 75 | "donut": 54, 76 | "cake": 55, 77 | "chair": 56, 78 | "couch": 57, 79 | "potted plant": 58, 80 | "bed": 59, 81 | 
"dining table": 60, 82 | "toilet": 61, 83 | "tv": 62, 84 | "laptop": 63, 85 | "mouse": 64, 86 | "remote": 65, 87 | "keyboard": 66, 88 | "cell phone": 67, 89 | "microwave": 68, 90 | "oven": 69, 91 | "toaster": 70, 92 | "sink": 71, 93 | "refrigerator": 72, 94 | "book": 73, 95 | "clock": 74, 96 | "vase": 75, 97 | "scissors": 76, 98 | "teddy bear": 77, 99 | "hair drier": 78, 100 | "toothbrush": 79, 101 | } 102 | 103 | version = "s" 104 | anchors = numpy.array( 105 | [ 106 | [8.0, 9.0], 107 | [16.0, 24.0], 108 | [28.0, 58.0], 109 | [41.0, 25.0], 110 | [58.0, 125.0], 111 | [71.0, 52.0], 112 | [129.0, 97.0], 113 | [163.0, 218.0], 114 | [384.0, 347.0], 115 | ], 116 | numpy.float32, 117 | ) 118 | -------------------------------------------------------------------------------- /distributed_training/utils/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy 3 | import tensorflow as tf 4 | from tensorflow.keras import utils 5 | import posixpath 6 | from distributed_trainning.utils import image_utils 7 | from utils import config 8 | 9 | 10 | class Generator(utils.Sequence): 11 | def __init__(self, file_names): 12 | self.file_names = file_names 13 | 14 | def __len__(self): 15 | return int(numpy.floor(len(self.file_names) / config.batch_size)) 16 | 17 | def __getitem__(self, index): 18 | image = image_utils.load_image(self.file_names[index]) 19 | boxes = image_utils.load_label(self.file_names[index]) 20 | image, boxes = image_utils.resize(image, boxes) 21 | # image, boxes = util.random_flip(image, boxes) 22 | 23 | image = image[:, :, ::-1].astype(numpy.float32) 24 | image = image / 255.0 25 | y_true_1, y_true_2, y_true_3 = image_utils.process_box(boxes) 26 | return image, y_true_1, y_true_2, y_true_3 27 | 28 | def on_epoch_end(self): 29 | numpy.random.shuffle(self.file_names) 30 | 31 | 32 | def input_fn(file_names): 33 | def generator_fn(): 34 | generator = utils.OrderedEnqueuer(Generator(file_names), True) 35 | generator.start(workers=min(os.cpu_count(), config.batch_size)) 36 | while True: 37 | image, y_true_1, y_true_2, y_true_3 = generator.get().__next__() 38 | yield image, y_true_1, y_true_2, y_true_3 39 | 40 | output_types = (tf.float32, tf.float32, tf.float32, tf.float32) 41 | output_shapes = ( 42 | (config.image_size, config.image_size, 3), 43 | ( 44 | config.image_size // 32, 45 | config.image_size // 32, 46 | 3, 47 | len(config.class_dict) + 5, 48 | ), 49 | ( 50 | config.image_size // 16, 51 | config.image_size // 16, 52 | 3, 53 | len(config.class_dict) + 5, 54 | ), 55 | (config.image_size // 8, config.image_size // 8, 3, len(config.class_dict) + 5), 56 | ) 57 | 58 | dataset = tf.data.Dataset.from_generator( 59 | generator=generator_fn, output_types=output_types, output_shapes=output_shapes 60 | ) 61 | 62 | dataset = dataset.repeat(config.num_epochs + 1) 63 | dataset = dataset.batch(config.batch_size) 64 | dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) 65 | return dataset 66 | 67 | 68 | class DataLoader: 69 | def __init__(self): 70 | super().__init__() 71 | self.description = { 72 | "in_image": tf.io.FixedLenFeature([], tf.string), 73 | "y_true_1": tf.io.FixedLenFeature([], tf.string), 74 | "y_true_2": tf.io.FixedLenFeature([], tf.string), 75 | "y_true_3": tf.io.FixedLenFeature([], tf.string), 76 | } 77 | 78 | def parse_data(self, tf_record): 79 | features = tf.io.parse_single_example(tf_record, self.description) 80 | 81 | in_image = tf.io.decode_raw(features["in_image"], tf.float32) 82 | in_image = 
 67 | 
 68 | class DataLoader:
 69 |     def __init__(self):
 70 |         super().__init__()
 71 |         self.description = {
 72 |             "in_image": tf.io.FixedLenFeature([], tf.string),
 73 |             "y_true_1": tf.io.FixedLenFeature([], tf.string),
 74 |             "y_true_2": tf.io.FixedLenFeature([], tf.string),
 75 |             "y_true_3": tf.io.FixedLenFeature([], tf.string),
 76 |         }
 77 | 
 78 |     def parse_data(self, tf_record):
 79 |         features = tf.io.parse_single_example(tf_record, self.description)
 80 | 
 81 |         in_image = tf.io.decode_raw(features["in_image"], tf.float32)
 82 |         in_image = tf.reshape(in_image, (config.image_size, config.image_size, 3))
 83 |         in_image = in_image / 255.0
 84 | 
 85 |         y_true_1 = tf.io.decode_raw(features["y_true_1"], tf.float32)
 86 |         y_true_1 = tf.reshape(
 87 |             y_true_1,
 88 |             (
 89 |                 config.image_size // 32,
 90 |                 config.image_size // 32,
 91 |                 3,
 92 |                 5 + len(config.class_dict),
 93 |             ),
 94 |         )
 95 | 
 96 |         y_true_2 = tf.io.decode_raw(features["y_true_2"], tf.float32)
 97 |         y_true_2 = tf.reshape(
 98 |             y_true_2,
 99 |             (
 100 |                 config.image_size // 16,
 101 |                 config.image_size // 16,
 102 |                 3,
 103 |                 5 + len(config.class_dict),
 104 |             ),
 105 |         )
 106 | 
 107 |         y_true_3 = tf.io.decode_raw(features["y_true_3"], tf.float32)
 108 |         y_true_3 = tf.reshape(
 109 |             y_true_3,
 110 |             (
 111 |                 config.image_size // 8,
 112 |                 config.image_size // 8,
 113 |                 3,
 114 |                 5 + len(config.class_dict),
 115 |             ),
 116 |         )
 117 | 
 118 |         return in_image, y_true_1, y_true_2, y_true_3
 119 | 
 120 |     def input_fn(self, file_names):
 121 |         dataset = tf.data.TFRecordDataset(file_names, "GZIP")
 122 |         dataset = dataset.map(self.parse_data, os.cpu_count())
 123 |         dataset = dataset.repeat(config.num_epochs + 1)
 124 |         dataset = dataset.batch(config.batch_size)
 125 |         dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
 126 |         return dataset
 127 | 
-------------------------------------------------------------------------------- /distributed_training/utils/image_utils.py: --------------------------------------------------------------------------------
 1 | import os
 2 | import xml.etree.ElementTree
 3 | import posixpath
 4 | import cv2
 5 | import numpy
 6 | 
 7 | from utils import config
 8 | 
 9 | 
 10 | def load_image(file_name, split="train"):
 11 |     path = posixpath.join(config.data_dir, config.image_dir, split, file_name + ".jpg")
 12 |     image = cv2.imread(path)
 13 |     return image
 14 | 
 15 | 
 16 | def load_label(file_name, split="train"):
 17 |     # Construct the path for the label file
 18 |     path = posixpath.join(config.data_dir, config.label_dir, split, file_name + ".txt")
 19 | 
 20 |     boxes = []
 21 | 
 22 |     # Read the text file line by line
 23 |     with open(path, "r") as f:
 24 |         for line in f:
 25 |             # Split the line into coordinates
 26 |             _, x_min, y_min, x_max, y_max = line.strip().split()
 27 |             x_min = float(x_min)
 28 |             y_min = float(y_min)
 29 |             x_max = float(x_max)
 30 |             y_max = float(y_max)
 31 | 
 32 |             boxes.append([x_min, y_min, x_max, y_max])
 33 | 
 34 |     boxes = numpy.asarray(boxes, numpy.float32)
 35 |     return boxes
 36 | 
 37 | 
 38 | def resize(image, boxes=None):
 39 |     shape = image.shape[:2]
 40 |     scale = min(config.image_size / shape[1], config.image_size / shape[0])
 41 |     image = cv2.resize(image, (int(scale * shape[1]), int(scale * shape[0])))
 42 | 
 43 |     image_padded = numpy.zeros([config.image_size, config.image_size, 3], numpy.uint8)
 44 | 
 45 |     dw = (config.image_size - int(scale * shape[1])) // 2
 46 |     dh = (config.image_size - int(scale * shape[0])) // 2
 47 | 
 48 |     image_padded[
 49 |         dh : int(scale * shape[0]) + dh, dw : int(scale * shape[1]) + dw, :
 50 |     ] = image.copy()
 51 | 
 52 |     if boxes is None:
 53 |         return image_padded, scale, dw, dh
 54 | 
 55 |     else:
 56 |         boxes[:, [0, 2]] = boxes[:, [0, 2]] * scale + dw
 57 |         boxes[:, [1, 3]] = boxes[:, [1, 3]] * scale + dh
 58 | 
 59 |         return image_padded, boxes
 60 | 
 61 | 
 62 | def random_flip(image, boxes):
 63 |     if numpy.random.uniform() < 0.5:
 64 |         image = cv2.flip(image, 1)
 65 |         # fancy indexing swaps x_min/x_max in one step (sequential assignment would overwrite x_min first)
 66 |         boxes[:, [0, 2]] = image.shape[1] - boxes[:, [2, 0]]
 67 |     return image, boxes
 68 | 
 69 | 
 70 | def process_box(boxes):
 71 |     anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
 72 |     anchors = 
config.anchors 73 | box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2 74 | box_size = boxes[:, 2:4] - boxes[:, 0:2] 75 | 76 | y_true_1 = numpy.zeros( 77 | ( 78 | config.image_size // 32, 79 | config.image_size // 32, 80 | 3, 81 | 5 + len(config.class_dict), 82 | ), 83 | numpy.float32, 84 | ) 85 | y_true_2 = numpy.zeros( 86 | ( 87 | config.image_size // 16, 88 | config.image_size // 16, 89 | 3, 90 | 5 + len(config.class_dict), 91 | ), 92 | numpy.float32, 93 | ) 94 | y_true_3 = numpy.zeros( 95 | (config.image_size // 8, config.image_size // 8, 3, 5 + len(config.class_dict)), 96 | numpy.float32, 97 | ) 98 | 99 | y_true = [y_true_1, y_true_2, y_true_3] 100 | 101 | box_size = numpy.expand_dims(box_size, 1) 102 | 103 | min_np = numpy.maximum(-box_size / 2, -anchors / 2) 104 | max_np = numpy.minimum(box_size / 2, anchors / 2) 105 | 106 | whs = max_np - min_np 107 | 108 | overlap = whs[:, :, 0] * whs[:, :, 1] 109 | union = ( 110 | box_size[:, :, 0] * box_size[:, :, 1] 111 | + anchors[:, 0] * anchors[:, 1] 112 | - whs[:, :, 0] * whs[:, :, 1] 113 | + 1e-10 114 | ) 115 | 116 | iou = overlap / union 117 | best_match_idx = numpy.argmax(iou, axis=1) 118 | 119 | ratio_dict = {1.0: 8.0, 2.0: 16.0, 3.0: 32.0} 120 | for i, idx in enumerate(best_match_idx): 121 | feature_map_group = 2 - idx // 3 122 | ratio = ratio_dict[numpy.ceil((idx + 1) / 3.0)] 123 | x = int(numpy.floor(box_centers[i, 0] / ratio)) 124 | y = int(numpy.floor(box_centers[i, 1] / ratio)) 125 | k = anchors_mask[feature_map_group].index(idx) 126 | # c = labels[i] 127 | 128 | y_true[feature_map_group][y, x, k, :2] = box_centers[i] 129 | y_true[feature_map_group][y, x, k, 2:4] = box_size[i] 130 | y_true[feature_map_group][y, x, k, 4] = 1.0 131 | # y_true[feature_map_group][y, x, k, 5 + c] = 1. 132 | 133 | return y_true_1, y_true_2, y_true_3 134 | -------------------------------------------------------------------------------- /distributed_training/weights/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/distributed_training/weights/model.h5 -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | mlflow: 5 | container_name: mlflow 6 | build: 7 | context: ./mlflow 8 | dockerfile: Dockerfile 9 | ports: 10 | - "5000:5000" 11 | volumes: 12 | - mlflow-artifacts:/mlflow 13 | environment: 14 | - MLFLOW_TRACKING_URI=http://0.0.0.0:5000 15 | - MLFLOW_ARTIFACT_ROOT=/mlflow 16 | 17 | train-service: 18 | container_name: train 19 | build: 20 | context: ./train 21 | dockerfile: Dockerfile 22 | volumes: 23 | - ./train:/app 24 | depends_on: 25 | - mlflow 26 | environment: 27 | - MLFLOW_TRACKING_URI=http://mlflow:5000 28 | 29 | 30 | jenkins: 31 | image: fullstackdatascience/jenkins:lts 32 | container_name: jenkins 33 | restart: unless-stopped 34 | privileged: true 35 | user: root 36 | ports: 37 | - 8081:8080 38 | - 50000:50000 39 | volumes: 40 | - jenkins_home:/var/jenkins_home 41 | - /var/run/docker.sock:/var/run/docker.sock 42 | 43 | 44 | volumes: 45 | mlflow-artifacts: 46 | jenkins_home: 47 | -------------------------------------------------------------------------------- /images/PipelineAllcode.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/PipelineAllcode.png -------------------------------------------------------------------------------- /images/add_credential.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/add_credential.png -------------------------------------------------------------------------------- /images/add_credential_dockerhub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/add_credential_dockerhub.png -------------------------------------------------------------------------------- /images/add_token_dockerhub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/add_token_dockerhub.png -------------------------------------------------------------------------------- /images/architecutre_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/architecutre_overview.png -------------------------------------------------------------------------------- /images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/bus.jpg -------------------------------------------------------------------------------- /images/check_request_github_jenkins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/check_request_github_jenkins.png -------------------------------------------------------------------------------- /images/connector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/connector.png -------------------------------------------------------------------------------- /images/data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/data_pipeline.png -------------------------------------------------------------------------------- /images/diagram_pipe.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/diagram_pipe.gif -------------------------------------------------------------------------------- /images/error_log_pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/error_log_pod.png 
-------------------------------------------------------------------------------- /images/false_modelmesh_deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/false_modelmesh_deploy.png -------------------------------------------------------------------------------- /images/generate_token_docker_hub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/generate_token_docker_hub.png -------------------------------------------------------------------------------- /images/get_token_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/get_token_github.png -------------------------------------------------------------------------------- /images/github_tokens.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/github_tokens.png -------------------------------------------------------------------------------- /images/instal_docker_jenkins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/instal_docker_jenkins.png -------------------------------------------------------------------------------- /images/install_docker_success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/install_docker_success.png -------------------------------------------------------------------------------- /images/isvc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/isvc.png -------------------------------------------------------------------------------- /images/jenkins_container.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/jenkins_container.png -------------------------------------------------------------------------------- /images/jenkins_portal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/jenkins_portal.png -------------------------------------------------------------------------------- /images/jenkins_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/jenkins_ui.png -------------------------------------------------------------------------------- /images/messenger.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/messenger.png -------------------------------------------------------------------------------- /images/minio-credentials.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/minio-credentials.png -------------------------------------------------------------------------------- /images/mlflow _modelregistry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/mlflow _modelregistry.png -------------------------------------------------------------------------------- /images/modelmesh-serving-installation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/modelmesh-serving-installation.png -------------------------------------------------------------------------------- /images/ngrok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/ngrok.png -------------------------------------------------------------------------------- /images/ngrok_forwarding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/ngrok_forwarding.png -------------------------------------------------------------------------------- /images/password_jenkins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/password_jenkins.png -------------------------------------------------------------------------------- /images/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/result.png -------------------------------------------------------------------------------- /images/result_connect_jenkins_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/result_connect_jenkins_github.png -------------------------------------------------------------------------------- /images/result_push_dockerhub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/result_push_dockerhub.png -------------------------------------------------------------------------------- /images/result_train_pod.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/result_train_pod.png -------------------------------------------------------------------------------- /images/strategy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/strategy.png -------------------------------------------------------------------------------- /images/strategy_scope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/strategy_scope.png -------------------------------------------------------------------------------- /images/structure_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/structure_data.png -------------------------------------------------------------------------------- /images/structure_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/structure_training.png -------------------------------------------------------------------------------- /images/topic_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/topic_tab.png -------------------------------------------------------------------------------- /images/train_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/train_process.png -------------------------------------------------------------------------------- /images/ui_build_jenkins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/ui_build_jenkins.png -------------------------------------------------------------------------------- /images/validate_connect_repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/validate_connect_repo.png -------------------------------------------------------------------------------- /images/webhook_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/images/webhook_github.png -------------------------------------------------------------------------------- /mlflow/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile.mlflow 2 | FROM python:3.9-slim 3 | 4 | RUN pip install mlflow 5 | 6 | CMD ["mlflow", "server", "--host", "0.0.0.0", "--port", "5000"] 
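# Note: `mlflow` is installed unpinned above; for reproducible images consider
# pinning a version (e.g. mlflow==2.9.2 -- version shown is illustrative).
# With this command, run metadata lives in ./mlruns inside the container; a
# --backend-store-uri flag can be added to persist it outside the container.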
-------------------------------------------------------------------------------- /model_repo/yolov8n_car/1/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/model_repo/yolov8n_car/1/model.onnx -------------------------------------------------------------------------------- /model_repo/yolov8n_car/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Model configuration file (optional) 2 | # https://github.com/triton-inference-server/tutorials/blob/main/Conceptual_Guide/Part_1-model_deployment/README.md#model-configuration 3 | name: "yolov8n_car" 4 | backend: "onnxruntime" # Select the backend to run the model https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton 5 | max_batch_size : 2 # Max batch size the model can support 6 | # In most cases, Triton can help to extract `input` and `output` 7 | # but we should declare it explicitly 8 | input [ 9 | { 10 | name: "images" 11 | data_type: TYPE_FP32 12 | dims: [ 3, 640, 640 ] # If no batch, pls use [ 1, 640, 640 ] 13 | } 14 | ] 15 | output [ 16 | { 17 | name: "output0" 18 | data_type: TYPE_FP32 19 | dims: [ -1, -1 ] # If no batch, pls use [ 84, 8400 ] 20 | } 21 | ] 22 | 23 | instance_group [ { kind: KIND_CPU } ] -------------------------------------------------------------------------------- /notebooks/debug.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import xml.etree.ElementTree\n", 11 | "# import cv2\n", 12 | "import numpy\n", 13 | "from utils import config\n", 14 | "import posixpath" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "def load_image(file_name):\n", 24 | " path = posixpath.join(config.data_dir, config.image_dir, file_name + '.jpg')\n", 25 | " image = cv2.imread(path)\n", 26 | " return image\n", 27 | "\n", 28 | "\n", 29 | "def load_label(file_name, split='train'):\n", 30 | " # Construct the new path for the label file\n", 31 | " path = posixpath.join(config.data_dir, config.label_dir, split, file_name + '.txt')\n", 32 | " \n", 33 | " boxes = []\n", 34 | " \n", 35 | " # Read the text file line by line\n", 36 | " with open(path, 'r') as f:\n", 37 | " for line in f:\n", 38 | " # Split the line into coordinates\n", 39 | " _, x_min, y_min, x_max, y_max = line.strip().split()\n", 40 | " x_min = float(x_min)\n", 41 | " y_min = float(y_min)\n", 42 | " x_max = float(x_max)\n", 43 | " y_max = float(y_max)\n", 44 | "\n", 45 | " boxes.append([x_min, y_min, x_max, y_max])\n", 46 | " \n", 47 | " boxes = numpy.asarray(boxes, numpy.float32) \n", 48 | " return boxes" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "0.562590448668639 0.6324806949999999 0.38422575976331363 0.2265122263157895\n", 61 | "0.9337916063609467 0.564913127368421 0.13241678727810652 0.17631917631578944\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "path = \"Dataset/labels/valid/vid_4_700.txt\"\n", 67 | "with open(path, 'r') as f:\n", 68 | " for line in f:\n", 69 | " # 
print(line)\n", 70 | " _, x_min, y_min, x_max, y_max = line.strip().split()\n", 71 | " print(x_min, y_min, x_max, y_max)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# file_names = []\n", 81 | "# with open(os.path.join(config.data_dir, 'train.txt')) as f:\n", 82 | "# for file_name in f.readlines():\n", 83 | "# image_path = os.path.join(config.data_dir, config.image_dir, file_name.rstrip() + '.jpg')\n", 84 | "# label_path = os.path.join(config.data_dir, config.label_dir, file_name.rstrip() + '.xml')\n", 85 | "# if os.path.exists(image_path) and os.path.exists(label_path):\n", 86 | "# if os.path.exists(os.path.join(config.data_dir, 'TF')):\n", 87 | "# file_names.append(os.path.join(config.data_dir, 'TF', file_name.rstrip() + '.tf'))\n", 88 | "# else:\n", 89 | "# file_names.append(file_name.rstrip())\n", 90 | " \n", 91 | "# print(file_names)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "(284, 338, 284)" 103 | ] 104 | }, 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "\n", 112 | "image_path = posixpath.join(config.data_dir, config.image_dir, 'train')\n", 113 | "label_path = posixpath.join(config.data_dir, config.label_dir, 'train')\n", 114 | "\n", 115 | "image_files = [os.path.splitext(file_name)[0] for file_name in os.listdir(image_path) if file_name.lower().endswith('.jpg')]\n", 116 | "label_files = [os.path.splitext(file_name)[0] for file_name in os.listdir(label_path) if file_name.lower().endswith('.txt')]\n", 117 | "\n", 118 | "file_names = list(set(image_files) & set(label_files))\n", 119 | "len(file_names), len(image_files), len(label_files)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "" 131 | ] 132 | }, 133 | "execution_count": 6, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "from utils import config, util\n", 140 | "from tensorflow.keras import utils\n", 141 | "import tensorflow as tf\n", 142 | "\n", 143 | "\n", 144 | "class Generator(utils.Sequence):\n", 145 | " def __init__(self, file_names):\n", 146 | " self.file_names = file_names\n", 147 | "\n", 148 | " def __len__(self):\n", 149 | " return int(numpy.floor(len(self.file_names) / config.batch_size))\n", 150 | "\n", 151 | " def __getitem__(self, index):\n", 152 | " image = util.load_image(self.file_names[index])\n", 153 | " boxes = util.load_label(self.file_names[index])\n", 154 | " image, boxes = util.resize(image, boxes)\n", 155 | " # image, boxes = util.random_flip(image, boxes)\n", 156 | "\n", 157 | " image = image[:, :, ::-1].astype(numpy.float32)\n", 158 | " image = image / 255.0\n", 159 | " y_true_1, y_true_2, y_true_3 = util.process_box(boxes)\n", 160 | " return image, y_true_1, y_true_2, y_true_3\n", 161 | "\n", 162 | " def on_epoch_end(self):\n", 163 | " numpy.random.shuffle(self.file_names)\n", 164 | "\n", 165 | "\n", 166 | "def input_fn(file_names):\n", 167 | " # def generator_fn():\n", 168 | " generator = utils.OrderedEnqueuer(Generator(file_names), True)\n", 169 | " # generator.start(workers=min(os.cpu_count(), config.batch_size))\n", 170 | " while True:\n", 171 | " image, y_true_1, y_true_2, y_true_3 = generator.get().__next__()\n", 172 | " 
print(image, y_true_1, y_true_2, y_true_3)\n",
 173 |     "        yield image, y_true_1, y_true_2, y_true_3\n",
 174 |     "    \n",
 175 |     "    \n",
 176 |     "input_fn(file_names)"
 177 |    ]
 178 |   },
 179 |   {
 180 |    "cell_type": "code",
 181 |    "execution_count": 7,
 182 |    "metadata": {},
 183 |    "outputs": [
 184 |     {
 185 |      "name": "stdout",
 186 |      "output_type": "stream",
 187 |      "text": [
 188 |       "vid_4_6380\n",
 189 |       "[[[0. 0. 0.]\n",
       "  ... [output truncated: several hundred rows of all-zero image and y_true arrays omitted] ...\n"
 724 |      ]
 725 |     }
 726 |    ],
 727 |    "source": [
 728 |     "index = 0\n",
 729 |     "print(file_names[index])\n",
 730 |     "image = util.load_image(file_names[index])\n",
 731 |     "boxes = util.load_label(file_names[index])\n",
 732 |     "image, boxes = util.resize(image, boxes)\n",
 733 |     "# image, boxes = util.random_flip(image, boxes)\n",
 734 |     "\n",
 735 |     "image = image[:, :, ::-1].astype(numpy.float32)\n",
 736 |     "image = image / 255.0\n",
 737 |     "y_true_1, y_true_2, y_true_3 = util.process_box(boxes)\n",
 738 |     "print(image, y_true_1, y_true_2, y_true_3)\n"
 739 |    ]
 740 |   },
 741 |   {
 742 |    "cell_type": "code",
 743 |    "execution_count": 8,
 744 |    "metadata": {},
 745 |    "outputs": [],
 746 |    "source": [
 747 |     "# # Create an iterator for the dataset\n",
 748 |     "# iterator = iter(dataset)\n",
 749 |     "\n",
 750 |     "# # Get one sample from the dataset\n",
 751 |     "# sample = next(iterator)\n",
 752 |     "\n",
 753 |     "# # Unpack the sample\n",
 754 |     "# image, y_true_1, y_true_2, y_true_3 = sample"
 755 |    ]
 756 |   }
 757 |  ],
 758 |  "metadata": {
 759 |   "kernelspec": {
 760 |    "display_name": "Python 3",
 761 |    "language": "python",
 762 |    "name": "python3"
 763 |   },
 764 |   "language_info": {
 765 |    "codemirror_mode": {
 766 |     "name": "ipython",
 767 |     "version": 3
 768 |    },
 769 |    "file_extension": ".py",
 770 |    "mimetype": "text/x-python",
 771 |    "name": "python",
 772 |    "nbconvert_exporter": "python",
 773 |    "pygments_lexer": "ipython3",
 774 |    "version": "3.10.13"
 775 |   }
 776 |  },
 777 |  "nbformat": 4,
 778 |  "nbformat_minor": 2
 779 | }
 780 | 
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
 1 | minio==7.1.17
 2 | ultralytics==8.0.202
 3 | onnx==1.15.0
 4 | tritonclient[http]==2.39.0
 5 | onnxsim==0.4.35
 6 | onnxruntime-gpu==1.16.1
 7 | tensorrt==8.6.1
 8 | cuda-python==12.3.0
 9 | 
-------------------------------------------------------------------------------- /streaming/Dockerfile: --------------------------------------------------------------------------------
 1 | FROM python:3.8-slim
 2 | 
 3 | # Install dependencies
 4 | RUN pip3 install kafka-python==2.0.2
 5 | RUN pip3 install avro==1.11.1
 6 | RUN pip3 install pandas==1.5.1
 7 | RUN pip3 install pyarrow==10.0.1
 8 | RUN pip3 install python-schema-registry-client==2.4.1
 9 | RUN pip3 install 
10 |
11 | # Copy app handler code
12 | COPY produce.py produce.py
13 | # COPY kafka_producer/generate_schemas.py generate_schemas.py
14 | # COPY run.sh .
15 | # # Uncomment this to generate a random schema
16 | # RUN chmod +x /run.sh && ./run.sh generate_schemas
17 |
18 | CMD [ "python", "-u", "produce.py", "--mode", "setup", "--bootstrap_servers", "broker:29092"]
-------------------------------------------------------------------------------- /streaming/README.md: --------------------------------------------------------------------------------
1 | # Data Pipeline Guide
2 | ## Table of Contents
3 | 1. [Introduction](#introduction)
4 | 2. [Dataset Setup](#dataset-setup)
5 |    - [Downloading the Dataset](#downloading-the-dataset)
6 |    - [Folder Structure](#folder-structure)
7 | 3. [Kafka and Flink Setup](#kafka-and-flink-setup)
8 |    - [Starting Docker Compose](#starting-docker-compose)
9 |    - [Accessing Kafka Control Center](#accessing-kafka-control-center)
10 | 4. [Viewing Kafka Topics](#viewing-kafka-topics)
11 |    - [Accessing the Topics Tab](#accessing-the-topics-tab)
12 |    - [Viewing Topic Messages](#viewing-topic-messages)
13 | 5. [Adding Kafka Connector](#adding-kafka-connector)
14 | 6. [Verifying Data in PostgreSQL](#verifying-data-in-postgresql)
15 | 7. [Note](#note)
16 |
17 | ---
18 | ![DataPipeline](images/data-pipeline.png)
19 | ## Introduction
20 | In this step, we focus on the real-time data processing component of our pipeline. The goal is to simulate and process streaming data to enhance the robustness of our data pipeline.
21 |
22 | 👉 We achieve this by leveraging **Kafka** for stream ingestion and **Apache Flink** for stream processing.
23 |
24 | **Note:** In this project, Kafka acts as the backbone for stream data ingestion, handling both real and simulated (fake) stream data. **Apache Flink** processes this data in real time, ensuring that the processed data is available in our **Redis** online store. Redis, in turn, is synced with our **PostgreSQL** offline store, providing a unified data storage solution.
25 |
26 | Key steps in this process include:
27 |
28 | 1. **Kafka Producer Setup**: The Kafka producer service is responsible for continuously sending data streams. You can customize the message format, bind data to messages, and specify the Kafka topic for message distribution.
29 |
30 | 2. **Stream Processing with Apache Flink**: Flink processes the incoming data streams, transforming them into a format suitable for storage in Redis. This ensures that data is available for immediate use in both real-time and batch processing scenarios.
31 |
32 | 3. **Data Syncing**: Redis, serving as the online store, is synced with PostgreSQL to maintain consistency between real-time and offline data.
33 |
34 | For more details on setting up the Kafka producer and configuring Flink for stream processing, refer to the [Confluent PostgreSQL Sink Guide](https://docs.confluent.io/cloud/current/connectors/cc-postgresql-sink.html#step-6-check-the-results-in-postgresql).
35 |
36 |
37 | # Dataset Setup
38 |
39 | ## Downloading the Dataset
40 | To begin, download the dataset required for streaming from the following link: [Dataset Link](https://drive.google.com/drive/folders/12ncEAoWT_kwuPT8YRdFysqgS54XJwre7?usp=drive_link)
41 |
42 | ## Folder Structure
43 | The structure of the folder will be like this:
44 |
45 |
46 | ![Structure Data Folder](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/structure_data.png?raw=true)
47 |
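48 | In plain terms, the producer expects the images under `Dataset/images/train` — this is the path mounted into the producer container in `docker-compose.yml`. A sketch of the expected layout (file names are illustrative):
49 |
50 | ```
51 | streaming/
52 | └── Dataset/
53 |     └── images/
54 |         └── train/
55 |             ├── 0001.jpg
56 |             ├── 0002.jpg
57 |             └── ...
58 | ```
59 |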
60 | # Kafka and Flink Setup
61 |
62 | ## Starting Docker Compose
63 | If you haven't already done so in previous steps, start the Docker Compose setup to launch the necessary services.
64 |
65 | ## Accessing Kafka Control Center
66 | Once Docker Compose is running, you can access the Kafka Control Center by navigating to `http://localhost:9021`. This interface allows you to manage and monitor your Kafka cluster.
67 |
68 | # Viewing Kafka Topics
69 |
70 | ## Accessing the Topics Tab
71 | To view the available Kafka topics, click on the `Topics` tab within the Kafka Control Center. You can follow the steps outlined in the image below:
72 |
73 | ![Topic Tab](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/topic_tab.png?raw=true)
74 |
75 | ## Viewing Topic Messages
76 | Select a specific topic (e.g., `image_0`) to view the messages being transmitted:
77 |
78 | ![Message](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/messenger.png?raw=true)
79 |
80 | # Adding Kafka Connector
81 | To ensure that messages are forwarded to PostgreSQL, you need to add a Kafka connector. An example configuration file, `connect-timescaledb-sink.json`, is provided in this repository for your reference (see the example commands at the end of this guide):
82 |
83 | ![Connector](https://github.com/HungNguyenDev1511/Car-detection-serving-model/blob/refactor/images/connector.png?raw=true)
84 |
85 | # Verifying Data in PostgreSQL
86 | Finally, after setting up the Kafka connector, verify that the data has been transferred to PostgreSQL by querying the database with SQL and confirming that the records are stored correctly and ready for training.
87 |
88 | # Note
89 | Before proceeding to data verification, ensure that the Kafka connector is properly configured and operational. This step is crucial for the successful transmission of data from Kafka to PostgreSQL.
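90 |
91 | As a reference, here is a minimal sketch of the registration and verification steps. It assumes the services defined in `docker-compose.yml` are up, and that the JDBC sink's `auto.create` option creates a table named after the configured topic (`sink_images_0`); adjust names if your setup differs:
92 |
93 | ```bash
94 | # Register the TimescaleDB sink connector (the Kafka Connect REST API is published on localhost:8083)
95 | ./run.sh register_connector kafka_connector/connect-timescaledb-sink.json
96 |
97 | # Check that the connector is RUNNING ("images-sink" is the name set in the config file)
98 | curl -s http://localhost:8083/connectors/images-sink/status
99 |
100 | # Query the offline store (the k6 user/database/password come from docker-compose.yml)
101 | docker exec -it flink-timescaledb psql -U k6 -d k6 \
102 |   -c 'SELECT image_id, octet_length(image_data) AS image_bytes FROM "sink_images_0" LIMIT 5;'
103 | ```
104 | One caveat worth noting: `produce.py` publishes to the `image_0` topic while the provided sink config subscribes to `sink_images_0`, so align the connector's `topics` setting with the topic you actually produce to if no rows appear.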
105 |
-------------------------------------------------------------------------------- /streaming/docker-compose.yml: --------------------------------------------------------------------------------
1 | services:
2 |   zookeeper:
3 |     image: confluentinc/cp-zookeeper:7.5.0
4 |     # hostname: zookeeper
5 |     container_name: flink-zookeeper
6 |     ports:
7 |       - "2181:2181"
8 |     healthcheck:
9 |       test: echo srvr | nc zookeeper 2181 || exit 1
10 |       start_period: 10s
11 |       retries: 20
12 |       interval: 10s
13 |     environment:
14 |       ZOOKEEPER_CLIENT_PORT: 2181
15 |       ZOOKEEPER_TICK_TIME: 2000
16 |
17 |   # Kafka broker
18 |   broker:
19 |     image: confluentinc/cp-server:7.5.0
20 |     # hostname: broker
21 |     container_name: flink-broker
22 |     depends_on:
23 |       - zookeeper
24 |     ports:
25 |       - "9092:9092"
26 |       - "9101:9101"
27 |     healthcheck:
28 |       test: nc -z localhost 9092 || exit 1
29 |       start_period: 15s
30 |       interval: 5s
31 |       timeout: 10s
32 |       retries: 10
33 |     environment:
34 |       KAFKA_BROKER_ID: 1
35 |       KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
36 |       KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
37 |       KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
38 |       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
39 |
40 |   # For managing Avro schemas
41 |   schema-registry:
42 |     image: confluentinc/cp-schema-registry:7.5.0
43 |     # hostname: schema-registry
44 |     container_name: flink-schema-registry
45 |     depends_on:
46 |       - broker
47 |     ports:
48 |       - "8081:8081"
49 |     healthcheck:
50 |       start_period: 10s
51 |       interval: 10s
52 |       retries: 20
53 |       test: curl --user superUser:superUser --fail --silent --insecure http://localhost:8081/subjects --output /dev/null || exit 1
54 |     environment:
55 |       SCHEMA_REGISTRY_HOST_NAME: schema-registry
56 |       SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'broker:29092'
57 |       SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081
58 |
59 |   # For connecting to offline store
60 |   connect:
61 |     image: confluentinc/cp-kafka-connect:7.5.0
62 |     # hostname: connect
63 |     container_name: flink-connect
64 |     depends_on:
65 |       broker:
66 |         condition: service_healthy
67 |       schema-registry:
68 |         condition: service_healthy
69 |       zookeeper:
70 |         condition: service_healthy
71 |     ports:
72 |       - "8083:8083"
73 |     environment:
74 |       CONNECT_BOOTSTRAP_SERVERS: 'broker:29092'
75 |       CONNECT_REST_ADVERTISED_HOST_NAME: connect
76 |       CONNECT_REST_PORT: 8083
77 |       CONNECT_GROUP_ID: compose-connect-group
78 |       CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
79 |       CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
80 |       CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
81 |       CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
82 |       CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
83 |       CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
84 |       CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
85 |       CONNECT_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter
86 |       CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
87 |       CONNECT_KEY_CONVERTER_SCHEMAS_ENABLE: true
88 |       CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: true
89 |       CONNECT_KEY_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
90 |       CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
91 |       CONNECT_PLUGIN_PATH: '/usr/share/java,/etc/kafka-connect/jars'
92 |
93 |     volumes:
94 |       # Mount the JDBC connector jars into the plugin path declared above
95 |       - ./kafka_connector/jars:/etc/kafka-connect/jars
96 |
97 |   # Confluent control center to manage Kafka
98 |   control-center:
99 |     image: confluentinc/cp-enterprise-control-center:7.5.0
100 |     # hostname: control-center
101 |     container_name: flink-control-center
102 |     depends_on:
103 |       - broker
104 |       - schema-registry
105 |       - connect
106 |     ports:
107 |       - "9021:9021"
108 |     healthcheck:
109 |       test: ["CMD", "curl", "-f", "http://localhost:9021/healthcheck"] # Adjust the URL and options as needed
110 |       interval: 30s
111 |       timeout: 10s
112 |       retries: 3
113 |     environment:
114 |       CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092'
115 |       CONTROL_CENTER_CONNECT_CONNECT-DEFAULT_CLUSTER: 'connect:8083'
116 |       # CONTROL_CENTER_KSQL_KSQLDB1_URL: "http://ksqldb-server:8088"
117 |       # CONTROL_CENTER_KSQL_KSQLDB1_ADVERTISED_URL: "http://localhost:8088"
118 |       CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://schema-registry:8081"
119 |       CONTROL_CENTER_REPLICATION_FACTOR: 1
120 |       CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1
121 |       # CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1
122 |       CONTROL_CENTER_CONNECT_HEALTHCHECK_ENDPOINT: '/connectors'
123 |       CONFLUENT_METRICS_TOPIC_REPLICATION: 1
124 |       # PORT: 9021
125 |
126 |   # Offline store
127 |   timescaledb:
128 |     image: timescale/timescaledb:latest-pg13
129 |     command: postgres -c shared_preload_libraries=timescaledb
130 |     container_name: flink-timescaledb
131 |     ports:
132 |       - "5432:5432"
133 |     healthcheck:
134 |       test: ['CMD', 'psql', '-U', 'k6', '-c', 'SELECT 1']
135 |       interval: 10s
136 |       timeout: 5s
137 |       retries: 5
138 |     environment:
139 |       - PGDATA=/var/lib/postgresql/data/timescaledb
140 |       - POSTGRES_DB=k6
141 |       - POSTGRES_USER=k6
142 |       - POSTGRES_PASSWORD=k6
143 |
144 |   # Simulation of sending messages to Kafka topics
145 |   kafka_producer:
146 |     build:
147 |       context: .
148 |       # The Dockerfile sits next to this compose file, so a path relative to the build context is used
149 |       dockerfile: Dockerfile
150 |     depends_on:
151 |       broker:
152 |         condition: service_healthy
153 |       timescaledb:
154 |         condition: service_healthy
155 |     container_name: flink-kafka-producer
156 |     volumes:
157 |       # Mount the training images so produce.py can read them from /images
158 |       - ./Dataset/images/train:/images
-------------------------------------------------------------------------------- /streaming/images/data-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HungNguyenDev1511/Car-detection-serving-model/493b25a3784ec8357420ca43d2e5029376c21dae/streaming/images/data-pipeline.png -------------------------------------------------------------------------------- /streaming/kafka_connector/connect-timescaledb-sink.json: --------------------------------------------------------------------------------
1 | {
2 |   "name": "images-sink",
3 |   "config": {
4 |     "connector.class": "io.confluent.connect.jdbc.JdbcSinkConnector",
5 |     "tasks.max": "1",
6 |     "topics": "sink_images_0",
7 |     "connection.url": "jdbc:postgresql://host.docker.internal:5432/k6",
8 |     "connection.user": "k6",
9 |     "connection.password": "k6",
10 |     "auto.create": true
11 |   }
12 | }
-------------------------------------------------------------------------------- /streaming/produce.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | from time import sleep
5 |
6 | from bson import json_util
7 | from kafka import KafkaAdminClient, KafkaProducer
8 | from kafka.admin import NewTopic
9 | from kafka.errors import TopicAlreadyExistsError
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument(
13 |     "-m",
14 |     "--mode",
15 |     default="setup",
16 |     choices=["setup", "teardown"],
17 |     help="Whether to set up or tear down the Kafka topics carrying image events. Setup tears down any existing topics before it starts emitting events.",
18 | )
19 | parser.add_argument(
20 |     "-b",
21 |     "--bootstrap_servers",
22 |     default="localhost:9092",
23 |     help="Where the bootstrap server is",
24 | )
25 | parser.add_argument(
26 |     "-c",
27 |     "--schemas_path",
28 |     default="./avro_schemas",
29 |     help="Folder containing all generated avro schemas",
30 | )
31 | parser.add_argument(
32 |     "-i",
33 |     "--image_dir",
34 |     default="./images",
35 |     help="Directory containing the images to send",
36 | )
37 |
38 | args = parser.parse_args()
39 |
40 | # Only one image topic (image_0) is produced below; raise this to shard across more topics
41 | NUM_IMAGE = 1
42 | image_id_counter = 1
43 |
44 |
45 | def create_topic(admin, topic_name):
46 |     # Create the topic if it does not exist yet
47 |     try:
48 |         topic = NewTopic(name=topic_name, num_partitions=1, replication_factor=1)
49 |         admin.create_topics([topic])
50 |         print(f"A new topic {topic_name} has been created!")
51 |     except TopicAlreadyExistsError:
52 |         print(f"Topic {topic_name} already exists. Skipping creation!")
53 |
54 |
55 | def create_streams(servers, schemas_path, image_dir):
56 |     # The counter is defined at module level, so declare it before mutating it below
57 |     global image_id_counter
58 |
59 |     producer = None
60 |     admin = None
61 |     for _ in range(10):
62 |         try:
63 |             producer = KafkaProducer(bootstrap_servers=servers)
64 |             admin = KafkaAdminClient(bootstrap_servers=servers)
65 |             print("SUCCESS: instantiated Kafka admin and producer")
66 |             break
67 |         except Exception as e:
68 |             print(
69 |                 f"Trying to instantiate admin and producer with bootstrap servers {servers} with error {e}"
70 |             )
71 |             sleep(10)
72 |
73 |     image_files = [
74 |         os.path.join(image_dir, f)
75 |         for f in os.listdir(image_dir)
76 |         if os.path.isfile(os.path.join(image_dir, f))
77 |     ]
78 |     image_index = 0
79 |
80 |     while True:
81 |         image_file = image_files[image_index]
82 |         image_index = (image_index + 1) % len(image_files)
83 |
84 |         with open(image_file, "rb") as img_file:
85 |             image_data = img_file.read()
86 |
87 |         record = {
88 |             "schema": {
89 |                 "type": "struct",
90 |                 "fields": [
91 |                     {"type": "int64", "optional": False, "field": "image_id"},
92 |                     {"type": "bytes", "optional": False, "field": "image_data"},
93 |                 ],
94 |             }
95 |         }
96 |         record["payload"] = {}
97 |         record["payload"]["image_id"] = image_id_counter
98 |         image_id_counter += 1  # increment the image id
99 |         record["payload"]["image_data"] = image_data
100 |
101 |         # Get topic name for this image
102 |         topic_name = "image_0"
103 |
104 |         # Create a new topic for this image if not exists
105 |         create_topic(admin, topic_name=topic_name)
106 |
107 |         # Send messages to this topic
108 |         producer.send(
109 |             topic_name, json.dumps(record, default=json_util.default).encode("utf-8")
110 |         )
111 |         print(record)
112 |         sleep(2)
113 |
114 |
115 | def teardown_stream(topic_name, servers=["localhost:9092"]):
116 |     try:
117 |         admin = KafkaAdminClient(bootstrap_servers=servers)
118 |         print(admin.delete_topics([topic_name]))
119 |         print(f"Topic {topic_name} deleted")
120 |     except Exception as e:
121 |         print(str(e))
122 |
123 |
124 | if __name__ == "__main__":
125 |     parsed_args = vars(args)
126 |     mode = parsed_args["mode"]
127 |     servers = parsed_args["bootstrap_servers"]
128 |     image_dir = parsed_args["image_dir"]
129 |
130 |     # Tear down all previous streams
131 |     print("Tearing down all existing topics!")
132 |     for image_id in range(NUM_IMAGE):
133 |         try:
134 |             teardown_stream(f"image_{image_id}", [servers])
135 |         except Exception:
136 |             print(f"Topic image_{image_id} does not exist. Skipping...!")
Skipping...!") 140 | 141 | if mode == "setup": 142 | schemas_path = parsed_args["schemas_path"] 143 | create_streams([servers], schemas_path, image_dir) 144 | -------------------------------------------------------------------------------- /streaming/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cmd=$1 3 | 4 | usage() { 5 | echo "run.sh " 6 | echo "Available commands:" 7 | echo " register_connector register a new Kafka connector" 8 | echo " start_streaming start streaming to Kafka" 9 | echo " stop_streaming stop streaming to Kafka" 10 | echo "Available arguments:" 11 | echo " [connector config path] path to connector config, for command register_connector only" 12 | } 13 | 14 | if [[ -z "$cmd" ]]; then 15 | echo "Missing command" 16 | usage 17 | exit 1 18 | fi 19 | 20 | case $cmd in 21 | register_connector) 22 | if [[ -z "$2" ]]; then 23 | echo "Missing connector config path" 24 | usage 25 | exit 1 26 | else 27 | echo "Registering a new connector from $2" 28 | # Assign a connector config path such as: kafka_connect_jdbc/configs/connect-timescaledb-sink.json 29 | curl -s -X POST -H 'Content-Type: application/json' --data @$2 http://localhost:8083/connectors 30 | fi 31 | ;; 32 | generate_schemas) 33 | # Generate data for 1 device with number of features in the range from 2 to 10 34 | python generate_schemas.py --min_features 2 --max_features 10 --num_schemas 1 35 | ;; 36 | *) 37 | echo -n "Unknown command: $cmd" 38 | usage 39 | exit 1 40 | ;; 41 | esac --------------------------------------------------------------------------------