├── README.md
├── load_testing
│   ├── README.md
│   ├── locust
│   │   ├── Dockerfile
│   │   ├── create_docker_image.sh
│   │   ├── data_samples.csv
│   │   ├── locustfile.py
│   │   ├── requirements.txt
│   │   └── run_load_test.sh
│   └── machine_provisioning
│       ├── baseline_benchmark.sh
│       ├── locust_machine.sh
│       ├── model_hardware_optimised_benchmark.sh
│       └── model_optimised_benchmark.sh
└── python-api
    ├── baseline
    │   ├── Dockerfile
    │   ├── main.py
    │   └── requirements.txt
    ├── create_docker_images.sh
    ├── model-hardware-optimised
    │   ├── Dockerfile
    │   ├── main.py
    │   └── requirements.txt
    └── model-optimised
        ├── Dockerfile
        ├── main.py
        └── requirements.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# How to easily and efficiently deploy Hugging Face models
Code repository to reproduce the benchmarking results presented [here](https://www.comet.ml/site/how-to-10x-throughput-when-serving-hugging-face-models-without-a-gpu/)

The repo is broken down into 2 sections:
* python-api: The Python inference services that we tested:
    * baseline: Baseline inference service using default parameters - this is **not** optimized
    * model-hardware-optimised: Optimized inference service for DistilBert - this is the most optimized inference service
    * model-optimised: Optimized inference service that supports both Bert and DistilBert models, as well as optional quantization
* load_testing: Utilities to run the performance benchmarks

## How to run the benchmarks
Given that all our benchmarks are run on GCP, you will need a Google Cloud project.

Running the benchmarks is done in four steps:
1. Create the docker images for each Python API:
```bash
cd python-api
sh create_docker_images.sh <GOOGLE_PROJECT_ID>
```
2. Deploy a virtual machine for the python api we wish to test:
```bash
cd load_testing/machine_provisioning

sh baseline_benchmark.sh <GOOGLE_PROJECT_ID>
```
3. Create a virtual machine from which to run our load testing software:
```bash
cd load_testing/machine_provisioning
sh locust_machine.sh <GOOGLE_PROJECT_ID>
```
4. The script in step 3 finishes by opening an SSH session to the virtual machine. From that terminal, run the load testing script (the load test files were copied to the home directory in step 3):
```bash
sh run_load_test.sh <IP_ADDRESS> <NB_CONCURRENT_USERS>
```
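Once the inference service from step 2 is up, it can be sanity-checked by hand before launching a load test. This is a minimal sketch: the IP address is a placeholder for the one printed by the provisioning script, and the endpoints are the ones exposed by the FastAPI services in python-api.
```bash
# Placeholder IP - use the address printed by the provisioning script
IP_ADDRESS=203.0.113.10

# Health check endpoint exposed by every service
curl http://$IP_ADDRESS:8000/health_check

# Single prediction request, the same call the locust load test makes
curl -X POST http://$IP_ADDRESS:8000/prediction \
    -H "Content-Type: application/json" \
    -d '{"text": "This is a test message, how awesome !"}'
```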
--------------------------------------------------------------------------------
/load_testing/README.md:
--------------------------------------------------------------------------------
## Running benchmarks

1. Start instances using:
    1. `sh baseline_benchmark.sh`
    2. `sh model_optimised_benchmark.sh`
    3. `sh model_hardware_optimised_benchmark.sh`
2. Start locust using: `env LOCUST_MIN_NB_WORDS=45 LOCUST_MAX_NB_WORDS=55 locust --locustfile ./locust/locustfile.py`

--------------------------------------------------------------------------------
/load_testing/locust/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.8.9-slim-buster

WORKDIR /locust
COPY requirements.txt requirements.txt
COPY locustfile.py locustfile.py
COPY data_samples.csv data_samples.csv
COPY run_load_test.sh run_load_test.sh

RUN pip install -r requirements.txt

--------------------------------------------------------------------------------
/load_testing/locust/create_docker_image.sh:
--------------------------------------------------------------------------------
PROJECT_ID=$1
IMAGE_NAME=benchmarking-nlp-inference-locust

gcloud builds submit --tag gcr.io/$PROJECT_ID/$IMAGE_NAME --project $PROJECT_ID

--------------------------------------------------------------------------------
/load_testing/locust/locustfile.py:
--------------------------------------------------------------------------------
import os

import pandas as pd
from locust import HttpUser, task

# Load the sample messages and keep only those within the requested word-count range
data = pd.read_csv('./data_samples.csv', names=['index', 'text'])
data['nb_words'] = data['text'].apply(lambda x: len(x.split(' ')))

max_nb_words = int(os.environ['LOCUST_MAX_NB_WORDS'])
min_nb_words = int(os.environ['LOCUST_MIN_NB_WORDS'])
data = data.loc[(data['nb_words'] >= min_nb_words) & (data['nb_words'] <= max_nb_words)]\
           .reset_index(drop=True)


class UserBehavior(HttpUser):
    @task(1)
    def request_prediction(self):
        # Pick a random sample message and send it to the inference service
        message = data['text'].sample(1).iloc[0]
        self.client.post("/prediction", json={'text': message})

--------------------------------------------------------------------------------
/load_testing/locust/requirements.txt:
--------------------------------------------------------------------------------
locust
pandas

--------------------------------------------------------------------------------
/load_testing/locust/run_load_test.sh:
--------------------------------------------------------------------------------
IP_ADDRESS=$1
NB_CONCURRENT_USERS=$2

env LOCUST_MIN_NB_WORDS=45 LOCUST_MAX_NB_WORDS=55 locust --headless --locustfile ./locustfile.py --host=http://$IP_ADDRESS:8000 \
    --users $NB_CONCURRENT_USERS --spawn-rate 1 --run-time 30s --reset-stats --loglevel ERROR \
    --stop-timeout 999
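As a usage sketch, run_load_test.sh takes the public IP of the instance being benchmarked and the number of concurrent users to simulate; the IP below is a placeholder.
```bash
# 203.0.113.10 is a placeholder - use the IP printed by the provisioning script
sh run_load_test.sh 203.0.113.10 8
```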
--------------------------------------------------------------------------------
/load_testing/machine_provisioning/baseline_benchmark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
PROJECT_ID=$1
ZONE=us-central1-a

# Util function: poll a URL until it responds or the maximum number of attempts is reached
waitforurl() {
  attempt_counter=0
  max_attempts=100

  until $(curl --output /dev/null --silent --head --fail -X GET $1); do
    if [ ${attempt_counter} -eq ${max_attempts} ]; then
      echo "Max attempts reached"
      exit 1
    fi

    printf '.'
    attempt_counter=$(($attempt_counter+1))
    sleep 5
  done
}

# Create firewall rule so we can access the instance from the public internet
gcloud compute firewall-rules create benchmarking-nlp-inference-allow-http-8000 \
    --allow tcp:8000 \
    --source-ranges 0.0.0.0/0 \
    --target-tags benchmarking-nlp-inference-allow-http-8000 \
    --description "Allow port 8000 access to benchmarking_nlp_inference" \
    --project $PROJECT_ID

# Start container for baseline model
MACHINE_TYPE=e2-standard-4
CONTAINER_ENV="NB_WORKERS=1"
INSTANCE_NAME=benchmarking-nlp-inference-baseline

gcloud compute instances create-with-container $INSTANCE_NAME \
    --container-image gcr.io/$PROJECT_ID/benchmarking-nlp-inference-baseline \
    --container-env=$CONTAINER_ENV \
    --machine-type=$MACHINE_TYPE \
    --zone $ZONE \
    --tags benchmarking-nlp-inference-allow-http-8000 \
    --project $PROJECT_ID

# Wait for the machine to start before exiting
IP_ADDRESS=$(gcloud compute instances describe $INSTANCE_NAME --zone $ZONE --project $PROJECT_ID --format='value[](networkInterfaces.accessConfigs[0].natIP)')
waitforurl http://$IP_ADDRESS:8000

echo "-------------------------"
echo "Baseline inference service is ready to be tested:"
echo "* IP = http://$IP_ADDRESS:8000"

--------------------------------------------------------------------------------
/load_testing/machine_provisioning/locust_machine.sh:
--------------------------------------------------------------------------------
#!/bin/bash
PROJECT_ID=$1

ZONE=us-central1-a
MACHINE_TYPE=e2-standard-4

gcloud compute instances create benchmarking-nlp-inference-locust \
    --machine-type=$MACHINE_TYPE \
    --metadata startup-script='#! /bin/bash
      sudo apt-get update
      sudo apt-get --assume-yes install python3 python3-pip
    ' \
    --zone $ZONE \
    --project $PROJECT_ID

gcloud compute scp ../locust/data_samples.csv locust@benchmarking-nlp-inference-locust:~ --zone $ZONE --project $PROJECT_ID
gcloud compute scp ../locust/requirements.txt locust@benchmarking-nlp-inference-locust:~ --zone $ZONE --project $PROJECT_ID
gcloud compute scp ../locust/locustfile.py locust@benchmarking-nlp-inference-locust:~ --zone $ZONE --project $PROJECT_ID
gcloud compute scp ../locust/run_load_test.sh locust@benchmarking-nlp-inference-locust:~ --zone $ZONE --project $PROJECT_ID

gcloud compute ssh --zone $ZONE locust@benchmarking-nlp-inference-locust --project $PROJECT_ID -- 'pip3 install -r requirements.txt'
gcloud compute ssh --zone $ZONE locust@benchmarking-nlp-inference-locust --project $PROJECT_ID
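None of the provisioning scripts delete what they create, so after a benchmark run you may want to tear the resources down by hand. A sketch, assuming the default names and zone used above (the project ID is a placeholder, and the instance list should match whatever you actually created):
```bash
PROJECT_ID=my-gcp-project   # placeholder
ZONE=us-central1-a

# Delete the baseline and locust instances created above
gcloud compute instances delete benchmarking-nlp-inference-baseline benchmarking-nlp-inference-locust \
    --zone $ZONE --project $PROJECT_ID --quiet

# Delete the firewall rule that opened port 8000
gcloud compute firewall-rules delete benchmarking-nlp-inference-allow-http-8000 \
    --project $PROJECT_ID --quiet
```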
--------------------------------------------------------------------------------
/load_testing/machine_provisioning/model_hardware_optimised_benchmark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
PROJECT_ID=$1
INPUT_MACHINE_TYPE=$2
INPUT_NB_WORKERS=$3
ZONE=us-central1-a

# Util function: poll a URL until it responds or the maximum number of attempts is reached
waitforurl() {
  attempt_counter=0
  max_attempts=100

  until $(curl --output /dev/null --silent --head --fail -X GET $1); do
    if [ ${attempt_counter} -eq ${max_attempts} ]; then
      echo "Max attempts reached"
      exit 1
    fi

    printf '.'
    attempt_counter=$(($attempt_counter+1))
    sleep 5
  done
}

# Create firewall rule so we can access the instance from the public internet
gcloud compute firewall-rules create benchmarking-nlp-inference-allow-http-8000 \
    --allow tcp:8000 \
    --source-ranges 0.0.0.0/0 \
    --target-tags benchmarking-nlp-inference-allow-http-8000 \
    --description "Allow port 8000 access to benchmarking_nlp_inference" \
    --project $PROJECT_ID

function start_container_instance() {
    MACHINE_TYPE=$1
    NB_WORKERS=$2
    # Fixed model configuration for this benchmark; only the machine type and worker count vary
    MODEL_NAME=DistilBert
    QUANTIZE_MODEL=true

    INSTANCE_NAME=benchmarking-nlp-inference-$NB_WORKERS-$MACHINE_TYPE
    ZONE=us-central1-a

    CONTAINER_ENV="NB_WORKERS=$NB_WORKERS,MODEL_NAME=$MODEL_NAME,QUANTIZE_MODEL=$QUANTIZE_MODEL"
    gcloud compute instances create-with-container $INSTANCE_NAME \
        --container-image gcr.io/$PROJECT_ID/benchmarking-nlp-inference-model-optimised \
        --container-env=$CONTAINER_ENV \
        --machine-type=$MACHINE_TYPE \
        --zone $ZONE \
        --tags benchmarking-nlp-inference-allow-http-8000 \
        --project $PROJECT_ID

    IP_ADDRESS=$(gcloud compute instances describe $INSTANCE_NAME --zone $ZONE --project $PROJECT_ID --format='value[](networkInterfaces.accessConfigs[0].natIP)')
    waitforurl http://$IP_ADDRESS:8000

    echo "-------------------------"
    echo "Model is ready to be tested:"
    echo "* Model = $MODEL_NAME"
    echo "* Quantization = $QUANTIZE_MODEL"
    echo "* Number workers = $NB_WORKERS"
    echo "* Machine Type = $MACHINE_TYPE"
    echo "* IP = http://$IP_ADDRESS:8000"
}

# Benchmark the requested machine type and number of workers
start_container_instance $INPUT_MACHINE_TYPE $INPUT_NB_WORKERS
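A usage sketch: the script takes the GCP project, the machine type to provision and the number of gunicorn workers (the project ID below is a placeholder).
```bash
sh model_hardware_optimised_benchmark.sh my-gcp-project e2-standard-4 4
```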
--------------------------------------------------------------------------------
/load_testing/machine_provisioning/model_optimised_benchmark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
PROJECT_ID=$1
ZONE=us-central1-a

# Util function: poll a URL until it responds or the maximum number of attempts is reached
waitforurl() {
  attempt_counter=0
  max_attempts=100

  until $(curl --output /dev/null --silent --head --fail -X GET $1); do
    if [ ${attempt_counter} -eq ${max_attempts} ]; then
      echo "Max attempts reached"
      exit 1
    fi

    printf '.'
    attempt_counter=$(($attempt_counter+1))
    sleep 5
  done
}

# Create firewall rule so we can access the instance from the public internet
gcloud compute firewall-rules create benchmarking-nlp-inference-allow-http-8000 \
    --allow tcp:8000 \
    --source-ranges 0.0.0.0/0 \
    --target-tags benchmarking-nlp-inference-allow-http-8000 \
    --description "Allow port 8000 access to benchmarking_nlp_inference" \
    --project $PROJECT_ID

function start_container_instance() {
    MACHINE_TYPE=e2-standard-4
    MODEL_NAME=$1
    QUANTIZE_MODEL=$2
    NB_WORKERS=$3

    MODEL_LOWER_CASE=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]')
    INSTANCE_NAME=benchmarking-nlp-inference-$NB_WORKERS-$MODEL_LOWER_CASE-$QUANTIZE_MODEL-$MACHINE_TYPE
    ZONE=us-central1-a

    CONTAINER_ENV="NB_WORKERS=$NB_WORKERS,MODEL_NAME=$MODEL_NAME,QUANTIZE_MODEL=$QUANTIZE_MODEL"
    gcloud compute instances create-with-container $INSTANCE_NAME \
        --container-image gcr.io/$PROJECT_ID/benchmarking-nlp-inference-model-optimised \
        --container-env=$CONTAINER_ENV \
        --machine-type=$MACHINE_TYPE \
        --zone $ZONE \
        --tags benchmarking-nlp-inference-allow-http-8000 \
        --project $PROJECT_ID

    IP_ADDRESS=$(gcloud compute instances describe $INSTANCE_NAME --zone $ZONE --project $PROJECT_ID --format='value[](networkInterfaces.accessConfigs[0].natIP)')
    waitforurl http://$IP_ADDRESS:8000

    echo "-------------------------"
    echo "Model is ready to be tested:"
    echo "* Model = $MODEL_NAME"
    echo "* Quantization = $QUANTIZE_MODEL"
    echo "* Number workers = $NB_WORKERS"
    echo "* IP = http://$IP_ADDRESS:8000"
}

# Benchmark the requested model, quantization setting and number of workers
start_container_instance $2 $3 $4
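A usage sketch (placeholder project ID): the positional arguments are the model name (Bert or DistilBert), whether to quantize it (main.py only treats the literal string true as enabled) and the number of gunicorn workers.
```bash
sh model_optimised_benchmark.sh my-gcp-project DistilBert true 4
```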
--------------------------------------------------------------------------------
/python-api/baseline/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.8.9-slim-buster

WORKDIR /app
COPY requirements.txt requirements.txt
COPY main.py main.py

RUN pip install -r requirements.txt

EXPOSE 8000
ENV GUNICORN_CMD_ARGS="--bind=0.0.0.0:8000 -k uvicorn.workers.UvicornWorker --timeout 1000"

ENTRYPOINT exec gunicorn main:app --workers 1

--------------------------------------------------------------------------------
/python-api/baseline/main.py:
--------------------------------------------------------------------------------
from transformers import BertTokenizerFast, BertForSequenceClassification
from typing import List, Optional
import torch
from fastapi import FastAPI
import transformers
from pydantic import BaseModel

transformers.logging.set_verbosity_error()


class ModelInference:
    def __init__(self):
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

    def predict(self, message: str) -> List[List[float]]:
        inputs = self.tokenizer(message, return_tensors="pt")
        labels = torch.tensor([1]).unsqueeze(0)
        outputs = self.model(**inputs, labels=labels)
        res = outputs.logits.detach().numpy().tolist()

        return res


class SimpleMessage(BaseModel):
    text: Optional[str] = 'test'


model_class = ModelInference()

app = FastAPI()


@app.get("/")
async def run_test_prediction():
    prediction = model_class.predict('This is a test message, how awesome !')
    return {'prediction': prediction}


@app.post("/prediction")
async def run_prediction(message: SimpleMessage):
    prediction = model_class.predict(message.text)
    return {'prediction': prediction}


@app.get("/health_check")
async def run_health_check():
    return {'res': True}

--------------------------------------------------------------------------------
/python-api/baseline/requirements.txt:
--------------------------------------------------------------------------------
torch
transformers
pydantic
fastapi
numpy
gunicorn
uvicorn
httptools
uvloop

--------------------------------------------------------------------------------
/python-api/create_docker_images.sh:
--------------------------------------------------------------------------------
function build_base_image() {
    PROJECT_ID=$1
    INFERENCE_SERVER=$2

    cd $INFERENCE_SERVER

    # Create CPU image
    IMAGE_NAME=benchmarking-nlp-inference-$INFERENCE_SERVER
    gcloud builds submit --tag gcr.io/$PROJECT_ID/$IMAGE_NAME --project $PROJECT_ID

    cd ..
}

# Each build runs in its own background subshell, so the cd calls do not interfere
build_base_image $1 baseline &
build_base_image $1 model-optimised &
build_base_image $1 model-hardware-optimised &

# Wait for all three Cloud Build jobs to finish before exiting
wait
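A usage sketch, run from the python-api directory with a placeholder project ID; Cloud Build pushes the three images to that project's container registry.
```bash
cd python-api
sh create_docker_images.sh my-gcp-project
```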
--------------------------------------------------------------------------------
/python-api/model-hardware-optimised/Dockerfile:
--------------------------------------------------------------------------------
FROM intel/intel-optimized-pytorch:latest

WORKDIR /app
COPY requirements.txt requirements.txt
COPY main.py main.py

RUN pip install -r requirements.txt

EXPOSE 8000
ENV GUNICORN_CMD_ARGS="--bind=0.0.0.0:8000 -k uvicorn.workers.UvicornWorker --timeout 1000"

ENTRYPOINT exec gunicorn main:app --workers $NB_WORKERS

--------------------------------------------------------------------------------
/python-api/model-hardware-optimised/main.py:
--------------------------------------------------------------------------------
import intel_pytorch_extension as ipex

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from typing import List, Optional
import torch
from fastapi import FastAPI
import transformers
from pydantic import BaseModel

transformers.logging.set_verbosity_error()
ipex.enable_auto_mixed_precision(mixed_dtype=torch.bfloat16)


class ModelInference:
    def __init__(self):
        # Set the number of threads to 1 per worker for better parallelisation across workers
        torch.set_num_threads(1)
        torch.set_grad_enabled(False)

        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        # Dynamic quantization is left disabled here; bfloat16 mixed precision is enabled above instead
        # model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
        model = model.to(ipex.DEVICE).eval()
        model = torch.jit.script(model)

        self.model = model

    def predict(self, message: str) -> List[List[float]]:
        with torch.no_grad():
            inputs = self.tokenizer(message, return_tensors="pt")
            labels = torch.tensor([1]).unsqueeze(0)

            inputs = {x: inputs[x].to(ipex.DEVICE) for x in inputs}
            labels = labels.to(ipex.DEVICE)

            outputs = self.model(**inputs, labels=labels)
            res = outputs.logits.cpu().numpy().tolist()

        return res


class SimpleMessage(BaseModel):
    text: Optional[str] = 'test'


model_class = ModelInference()

app = FastAPI()


@app.get("/")
def run_test_prediction():
    prediction = model_class.predict('This is a test message, how awesome !')
    return {'prediction': prediction}


@app.post("/prediction")
def run_prediction(message: SimpleMessage):
    prediction = model_class.predict(message.text)
    return {'prediction': prediction}


@app.get("/health_check")
def run_health_check():
    return {'res': True}

--------------------------------------------------------------------------------
/python-api/model-hardware-optimised/requirements.txt:
--------------------------------------------------------------------------------
flask
transformers
pydantic
fastapi
numpy
gunicorn
uvicorn
httptools
uvloop
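To try this image locally before provisioning a VM, a docker run along these lines should work, assuming Docker is installed and the image built by create_docker_images.sh has been pulled (the project ID is a placeholder; NB_WORKERS is consumed by the gunicorn entrypoint). Note that this service relies on the Intel-optimised PyTorch base image, so results are only meaningful on Intel CPUs.
```bash
docker run --rm -p 8000:8000 \
    -e NB_WORKERS=1 \
    gcr.io/my-gcp-project/benchmarking-nlp-inference-model-hardware-optimised
```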
--------------------------------------------------------------------------------
/python-api/model-optimised/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.8.9-slim-buster

WORKDIR /app
COPY requirements.txt requirements.txt
COPY main.py main.py

RUN pip install -r requirements.txt

EXPOSE 8000
ENV GUNICORN_CMD_ARGS="--bind=0.0.0.0:8000 -k uvicorn.workers.UvicornWorker --timeout 1000"

ENTRYPOINT exec gunicorn main:app --workers $NB_WORKERS

--------------------------------------------------------------------------------
/python-api/model-optimised/main.py:
--------------------------------------------------------------------------------
from transformers import BertTokenizer, DistilBertTokenizer, BertForSequenceClassification, DistilBertForSequenceClassification
from typing import List, Optional
import torch
from fastapi import FastAPI
import os
import transformers
from pydantic import BaseModel

transformers.logging.set_verbosity_error()


class ModelInference:
    def __init__(self, args):
        # Set the number of threads to 1 per worker for better parallelisation across workers
        torch.set_num_threads(1)
        torch.set_grad_enabled(False)

        if args['model'] == 'Bert':
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        elif args['model'] == 'DistilBert':
            self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
            model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        else:
            raise ValueError('Not a valid model name, enter one of [Bert, DistilBert]')

        if args['quantize']:
            # Dynamically quantize the linear layers to int8
            model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

        self.model = model

    def predict(self, message: str) -> List[List[float]]:
        with torch.no_grad():
            inputs = self.tokenizer(message, return_tensors="pt")
            labels = torch.tensor([1]).unsqueeze(0)
            outputs = self.model(**inputs, labels=labels)
            res = outputs.logits.numpy().tolist()

        return res


class SimpleMessage(BaseModel):
    text: Optional[str] = 'test'


# Parameters are passed using environment variables
args = {
    'model': os.environ['MODEL_NAME'],
    'quantize': os.environ['QUANTIZE_MODEL'] == 'true'
}
model_class = ModelInference(args)

app = FastAPI()


@app.get("/")
def run_test_prediction():
    prediction = model_class.predict('This is a test message, how awesome !')
    return {'prediction': prediction}


@app.post("/prediction")
def run_prediction(message: SimpleMessage):
    prediction = model_class.predict(message.text)
    return {'prediction': prediction}


@app.get("/health_check")
async def run_health_check():
    return {'res': True}

--------------------------------------------------------------------------------
/python-api/model-optimised/requirements.txt:
--------------------------------------------------------------------------------
flask
torch
transformers
pydantic
fastapi
numpy
gunicorn
uvicorn
httptools
uvloop
--------------------------------------------------------------------------------
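To exercise the model and quantization options locally, something like the following should work, assuming the image built by create_docker_images.sh has been pulled (placeholder project ID). MODEL_NAME and QUANTIZE_MODEL are read by main.py at startup, and NB_WORKERS by the gunicorn entrypoint:
```bash
docker run --rm -p 8000:8000 \
    -e NB_WORKERS=2 \
    -e MODEL_NAME=DistilBert \
    -e QUANTIZE_MODEL=true \
    gcr.io/my-gcp-project/benchmarking-nlp-inference-model-optimised
```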