├── assets ├── architecture.png └── distilbert-diff.png ├── examples ├── requirements.txt ├── stable_diffusion.py ├── hello_world.py ├── stable_diffusion_unet.py ├── stable_diffusion_2_1.py ├── resnet.py ├── distilbert.py ├── t5.py ├── minilm.py ├── gpt_neo_125m.py ├── whisper.py ├── README.md └── sentence_transformers │ └── sentence_transformer_eval.csv ├── LICENSE └── README.md /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/octoml/octoml-profile/HEAD/assets/architecture.png -------------------------------------------------------------------------------- /assets/distilbert-diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/octoml/octoml-profile/HEAD/assets/distilbert-diff.png -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.27.4 2 | diffusers==0.14.0 3 | sentencepiece==0.1.97 4 | 5 | librosa==0.10.0 6 | datasets==2.7.1 7 | soundfile==0.12.1 8 | -------------------------------------------------------------------------------- /examples/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = "runwayml/stable-diffusion-v1-5" 5 | pipe = StableDiffusionPipeline.from_pretrained(model_id) 6 | backends = ['g4dn.xlarge/torch-eager-cuda[fp16]', 7 | 'g5.xlarge/torch-eager-cuda[fp16]'] 8 | 9 | 10 | @accelerate 11 | def predict(prompt): 12 | steps = 10 13 | images = pipe(prompt, num_inference_steps=steps).images 14 | return images 15 | 16 | 17 | with remote_profile(backends=backends, num_repeats=1): 18 | for i in range(2): 19 | prompt = "A photo of an astronaut riding a horse on mars." 20 | predict(prompt) 21 | -------------------------------------------------------------------------------- /examples/hello_world.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Linear, ReLU, Sequential 4 | from octoml_profile import (accelerate, remote_profile) 5 | 6 | 7 | model = Sequential(Linear(100, 200), ReLU(), Linear(200, 10)) 8 | 9 | 10 | @accelerate 11 | def predict(x: torch.Tensor): 12 | y = model(x) 13 | z = F.softmax(y, dim=-1) 14 | return z 15 | 16 | # Alternatively you can also directly use `accelerate` 17 | # on a model, e.g. `predict = accelerate(model)` which will leave the 18 | # softmax out of remote execution 19 | 20 | 21 | # This will create a session with the default hardware and acceleration options.
22 | with remote_profile(): 23 | for i in range(10): 24 | x = torch.randn(1, 100) 25 | predict(x) 26 | -------------------------------------------------------------------------------- /examples/stable_diffusion_unet.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = "runwayml/stable-diffusion-v1-5" 5 | pipe = StableDiffusionPipeline.from_pretrained(model_id) 6 | backends = ['g4dn.xlarge/torch-eager-cuda[fp16]', 7 | 'g5.xlarge/torch-eager-cuda[fp16]'] 8 | 9 | 10 | pipe.unet = accelerate(pipe.unet) 11 | pipe.vae.decode = accelerate(pipe.vae.decode) 12 | 13 | 14 | def predict(prompt): 15 | steps = 10 16 | images = pipe(prompt, num_inference_steps=steps).images 17 | return images 18 | 19 | 20 | with remote_profile(backends=backends, num_repeats=1): 21 | for i in range(2): 22 | prompt = "A photo of an astronaut riding a horse on mars." 23 | predict(prompt) 24 | -------------------------------------------------------------------------------- /examples/stable_diffusion_2_1.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = "stabilityai/stable-diffusion-2-1" 5 | pipe = StableDiffusionPipeline.from_pretrained(model_id) 6 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 7 | backends = ['g4dn.xlarge/torch-eager-cuda[fp16]', 8 | 'g5.xlarge/torch-eager-cuda[fp16]'] 9 | 10 | 11 | @accelerate 12 | def predict(prompt): 13 | steps = 10 14 | images = pipe(prompt, num_inference_steps=steps).images 15 | return images 16 | 17 | 18 | with remote_profile(backends=backends, num_repeats=1): 19 | for i in range(2): 20 | prompt = "A photo of an astronaut riding a horse on mars."
21 | predict(prompt) 22 | -------------------------------------------------------------------------------- /examples/resnet.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoFeatureExtractor, ResNetForImageClassification 3 | from octoml_profile import accelerate, remote_profile 4 | 5 | dataset = load_dataset("huggingface/cats-image") 6 | image = dataset["test"]["image"][0] 7 | model_id = 'microsoft/resnet-50' 8 | feature_extractor = AutoFeatureExtractor.from_pretrained(model_id) 9 | model = ResNetForImageClassification.from_pretrained(model_id) 10 | 11 | inputs = feature_extractor(image, return_tensors="pt") 12 | 13 | 14 | @accelerate 15 | def run_model(inputs): 16 | return model(**inputs) 17 | 18 | 19 | with remote_profile(): 20 | for i in range(3): 21 | result = run_model(inputs) 22 | 23 | 24 | predicted_label = result.logits.argmax(-1).item() 25 | print(model.config.id2label[predicted_label]) 26 | -------------------------------------------------------------------------------- /examples/distilbert.py: -------------------------------------------------------------------------------- 1 | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | 5 | model_id = "distilbert-base-uncased-finetuned-sst-2-english" 6 | tokenizer = DistilBertTokenizer.from_pretrained(model_id) 7 | model = DistilBertForSequenceClassification.from_pretrained(model_id) 8 | 9 | 10 | @accelerate 11 | def predict(input: str): 12 | inputs = tokenizer(input, return_tensors="pt") 13 | logits = model(**inputs).logits 14 | predicted_class_id = logits.argmax().item() 15 | return model.config.id2label[predicted_class_id] 16 | 17 | 18 | with remote_profile(backends=["r6i.large/torch-eager-cpu", 19 | "g4dn.xlarge/torch-eager-cuda", 20 | "g4dn.xlarge/onnxrt-cuda"]): 21 | examples = [ 22 | "Hello, world!", 23 | "Nice to meet you", 24 | "My dog is cute", 25 | ] 26 | for _ in range(3): 27 | for s in examples: 28 | predict(s) 29 | -------------------------------------------------------------------------------- /examples/t5.py: -------------------------------------------------------------------------------- 1 | # This example requires torch nightly 2 | # Recent nightly introduced a regression on onnx export: 3 | # https://github.com/pytorch/pytorch/issues/99788 4 | # recommend nightly version torch<=2.1.0.dev20230327 for this example 5 | # Please run `pip install -r requirements.txt` 6 | from transformers import T5Tokenizer, T5ForConditionalGeneration 7 | from octoml_profile import accelerate, remote_profile 8 | 9 | model_id = "google/flan-t5-small" 10 | tokenizer = T5Tokenizer.from_pretrained(model_id) 11 | model = T5ForConditionalGeneration.from_pretrained(model_id) 12 | 13 | input_text = "A step by step recipe to make bolognese pasta:" 14 | 15 | 16 | @accelerate(dynamic=True) 17 | def generate(input_text): 18 | input_ids = tokenizer(input_text, return_tensors="pt").input_ids 19 | outputs = model.generate(input_ids) 20 | return tokenizer.decode(outputs[0]) 21 | 22 | 23 | with remote_profile(backends=['g4dn.xlarge/onnxrt-cuda', 'r6i.large/onnxrt-cpu'], 24 | num_repeats=1): 25 | for i in range(2): 26 | result = generate(input_text) 27 | -------------------------------------------------------------------------------- /examples/minilm.py: -------------------------------------------------------------------------------- 1 | from transformers 
import AutoTokenizer, BertForSequenceClassification 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = 'philschmid/MiniLM-L6-H384-uncased-sst2' 5 | tokenizer = AutoTokenizer.from_pretrained(model_id) 6 | model = BertForSequenceClassification.from_pretrained(model_id) 7 | 8 | examples = [ 9 | "Hello, world!", 10 | "Nice to meet you", 11 | "Goodbye, world!" 12 | ] 13 | inputs = tokenizer(examples, return_tensors="pt") 14 | 15 | 16 | model = accelerate(model) 17 | 18 | 19 | with remote_profile(backends=["r6i.large/onnxrt-cpu", 20 | "r6i.large/torch-eager-cpu", 21 | "r7g.large/onnxrt-cpu", 22 | "g4dn.xlarge/onnxrt-cuda", 23 | "g4dn.xlarge/onnxrt-tensorrt", 24 | "g4dn.xlarge/torch-eager-cuda", 25 | "g4dn.xlarge/torch-inductor-cuda", 26 | "g5.xlarge/torch-eager-cuda"]): 27 | for i in range(3): 28 | result = model(**inputs) 29 | 30 | print(result.logits) 31 | -------------------------------------------------------------------------------- /examples/gpt_neo_125m.py: -------------------------------------------------------------------------------- 1 | # This example requires torch nightly (see README.md for recommended version) 2 | 3 | from transformers import GPTNeoForCausalLM, GPT2Tokenizer 4 | from octoml_profile import accelerate, remote_profile 5 | 6 | model_id = "EleutherAI/gpt-neo-125M" 7 | model = GPTNeoForCausalLM.from_pretrained(model_id) 8 | tokenizer = GPT2Tokenizer.from_pretrained(model_id) 9 | 10 | prompt = ( 11 | "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " 12 | "previously unexplored valley, in the Andes Mountains. Even more surprising to the " 13 | "researchers was the fact that the unicorns spoke perfect English." 14 | ) 15 | 16 | 17 | @accelerate(dynamic=True) 18 | def predict(prompt): 19 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids 20 | gen_tokens = model.generate( 21 | input_ids, 22 | do_sample=True, 23 | temperature=0.9, 24 | max_length=100, 25 | ) 26 | return tokenizer.batch_decode(gen_tokens)[0] 27 | 28 | 29 | with remote_profile(backends=["g5.xlarge/onnxrt-cuda"], num_repeats=1): 30 | for i in range(3): 31 | predict(prompt) 32 | -------------------------------------------------------------------------------- /examples/whisper.py: -------------------------------------------------------------------------------- 1 | # This example requires torch nightly (see README.md for recommended version) 2 | # It further requires `pip install datasets soundfile librosa` 3 | # Please run `pip install -r requirements.txt` 4 | from datasets import load_dataset 5 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 6 | from octoml_profile import remote_profile, accelerate 7 | 8 | # 9 | # load model and processor 10 | processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") 11 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") 12 | 13 | # load dummy dataset and read audio files 14 | ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") 15 | sample = ds[0]["audio"] 16 | 17 | 18 | @accelerate(dynamic=True) 19 | def predict(sample): 20 | input_features = processor(sample["array"], 21 | sampling_rate=sample["sampling_rate"], 22 | return_tensors="pt").input_features 23 | # generate token ids 24 | predicted_ids = model.generate(input_features) 25 | # decode token ids to text 26 | transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) 27 | return transcription 28 | 29 | 30 | with 
remote_profile(backends=["g4dn.xlarge/onnxrt-cuda", "r6i.large/onnxrt-cpu"], 31 | num_repeats=1): 32 | for _ in range(3): 33 | text = predict(sample) 34 | print(text) 35 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Examples 2 | For a slightly more complex example than the [simple tutorial example](../README.md#installation-and-getting-started), 3 | we can take [the DistilBERT model from 4 | HuggingFace](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english), where we modify a 5 | couple of lines of the example code (`pip install transformers==4.27.4` is recommended). 6 | 7 | To run other examples in this directory, please run `pip install -r requirements.txt` and pay 8 | attention to the examples that require nightly torch. You can find the recommended nightly 9 | torch version [here](../README.md#dynamic-shapes). 10 | 11 | ```python 12 | import torch 13 | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 14 | from octoml_profile import accelerate, remote_profile 15 | 16 | model_id = "distilbert-base-uncased-finetuned-sst-2-english" 17 | tokenizer = DistilBertTokenizer.from_pretrained(model_id) 18 | model = DistilBertForSequenceClassification.from_pretrained(model_id) 19 | 20 | @accelerate 21 | def predict(input: str): 22 | inputs = tokenizer(input, return_tensors="pt") 23 | logits = model(**inputs).logits 24 | predicted_class_id = logits.argmax().item() 25 | return model.config.id2label[predicted_class_id] 26 | 27 | with remote_profile(backends=["r6i.large/onnxrt-cpu", "g5.xlarge/onnxrt-cuda"]): 28 | examples = [ 29 | "Hello, world!", 30 | "Nice to meet you", 31 | "My dog is cute", 32 | ] 33 | for _ in range(3): 34 | for s in examples: 35 | predict(s) 36 | ``` 37 | And now we can easily run this model on a variety of hardware and understand the 38 | performance implications, all without having to worry about provisioning cloud 39 | instances, configuring software or deploying our code. 40 | 41 | You can use octoml-profile directly within your application - whether it be a REST 42 | API, CLI application or anything else - with your own data and tests. 43 | 44 | 45 | ### Dynamic models 46 | 47 | We've enabled dynamic graph capture with `@accelerate(dynamic=True)`. See the 48 | generative model examples [t5.py](t5.py), [gpt_neo_125m.py](gpt_neo_125m.py) and 49 | [whisper.py](whisper.py).
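For reference, here is a condensed sketch of the pattern those examples share, based on [t5.py](t5.py); see that file for the full version and its torch nightly requirement.

```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
from octoml_profile import accelerate, remote_profile

model_id = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)

# `dynamic=True` keeps graph capture shape-agnostic, so generation does not
# recompile a new graph for every sequence length.
@accelerate(dynamic=True)
def generate(text: str) -> str:
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    return tokenizer.decode(model.generate(input_ids)[0])

with remote_profile(backends=["g4dn.xlarge/onnxrt-cuda", "r6i.large/onnxrt-cpu"],
                    num_repeats=1):
    generate("A step by step recipe to make bolognese pasta:")
```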
50 | -------------------------------------------------------------------------------- /examples/sentence_transformers/sentence_transformer_eval.csv: -------------------------------------------------------------------------------- 1 | ,model,backend,time_ms,cost_per_mreq,batch_size 2 | 0,all-MiniLM-L12-v2,r6i.large/onnxrt-cpu,17.8209513,0.6237332955,1 3 | 1,all-MiniLM-L12-v2,r6i.large/torch-eager-cpu,28.495958399999996,0.9973585439999999,1 4 | 2,all-MiniLM-L12-v2,r7g.large/onnxrt-cpu,21.257490899999997,0.13581174741666666,1 5 | 3,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-cuda,11.8977955,1.7384001202777777,1 6 | 4,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-tensorrt,12.402284,1.8121114955555555,1 7 | 5,all-MiniLM-L12-v2,g4dn.xlarge/torch-eager-cuda,18.3080092,2.6750035664444445,1 8 | 6,all-MiniLM-L12-v2,g4dn.xlarge/torch-inductor-cuda,15.7441459,2.300394650944445,1 9 | 7,all-MiniLM-L6-v2,r6i.large/onnxrt-cpu,8.6542595,0.3028990825,1 10 | 8,all-MiniLM-L6-v2,r6i.large/torch-eager-cpu,13.2147832,0.462517412,1 11 | 9,all-MiniLM-L6-v2,r7g.large/onnxrt-cpu,10.397357999999997,0.06642756499999998,1 12 | 10,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-cuda,5.9874456,0.8748323293333332,1 13 | 11,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-tensorrt,6.3256254,0.9242441556666666,1 14 | 12,all-MiniLM-L6-v2,g4dn.xlarge/torch-eager-cuda,9.4387526,1.3791066298888892,1 15 | 13,all-MiniLM-L6-v2,g4dn.xlarge/torch-inductor-cuda,7.867855199999999,1.1495810653333332,1 16 | 14,all-distilroberta-v1,r6i.large/onnxrt-cpu,20.779371999999995,0.7272780199999999,1 17 | 15,all-distilroberta-v1,r6i.large/torch-eager-cpu,38.31659840000001,1.3410809440000004,1 18 | 16,all-distilroberta-v1,r7g.large/onnxrt-cpu,27.108677599999996,0.1731943291111111,1 19 | 17,all-distilroberta-v1,g4dn.xlarge/onnxrt-cuda,7.0399018,1.028607874111111,1 20 | 18,all-distilroberta-v1,g4dn.xlarge/onnxrt-tensorrt,7.6713276,1.1208661993333333,1 21 | 19,all-distilroberta-v1,g4dn.xlarge/torch-eager-cuda,11.2453003,1.6430633216111112,1 22 | 20,all-distilroberta-v1,g4dn.xlarge/torch-inductor-cuda,9.053695099999999,1.3228454507222223,1 23 | 21,paraphrase-albert-small-v2,r6i.large/onnxrt-cpu,19.086343899999996,0.6680220364999998,1 24 | 22,paraphrase-albert-small-v2,r6i.large/torch-eager-cpu,29.197723699999997,1.0219203295,1 25 | 23,paraphrase-albert-small-v2,r7g.large/onnxrt-cpu,28.6267206,0.18289293716666666,1 26 | 24,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-cuda,5.315114,0.7765972122222223,1 27 | 25,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-tensorrt,6.0049864,0.8773952351111112,1 28 | 26,paraphrase-albert-small-v2,g4dn.xlarge/torch-eager-cuda,9.651154,1.4101408344444444,1 29 | 27,paraphrase-albert-small-v2,g4dn.xlarge/torch-inductor-cuda,7.032197,1.0274821172222224,1 30 | 28,paraphrase-MiniLM-L3-v2,r6i.large/onnxrt-cpu,5.3824666,0.188386331,1 31 | 29,paraphrase-MiniLM-L3-v2,r6i.large/torch-eager-cpu,7.807972800000001,0.273279048,1 32 | 30,paraphrase-MiniLM-L3-v2,r7g.large/onnxrt-cpu,6.6782705,0.04266672819444444,1 33 | 31,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-cuda,4.7531763,0.6944918705,1 34 | 32,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-tensorrt,5.019084299999999,0.7333439838333334,1 35 | 33,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-eager-cuda,6.8465889,1.0003627115000002,1 36 | 34,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-inductor-cuda,5.8367854,0.8528192001111112,1 37 | 35,all-MiniLM-L12-v2,r6i.large/onnxrt-cpu,1254.0356565,43.8912479775,256 38 | 36,all-MiniLM-L12-v2,r6i.large/torch-eager-cpu,1209.5858773,42.3355057055,256 39 | 
37,all-MiniLM-L12-v2,r7g.large/onnxrt-cpu,3336.4584253999997,21.316262162277773,256 40 | 38,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-cuda,98.84057679999998,14.44170649911111,256 41 | 39,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-tensorrt,209.7131162,30.641416422555555,256 42 | 40,all-MiniLM-L12-v2,g4dn.xlarge/torch-eager-cuda,134.9395834,19.716172463444448,256 43 | 41,all-MiniLM-L12-v2,g4dn.xlarge/torch-inductor-cuda,117.63634059999998,17.187976432111107,256 44 | 42,all-MiniLM-L6-v2,r6i.large/onnxrt-cpu,611.8071633,21.4132507155,256 45 | 43,all-MiniLM-L6-v2,r6i.large/torch-eager-cpu,661.1407737000001,23.1399270795,256 46 | 44,all-MiniLM-L6-v2,r7g.large/onnxrt-cpu,1696.5189904000003,10.838871327555557,256 47 | 45,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-cuda,77.6411086,11.344228645444444,256 48 | 46,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-tensorrt,130.4229707,19.056245163388887,256 49 | 47,all-MiniLM-L6-v2,g4dn.xlarge/torch-eager-cuda,99.27137689999999,14.504651180388889,256 50 | 48,all-MiniLM-L6-v2,g4dn.xlarge/torch-inductor-cuda,86.3636818,12.618693507444446,256 51 | 49,all-distilroberta-v1,r6i.large/onnxrt-cpu,2221.6413357,77.7574467495,256 52 | 50,all-distilroberta-v1,r6i.large/torch-eager-cpu,2149.491522200001,75.23220327700002,256 53 | 51,all-distilroberta-v1,r7g.large/onnxrt-cpu,6580.753075699999,42.04370020586111,256 54 | 52,all-distilroberta-v1,g4dn.xlarge/onnxrt-cuda,114.65186839999998,16.751911882888887,256 55 | 53,all-distilroberta-v1,g4dn.xlarge/onnxrt-tensorrt,200.23376609999997,29.25637804683333,256 56 | 54,all-distilroberta-v1,g4dn.xlarge/torch-eager-cuda,111.4591273,16.28541693327778,256 57 | 55,all-distilroberta-v1,g4dn.xlarge/torch-inductor-cuda,108.91961779999998,15.914366378555554,256 58 | 56,paraphrase-albert-small-v2,r6i.large/onnxrt-cpu,2122.1424868999998,74.27498704149998,256 59 | 57,paraphrase-albert-small-v2,r6i.large/torch-eager-cpu,2223.4401246999996,77.82040436449999,256 60 | 58,paraphrase-albert-small-v2,r7g.large/onnxrt-cpu,6165.979900800002,39.393760477333345,256 61 | 59,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-cuda,103.72082550000003,15.154765059166671,256 62 | 60,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-tensorrt,183.57358340000002,26.822140241222222,256 63 | 61,paraphrase-albert-small-v2,g4dn.xlarge/torch-eager-cuda,121.76579240000001,17.79133522288889,256 64 | 62,paraphrase-albert-small-v2,g4dn.xlarge/torch-inductor-cuda,118.08781490000003,17.253941843722227,256 65 | 63,paraphrase-MiniLM-L3-v2,r6i.large/onnxrt-cpu,352.9728934,12.354051269,256 66 | 64,paraphrase-MiniLM-L3-v2,r6i.large/torch-eager-cpu,353.5955084,12.375842794000002,256 67 | 65,paraphrase-MiniLM-L3-v2,r7g.large/onnxrt-cpu,872.6846742999999,5.575485419138888,256 68 | 66,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-cuda,60.5438919,8.8461353165,256 69 | 67,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-tensorrt,89.4471107,13.069216730055556,256 70 | 68,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-eager-cuda,73.7146786,10.770533595444444,256 71 | 69,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-inductor-cuda,62.96103779999999,9.199307189666666,256 72 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## octoml-profile 2 | 3 | *octoml-profile* is a python library and cloud service that enables ML 4 | engineers to easily assess the performance and cost of PyTorch models on cloud 5 | hardware with state-of-the-art ML acceleration technology. 6 | 7 | Whether you're building machine learning models for research, development, or 8 | production, benchmarking your AI applications is a necessary step before 9 | deployment. An optimally chosen hardware + runtime deployment strategy can 10 | reduce cloud costs by more than 10x over default solutions. 11 | 12 | With *octoml-profile*, within minutes, you can measure the performance and 13 | cost of your PyTorch models on different cloud hardware. Our ML 14 | acceleration technology ensures that you get the most accurate and efficient 15 | results, so you can make informed decisions about how to optimize your AI 16 | applications. 17 | 18 | *Note: this tool is not designed for profiling individual PyTorch 19 | ops on the local machine. 
Please use `torch.profiler` for such purpose.* 20 | 21 | 22 | ### Key Features 23 | - 🔧 Magic remote execution with only a few additional lines of code 24 | - 💻 Runs on local development environment without any GPU requirement 25 | - 💪 Absolves tedious tasks such as model export, hardware provisioning, and dependency preparation 26 | - 🚀 Provides performance and cost insights within seconds (or minutes for larger models) 27 | - ⚙️ Supports diverse hardware and state-of-the-art software backends 28 | - 🌟 Supports the latest generative AI models with dynamic shapes 29 | - 📊 Uses the same data and workflow as your training and experiment tracking 30 | 31 | 32 | ### Limitation 33 | - Only supports inference workload 34 | 35 | 36 | ### Demos 37 | - [SentenceTransformers](examples/sentence_transformers/SentenceTransformerEval.ipynb) 38 | - [Stable Diffusion](examples/stable_diffusion.py) 39 | - [GPTNeo](examples/gpt_neo_125m.py) 40 | - [Whisper](examples/whisper.py) 41 | - [T5](examples/t5.py) 42 | 43 | 44 | ### Latest 45 | - [04-25-2023] Client update `v0.2.2` with enhanced terminal output and more examples 46 | - [03-22-2023] Initial release of `v0.2.0` 47 | 48 | 49 | ### Documentation quick links 50 | * [Installation and Getting Started](#installation-and-getting-started) 51 | * [Dynamic shapes](#dynamic-shapes) 52 | * [How it works](#how-it-works) 53 | * [Data privacy](#data-privacy) 54 | * [Known issues](#known-issues) 55 | * [Contact the team](#contact-the-team) 56 | 57 | ### "Hello World" example 58 | 59 | Let's say you have a PyTorch model that performs sentiment analysis using a 60 | DistilBert model, and you want to optimize it for cloud deployment. With 61 | octoml-profile, you can easily benchmark the predict function on various cloud 62 | hardware and use different acceleration techniques to find the optimal 63 | deployment strategy. 64 | 65 | ![Distilbert Example](assets/distilbert-diff.png) 66 | 67 | Within a few seconds, you will find the runtime and cost that help you pick the 68 | optimal hardware and inference engine for deployment. 69 | 70 | ``` 71 | Function `predict` has 1 profile: 72 | - Profile `predict[1/1]` ran 3 times. (1 discarded because compilation happened) 73 | 74 | Instance Processor Backend Backend Time (ms) Total Time (ms) Cost ($/MReq) 75 | ======================================================================================================= 76 | r6i.large Intel Ice Lake CPU torch-eager-cpu 24.735 52.009 $1.82 77 | g4dn.xlarge Nvidia T4 GPU torch-eager-cuda 5.336 32.610 $4.76 78 | g4dn.xlarge Nvidia T4 GPU torch-inductor-cuda 3.249 30.523 $4.46 79 | g4dn.xlarge Nvidia T4 GPU onnxrt-cuda 1.399 28.673 $4.19 80 | ------------------------------------------------------------------------------------------------------- 81 | Total time above is `remote backend time + local python code time`, 82 | in which local python code run time is 27.274 ms. 83 | Graph level profile is located at /tmp/octoml_profile_n603dewx/0/predict_1* 84 | ``` 85 | 86 | ## Installation and Getting Started 87 | - Create and activate a python virtual environment. `Python 3.8` is recommended 88 | and tested on both `Ubuntu` and `macOS`. `Python 3.10.9` is tested on `macOS` 89 | with Apple silicon. 90 | 91 | ``` 92 | python3 -m venv env 93 | source env/bin/activate 94 | ``` 95 | - Install dependencies 96 | 97 | PyTorch 2.0 and above is required. Below we install the cpu version for 98 | simplicity; CUDA version works too. 
99 | 100 | ``` 101 | pip install --upgrade pip 102 | pip install "torch>=2.0.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu 103 | pip install "octoml-profile>=0.2.0" 104 | ``` 105 | 106 | You've completed installation! (If you have trouble, see [issues with installation](#issues-with-installation)) 107 | 108 | - Next, try running this very simple example that shows how to integrate octoml-profile into your model code. 109 | 110 | ```python 111 | import torch 112 | from torch.nn import Linear, ReLU, Sequential 113 | from octoml_profile import accelerate, remote_profile 114 | 115 | model = Sequential(Linear(100, 200), ReLU()) 116 | 117 | @accelerate 118 | def predict(x: torch.Tensor): 119 | return model(x) 120 | 121 | with remote_profile(): 122 | for _ in range(3): 123 | x = torch.randn(1, 100) 124 | predict(x) 125 | ``` 126 | 127 | - The first time you run this, you'll be prompted to supply your API key. 128 | 129 | ``` 130 | ,-""-. 131 | / \ Welcome to OctoML Profiler! 132 | : ; 133 | \ / It looks like you don't have an access token configured. 134 | `. .' Please go to https://profiler.app.octoml.ai/ to generate one 135 | '._.'`._.' and then paste it here. 136 | 137 | Access token: 138 | ``` 139 | (Sign up so that you can generate an API token when prompted) 140 | 141 | - Once you've provided credentials, running this results in the following 142 | output that shows times of the function being executed remotely on each 143 | backend. 144 | 145 | ``` 146 | Function `predict` has 1 profile: 147 | - Profile `predict[1/1]` ran 3 times. (1 discarded because compilation happened) 148 | 149 | Instance Processor Backend Backend Time (ms) Total Time (ms) Cost ($/MReq) 150 | ======================================================================================================= 151 | r6i.large Intel Ice Lake CPU torch-eager-cpu 0.024 0.086 $0.00 152 | g4dn.xlarge Nvidia T4 GPU torch-eager-cuda 0.097 0.159 $0.02 153 | g4dn.xlarge Nvidia T4 GPU torch-inductor-cuda 0.177 0.239 $0.03 154 | ------------------------------------------------------------------------------------------------------- 155 | Total time above is `remote backend time + local python code time`, 156 | in which local python code run time is 0.062 ms. 157 | Graph level profile is located at /tmp/octoml_profile_8o45fe39/0/predict_1* 158 | ``` 159 | To see more examples, see [examples/](examples). 160 | 161 | 162 | ### Issues with installation 163 | 164 | - If you are on macOS with Apple silicon and seeing `symbol not found in flat 165 | namespace '_CFRelease'`, it is likely that you created a `venv` with python 166 | installed by `conda`. Please make sure to deactivate any `conda` 167 | environment(s) and use the system-shipped python on macOS to create `venv`. 168 | Or follow the instructions below to create a conda environment. 169 | 170 | ``` 171 | conda create -n octoml python=3.8 172 | conda activate octoml 173 | ``` 174 | 175 | - If you see a version conflict, please install the pip dependencies above with `--force-reinstall`. 176 | 177 | - For any other problems, please file a github issue. 178 | 179 | 180 | ### Dynamic shapes 181 | 182 | This is an experimental feature that requires installing nightly of PyTorch. 183 | The dynamic shape graph capture feature is still under active development by 184 | the PyTorch team, so your results may vary. If you find any problems, please 185 | report via github issue. 
186 | 187 | ``` 188 | pip install --pre torch==2.1.0.dev20230416 torchaudio==2.1.0.dev20230416 torchvision==0.16.0.dev20230416 --index-url https://download.pytorch.org/whl/nightly/cpu 189 | ``` 190 | 191 | By default, the `@accelerate` decorator will recompile a new graph if the input 192 | shapes to the graph change. For generative model cases such as text 193 | generation, it is inefficient to have to compile a separate graph for each 194 | sequence length. The solution is to turn on "dynamic-shapes" for the compiler, 195 | which means the graph compilation will be agnostic to the input shapes, 196 | resulting in drastically fewer graphs to compile and lower memory usage end 197 | to end. 198 | 199 | As a toy example: 200 | 201 | ```python 202 | import torch 203 | from octoml_profile import accelerate, remote_profile 204 | 205 | conv = torch.nn.Conv2d(16, 16, 3) 206 | 207 | # With `dynamic=True` any model inside will not be specialized to the input shape 208 | @accelerate(dynamic=True) 209 | def predict(x: torch.Tensor): 210 | return conv(x) 211 | 212 | with remote_profile(backends=["r6i.large/onnxrt-cpu"]): 213 | # batch size is different but compilation only 214 | # happens once 215 | for i in range(1, 5): 216 | predict(torch.randn(i, 16, 10, 10)) 217 | ``` 218 | 219 | Set `@accelerate(dynamic=True)` on any `accelerate` usage. 220 | 221 | 222 | ## How it works 223 | 224 | * [How octoml-profile works](#how-octoml-profile-works) 225 | * [Where `@accelerate` should be applied](#where-accelerate-should-be-applied) 226 | * [The profile report](#the-profile-report) 227 | * [Local python segments](#local-python-segments) 228 | * [Quota](#quota) 229 | * [Supported backends](#supported-backends) 230 | 231 | ### How octoml-profile works 232 | 233 | octoml-profile consists of two main components: a Python library and a cloud 234 | service. The Python library is used to automatically extract PyTorch models on 235 | your local machine and send them to the cloud service for remote benchmarking. 236 | The cloud service provides access to different cloud hardware targets that are 237 | prepared with various deep learning inference engines. This enables users to 238 | optimize and measure their PyTorch models in a variety of deployment 239 | configurations. 240 | 241 | ![Architecture Illustration](assets/architecture.png) 242 | 243 | In the various examples above, we first import the `octoml_profile` Python library, and then 244 | decorate a `predict` function with the `@accelerate` decorator. By default, it 245 | behaves like `@torch.compile`: 246 | [TorchDynamo](https://pytorch.org/docs/stable/dynamo/index.html) is used to 247 | extract one or more computation graphs, optimize them, and replace the bytecode 248 | inside the function with the optimized version. 249 | 250 | When the code is surrounded with the `remote_profile()` context manager, the 251 | behavior of the `@accelerate` decorator changes. Instead of running the 252 | extracted graphs on the local machine, the graphs are sent to one or more 253 | remote inference workers for execution and measurement. The run time of the 254 | offloaded graphs is referred to as "remote backend run time" in the output 255 | above. 256 | 257 | Code that cannot be captured as a computation graph is not offloaded -- such code 258 | runs locally and is shown as "local python". For more details see 259 | the [local python code section](#local-python-segments) below.
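As a quick preview, here is the DistilBert `predict` function from the example above, annotated (annotations ours) with which parts are captured and offloaded and which parts stay local; the profile report in the next section shows the same breakdown as measured segments.

```python
@accelerate
def predict(input: str):
    # Tokenization is not a tensor computation graph, so it runs
    # locally and shows up as a "local python" segment.
    inputs = tokenizer(input, return_tensors="pt")
    # The model forward pass is captured by TorchDynamo as a subgraph
    # and offloaded to the remote backends for measurement.
    logits = model(**inputs).logits
    # `.item()` and the label lookup fall outside the captured graph,
    # so they run locally as another "local python" segment.
    predicted_class_id = logits.argmax().item()
    return model.config.id2label[predicted_class_id]
```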
260 | 261 | When the `remote_profile()` context manager is entered, it reserves 262 | exclusive access to the hardware specified in the optional `backends` keyword argument 263 | (or to a set of default hardware targets if the argument is omitted). 264 | If there are multiple backends, they will run in parallel. 265 | 266 | The `predict` function may contain pre/post processing code, non-tensor logic 267 | like control flow, side effects, and multiple models. Only eligible graphs 268 | will be intelligently extracted and offloaded for remote execution. 269 | 270 | As a result, the estimated end to end run time of the decorated function for a 271 | particular hardware and acceleration engine is `remote backend run 272 | time + local python run time`. If `local python run time` is much smaller 273 | compared to the total time, the estimate is fairly accurate because the 274 | impact of any difference between the local and remote machines on the local python 275 | code is minimal. 276 | 277 | 278 | ### Where to apply `@accelerate` 279 | In general, `@accelerate` is a drop-in replacement for `@torch.compile` and 280 | should be applied to a function that contains a PyTorch model performing 281 | inference. When the function is called under the context manager of `with 282 | remote_profile()`, remote execution and profiling are activated. When called 283 | without `remote_profile()` it behaves just like `torch.compile`. By default, 284 | `torch.no_grad()` is set in the `remote_profile()` context, because remote 285 | execution does not support training yet. 286 | 287 | If you expect the input shape to change, especially for generative models, 288 | see [Dynamic Shapes](#dynamic-shapes). 289 | 290 | Last but not least, `@accelerate` should not be used to decorate a function 291 | that has already been decorated with `@accelerate` or `@torch.compile`. 292 | 293 | ### The profile report 294 | 295 | Most users should be satisfied with the output of just the total run time and cost 296 | of using different hardware and software backends. 297 | However, advanced users like ML compiler engineers may be interested in diving 298 | into graph level performance analysis or reducing the number of graph breaks. This 299 | section shows you where to find the next level of detail. 300 | 301 | The location of the `Profile` report is printed at the end of the total runtime 302 | table, for instance: 303 | ``` 304 | Graph level profile is located at /tmp/octoml_profile_8o45fe39/0/predict_1* 305 | ``` 306 | 307 | For each decorated function, the profile report is suffixed with `.profile.txt`, 308 | i.e. `/tmp/octoml_profile_8o45fe39/0/predict_1.profile.txt` is for the function `predict`. 309 | 310 | 311 | An example report for DistilBert is the following: 312 | ``` 313 | Segment Samples Avg ms Failures 314 | =============================================================== 315 | 0 Local python 2 27.227 316 | 317 | 1 Graph #1 318 | r6i.large/torch-eager-cpu 20 25.648 0 319 | g4dn.xlarge/torch-eager-cuda 20 5.381 0 320 | g4dn.xlarge/torch-inductor-cuda 20 3.208 0 321 | g4dn.xlarge/onnxrt-cuda 20 1.400 0 322 | 323 | 2 Local python 2 0.110 324 | --------------------------------------------------------------- 325 | ``` 326 | 327 | To understand this report, let's first define some terminology. 328 | 329 | Terminology: 330 | - `function` is a python function decorated with `@accelerate` to be profiled. 331 | - `run` is one execution of the function.
332 | - `subgraph` is a computation graph of tensor operations auto captured by TorchDynamo as 333 | you run the function. A subgraph is a logical portion of the function. 334 | - `call` is one execution of a subgraph. A segment in a profile is a result of a call. 335 | - `repeats` is the number of times a graph is measured on a remote backend for each call. 336 | - `samples` is the total number an execution of a subgraph or local python segment is measured. 337 | For each subgraph, `samples = repeats * call`. 338 | 339 | On function, subgraph, and profile: 340 | - A single run of `predict` can have more than one subgraphs due to graph breaks. 341 | - Runs with different arguments may produce different subgraphs because the 342 | computation may change. Runs that have the same sequence of graph execution 343 | are merged into a "profile". For example, if `f(x1)`, `f(x2)` 344 | runs graph "1,2,3", and `f(x3)` runs graph "1,3,4", the segments of `f(x1)` and 345 | `f(x2)` will be merged into one profile and `f(x3)` will be its own profile. 346 | 347 | To find out what operations each graph contains and where in the source code 348 | does each graph come from, take a look at `graph_dump.txt` under the same 349 | directory. 350 | 351 | Like the DistilBert example above, when there are only a few segments, a profile 352 | will show the linear sequence of subgraph segment runs. 353 | 354 | What happens when there are tens or hundreds of segments? When too many segments 355 | are run, we collapse the linear segment sequence into an abridged summary where 356 | only a few subgraphs that have the highest aggregate run times across their run 357 | segments are shown. 358 | 359 | For example, a generative encoder-decoder based model that produces a large number of 360 | run segments will display an abridged report displaying 361 | **runtime by subgraph** instead of **runtime by segment** by default. 362 | In cases like this, you'll see: 363 | 364 | ``` 365 | Top subgraph Avg ms/call Avg ms/run Runtime % of e2e Failures 366 | ============================================================================== 367 | Graph #7 (17 calls) 368 | r6i.large/onnxrt-cpu 36.979 628.645 70.2 0 369 | g4dn.xlarge/onnxrt-cuda 5.612 95.403 37.9 0 370 | 371 | Graph #4 (1 calls) 372 | r6i.large/onnxrt-cpu 43.823 43.823 4.9 0 373 | g4dn.xlarge/onnxrt-cuda 5.002 5.002 2.0 0 374 | 375 | Graph #2 (1 calls) 376 | r6i.large/onnxrt-cpu 43.357 43.357 4.8 0 377 | g4dn.xlarge/onnxrt-cuda 3.154 3.154 1.3 0 378 | 379 | 4 other subgraphs 380 | r6i.large/onnxrt-cpu 35.892 4.0 0 381 | g4dn.xlarge/onnxrt-cuda 4.833 1.9 0 382 | 383 | 42 local python segments 143.621 384 | ------------------------------------------------------------------------------ 385 | ``` 386 | 387 | Other graphs are hidden. If your output has been abridged in this way 388 | but you want to see the full, sequential results of your profiling run, you can print 389 | the report with `verbose`: 390 | 391 | ```python 392 | # print_results_to=None silences the default output profile report. 393 | with remote_profile(print_results_to=None) as prof: 394 | ... 395 | prof.report().print(verbose=True) 396 | ``` 397 | 398 | ### Local python segments 399 | 400 | You will see `Local python` in the profiling report because TorchDynamo only 401 | captures PyTorch code as graph. Where it cannot continue capturing, a "graph 402 | break" happens and the uncaptured code runs in python eagerly. 
403 | 404 | Let's look at the DistilBert example again: 405 | ```python 406 | @accelerate 407 | def predict(input: str): 408 | inputs = tokenizer(input, return_tensors="pt") 409 | logits = model(**inputs).logits 410 | predicted_class_id = logits.argmax().item() 411 | return model.config.id2label[predicted_class_id] 412 | ``` 413 | 414 | ``` 415 | Segment Samples Avg ms Failures 416 | =============================================================== 417 | 0 Local python 2 27.227 418 | 419 | 1 Graph #1 420 | r6i.large/torch-eager-cpu 20 25.648 0 421 | g4dn.xlarge/torch-eager-cuda 20 5.381 0 422 | g4dn.xlarge/torch-inductor-cuda 20 3.208 0 423 | g4dn.xlarge/onnxrt-cuda 20 1.400 0 424 | 425 | 2 Local python 2 0.110 426 | --------------------------------------------------------------- 427 | ``` 428 | 429 | Segment 1 runs `Graph #1`, which corresponds to the `model`. 430 | Segment 0 corresponds to the `tokenizer` call, and Segment 2 to the logits and 431 | label mapping. 432 | 433 | Each local Python segment runs locally once for every invocation. The total estimated time 434 | equals the total remote backend time for all the graphs plus the total local python time. 435 | 436 | To print graph breaks and understand more of what TorchDynamo is doing under the hood, see 437 | the [TorchDynamo Deeper Dive](https://pytorch.org/docs/stable/dynamo/deep-dive.html), 438 | [Torch.FX](https://pytorch.org/docs/stable/fx.html) and 439 | [PyTorch 440 | Troubleshooting](https://pytorch.org/docs/master/dynamo/troubleshooting.html#torchdynamo-troubleshooting) 441 | pages. 442 | 443 | 444 | ### Quota 445 | 446 | Each user has a limit on the number of concurrent backends held by the user's sessions, which automatically get created and closed within the `remote_profile()` context manager. 447 | If you find yourself hitting quota limits because you need to run more tasks concurrently, please contact us to increase the limit. 448 | 449 | ### Supported backends 450 | 451 | To programmatically access a list of supported backends, please invoke: 452 | 453 | ```python 454 | import octoml_profile 455 | print(octoml_profile.get_supported_backends()) 456 | ``` 457 | 458 | **Supported Cloud Hardware** 459 | 460 | AWS 461 | - g4dn.xlarge (Nvidia T4 GPU) 462 | - g5.xlarge (Nvidia A10g GPU) 463 | - r6i.large (Intel Xeon IceLake CPU) 464 | - r7g.large (Arm-based Graviton3 CPU) 465 | 466 | **Supported Acceleration Libraries** 467 | 468 | ONNXRuntime 469 | - onnxrt-cpu 470 | - onnxrt-cuda 471 | - onnxrt-tensorrt 472 | 473 | PyTorch 474 | - torch-eager-cpu 475 | - torch-eager-cuda 476 | - torch-inductor-cpu 477 | - torch-inductor-cuda 478 | 479 | If no backends are specified when calling `remote_profile(backends=[...])`, then defaults are used, 480 | which are determined by the server. At the time of writing, the default is 481 | `["r6i.large/torch-eager-cpu", "g4dn.xlarge/torch-eager-cuda", "g4dn.xlarge/torch-inductor-cuda"]`. 482 | 483 | 484 | 485 | ## Data privacy 486 | 487 | We know that keeping your model data private is important to you. We guarantee that no other 488 | user has access to your data. We do not scrape any model information internally, and we do not use 489 | your uploaded data to try to improve our system -- we rely on you filing github issues or otherwise 490 | contacting us to understand your use case, and anything else about your model. 491 | Here's how our system currently works.
492 | 493 | We leverage TorchDynamo's subgraph capture to identify PyTorch-only code, serialize those subgraphs, 494 | and upload them to our system for benchmarking. 495 | 496 | On model upload, we cache your model in AWS S3. This helps with your development iteration speed -- 497 | every subsequent time you want profiling results, you won't have to wait for model re-upload on every 498 | minor tweak to your model or update to your requested backends list. When untouched for four weeks, 499 | any model subgraphs and constants are automatically removed from S3. 500 | 501 | Your model subgraphs are loaded onto our remote workers and are cleaned up on the creation of 502 | every subsequent session. Between your session's closure and another session's startup, 503 | serialized subgraphs may lie around idle. No users can access these subgraphs in this interval. 504 | 505 | If you still have concerns around data privacy, please [contact the team](#contact-the-team). 506 | 507 | ## Known issues 508 | 509 | ### Waiting for Session 510 | When you create a session, you get exclusive access to the requested hardware. 511 | When no hardware is available, new session requests will 512 | be queued. 513 | 514 | ### OOM for large models 515 | When a function contains too many graph breaks or an individual 516 | graph exceeds the memory on the worker, 517 | the remote inference worker may run out of CPU/GPU memory. 518 | When this happens, you may get an "Error on loading model component". 519 | This is known to happen with models like Stable Diffusion. 520 | We are actively working on optimizing the memory allocation of 521 | many subgraphs. 522 | 523 | ### Limitations of TorchDynamo 524 | TorchDynamo is under active development. You may encounter 525 | errors that are TorchDynamo related. 526 | These should not be fundamental problems, as we believe TorchDynamo 527 | will continue to improve its coverage. 528 | If you find a broken model, please [file an issue](https://github.com/octoml/octoml-profile/issues). 529 | 530 | ## Contact the team 531 | - Discord: [OctoML community Discord](https://discord.gg/Quc8hSxpMe) 532 | - GitHub issues: https://github.com/octoml/octoml-profile/issues 533 | - Email: dynamite@octoml.ai 534 | --------------------------------------------------------------------------------