├── assets ├── architecture.png └── distilbert-diff.png ├── examples ├── requirements.txt ├── stable_diffusion.py ├── hello_world.py ├── stable_diffusion_unet.py ├── stable_diffusion_2_1.py ├── resnet.py ├── distilbert.py ├── t5.py ├── minilm.py ├── gpt_neo_125m.py ├── whisper.py ├── README.md └── sentence_transformers │ └── sentence_transformer_eval.csv ├── LICENSE └── README.md /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/octoml/octoml-profile/HEAD/assets/architecture.png -------------------------------------------------------------------------------- /assets/distilbert-diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/octoml/octoml-profile/HEAD/assets/distilbert-diff.png -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.27.4 2 | diffusers==0.14.0 3 | sentencepiece==0.1.97 4 | 5 | librosa==0.10.0 6 | datasets==2.7.1 7 | soundfile==0.12.1 8 | -------------------------------------------------------------------------------- /examples/stable_diffusion.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = "runwayml/stable-diffusion-v1-5" 5 | pipe = StableDiffusionPipeline.from_pretrained(model_id) 6 | backends = ['g4dn.xlarge/torch-eager-cuda[fp16]', 7 | 'g5.xlarge/torch-eager-cuda[fp16]'] 8 | 9 | 10 | @accelerate 11 | def predict(prompt): 12 | steps = 10 13 | images = pipe(prompt, num_inference_steps=steps).images 14 | return images 15 | 16 | 17 | with remote_profile(backends=backends, num_repeats=1): 18 | for i in range(2): 19 | prompt = "A photo of an astronaut riding a horse on mars." 20 | predict(prompt) 21 | -------------------------------------------------------------------------------- /examples/hello_world.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Linear, ReLU, Sequential 4 | from octoml_profile import (accelerate, remote_profile) 5 | 6 | 7 | model = Sequential(Linear(100, 200), ReLU(), Linear(200, 10)) 8 | 9 | 10 | @accelerate 11 | def predict(x: torch.Tensor): 12 | y = model(x) 13 | z = F.softmax(y, dim=-1) 14 | return z 15 | 16 | # Alternatively you can also directly use `accelerate` 17 | # on a model, e.g. `predict = accelerate(model)` which will leave the 18 | # softmax out of remote execution 19 | 20 | 21 | # This will create a session with the default hardware and acceleration options.
22 | with remote_profile(): 23 | for i in range(10): 24 | x = torch.randn(1, 100) 25 | predict(x) 26 | -------------------------------------------------------------------------------- /examples/stable_diffusion_unet.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = "runwayml/stable-diffusion-v1-5" 5 | pipe = StableDiffusionPipeline.from_pretrained(model_id) 6 | backends = ['g4dn.xlarge/torch-eager-cuda[fp16]', 7 | 'g5.xlarge/torch-eager-cuda[fp16]'] 8 | 9 | 10 | pipe.unet = accelerate(pipe.unet) 11 | pipe.vae.decode = accelerate(pipe.vae.decode) 12 | 13 | 14 | def predict(prompt): 15 | steps = 10 16 | images = pipe(prompt, num_inference_steps=steps).images 17 | return images 18 | 19 | 20 | with remote_profile(backends=backends, num_repeats=1): 21 | for i in range(2): 22 | prompt = "A photo of an astronaut riding a horse on mars." 23 | predict(prompt) 24 | -------------------------------------------------------------------------------- /examples/stable_diffusion_2_1.py: -------------------------------------------------------------------------------- 1 | from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = "stabilityai/stable-diffusion-2-1" 5 | pipe = StableDiffusionPipeline.from_pretrained(model_id) 6 | pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) 7 | backends = ['g4dn.xlarge/torch-eager-cuda[fp16]', 8 | 'g5.xlarge/torch-eager-cuda[fp16]'] 9 | 10 | 11 | @accelerate 12 | def predict(prompt): 13 | steps = 10 14 | images = pipe(prompt, num_inference_steps=steps).images 15 | return images 16 | 17 | 18 | with remote_profile(backends=backends, num_repeats=1): 19 | for i in range(2): 20 | prompt = "A photo of an astronaut riding a horse on mars."
21 | predict(prompt) 22 | -------------------------------------------------------------------------------- /examples/resnet.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoFeatureExtractor, ResNetForImageClassification 3 | from octoml_profile import accelerate, remote_profile 4 | 5 | dataset = load_dataset("huggingface/cats-image") 6 | image = dataset["test"]["image"][0] 7 | model_id = 'microsoft/resnet-50' 8 | feature_extractor = AutoFeatureExtractor.from_pretrained(model_id) 9 | model = ResNetForImageClassification.from_pretrained(model_id) 10 | 11 | inputs = feature_extractor(image, return_tensors="pt") 12 | 13 | 14 | @accelerate 15 | def run_model(inputs): 16 | return model(**inputs) 17 | 18 | 19 | with remote_profile(): 20 | for i in range(3): 21 | result = run_model(inputs) 22 | 23 | 24 | predicted_label = result.logits.argmax(-1).item() 25 | print(model.config.id2label[predicted_label]) 26 | -------------------------------------------------------------------------------- /examples/distilbert.py: -------------------------------------------------------------------------------- 1 | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | 5 | model_id = "distilbert-base-uncased-finetuned-sst-2-english" 6 | tokenizer = DistilBertTokenizer.from_pretrained(model_id) 7 | model = DistilBertForSequenceClassification.from_pretrained(model_id) 8 | 9 | 10 | @accelerate 11 | def predict(input: str): 12 | inputs = tokenizer(input, return_tensors="pt") 13 | logits = model(**inputs).logits 14 | predicted_class_id = logits.argmax().item() 15 | return model.config.id2label[predicted_class_id] 16 | 17 | 18 | with remote_profile(backends=["r6i.large/torch-eager-cpu", 19 | "g4dn.xlarge/torch-eager-cuda", 20 | "g4dn.xlarge/onnxrt-cuda"]): 21 | examples = [ 22 | "Hello, world!", 23 | "Nice to meet you", 24 | "My dog is cute", 25 | ] 26 | for _ in range(3): 27 | for s in examples: 28 | predict(s) 29 | -------------------------------------------------------------------------------- /examples/t5.py: -------------------------------------------------------------------------------- 1 | # This example requires torch nightly 2 | # Recent nightly introduced a regression on onnx export: 3 | # https://github.com/pytorch/pytorch/issues/99788 4 | # recommend nightly version torch<=2.1.0.dev20230327 for this example 5 | # Please run `pip install -r requirements.txt` 6 | from transformers import T5Tokenizer, T5ForConditionalGeneration 7 | from octoml_profile import accelerate, remote_profile 8 | 9 | model_id = "google/flan-t5-small" 10 | tokenizer = T5Tokenizer.from_pretrained(model_id) 11 | model = T5ForConditionalGeneration.from_pretrained(model_id) 12 | 13 | input_text = "A step by step recipe to make bolognese pasta:" 14 | 15 | 16 | @accelerate(dynamic=True) 17 | def generate(input_text): 18 | input_ids = tokenizer(input_text, return_tensors="pt").input_ids 19 | outputs = model.generate(input_ids) 20 | return tokenizer.decode(outputs[0]) 21 | 22 | 23 | with remote_profile(backends=['g4dn.xlarge/onnxrt-cuda', 'r6i.large/onnxrt-cpu'], 24 | num_repeats=1): 25 | for i in range(2): 26 | result = generate(input_text) 27 | -------------------------------------------------------------------------------- /examples/minilm.py: -------------------------------------------------------------------------------- 1 | from transformers 
import AutoTokenizer, BertForSequenceClassification 2 | from octoml_profile import accelerate, remote_profile 3 | 4 | model_id = 'philschmid/MiniLM-L6-H384-uncased-sst2' 5 | tokenizer = AutoTokenizer.from_pretrained(model_id) 6 | model = BertForSequenceClassification.from_pretrained(model_id) 7 | 8 | examples = [ 9 | "Hello, world!", 10 | "Nice to meet you", 11 | "Goodbye, world!" 12 | ] 13 | inputs = tokenizer(examples, return_tensors="pt") 14 | 15 | 16 | model = accelerate(model) 17 | 18 | 19 | with remote_profile(backends=["r6i.large/onnxrt-cpu", 20 | "r6i.large/torch-eager-cpu", 21 | "r7g.large/onnxrt-cpu", 22 | "g4dn.xlarge/onnxrt-cuda", 23 | "g4dn.xlarge/onnxrt-tensorrt", 24 | "g4dn.xlarge/torch-eager-cuda", 25 | "g4dn.xlarge/torch-inductor-cuda", 26 | "g5.xlarge/torch-eager-cuda"]): 27 | for i in range(3): 28 | result = model(**inputs) 29 | 30 | print(result.logits) 31 | -------------------------------------------------------------------------------- /examples/gpt_neo_125m.py: -------------------------------------------------------------------------------- 1 | # This example requires torch nightly (see README.md for recommended version) 2 | 3 | from transformers import GPTNeoForCausalLM, GPT2Tokenizer 4 | from octoml_profile import accelerate, remote_profile 5 | 6 | model_id = "EleutherAI/gpt-neo-125M" 7 | model = GPTNeoForCausalLM.from_pretrained(model_id) 8 | tokenizer = GPT2Tokenizer.from_pretrained(model_id) 9 | 10 | prompt = ( 11 | "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " 12 | "previously unexplored valley, in the Andes Mountains. Even more surprising to the " 13 | "researchers was the fact that the unicorns spoke perfect English." 14 | ) 15 | 16 | 17 | @accelerate(dynamic=True) 18 | def predict(prompt): 19 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids 20 | gen_tokens = model.generate( 21 | input_ids, 22 | do_sample=True, 23 | temperature=0.9, 24 | max_length=100, 25 | ) 26 | return tokenizer.batch_decode(gen_tokens)[0] 27 | 28 | 29 | with remote_profile(backends=["g5.xlarge/onnxrt-cuda"], num_repeats=1): 30 | for i in range(3): 31 | predict(prompt) 32 | -------------------------------------------------------------------------------- /examples/whisper.py: -------------------------------------------------------------------------------- 1 | # This example requires torch nightly (see README.md for recommended version) 2 | # It further requires `pip install datasets soundfile librosa` 3 | # Please run `pip install -r requirements.txt` 4 | from datasets import load_dataset 5 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 6 | from octoml_profile import remote_profile, accelerate 7 | 8 | # 9 | # load model and processor 10 | processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") 11 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") 12 | 13 | # load dummy dataset and read audio files 14 | ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") 15 | sample = ds[0]["audio"] 16 | 17 | 18 | @accelerate(dynamic=True) 19 | def predict(sample): 20 | input_features = processor(sample["array"], 21 | sampling_rate=sample["sampling_rate"], 22 | return_tensors="pt").input_features 23 | # generate token ids 24 | predicted_ids = model.generate(input_features) 25 | # decode token ids to text 26 | transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) 27 | return transcription 28 | 29 | 30 | with 
remote_profile(backends=["g4dn.xlarge/onnxrt-cuda", "r6i.large/onnxrt-cpu"], 31 | num_repeats=1): 32 | for _ in range(3): 33 | text = predict(sample) 34 | print(text) 35 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ## Examples 2 | For a slightly more complex example than the [simple tutorial example](../README.md#installation-and-getting-started), 3 | we can take [the DistilBERT model from 4 | HuggingFace](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english), where we modify a 5 | couple of lines of the example code (`pip install transformers==4.27.4` is recommended). 6 | 7 | To run other examples in this directory, please run `pip install -r requirements.txt` and pay 8 | attention to the examples that require nightly torch. You can find the recommended nightly 9 | torch version [here](../README.md#dynamic-shapes). 10 | 11 | ```python 12 | import torch 13 | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 14 | from octoml_profile import accelerate, remote_profile 15 | 16 | model_id = "distilbert-base-uncased-finetuned-sst-2-english" 17 | tokenizer = DistilBertTokenizer.from_pretrained(model_id) 18 | model = DistilBertForSequenceClassification.from_pretrained(model_id) 19 | 20 | @accelerate 21 | def predict(input: str): 22 | inputs = tokenizer(input, return_tensors="pt") 23 | logits = model(**inputs).logits 24 | predicted_class_id = logits.argmax().item() 25 | return model.config.id2label[predicted_class_id] 26 | 27 | with remote_profile(backends=["r6i.large/onnxrt-cpu", "g5.xlarge/onnxrt-cuda"]): 28 | examples = [ 29 | "Hello, world!", 30 | "Nice to meet you", 31 | "My dog is cute", 32 | ] 33 | for _ in range(3): 34 | for s in examples: 35 | predict(s) 36 | ``` 37 | And now we can easily run this model on a variety of hardware and understand the 38 | performance implications, all without having to worry about provisioning cloud 39 | instances, configuring software or deploying our code. 40 | 41 | You can use octoml-profile directly within your application - whether it be a REST 42 | API, CLI application or anything else - with your own data and tests. 43 | 44 | 45 | ### Dynamic models 46 | 47 | We've enabled dynamic graph capture with `@accelerate(dynamic=True)`. See the 48 | generative model examples [t5.py](t5.py), [gpt_neo_125m.py](gpt_neo_125m.py) and 49 | [whisper.py](whisper.py).
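For reference, here is a condensed sketch of the pattern those examples share, based on [t5.py](t5.py); see that file for the full version and its torch nightly requirement.

```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
from octoml_profile import accelerate, remote_profile

model_id = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(model_id)

# `dynamic=True` keeps graph capture shape-agnostic, so generation does not
# recompile a new graph for every sequence length.
@accelerate(dynamic=True)
def generate(text: str) -> str:
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    return tokenizer.decode(model.generate(input_ids)[0])

with remote_profile(backends=["g4dn.xlarge/onnxrt-cuda", "r6i.large/onnxrt-cpu"],
                    num_repeats=1):
    generate("A step by step recipe to make bolognese pasta:")
```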
50 | -------------------------------------------------------------------------------- /examples/sentence_transformers/sentence_transformer_eval.csv: -------------------------------------------------------------------------------- 1 | ,model,backend,time_ms,cost_per_mreq,batch_size 2 | 0,all-MiniLM-L12-v2,r6i.large/onnxrt-cpu,17.8209513,0.6237332955,1 3 | 1,all-MiniLM-L12-v2,r6i.large/torch-eager-cpu,28.495958399999996,0.9973585439999999,1 4 | 2,all-MiniLM-L12-v2,r7g.large/onnxrt-cpu,21.257490899999997,0.13581174741666666,1 5 | 3,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-cuda,11.8977955,1.7384001202777777,1 6 | 4,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-tensorrt,12.402284,1.8121114955555555,1 7 | 5,all-MiniLM-L12-v2,g4dn.xlarge/torch-eager-cuda,18.3080092,2.6750035664444445,1 8 | 6,all-MiniLM-L12-v2,g4dn.xlarge/torch-inductor-cuda,15.7441459,2.300394650944445,1 9 | 7,all-MiniLM-L6-v2,r6i.large/onnxrt-cpu,8.6542595,0.3028990825,1 10 | 8,all-MiniLM-L6-v2,r6i.large/torch-eager-cpu,13.2147832,0.462517412,1 11 | 9,all-MiniLM-L6-v2,r7g.large/onnxrt-cpu,10.397357999999997,0.06642756499999998,1 12 | 10,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-cuda,5.9874456,0.8748323293333332,1 13 | 11,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-tensorrt,6.3256254,0.9242441556666666,1 14 | 12,all-MiniLM-L6-v2,g4dn.xlarge/torch-eager-cuda,9.4387526,1.3791066298888892,1 15 | 13,all-MiniLM-L6-v2,g4dn.xlarge/torch-inductor-cuda,7.867855199999999,1.1495810653333332,1 16 | 14,all-distilroberta-v1,r6i.large/onnxrt-cpu,20.779371999999995,0.7272780199999999,1 17 | 15,all-distilroberta-v1,r6i.large/torch-eager-cpu,38.31659840000001,1.3410809440000004,1 18 | 16,all-distilroberta-v1,r7g.large/onnxrt-cpu,27.108677599999996,0.1731943291111111,1 19 | 17,all-distilroberta-v1,g4dn.xlarge/onnxrt-cuda,7.0399018,1.028607874111111,1 20 | 18,all-distilroberta-v1,g4dn.xlarge/onnxrt-tensorrt,7.6713276,1.1208661993333333,1 21 | 19,all-distilroberta-v1,g4dn.xlarge/torch-eager-cuda,11.2453003,1.6430633216111112,1 22 | 20,all-distilroberta-v1,g4dn.xlarge/torch-inductor-cuda,9.053695099999999,1.3228454507222223,1 23 | 21,paraphrase-albert-small-v2,r6i.large/onnxrt-cpu,19.086343899999996,0.6680220364999998,1 24 | 22,paraphrase-albert-small-v2,r6i.large/torch-eager-cpu,29.197723699999997,1.0219203295,1 25 | 23,paraphrase-albert-small-v2,r7g.large/onnxrt-cpu,28.6267206,0.18289293716666666,1 26 | 24,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-cuda,5.315114,0.7765972122222223,1 27 | 25,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-tensorrt,6.0049864,0.8773952351111112,1 28 | 26,paraphrase-albert-small-v2,g4dn.xlarge/torch-eager-cuda,9.651154,1.4101408344444444,1 29 | 27,paraphrase-albert-small-v2,g4dn.xlarge/torch-inductor-cuda,7.032197,1.0274821172222224,1 30 | 28,paraphrase-MiniLM-L3-v2,r6i.large/onnxrt-cpu,5.3824666,0.188386331,1 31 | 29,paraphrase-MiniLM-L3-v2,r6i.large/torch-eager-cpu,7.807972800000001,0.273279048,1 32 | 30,paraphrase-MiniLM-L3-v2,r7g.large/onnxrt-cpu,6.6782705,0.04266672819444444,1 33 | 31,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-cuda,4.7531763,0.6944918705,1 34 | 32,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-tensorrt,5.019084299999999,0.7333439838333334,1 35 | 33,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-eager-cuda,6.8465889,1.0003627115000002,1 36 | 34,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-inductor-cuda,5.8367854,0.8528192001111112,1 37 | 35,all-MiniLM-L12-v2,r6i.large/onnxrt-cpu,1254.0356565,43.8912479775,256 38 | 36,all-MiniLM-L12-v2,r6i.large/torch-eager-cpu,1209.5858773,42.3355057055,256 39 | 
37,all-MiniLM-L12-v2,r7g.large/onnxrt-cpu,3336.4584253999997,21.316262162277773,256 40 | 38,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-cuda,98.84057679999998,14.44170649911111,256 41 | 39,all-MiniLM-L12-v2,g4dn.xlarge/onnxrt-tensorrt,209.7131162,30.641416422555555,256 42 | 40,all-MiniLM-L12-v2,g4dn.xlarge/torch-eager-cuda,134.9395834,19.716172463444448,256 43 | 41,all-MiniLM-L12-v2,g4dn.xlarge/torch-inductor-cuda,117.63634059999998,17.187976432111107,256 44 | 42,all-MiniLM-L6-v2,r6i.large/onnxrt-cpu,611.8071633,21.4132507155,256 45 | 43,all-MiniLM-L6-v2,r6i.large/torch-eager-cpu,661.1407737000001,23.1399270795,256 46 | 44,all-MiniLM-L6-v2,r7g.large/onnxrt-cpu,1696.5189904000003,10.838871327555557,256 47 | 45,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-cuda,77.6411086,11.344228645444444,256 48 | 46,all-MiniLM-L6-v2,g4dn.xlarge/onnxrt-tensorrt,130.4229707,19.056245163388887,256 49 | 47,all-MiniLM-L6-v2,g4dn.xlarge/torch-eager-cuda,99.27137689999999,14.504651180388889,256 50 | 48,all-MiniLM-L6-v2,g4dn.xlarge/torch-inductor-cuda,86.3636818,12.618693507444446,256 51 | 49,all-distilroberta-v1,r6i.large/onnxrt-cpu,2221.6413357,77.7574467495,256 52 | 50,all-distilroberta-v1,r6i.large/torch-eager-cpu,2149.491522200001,75.23220327700002,256 53 | 51,all-distilroberta-v1,r7g.large/onnxrt-cpu,6580.753075699999,42.04370020586111,256 54 | 52,all-distilroberta-v1,g4dn.xlarge/onnxrt-cuda,114.65186839999998,16.751911882888887,256 55 | 53,all-distilroberta-v1,g4dn.xlarge/onnxrt-tensorrt,200.23376609999997,29.25637804683333,256 56 | 54,all-distilroberta-v1,g4dn.xlarge/torch-eager-cuda,111.4591273,16.28541693327778,256 57 | 55,all-distilroberta-v1,g4dn.xlarge/torch-inductor-cuda,108.91961779999998,15.914366378555554,256 58 | 56,paraphrase-albert-small-v2,r6i.large/onnxrt-cpu,2122.1424868999998,74.27498704149998,256 59 | 57,paraphrase-albert-small-v2,r6i.large/torch-eager-cpu,2223.4401246999996,77.82040436449999,256 60 | 58,paraphrase-albert-small-v2,r7g.large/onnxrt-cpu,6165.979900800002,39.393760477333345,256 61 | 59,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-cuda,103.72082550000003,15.154765059166671,256 62 | 60,paraphrase-albert-small-v2,g4dn.xlarge/onnxrt-tensorrt,183.57358340000002,26.822140241222222,256 63 | 61,paraphrase-albert-small-v2,g4dn.xlarge/torch-eager-cuda,121.76579240000001,17.79133522288889,256 64 | 62,paraphrase-albert-small-v2,g4dn.xlarge/torch-inductor-cuda,118.08781490000003,17.253941843722227,256 65 | 63,paraphrase-MiniLM-L3-v2,r6i.large/onnxrt-cpu,352.9728934,12.354051269,256 66 | 64,paraphrase-MiniLM-L3-v2,r6i.large/torch-eager-cpu,353.5955084,12.375842794000002,256 67 | 65,paraphrase-MiniLM-L3-v2,r7g.large/onnxrt-cpu,872.6846742999999,5.575485419138888,256 68 | 66,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-cuda,60.5438919,8.8461353165,256 69 | 67,paraphrase-MiniLM-L3-v2,g4dn.xlarge/onnxrt-tensorrt,89.4471107,13.069216730055556,256 70 | 68,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-eager-cuda,73.7146786,10.770533595444444,256 71 | 69,paraphrase-MiniLM-L3-v2,g4dn.xlarge/torch-inductor-cuda,62.96103779999999,9.199307189666666,256 72 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## octoml-profile 2 | 3 | *octoml-profile* is a python library and cloud service that enables ML 4 | engineers to easily assess the performance and cost of PyTorch models on cloud 5 | hardware with state-of-the-art ML acceleration technology. 6 | 7 | Whether you're building machine learning models for research, development, or 8 | production, benchmarking your AI applications is a necessary step before 9 | deployment. An optimally chosen hardware + runtime deployment strategy can 10 | reduce cloud costs by more than 10x over default solutions. 11 | 12 | With *octoml-profile*, within minutes, you can measure the performance and 13 | cost of your PyTorch models on different cloud hardware. Our ML 14 | acceleration technology ensures that you get the most accurate and efficient 15 | results, so you can make informed decisions about how to optimize your AI 16 | applications. 17 | 18 | *Note: this tool is not designed for profiling individual PyTorch 19 | ops on the local machine. 
Please use `torch.profiler` for such purpose.* 20 | 21 | 22 | ### Key Features 23 | - 🔧 Magic remote execution with only a few additional lines of code 24 | - 💻 Runs on local development environment without any GPU requirement 25 | - 💪 Absolves tedious tasks such as model export, hardware provisioning, and dependency preparation 26 | - 🚀 Provides performance and cost insights within seconds (or minutes for larger models) 27 | - ⚙️ Supports diverse hardware and state-of-the-art software backends 28 | - 🌟 Supports the latest generative AI models with dynamic shapes 29 | - 📊 Uses the same data and workflow as your training and experiment tracking 30 | 31 | 32 | ### Limitation 33 | - Only supports inference workload 34 | 35 | 36 | ### Demos 37 | - [SentenceTransformers](examples/sentence_transformers/SentenceTransformerEval.ipynb) 38 | - [Stable Diffusion](examples/stable_diffusion.py) 39 | - [GPTNeo](examples/gpt_neo_125m.py) 40 | - [Whisper](examples/whisper.py) 41 | - [T5](examples/t5.py) 42 | 43 | 44 | ### Latest 45 | - [04-25-2023] Client update `v0.2.2` with enhanced terminal output and more examples 46 | - [03-22-2023] Initial release of `v0.2.0` 47 | 48 | 49 | ### Documentation quick links 50 | * [Installation and Getting Started](#installation-and-getting-started) 51 | * [Dynamic shapes](#dynamic-shapes) 52 | * [How it works](#how-it-works) 53 | * [Data privacy](#data-privacy) 54 | * [Known issues](#known-issues) 55 | * [Contact the team](#contact-the-team) 56 | 57 | ### "Hello World" example 58 | 59 | Let's say you have a PyTorch model that performs sentiment analysis using a 60 | DistilBert model, and you want to optimize it for cloud deployment. With 61 | octoml-profile, you can easily benchmark the predict function on various cloud 62 | hardware and use different acceleration techniques to find the optimal 63 | deployment strategy. 64 | 65 | ![Distilbert Example](assets/distilbert-diff.png) 66 | 67 | Within a few seconds, you will find the runtime and cost that help you pick the 68 | optimal hardware and inference engine for deployment. 69 | 70 | ``` 71 | Function `predict` has 1 profile: 72 | - Profile `predict[1/1]` ran 3 times. (1 discarded because compilation happened) 73 | 74 | Instance Processor Backend Backend Time (ms) Total Time (ms) Cost ($/MReq) 75 | ======================================================================================================= 76 | r6i.large Intel Ice Lake CPU torch-eager-cpu 24.735 52.009 $1.82 77 | g4dn.xlarge Nvidia T4 GPU torch-eager-cuda 5.336 32.610 $4.76 78 | g4dn.xlarge Nvidia T4 GPU torch-inductor-cuda 3.249 30.523 $4.46 79 | g4dn.xlarge Nvidia T4 GPU onnxrt-cuda 1.399 28.673 $4.19 80 | ------------------------------------------------------------------------------------------------------- 81 | Total time above is `remote backend time + local python code time`, 82 | in which local python code run time is 27.274 ms. 83 | Graph level profile is located at /tmp/octoml_profile_n603dewx/0/predict_1* 84 | ``` 85 | 86 | ## Installation and Getting Started 87 | - Create and activate a python virtual environment. `Python 3.8` is recommended 88 | and tested on both `Ubuntu` and `macOS`. `Python 3.10.9` is tested on `macOS` 89 | with Apple silicon. 90 | 91 | ``` 92 | python3 -m venv env 93 | source env/bin/activate 94 | ``` 95 | - Install dependencies 96 | 97 | PyTorch 2.0 and above is required. Below we install the cpu version for 98 | simplicity; CUDA version works too. 
99 | 100 | ``` 101 | pip install --upgrade pip 102 | pip install "torch>=2.0.0" torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu 103 | pip install "octoml-profile>=0.2.0" 104 | ``` 105 | 106 | You've completed installation! (If you have trouble, see [issues with installation](#issues-with-installation)) 107 | 108 | - Next, try running this very simple example that shows how to integrate octoml-profile into your model code. 109 | 110 | ```python 111 | import torch 112 | from torch.nn import Linear, ReLU, Sequential 113 | from octoml_profile import accelerate, remote_profile 114 | 115 | model = Sequential(Linear(100, 200), ReLU()) 116 | 117 | @accelerate 118 | def predict(x: torch.Tensor): 119 | return model(x) 120 | 121 | with remote_profile(): 122 | for _ in range(3): 123 | x = torch.randn(1, 100) 124 | predict(x) 125 | ``` 126 | 127 | - The first time you run this, you'll be prompted to supply your API key. 128 | 129 | ``` 130 | ,-""-. 131 | / \ Welcome to OctoML Profiler! 132 | : ; 133 | \ / It looks like you don't have an access token configured. 134 | `. .' Please go to https://profiler.app.octoml.ai/ to generate one 135 | '._.'`._.' and then paste it here. 136 | 137 | Access token: 138 | ``` 139 | (Sign up so that you can generate an API token when prompted) 140 | 141 | - Once you've provided credentials, running this results in the following 142 | output that shows times of the function being executed remotely on each 143 | backend. 144 | 145 | ``` 146 | Function `predict` has 1 profile: 147 | - Profile `predict[1/1]` ran 3 times. (1 discarded because compilation happened) 148 | 149 | Instance Processor Backend Backend Time (ms) Total Time (ms) Cost ($/MReq) 150 | ======================================================================================================= 151 | r6i.large Intel Ice Lake CPU torch-eager-cpu 0.024 0.086 $0.00 152 | g4dn.xlarge Nvidia T4 GPU torch-eager-cuda 0.097 0.159 $0.02 153 | g4dn.xlarge Nvidia T4 GPU torch-inductor-cuda 0.177 0.239 $0.03 154 | ------------------------------------------------------------------------------------------------------- 155 | Total time above is `remote backend time + local python code time`, 156 | in which local python code run time is 0.062 ms. 157 | Graph level profile is located at /tmp/octoml_profile_8o45fe39/0/predict_1* 158 | ``` 159 | To see more examples, see [examples/](examples). 160 | 161 | 162 | ### Issues with installation 163 | 164 | - If you are on macOS with Apple silicon and seeing `symbol not found in flat 165 | namespace '_CFRelease'`, it is likely that you created a `venv` with python 166 | installed by `conda`. Please make sure to deactivate any `conda` 167 | environment(s) and use the system-shipped python on macOS to create `venv`. 168 | Or follow the instructions below to create a conda environment. 169 | 170 | ``` 171 | conda create -n octoml python=3.8 172 | conda activate octoml 173 | ``` 174 | 175 | - If you see a version conflict, please install the pip dependencies above with `--force-reinstall`. 176 | 177 | - For any other problems, please file a github issue. 178 | 179 | 180 | ### Dynamic shapes 181 | 182 | This is an experimental feature that requires installing nightly of PyTorch. 183 | The dynamic shape graph capture feature is still under active development by 184 | the PyTorch team, so your results may vary. If you find any problems, please 185 | report via github issue. 
186 | 187 | ``` 188 | pip install --pre torch==2.1.0.dev20230416 torchaudio==2.1.0.dev20230416 torchvision==0.16.0.dev20230416 --index-url https://download.pytorch.org/whl/nightly/cpu 189 | ``` 190 | 191 | By default, the `@accelerate` decorator will recompile a new graph if the input 192 | shapes to the graph change. For generative model cases such as text 193 | generation, it is inefficient to have to compile a separate graph for each 194 | sequence length. The solution is to turn on "dynamic-shapes" for the compiler, 195 | which means the graph compilation will be agnostic to the input shapes, 196 | resulting in drastically fewer graphs to compile and lower memory usage end 197 | to end. 198 | 199 | As a toy example: 200 | 201 | ```python 202 | import torch 203 | from octoml_profile import accelerate, remote_profile 204 | 205 | conv = torch.nn.Conv2d(16, 16, 3) 206 | 207 | # With `dynamic=True` any model inside will not be specialized to the input shape 208 | @accelerate(dynamic=True) 209 | def predict(x: torch.Tensor): 210 | return conv(x) 211 | 212 | with remote_profile(backends=["r6i.large/onnxrt-cpu"]): 213 | # batch size is different but compilation only 214 | # happens once 215 | for i in range(1, 5): 216 | predict(torch.randn(i, 16, 10, 10)) 217 | ``` 218 | 219 | Set `@accelerate(dynamic=True)` on any `accelerate` usage. 220 | 221 | 222 | ## How it works 223 | 224 | * [How octoml-profile works](#how-octoml-profile-works) 225 | * [Where `@accelerate` should be applied](#where-accelerate-should-be-applied) 226 | * [The profile report](#the-profile-report) 227 | * [Local python segments](#local-python-segments) 228 | * [Quota](#quota) 229 | * [Supported backends](#supported-backends) 230 | 231 | ### How octoml-profile works 232 | 233 | octoml-profile consists of two main components: a Python library and a cloud 234 | service. The Python library is used to automatically extract PyTorch models on 235 | your local machine and send them to the cloud service for remote benchmarking. 236 | The cloud service provides access to different cloud hardware targets that are 237 | prepared with various deep learning inference engines. This enables users to 238 | optimize and measure their PyTorch models in a variety of deployment 239 | configurations. 240 | 241 | ![Architecture Illustration](assets/architecture.png) 242 | 243 | In the various examples above, we first import the `octoml_profile` Python library, and then 244 | decorate a `predict` function with the `@accelerate` decorator. By default, it 245 | behaves like `@torch.compile`: 246 | [TorchDynamo](https://pytorch.org/docs/stable/dynamo/index.html) is used to 247 | extract one or more computation graphs, optimize them, and replace the bytecode 248 | inside the function with the optimized version. 249 | 250 | When the code is surrounded with the `remote_profile()` context manager, the 251 | behavior of the `@accelerate` decorator changes. Instead of running the 252 | extracted graphs on the local machine, the graphs are sent to one or more 253 | remote inference workers for execution and measurement. The run time of the 254 | offloaded graphs is referred to as "remote backend run time" in the output 255 | above. 256 | 257 | Code that cannot be captured as a computation graph is not offloaded -- such code 258 | runs locally and is shown as "local python". For more details see 259 | the [local python code section](#local-python-segments) below.
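As a quick preview, here is the DistilBert `predict` function from the example above, annotated (annotations ours) with which parts are captured and offloaded and which parts stay local; the profile report in the next section shows the same breakdown as measured segments.

```python
@accelerate
def predict(input: str):
    # Tokenization is not a tensor computation graph, so it runs
    # locally and shows up as a "local python" segment.
    inputs = tokenizer(input, return_tensors="pt")
    # The model forward pass is captured by TorchDynamo as a subgraph
    # and offloaded to the remote backends for measurement.
    logits = model(**inputs).logits
    # `.item()` and the label lookup fall outside the captured graph,
    # so they run locally as another "local python" segment.
    predicted_class_id = logits.argmax().item()
    return model.config.id2label[predicted_class_id]
```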
260 | 261 | When the `remote_profile()` context manager is entered, it reserves 262 | exclusive access to the hardware specified in the optional `backends` keyword argument 263 | (or to a set of default hardware targets if the argument is omitted). 264 | If there are multiple backends, they will run in parallel. 265 | 266 | The `predict` function may contain pre/post processing code, non-tensor logic 267 | like control flow, side effects, and multiple models. Only eligible graphs 268 | will be intelligently extracted and offloaded for remote execution. 269 | 270 | As a result, the estimated end to end run time of the decorated function for a 271 | particular hardware and acceleration engine is `remote backend run 272 | time + local python run time`. If `local python run time` is much smaller 273 | compared to the total time, the estimate is fairly accurate because the 274 | impact of any difference between the local and remote machines on the local python 275 | code is minimal. 276 | 277 | 278 | ### Where to apply `@accelerate` 279 | In general, `@accelerate` is a drop-in replacement for `@torch.compile` and 280 | should be applied to a function that contains a PyTorch model performing 281 | inference. When the function is called under the context manager of `with 282 | remote_profile()`, remote execution and profiling are activated. When called 283 | without `remote_profile()` it behaves just like `torch.compile`. By default, 284 | `torch.no_grad()` is set in the `remote_profile()` context, because remote 285 | execution does not support training yet. 286 | 287 | If you expect the input shape to change, especially for generative models, 288 | see [Dynamic Shapes](#dynamic-shapes). 289 | 290 | Last but not least, `@accelerate` should not be used to decorate a function 291 | that has already been decorated with `@accelerate` or `@torch.compile`. 292 | 293 | ### The profile report 294 | 295 | Most users should be satisfied with the output of just the total run time and cost 296 | of using different hardware and software backends. 297 | However, advanced users like ML compiler engineers may be interested in diving 298 | into graph level performance analysis or reducing the number of graph breaks. This 299 | section shows you where to find the next level of detail. 300 | 301 | The location of the `Profile` report is printed at the end of the total runtime 302 | table, for instance: 303 | ``` 304 | Graph level profile is located at /tmp/octoml_profile_8o45fe39/0/predict_1* 305 | ``` 306 | 307 | For each decorated function, the profile report is suffixed with `.profile.txt`, 308 | i.e. `/tmp/octoml_profile_8o45fe39/0/predict_1.profile.txt` is for the function `predict`. 309 | 310 | 311 | An example report for DistilBert is the following: 312 | ``` 313 | Segment Samples Avg ms Failures 314 | =============================================================== 315 | 0 Local python 2 27.227 316 | 317 | 1 Graph #1 318 | r6i.large/torch-eager-cpu 20 25.648 0 319 | g4dn.xlarge/torch-eager-cuda 20 5.381 0 320 | g4dn.xlarge/torch-inductor-cuda 20 3.208 0 321 | g4dn.xlarge/onnxrt-cuda 20 1.400 0 322 | 323 | 2 Local python 2 0.110 324 | --------------------------------------------------------------- 325 | ``` 326 | 327 | To understand this report, let's first define some terminology. 328 | 329 | Terminology: 330 | - `function` is a python function decorated with `@accelerate` to be profiled. 331 | - `run` is one execution of the function.
332 | - `subgraph` is a computation graph of tensor operations auto captured by TorchDynamo as 333 | you run the function. A subgraph is a logical portion of the function. 334 | - `call` is one execution of a subgraph. A segment in a profile is a result of a call. 335 | - `repeats` is the number of times a graph is measured on a remote backend for each call. 336 | - `samples` is the total number an execution of a subgraph or local python segment is measured. 337 | For each subgraph, `samples = repeats * call`. 338 | 339 | On function, subgraph, and profile: 340 | - A single run of `predict` can have more than one subgraphs due to graph breaks. 341 | - Runs with different arguments may produce different subgraphs because the 342 | computation may change. Runs that have the same sequence of graph execution 343 | are merged into a "profile". For example, if `f(x1)`, `f(x2)` 344 | runs graph "1,2,3", and `f(x3)` runs graph "1,3,4", the segments of `f(x1)` and 345 | `f(x2)` will be merged into one profile and `f(x3)` will be its own profile. 346 | 347 | To find out what operations each graph contains and where in the source code 348 | does each graph come from, take a look at `graph_dump.txt` under the same 349 | directory. 350 | 351 | Like the DistilBert example above, when there are only a few segments, a profile 352 | will show the linear sequence of subgraph segment runs. 353 | 354 | What happens when there are tens or hundreds of segments? When too many segments 355 | are run, we collapse the linear segment sequence into an abridged summary where 356 | only a few subgraphs that have the highest aggregate run times across their run 357 | segments are shown. 358 | 359 | For example, a generative encoder-decoder based model that produces a large number of 360 | run segments will display an abridged report displaying 361 | **runtime by subgraph** instead of **runtime by segment** by default. 362 | In cases like this, you'll see: 363 | 364 | ``` 365 | Top subgraph Avg ms/call Avg ms/run Runtime % of e2e Failures 366 | ============================================================================== 367 | Graph #7 (17 calls) 368 | r6i.large/onnxrt-cpu 36.979 628.645 70.2 0 369 | g4dn.xlarge/onnxrt-cuda 5.612 95.403 37.9 0 370 | 371 | Graph #4 (1 calls) 372 | r6i.large/onnxrt-cpu 43.823 43.823 4.9 0 373 | g4dn.xlarge/onnxrt-cuda 5.002 5.002 2.0 0 374 | 375 | Graph #2 (1 calls) 376 | r6i.large/onnxrt-cpu 43.357 43.357 4.8 0 377 | g4dn.xlarge/onnxrt-cuda 3.154 3.154 1.3 0 378 | 379 | 4 other subgraphs 380 | r6i.large/onnxrt-cpu 35.892 4.0 0 381 | g4dn.xlarge/onnxrt-cuda 4.833 1.9 0 382 | 383 | 42 local python segments 143.621 384 | ------------------------------------------------------------------------------ 385 | ``` 386 | 387 | Other graphs are hidden. If your output has been abridged in this way 388 | but you want to see the full, sequential results of your profiling run, you can print 389 | the report with `verbose`: 390 | 391 | ```python 392 | # print_results_to=None silences the default output profile report. 393 | with remote_profile(print_results_to=None) as prof: 394 | ... 395 | prof.report().print(verbose=True) 396 | ``` 397 | 398 | ### Local python segments 399 | 400 | You will see `Local python` in the profiling report because TorchDynamo only 401 | captures PyTorch code as graph. Where it cannot continue capturing, a "graph 402 | break" happens and the uncaptured code runs in python eagerly. 
403 | 404 | Let's look at the DistilBert example again: 405 | ```python 406 | @accelerate 407 | def predict(input: str): 408 | inputs = tokenizer(input, return_tensors="pt") 409 | logits = model(**inputs).logits 410 | predicted_class_id = logits.argmax().item() 411 | return model.config.id2label[predicted_class_id] 412 | ``` 413 | 414 | ``` 415 | Segment Samples Avg ms Failures 416 | =============================================================== 417 | 0 Local python 2 27.227 418 | 419 | 1 Graph #1 420 | r6i.large/torch-eager-cpu 20 25.648 0 421 | g4dn.xlarge/torch-eager-cuda 20 5.381 0 422 | g4dn.xlarge/torch-inductor-cuda 20 3.208 0 423 | g4dn.xlarge/onnxrt-cuda 20 1.400 0 424 | 425 | 2 Local python 2 0.110 426 | --------------------------------------------------------------- 427 | ``` 428 | 429 | Segment 1 runs `Graph #1`, which corresponds to the `model`. 430 | Segment 0 corresponds to the `tokenizer` call, and Segment 2 to the logits and 431 | label mapping. 432 | 433 | Each local Python segment runs locally once for every invocation. The total estimated time 434 | equals the total remote backend time for all the graphs plus the total local python time. 435 | 436 | To print graph breaks and understand more of what TorchDynamo is doing under the hood, see 437 | the [TorchDynamo Deeper Dive](https://pytorch.org/docs/stable/dynamo/deep-dive.html), 438 | [Torch.FX](https://pytorch.org/docs/stable/fx.html) and 439 | [PyTorch 440 | Troubleshooting](https://pytorch.org/docs/master/dynamo/troubleshooting.html#torchdynamo-troubleshooting) 441 | pages. 442 | 443 | 444 | ### Quota 445 | 446 | Each user has a limit on the number of concurrent backends held by the user's sessions, which automatically get created and closed within the `remote_profile()` context manager. 447 | If you find yourself hitting quota limits because you need to run more tasks concurrently, please contact us to increase the limit. 448 | 449 | ### Supported backends 450 | 451 | To programmatically access a list of supported backends, please invoke: 452 | 453 | ```python 454 | import octoml_profile 455 | print(octoml_profile.get_supported_backends()) 456 | ``` 457 | 458 | **Supported Cloud Hardware** 459 | 460 | AWS 461 | - g4dn.xlarge (Nvidia T4 GPU) 462 | - g5.xlarge (Nvidia A10g GPU) 463 | - r6i.large (Intel Xeon IceLake CPU) 464 | - r7g.large (Arm-based Graviton3 CPU) 465 | 466 | **Supported Acceleration Libraries** 467 | 468 | ONNXRuntime 469 | - onnxrt-cpu 470 | - onnxrt-cuda 471 | - onnxrt-tensorrt 472 | 473 | PyTorch 474 | - torch-eager-cpu 475 | - torch-eager-cuda 476 | - torch-inductor-cpu 477 | - torch-inductor-cuda 478 | 479 | If no backends are specified when calling `remote_profile(backends=[...])`, then defaults are used, 480 | which are determined by the server. At the time of writing, the default is 481 | `["r6i.large/torch-eager-cpu", "g4dn.xlarge/torch-eager-cuda", "g4dn.xlarge/torch-inductor-cuda"]`. 482 | 483 | 484 | 485 | ## Data privacy 486 | 487 | We know that keeping your model data private is important to you. We guarantee that no other 488 | user has access to your data. We do not scrape any model information internally, and we do not use 489 | your uploaded data to try to improve our system -- we rely on you filing github issues or otherwise 490 | contacting us to understand your use case, and anything else about your model. 491 | Here's how our system currently works.
492 | 493 | We leverage TorchDynamo's subgraph capture to identify PyTorch-only code, serialize those subgraphs, 494 | and upload them to our system for benchmarking. 495 | 496 | On model upload, we cache your model in AWS S3. This helps with your development iteration speed -- 497 | every subsequent time you want profiling results, you won't have to wait for model re-upload on every 498 | minor tweak to your model or update to your requested backends list. When untouched for four weeks, 499 | any model subgraphs and constants are automatically removed from S3. 500 | 501 | Your model subgraphs are loaded onto our remote workers and are cleaned up on the creation of 502 | every subsequent session. Between your session's closure and another session's startup, 503 | serialized subgraphs may lie around idle. No users can access these subgraphs in this interval. 504 | 505 | If you still have concerns around data privacy, please [contact the team](#contact-the-team). 506 | 507 | ## Known issues 508 | 509 | ### Waiting for Session 510 | When you create a session, you get exclusive access to the requested hardware. 511 | When no hardware is available, new session requests will 512 | be queued. 513 | 514 | ### OOM for large models 515 | When a function contains too many graph breaks or an individual 516 | graph exceeds the memory on the worker, 517 | the remote inference worker may run out of CPU/GPU memory. 518 | When this happens, you may get an "Error on loading model component". 519 | This is known to happen with models like Stable Diffusion. 520 | We are actively working on optimizing the memory allocation of 521 | many subgraphs. 522 | 523 | ### Limitations of TorchDynamo 524 | TorchDynamo is under active development. You may encounter 525 | errors that are TorchDynamo related. 526 | These should not be fundamental problems, as we believe TorchDynamo 527 | will continue to improve its coverage. 528 | If you find a broken model, please [file an issue](https://github.com/octoml/octoml-profile/issues). 529 | 530 | ## Contact the team 531 | - Discord: [OctoML community Discord](https://discord.gg/Quc8hSxpMe) 532 | - GitHub issues: https://github.com/octoml/octoml-profile/issues 533 | - Email: dynamite@octoml.ai 534 | --------------------------------------------------------------------------------