├── tests
│   ├── requirements.txt
│   └── test_pricing_calculator.py
├── NOTICE
├── requirements.txt
├── resources
│   └── screenshot.jpg
├── CODE_OF_CONDUCT.md
├── utils
│   ├── dashboard_creators
│   │   ├── data_preview_viewer.py
│   │   ├── static
│   │   │   └── styles.css
│   │   ├── data_stats_viewer_creator.py
│   │   ├── output_viewer_creator.py
│   │   ├── comparative_dashboard_creator.py
│   │   ├── main_html_creator.py
│   │   └── dashboard_template.py
│   ├── model_ranker.py
│   ├── tweetsumm_data_creator.py
│   ├── model_runners
│   │   ├── gpt_model_runner.py
│   │   ├── bedrock_counting_runner.py
│   │   └── pricing_calculator.py
│   └── metrics
│       └── bart_score.py
├── CONTRIBUTING.md
├── .gitignore
├── README.md
├── LICENSE
└── summariziation_example.ipynb

--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
pytest
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
boto3
fmeval
pandas==2.1.4
ipywidgets
jupyterlab
--------------------------------------------------------------------------------
/resources/screenshot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/fm-leaderboarder/HEAD/resources/screenshot.jpg
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.
--------------------------------------------------------------------------------
/utils/dashboard_creators/data_preview_viewer.py:
--------------------------------------------------------------------------------
import json


def create_data_preview_view(test_file_path, result_html_folder):

    with open(test_file_path, 'r') as json_file:
        json_list = list(json_file)

    # generate header names
    headers = ['Input', 'Ground Truth']

    # generate row data
    rows = []
    for json_str in json_list:
        result = json.loads(json_str)
        row = [result['document'], result['summary']]
        rows.append(row)

    with open(f"{result_html_folder}/test_samples.html", "w", encoding='utf-8-sig') as file:
        from .dashboard_template import generate_dashboard_string
        file.write(generate_dashboard_string(title = "", column_names=headers, rows = rows))
--------------------------------------------------------------------------------
/utils/dashboard_creators/static/styles.css:
--------------------------------------------------------------------------------
/* Tooltip container */
.tooltip {
  position: relative;
  display: inline-block;
}

/* Tooltip text */
.tooltip .tooltiptext {
  visibility: hidden;
  width: 300px;
  background-color: #555;
  color: #fff;
  text-align: center;
  padding: 5px;
  border-radius: 6px;
  position: absolute;
  z-index: 1;
  bottom: 150%;
  left: 50%;
  margin-left: -150px;
  opacity: 0;
  transition: opacity 0.3s;
}

/* Tooltip arrow */
.tooltip .tooltiptext::after {
  content: "";
  position: absolute;
  top: 100%;
  left: 50%;
  margin-left: -5px;
  border-width: 5px;
  border-style: solid;
  border-color: #555 transparent transparent transparent;
}

/* Show the tooltip text when you mouse over the tooltip container */
.tooltip:hover .tooltiptext {
  visibility: visible;
  opacity: 1;
}
--------------------------------------------------------------------------------
/utils/model_ranker.py:
--------------------------------------------------------------------------------
from collections import defaultdict

def create_model_ranking(models_scores:dict):
    model_ranking = defaultdict(int)
    totals = len(models_scores) * len(list(models_scores.values())[0])
    # Collect the set of metrics reported across all models
    metrics = set()
    for model_id in models_scores:
        metrics.update(models_scores[model_id].keys())

    for metric in metrics:
        # Sort the models for the current metric (ascending, so the best-scoring
        # model receives the highest rank number)
        sorted_models = sorted(
            [(model_id, models_scores[model_id][metric]) for model_id in models_scores if metric in models_scores[model_id]],
            key=lambda x: x[1],
            reverse=False
        )

        # Assign points to the models based on their ranking for the current metric
        for rank, (model_id, _) in enumerate(sorted_models, start=1):
            model_ranking[model_id] += rank / totals

    # Return the accumulated, normalized points; a higher total means the model
    # out-ranked its peers on more metrics
    return model_ranking
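
A worked example of `create_model_ranking` (hypothetical scores; with 2 models and 2 metrics, `totals` = 4, so a model that ranks best on every metric accumulates (2 + 2) / 4 = 1.0):

scores = {'model-a': {'rouge': 0.41, 'meteor': 0.33},
          'model-b': {'rouge': 0.38, 'meteor': 0.30}}
create_model_ranking(scores)  # -> defaultdict(int, {'model-a': 1.0, 'model-b': 0.5})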
--------------------------------------------------------------------------------
/utils/dashboard_creators/data_stats_viewer_creator.py:
--------------------------------------------------------------------------------
import json

def create_data_stats_view(test_file_path, result_img_folder):


    def get_doc_lengths(data):
        document_sentence_length, summary_sentence_length = [], []

        for d in data:
            # TODO switch to token counting instead of whitespace-separated words
            document_sentence_length.append(len(d['document'].replace("\n"," ").split()))
            summary_sentence_length.append(len(d['summary'].replace("\n"," ").split()))

        return document_sentence_length, summary_sentence_length


    def read_data(filename):
        data = []
        with open(filename, "r") as file:
            for line in file:
                data.append(json.loads(line))
        return data

    test_data = read_data(test_file_path)
    test_doc_lengths, test_sum_lengths = get_doc_lengths(test_data)

    # matplotlib is imported lazily so the heavy dependency is only loaded when needed
    import matplotlib.pyplot as plt

    fig, axs = plt.subplots(2)
    axs[0].hist(test_doc_lengths, density=False, bins=50)
    axs[0].set_title('Test documents length')
    axs[1].hist(test_sum_lengths, density=False, bins=50)
    axs[1].set_title('Test summary length')

    for ax in axs.flat:
        ax.set(xlabel='Num. of words', ylabel='Count')

    for ax in axs.flat:
        ax.label_outer()

    plt.savefig(f"{result_img_folder}/dataset_stats.png")
--------------------------------------------------------------------------------
/utils/dashboard_creators/output_viewer_creator.py:
--------------------------------------------------------------------------------
import json
import pandas as pd

def create_response_output_view(result_html_folder, tmp_json_files, models_scores):
    models_run = list(models_scores.keys())
    if len(models_run) > 0:
        metrics_used = list(models_scores[models_run[0]].keys())

    for model_id, scores in models_scores.items():
        title = f'Model [{model_id}] - testset results'
        # generate header names
        headers = ['Model input', 'Model output', 'Target Output']
        for mu in metrics_used:
            headers.append(mu)

        model_json_filename = f"{tmp_json_files}/{model_id}_metrics.jsonl"
        data = []
        with open(model_json_filename, "r") as file:
            for line in file:
                data.append(json.loads(line))

        df = pd.DataFrame(data)
        for idx, mu in enumerate(metrics_used):
            df[mu] = df['scores'].apply(lambda x: x[idx]['value'])

        # generate row data
        rows = []
        for index, item in df.iterrows():
            row = [item['prompt'], item['model_output'], item['target_output']]
            for mu in metrics_used:
                row.append(item[mu])
            rows.append(row)

        with open(f"{result_html_folder}/{model_id}_results.html", "w", encoding='utf-8-sig') as file:
            from .dashboard_template import generate_dashboard_string
            file.write(generate_dashboard_string(title = title, column_names = headers, rows = rows))
--------------------------------------------------------------------------------
/utils/dashboard_creators/comparative_dashboard_creator.py:
--------------------------------------------------------------------------------
import json
from os import listdir
from os.path import join


def create_comparive_dashboard(result_html_folder, tmp_json_files):
    model_outputs = dict()
    test_samples = []

    for result_file in listdir(tmp_json_files):
        if not result_file.endswith("_metrics.jsonl"):
            continue

        model = result_file.replace("_metrics.jsonl", "")

        model_outputs[model] = dict()

        data = []
        filename = join(tmp_json_files, result_file)
        with open(filename, "r") as file:
            for line in file:
                data.append(json.loads(line))

        if len(test_samples) == 0:
            for d in data:
                test_samples.append((d['model_input'], d['target_output']))

        for d in data:
            model_outputs[model][d['target_output']] = d['model_output']

    models = list(model_outputs.keys())

    # generate header names
    headers = ['Model input', 'Target output']
    for m_name in models:
        headers.append(m_name)

    # generate row data
    rows = []
    for samples in test_samples:
        row = [samples[0], samples[1]]
        for m_name in models:
            row.append(f'[{model_outputs[m_name][samples[1]]}] Output')
        rows.append(row)

    with open(f"{result_html_folder}/output_comparison.html", "w", encoding='utf-8-sig') as file:
        from .dashboard_template import generate_dashboard_string
        file.write(generate_dashboard_string(title = 'cross-model comparison', column_names = headers, rows = rows))
--------------------------------------------------------------------------------
/utils/tweetsumm_data_creator.py:
--------------------------------------------------------------------------------
from datasets import load_dataset
import re
import json


def clean_text(text):
    text = text.encode("ascii", "ignore").decode()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@[^\s]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return re.sub(r"\^[^ ]+", "", text)


def create_conversation_text(data_point):
    text = ""
    for item in data_point["log"]:
        user = clean_text(item["user utterance"])
        text += f"user: {user.strip()}\n"

        agent = clean_text(item["system response"])
        text += f"agent: {agent.strip()}\n"

    return text


def generate_text(data_point):
    summaries = json.loads(data_point["original dialog info"])["summaries"][
        "abstractive_summaries"
    ]
    summary = summaries[0]
    summary = " ".join(summary)

    conversation_text = create_conversation_text(data_point)
    return {
        "document": conversation_text,
        "summary": summary,
        "id": data_point['original dialog id']
    }


def create_train_test_files(folder):
    dataset = load_dataset("Salesforce/dialogstudio", "TweetSumm")
    tables = ["test", "validation"]
    modified_db = dict()

    for table in tables:
        for i in range(len(dataset[table])):

            example = generate_text(dataset[table][i])
            if table not in modified_db:
                modified_db[table] = []
            modified_db[table].append(example)

    # fold the validation split into the test set
    modified_db['test'].extend(modified_db['validation'])
    del modified_db['validation']

    print(f"Test set size: {len(modified_db['test'])}")

    for k, v in modified_db.items():
        with open(f"{folder}/{k}_tweetsumm_modified.jsonl", 'w') as f:
            for item in v:
                f.write(json.dumps(item) + "\n")

    modified_db = []

    for i in range(len(dataset["train"])):

        example = generate_text(dataset["train"][i])
        modified_db.append({"prompt": f"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent:\n {example['document']}", "completion": f"{example['summary']}"})

    with open(f"{folder}/bedrock_train_tweetsumm.jsonl", 'w') as f:
        for sample in modified_db:
            f.write(json.dumps(sample) + "\n")

    print(f"Train set size: {len(modified_db)}")
    print(f"Created a training set file that can be used for Bedrock finetuning at: {folder}/bedrock_train_tweetsumm.jsonl")

    test_file_path = f"{folder}/test_tweetsumm_modified.jsonl"
    return test_file_path
--------------------------------------------------------------------------------
/utils/model_runners/gpt_model_runner.py:
--------------------------------------------------------------------------------
import requests
import json
from datetime import datetime, timezone
import fcntl
from dataclasses import dataclass
from typing import Tuple, Optional

from fmeval.model_runners.model_runner import ModelRunner

@dataclass
class GPTModelConfig:
    temperature: float
    top_p: float
    max_tokens: int
    api_key: str
    model_id: str


class GPTModelRunner(ModelRunner):
    url = "https://api.openai.com/v1/chat/completions"

    def __init__(self, model_config: GPTModelConfig, metrics_folder: str = None, model_key:str = None):
        self.config = model_config
        self._metrics_folder = metrics_folder
        self._model_key = model_key

    def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
        payload = json.dumps({
            "model": self.config.model_id,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": self.config.temperature,
            "top_p": self.config.top_p,
            "max_tokens": self.config.max_tokens,
            "n": 1,
            "stream": False,
            "presence_penalty": 0,
            "frequency_penalty": 0
        })

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": "Bearer " + self.config.api_key
        }
        start_time = datetime.now(timezone.utc)

        response = requests.post(self.url, headers=headers, data=payload)
        delta = datetime.now(timezone.utc) - start_time
        processing_time = delta.total_seconds()

        response = json.loads(response.text)
        output = response["choices"][0]["message"]["content"]
        input_token_count = int(response["usage"]["prompt_tokens"])
        output_token_count = int(response["usage"]["completion_tokens"])

        # append a usage record under an exclusive lock, since Ray may run
        # several predictions in parallel against the same metrics file
        sw = json.dumps({"input_tokens":input_token_count,"output_tokens":output_token_count, "processing_time":processing_time, "model_id":self.config.model_id})
        with open(self._metrics_folder + f"/{self._model_key}_usage.jsonl", 'a') as fp:
            fcntl.flock(fp.fileno(), fcntl.LOCK_EX)
            fp.seek(0, 2)
            fp.write(sw + "\n")
            fcntl.flock(fp.fileno(), fcntl.LOCK_UN)

        return output, None

    def __reduce__(self):
        """
        Custom serializer method used by Ray when it serializes instances of this
        class in eval_algorithms.util.generate_model_predict_response_for_dataset.
        """
        serialized_data = (
            self.config,
            self._metrics_folder,
            self._model_key
        )
        return self.__class__, serialized_data
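
A minimal construction sketch (the API key and model id are placeholders; `predict` also appends a usage record to `<metrics_folder>/<model_key>_usage.jsonl`):

config = GPTModelConfig(temperature=0.2, top_p=0.9, max_tokens=512,
                        api_key='YOUR_OPENAI_KEY', model_id='gpt-4')
runner = GPTModelRunner(config, metrics_folder='/tmp/metrics', model_key='gpt-4')
output, _ = runner.predict('Summarize the following conversation: ...')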
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing Guidelines

Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
documentation, we greatly value feedback and contributions from our community.

Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
information to effectively respond to your bug report or contribution.


## Reporting Bugs/Feature Requests

We welcome you to use the GitHub issue tracker to report bugs or suggest features.

When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:

* A reproducible test case or series of steps
* The version of our code being used
* Any modifications you've made relevant to the bug
* Anything unusual about your environment or deployment


## Contributing via Pull Requests
Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:

1. You are working against the latest source on the *main* branch.
2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
3. You open an issue to discuss any significant work - we would hate for your time to be wasted.

To send us a pull request, please:

1. Fork the repository.
2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
3. Ensure local tests pass.
4. Commit to your fork using clear commit messages.
5. Send us a pull request, answering any default questions in the pull request interface.
6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.

GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).


## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.


## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.


## Security issue notifications
If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.


## Licensing

See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
--------------------------------------------------------------------------------
/utils/dashboard_creators/main_html_creator.py:
--------------------------------------------------------------------------------
import json

from utils.model_runners.pricing_calculator import PricingCalculator


def create_main_html(result_folder, models_scores, model_usage):
    title = 'Summarization Evaluation'

    # NOTE: reconstructed markup; the original tag content was lost,
    # so the class names and colors here are best-effort guesses
    pre_table_html = """
    <style>
    .first-place { background-color: #FFD700; }
    .second-place { background-color: #C0C0C0; }
    .third-place { background-color: #CD7F32; }
    </style>

    <div>
    Leaderboard
    <span class="tooltip">&#9432;
    <span class="tooltiptext">You can sort by columns and search by a keyword to filter</span>
    </span>
    </div>

    <div>
    Legend:
    <span class="first-place">1st Best Result</span>
    <span class="second-place">2nd Best Result</span>
    <span class="third-place">3rd Best Result</span>
    </div>
    """

    # generate header names
    headers = ['Model']
    models_run = list(models_scores.keys())
    if len(models_run) > 0:
        metrics_used = list(models_scores[models_run[0]].keys())
        for mu in metrics_used:
            headers.append(f'Metric: {mu}')
    headers.append('Testing Costs ($)')
    headers.append('Avg Latency (s)')
    headers.append('Cost/1MT In ($)')
    headers.append('Cost/1MT Out ($)')

    # generate row data
    rows = []
    for model_id, scores in models_scores.items():
        row = [f'{model_id}']
        for mu in metrics_used:
            row.append("{:.4f}".format(scores[mu]))
        if model_id in model_usage and model_usage[model_id] is not None and model_usage[model_id]['cost_model'] == PricingCalculator.COST_PER_TOKEN:
            row.append("{:.2f}".format(model_usage[model_id]['cost']))
            row.append("{:.2f}".format(model_usage[model_id]['avg_processing_time']))
            row.append("{:.4f}".format(model_usage[model_id]['cost_input_1M']))
            row.append("{:.4f}".format(model_usage[model_id]['cost_output_1M']))
        else:
            row.append('-')
            row.append('-')
            row.append('-')
            row.append('-')
        rows.append(row)

    index_filename = f"{result_folder}/index.html"

    with open(index_filename, "w", encoding='utf-8-sig') as file:
        from .dashboard_template import generate_dashboard_string
        file.write(generate_dashboard_string(title = title, pre_table_html = pre_table_html, column_names = headers, rows = rows))

    # CSS
    # copy the CSS file from ./static/styles.css to the result folder
    # get the current python file's folder
    import os
    import shutil
    shutil.copyfile(f'{os.path.dirname(os.path.abspath(__file__))}/static/styles.css', f'{result_folder}/styles.css')

    return index_filename
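
For reference, a sketch of the inputs `create_main_html` expects — the key names mirror the code above, the values are made up:

models_scores = {'anthropic.claude-v2': {'meteor': 0.31, 'rouge': 0.24}}
model_usage = {'anthropic.claude-v2': {'cost_model': PricingCalculator.COST_PER_TOKEN,
                                       'cost': 1.28, 'avg_processing_time': 2.4,
                                       'cost_input_1M': 8.0, 'cost_output_1M': 24.0}}
create_main_html('results', models_scores, model_usage)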
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
.vscode/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints
.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
.venv2
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
utils/.DS_Store
utils/key.py
--------------------------------------------------------------------------------
/utils/model_runners/bedrock_counting_runner.py:
--------------------------------------------------------------------------------
from typing import Optional, Tuple
from fmeval.constants import MIME_TYPE_JSON
from fmeval.model_runners.bedrock_model_runner import BedrockModelRunner
import json
from datetime import datetime, timezone
import fcntl

class CountingBedrockModelRunner(BedrockModelRunner):
    '''Decorates the base BedrockModelRunner to emit invocation metrics:
    accurate model consumption figures in tokens, taken from the Bedrock
    response metadata available in the response headers.'''

    def __init__(self, model_id: str, content_template: str, output: str | None = None, log_probability: str | None = None, content_type: str = MIME_TYPE_JSON, accept_type: str = MIME_TYPE_JSON, metrics_folder: str = None, model_key:str = None):
        """
        :param model_id: Id of the Bedrock model to be used for model predictions
        :param content_template: String template to compose the model input from the prompt
        :param output: JMESPath expression of output in the model output
        :param log_probability: JMESPath expression of log probability in the model output
        :param content_type: The content type of the request sent to the model for inference
        :param accept_type: The accept type of the request sent to the model for inference
        :param metrics_folder: The destination folder of the invocation metrics file
        :param model_key: The base name of the file, to disambiguate metrics
        """
        super().__init__(model_id = model_id, content_template = content_template, output = output, log_probability=log_probability, content_type = content_type, accept_type = accept_type)
        self._metrics_folder = metrics_folder
        self._model_key = model_key


    def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
        """
        Invoke the Bedrock model and parse the model response.
        :param prompt: Input data for which you want the model to provide inference.
        """

        composed_data = self._composer.compose(prompt)
        body = json.dumps(composed_data)
        start_time = datetime.now(timezone.utc)

        response = self._bedrock_runtime_client.invoke_model(
            body=body, modelId=self._model_id, accept=self._accept_type, contentType=self._content_type
        )
        delta = datetime.now(timezone.utc) - start_time
        processing_time = delta.total_seconds()
        model_output = json.loads(response.get("body").read())

        input_token_count = int(response["ResponseMetadata"]["HTTPHeaders"][
            "x-amzn-bedrock-input-token-count"
        ])

        output_token_count = int(response["ResponseMetadata"]["HTTPHeaders"][
            "x-amzn-bedrock-output-token-count"
        ])

        output = (
            self._extractor.extract_output(data=model_output, num_records=1)
            if self._extractor.output_jmespath_expression
            else None
        )
        log_probability = (
            self._extractor.extract_log_probability(data=model_output, num_records=1)
            if self._extractor.log_probability_jmespath_expression
            else None
        )

        # append a usage record under an exclusive lock, since Ray may run
        # several predictions in parallel against the same metrics file
        sw = json.dumps({"input_tokens":input_token_count,"output_tokens":output_token_count, "processing_time":processing_time,"model_id":self._model_id})
        with open(self._metrics_folder + f"/{self._model_key}_usage.jsonl", 'a') as fp:
            fcntl.flock(fp.fileno(), fcntl.LOCK_EX)
            fp.seek(0, 2)
            fp.write(sw + "\n")
            fcntl.flock(fp.fileno(), fcntl.LOCK_UN)

        return output, log_probability

    def __reduce__(self):
        """
        Custom serializer method used by Ray when it serializes instances of this
        class in eval_algorithms.util.generate_model_predict_response_for_dataset.
        """
        serialized_data = (
            self._model_id,
            self._content_template,
            self._output,
            self._log_probability,
            self._content_type,
            self._accept_type,
            self._metrics_folder,
            self._model_key
        )
        return self.__class__, serialized_data
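
A minimal construction sketch (a Claude-style request template is shown; `$prompt` is the fmeval placeholder substituted by the composer, and the JMESPath `output` must match the chosen model's response shape):

runner = CountingBedrockModelRunner(
    model_id='anthropic.claude-v2',
    content_template='{"prompt": $prompt, "max_tokens_to_sample": 500}',
    output='completion',
    metrics_folder='/tmp/metrics',
    model_key='anthropic.claude-v2',
)
output, _ = runner.predict('Human: Summarize...\n\nAssistant:')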
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
**⚠️ MAINTENANCE NOTICE**
**This project is no longer actively maintained.**
While the repository remains available for educational purposes, we recommend exploring more current alternatives for production use:
- [RAGAS](https://github.com/explodinggradients/ragas) - A comprehensive framework for RAG evaluation
- [Amazon Bedrock](https://aws.amazon.com/bedrock/) - A fully managed service for foundation models from Amazon

For a practical example of using these alternatives, check out our [evaluation notebook using RAGAS and Bedrock](https://github.com/gilinachum/ragas-evaluation-and-bedrock-guardrails/blob/main/evaluate_prod_readiness.ipynb).

Additionally, for latency benchmarking, check the code samples for [Latency Benchmarking tools for Amazon Bedrock](https://github.com/gilinachum/bedrock-latency/blob/main/README.md).

----

<div align="center">
<h1>FM-Leaderboard-er</h1>
<img src="resources/screenshot.jpg" alt="FM-Leaderboard-er dashboard screenshot">

Create your own private LLM leaderboard! 📊
</div>

## Introduction
There's no one-size-fits-all leaderboard. `FM-Leaderboard-er` will allow you to find the best LLM for your own business use case based on your own tasks, prompts, and data.

## Features
1. *Tasks* - Example notebooks for common tasks like Summarization, Classification, and RAG (coming soon).
2. *Models* - Amazon Bedrock, OpenAI, or any API (with a code integration).
3. *Metrics* - Built-in metrics per task + custom metrics via a code integration (see the sketch right after this list).
4. *Latency* - A latency metric per model.
5. *Cost* - A cost comparison per model.
6. *Prompt* - Compare several prompts against one model.
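
For item 3, a custom metric can be bolted on after an evaluation run by following the same pattern as `utils/metrics/bart_score.py` — read each `<model>_metrics.jsonl`, score every sample, and write an aggregate into `models_scores`. A minimal sketch (the length-ratio metric itself is just an illustration):
```
import json, os

def add_length_ratio(tmp_json_files, models_scores):
    for fname in os.listdir(tmp_json_files):
        if not fname.endswith("_metrics.jsonl"):
            continue
        model = fname.replace("_metrics.jsonl", "")
        with open(os.path.join(tmp_json_files, fname)) as f:
            samples = [json.loads(line) for line in f]
        ratios = [len(s["model_output"]) / max(len(s["target_output"]), 1) for s in samples]
        models_scores[model]["length_ratio"] = sum(ratios) / len(ratios)
```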

## Getting Started
### Prerequisites
1. An AWS account with Amazon Bedrock access to the selected models.
2. A Hugging Face access token.
The code downloads a dataset from Hugging Face (```https://huggingface.co/api/datasets/Salesforce/dialogstudio```), which requires an access token. If you don't have one yet, follow these steps:

* Sign up to Hugging Face: ```https://huggingface.co```
* Generate an access token (save it for further use): ```https://huggingface.co/settings/tokens```
* Store the access token locally by installing the huggingface_hub Python library and executing from a shell:
```
> pip install huggingface_hub
> python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('YOUR_HUGGINGFACE_TOKEN')"
```


***(Verify you now have: ```~/.cache/huggingface```)***
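
A quick way to confirm prerequisite 1, assuming the AWS CLI is configured for a Bedrock-enabled region:
```
> aws bedrock list-foundation-models --query "modelSummaries[].modelId"
```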

### Installation
1. Clone the repository:
```
git clone https://github.com/aws-samples/fm-leaderboarder.git
```
### Usage

To get started, open the [example-1 notebook](./summariziation_example.ipynb) and follow the instructions provided.

### Architecture
Coming soon.

## Dependency on third-party libraries and services
This code can interact with the OpenAI service, which has [terms published here](https://openai.com/policies/terms-of-use) and [pricing described here](https://openai.com/pricing). You should be familiar with the pricing and confirm that your use case complies with the terms before proceeding.

This repository makes use of the [aws/fmeval Foundation Model Evaluations Library](https://github.com/aws/fmeval). Please review any license terms applicable to the dataset with your legal team and confirm that your use case complies with the terms before proceeding.

## Security

See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.

## Contributing

Contributions to FM-Leaderboarder are welcome! Please refer to the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to contribute.

## Contributors

[//]: contributor-faces

## License

This project is licensed under the Apache-2.0 License.
--------------------------------------------------------------------------------
/tests/test_pricing_calculator.py:
--------------------------------------------------------------------------------
import pytest
import os

from utils.model_runners.pricing_calculator import PricingCalculator


def test_instance_pricing():
    instance_type = "g5.12xlarge"
    instance_price = PricingCalculator._instance_pricing(instance_type)
    assert isinstance(instance_price, float)
    # Add more assertions to verify the returned instance price

def test_retrieve_cost_structure():
    model_id = "anthropic.claude-v2:1"
    cost_structure = PricingCalculator.retrieve_cost_structure(model_id)
    assert cost_structure is not None
    assert "model_id" in cost_structure
    assert "input_cost_per_1000_tokens" in cost_structure
    assert "output_cost_per_1000_tokens" in cost_structure

    # Test with a model_id that doesn't have a cost structure
    invalid_model_id = "invalid_model_id"
    cost_structure = PricingCalculator.retrieve_cost_structure(invalid_model_id)
    assert cost_structure is None

def test_retrieve_cost_structure_variants():
    # version/context-length variants should resolve to the same base cost structure
    cost_structure = PricingCalculator.retrieve_cost_structure('anthropic.claude-instant-v1:2:100k')
    assert cost_structure is not None
    assert "model_id" in cost_structure
    assert "input_cost_per_1000_tokens" in cost_structure
    assert "output_cost_per_1000_tokens" in cost_structure
    assert cost_structure['input_cost_per_1000_tokens'] == 0.0008
    cost_structure = PricingCalculator.retrieve_cost_structure('anthropic.claude-instant-v1:2')
    assert cost_structure is not None
    assert "model_id" in cost_structure
    assert "input_cost_per_1000_tokens" in cost_structure
    assert "output_cost_per_1000_tokens" in cost_structure
    assert cost_structure['input_cost_per_1000_tokens'] == 0.0008
    cost_structure = PricingCalculator.retrieve_cost_structure('anthropic.claude-instant-v1')
    assert cost_structure is not None
    assert "model_id" in cost_structure
    assert "input_cost_per_1000_tokens" in cost_structure
    assert "output_cost_per_1000_tokens" in cost_structure
    assert cost_structure['input_cost_per_1000_tokens'] == 0.0008



def test_read_model_score_aggregate(tmpdir):
    folder = str(tmpdir)
    PricingCalculator.cleanup_previous_runs(folder)
    model_name = "anthropic.claude-v2"
    usage_file = f"{folder}/{model_name}_usage.jsonl"

    # Create a temporary usage file with sample data
    with open(usage_file, "w") as f:
        f.write('{"model_id": "anthropic.claude-v2", "input_tokens": 10, "output_tokens": 20, "processing_time": 1.5}\n')
        f.write('{"model_id": "anthropic.claude-v2", "input_tokens": 15, "output_tokens": 25, "processing_time": 2.0}\n')

    result = PricingCalculator.read_model_score_aggregate(model_name, folder)
    assert result is not None
    assert result["input_tokens"] == 25
    assert result["output_tokens"] == 45
    assert result["processing_time"] == 3.5
    assert result["cost"] > 0

def test_read_model_score_aggregate_from_api(tmpdir):
    folder = str(tmpdir)
    PricingCalculator.cleanup_previous_runs(folder)
    model_name = "amazon.titan-text-lite-v1"
    usage_file = f"{folder}/{model_name}_usage.jsonl"

    # Create a temporary usage file with sample data
    with open(usage_file, "w") as f:
        f.write('{"model_id": "amazon.titan-text-lite-v1", "input_tokens": 10, "output_tokens": 20, "processing_time": 1.5}\n')
"input_tokens": 10, "output_tokens": 20, "processing_time": 1.5}\n') 75 | f.write('{"model_id": "amazon.titan-text-lite-v1", "input_tokens": 15, "output_tokens": 25, "processing_time": 2.0}\n') 76 | 77 | result = PricingCalculator.read_model_score_aggregate(model_name, folder) 78 | assert result is not None 79 | assert result["input_tokens"] == 25 80 | assert result["output_tokens"] == 45 81 | assert result["processing_time"] == 3.5 82 | assert result["cost"] > 0 83 | 84 | 85 | def test_read_timed_score_aggregate(tmpdir): 86 | folder = str(tmpdir) 87 | PricingCalculator.cleanup_previous_runs(folder) 88 | model_name = "self_hosted_test" 89 | usage_file = f"{folder}/{model_name}_usage.jsonl" 90 | 91 | # Create a temporary usage file with sample data 92 | with open(usage_file, "w") as f: 93 | f.write('{"model_id": "self_hosted_test", "input_tokens": 10, "output_tokens": 20, "instance_type":"g5.12xlarge", "processing_time": 1.5}\n') 94 | f.write('{"model_id": "self_hosted_test", "input_tokens": 15, "output_tokens": 25, "instance_type":"g5.12xlarge", "processing_time": 2.0}\n') 95 | 96 | result = PricingCalculator.read_model_score_aggregate(model_name, folder) 97 | assert result is not None 98 | assert result["input_tokens"] == 25 99 | assert result["output_tokens"] == 45 100 | assert result["processing_time"] == 3.5 101 | assert result["cost"] > 0 102 | 103 | def test_cleanup_previous_runs(tmpdir): 104 | folder = str(tmpdir) 105 | open(f"{folder}/test_model_usage.jsonl", "w").close() 106 | 107 | PricingCalculator.cleanup_previous_runs(folder) 108 | assert not any(fname.endswith("_usage.jsonl") for fname in os.listdir(folder)) -------------------------------------------------------------------------------- /utils/dashboard_creators/dashboard_template.py: -------------------------------------------------------------------------------- 1 | import html 2 | 3 | def get_optional_tooltip_html(name : str): 4 | tips_by_metric = { 5 | "win rate" : "How many models this model outpefrom on average per each metric", 6 | "meteor" : "METEOR is a metric for text similarity between the machine-produced summary and human-produced reference summaries.", 7 | "rouge" : "The ROUGE metric measures text similarity by computing overlapping n-grams between a machine-generated text and one or more reference human-written texts.", 8 | "bertscore" : "The BERTScore is a text similarity metric that leverages BERT's contextual embeddings to compute token similarities between the candidate and reference texts.", 9 | "bartscore" : "", 10 | } 11 | if name.lower().startswith("metric:") or name == 'Win Rate': 12 | if name.lower().startswith("metric:"): 13 | metric_name = name.lower().split(' ')[-1] 14 | else: 15 | metric_name = name.lower() 16 | if metric_name in tips_by_metric: 17 | tip = tips_by_metric[metric_name] 18 | tooltip_html =''' 19 | 20 | 21 | {} 22 | 23 | '''.format(tip) 24 | return tooltip_html 25 | return "" 26 | 27 | 28 | def generate_dashboard_string(title = 'page title', pre_table_html = "", column_names = [], rows = []): 29 | columns_html = "" 30 | for name in column_names: 31 | tooltip_html = get_optional_tooltip_html(str(name)) 32 | columns_html += f"{html.escape(str(name))}" + f"{tooltip_html}\n" 33 | 34 | table_data_html = "" 35 | for row in rows: 36 | table_data_html += f"\n" 37 | for item in row: 38 | str_item = str(item) # in case of a number 39 | if str_item.strip().startswith("{escaped_item}\n" 44 | table_data_html += "\n" 45 | 46 | args = {'title' : html.escape(str(title)), 'pre_table_html' : 


def generate_dashboard_string(title = 'page title', pre_table_html = "", column_names = [], rows = []):
    columns_html = ""
    for name in column_names:
        tooltip_html = get_optional_tooltip_html(str(name))
        columns_html += f"<th>{html.escape(str(name))}{tooltip_html}</th>\n"

    table_data_html = ""
    for row in rows:
        table_data_html += "<tr>\n"
        for item in row:
            str_item = str(item) # in case of a number
            # links are emitted as-is; everything else is HTML-escaped
            if str_item.strip().startswith("<a"):
                escaped_item = str_item
            else:
                escaped_item = html.escape(str_item)
            table_data_html += f"<td>{escaped_item}</td>\n"
        table_data_html += "</tr>\n"

    args = {'title' : html.escape(str(title)), 'pre_table_html' : pre_table_html, 'columns_html' : columns_html, 'table_data_html' : table_data_html}

    # The page skeleton below is a reconstruction; the original head styling
    # and the table sort/search script were lost in extraction
    return '''
<html>
<head>
<title>{title}</title>
<link rel="stylesheet" href="styles.css">
<script> /* table sorting and keyword-search helpers */ </script>
</head>
<body>
<h1>{title}</h1>
{pre_table_html}
<table>
<thead>
<tr>
{columns_html}
</tr>
</thead>
<tbody>
{table_data_html}
</tbody>
</table>
</body>
</html>
'''.format(**args)

# testcases
def test_generate_dashboard_string():
    print(generate_dashboard_string(title = 'mytitle', column_names = ["a", "b"], rows = [[1, 2], [3, 4]]))
    print(generate_dashboard_string(title = 'mytitle', pre_table_html= "<div> 1 </div>", column_names = ["a", "b"], rows = [[1, 2], [3, 4]]))
    print(generate_dashboard_string(column_names = ["a", "b"], rows = [['<a href="#">link</a>', 2], [3, 4]]))
    print(generate_dashboard_string(column_names = ["Metric: meteor", "b"], rows = [['<a href="#">link</a>', 2], [3, 4]]))

#test_generate_dashboard_string()
--------------------------------------------------------------------------------
/utils/metrics/bart_score.py:
--------------------------------------------------------------------------------
"""
This code calculates BARTScore.
The metric was introduced at NeurIPS 2021. Paper: https://arxiv.org/pdf/2106.11520.pdf
The code was adapted from the official code at: https://github.com/neulab/BARTScore (Apache 2.0 license: https://github.com/neulab/BARTScore/blob/main/LICENSE)
"""
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn as nn
import traceback
import numpy as np
import json
from os import listdir
from os.path import join


class BARTScorer:
    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """ Load model from paraphrase finetuning """
        if path is None:
            path = 'models/bart.pth'
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """ Score a batch of examples """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                exit(0)
        return score_list
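
# Minimal usage sketch (illustrative strings; scores are average per-token
# log-likelihoods of the target given the source, so values are negative and
# closer to zero is better):
#   scorer = BARTScorer(device='cpu', checkpoint='facebook/bart-large-cnn')
#   scorer.score(['model output text'], ['reference summary'])  # e.g. [-3.2]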

def calculate_bartscore(tmp_json_files, models_scores, path_to_finetuned_bart):
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'

    bart_scorer = BARTScorer(device=device, checkpoint='facebook/bart-large-cnn')
    if len(path_to_finetuned_bart) > 0:
        bart_scorer.load(path=path_to_finetuned_bart)
    scores_dict = dict()

    for result_file in listdir(tmp_json_files):
        if not result_file.endswith("_metrics.jsonl"):
            continue

        model = result_file.replace("_metrics.jsonl", "")

        if model not in models_scores:
            continue

        scores_dict[model] = []
        data = []

        filename = join(tmp_json_files, result_file)

        with open(filename, "r") as file:
            for line in file:
                data.append(json.loads(line))

        print(f"Evaluating {model} model")

        processed_samples_ctr = 0

        for sample in data:
            model_output = sample['model_output'].strip()
            target_output = sample['target_output'].strip()
            score = bart_scorer.score([model_output], [target_output])[0]
            scores_dict[model].append(score)
            sample['scores'].append({'name': 'bartscore', 'value': score})

            processed_samples_ctr += 1
            if processed_samples_ctr % 10 == 0:
                print(f"Processed {processed_samples_ctr}/{len(data)} samples.")

        # dump the new metric to appear in the output view dashboard
        with open(filename, 'w') as outfile:
            for entry in data:
                json.dump(entry, outfile)
                outfile.write('\n')

        # update models_scores so the metric appears in the index.html leaderboard
        current_metrics = models_scores[model]
        current_metrics['bartscore'] = np.average(scores_dict[model])
        models_scores[model] = current_metrics
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
--------------------------------------------------------------------------------
/utils/model_runners/pricing_calculator.py:
--------------------------------------------------------------------------------
import boto3
import json
import traceback
import sys
import os

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PricingCalculator():
    '''Calculates inference pricing based on each model's specifics.
13 |     Uses the Pricing API where available and lookup tables from pricing sources when it is not,
14 |     and can calculate cost for both per-token and per-hosting-time pricing models.'''
15 |     COST_PER_TOKEN = 'cpt'
16 |     COST_PER_HOUR = 'cph'
17 | 
18 |     _pricing_client = boto3.client('pricing', region_name = 'us-east-1')
19 |     _model_price_by_name = {}
20 |     _model_name_by_id = {}
21 |     @classmethod
22 |     def static_init(self):
23 |         for model in boto3.client('bedrock').list_foundation_models()['modelSummaries']:
24 |             PricingCalculator._model_name_by_id[model['modelId']] = model['modelName']
25 | 
26 |         pricing_data = []
27 | 
28 |         paginator = PricingCalculator._pricing_client.get_paginator('get_products')
29 |         operation_parameters = {'ServiceCode': 'AmazonBedrock'}
30 |         page_iterator = paginator.paginate(**operation_parameters)
31 | 
32 |         try:
33 |             for page in page_iterator:
34 |                 pricing_data.extend(page['PriceList'])
35 |         except Exception as e:
36 |             logger.warning(f'Failed to fetch price list: {e}')
37 |             return
38 | 
39 | 
40 | 
41 |         for item in pricing_data:
42 |             try:
43 |                 price_item = json.loads(item)
44 |                 usage_type = price_item['product']['attributes']['usagetype']
45 |                 region_code = price_item['product']['attributes']['regionCode']
46 |                 if region_code != boto3.session.Session().region_name:
47 |                     continue
48 |                 if 'inferenceType' in price_item['product']['attributes']:
49 |                     inference_type = price_item['product']['attributes']['inferenceType']
50 |                 else:
51 |                     inference_type = 'N/A'
52 | 
53 |                 if 'model' in price_item['product']['attributes']:
54 |                     model_name = price_item['product']['attributes']['model']
55 |                 elif 'titanModel' in price_item['product']['attributes']:
56 |                     model_name = price_item['product']['attributes']['titanModel']
57 |                 elif 'titanModelUnit' in price_item['product']['attributes']:
58 |                     model_name = price_item['product']['attributes']['titanModelUnit']
59 |                 else:
60 |                     logger.error(f"Model name is missing. Skipping price item: {price_item['product']['attributes']}")
61 |                     continue
62 | 
63 |                 on_demand_terms = price_item['terms']['OnDemand']
64 |                 price_dimensions = list(on_demand_terms.values())[0]['priceDimensions']
65 |                 price_per_unit = list(price_dimensions.values())[0]['pricePerUnit']['USD']
66 |                 unit = list(price_dimensions.values())[0]['unit']
67 |                 if model_name not in PricingCalculator._model_price_by_name:
68 |                     PricingCalculator._model_price_by_name[model_name] = dict()
69 |                 PricingCalculator._model_price_by_name[model_name]['model_id'] = model_name
70 |                 if 'input-tokens' in usage_type and unit == '1K tokens':
71 |                     PricingCalculator._model_price_by_name[model_name]['input_cost_per_1000_tokens'] = price_per_unit
72 |                 elif 'output-tokens' in usage_type and unit == '1K tokens':
73 |                     PricingCalculator._model_price_by_name[model_name]['output_cost_per_1000_tokens'] = price_per_unit
74 |                 elif 'ProvisionedThroughput' in usage_type:
75 |                     PricingCalculator._model_price_by_name[model_name]['instance_type'] = usage_type
76 |                     PricingCalculator._model_price_by_name[model_name]['hosting_cost_per_hour'] = price_per_unit
77 |                 else:
78 |                     pass
79 |             except Exception as e:
80 |                 logger.error('Failed to parse price item')
81 |                 raise e
82 | 
83 | 
84 | 
85 |     #TODO Waiting for an official API to support marketplace models
86 |     _lookup_price_table = [
87 |         {
88 |             "model_id": "anthropic.claude-v2:1",
89 |             "id_type": "model_id",
90 |             "input_cost_per_1000_tokens": 0.008,
91 |             "output_cost_per_1000_tokens": 0.024
92 |         },
93 |         {
94 |             "model_id": "anthropic.claude-v2",
95 |             "id_type": "model_id",
96 |             "input_cost_per_1000_tokens": 0.008,
97 |             "output_cost_per_1000_tokens": 0.024
98 |         },
99 |         {
100 |             "model_id": "anthropic.claude-instant-v1",
101 |             "id_type": "model_id",
102 |             "input_cost_per_1000_tokens": 0.0008,
103 |             "output_cost_per_1000_tokens": 0.0024
104 |         },
105 |         {
106 |             "model_id": "amazon.titan-text-lite-v1",
107 |             "id_type": "model_id",
108 |             "input_cost_per_1000_tokens": 0.0003,
109 |             "output_cost_per_1000_tokens": 0.0004
110 |         },
111 |         {
112 |             "model_id": "amazon.titan-text-express-v1",
113 |             "id_type": "model_id",
114 |             "input_cost_per_1000_tokens": 0.0008,
115 |             "output_cost_per_1000_tokens": 0.0016
116 |         },
117 |         {
118 |             "model_id": "meta.llama2-13b-chat-v1",
119 |             "id_type": "model_id",
120 |             "input_cost_per_1000_tokens": 0.00075,
121 |             "output_cost_per_1000_tokens": 0.001
122 |         },
123 |         {
124 |             "model_id": "cohere.command-light-text-v14",
125 |             "id_type": "model_id",
126 |             "input_cost_per_1000_tokens": 0.0003,
127 |             "output_cost_per_1000_tokens": 0.0006
128 |         },
129 | 
130 |         {
131 |             "model_id": "anthropic.claude-3-sonnet-20240229-v1:0",
132 |             "id_type": "model_id",
133 |             "input_cost_per_1000_tokens": 0.003,
134 |             "output_cost_per_1000_tokens": 0.015
135 |         },
136 |         {
137 |             "model_id": "anthropic.claude-3-haiku-20240307-v1:0",
138 |             "id_type": "model_id",
139 |             "input_cost_per_1000_tokens": 0.00025,
140 |             "output_cost_per_1000_tokens": 0.00125
141 |         },
142 |         {
143 |             "model_id": "meta.llama2-70b-chat-v1",
144 |             "id_type": "model_id",
145 |             "input_cost_per_1000_tokens": 0.00195,
146 |             "output_cost_per_1000_tokens": 0.00256
147 |         },
148 |         {
149 |             "model_id": "ai21.j2-mid-v1",
150 |             "id_type": "model_id",
151 |             "input_cost_per_1000_tokens": 0.0125,
152 |             "output_cost_per_1000_tokens": 0.0125
153 |         },
154 |         {
155 |             "model_id": "ai21.j2-ultra-v1",
156 |             "id_type": "model_id",
157 |             "input_cost_per_1000_tokens": 0.0188,
158 |             "output_cost_per_1000_tokens": 0.0188
159 |         },
160 |         {
161 |             "model_id": "cohere.command-text-v14",
162 |             "id_type": "model_id",
163 | "input_cost_per_1000_tokens": 0.0015, 164 | "output_cost_per_1000_tokens": 0.0020 165 | }, 166 | { 167 | "model_id": "mistral.mistral-7b-instruct-v0:2", 168 | "id_type": "model_id", 169 | "input_cost_per_1000_tokens": 0.00015, 170 | "output_cost_per_1000_tokens": 0.0002 171 | }, 172 | { 173 | "model_id": "mistral.mixtral-8x7b-instruct-v0:1", 174 | "id_type": "model_id", 175 | "input_cost_per_1000_tokens": 0.00045, 176 | "output_cost_per_1000_tokens": 0.0007 177 | 178 | }, 179 | { 180 | "model_id": "self_hosted_test", 181 | "id_type": "model_id", 182 | "instance_type": "g5.12xlarge", 183 | }, 184 | { 185 | "model_id": "gpt-4-0125-preview", 186 | "id_type": "model_id", 187 | "input_cost_per_1000_tokens": 0.01, 188 | "output_cost_per_1000_tokens": 0.03 189 | }, 190 | { 191 | "model_id": "gpt-4-1106-preview", 192 | "id_type": "model_id", 193 | "input_cost_per_1000_tokens": 0.01, 194 | "output_cost_per_1000_tokens": 0.03 195 | }, 196 | { 197 | "model_id": "gpt-4-1106-vision-preview", 198 | "id_type": "model_id", 199 | "input_cost_per_1000_tokens": 0.01, 200 | "output_cost_per_1000_tokens": 0.03 201 | }, 202 | { 203 | "model_id": "gpt-4", 204 | "id_type": "model_id", 205 | "input_cost_per_1000_tokens": 0.03, 206 | "output_cost_per_1000_tokens": 0.06 207 | }, 208 | { 209 | "model_id": "gpt-4-32k", 210 | "id_type": "model_id", 211 | "input_cost_per_1000_tokens": 0.06, 212 | "output_cost_per_1000_tokens": 0.12 213 | }, 214 | { 215 | "model_id": "gpt-3.5-turbo-0125", 216 | "id_type": "model_id", 217 | "input_cost_per_1000_tokens": 0.0005, 218 | "output_cost_per_1000_tokens": 0.0015 219 | }, 220 | { 221 | "model_id": "gpt-3.5-turbo-instruct", 222 | "id_type": "model_id", 223 | "input_cost_per_1000_tokens": 0.0015, 224 | "output_cost_per_1000_tokens": 0.002 225 | }, 226 | { 227 | "model_id": "gpt-3.5-turbo-1106", 228 | "id_type": "model_id", 229 | "input_cost_per_1000_tokens": 0.001, 230 | "output_cost_per_1000_tokens": 0.002 231 | }, 232 | { 233 | "model_id": "gpt-3.5-turbo-0613", 234 | "id_type": "model_id", 235 | "input_cost_per_1000_tokens": 0.0015, 236 | "output_cost_per_1000_tokens": 0.002 237 | }, 238 | { 239 | "model_id": "gpt-3.5-turbo-16k-0613", 240 | "id_type": "model_id", 241 | "input_cost_per_1000_tokens": 0.003, 242 | "output_cost_per_1000_tokens": 0.004 243 | }, 244 | { 245 | "model_id": "gpt-3.5-turbo-0301", 246 | "id_type": "model_id", 247 | "input_cost_per_1000_tokens": 0.0015, 248 | "output_cost_per_1000_tokens": 0.002 249 | } 250 | ] 251 | 252 | 253 | @classmethod 254 | def _instance_pricing(self, instance_type): 255 | data = PricingCalculator._pricing_client.get_products(ServiceCode='AmazonEC2', Filters=[{"Field": "tenancy", "Value": "shared", "Type": "TERM_MATCH"}, 256 | {"Field": "operatingSystem", "Value": "Linux", "Type": "TERM_MATCH"}, 257 | {"Field": "preInstalledSw", "Value": "NA", "Type": "TERM_MATCH"}, 258 | {"Field": "instanceType", "Value": instance_type, "Type": "TERM_MATCH"}, 259 | {"Field": "marketoption", "Value": "OnDemand", "Type": "TERM_MATCH"}, 260 | {"Field": "regionCode", "Value": boto3.session.Session().region_name , "Type": "TERM_MATCH"}, 261 | {"Field": "capacitystatus", "Value": "Used", "Type": "TERM_MATCH"}]) 262 | for price in (json.loads(x) for x in data['PriceList']): 263 | first_id = list(price['terms']['OnDemand'].keys())[0] 264 | price_data = price['terms']['OnDemand'][first_id] 265 | second_id = list(price_data['priceDimensions'].keys())[0] 266 | instance_price = price_data['priceDimensions'][second_id]['pricePerUnit']['USD'] 267 | if 
float(instance_price) > 0:
268 |                 return float(instance_price)
269 |         raise Exception(f'Failed to get instance pricing for instance type {instance_type}')
270 | 
271 |     @classmethod
272 |     def retrieve_cost_structure(self, model_id):
273 |         # Prefer pricing fetched from the Pricing API, then fall back to the lookup table
274 |         model_name = PricingCalculator._model_name_by_id.get(model_id)
275 |         if model_name is not None:
276 |             # handle internal (Bedrock) model
277 |             if model_name in PricingCalculator._model_price_by_name:
278 |                 return PricingCalculator._model_price_by_name[model_name]
279 |         for model_cost in PricingCalculator._lookup_price_table:
280 |             if model_id in model_cost['model_id'] or model_id.split(':')[0] in model_cost['model_id'].split(':')[0]:
281 |                 return model_cost
282 | 
283 |     @classmethod
284 |     def _calculate_usage_cost(self, model_id, input_tokens: int = 0, output_tokens: int = 0, inference_time_s: float = 0, instance_type: str = None):
285 |         try:
286 |             cost_structure = PricingCalculator.retrieve_cost_structure(model_id)
287 |             if cost_structure is None:
288 |                 return None, None, None  # unknown model: no cost information available
289 |             if 'instance_type' in cost_structure and cost_structure['instance_type'] == instance_type:
290 |                 return PricingCalculator._calculate_usage_per_second(inference_time_s, cost_structure), cost_structure, PricingCalculator.COST_PER_HOUR
291 |             else:
292 |                 return PricingCalculator._calculate_usage_per_token(input_tokens, output_tokens, cost_structure), cost_structure, PricingCalculator.COST_PER_TOKEN
293 |         except Exception as e:
294 |             logger.error(f'Failed to calculate cost for model {model_id}, invocation parameters: {input_tokens}, {output_tokens}, {inference_time_s}')
295 |             raise e
296 | 
297 |     @classmethod
298 |     def _calculate_usage_per_second(self, inference_time_s: float = 0, cost_structure = None):
299 |         if 'hosting_cost_per_hour' in cost_structure:
300 |             # cast to float: the Pricing API returns prices as strings
301 |             return float(cost_structure['hosting_cost_per_hour']) * inference_time_s / (60*60)
302 |         return PricingCalculator._instance_pricing(cost_structure['instance_type']) * inference_time_s / (60*60)
303 |     @classmethod
304 |     def _calculate_usage_per_token(self, input_tokens, output_tokens, model_cost):
305 |         input_cost = float(model_cost['input_cost_per_1000_tokens']) * input_tokens / 1000
306 |         # 'output_cost_per_1000_tokens' is not always present; fall back to the input rate
307 |         try:
308 |             output_cost = float(model_cost['output_cost_per_1000_tokens']) * output_tokens / 1000
309 |         except Exception:
310 |             output_cost = float(model_cost['input_cost_per_1000_tokens']) * output_tokens / 1000
311 | 
312 |         return input_cost + output_cost
313 | 
314 |     @classmethod
315 |     def read_model_score_aggregate(self, model_name, folder):
316 |         '''Read model usage information from the test report and calculate the overall
317 |         cost based on the known pricing. It expects to find a file {folder}/{model_name}_usage.jsonl
318 |         containing one JSON line per invocation with these keys:
319 |             model_id        # name of the model as used in the invocation API
320 |             input_tokens    # number of tokens in the prompt
321 |             output_tokens   # number of tokens in the output
322 |             processing_time # total invocation time in seconds
323 |             instance_type   # type of the instance for models priced on hosting time
324 |         '''
325 |         file = f"{folder}/{model_name}_usage.jsonl"
326 |         if not os.path.exists(file):
327 |             return None
328 | 
329 |         # Initialize the sum dictionary
330 |         sum_dict = {
331 |             'input_tokens': 0,
332 |             'output_tokens': 0,
333 |             'processing_time': 0,
334 |             'cost': None,
335 |             'cost_model': None,
336 |             'cost_hour': None,
337 |             'cost_input_1M': None,
338 |             'cost_output_1M': None
339 |         }
340 |         samples = 0
341 | 
342 |         with open(file, 'r') as usage_file:
343 |             for line in usage_file:
344 |                 samples += 1
345 |                 item = json.loads(line)
346 |                 input_tokens = item.get('input_tokens', 0)
347 |                 output_tokens = item.get('output_tokens', 0)
348 |                 processing_time = item.get('processing_time', 0)
349 |                 cost, cost_structure, cost_model = PricingCalculator._calculate_usage_cost(item['model_id'], input_tokens, output_tokens, processing_time,
350 |                                                                                            item.get('instance_type'))
351 |                 if sum_dict['cost_model'] is None:
352 |                     sum_dict['cost_model'] = cost_model
353 |                 if cost_model == PricingCalculator.COST_PER_HOUR:
354 |                     sum_dict['cost_hour'] = float(cost_structure['hosting_cost_per_hour']) if 'hosting_cost_per_hour' in cost_structure else PricingCalculator._instance_pricing(cost_structure['instance_type'])
355 |                 if cost_model == PricingCalculator.COST_PER_TOKEN:
356 | 
357 |                     # cast to float: the Pricing API returns prices as strings
358 |                     sum_dict['cost_input_1M'] = float(cost_structure['input_cost_per_1000_tokens'])*1000.0
359 | 
360 |                     # fall back to the input rate when the output rate is missing
361 |                     try:
362 |                         output_cost_per_1000_tokens = float(cost_structure['output_cost_per_1000_tokens'])*1000.0
363 |                     except Exception:
364 |                         output_cost_per_1000_tokens = float(cost_structure['input_cost_per_1000_tokens'])*1000.0
365 | 
366 |                     sum_dict['cost_output_1M'] = output_cost_per_1000_tokens
367 |                 sum_dict['input_tokens'] += input_tokens
368 |                 sum_dict['output_tokens'] += output_tokens
369 |                 sum_dict['processing_time'] += processing_time
370 |                 if cost is None:
371 |                     continue
372 |                 if sum_dict['cost'] is None:
373 |                     sum_dict['cost'] = cost
374 |                 else:
375 |                     sum_dict['cost'] += cost
376 | 
377 | 
378 |         # Attach aggregate statistics
379 |         sum_dict['samples'] = samples
380 |         sum_dict['avg_cost'] = sum_dict['cost'] / samples if samples and sum_dict['cost'] is not None else None
381 |         sum_dict['avg_processing_time'] = sum_dict['processing_time'] / samples if samples else None
382 |         return sum_dict
383 | 
384 |     @classmethod
385 |     def cleanup_previous_runs(self, dir_path):
386 |         for file_name in os.listdir(dir_path):
387 |             if file_name.endswith('_usage.jsonl'):
388 |                 # Construct the full file path
389 |                 file_path = os.path.join(dir_path, file_name)
390 |                 os.remove(file_path)
391 | 
392 | PricingCalculator.static_init()
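393 | 
394 | # Illustrative usage sketch (not executed; the model key and folder below are
395 | # hypothetical). Given /tmp/jsonl_model_files/my-model_usage.jsonl with one JSON
396 | # line per invocation, e.g.
397 | #   {"model_id": "anthropic.claude-3-haiku-20240307-v1:0", "input_tokens": 512,
398 | #    "output_tokens": 96, "processing_time": 1.4}
399 | # the aggregates can be read back with:
400 | #   totals = PricingCalculator.read_model_score_aggregate('my-model', '/tmp/jsonl_model_files')
401 | # For this per-token model the call sums rate * tokens / 1000 per record, i.e.
402 | #   0.00025 * 512 / 1000 + 0.00125 * 96 / 1000 = 0.000248 USD for the sample line.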
--------------------------------------------------------------------------------
/summariziation_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "3a0e44ff-c46c-4998-88d7-fb3aee89841d",
6 |    "metadata": {},
7 |    "source": [
8 |     "# Text generation models evaluation\n",
9 |     "\n",
10 |     "#### This notebook evaluates several LLMs from Bedrock, HuggingFace, JumpStart, and Bedrock finetuned models\n",
11 |     "#### Instance type used for the evaluation - ml.g4dn.2xlarge or m5.2xlarge, Python 3.10\n",
12 |     "#### The metrics evaluated are N-gram matching-based ([ROUGE](https://en.wikipedia.org/wiki/ROUGE_(metric)), [METEOR](https://en.wikipedia.org/wiki/METEOR)) and semantic-based ([BERTScore](https://arxiv.org/abs/1904.09675)) from the [FMEval](https://github.com/aws/fmeval/) library (can be further customized), and [BARTScore](https://arxiv.org/abs/2106.11520) using an encoder-decoder architecture\n",
13 |     "#### The dataset used is [TweetSumm](https://github.com/guyfe/Tweetsumm) (A Dialog Summarization Dataset for Customer Service, published in EMNLP 21)"
14 |    ]
15 |   },
16 |   {
17 |    "cell_type": "code",
18 |    "execution_count": 1,
19 |    "id": "68c047c8-0168-45eb-9d59-dda0117ff703",
20 |    "metadata": {},
21 |    "outputs": [],
22 |    "source": [
23 |     "%load_ext autoreload\n",
24 |     "%autoreload 2\n",
25 |     "\n",
26 |     "# Optional S3 path to upload results to (e.g. s3://yourbucket/results/ ) - handy as a way to download the results and open the HTML report on a local machine\n",
27 |     "S3_OUTPUT_PATH = None \n",
28 |     "\n",
29 |     "MODELS_TO_EVAL = [] # if the list is empty, all the models available are evaluated. For specific models, mention their ids from the list below, for example [\"anthropic.claude-v2:1\", \"amazon.titan-text-lite-v1\"]"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "code",
34 |    "execution_count": 2,
35 |    "id": "eb788c86-761a-4749-927f-737c194e4613",
36 |    "metadata": {
37 |     "tags": []
38 |    },
39 |    "outputs": [],
40 |    "source": [
41 |     "!pip install --upgrade pip --quiet\n",
42 |     "!pip install -r requirements.txt --quiet"
43 |    ]
44 |   },
45 |   {
46 |    "cell_type": "markdown",
47 |    "id": "21f5d06f",
48 |    "metadata": {},
49 |    "source": [
50 |     "### OpenAI API key\n",
51 |     "This is relevant if you'll be using models from OpenAI.\n",
52 |     "\n",
53 |     "- Create a new file called `utils/key.py` in your project directory to store your API key.\n",
54 |     "- Go to your OpenAI account and navigate to \"[View API keys](https://platform.openai.com/account/api-keys).\"\n",
55 |     "- Select \"Create new secret key.\"\n",
56 |     "- Copy the key and insert it into your file `utils/key.py` like this:\n",
57 |     "```\n",
58 |     "OPENAI_API_KEY = 'sk-actualLongKeyGoesHere123'\n",
59 |     "```\n",
60 |     "- Save the changes\n",
61 |     "- IMPORTANT: Do **not** commit `key.py` to source control as it will contain your private key. (It should already be in `.gitignore`.) Review [this information about API safety](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).\n",
62 |     "- Uncomment `from utils.key import OPENAI_API_KEY` below."
63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "id": "75cfe9a8", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "OPENAI_API_KEY = None # uncommenting the line below will override this\n", 73 | "#from utils.key import OPENAI_API_KEY" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "d4fbff43-d21a-40c6-b430-3def8ae7c268", 79 | "metadata": { 80 | "tags": [] 81 | }, 82 | "source": [ 83 | "## Define bucket config" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "id": "11956018-0992-4ec5-bb51-d73b1c017988", 90 | "metadata": { 91 | "tags": [] 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "import json\n", 96 | "from pathlib import Path\n", 97 | "import boto3\n", 98 | "import glob\n", 99 | "import shutil\n", 100 | "import os\n", 101 | "from os import listdir\n", 102 | "\n", 103 | "from fmeval.model_runners.bedrock_model_runner import BedrockModelRunner\n", 104 | "from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner\n", 105 | "\n", 106 | "from utils.model_runners.gpt_model_runner import GPTModelConfig, GPTModelRunner\n", 107 | "from utils.tweetsumm_data_creator import create_train_test_files\n", 108 | "from utils.model_ranker import create_model_ranking\n", 109 | "from utils.dashboard_creators.output_viewer_creator import create_response_output_view\n", 110 | "from utils.dashboard_creators.comparative_dashboard_creator import create_comparive_dashboard\n", 111 | "from utils.dashboard_creators.data_stats_viewer_creator import create_data_stats_view\n", 112 | "from utils.dashboard_creators.data_preview_viewer import create_data_preview_view\n", 113 | "from utils.dashboard_creators.main_html_creator import create_main_html\n", 114 | "from utils.metrics.bart_score import calculate_bartscore\n", 115 | "\n", 116 | "RESULT_FOLDER = \"/tmp/final_result\"\n", 117 | "if os.path.exists(RESULT_FOLDER):\n", 118 | " shutil.rmtree(RESULT_FOLDER)\n", 119 | "os.mkdir(RESULT_FOLDER)\n", 120 | "\n", 121 | "TMP_JSON_FILES = \"/tmp/jsonl_model_files\"\n", 122 | "if os.path.exists(TMP_JSON_FILES):\n", 123 | " shutil.rmtree(TMP_JSON_FILES)\n", 124 | "os.mkdir(TMP_JSON_FILES)\n", 125 | "\n", 126 | "TMP_DATASET_FILES = \"/tmp/dataset_files\"\n", 127 | "if os.path.exists(TMP_DATASET_FILES):\n", 128 | " shutil.rmtree(TMP_DATASET_FILES)\n", 129 | "os.mkdir(TMP_DATASET_FILES)\n", 130 | "\n", 131 | "RESULT_HTML_FOLDER = RESULT_FOLDER + \"/html_files\"\n", 132 | "if os.path.exists(RESULT_HTML_FOLDER):\n", 133 | " shutil.rmtree(RESULT_HTML_FOLDER)\n", 134 | "os.mkdir(RESULT_HTML_FOLDER)\n", 135 | "\n", 136 | "RESULT_IMG_FOLDER = RESULT_FOLDER + \"/imgs\"\n", 137 | "if os.path.exists(RESULT_IMG_FOLDER):\n", 138 | " shutil.rmtree(RESULT_IMG_FOLDER)\n", 139 | "os.mkdir(RESULT_IMG_FOLDER)\n", 140 | "\n", 141 | "from utils.tweetsumm_data_creator import create_train_test_files\n", 142 | "TEST_FILE_PATH = create_train_test_files(TMP_DATASET_FILES) # creating train and test files\n", 143 | "print(TEST_FILE_PATH)\n", 144 | "\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "805a915b-4918-4bf4-994a-7cff3701ec91", 150 | "metadata": {}, 151 | "source": [ 152 | "## List the models to benchmark" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 5, 158 | "id": "c05192f6-a93c-4c76-bb2b-d88e7205426f", 159 | "metadata": { 160 | "tags": [] 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# Bedrock models\n", 165 | "models_to_test = {}\n", 166 | "\n", 167 | "# Add Bedrock 
Random text generating model to serve as baseline calibration for the various metrics\n",
168 |     "models_to_test.update({\n",
169 |     "    \"random\" : { \n",
170 |     "        \"model_id\" : \"amazon.titan-text-lite-v1\", \n",
171 |     "        \"platform\" : \"bedrock\",\n",
172 |     "        \"output\" : \"results[0].outputText\", \n",
173 |     "        \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
174 |     "        \"prompt_template\" : \"Please ignore the following blob of text and create an unrelated text of around 2 sentences\\n $model_input\\n\"\n",
175 |     "    }\n",
176 |     "})\n",
177 |     "\n",
178 |     "# Add Bedrock Anthropic models in zero-shot\n",
179 |     "models_to_test.update({\n",
180 |     "    \"anthropic.claude-3-sonnet\" : { \n",
181 |     "        \"model_id\" : \"anthropic.claude-3-sonnet-20240229-v1:0\", \n",
182 |     "        \"platform\" : \"bedrock\",\n",
183 |     "        \"output\" : \"content[0].text\", \n",
184 |     "        \"content_template\" : \"{\\\"messages\\\": [{\\\"role\\\": \\\"user\\\", \\\"content\\\": $prompt}], \\\"max_tokens\\\": 100, \\\"anthropic_version\\\": \\\"bedrock-2023-05-31\\\"}\",\n",
185 |     "        \"prompt_template\" : \"Below is a dialog between a customer and an agent. Please provide a short and concise summary of the conversation. The summary should be short and include a single sentence describing the customer's complaint or request, and a single sentence of the agent's response or action. Please write the summary in a human readable format. Start your answer directly with the summary without any additional prefix.\\n Specify important and relevant amounts, dates and locations inside the summary. Here is the dialog: $model_input\"\n",
186 |     "    },\n",
187 |     "    \"anthropic.claude-3-haiku\" : { \n",
188 |     "        \"model_id\" : \"anthropic.claude-3-haiku-20240307-v1:0\", \n",
189 |     "        \"platform\" : \"bedrock\",\n",
190 |     "        \"output\" : \"content[0].text\", \n",
191 |     "        \"content_template\" : \"{\\\"messages\\\": [{\\\"role\\\": \\\"user\\\", \\\"content\\\": $prompt}], \\\"max_tokens\\\": 100, \\\"anthropic_version\\\": \\\"bedrock-2023-05-31\\\"}\",\n",
192 |     "        \"prompt_template\" : \"Below is a dialog between a customer and an agent. Please provide a short and concise summary of the conversation. The summary should be short and include a single sentence describing the customer's complaint or request, and a single sentence of the agent's response or action. Please write the summary in a human readable format. Start your answer directly with the summary without any additional prefix.\\n Specify important and relevant amounts, dates and locations inside the summary. Here is the dialog: $model_input\"\n",
193 |     "    }\n",
194 |     "})\n",
195 |     "\n",
196 |     "# Add Bedrock Amazon Titan models in zero-shot\n",
197 |     "models_to_test.update({\n",
198 |     "    \"amazon.titan-text-lite-v1\" : { \n",
199 |     "        \"model_id\" : \"amazon.titan-text-lite-v1\", \n",
200 |     "        \"platform\" : \"bedrock\",\n",
201 |     "        \"output\" : \"results[0].outputText\", \n",
202 |     "        \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
203 |     "        \"prompt_template\" : \"Please provide a short and concise summary of the conversation below. The summary should be short and include a single sentence describing the customer's complaint or request, and a single sentence of the agent's response or action. 
Do not include any additional information that does not appear in the dialog. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\\n$model_input\\n\\nsummary:\\n\"\n", 204 | " },\n", 205 | " \"amazon.titan-text-express-v1\" :{ \n", 206 | " \"model_id\" : \"amazon.titan-text-express-v1\", \n", 207 | " \"platform\" : \"bedrock\",\n", 208 | " \"output\" : \"results[0].outputText\", \n", 209 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n", 210 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\\n $model_input\\n\\nsummary:\\n\"\n", 211 | " },\n", 212 | "})\n", 213 | "\n", 214 | "# Add Cohere and Llama2 Bedrock models in zero-shot\n", 215 | "models_to_test.update({\n", 216 | " \"cohere.command-light-text-v14\" :{ \n", 217 | " \"model_id\" : \"cohere.command-light-text-v14\", \n", 218 | " \"platform\" : \"bedrock\",\n", 219 | " \"output\" : \"generations[0].text\", \n", 220 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_tokens\\\": 100}\",\n", 221 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\\n $model_input\\n\\nsummary:\\n\"\n", 222 | " },\n", 223 | " \"meta.llama2-13b-chat-v1\" :{ \n", 224 | " \"model_id\" : \"meta.llama2-13b-chat-v1\", \n", 225 | " \"platform\" : \"bedrock\",\n", 226 | " \"output\" : \"generation\", \n", 227 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_gen_len\\\": 100, \\\"top_p\\\": 1, \\\"temperature\\\": 1.0}\",\n", 228 | " \"prompt_template\" : \"[INST]Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:[/INST]\\n Transcript\\n $model_input \\n\\n Summary:\\n\"\n", 229 | " },\n", 230 | "})\n", 231 | "\n", 232 | "# Add various Bedrock models in one-shot\n", 233 | "models_to_test.update({\n", 234 | " \"amazon.titan-text-lite-v1-one-shot\" : { \n", 235 | " \"model_id\" : \"amazon.titan-text-lite-v1\", \n", 236 | " \"platform\" : \"bedrock\",\n", 237 | " \"output\" : \"results[0].outputText\", \n", 238 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n", 239 | " \"prompt_template\" : \"[INST]Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. \\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. 
I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\\n\\n [/INST] [INST]\\n Transcript:\\n $model_input [/INST]\\n Summary:\"\n", 240 | " },\n", 241 | " \"meta.llama2-13b-chat-v1-one-shot\" :{ \n", 242 | " \"model_id\" : \"meta.llama2-13b-chat-v1\", \n", 243 | " \"platform\" : \"bedrock\",\n", 244 | " \"output\" : \"generation\", \n", 245 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_gen_len\\\": 100, \\\"top_p\\\": 1, \\\"temperature\\\": 1.0}\",\n", 246 | " \"prompt_template\" : \"[INST] <> Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary.< \\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\\n\\n [/INST] [INST]\\n Transcript:\\n $model_input [/INST] Summary:\"\n", 247 | " },\n", 248 | " \"cohere.command-light-text-v14-one-shot\" :{ \n", 249 | " \"model_id\" : \"cohere.command-light-text-v14\", \n", 250 | " \"platform\" : \"bedrock\",\n", 251 | " \"output\" : \"generations[0].text\", \n", 252 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_tokens\\\": 100}\",\n", 253 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. 
Specify important and relevant amounts, dates and locations inside the sentences of the summary.\\n\\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\\n\\nTranscript:\\n $model_input\\n Summary:\"\n", 254 | " },\n", 255 | " \"amazon.titan-text-express-v1-one-shot\" :{ \n", 256 | " \"model_id\" : \"amazon.titan-text-express-v1\", \n", 257 | " \"platform\" : \"bedrock\",\n", 258 | " \"output\" : \"results[0].outputText\", \n", 259 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n", 260 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary.\\n\\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. 
Agent updated to return the television with their credit/debit card.\\n\\nTranscript:\\n $model_input\\n Summary:\"\n",
261 |     "    },\n",
262 |     "})\n",
263 |     "\n",
264 |     "# Add OpenAI models in zero-shot\n",
265 |     "models_to_test.update({\n",
266 |     "    \"gpt-3.5-turbo-0125\" :{ \n",
267 |     "        \"model_id\" : \"gpt-3.5-turbo-0125\", \n",
268 |     "        \"api_key\" : OPENAI_API_KEY,\n",
269 |     "        \"platform\" : \"openai\",\n",
270 |     "        \"temperature\" : 1,\n",
271 |     "        \"top_p\" : 1,\n",
272 |     "        \"max_tokens\" : 100,\n",
273 |     "        \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary.\\n Transcript:\\n $model_input \\n Summary:\\n\"\n",
274 |     "    }\n",
275 |     "})\n"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "markdown",
280 |    "id": "cb20053f-e76e-4ffc-b94d-a6589409776c",
281 |    "metadata": {},
282 |    "source": [
283 |     "## Adding your own custom models\n",
284 |     "In case you wish to add a custom model, simply create a custom model runner. For example, see the custom model runner that wraps GPT-3.5 in utils/model_runners/gpt_model_runner.py.
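 A minimal sketch is shown below (illustrative only: `MyModelRunner`, `my_client`, and its `generate` call are hypothetical placeholders; adapt them to your endpoint and to the `ModelRunner` interface of your fmeval version):\n\n```python\nfrom fmeval.model_runners.model_runner import ModelRunner\n\nclass MyModelRunner(ModelRunner):\n    def predict(self, prompt: str):\n        # invoke your model here; fmeval expects (output_text, log_probability)\n        output_text = my_client.generate(prompt)  # hypothetical client call\n        return output_text, None\n```\n\n",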
285 |     "\n",
286 |     "\n",
287 |     "## Adding finetuned models\n",
288 |     "In case you wish to add a Bedrock finetuned model: \n",
289 |     "1. First finetune a model (for details on finetuning on Bedrock visit https://aws.amazon.com/blogs/aws/customize-models-in-amazon-bedrock-with-your-own-data-using-fine-tuning-and-continued-pre-training/).\n",
290 |     "2. Once training is completed, copy the ARN from Bedrock's 'provisioned throughput' dashboard and paste it as the model_id. A finetuning training set is provided. For more details see the documentation.\n",
291 |     "3. Add to the model_dict in the cell above the configuration of your finetuned model as follows:\n",
292 |     "\n",
293 |     "\n",
294 |     "{\n",
295 |     "    \"finetuned_amazon.titan-text-lite-v1\" : {\n",
296 |     "        \"platform\":\"bedrock\",\n",
297 |     "        \"model_id\": \"arn:aws:bedrock:us-east-1:333333333:provisioned-model/879asd6s75\",\n",
298 |     "        \"output\": \"results[0].outputText\",\n",
299 |     "        \"content_template\": \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
300 |     "        \"prompt_template\": \"YOUR PROMPT HERE\"\n",
301 |     "    }\n",
302 |     "}\n",
303 |     "\n",
304 |     "\n",
305 |     "\n",
306 |     "## Adding Jumpstart models\n",
307 |     "Example for evaluating Mistral-7B-Instruct from JumpStart:\n",
308 |     "1. Go to JumpStart (press home button -> JumpStart)\n",
309 |     "2. Search in the bar for Mistral-7B-Instruct\n",
310 |     "3. Click deploy from the model card (don't forget to close the endpoint once you're done from SageMaker->inference endpoints)\n",
311 |     "4. Add the following to the models list:\n",
312 |     "\n",
313 |     "{\n",
314 |     "    \"platform\":\"jumpstart\",\n",
315 |     "    \"model_id\": \"huggingface-llm-mistral-7b-instruct\",\n",
316 |     "    \"endpoint_name\": \"jumpstart-dft-hf-llm-mistral-7b-instruct\",\n",
317 |     "    \"model_version\": \"*\",\n",
318 |     "    \"output\": \"[0].generated_text\",\n",
319 |     "    \"content_template\":\"{\\\"inputs\\\": $prompt, \\\"parameters\\\": {\\\"do_sample\\\": false, \\\"max_new_tokens\\\": 100}}\",\n",
320 |     "    \"prompt_template\": \"YOUR PROMPT HERE\"\n",
321 |     "}\n",
322 |     "\n"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "markdown",
327 |    "id": "ac160609-dfb3-4eb5-bac0-753053f27184",
328 |    "metadata": {
329 |     "tags": []
330 |    },
331 |    "source": [
332 |     "## Creating ModelRunner"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": 6,
338 |    "id": "a9466069-430a-4f71-80a2-c0c6e3b4e918",
339 |    "metadata": {
340 |     "tags": []
341 |    },
342 |    "outputs": [],
343 |    "source": [
344 |     "from utils.model_runners.bedrock_counting_runner import CountingBedrockModelRunner\n",
345 |     "\n",
346 |     "\n",
347 |     "def get_models_to_eval():\n",
348 |     "    if len(MODELS_TO_EVAL) == 0:\n",
349 |     "        return list(models_to_test.keys())\n",
350 |     "    return MODELS_TO_EVAL\n",
351 |     "\n",
352 |     "models = dict() \n",
353 |     "for fm in get_models_to_eval(): \n",
354 |     "    \n",
355 |     "    data = models_to_test[fm]\n",
356 |     "    platform = data['platform']\n",
357 |     "    \n",
358 |     "    if platform == \"bedrock\":\n",
359 |     "        runner = CountingBedrockModelRunner(model_id=data[\"model_id\"], output=data[\"output\"], content_template=data[\"content_template\"].replace(\"'\",\"\\\"\"),metrics_folder = TMP_JSON_FILES, model_key = fm)\n",
360 |     "    elif platform == \"jumpstart\":\n",
361 |     "        runner = JumpStartModelRunner(endpoint_name=data[\"endpoint_name\"], model_id=data[\"model_id\"], model_version=data[\"model_version\"], output=data[\"output\"].replace(\"'\",\"\\\"\"), content_template=data[\"content_template\"].replace(\"'\",\"\\\"\"))\n",
362 |     "    elif platform == \"openai\":\n",
363 |     "        if OPENAI_API_KEY:\n",
364 |     "            runner = GPTModelRunner(GPTModelConfig(model_id=data[\"model_id\"], api_key=data[\"api_key\"], temperature=data[\"temperature\"], top_p=data[\"top_p\"], max_tokens=data[\"max_tokens\"]),metrics_folder = TMP_JSON_FILES, model_key = fm)\n",
365 |     "        else:\n",
366 |     "            print(\"Skipping OpenAI models - Cannot run without an API key\")\n",
367 |     "            continue\n",
368 |     "    \n",
369 |     "    models[fm] = { \"model_runner\": runner, \"prompt_template\": data[\"prompt_template\"]}\n"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "markdown",
374 |    "id": "e6198a20-9a87-4a16-af6d-35733f9845c7",
375 |    "metadata": {},
376 |    "source": [
377 |     "## Evaluation run\n",
378 |     "Evaluating METEOR, ROUGE, and BERTScore using the FMEval library (https://github.com/aws/fmeval). This library is also used by Bedrock when finetuning or evaluating models.\n",
379 |     "\n",
380 |     "#### Note - if, while running this cell, you encounter the message \"Error displaying widget: model not found\" during the evaluation phase, simply ignore it. It relates to the UI and does not affect the evaluation.
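\n\nAfter the loop in the next cell completes, `models_scores` maps each model key to its aggregate metric values and `models_usage` holds the matching usage/cost aggregates. For illustration only (hypothetical numbers; the exact metric names depend on your fmeval version):\n\n```python\nmodels_scores = {\n    'anthropic.claude-3-haiku': {'meteor': 0.31, 'rouge': 0.22, 'bertscore': 0.87},\n    'amazon.titan-text-lite-v1': {'meteor': 0.27, 'rouge': 0.19, 'bertscore': 0.85},\n}\n```"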
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": 7,
386 |    "id": "7ec118e5-b099-49af-998a-1cacf6a7664b",
387 |    "metadata": {
388 |     "tags": []
389 |    },
390 |    "outputs": [],
391 |    "source": [
392 |     "from fmeval.data_loaders.data_config import DataConfig\n",
393 |     "from fmeval.constants import MIME_TYPE_JSONLINES\n",
394 |     "from fmeval.eval_algorithms.summarization_accuracy import SummarizationAccuracy, SummarizationAccuracyConfig\n",
395 |     "from utils.model_runners.pricing_calculator import PricingCalculator\n",
396 |     "import pandas as pd\n",
397 |     "import os\n",
398 |     "\n",
399 |     "os.environ[\"PARALLELIZATION_FACTOR\"] = \"1\" # use a single worker for FMEval\n",
400 |     "TMP_JSON_FILES = \"/tmp/jsonl_model_files\"\n",
401 |     "if os.path.exists(TMP_JSON_FILES):\n",
402 |     "    shutil.rmtree(TMP_JSON_FILES)\n",
403 |     "os.mkdir(TMP_JSON_FILES)\n",
404 |     "\n",
405 |     "models_scores = dict()\n",
406 |     "models_usage = dict()\n",
407 |     "models_to_eval = get_models_to_eval()\n",
408 |     "for model_id in models_to_eval:\n",
409 |     "    print(f\"### Starting model {model_id} evaluation\")\n",
410 |     "    if model_id not in models:\n",
411 |     "        print(f\"### Model {model_id} doesn't have a valid/complete entry in the model list\")\n",
412 |     "        continue\n",
413 |     "    model = models[model_id]\n",
414 |     "    config = DataConfig(\n",
415 |     "        dataset_name=\"data\",\n",
416 |     "        dataset_uri=TEST_FILE_PATH,\n",
417 |     "        dataset_mime_type=MIME_TYPE_JSONLINES,\n",
418 |     "        model_input_location=\"document\",\n",
419 |     "        target_output_location=\"summary\"\n",
420 |     "    )\n",
421 |     "\n",
422 |     "    model_runner = model['model_runner']\n",
423 |     "    eval_algo = SummarizationAccuracy(SummarizationAccuracyConfig())\n",
424 |     "    eval_output = eval_algo.evaluate(model=model_runner, \n",
425 |     "                                     dataset_config=config,\n",
426 |     "                                     prompt_template=model[\"prompt_template\"],\n",
427 |     "                                     num_records=10,\n",
428 |     "                                     save=True)\n",
429 |     "\n",
430 |     "    scores = dict()\n",
431 |     "    for i in eval_output[0].dataset_scores:\n",
432 |     "        scores[i.name] = i.value\n",
433 |     "    \n",
434 |     "    models_scores[model_id] = scores\n",
435 |     "    models_usage[model_id] = PricingCalculator.read_model_score_aggregate(model_id, TMP_JSON_FILES)\n",
436 |     "    shutil.move('/tmp/eval_results/summarization_accuracy_data.jsonl', f'{TMP_JSON_FILES}/{model_id}_metrics.jsonl')\n"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "markdown",
441 |    "id": "ecf0a7ea-0ae1-423a-949a-6526b5b97ef9",
442 |    "metadata": {},
443 |    "source": [
444 |     "## Calculate BARTscore"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "code",
449 |    "execution_count": 8,
450 |    "id": "5e019170-e364-4c54-a030-b00207bcc5e8",
451 |    "metadata": {
452 |     "tags": []
453 |    },
454 |    "outputs": [],
455 |    "source": [
456 |     "### Metrics to calc\n",
457 |     "# BARTscore - for more details https://github.com/neulab/BARTScore/blob/main/README.md\n",
458 |     "CALC_BARTSCORE = True\n",
459 |     "\n",
460 |     "PATH_TO_FINETUNED_BART = \"\" # if left empty, vanilla BART is used. If you wish to load the finetuned BART, go to BARTScore's GitHub, download the bart_score.pth (it appears in the README) and provide the path here\n",
461 |     "if CALC_BARTSCORE:\n",
462 |     "    calculate_bartscore(TMP_JSON_FILES, models_scores, PATH_TO_FINETUNED_BART)"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "markdown",
467 |    "id": "604b4580-546c-48ca-be94-fffabd1cc280",
468 |    "metadata": {},
469 |    "source": [
470 |     "## Create Leaderboard Report HTML"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": 9,
476 |    "id": "0c6485fe-e5b4-41b1-8323-470b963c8c20",
477 |    "metadata": {
478 |     "tags": []
479 |    },
480 |    "outputs": [],
481 |    "source": [
482 |     "from utils.model_ranker import create_model_ranking\n",
483 |     "create_response_output_view(RESULT_HTML_FOLDER, TMP_JSON_FILES, models_scores)\n",
484 |     "create_comparive_dashboard(RESULT_HTML_FOLDER, TMP_JSON_FILES)\n",
485 |     "create_data_stats_view(TEST_FILE_PATH, RESULT_IMG_FOLDER)\n",
486 |     "create_data_preview_view(TEST_FILE_PATH, RESULT_HTML_FOLDER)\n",
487 |     "main_html_filename = create_main_html(RESULT_FOLDER, models_scores, models_usage)\n",
488 |     "\n",
489 |     "print(f\"Created leaderboard in: {main_html_filename}\")\n",
490 |     "\n",
491 |     "# archive entire report\n",
492 |     "from datetime import datetime\n",
493 |     "today = datetime.now()\n",
494 |     "my_datetime = str(today.strftime(\"%d-%m-%Y_%H-%M-%S\"))\n",
495 |     "zip_filename_fullpath = shutil.make_archive(f\"/tmp/{my_datetime}\", 'zip', \"/tmp/final_result\")\n",
496 |     "zip_filename = zip_filename_fullpath.split(\"/\")[-1] # filename without folders\n",
497 |     "print(f\"Archived report in: {zip_filename_fullpath}\")"
498 |    ]
499 |   },
500 |   {
501 |    "cell_type": "markdown",
502 |    "id": "36570a56-4ba5-4143-aac0-b4ba99afce65",
503 |    "metadata": {},
504 |    "source": [
505 |     "## Upload Report to S3"
506 |    ]
507 |   },
508 |   {
509 |    "cell_type": "code",
510 |    "execution_count": null,
511 |    "id": "75e36a98-6c1d-49ed-8f43-e2992be1e96d",
512 |    "metadata": {
513 |     "tags": []
514 |    },
515 |    "outputs": [],
516 |    "source": [
517 |     "if S3_OUTPUT_PATH: # if defined S3\n",
518 |     "    s3_key = f\"{S3_OUTPUT_PATH}/{zip_filename}\"\n",
519 |     "    !aws s3 cp {zip_filename_fullpath} {s3_key}\n",
520 |     "    print(f\"Uploaded to: {s3_key}\")\n",
521 |     "else:\n",
522 |     "    print(f\"No S3_OUTPUT_PATH set, not uploading {zip_filename}\")"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "markdown",
527 |    "id": "983658a1",
528 |    "metadata": {},
529 |    "source": [
530 |     "## Viewing results"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": 10,
536 |    "id": "ae6175ca",
537 |    "metadata": {},
538 |    "outputs": [],
539 |    "source": [
540 |     "if S3_OUTPUT_PATH:\n",
541 |     "    print(f'If running on a *remote* machine, to view the results on your local computer copy-paste these commands in your terminal:\\n\\\n",
542 |     "    aws s3 cp {s3_key} /tmp/{zip_filename}\\n\\\n",
543 |     "    cd /tmp\\n\\\n",
544 |     "    unzip -d {zip_filename.replace(\".zip\",\"\")} {zip_filename}\\n\\\n",
545 |     "    open /tmp/{zip_filename.replace(\".zip\",\"\")}/index.html\\n')\n",
546 |     "\n",
547 |     "print(f'If running on a *local* machine copy-paste these commands in your terminal:\\n\\\n",
548 |     "    open {main_html_filename}')"
549 |    ]
550 |   }
551 |  ],
552 |  "metadata": {
553 |   "availableInstances": [
554 |    {
555 |     "_defaultOrder": 0,
556 |     "_isFastLaunch": true,
557 |     "category": "General purpose",
558 |     "gpuNum": 0,
559 |     "hideHardwareSpecs": false,
560 |     "memoryGiB": 4,
561 |     "name": "ml.t3.medium",
562 |     "vcpuNum": 2
563 |    },
564 |    {
565 | 
"_defaultOrder": 1, 566 | "_isFastLaunch": false, 567 | "category": "General purpose", 568 | "gpuNum": 0, 569 | "hideHardwareSpecs": false, 570 | "memoryGiB": 8, 571 | "name": "ml.t3.large", 572 | "vcpuNum": 2 573 | }, 574 | { 575 | "_defaultOrder": 2, 576 | "_isFastLaunch": false, 577 | "category": "General purpose", 578 | "gpuNum": 0, 579 | "hideHardwareSpecs": false, 580 | "memoryGiB": 16, 581 | "name": "ml.t3.xlarge", 582 | "vcpuNum": 4 583 | }, 584 | { 585 | "_defaultOrder": 3, 586 | "_isFastLaunch": false, 587 | "category": "General purpose", 588 | "gpuNum": 0, 589 | "hideHardwareSpecs": false, 590 | "memoryGiB": 32, 591 | "name": "ml.t3.2xlarge", 592 | "vcpuNum": 8 593 | }, 594 | { 595 | "_defaultOrder": 4, 596 | "_isFastLaunch": true, 597 | "category": "General purpose", 598 | "gpuNum": 0, 599 | "hideHardwareSpecs": false, 600 | "memoryGiB": 8, 601 | "name": "ml.m5.large", 602 | "vcpuNum": 2 603 | }, 604 | { 605 | "_defaultOrder": 5, 606 | "_isFastLaunch": false, 607 | "category": "General purpose", 608 | "gpuNum": 0, 609 | "hideHardwareSpecs": false, 610 | "memoryGiB": 16, 611 | "name": "ml.m5.xlarge", 612 | "vcpuNum": 4 613 | }, 614 | { 615 | "_defaultOrder": 6, 616 | "_isFastLaunch": false, 617 | "category": "General purpose", 618 | "gpuNum": 0, 619 | "hideHardwareSpecs": false, 620 | "memoryGiB": 32, 621 | "name": "ml.m5.2xlarge", 622 | "vcpuNum": 8 623 | }, 624 | { 625 | "_defaultOrder": 7, 626 | "_isFastLaunch": false, 627 | "category": "General purpose", 628 | "gpuNum": 0, 629 | "hideHardwareSpecs": false, 630 | "memoryGiB": 64, 631 | "name": "ml.m5.4xlarge", 632 | "vcpuNum": 16 633 | }, 634 | { 635 | "_defaultOrder": 8, 636 | "_isFastLaunch": false, 637 | "category": "General purpose", 638 | "gpuNum": 0, 639 | "hideHardwareSpecs": false, 640 | "memoryGiB": 128, 641 | "name": "ml.m5.8xlarge", 642 | "vcpuNum": 32 643 | }, 644 | { 645 | "_defaultOrder": 9, 646 | "_isFastLaunch": false, 647 | "category": "General purpose", 648 | "gpuNum": 0, 649 | "hideHardwareSpecs": false, 650 | "memoryGiB": 192, 651 | "name": "ml.m5.12xlarge", 652 | "vcpuNum": 48 653 | }, 654 | { 655 | "_defaultOrder": 10, 656 | "_isFastLaunch": false, 657 | "category": "General purpose", 658 | "gpuNum": 0, 659 | "hideHardwareSpecs": false, 660 | "memoryGiB": 256, 661 | "name": "ml.m5.16xlarge", 662 | "vcpuNum": 64 663 | }, 664 | { 665 | "_defaultOrder": 11, 666 | "_isFastLaunch": false, 667 | "category": "General purpose", 668 | "gpuNum": 0, 669 | "hideHardwareSpecs": false, 670 | "memoryGiB": 384, 671 | "name": "ml.m5.24xlarge", 672 | "vcpuNum": 96 673 | }, 674 | { 675 | "_defaultOrder": 12, 676 | "_isFastLaunch": false, 677 | "category": "General purpose", 678 | "gpuNum": 0, 679 | "hideHardwareSpecs": false, 680 | "memoryGiB": 8, 681 | "name": "ml.m5d.large", 682 | "vcpuNum": 2 683 | }, 684 | { 685 | "_defaultOrder": 13, 686 | "_isFastLaunch": false, 687 | "category": "General purpose", 688 | "gpuNum": 0, 689 | "hideHardwareSpecs": false, 690 | "memoryGiB": 16, 691 | "name": "ml.m5d.xlarge", 692 | "vcpuNum": 4 693 | }, 694 | { 695 | "_defaultOrder": 14, 696 | "_isFastLaunch": false, 697 | "category": "General purpose", 698 | "gpuNum": 0, 699 | "hideHardwareSpecs": false, 700 | "memoryGiB": 32, 701 | "name": "ml.m5d.2xlarge", 702 | "vcpuNum": 8 703 | }, 704 | { 705 | "_defaultOrder": 15, 706 | "_isFastLaunch": false, 707 | "category": "General purpose", 708 | "gpuNum": 0, 709 | "hideHardwareSpecs": false, 710 | "memoryGiB": 64, 711 | "name": "ml.m5d.4xlarge", 712 | "vcpuNum": 16 713 | }, 714 | { 715 | 
"_defaultOrder": 16, 716 | "_isFastLaunch": false, 717 | "category": "General purpose", 718 | "gpuNum": 0, 719 | "hideHardwareSpecs": false, 720 | "memoryGiB": 128, 721 | "name": "ml.m5d.8xlarge", 722 | "vcpuNum": 32 723 | }, 724 | { 725 | "_defaultOrder": 17, 726 | "_isFastLaunch": false, 727 | "category": "General purpose", 728 | "gpuNum": 0, 729 | "hideHardwareSpecs": false, 730 | "memoryGiB": 192, 731 | "name": "ml.m5d.12xlarge", 732 | "vcpuNum": 48 733 | }, 734 | { 735 | "_defaultOrder": 18, 736 | "_isFastLaunch": false, 737 | "category": "General purpose", 738 | "gpuNum": 0, 739 | "hideHardwareSpecs": false, 740 | "memoryGiB": 256, 741 | "name": "ml.m5d.16xlarge", 742 | "vcpuNum": 64 743 | }, 744 | { 745 | "_defaultOrder": 19, 746 | "_isFastLaunch": false, 747 | "category": "General purpose", 748 | "gpuNum": 0, 749 | "hideHardwareSpecs": false, 750 | "memoryGiB": 384, 751 | "name": "ml.m5d.24xlarge", 752 | "vcpuNum": 96 753 | }, 754 | { 755 | "_defaultOrder": 20, 756 | "_isFastLaunch": false, 757 | "category": "General purpose", 758 | "gpuNum": 0, 759 | "hideHardwareSpecs": true, 760 | "memoryGiB": 0, 761 | "name": "ml.geospatial.interactive", 762 | "supportedImageNames": [ 763 | "sagemaker-geospatial-v1-0" 764 | ], 765 | "vcpuNum": 0 766 | }, 767 | { 768 | "_defaultOrder": 21, 769 | "_isFastLaunch": true, 770 | "category": "Compute optimized", 771 | "gpuNum": 0, 772 | "hideHardwareSpecs": false, 773 | "memoryGiB": 4, 774 | "name": "ml.c5.large", 775 | "vcpuNum": 2 776 | }, 777 | { 778 | "_defaultOrder": 22, 779 | "_isFastLaunch": false, 780 | "category": "Compute optimized", 781 | "gpuNum": 0, 782 | "hideHardwareSpecs": false, 783 | "memoryGiB": 8, 784 | "name": "ml.c5.xlarge", 785 | "vcpuNum": 4 786 | }, 787 | { 788 | "_defaultOrder": 23, 789 | "_isFastLaunch": false, 790 | "category": "Compute optimized", 791 | "gpuNum": 0, 792 | "hideHardwareSpecs": false, 793 | "memoryGiB": 16, 794 | "name": "ml.c5.2xlarge", 795 | "vcpuNum": 8 796 | }, 797 | { 798 | "_defaultOrder": 24, 799 | "_isFastLaunch": false, 800 | "category": "Compute optimized", 801 | "gpuNum": 0, 802 | "hideHardwareSpecs": false, 803 | "memoryGiB": 32, 804 | "name": "ml.c5.4xlarge", 805 | "vcpuNum": 16 806 | }, 807 | { 808 | "_defaultOrder": 25, 809 | "_isFastLaunch": false, 810 | "category": "Compute optimized", 811 | "gpuNum": 0, 812 | "hideHardwareSpecs": false, 813 | "memoryGiB": 72, 814 | "name": "ml.c5.9xlarge", 815 | "vcpuNum": 36 816 | }, 817 | { 818 | "_defaultOrder": 26, 819 | "_isFastLaunch": false, 820 | "category": "Compute optimized", 821 | "gpuNum": 0, 822 | "hideHardwareSpecs": false, 823 | "memoryGiB": 96, 824 | "name": "ml.c5.12xlarge", 825 | "vcpuNum": 48 826 | }, 827 | { 828 | "_defaultOrder": 27, 829 | "_isFastLaunch": false, 830 | "category": "Compute optimized", 831 | "gpuNum": 0, 832 | "hideHardwareSpecs": false, 833 | "memoryGiB": 144, 834 | "name": "ml.c5.18xlarge", 835 | "vcpuNum": 72 836 | }, 837 | { 838 | "_defaultOrder": 28, 839 | "_isFastLaunch": false, 840 | "category": "Compute optimized", 841 | "gpuNum": 0, 842 | "hideHardwareSpecs": false, 843 | "memoryGiB": 192, 844 | "name": "ml.c5.24xlarge", 845 | "vcpuNum": 96 846 | }, 847 | { 848 | "_defaultOrder": 29, 849 | "_isFastLaunch": true, 850 | "category": "Accelerated computing", 851 | "gpuNum": 1, 852 | "hideHardwareSpecs": false, 853 | "memoryGiB": 16, 854 | "name": "ml.g4dn.xlarge", 855 | "vcpuNum": 4 856 | }, 857 | { 858 | "_defaultOrder": 30, 859 | "_isFastLaunch": false, 860 | "category": "Accelerated computing", 861 | "gpuNum": 
1, 862 | "hideHardwareSpecs": false, 863 | "memoryGiB": 32, 864 | "name": "ml.g4dn.2xlarge", 865 | "vcpuNum": 8 866 | }, 867 | { 868 | "_defaultOrder": 31, 869 | "_isFastLaunch": false, 870 | "category": "Accelerated computing", 871 | "gpuNum": 1, 872 | "hideHardwareSpecs": false, 873 | "memoryGiB": 64, 874 | "name": "ml.g4dn.4xlarge", 875 | "vcpuNum": 16 876 | }, 877 | { 878 | "_defaultOrder": 32, 879 | "_isFastLaunch": false, 880 | "category": "Accelerated computing", 881 | "gpuNum": 1, 882 | "hideHardwareSpecs": false, 883 | "memoryGiB": 128, 884 | "name": "ml.g4dn.8xlarge", 885 | "vcpuNum": 32 886 | }, 887 | { 888 | "_defaultOrder": 33, 889 | "_isFastLaunch": false, 890 | "category": "Accelerated computing", 891 | "gpuNum": 4, 892 | "hideHardwareSpecs": false, 893 | "memoryGiB": 192, 894 | "name": "ml.g4dn.12xlarge", 895 | "vcpuNum": 48 896 | }, 897 | { 898 | "_defaultOrder": 34, 899 | "_isFastLaunch": false, 900 | "category": "Accelerated computing", 901 | "gpuNum": 1, 902 | "hideHardwareSpecs": false, 903 | "memoryGiB": 256, 904 | "name": "ml.g4dn.16xlarge", 905 | "vcpuNum": 64 906 | }, 907 | { 908 | "_defaultOrder": 35, 909 | "_isFastLaunch": false, 910 | "category": "Accelerated computing", 911 | "gpuNum": 1, 912 | "hideHardwareSpecs": false, 913 | "memoryGiB": 61, 914 | "name": "ml.p3.2xlarge", 915 | "vcpuNum": 8 916 | }, 917 | { 918 | "_defaultOrder": 36, 919 | "_isFastLaunch": false, 920 | "category": "Accelerated computing", 921 | "gpuNum": 4, 922 | "hideHardwareSpecs": false, 923 | "memoryGiB": 244, 924 | "name": "ml.p3.8xlarge", 925 | "vcpuNum": 32 926 | }, 927 | { 928 | "_defaultOrder": 37, 929 | "_isFastLaunch": false, 930 | "category": "Accelerated computing", 931 | "gpuNum": 8, 932 | "hideHardwareSpecs": false, 933 | "memoryGiB": 488, 934 | "name": "ml.p3.16xlarge", 935 | "vcpuNum": 64 936 | }, 937 | { 938 | "_defaultOrder": 38, 939 | "_isFastLaunch": false, 940 | "category": "Accelerated computing", 941 | "gpuNum": 8, 942 | "hideHardwareSpecs": false, 943 | "memoryGiB": 768, 944 | "name": "ml.p3dn.24xlarge", 945 | "vcpuNum": 96 946 | }, 947 | { 948 | "_defaultOrder": 39, 949 | "_isFastLaunch": false, 950 | "category": "Memory Optimized", 951 | "gpuNum": 0, 952 | "hideHardwareSpecs": false, 953 | "memoryGiB": 16, 954 | "name": "ml.r5.large", 955 | "vcpuNum": 2 956 | }, 957 | { 958 | "_defaultOrder": 40, 959 | "_isFastLaunch": false, 960 | "category": "Memory Optimized", 961 | "gpuNum": 0, 962 | "hideHardwareSpecs": false, 963 | "memoryGiB": 32, 964 | "name": "ml.r5.xlarge", 965 | "vcpuNum": 4 966 | }, 967 | { 968 | "_defaultOrder": 41, 969 | "_isFastLaunch": false, 970 | "category": "Memory Optimized", 971 | "gpuNum": 0, 972 | "hideHardwareSpecs": false, 973 | "memoryGiB": 64, 974 | "name": "ml.r5.2xlarge", 975 | "vcpuNum": 8 976 | }, 977 | { 978 | "_defaultOrder": 42, 979 | "_isFastLaunch": false, 980 | "category": "Memory Optimized", 981 | "gpuNum": 0, 982 | "hideHardwareSpecs": false, 983 | "memoryGiB": 128, 984 | "name": "ml.r5.4xlarge", 985 | "vcpuNum": 16 986 | }, 987 | { 988 | "_defaultOrder": 43, 989 | "_isFastLaunch": false, 990 | "category": "Memory Optimized", 991 | "gpuNum": 0, 992 | "hideHardwareSpecs": false, 993 | "memoryGiB": 256, 994 | "name": "ml.r5.8xlarge", 995 | "vcpuNum": 32 996 | }, 997 | { 998 | "_defaultOrder": 44, 999 | "_isFastLaunch": false, 1000 | "category": "Memory Optimized", 1001 | "gpuNum": 0, 1002 | "hideHardwareSpecs": false, 1003 | "memoryGiB": 384, 1004 | "name": "ml.r5.12xlarge", 1005 | "vcpuNum": 48 1006 | }, 1007 | { 1008 | 
"_defaultOrder": 45, 1009 | "_isFastLaunch": false, 1010 | "category": "Memory Optimized", 1011 | "gpuNum": 0, 1012 | "hideHardwareSpecs": false, 1013 | "memoryGiB": 512, 1014 | "name": "ml.r5.16xlarge", 1015 | "vcpuNum": 64 1016 | }, 1017 | { 1018 | "_defaultOrder": 46, 1019 | "_isFastLaunch": false, 1020 | "category": "Memory Optimized", 1021 | "gpuNum": 0, 1022 | "hideHardwareSpecs": false, 1023 | "memoryGiB": 768, 1024 | "name": "ml.r5.24xlarge", 1025 | "vcpuNum": 96 1026 | }, 1027 | { 1028 | "_defaultOrder": 47, 1029 | "_isFastLaunch": false, 1030 | "category": "Accelerated computing", 1031 | "gpuNum": 1, 1032 | "hideHardwareSpecs": false, 1033 | "memoryGiB": 16, 1034 | "name": "ml.g5.xlarge", 1035 | "vcpuNum": 4 1036 | }, 1037 | { 1038 | "_defaultOrder": 48, 1039 | "_isFastLaunch": false, 1040 | "category": "Accelerated computing", 1041 | "gpuNum": 1, 1042 | "hideHardwareSpecs": false, 1043 | "memoryGiB": 32, 1044 | "name": "ml.g5.2xlarge", 1045 | "vcpuNum": 8 1046 | }, 1047 | { 1048 | "_defaultOrder": 49, 1049 | "_isFastLaunch": false, 1050 | "category": "Accelerated computing", 1051 | "gpuNum": 1, 1052 | "hideHardwareSpecs": false, 1053 | "memoryGiB": 64, 1054 | "name": "ml.g5.4xlarge", 1055 | "vcpuNum": 16 1056 | }, 1057 | { 1058 | "_defaultOrder": 50, 1059 | "_isFastLaunch": false, 1060 | "category": "Accelerated computing", 1061 | "gpuNum": 1, 1062 | "hideHardwareSpecs": false, 1063 | "memoryGiB": 128, 1064 | "name": "ml.g5.8xlarge", 1065 | "vcpuNum": 32 1066 | }, 1067 | { 1068 | "_defaultOrder": 51, 1069 | "_isFastLaunch": false, 1070 | "category": "Accelerated computing", 1071 | "gpuNum": 1, 1072 | "hideHardwareSpecs": false, 1073 | "memoryGiB": 256, 1074 | "name": "ml.g5.16xlarge", 1075 | "vcpuNum": 64 1076 | }, 1077 | { 1078 | "_defaultOrder": 52, 1079 | "_isFastLaunch": false, 1080 | "category": "Accelerated computing", 1081 | "gpuNum": 4, 1082 | "hideHardwareSpecs": false, 1083 | "memoryGiB": 192, 1084 | "name": "ml.g5.12xlarge", 1085 | "vcpuNum": 48 1086 | }, 1087 | { 1088 | "_defaultOrder": 53, 1089 | "_isFastLaunch": false, 1090 | "category": "Accelerated computing", 1091 | "gpuNum": 4, 1092 | "hideHardwareSpecs": false, 1093 | "memoryGiB": 384, 1094 | "name": "ml.g5.24xlarge", 1095 | "vcpuNum": 96 1096 | }, 1097 | { 1098 | "_defaultOrder": 54, 1099 | "_isFastLaunch": false, 1100 | "category": "Accelerated computing", 1101 | "gpuNum": 8, 1102 | "hideHardwareSpecs": false, 1103 | "memoryGiB": 768, 1104 | "name": "ml.g5.48xlarge", 1105 | "vcpuNum": 192 1106 | }, 1107 | { 1108 | "_defaultOrder": 55, 1109 | "_isFastLaunch": false, 1110 | "category": "Accelerated computing", 1111 | "gpuNum": 8, 1112 | "hideHardwareSpecs": false, 1113 | "memoryGiB": 1152, 1114 | "name": "ml.p4d.24xlarge", 1115 | "vcpuNum": 96 1116 | }, 1117 | { 1118 | "_defaultOrder": 56, 1119 | "_isFastLaunch": false, 1120 | "category": "Accelerated computing", 1121 | "gpuNum": 8, 1122 | "hideHardwareSpecs": false, 1123 | "memoryGiB": 1152, 1124 | "name": "ml.p4de.24xlarge", 1125 | "vcpuNum": 96 1126 | }, 1127 | { 1128 | "_defaultOrder": 57, 1129 | "_isFastLaunch": false, 1130 | "category": "Accelerated computing", 1131 | "gpuNum": 0, 1132 | "hideHardwareSpecs": false, 1133 | "memoryGiB": 32, 1134 | "name": "ml.trn1.2xlarge", 1135 | "vcpuNum": 8 1136 | }, 1137 | { 1138 | "_defaultOrder": 58, 1139 | "_isFastLaunch": false, 1140 | "category": "Accelerated computing", 1141 | "gpuNum": 0, 1142 | "hideHardwareSpecs": false, 1143 | "memoryGiB": 512, 1144 | "name": "ml.trn1.32xlarge", 1145 | "vcpuNum": 128 1146 | 
}, 1147 | { 1148 | "_defaultOrder": 59, 1149 | "_isFastLaunch": false, 1150 | "category": "Accelerated computing", 1151 | "gpuNum": 0, 1152 | "hideHardwareSpecs": false, 1153 | "memoryGiB": 512, 1154 | "name": "ml.trn1n.32xlarge", 1155 | "vcpuNum": 128 1156 | } 1157 | ], 1158 | "instance_type": "ml.g4dn.xlarge", 1159 | "kernelspec": { 1160 | "display_name": "Python 3 (ipykernel)", 1161 | "language": "python", 1162 | "name": "python3" 1163 | }, 1164 | "language_info": { 1165 | "codemirror_mode": { 1166 | "name": "ipython", 1167 | "version": 3 1168 | }, 1169 | "file_extension": ".py", 1170 | "mimetype": "text/x-python", 1171 | "name": "python", 1172 | "nbconvert_exporter": "python", 1173 | "pygments_lexer": "ipython3", 1174 | "version": "3.10.11" 1175 | } 1176 | }, 1177 | "nbformat": 4, 1178 | "nbformat_minor": 5 1179 | } 1180 | --------------------------------------------------------------------------------