├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── NOTICE
├── README.md
├── requirements.txt
├── resources
│   └── screenshot.jpg
├── summariziation_example.ipynb
├── tests
│   ├── requirements.txt
│   └── test_pricing_calculator.py
└── utils
    ├── dashboard_creators
    │   ├── comparative_dashboard_creator.py
    │   ├── dashboard_template.py
    │   ├── data_preview_viewer.py
    │   ├── data_stats_viewer_creator.py
    │   ├── main_html_creator.py
    │   ├── output_viewer_creator.py
    │   └── static
    │       └── styles.css
    ├── metrics
    │   └── bart_score.py
    ├── model_ranker.py
    ├── model_runners
    │   ├── bedrock_counting_runner.py
    │   ├── gpt_model_runner.py
    │   └── pricing_calculator.py
    └── tweetsumm_data_creator.py
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | fmeval
3 | pandas==2.1.4
4 | ipywidgets
5 | jupyterlab
--------------------------------------------------------------------------------
/resources/screenshot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/fm-leaderboarder/HEAD/resources/screenshot.jpg
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/utils/dashboard_creators/data_preview_viewer.py:
--------------------------------------------------------------------------------
1 | import json
5 |
6 | def create_data_preview_view(test_file_path, result_html_folder):
7 |
8 | with open(test_file_path, 'r') as json_file:
9 | json_list = list(json_file)
10 |
11 | # generate headers name
12 | headers = ['Input', 'Ground Truth']
13 |
14 | # generate row data
15 | rows = []
16 | for json_str in json_list:
17 | result = json.loads(json_str)
18 | row = [result['document'], result['summary']]
19 | rows.append(row)
20 |
21 | with open(f"{result_html_folder}/test_samples.html", "w", encoding='utf-8-sig') as file:
22 | from .dashboard_template import generate_dashboard_string
23 | file.write(generate_dashboard_string(title = "", column_names=headers, rows = rows))
24 |
--------------------------------------------------------------------------------
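A minimal driver sketch for create_data_preview_view (not from the repository); the test file is JSON Lines with "document" and "summary" fields, and the file paths below are hypothetical:

```
import json, os
from utils.dashboard_creators.data_preview_viewer import create_data_preview_view

os.makedirs("results_html", exist_ok=True)
# One-record test set in the expected JSONL shape.
with open("test.jsonl", "w") as f:
    f.write(json.dumps({"document": "user: hi\nagent: hello", "summary": "Greeting."}) + "\n")

create_data_preview_view("test.jsonl", "results_html")  # writes results_html/test_samples.html
```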
/utils/dashboard_creators/static/styles.css:
--------------------------------------------------------------------------------
1 | /* Tooltip container */
2 | .tooltip {
3 | position: relative;
4 | display: inline-block;
5 | }
6 |
7 | /* Tooltip text */
8 | .tooltip .tooltiptext {
9 | visibility: hidden;
10 | width: 300px;
11 | background-color: #555;
12 | color: #fff;
13 | text-align: center;
14 | padding: 5px;
15 | border-radius: 6px;
16 | position: absolute;
17 | z-index: 1;
18 | bottom: 150%;
19 | left: 50%;
20 | margin-left: -150px;
21 | opacity: 0;
22 | transition: opacity 0.3s;
23 | }
24 |
25 | /* Tooltip arrow */
26 | .tooltip .tooltiptext::after {
27 | content: "";
28 | position: absolute;
29 | top: 100%;
30 | left: 50%;
31 | margin-left: -5px;
32 | border-width: 5px;
33 | border-style: solid;
34 | border-color: #555 transparent transparent transparent;
35 | }
36 |
37 | /* Show the tooltip text when you mouse over the tooltip container */
38 | .tooltip:hover .tooltiptext {
39 | visibility: visible;
40 | opacity: 1;
41 | }
--------------------------------------------------------------------------------
/utils/model_ranker.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | def create_model_ranking(models_scores:dict):
4 | model_ranking = defaultdict(int)
5 | totals = len(models_scores) * len(list(models_scores.values())[0])
6 | # Iterate over the metrics
7 | metrics = set()
8 | for model_id in models_scores:
9 | metrics.update(models_scores[model_id].keys())
10 |
11 | for metric in metrics:
12 | # Sort the models for the current metric
13 | sorted_models = sorted(
14 | [(model_id, models_scores[model_id][metric]) for model_id in models_scores if metric in models_scores[model_id]],
15 | key=lambda x: x[1],
16 | reverse=False
17 | )
18 |
19 | # Assign points to the models based on their ranking for the current metric
20 | for rank, (model_id, _) in enumerate(sorted_models, start=1):
21 | model_ranking[model_id] += rank / totals
22 |
23 | # Higher accumulated score means the model placed better across metrics
24 | return model_ranking
25 |
--------------------------------------------------------------------------------
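A usage sketch for create_model_ranking (not from the repository), assuming the models_scores shape used elsewhere in this repo, {model_id: {metric: score}}; the model names and scores are made up:

```
from utils.model_ranker import create_model_ranking

# Hypothetical per-metric scores for two models (higher is better).
models_scores = {
    "model-a": {"meteor": 0.42, "rouge": 0.35},
    "model-b": {"meteor": 0.38, "rouge": 0.31},
}

# Each model accumulates its per-metric rank normalized by the number of
# (model, metric) pairs, so a higher total means a better overall placement.
ranking = create_model_ranking(models_scores)
for model_id, score in sorted(ranking.items(), key=lambda kv: kv[1], reverse=True):
    print(model_id, round(score, 3))
```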
/utils/dashboard_creators/data_stats_viewer_creator.py:
--------------------------------------------------------------------------------
1 | import json
2 | def create_data_stats_view(test_file_path, result_img_folder):
3 |
4 |
5 | def get_doc_lengths(data):
6 | document_sentence_length, summary_sentence_length = [], []
7 |
8 | for d in data:
9 | # TODO switch to token counting instead of characters
10 | document_sentence_length.append(len(d['document'].replace("\n"," ").split()))
11 | summary_sentence_length.append(len(d['summary'].replace("\n"," ").split()))
12 |
13 | return document_sentence_length, summary_sentence_length
14 |
15 |
16 | def read_data(filename):
17 | data = []
18 | with open(filename, "r") as file:
19 | for line in file:
20 | data.append(json.loads(line))
21 | return data
22 |
23 | test_data = read_data(f"{test_file_path}")
24 | test_doc_lengths, test_sum_lengths = get_doc_lengths(test_data)
25 |
26 | import matplotlib.pyplot as plt
27 | import numpy as np
28 |
29 | fig, axs = plt.subplots(2)
30 | axs[0].hist(test_doc_lengths, density=False, bins=50)
31 | axs[0].set_title('Test document lengths')
32 | axs[1].hist(test_sum_lengths, density=False, bins=50)
33 | axs[1].set_title('Test summary lengths')
34 |
35 | for ax in axs.flat:
36 | ax.set(xlabel='Num. of words', ylabel='Count')
37 |
38 | for ax in axs.flat:
39 | ax.label_outer()
40 |
41 | plt.savefig(f"{result_img_folder}/dataset_stats.png")
--------------------------------------------------------------------------------
/utils/dashboard_creators/output_viewer_creator.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 | from os import listdir
4 | from os.path import isfile, join
5 |
6 | def create_response_output_view(result_html_folder, tmp_json_files, models_scores):
7 | models_run = list(models_scores.keys())
8 | if len(models_run) > 0:
9 | metrics_used = list(models_scores[models_run[0]].keys())
10 |
11 | for model_id, scores in models_scores.items():
12 | title = f'Model [{model_id}] - testset results'
13 | # generate headers name
14 | headers = ['Model input', 'Model output', 'Target Output']
15 | for mu in metrics_used:
16 | headers.append(mu)
17 |
18 | model_json_filename = f"{tmp_json_files}/{model_id}_metrics.jsonl"
19 | data = []
20 | with open(model_json_filename, "r") as file:
21 | for line in file:
22 | data.append(json.loads(line))
23 |
24 | df = pd.DataFrame(data)
25 | for idx, mu in enumerate(metrics_used):
26 | df[mu] = df['scores'].apply(lambda x: x[idx]['value'])
27 |
28 | # generate row data
29 | rows = []
30 | for index, item in df.iterrows():
31 | row = [item['prompt'], item['model_output'], item['target_output']]
32 | for mu in metrics_used:
33 | row.append(item[mu])
34 | rows.append(row)
35 |
36 | with open(f"{result_html_folder}/{model_id}_results.html", "w", encoding='utf-8-sig') as file:
37 | from .dashboard_template import generate_dashboard_string
38 | file.write(generate_dashboard_string(title = title, column_names = headers, rows = rows))
--------------------------------------------------------------------------------
/utils/dashboard_creators/comparative_dashboard_creator.py:
--------------------------------------------------------------------------------
1 | import json
2 | from os import listdir
3 | from os.path import isfile, join
4 |
5 |
6 | def create_comparive_dashboard(result_html_folder, tmp_json_files):
7 | model_outputs = dict()
8 | test_samples = []
9 |
10 | for result_file in listdir(tmp_json_files):
11 | if not result_file.endswith("_metrics.jsonl"):
12 | continue
13 |
14 | model = result_file.replace("_metrics.jsonl", "")
15 |
16 | model_outputs[model] = dict()
17 |
18 | data = []
19 | filename = join(tmp_json_files, result_file)
20 | with open(filename, "r") as file:
21 | for line in file:
22 | data.append(json.loads(line))
23 |
24 | if len(test_samples) == 0:
25 | for d in data:
26 | test_samples.append((d['model_input'], d['target_output']))
27 |
28 | for d in data:
29 | model_outputs[model][d['target_output']] = d['model_output']
30 |
31 | models = list(model_outputs.keys())
32 |
33 | # generate headers name
34 | headers = ['Model input', 'Target output']
35 | for m_name in models:
36 | headers.append(m_name)
37 |
38 | # generate row data
39 | rows = []
40 | for samples in test_samples:
41 | row = [samples[0], samples[1]]
42 | for m_name in models:
43 | row.append(f'[{model_outputs[m_name][samples[1]]}] Output')
44 | rows.append(row)
45 |
46 | with open(f"{result_html_folder}/output_comparison.html", "w", encoding='utf-8-sig') as file:
47 | from .dashboard_template import generate_dashboard_string
48 | file.write(generate_dashboard_string(title = 'cross-model comparison', column_names = headers, rows = rows))
--------------------------------------------------------------------------------
/utils/tweetsumm_data_creator.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | import re
3 | import json
4 |
5 |
6 | def clean_text(text):
7 | text = text.encode("ascii", "ignore").decode()
8 | text = re.sub(r"http\S+", "", text)
9 | text = re.sub(r"@[^\s]+", "", text)
10 | text = re.sub(r"\s+", " ", text)
11 | return re.sub(r"\^[^ ]+", "", text)
12 |
13 |
14 | def create_conversation_text(data_point):
15 | text = ""
16 | for item in data_point["log"]:
17 | user = clean_text(item["user utterance"])
18 | text += f"user: {user.strip()}\n"
19 |
20 | agent = clean_text(item["system response"])
21 | text += f"agent: {agent.strip()}\n"
22 |
23 | return text
24 |
25 |
26 | def generate_text(data_point):
27 | summaries = json.loads(data_point["original dialog info"])["summaries"][
28 | "abstractive_summaries"
29 | ]
30 | summary = summaries[0]
31 | summary = " ".join(summary)
32 |
33 | conversation_text = create_conversation_text(data_point)
34 | return {
35 | "document": conversation_text,
36 | "summary": summary,
37 | "id": data_point['original dialog id']
38 | }
39 |
40 |
41 | def create_train_test_files(folder):
42 | dataset = load_dataset("Salesforce/dialogstudio", "TweetSumm")
43 | tables = ["test", "validation"]
44 | modified_db = dict()
45 |
46 | for table in tables:
47 | for i in range(len(dataset[table])):
48 |
49 | example = generate_text(dataset[table][i])
50 | if table not in modified_db:
51 | modified_db[table] = []
52 | modified_db[table].append(example)
53 |
54 | modified_db['test'].extend(modified_db['validation'])
55 | del modified_db['validation']
56 |
57 | print(f"Test set size: {len(modified_db['test'])}")
58 |
59 | for k, v in modified_db.items():
60 | with open(f"{folder}/{k}_tweetsumm_modified.jsonl", 'w') as f:
61 | for item in v:
62 | f.write(json.dumps(item) + "\n")
63 |
64 | modified_db = []
65 |
66 | for i in range(len(dataset["train"])):
67 |
68 | example = generate_text(dataset["train"][i])
69 | modified_db.append({"prompt": f"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent:\n {example['document']}", "completion": f"{example['summary']}"})
70 |
71 | with open(f"{folder}/bedrock_train_tweetsumm.jsonl", 'w') as f:
72 | for sample in modified_db:
73 | f.write(json.dumps(sample) + "\n")
74 |
75 | print(f"Train set size: {len(modified_db)}")
76 | print(f"Created training set file that can be used for Bedrock finetuning under the folder: {folder}/bedrock_train_tweetsumm.jsonl")
77 |
78 | test_file_path = f"{folder}/test_tweetsumm_modified.jsonl"
79 | return test_file_path
--------------------------------------------------------------------------------
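A sketch of invoking the data creator (not from the repository). Note that create_train_test_files downloads Salesforce/dialogstudio from Hugging Face, so an access token must be configured first (see the README); the output folder is hypothetical:

```
import os
from utils.tweetsumm_data_creator import create_train_test_files

os.makedirs("data", exist_ok=True)
# Writes data/test_tweetsumm_modified.jsonl and data/bedrock_train_tweetsumm.jsonl
test_file_path = create_train_test_files("data")
print(test_file_path)
```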
/utils/model_runners/gpt_model_runner.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | from datetime import datetime, timezone
4 | import fcntl
5 | import os
6 | from dataclasses import dataclass
7 | from typing import Tuple, Optional
8 |
9 | from fmeval.model_runners.model_runner import ModelRunner
10 |
11 | @dataclass
12 | class GPTModelConfig:
13 | temperature: float
14 | top_p: float
15 | max_tokens: int
16 | api_key: str
17 | model_id: str
18 |
19 |
20 | class GPTModelRunner(ModelRunner):
21 | url = "https://api.openai.com/v1/chat/completions"
22 |
23 | def __init__(self, model_config: GPTModelConfig, metrics_folder: str = None, model_key:str = None):
24 | self.config = model_config
25 | self._metrics_folder = metrics_folder
26 | self._model_key = model_key
27 |
28 | def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
29 | print(prompt)
30 | payload = json.dumps({
31 | "model": self.config.model_id,
32 | "messages": [
33 | {
34 | "role": "user",
35 | "content": prompt
36 | }
37 | ],
38 | "temperature": self.config.temperature,
39 | "top_p": self.config.top_p,
40 | "n": 1,
41 | "stream": False,
42 | "presence_penalty": 0,
43 | "frequency_penalty": 0
44 | })
45 |
46 | headers = {
47 | "Content-Type": "application/json",
48 | "Accept": "application/json",
49 | "Authorization": "Bearer " + self.config.api_key
50 | }
51 | start_time = datetime.now(timezone.utc)
52 |
53 | response = requests.request("POST", self.url, headers=headers, data=payload)
54 | delta = datetime.now(timezone.utc) - start_time
55 | processing_time = delta.total_seconds()
56 |
57 |
58 | response = json.loads(response.text)
59 | output = response["choices"][0]["message"]["content"]
60 | input_token_count = int(response["usage"]["prompt_tokens"])
61 | output_token_count = int(response["usage"]["completion_tokens"])
62 |
63 | sw = json.dumps({"input_tokens":input_token_count,"output_tokens":output_token_count, "processing_time":processing_time, "model_id":self.config.model_id})
64 | fp = open(self._metrics_folder + f"/{self._model_key}_usage.jsonl", 'a')
65 | fcntl.flock(fp.fileno(), fcntl.LOCK_EX)
66 | fp.seek(0, 2)
67 | fp.write(sw + "\n")
68 | fcntl.flock(fp.fileno(), fcntl.LOCK_UN)
69 | fp.close()
70 |
71 | return output, None
72 |
73 | def __reduce__(self):
74 | """
75 | Custom serializer method used by Ray when it serializes instances of this
76 | class in eval_algorithms.util.generate_model_predict_response_for_dataset.
77 | """
78 | serialized_data = (
79 | self.config,
80 | self._metrics_folder,
81 | self._model_key
82 | )
83 | return self.__class__, serialized_data
84 |
--------------------------------------------------------------------------------
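A usage sketch for GPTModelRunner (not from the repository), assuming a valid OpenAI API key; the model id and generation parameters are illustrative. Note that max_tokens is kept in the config but is not sent in the request payload above:

```
from utils.model_runners.gpt_model_runner import GPTModelConfig, GPTModelRunner

config = GPTModelConfig(
    temperature=0.0,
    top_p=1.0,
    max_tokens=256,                 # illustrative; see note above
    api_key="YOUR_OPENAI_API_KEY",  # placeholder
    model_id="gpt-4o-mini",         # illustrative model id
)
# metrics_folder must already exist; predict() appends one usage line per call.
runner = GPTModelRunner(config, metrics_folder="tmp_metrics", model_key="gpt-4o-mini")
output, _ = runner.predict("Summarize: the user asked for a refund and the agent approved it.")
print(output)
```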
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/utils/dashboard_creators/main_html_creator.py:
--------------------------------------------------------------------------------
1 | import json
2 | from os import listdir
3 | from os.path import isfile, join
4 |
5 | from utils.model_runners.pricing_calculator import PricingCalculator
6 |
7 |
8 | def create_main_html(result_folder, models_scores, model_usage):
9 | model_outputs = dict()
10 | test_samples = []
11 |
12 | title = 'Summarization Evaluation'
13 |
14 | # The original HTML markup of this block was stripped in this listing;
15 | # only its visible text is preserved below.
16 | pre_table_html = """
17 | Leaderboard
18 | You can sort by columns and search by a keyword to filter
19 | Legend: 1st Best Result / 2nd Best Result / 3rd Best Result
20 | """
39 |
40 | # generate headers name
41 | headers = ['Model']
42 | models_run = list(models_scores.keys())
43 | if len(models_run) > 0:
44 | metrics_used = list(models_scores[models_run[0]].keys())
45 | for mu in metrics_used:
46 | headers.append(f'Metric: {mu}')
47 | headers.append('Testing Costs ($)')
48 | headers.append('Avg Latency (s)')
49 | headers.append('cost/1MT In ($)')
50 | headers.append('cost/1MT Out ($)')
51 |
52 | # generate row data
53 | rows = []
54 | for model_id, scores in models_scores.items():
55 | row = [f'{model_id}']
56 | for mu in metrics_used:
57 | row.append("{:.4f}".format(scores[mu]))
58 | if model_id in model_usage and model_usage[model_id] is not None and model_usage[model_id]['cost_model'] == PricingCalculator.COST_PER_TOKEN:
59 | row.append("{:.2f}".format(model_usage[model_id]['cost']))
60 | row.append("{:.2f}".format(model_usage[model_id]['avg_processing_time']))
61 | row.append("{:.4f}".format(model_usage[model_id]['cost_input_1M']))
62 | row.append("{:.4f}".format(model_usage[model_id]['cost_output_1M']))
63 | else:
64 | row.append('-')
65 | row.append('-')
66 | row.append('-')
67 | row.append('-')
68 | rows.append(row)
69 |
70 | index_filename = f"{result_folder}/index.html"
71 |
72 | with open(index_filename, "w", encoding='utf-8-sig') as file:
73 | from .dashboard_template import generate_dashboard_string
74 | file.write(generate_dashboard_string(title = title, pre_table_html = pre_table_html, column_names = headers, rows = rows))
75 |
76 | # CSS
77 | # copy CSS file from ./static/styles.css to the result folder
78 | # get current python file folder
79 | import os
80 | import shutil
81 | shutil.copyfile(f'{os.path.dirname(os.path.abspath(__file__))}/static/styles.css', f'{result_folder}/styles.css')
82 |
83 | return index_filename
84 |
--------------------------------------------------------------------------------
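A sketch of the data shapes create_main_html expects (not from the repository; all values are hypothetical). Per-token-priced entries in model_usage need cost_model, cost, avg_processing_time, cost_input_1M, and cost_output_1M; anything else renders as '-':

```
from utils.dashboard_creators.main_html_creator import create_main_html
from utils.model_runners.pricing_calculator import PricingCalculator

models_scores = {"model-a": {"meteor": 0.42, "rouge": 0.35}}
model_usage = {
    "model-a": {
        "cost_model": PricingCalculator.COST_PER_TOKEN,
        "cost": 0.12,                # total testing cost ($)
        "avg_processing_time": 1.8,  # seconds
        "cost_input_1M": 3.0,        # $ per 1M input tokens
        "cost_output_1M": 15.0,      # $ per 1M output tokens
    }
}
# The result folder must already exist; returns the path to index.html.
index_filename = create_main_html("results_html", models_scores, model_usage)
print(index_filename)
```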
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | .vscode/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 | .ipynb_checkpoints/*
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/#use-with-ide
112 | .pdm.toml
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | .venv2
128 | env/
129 | venv/
130 | ENV/
131 | env.bak/
132 | venv.bak/
133 |
134 | # Spyder project settings
135 | .spyderproject
136 | .spyproject
137 |
138 | # Rope project settings
139 | .ropeproject
140 |
141 | # mkdocs documentation
142 | /site
143 |
144 | # mypy
145 | .mypy_cache/
146 | .dmypy.json
147 | dmypy.json
148 |
149 | # Pyre type checker
150 | .pyre/
151 |
152 | # pytype static type analyzer
153 | .pytype/
154 |
155 | # Cython debug symbols
156 | cython_debug/
157 |
158 | # PyCharm
159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161 | # and can be added to the global gitignore or merged into this file. For a more nuclear
162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163 | #.idea/
164 | utils/.DS_Store
165 | utils/key.py
166 |
--------------------------------------------------------------------------------
/utils/model_runners/bedrock_counting_runner.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple
2 | from fmeval.constants import MIME_TYPE_JSON
3 | from fmeval.model_runners.bedrock_model_runner import BedrockModelRunner
4 | import json
5 | from datetime import datetime, timezone
6 | import fcntl
7 | import os
8 |
9 | class CountingBedrockModelRunner(BedrockModelRunner):
10 | '''Decorates the base BedrockModelRunner to emit invocation metrics,
11 | reporting accurate model token consumption using the Bedrock
12 | response metadata available in the response headers.'''
13 |
14 | def __init__(self, model_id: str, content_template: str, output: str | None = None, log_probability: str | None = None, content_type: str = MIME_TYPE_JSON, accept_type: str = MIME_TYPE_JSON, metrics_folder: str = None, model_key:str = None):
15 | """
16 | :param model_id: Id of the Bedrock model to be used for model predictions
17 | :param content_template: String template to compose the model input from the prompt
18 | :param output: JMESPath expression of output in the model output
19 | :param log_probability: JMESPath expression of log probability in the model output
20 | :param content_type: The content type of the request sent to the model for inference
21 | :param accept_type: The accept type of the request sent to the model for inference
22 | :param metrics_folder: The destination of the invocation metric file
23 | :param model_key: The base name of the file to disambiguate metrics
24 | """
25 | super().__init__(model_id = model_id, content_template = content_template, output = output, log_probability=log_probability, content_type = content_type, accept_type = accept_type)
26 | self._metrics_folder = metrics_folder
27 | self._model_key = model_key
28 |
29 |
30 |
31 | def predict(self, prompt: str) -> Tuple[Optional[str], Optional[float]]:
32 | """
33 | Invoke the Bedrock model and parse the model response.
34 | :param prompt: Input data for which you want the model to provide inference.
35 | """
36 |
37 | composed_data = self._composer.compose(prompt)
38 | body = json.dumps(composed_data)
39 | start_time = datetime.now(timezone.utc)
40 |
41 | response = self._bedrock_runtime_client.invoke_model(
42 | body=body, modelId=self._model_id, accept=self._accept_type, contentType=self._content_type
43 | )
44 | delta = datetime.now(timezone.utc) - start_time
45 | processing_time = delta.total_seconds()
46 | model_output = json.loads(response.get("body").read())
47 |
48 | input_token_count = int(response["ResponseMetadata"]["HTTPHeaders"][
49 | "x-amzn-bedrock-input-token-count"
50 | ])
51 |
52 | output_token_count = int(response["ResponseMetadata"]["HTTPHeaders"][
53 | "x-amzn-bedrock-output-token-count"
54 | ])
55 |
56 | output = (
57 | self._extractor.extract_output(data=model_output, num_records=1)
58 | if self._extractor.output_jmespath_expression
59 | else None
60 | )
61 | log_probability = (
62 | self._extractor.extract_log_probability(data=model_output, num_records=1)
63 | if self._extractor.log_probability_jmespath_expression
64 | else None
65 | )
66 |
67 | sw = json.dumps({"input_tokens":input_token_count,"output_tokens":output_token_count, "processing_time":processing_time,"model_id":self._model_id})
68 | fp = open(self._metrics_folder + f"/{self._model_key}_usage.jsonl", 'a')
69 | fcntl.flock(fp.fileno(), fcntl.LOCK_EX)
70 | fp.seek(0, 2)
71 | fp.write(sw + "\n")
72 | fcntl.flock(fp.fileno(), fcntl.LOCK_UN)
73 | fp.close()
74 |
75 | return output, log_probability
76 |
77 | def __reduce__(self):
78 | """
79 | Custom serializer method used by Ray when it serializes instances of this
80 | class in eval_algorithms.util.generate_model_predict_response_for_dataset.
81 | """
82 | serialized_data = (
83 | self._model_id,
84 | self._content_template,
85 | self._output,
86 | self._log_probability,
87 | self._content_type,
88 | self._accept_type,
89 | self._metrics_folder,
90 | self._model_key
91 | )
92 | return self.__class__, serialized_data
93 |
--------------------------------------------------------------------------------
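A usage sketch for CountingBedrockModelRunner (not from the repository). The content template and JMESPath output expression are model-specific; the Amazon Titan Text shapes below are assumptions to adapt per model:

```
from utils.model_runners.bedrock_counting_runner import CountingBedrockModelRunner

runner = CountingBedrockModelRunner(
    model_id="amazon.titan-text-lite-v1",
    # Assumed Titan Text request/response shapes; $prompt is fmeval's placeholder.
    content_template='{"inputText": $prompt}',
    output="results[0].outputText",
    metrics_folder="tmp_metrics",   # must exist; one usage line appended per call
    model_key="titan-lite",
)
output, _ = runner.predict("Summarize: the user asked for a refund.")
print(output)
```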
/README.md:
--------------------------------------------------------------------------------
1 | **⚠️ MAINTENANCE NOTICE ⚠️**
2 | **This project is no longer actively maintained.**
3 | While the repository remains available for educational purposes, we recommend exploring more current alternatives for production use:
4 | - [RAGAS](https://github.com/explodinggradients/ragas) - A comprehensive framework for RAG evaluation
5 | - [Amazon Bedrock](https://aws.amazon.com/bedrock/) - A fully managed service for foundation models from Amazon
6 | For a practical example of using these alternatives, check out our [evaluation notebook using RAGAS and Bedrock](https://github.com/gilinachum/ragas-evaluation-and-bedrock-guardrails/blob/main/evaluate_prod_readiness.ipynb).
7 |
8 | Additionally, for latency benchmarking, check the code samples for [Latency Benchmarking tools for Amazon Bedrock](https://github.com/gilinachum/bedrock-latency/blob/main/README.md).
9 |
10 | ----
11 |
12 |
13 | # FM-Leaderboard-er
14 |
15 | Create your own private LLM leaderboard! 📊
16 |
21 | ## Introduction
22 | There's no one-size-fits-all leaderboard. `FM-Leaderboard-er` will allow you to find the best LLM for your own business use case based on your own tasks, prompts, and data.
23 |
24 | ## Features
25 | 1. *Tasks* - Example notebooks for common tasks like Summarization, Classification, and RAG (coming soon).
26 | 2. *Models* - Amazon Bedrock, OpenAI, or any API (with a code integration).
27 | 3. *Metrics* - Built-in metrics per task, plus custom metrics (via a code integration).
28 | 4. *Latency* - Latency metric per model.
29 | 5. *Cost* - Cost comparison across models.
30 | 6. *Prompts* - Compare several prompts across a single model.
31 |
32 | ## Getting Started
33 | ### Prerequisites
34 | 1. AWS account with Amazon Bedrock access to selected models.
35 | 2. A Hugging Face access token.
36 | The code downloads the dataset from Hugging Face (```https://huggingface.co/api/datasets/Salesforce/dialogstudio```), which requires an access token. If you don't have one yet, follow these steps:
37 |
38 | * Sign up to Hugging Face: ```https://huggingface.co```
39 | * Generate an access token (save it for later use): ```https://huggingface.co/settings/tokens```
40 | * Store the access token locally by installing the huggingface_hub Python library and executing from a shell:
41 | ```
42 | > pip install huggingface_hub
43 | > python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('YOUR_HUGGINGFACE_TOKEN')"
44 | ```
45 |
46 |
47 | ***(Verify you now have: ```~/.cache/huggingface```)***
48 |
49 | ### Installation
50 | 1. Clone the repository:
51 | ```
52 | git clone https://github.com/aws-samples/fm-leaderboarder.git
53 | ```
54 | ### Usage
55 |
56 | To get started, open the [example-1 notebook](./summariziation_example.ipynb) and follow the instructions provided.
57 |
58 | ### Architecture
59 | Coming soon.
60 |
61 | ## Dependency on third party libraries and services
62 | This code can interact with the OpenAI service which has [terms published here](https://openai.com/policies/terms-of-use) and [pricing described here](https://openai.com/pricing). You should be familiar with the pricing and confirm that your use case complies with the terms before proceeding.
63 |
64 | This repository makes use of [aws/fmeval Foundation Model Evaluations Library](https://github.com/aws/fmeval). Please review any license terms applicable to the dataset with your legal team and confirm that your use case complies with the terms before proceeding.
65 |
66 | ## Security
67 |
68 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
69 |
70 | ## Contributing
71 |
72 | Contributions to FM-Leaderboarder are welcome! Please refer to the [CONTRIBUTING.md](CONTRIBUTING.md) file for guidelines on how to contribute.
73 |
74 | ## Contributors
75 |
76 | [//]: contributor-faces
77 |
86 | ## License
87 |
88 | This project is licensed under the Apache-2.0 License.
89 |
90 |
--------------------------------------------------------------------------------
/tests/test_pricing_calculator.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 |
4 | from utils.model_runners.pricing_calculator import PricingCalculator
5 |
6 |
7 | def test_instance_pricing():
8 | instance_type = "g5.12xlarge"
9 | instance_price = PricingCalculator._instance_pricing(instance_type)
10 | assert isinstance(instance_price, float)
11 | # Add more assertions to verify the returned instance price
12 |
13 | def test_retrieve_cost_structure():
14 | model_id = "anthropic.claude-v2:1"
15 | cost_structure = PricingCalculator.retrieve_cost_structure(model_id)
16 | assert cost_structure is not None
17 | assert "model_id" in cost_structure
18 | assert "input_cost_per_1000_tokens" in cost_structure
19 | assert "output_cost_per_1000_tokens" in cost_structure
20 |
21 | # Test with a model_id that doesn't have a cost structure
22 | invalid_model_id = "invalid_model_id"
23 | cost_structure = PricingCalculator.retrieve_cost_structure(invalid_model_id)
24 | assert cost_structure is None
25 |
26 | def test_retrieve_cost_structure_variants():
27 | cost_structure = PricingCalculator.retrieve_cost_structure('anthropic.claude-instant-v1:2:100k')
28 | assert cost_structure is not None
29 | assert "model_name" in cost_structure
30 | assert "input_cost_per_1000_tokens" in cost_structure
31 | assert "output_cost_per_1000_tokens" in cost_structure
32 | assert cost_structure['input_cost_per_1000_tokens'] == 0.008
33 | cost_structure = PricingCalculator.retrieve_cost_structure('anthropic.claude-instant-v1:2')
34 | assert cost_structure is not None
35 | assert "model_name" in cost_structure
36 | assert "input_cost_per_1000_tokens" in cost_structure
37 | assert "output_cost_per_1000_tokens" in cost_structure
38 | assert cost_structure['input_cost_per_1000_tokens'] == 0.008
39 | cost_structure = PricingCalculator.retrieve_cost_structure('anthropic.claude-instant-v1')
40 | assert cost_structure is not None
41 | assert "model_name" in cost_structure
42 | assert "input_cost_per_1000_tokens" in cost_structure
43 | assert "output_cost_per_1000_tokens" in cost_structure
44 | assert cost_structure['input_cost_per_1000_tokens'] == 0.008
45 |
46 |
47 |
48 | def test_read_model_score_aggregate(tmpdir):
49 | folder = str(tmpdir)
50 | PricingCalculator.cleanup_previous_runs(folder)
51 | model_name = "anthropic.claude-v2"
52 | usage_file = f"{folder}/{model_name}_usage.jsonl"
53 |
54 | # Create a temporary usage file with sample data
55 | with open(usage_file, "w") as f:
56 | f.write('{"model_id": "anthropic.claude-v2", "input_tokens": 10, "output_tokens": 20, "processing_time": 1.5}\n')
57 | f.write('{"model_id": "anthropic.claude-v2", "input_tokens": 15, "output_tokens": 25, "processing_time": 2.0}\n')
58 |
59 | result = PricingCalculator.read_model_score_aggregate(model_name, folder)
60 | assert result is not None
61 | assert result["input_tokens"] == 25
62 | assert result["output_tokens"] == 45
63 | assert result["processing_time"] == 3.5
64 | assert result["cost"] > 0
65 |
66 | def test_read_model_score_aggregate_from_api(tmpdir):
67 | folder = str(tmpdir)
68 | PricingCalculator.cleanup_previous_runs(folder)
69 | model_name = "amazon.titan-text-lite-v1"
70 | usage_file = f"{folder}/{model_name}_usage.jsonl"
71 |
72 | # Create a temporary usage file with sample data
73 | with open(usage_file, "w") as f:
74 | f.write('{"model_id": "amazon.titan-text-lite-v1", "input_tokens": 10, "output_tokens": 20, "processing_time": 1.5}\n')
75 | f.write('{"model_id": "amazon.titan-text-lite-v1", "input_tokens": 15, "output_tokens": 25, "processing_time": 2.0}\n')
76 |
77 | result = PricingCalculator.read_model_score_aggregate(model_name, folder)
78 | assert result is not None
79 | assert result["input_tokens"] == 25
80 | assert result["output_tokens"] == 45
81 | assert result["processing_time"] == 3.5
82 | assert result["cost"] > 0
83 |
84 |
85 | def test_read_timed_score_aggregate(tmpdir):
86 | folder = str(tmpdir)
87 | PricingCalculator.cleanup_previous_runs(folder)
88 | model_name = "self_hosted_test"
89 | usage_file = f"{folder}/{model_name}_usage.jsonl"
90 |
91 | # Create a temporary usage file with sample data
92 | with open(usage_file, "w") as f:
93 | f.write('{"model_id": "self_hosted_test", "input_tokens": 10, "output_tokens": 20, "instance_type":"g5.12xlarge", "processing_time": 1.5}\n')
94 | f.write('{"model_id": "self_hosted_test", "input_tokens": 15, "output_tokens": 25, "instance_type":"g5.12xlarge", "processing_time": 2.0}\n')
95 |
96 | result = PricingCalculator.read_model_score_aggregate(model_name, folder)
97 | assert result is not None
98 | assert result["input_tokens"] == 25
99 | assert result["output_tokens"] == 45
100 | assert result["processing_time"] == 3.5
101 | assert result["cost"] > 0
102 |
103 | def test_cleanup_previous_runs(tmpdir):
104 | folder = str(tmpdir)
105 | open(f"{folder}/test_model_usage.jsonl", "w").close()
106 |
107 | PricingCalculator.cleanup_previous_runs(folder)
108 | assert not any(fname.endswith("_usage.jsonl") for fname in os.listdir(folder))
--------------------------------------------------------------------------------
/utils/dashboard_creators/dashboard_template.py:
--------------------------------------------------------------------------------
1 | import html
2 |
3 | def get_optional_tooltip_html(name : str):
4 | tips_by_metric = {
5 | "win rate" : "How many models this model outpefrom on average per each metric",
6 | "meteor" : "METEOR is a metric for text similarity between the machine-produced summary and human-produced reference summaries.",
7 | "rouge" : "The ROUGE metric measures text similarity by computing overlapping n-grams between a machine-generated text and one or more reference human-written texts.",
8 | "bertscore" : "The BERTScore is a text similarity metric that leverages BERT's contextual embeddings to compute token similarities between the candidate and reference texts.",
9 | "bartscore" : "",
10 | }
11 | if name.lower().startswith("metric:") or name == 'Win Rate':
12 | if name.lower().startswith("metric:"):
13 | metric_name = name.lower().split(' ')[-1]
14 | else:
15 | metric_name = name.lower()
16 | if metric_name in tips_by_metric:
17 | tip = tips_by_metric[metric_name]
18 | # The tooltip's HTML markup was stripped in this listing; it wraps the tip
19 | # text in the .tooltip / .tooltiptext markup styled by static/styles.css.
20 | tooltip_html = '''
21 | {}
22 | '''.format(tip)
24 | return tooltip_html
25 | return ""
26 |
27 |
28 | def generate_dashboard_string(title = 'page title', pre_table_html = "", column_names = [], rows = []):
29 | columns_html = ""
30 | for name in column_names:
31 | tooltip_html = get_optional_tooltip_html(str(name))
32 | columns_html += f"<th>{html.escape(str(name))}{tooltip_html}</th>\n"
33 |
34 | table_data_html = ""
35 | for row in rows:
36 | # Row markup (<tr>/<td>) was stripped in this listing; reconstructed here,
37 | # escaping each cell unless it already contains pre-built HTML.
38 | table_data_html += "<tr>\n"
39 | for item in row:
40 | str_item = str(item) # in case of a number
41 | escaped_item = str_item if str_item.strip().startswith("<") else html.escape(str_item)
42 | table_data_html += f"<td>{escaped_item}</td>\n"
43 | table_data_html += "</tr>\n"
44 |
45 |
46 | args = {'title' : html.escape(str(title)), 'pre_table_html' : pre_table_html, 'columns_html' : columns_html, 'table_data_html' :table_data_html}
47 |
48 | # The page template's HTML was stripped in this listing (head, <style>,
49 | # table-sort/search <script>, and table scaffolding); only the format
50 | # placeholders survive below.
51 | return '''
52 | {title}
53 | {pre_table_html}
54 | {columns_html}
55 | {table_data_html}
56 | '''.format(**args)
108 |
109 | # testcases
110 | def test_generate_dashboard_string():
111 | print(generate_dashboard_string(title = 'mytitle', column_names = ["a", "b"], rows = [[1, 2], [3, 4]]))
112 | print(generate_dashboard_string(title = 'mytitle', pre_table_html= "<h1>1</h1>", column_names = ["a", "b"], rows = [[1, 2], [3, 4]]))
113 | print(generate_dashboard_string(column_names = ["a", "b"], rows = [['link', 2], [3, 4]]))
114 | print(generate_dashboard_string(column_names = ["Metric: meteor", "b"], rows = [['link', 2], [3, 4]]))
115 |
116 | #test_generate_dashboard_string()
--------------------------------------------------------------------------------
/utils/metrics/bart_score.py:
--------------------------------------------------------------------------------
1 | """
2 | This code calculates BARTscore.
3 | The metric was introduced in NeurIPS 2021. Paper: https://arxiv.org/pdf/2106.11520.pdf
4 | The code was adjusted from the official code in: https://github.com/neulab/BARTScore (Apache 2.0 license: https://github.com/neulab/BARTScore/blob/main/LICENSE)
5 | """
6 | from transformers import BartTokenizer, BartForConditionalGeneration
7 | import torch
8 | import torch.nn as nn
9 | import traceback
10 | from typing import List
11 | import numpy as np
12 | import json
13 | from os import listdir
14 | from os.path import isfile, join
15 |
16 |
17 | class BARTScorer:
18 | def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
19 | # Set up model
20 | self.device = device
21 | self.max_length = max_length
22 | self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
23 | self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
24 | self.model.eval()
25 | self.model.to(device)
26 |
27 | # Set up loss
28 | self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
29 | self.lsm = nn.LogSoftmax(dim=1)
30 |
31 | def load(self, path=None):
32 | """ Load model from paraphrase finetuning """
33 | if path is None:
34 | path = 'models/bart.pth'
35 | self.model.load_state_dict(torch.load(path, map_location=self.device))
36 |
37 | def score(self, srcs, tgts, batch_size=4):
38 | """ Score a batch of examples """
39 | score_list = []
40 | for i in range(0, len(srcs), batch_size):
41 | src_list = srcs[i: i + batch_size]
42 | tgt_list = tgts[i: i + batch_size]
43 | try:
44 | with torch.no_grad():
45 | encoded_src = self.tokenizer(
46 | src_list,
47 | max_length=self.max_length,
48 | truncation=True,
49 | padding=True,
50 | return_tensors='pt'
51 | )
52 | encoded_tgt = self.tokenizer(
53 | tgt_list,
54 | max_length=self.max_length,
55 | truncation=True,
56 | padding=True,
57 | return_tensors='pt'
58 | )
59 | src_tokens = encoded_src['input_ids'].to(self.device)
60 | src_mask = encoded_src['attention_mask'].to(self.device)
61 |
62 | tgt_tokens = encoded_tgt['input_ids'].to(self.device)
63 | tgt_mask = encoded_tgt['attention_mask']
64 | tgt_len = tgt_mask.sum(dim=1).to(self.device)
65 |
66 | output = self.model(
67 | input_ids=src_tokens,
68 | attention_mask=src_mask,
69 | labels=tgt_tokens
70 | )
71 | logits = output.logits.view(-1, self.model.config.vocab_size)
72 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
73 | loss = loss.view(tgt_tokens.shape[0], -1)
74 | loss = loss.sum(dim=1) / tgt_len
75 | curr_score_list = [-x.item() for x in loss]
76 | score_list += curr_score_list
77 |
78 | except RuntimeError:
79 | traceback.print_exc()
80 | print(f'source: {src_list}')
81 | print(f'target: {tgt_list}')
82 | exit(0)
83 | return score_list
84 |
85 |
86 | def calculate_bartscore(tmp_json_files, models_scores, path_to_finetuned_bart):
87 | if torch.cuda.is_available():
88 | device = 'cuda:0'
89 | else:
90 | device = 'cpu'
91 |
92 | bart_scorer = BARTScorer(device=device, checkpoint='facebook/bart-large-cnn')
93 | if len(path_to_finetuned_bart)>0:
94 | bart_scorer.load(path=path_to_finetuned_bart)
95 | scores_dict = dict()
96 |
97 | for result_file in listdir(tmp_json_files):
98 | if not result_file.endswith("_metrics.jsonl"):
99 | continue
100 |
101 | model = result_file.replace("_metrics.jsonl", "")
102 |
103 | if not model in models_scores:
104 | continue
105 |
106 | scores_dict[model] = []
107 | data = []
108 |
109 | filename = join(tmp_json_files, result_file)
110 |
111 | with open(filename, "r") as file:
112 | for line in file:
113 | data.append(json.loads(line))
114 |
115 | print(f"Evaluating {model} model")
116 |
117 |
118 | processed_samples_ctr = 0
119 |
120 | for sample in data:
121 | model_output = sample['model_output'].strip()
122 | target_output = sample['target_output'].strip()
123 | score = bart_scorer.score([model_output], [target_output])[0]
124 | scores_dict[model].append(score)
125 | sample['scores'].append({'name': 'bartscore', 'value': score})
126 |
127 | processed_samples_ctr += 1
128 | if processed_samples_ctr % 10 == 0:
129 | print(f"Processed {processed_samples_ctr}/{len(data)} samples.")
130 |
131 | # dump the new metric to appear in the output view dashboard
132 | with open(filename, 'w') as outfile:
133 | for entry in data:
134 | json.dump(entry, outfile)
135 | outfile.write('\n')
136 |
137 | # update the models_score to appear in the index.html leaderboard
138 | current_metrics = models_scores[model]
139 | current_metrics['bartscore'] = np.average(scores_dict[model])
140 | models_scores[model] = current_metrics
141 |
142 |
--------------------------------------------------------------------------------
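A minimal sketch of scoring one candidate/reference pair with BARTScorer directly (not from the repository). Scores are average token log-likelihoods, so higher (closer to zero) means the candidate is closer to the reference; CPU is used for portability:

```
from utils.metrics.bart_score import BARTScorer

scorer = BARTScorer(device="cpu", checkpoint="facebook/bart-large-cnn")
scores = scorer.score(
    ["The agent approved the refund requested by the user."],  # model output
    ["User asked for a refund and the agent approved it."],    # reference
)
print(scores[0])  # a negative float
```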
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
--------------------------------------------------------------------------------
/utils/model_runners/pricing_calculator.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | import json
3 | import traceback
4 | import sys
5 | import os
6 |
7 | import logging
8 | logging.basicConfig(level=logging.INFO)
9 | logger = logging.getLogger(__name__)
10 |
11 | class PricingCalculator:
12 |     '''Calculate inference pricing for a given model.
13 |     Uses the pricing API where available and lookup tables from pricing sources where it is not,
14 |     and can calculate cost for both per-token and per-hosting-time pricing models.'''
15 | COST_PER_TOKEN = 'cpt'
16 | COST_PER_HOUR = 'cph'
17 |
18 |     _pricing_client = boto3.client('pricing', region_name='us-east-1')
19 |     _model_price_by_name = {}
20 |     _model_name_by_id = {}
21 |     @classmethod
22 |     def static_init(cls):
23 | for model in boto3.client('bedrock').list_foundation_models()['modelSummaries']:
24 | PricingCalculator._model_name_by_id[model['modelId']] = model['modelName']
25 |
26 | pricing_data = []
27 |
28 | paginator = PricingCalculator._pricing_client.get_paginator('get_products')
29 | operation_parameters = {'ServiceCode': 'AmazonBedrock'}
30 | page_iterator = paginator.paginate(**operation_parameters)
31 |
32 | try:
33 | for page in page_iterator:
34 | pricing_data.extend(page['PriceList'])
35 | except Exception as e:
36 |             logger.warning(f'Failed to fetch price list: {e}')
37 | return
38 |
39 |
40 |
41 | for item in pricing_data:
42 | try:
43 | price_item = json.loads(item)
44 | usage_type = price_item['product']['attributes']['usagetype']
45 | region_code = price_item['product']['attributes']['regionCode']
46 | if region_code != boto3.session.Session().region_name:
47 | continue
48 | if 'inferenceType' in price_item['product']['attributes']:
49 | inference_type = price_item['product']['attributes']['inferenceType']
50 | else:
51 | inference_type = 'N/A'
52 |
53 | if 'model' in price_item['product']['attributes']:
54 | model_name = price_item['product']['attributes']['model']
55 | elif 'titanModel' in price_item['product']['attributes']:
56 | model_name = price_item['product']['attributes']['titanModel']
57 | elif 'titanModelUnit' in price_item['product']['attributes']:
58 | model_name = price_item['product']['attributes']['titanModelUnit']
59 | else:
60 |                     logger.error(f"Model name is missing. Skipping price item: {price_item['product']['attributes']}")
61 |                     continue
62 |
63 |                 on_demand_terms = price_item['terms']['OnDemand']
64 |                 price_dimensions = list(on_demand_terms.values())[0]['priceDimensions']
65 |                 price_per_unit = list(price_dimensions.values())[0]['pricePerUnit']['USD']
66 |                 unit = list(price_dimensions.values())[0]['unit']
67 |                 if model_name not in PricingCalculator._model_price_by_name:
68 |                     PricingCalculator._model_price_by_name[model_name] = dict()
69 |                     PricingCalculator._model_price_by_name[model_name]['model_id'] = model_name
70 |                 if 'input-tokens' in usage_type and unit == '1K tokens':
71 |                     PricingCalculator._model_price_by_name[model_name]['input_cost_per_1000_tokens'] = price_per_unit
72 |                 elif 'output-tokens' in usage_type and unit == '1K tokens':
73 |                     PricingCalculator._model_price_by_name[model_name]['output_cost_per_1000_tokens'] = price_per_unit
74 |                 elif 'ProvisionedThroughput' in usage_type:
75 |                     PricingCalculator._model_price_by_name[model_name]['instance_type'] = usage_type
76 |                     PricingCalculator._model_price_by_name[model_name]['hosting_cost_per_hour'] = price_per_unit
77 |                 else:
78 |                     pass  # ignore other usage types
79 |             except Exception as e:
80 |                 logger.error(f'Failed to parse price item: {e}')
81 |                 raise
82 |
83 |
84 |
85 |     # TODO: Waiting for an official API to support marketplace models
86 | _lookup_price_table = [
87 | {
88 | "model_id": "anthropic.claude-v2:1",
89 | "id_type": "model_id",
90 | "input_cost_per_1000_tokens": 0.008,
91 | "output_cost_per_1000_tokens": 0.024
92 | },
93 | {
94 | "model_id": "anthropic.claude-v2",
95 | "id_type": "model_id",
96 | "input_cost_per_1000_tokens": 0.008,
97 | "output_cost_per_1000_tokens": 0.024
98 | },
99 | {
100 | "model_id": "anthropic.claude-instant-v1",
101 | "id_type": "model_id",
102 | "input_cost_per_1000_tokens": 0.0008,
103 | "output_cost_per_1000_tokens": 0.0024
104 | },
105 | {
106 | "model_id": "amazon.titan-text-lite-v1",
107 | "id_type": "model_id",
108 | "input_cost_per_1000_tokens": 0.0003,
109 | "output_cost_per_1000_tokens": 0.0004
110 | },
111 | {
112 | "model_id": "amazon.titan-text-express-v1",
113 | "id_type": "model_id",
114 | "input_cost_per_1000_tokens": 0.0008,
115 | "output_cost_per_1000_tokens": 0.0016
116 | },
117 | {
118 | "model_id": "meta.llama2-13b-chat-v1",
119 | "id_type": "model_id",
120 | "input_cost_per_1000_tokens": 0.00075,
121 | "output_cost_per_1000_tokens": 0.001
122 | },
123 | {
124 | "model_id": "cohere.command-light-text-v14",
125 | "id_type": "model_id",
126 | "input_cost_per_1000_tokens": 0.0003,
127 | "output_cost_per_1000_tokens": 0.0006
128 | },
129 |
130 | {
131 | "model_id": "anthropic.claude-3-sonnet-20240229-v1:0",
132 | "id_type": "model_id",
133 | "input_cost_per_1000_tokens": 0.003,
134 | "output_cost_per_1000_tokens": 0.015
135 | },
136 | {
137 | "model_id": "anthropic.claude-3-haiku-20240307-v1:0",
138 | "id_type": "model_id",
139 | "input_cost_per_1000_tokens": 0.00025,
140 | "output_cost_per_1000_tokens": 0.00125
141 | },
142 | {
143 | "model_id": "meta.llama2-70b-chat-v1",
144 | "id_type": "model_id",
145 | "input_cost_per_1000_tokens": 0.00195,
146 | "output_cost_per_1000_tokens": 0.00256
147 | },
148 | {
149 | "model_id": "ai21.j2-mid-v1",
150 | "id_type": "model_id",
151 | "input_cost_per_1000_tokens": 0.0125,
152 | "output_cost_per_1000_tokens": 0.0125
153 | },
154 | {
155 | "model_id": "ai21.ai21.j2-ultra-v1",
156 | "id_type": "model_id",
157 | "input_cost_per_1000_tokens": 0.0188,
158 | "output_cost_per_1000_tokens": 0.0188
159 | },
160 | {
161 | "model_id": "cohere.command-text-v14",
162 | "id_type": "model_id",
163 | "input_cost_per_1000_tokens": 0.0015,
164 | "output_cost_per_1000_tokens": 0.0020
165 | },
166 | {
167 | "model_id": "mistral.mistral-7b-instruct-v0:2",
168 | "id_type": "model_id",
169 | "input_cost_per_1000_tokens": 0.00015,
170 | "output_cost_per_1000_tokens": 0.0002
171 | },
172 | {
173 | "model_id": "mistral.mixtral-8x7b-instruct-v0:1",
174 | "id_type": "model_id",
175 | "input_cost_per_1000_tokens": 0.00045,
176 | "output_cost_per_1000_tokens": 0.0007
177 |
178 | },
179 | {
180 | "model_id": "self_hosted_test",
181 | "id_type": "model_id",
182 | "instance_type": "g5.12xlarge",
183 | },
184 | {
185 | "model_id": "gpt-4-0125-preview",
186 | "id_type": "model_id",
187 | "input_cost_per_1000_tokens": 0.01,
188 | "output_cost_per_1000_tokens": 0.03
189 | },
190 | {
191 | "model_id": "gpt-4-1106-preview",
192 | "id_type": "model_id",
193 | "input_cost_per_1000_tokens": 0.01,
194 | "output_cost_per_1000_tokens": 0.03
195 | },
196 | {
197 | "model_id": "gpt-4-1106-vision-preview",
198 | "id_type": "model_id",
199 | "input_cost_per_1000_tokens": 0.01,
200 | "output_cost_per_1000_tokens": 0.03
201 | },
202 | {
203 | "model_id": "gpt-4",
204 | "id_type": "model_id",
205 | "input_cost_per_1000_tokens": 0.03,
206 | "output_cost_per_1000_tokens": 0.06
207 | },
208 | {
209 | "model_id": "gpt-4-32k",
210 | "id_type": "model_id",
211 | "input_cost_per_1000_tokens": 0.06,
212 | "output_cost_per_1000_tokens": 0.12
213 | },
214 | {
215 | "model_id": "gpt-3.5-turbo-0125",
216 | "id_type": "model_id",
217 | "input_cost_per_1000_tokens": 0.0005,
218 | "output_cost_per_1000_tokens": 0.0015
219 | },
220 | {
221 | "model_id": "gpt-3.5-turbo-instruct",
222 | "id_type": "model_id",
223 | "input_cost_per_1000_tokens": 0.0015,
224 | "output_cost_per_1000_tokens": 0.002
225 | },
226 | {
227 | "model_id": "gpt-3.5-turbo-1106",
228 | "id_type": "model_id",
229 | "input_cost_per_1000_tokens": 0.001,
230 | "output_cost_per_1000_tokens": 0.002
231 | },
232 | {
233 | "model_id": "gpt-3.5-turbo-0613",
234 | "id_type": "model_id",
235 | "input_cost_per_1000_tokens": 0.0015,
236 | "output_cost_per_1000_tokens": 0.002
237 | },
238 | {
239 | "model_id": "gpt-3.5-turbo-16k-0613",
240 | "id_type": "model_id",
241 | "input_cost_per_1000_tokens": 0.003,
242 | "output_cost_per_1000_tokens": 0.004
243 | },
244 | {
245 | "model_id": "gpt-3.5-turbo-0301",
246 | "id_type": "model_id",
247 | "input_cost_per_1000_tokens": 0.0015,
248 | "output_cost_per_1000_tokens": 0.002
249 | }
250 | ]
251 |
252 |
253 |     @classmethod
254 |     def _instance_pricing(cls, instance_type):
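        # Query on-demand pricing for a shared-tenancy Linux instance of the given type in the current region.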
255 | data = PricingCalculator._pricing_client.get_products(ServiceCode='AmazonEC2', Filters=[{"Field": "tenancy", "Value": "shared", "Type": "TERM_MATCH"},
256 | {"Field": "operatingSystem", "Value": "Linux", "Type": "TERM_MATCH"},
257 | {"Field": "preInstalledSw", "Value": "NA", "Type": "TERM_MATCH"},
258 | {"Field": "instanceType", "Value": instance_type, "Type": "TERM_MATCH"},
259 | {"Field": "marketoption", "Value": "OnDemand", "Type": "TERM_MATCH"},
260 | {"Field": "regionCode", "Value": boto3.session.Session().region_name , "Type": "TERM_MATCH"},
261 | {"Field": "capacitystatus", "Value": "Used", "Type": "TERM_MATCH"}])
262 | for price in (json.loads(x) for x in data['PriceList']):
263 | first_id = list(price['terms']['OnDemand'].keys())[0]
264 | price_data = price['terms']['OnDemand'][first_id]
265 | second_id = list(price_data['priceDimensions'].keys())[0]
266 | instance_price = price_data['priceDimensions'][second_id]['pricePerUnit']['USD']
267 | if float(instance_price) > 0:
268 | return float(instance_price)
269 |         raise Exception(f'Failed to get instance pricing for instance type {instance_type}')
270 |
271 |     @classmethod
272 |     def retrieve_cost_structure(cls, model_id):
273 |         model_name = PricingCalculator._model_name_by_id.get(model_id)
274 |         if model_name is not None:
275 |             # handle internal (Bedrock) model
276 |             if model_name in PricingCalculator._model_price_by_name:
277 |                 return PricingCalculator._model_price_by_name[model_name]
278 |         # fall back to the static lookup table; match the full id or the id without its version suffix
279 |         for model_cost in PricingCalculator._lookup_price_table:
280 |             if model_id in model_cost['model_id'] or model_id.split(':')[0] in model_cost['model_id'].split(':')[0]:
281 |                 return model_cost
282 |
283 |     @classmethod
284 |     def _calculate_usage_cost(cls, model_id, input_tokens: int = 0, output_tokens: int = 0, inference_time_s: float = 0, instance_type: str = None):
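        # Per-hour pricing applies only when the invocation ran on the instance type named in the
        # cost structure; otherwise fall back to per-token pricing.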
285 | try:
286 | cost_structure = PricingCalculator.retrieve_cost_structure(model_id)
287 | if cost_structure is None:
288 | return None
289 | if 'instance_type' in cost_structure and cost_structure['instance_type'] == instance_type:
290 | return PricingCalculator._calculate_usage_per_second(inference_time_s, cost_structure), cost_structure, PricingCalculator.COST_PER_HOUR
291 | else:
292 | return PricingCalculator._calculate_usage_per_token(input_tokens, output_tokens, cost_structure),cost_structure, PricingCalculator.COST_PER_TOKEN
293 | except Exception as e:
294 |             logger.error(f'Failed to calculate cost for model {model_id}, invocation parameters: {input_tokens}, {output_tokens}, {inference_time_s}')
295 |             raise
296 |
297 |     @classmethod
298 |     def _calculate_usage_per_second(cls, inference_time_s: float = 0, cost_structure=None):
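        # e.g. a 3 s invocation on capacity billed at $20/hour costs 20 * 3 / 3600 ~= $0.017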
299 | if 'hosting_cost_per_hour' in cost_structure:
300 | return cost_structure['hosting_cost_per_hour'] * inference_time_s / (60*60)
301 | return PricingCalculator._instance_pricing(cost_structure['instance_type']) * inference_time_s / (60*60)
302 |
303 |     @classmethod
304 |     def _calculate_usage_per_token(cls, input_tokens, output_tokens, model_cost):
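        # e.g. 2,000 input tokens at $0.003/1K plus 100 output tokens at $0.015/1K
        # costs 0.006 + 0.0015 = $0.0075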
305 | input_cost = float(model_cost['input_cost_per_1000_tokens']) * input_tokens / 1000
306 |         # 'output_cost_per_1000_tokens' is not always given; fall back to the input-token rate
307 |         try:
308 |             output_cost = float(model_cost['output_cost_per_1000_tokens']) * output_tokens / 1000
309 |         except KeyError:
310 |             output_cost = input_cost
311 |
312 | return input_cost + output_cost
313 |
314 |     @classmethod
315 |     def read_model_score_aggregate(cls, model_name, folder):
316 |         '''Read model usage information from the test report and calculate the overall
317 |         cost based on the known pricing. Expects to find a file {folder}/{model_name}_usage.jsonl
318 |         containing one JSON line per invocation with these keys:
319 |             model_id         # name of the model as used in the invocation API
320 |             input_tokens     # number of tokens in the prompt
321 |             output_tokens    # number of tokens in the output
322 |             processing_time  # total invocation time in seconds
323 |             instance_type    # instance type, for models priced on hosting time
324 |         '''
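        # e.g. a usage line: {"model_id": "anthropic.claude-v2", "input_tokens": 512, "output_tokens": 64, "processing_time": 1.2}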
325 |         file_path = f"{folder}/{model_name}_usage.jsonl"
326 |         if not os.path.exists(file_path):
327 |             return None
328 |
329 | # Initialize the sum dictionary
330 | sum_dict = {
331 | 'input_tokens': 0,
332 | 'output_tokens': 0,
333 | 'processing_time': 0,
334 | 'cost': None,
335 | 'cost_model': None,
336 | 'cost_hour': None,
337 | 'cost_input_1M': None,
338 | 'cost_output_1M': None
339 | }
340 | samples = 0
341 |
342 |         with open(file_path, 'r') as usage_file:
343 |             for line in usage_file:
344 |                 samples += 1
345 | item = json.loads(line)
346 |                 input_tokens = item.get('input_tokens', 0)
347 |                 output_tokens = item.get('output_tokens', 0)
348 |                 processing_time = item.get('processing_time', 0)
349 |                 cost, cost_structure, cost_model = PricingCalculator._calculate_usage_cost(item['model_id'], input_tokens, output_tokens, processing_time,
350 |                                                                                            item.get('instance_type'))
351 |                 if sum_dict['cost_model'] is None:
352 | sum_dict['cost_model'] = cost_model
353 | if cost_model == PricingCalculator.COST_PER_HOUR:
354 | sum_dict['cost_hour'] = cost_structure['hosting_cost_per_hour'] if 'hosting_cost_per_hour' in cost_structure else PricingCalculator._instance_pricing(cost_structure['instance_type'])
355 |                 if cost_model == PricingCalculator.COST_PER_TOKEN:
356 |
357 |                     # cast to float: the price may arrive as a string from the pricing API
358 |                     sum_dict['cost_input_1M'] = float(cost_structure['input_cost_per_1000_tokens'])*1000.0
359 |
360 |                     # 'output_cost_per_1000_tokens' may be missing; fall back to the input rate
361 |                     try:
362 |                         output_cost_per_1000_tokens = float(cost_structure['output_cost_per_1000_tokens'])*1000.0
363 |                     except KeyError:
364 |                         output_cost_per_1000_tokens = float(cost_structure['input_cost_per_1000_tokens'])*1000.0
365 |
366 |                     sum_dict['cost_output_1M'] = output_cost_per_1000_tokens
367 | sum_dict['input_tokens'] += input_tokens
368 | sum_dict['output_tokens'] += output_tokens
369 | sum_dict['processing_time'] += processing_time
370 | if cost is None:
371 | continue
372 | if sum_dict['cost'] is None:
373 | sum_dict["cost"] = cost
374 | else:
375 | sum_dict['cost'] += cost
376 |
377 |
378 |         # add per-sample aggregate statistics
379 |         sum_dict['samples'] = samples
380 |         sum_dict['avg_cost'] = sum_dict['cost'] / samples if sum_dict['cost'] is not None else None
381 |         sum_dict['avg_processing_time'] = sum_dict['processing_time'] / samples
382 | return sum_dict
383 |
384 |     @classmethod
385 |     def cleanup_previous_runs(cls, dir_path):
386 | for file_name in os.listdir(dir_path):
387 | if file_name.endswith('_usage.jsonl'):
388 | # Construct the full file path
389 | file_path = os.path.join(dir_path, file_name)
390 | os.remove(file_path)
391 |
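# Populate the Bedrock model-name and price tables once, at import time.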
392 | PricingCalculator.static_init()
--------------------------------------------------------------------------------
/summariziation_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3a0e44ff-c46c-4998-88d7-fb3aee89841d",
6 | "metadata": {},
7 | "source": [
8 | "# Text generation models evaluation\n",
9 | "\n",
10 | "#### This notebook evaluates several LLMs from Bedrock, HuggingFace, Jumpstart, Bedrock finetuned models\n",
11 | "#### Instance type used for the evaluation - ml.g4dn.2xlarge or m5.2xlarge, python 3.10\n",
12 | "#### The metrics evaluated are N-gram matching-based ([ROUGE](https://en.wikipedia.org/wiki/ROUGE_(metric)), [METEOR](https://en.wikipedia.org/wiki/METEOR)) and sematic-based ([BERTScore](https://arxiv.org/abs/1904.09675)) from [FMEval](https://github.com/aws/fmeval/) library (can be further customized), and [BARTScore](https://arxiv.org/abs/2106.11520) using encoder-decoder architecture\n",
13 | "#### The datasets used is [TweetSumm](https://github.com/guyfe/Tweetsumm) (A Dialog Summarization Dataset for Customer Service, published in EMNLP 21)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "id": "68c047c8-0168-45eb-9d59-dda0117ff703",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "%load_ext autoreload\n",
24 | "%autoreload 2\n",
25 | "\n",
26 | "# Optional S3 path to upload results to (e.g. s3://yourbucket/results/ ) - Handy to as a way to download results and open html report on a local machine\n",
27 | "S3_OUTPUT_PATH = None \n",
28 | "\n",
29 | "MODELS_TO_EVAL = [] # if empty list will evaluate all the models available. For specific models, mention their ids from the list below, for example [\"anthropic.claude-v2:1\", \"amazon.titan-text-lite-v1\"]"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "id": "eb788c86-761a-4749-927f-737c194e4613",
36 | "metadata": {
37 | "tags": []
38 | },
39 | "outputs": [],
40 | "source": [
41 | "!pip install --upgrade pip --quiet\n",
42 | "!pip install -r requirements.txt --quiet"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "id": "21f5d06f",
48 | "metadata": {},
49 | "source": [
50 | "### OPEN AI API key\n",
51 | "This is relevant if you'll be using models from OpenAI\n",
52 | "\n",
53 | "- Create a new file called `utils/key.py` in your project directory to store your API key.\n",
54 | "- Go to your OpenAI account and navigate to \"[View API keys](https://platform.openai.com/account/api-keys).\"\n",
55 | "- Select \"Create new secret key.\"\n",
56 | "- Copy the key and insert it into your file `utils/key.py` like this:\n",
57 | "```\n",
58 | "OPENAI_API_KEY = 'sk-actualLongKeyGoesHere123'\n",
59 | "```\n",
60 | "- Save the changes\n",
61 | "- IMPORTANT: Do **not** commit `key.py` to source control as will contain your private key. (It should already be in `.gitgnore`.** Review [this information about API safety](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).\n",
62 | "- Comment out `from utils.key import OPENAI_API_KEY` below."
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "id": "75cfe9a8",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "OPENAI_API_KEY = None # uncommenting the line below will override this\n",
73 | "#from utils.key import OPENAI_API_KEY"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "id": "d4fbff43-d21a-40c6-b430-3def8ae7c268",
79 | "metadata": {
80 | "tags": []
81 | },
82 | "source": [
83 | "## Define bucket config"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "id": "11956018-0992-4ec5-bb51-d73b1c017988",
90 | "metadata": {
91 | "tags": []
92 | },
93 | "outputs": [],
94 | "source": [
95 | "import json\n",
96 | "from pathlib import Path\n",
97 | "import boto3\n",
98 | "import glob\n",
99 | "import shutil\n",
100 | "import os\n",
101 | "from os import listdir\n",
102 | "\n",
103 | "from fmeval.model_runners.bedrock_model_runner import BedrockModelRunner\n",
104 | "from fmeval.model_runners.sm_jumpstart_model_runner import JumpStartModelRunner\n",
105 | "\n",
106 | "from utils.model_runners.gpt_model_runner import GPTModelConfig, GPTModelRunner\n",
107 | "from utils.tweetsumm_data_creator import create_train_test_files\n",
108 | "from utils.model_ranker import create_model_ranking\n",
109 | "from utils.dashboard_creators.output_viewer_creator import create_response_output_view\n",
110 | "from utils.dashboard_creators.comparative_dashboard_creator import create_comparive_dashboard\n",
111 | "from utils.dashboard_creators.data_stats_viewer_creator import create_data_stats_view\n",
112 | "from utils.dashboard_creators.data_preview_viewer import create_data_preview_view\n",
113 | "from utils.dashboard_creators.main_html_creator import create_main_html\n",
114 | "from utils.metrics.bart_score import calculate_bartscore\n",
115 | "\n",
116 | "RESULT_FOLDER = \"/tmp/final_result\"\n",
117 | "if os.path.exists(RESULT_FOLDER):\n",
118 | " shutil.rmtree(RESULT_FOLDER)\n",
119 | "os.mkdir(RESULT_FOLDER)\n",
120 | "\n",
121 | "TMP_JSON_FILES = \"/tmp/jsonl_model_files\"\n",
122 | "if os.path.exists(TMP_JSON_FILES):\n",
123 | " shutil.rmtree(TMP_JSON_FILES)\n",
124 | "os.mkdir(TMP_JSON_FILES)\n",
125 | "\n",
126 | "TMP_DATASET_FILES = \"/tmp/dataset_files\"\n",
127 | "if os.path.exists(TMP_DATASET_FILES):\n",
128 | " shutil.rmtree(TMP_DATASET_FILES)\n",
129 | "os.mkdir(TMP_DATASET_FILES)\n",
130 | "\n",
131 | "RESULT_HTML_FOLDER = RESULT_FOLDER + \"/html_files\"\n",
132 | "if os.path.exists(RESULT_HTML_FOLDER):\n",
133 | " shutil.rmtree(RESULT_HTML_FOLDER)\n",
134 | "os.mkdir(RESULT_HTML_FOLDER)\n",
135 | "\n",
136 | "RESULT_IMG_FOLDER = RESULT_FOLDER + \"/imgs\"\n",
137 | "if os.path.exists(RESULT_IMG_FOLDER):\n",
138 | " shutil.rmtree(RESULT_IMG_FOLDER)\n",
139 | "os.mkdir(RESULT_IMG_FOLDER)\n",
140 | "\n",
141 | "from utils.tweetsumm_data_creator import create_train_test_files\n",
142 | "TEST_FILE_PATH = create_train_test_files(TMP_DATASET_FILES) # creating train and test files\n",
143 | "print(TEST_FILE_PATH)\n",
144 | "\n"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "id": "805a915b-4918-4bf4-994a-7cff3701ec91",
150 | "metadata": {},
151 | "source": [
152 | "## List the models to benchmark"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 5,
158 | "id": "c05192f6-a93c-4c76-bb2b-d88e7205426f",
159 | "metadata": {
160 | "tags": []
161 | },
162 | "outputs": [],
163 | "source": [
164 | "# Bedrock models\n",
165 | "models_to_test = {}\n",
166 | "\n",
167 | "# Add Bedrock Random text generating model to serve as baseline callibration for the various metrics\n",
168 | "models_to_test.update({\n",
169 | " \"random\" : { \n",
170 | " \"model_id\" : \"amazon.titan-text-lite-v1\", \n",
171 | " \"platform\" : \"bedrock\",\n",
172 | " \"output\" : \"results[0].outputText\", \n",
173 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
174 | " \"prompt_template\" : \"Please ignore the following blob of text and create an unrelated text of around 2 sentences\\n $model_input\\n\"\n",
175 | " }\n",
176 | "})\n",
177 | "\n",
178 | "# Add Bedrock Anthropic models in zero-shot\n",
179 | "models_to_test.update({\n",
180 | " \"anthropic.claude-3-sonnet\" : { \n",
181 | " \"model_id\" : \"anthropic.claude-3-sonnet-20240229-v1:0\", \n",
182 | " \"platform\" : \"bedrock\",\n",
183 | " \"output\" : \"content[0].text\", \n",
184 | " \"content_template\" : \"{\\\"messages\\\": [{\\\"role\\\": \\\"user\\\", \\\"content\\\": $prompt}], \\\"max_tokens\\\": 100, \\\"anthropic_version\\\": \\\"bedrock-2023-05-31\\\"}\",\n",
185 | " \"prompt_template\" : \"Below is a dialog between a customer and an agent. Please provide a short and concise summary of the conversation. The summary should be short and include a single sentence describing the customer's complaint or request, and single sentence of the agent's response or action. Please write the summary in a human readable format. Start you answer directly with the summary without any additional prefix.\\n Specify important and relevant amounts, dates and locations inside the summary. Here is the dialog: \"\n",
186 | " },\n",
187 | " \"anthropic.claude-3-haiku\" : { \n",
188 | " \"model_id\" : \"anthropic.claude-3-haiku-20240307-v1:0\", \n",
189 | " \"platform\" : \"bedrock\",\n",
190 | " \"output\" : \"content[0].text\", \n",
191 | " \"content_template\" : \"{\\\"messages\\\": [{\\\"role\\\": \\\"user\\\", \\\"content\\\": $prompt}], \\\"max_tokens\\\": 100, \\\"anthropic_version\\\": \\\"bedrock-2023-05-31\\\"}\",\n",
192 | " \"prompt_template\" : \"Below is a dialog between a customer and an agent. Please provide a short and concise summary of the conversation. The summary should be short and include a single sentence describing the customer's complaint or request, and single sentence of the agent's response or action. Please write the summary in a human readable format. Start you answer directly with the summary without any additional prefix.\\n Specify important and relevant amounts, dates and locations inside the summary. Here is the dialog: \"\n",
193 | " }\n",
194 | "})\n",
195 | "\n",
196 | "# Add Bedrock Amazon Titan models in zero-shot\n",
197 | "models_to_test.update({\n",
198 | " \"amazon.titan-text-lite-v1\" : { \n",
199 | " \"model_id\" : \"amazon.titan-text-lite-v1\", \n",
200 | " \"platform\" : \"bedrock\",\n",
201 | " \"output\" : \"results[0].outputText\", \n",
202 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
203 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below. The summary should be short and include a single sentence describing the customer's complaint or request, and single sentence of the agent's response or action. Do not include any additional information that does not appear in the dialog. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\\n$model_input\\n\\nsummary:\\n\"\n",
204 | " },\n",
205 | " \"amazon.titan-text-express-v1\" :{ \n",
206 | " \"model_id\" : \"amazon.titan-text-express-v1\", \n",
207 | " \"platform\" : \"bedrock\",\n",
208 | " \"output\" : \"results[0].outputText\", \n",
209 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
210 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\\n $model_input\\n\\nsummary:\\n\"\n",
211 | " },\n",
212 | "})\n",
213 | "\n",
214 | "# Add Cohere and Llama2 Bedrock models in zero-shot\n",
215 | "models_to_test.update({\n",
216 | " \"cohere.command-light-text-v14\" :{ \n",
217 | " \"model_id\" : \"cohere.command-light-text-v14\", \n",
218 | " \"platform\" : \"bedrock\",\n",
219 | " \"output\" : \"generations[0].text\", \n",
220 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_tokens\\\": 100}\",\n",
221 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:\\n $model_input\\n\\nsummary:\\n\"\n",
222 | " },\n",
223 | " \"meta.llama2-13b-chat-v1\" :{ \n",
224 | " \"model_id\" : \"meta.llama2-13b-chat-v1\", \n",
225 | " \"platform\" : \"bedrock\",\n",
226 | " \"output\" : \"generation\", \n",
227 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_gen_len\\\": 100, \\\"top_p\\\": 1, \\\"temperature\\\": 1.0}\",\n",
228 | " \"prompt_template\" : \"[INST]Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. Here is the dialog:[/INST]\\n Transcript\\n $model_input \\n\\n Summary:\\n\"\n",
229 | " },\n",
230 | "})\n",
231 | "\n",
232 | "# Add various Bedrock models in one-shot\n",
233 | "models_to_test.update({\n",
234 | " \"amazon.titan-text-lite-v1-one-shot\" : { \n",
235 | " \"model_id\" : \"amazon.titan-text-lite-v1\", \n",
236 | " \"platform\" : \"bedrock\",\n",
237 | " \"output\" : \"results[0].outputText\", \n",
238 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
239 | " \"prompt_template\" : \"[INST]Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary. \\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\\n\\n [/INST] [INST]\\n Transcript:\\n $model_input [/INST]\\n Summary:\"\n",
240 | " },\n",
241 | " \"meta.llama2-13b-chat-v1-one-shot\" :{ \n",
242 | " \"model_id\" : \"meta.llama2-13b-chat-v1\", \n",
243 | " \"platform\" : \"bedrock\",\n",
244 | " \"output\" : \"generation\", \n",
245 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_gen_len\\\": 100, \\\"top_p\\\": 1, \\\"temperature\\\": 1.0}\",\n",
246 | " \"prompt_template\" : \"[INST] <> Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary.< \\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\\n\\n [/INST] [INST]\\n Transcript:\\n $model_input [/INST] Summary:\"\n",
247 | " },\n",
248 | " \"cohere.command-light-text-v14-one-shot\" :{ \n",
249 | " \"model_id\" : \"cohere.command-light-text-v14\", \n",
250 | " \"platform\" : \"bedrock\",\n",
251 | " \"output\" : \"generations[0].text\", \n",
252 | " \"content_template\" : \"{\\\"prompt\\\": $prompt, \\\"max_tokens\\\": 100}\",\n",
253 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary.\\n\\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\\n\\nTranscript:\\n $model_input\\n Summary:\"\n",
254 | " },\n",
255 | " \"amazon.titan-text-express-v1-one-shot\" :{ \n",
256 | " \"model_id\" : \"amazon.titan-text-express-v1\", \n",
257 | " \"platform\" : \"bedrock\",\n",
258 | " \"output\" : \"results[0].outputText\", \n",
259 | " \"content_template\" : \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
260 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary.\\n\\n Example Transcript:\\n user: bought a celcus tv from your Finchley store last year in December and it stopped working yesterday - can you repair it or change Your cctv recording from the date we bought it - agent: Can you confirm did you pay cash or card for the telvision? We accept credit/debit card statements as a proof of purchase. Steven user: Yes. I paid by card, I think there were other things I bought with the tv as well , but I remember the price of the television was 175 Actually, I just checked my bank statements and I bought the tv in January 2017 and not dec 2016 and paid for it by card - 175 agent: We would use the bank statements transaction ID to match our till receipts. If you return the television with your credit/debit card...1/2 ...statement our in store colleagues will advise you further. Steven 2/2 user: Great! Thank you. One last question, Ive recycled the Tvs box - is it rwqur Required** agent: As long as you've got proof of purchase you'll be fine Dimitar! Ewan.\\n Summary: Customer is asking to repair or change the television which is not working. Agent updated to return the television with their credit/debit card.\\n\\nTranscript:\\n $model_input\\n Summary:\"\n",
261 | " },\n",
262 | "})\n",
263 | "\n",
264 | "# Add OpenAI models in zero-shot\n",
265 | "models_to_test.update({\n",
266 | " \"gpt.3.5-turbu-0125\" :{ \n",
267 | " \"model_id\" : \"gpt-3.5-turbo-0125\", \n",
268 | " \"api_key\" : OPENAI_API_KEY,\n",
269 | " \"platform\" : \"openai\",\n",
270 | " \"temperature\" : 1,\n",
271 | " \"top_p\" : 1,\n",
272 | " \"max_tokens\" : 100,\n",
273 | " \"prompt_template\" : \"Please provide a short and concise summary of the conversation below that includes a summary of both the user and the agent. Specify important and relevant amounts, dates and locations inside the sentences of the summary.\\n Transcript:\\n $model_input \\n Summary:\\n\"\n",
274 | " }\n",
275 | "})\n"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "id": "cb20053f-e76e-4ffc-b94d-a6589409776c",
281 | "metadata": {},
282 | "source": [
283 | "## Adding your own custom models\n",
284 | "In case you wish to add custom model, simply create custom model runner. For example, see custom model runner which wraps GPT-3.5 in the folder utils/model_runners/gpt_model_runner.py \n",
285 | "\n",
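    "A minimal sketch of such a runner (assuming fmeval's `ModelRunner` interface, whose `predict` returns an (output, log-probability) tuple; `my_model_invoke` is a hypothetical call into your own model):\n",
    "```\n",
    "from fmeval.model_runners.model_runner import ModelRunner\n",
    "\n",
    "class MyCustomModelRunner(ModelRunner):\n",
    "    def predict(self, prompt: str):\n",
    "        # hypothetical: invoke your model with the fully rendered prompt\n",
    "        output_text = my_model_invoke(prompt)\n",
    "        return output_text, None  # no log probability available\n",
    "```\n",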
286 | "\n",
287 | "## Adding finetuned models\n",
288 | "In case you wish to add Bedrock finetuned model: \n",
289 | "1. First finetune a model (for details on finetuning on Berdrock visit https://aws.amazon.com/blogs/aws/customize-models-in-amazon-bedrock-with-your-own-data-using-fine-tuning-and-continued-pre-training/).\n",
290 | "2. Once training completed, from Bedrock copy the ARN from Bedrock 'provisioned throughput' dashboard and paste it as the model_id. A finetuning training set is provided. For more details see documentation\n",
291 | "3. Add to the model_dict in the cell above the configuration of your finetuned model as follows:\n",
292 | "\n",
293 |     "```\n",
294 | "{\n",
295 | " \"finetuned_amazon.titan-text-lite-v1\" : {\n",
296 | " \"platform\":\"bedrock\",\n",
297 | " \"model_id\": \"arn:aws:bedrock:us-east-1:333333333:provisioned-model/879asd6s75\",\n",
298 | " \"output\": \"results[0].outputText\",\n",
299 |     "    \"content_template\": \"{\\\"inputText\\\": $prompt, \\\"textGenerationConfig\\\": {\\\"maxTokenCount\\\": 100, \\\"stopSequences\\\": [], \\\"temperature\\\": 1.0, \\\"topP\\\": 1.0}}\",\n",
300 | " \"prompt_template\": \"YOUR PROMPT HERE\"\n",
301 | " }\n",
302 | "}\n",
303 |     "```\n",
304 | "\n",
305 | "\n",
306 | "## Adding Jumpstart models\n",
307 | "Example for evaluation Mistral-7B-Instruct from Jumpstart:\n",
308 | "1. Go to Jumpstart (press home button -> Jumpstart)\n",
309 | "2. Search in the bar for Mistral-7B-Instruct\n",
310 | "3. Click deploy from the model card (don't forget to close the endpoint once you done from SageMaker->inference endpoints)\n",
311 | "4. Add the following to the models list\n",
312 |     "```\n",
313 | "{\n",
314 | " \"platform\":\"jumpstart\",\n",
315 | " \"model_id\": \"huggingface-llm-mistral-7b-instruct\",\n",
316 | " \"endpoint_name\": \"jumpstart-dft-hf-llm-mistral-7b-instruct\",\n",
317 | " \"model_version\": \"*\",\n",
318 | " \"output\": \"[0].generated_text\",\n",
319 | " \"content_template\":\"{\\\"inputs\\\": $prompt, \\\"parameters\\\": {\\\"do_sample\\\": false, \\\"max_new_tokens\\\": 100}}\",\n",
320 | " \"prompt_template\": \"YOUR PROMPT HERE\"\n",
321 | "}\n",
322 |     "```\n"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "id": "ac160609-dfb3-4eb5-bac0-753053f27184",
328 | "metadata": {
329 | "tags": []
330 | },
331 | "source": [
332 | "## Creating ModelRunner"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 6,
338 | "id": "a9466069-430a-4f71-80a2-c0c6e3b4e918",
339 | "metadata": {
340 | "tags": []
341 | },
342 | "outputs": [],
343 | "source": [
344 | "from utils.model_runners.bedrock_counting_runner import CountingBedrockModelRunner\n",
345 | "\n",
346 | "\n",
347 | "def get_models_to_eval():\n",
348 | " if len(MODELS_TO_EVAL) == 0:\n",
349 | " return list(models_to_test.keys())\n",
350 | " return MODELS_TO_EVAL\n",
351 | "\n",
352 | "models = dict() \n",
353 | "for fm in get_models_to_eval(): \n",
354 | " \n",
355 | " data = models_to_test[fm]\n",
356 | " platform = data['platform']\n",
357 | " \n",
358 | " if platform == \"bedrock\":\n",
359 | " runner = CountingBedrockModelRunner(model_id=data[\"model_id\"], output=data[\"output\"], content_template=data[\"content_template\"].replace(\"'\",\"\\\"\"),metrics_folder = TMP_JSON_FILES, model_key = fm)\n",
360 | " elif platform == \"jumpstart\":\n",
361 | " runner = JumpStartModelRunner(endpoint_name=data[\"endpoint_name\"], model_id=data[\"model_id\"], model_version=data[\"model_version\"], output=data[\"output\"].replace(\"'\",\"\\\"\"), content_template=data[\"content_template\"].replace(\"'\",\"\\\"\"))\n",
362 | " elif platform == \"openai\":\n",
363 | " if OPENAI_API_KEY:\n",
364 | " runner = GPTModelRunner(GPTModelConfig(model_id=data[\"model_id\"], api_key=data[\"api_key\"], temperature=data[\"temperature\"], top_p=data[\"top_p\"], max_tokens=data[\"max_tokens\"]),metrics_folder = TMP_JSON_FILES, model_key = fm)\n",
365 | " else:\n",
366 | " print(\"Skipping OpenAI models - Cannot run without an API key\")\n",
367 | " continue\n",
368 | " \n",
369 | " models[fm] = { \"model_runner\": runner, \"prompt_template\": data[\"prompt_template\"]}\n"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "id": "e6198a20-9a87-4a16-af6d-35733f9845c7",
375 | "metadata": {},
376 | "source": [
377 | "## Evaluation run\n",
378 | "Evaluating METEOR, ROUGE, and BERTscore using FMEval library (https://github.com/aws/fmeval). This library is also used by Bedrock when finetuning or evaluating models.\n",
379 | "\n",
380 | "#### Note - if while running this cell you encounter the message - \"Error displaying widget: model not found\" in the evaluation phase...\", simply ignore it. It relates to the UI and does not effect the evaluation."
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 7,
386 | "id": "7ec118e5-b099-49af-998a-1cacf6a7664b",
387 | "metadata": {
388 | "tags": []
389 | },
390 | "outputs": [],
391 | "source": [
392 | "from fmeval.data_loaders.data_config import DataConfig\n",
393 | "from fmeval.constants import MIME_TYPE_JSONLINES\n",
394 | "from fmeval.eval_algorithms.summarization_accuracy import SummarizationAccuracy, SummarizationAccuracyConfig\n",
395 | "from utils.model_runners.pricing_calculator import PricingCalculator\n",
396 | "import pandas as pd\n",
397 | "import os\n",
398 | "\n",
399 | "os.environ[\"PARALLELIZATION_FACTOR\"] = \"1\" # will use a single workder for FMEval\n",
400 | "TMP_JSON_FILES = \"/tmp/jsonl_model_files\"\n",
401 | "if os.path.exists(TMP_JSON_FILES):\n",
402 | " shutil.rmtree(TMP_JSON_FILES)\n",
403 | "os.mkdir(TMP_JSON_FILES)\n",
404 | "\n",
405 | "models_scores = dict()\n",
406 | "models_usage = dict()\n",
407 | "models_to_eval = get_models_to_eval()\n",
408 | "for model_id in models_to_eval:\n",
409 | " print(f\"### Starting model {model_id} evaluation\")\n",
410 | " if not model_id in models:\n",
411 | " print(f\"###model {model_id} doesn't have a valid/complete entry in the model list\")\n",
412 | " continue\n",
413 | " model = models[model_id]\n",
414 | " config = DataConfig(\n",
415 | " dataset_name=f\"data\",\n",
416 | " dataset_uri=TEST_FILE_PATH,\n",
417 | " dataset_mime_type=MIME_TYPE_JSONLINES,\n",
418 | " model_input_location=\"document\",\n",
419 | " target_output_location=\"summary\"\n",
420 | " )\n",
421 | "\n",
422 | " model_runner = model['model_runner']\n",
423 | " eval_algo = SummarizationAccuracy(SummarizationAccuracyConfig())\n",
424 | " eval_output = eval_algo.evaluate(model=model_runner, \n",
425 | " dataset_config=config,\n",
426 | " prompt_template=model[\"prompt_template\"],\n",
427 | " num_records=10,\n",
428 | " save=True)\n",
429 | "\n",
430 | " scores = dict()\n",
431 | " for i in eval_output[0].dataset_scores:\n",
432 | " scores[i.name] = i.value\n",
433 | " \n",
434 | " models_scores[model_id] = scores\n",
435 | " models_usage[model_id] = PricingCalculator.read_model_score_aggregate(model_id, TMP_JSON_FILES)\n",
436 | " shutil.move('/tmp/eval_results/summarization_accuracy_data.jsonl', f'{TMP_JSON_FILES}/{model_id}_metrics.jsonl')\n"
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "id": "ecf0a7ea-0ae1-423a-949a-6526b5b97ef9",
442 | "metadata": {},
443 | "source": [
444 | "## Calculate BARTscore"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 8,
450 | "id": "5e019170-e364-4c54-a030-b00207bcc5e8",
451 | "metadata": {
452 | "tags": []
453 | },
454 | "outputs": [],
455 | "source": [
456 | "### Metrics to calc\n",
457 | "# BARTscore - for more details https://github.com/neulab/BARTScore/blob/main/README.md\n",
458 | "CALC_BARTSCORE = True\n",
459 | "\n",
460 | "PATH_TO_FINETUNED_BART = \"\" # if left empty will use vanilla BART. If you wish to load the finetuned BART, go to BARTscore's github, download the bart_score.pth (appear on the README) and provide the path here\n",
461 | "if CALC_BARTSCORE:\n",
462 | " calculate_bartscore(TMP_JSON_FILES, models_scores, PATH_TO_FINETUNED_BART)"
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "id": "604b4580-546c-48ca-be94-fffabd1cc280",
468 | "metadata": {},
469 | "source": [
470 | "## Create Leaderboard Report HTML"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 9,
476 | "id": "0c6485fe-e5b4-41b1-8323-470b963c8c20",
477 | "metadata": {
478 | "tags": []
479 | },
480 | "outputs": [],
481 | "source": [
482 | "from utils.model_ranker import create_model_ranking\n",
483 | "create_response_output_view(RESULT_HTML_FOLDER, TMP_JSON_FILES, models_scores)\n",
484 | "create_comparive_dashboard(RESULT_HTML_FOLDER, TMP_JSON_FILES)\n",
485 | "create_data_stats_view(TEST_FILE_PATH, RESULT_IMG_FOLDER)\n",
486 | "create_data_preview_view(TEST_FILE_PATH, RESULT_HTML_FOLDER)\n",
487 | "main_html_filename = create_main_html(RESULT_FOLDER, models_scores, models_usage)\n",
488 | "\n",
489 | "print(f\"Created leaderboard in: {main_html_filename}\")\n",
490 | "\n",
491 | "# archive entire report\n",
492 | "from datetime import datetime\n",
493 | "today = datetime.now()\n",
494 | "my_datetime = str(today.strftime(\"%d-%m-%Y_%H-%M-%S\"))\n",
495 | "zip_filename_fullpath = shutil.make_archive(f\"/tmp/{my_datetime}\", 'zip', \"/tmp/final_result\")\n",
496 | "zip_filename = zip_filename_fullpath.split(\"/\")[-1] # filename without folders\n",
497 | "print(f\"Archived report in: {zip_filename_fullpath}\")"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "id": "36570a56-4ba5-4143-aac0-b4ba99afce65",
503 | "metadata": {},
504 | "source": [
505 | "## Upload Report to S3"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "id": "75e36a98-6c1d-49ed-8f43-e2992be1e96d",
512 | "metadata": {
513 | "tags": []
514 | },
515 | "outputs": [],
516 | "source": [
517 | "if S3_OUTPUT_PATH: # if defined S3\n",
518 | " s3_key = f\"{S3_OUTPUT_PATH}/{zip_filename}\"\n",
519 | " !aws s3 cp {zip_filename_fullpath} {s3_key}\n",
520 | " print(f\"Uploaded to: {s3_key}\")\n",
521 | "else:\n",
522 | " print(f\"No S3_OUTPUT_PATH set, not uploading {zip_filename}\")"
523 | ]
524 | },
525 | {
526 | "cell_type": "markdown",
527 | "id": "983658a1",
528 | "metadata": {},
529 | "source": [
530 | "## Viewing results"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 10,
536 | "id": "ae6175ca",
537 | "metadata": {},
538 | "outputs": [],
539 | "source": [
540 | "if S3_OUTPUT_PATH:\n",
541 | " print(f'If running on a *remote* machine to view the results on your local computer copy-paste these commands in your terminal:\\n\\\n",
542 | " aws s3 cp {s3_key} /tmp/{zip_filename}\\n\\\n",
543 | " cd /tmp\\n\\\n",
544 | " unzip -d {zip_filename.replace(\".zip\",\"\")} {zip_filename}\\n\\\n",
545 | " open /tmp/{zip_filename.replace(\".zip\",\"\")}/index.html\\n')\n",
546 | "\n",
547 | "print(f'If running on a *local* machine copy-paste these commands in your terminal:\\n\\\n",
548 | " open {main_html_filename}')"
549 | ]
550 | }
551 | ],
552 | "metadata": {
553 | "availableInstances": [
554 | {
555 | "_defaultOrder": 0,
556 | "_isFastLaunch": true,
557 | "category": "General purpose",
558 | "gpuNum": 0,
559 | "hideHardwareSpecs": false,
560 | "memoryGiB": 4,
561 | "name": "ml.t3.medium",
562 | "vcpuNum": 2
563 | },
564 | {
565 | "_defaultOrder": 1,
566 | "_isFastLaunch": false,
567 | "category": "General purpose",
568 | "gpuNum": 0,
569 | "hideHardwareSpecs": false,
570 | "memoryGiB": 8,
571 | "name": "ml.t3.large",
572 | "vcpuNum": 2
573 | },
574 | {
575 | "_defaultOrder": 2,
576 | "_isFastLaunch": false,
577 | "category": "General purpose",
578 | "gpuNum": 0,
579 | "hideHardwareSpecs": false,
580 | "memoryGiB": 16,
581 | "name": "ml.t3.xlarge",
582 | "vcpuNum": 4
583 | },
584 | {
585 | "_defaultOrder": 3,
586 | "_isFastLaunch": false,
587 | "category": "General purpose",
588 | "gpuNum": 0,
589 | "hideHardwareSpecs": false,
590 | "memoryGiB": 32,
591 | "name": "ml.t3.2xlarge",
592 | "vcpuNum": 8
593 | },
594 | {
595 | "_defaultOrder": 4,
596 | "_isFastLaunch": true,
597 | "category": "General purpose",
598 | "gpuNum": 0,
599 | "hideHardwareSpecs": false,
600 | "memoryGiB": 8,
601 | "name": "ml.m5.large",
602 | "vcpuNum": 2
603 | },
604 | {
605 | "_defaultOrder": 5,
606 | "_isFastLaunch": false,
607 | "category": "General purpose",
608 | "gpuNum": 0,
609 | "hideHardwareSpecs": false,
610 | "memoryGiB": 16,
611 | "name": "ml.m5.xlarge",
612 | "vcpuNum": 4
613 | },
614 | {
615 | "_defaultOrder": 6,
616 | "_isFastLaunch": false,
617 | "category": "General purpose",
618 | "gpuNum": 0,
619 | "hideHardwareSpecs": false,
620 | "memoryGiB": 32,
621 | "name": "ml.m5.2xlarge",
622 | "vcpuNum": 8
623 | },
624 | {
625 | "_defaultOrder": 7,
626 | "_isFastLaunch": false,
627 | "category": "General purpose",
628 | "gpuNum": 0,
629 | "hideHardwareSpecs": false,
630 | "memoryGiB": 64,
631 | "name": "ml.m5.4xlarge",
632 | "vcpuNum": 16
633 | },
634 | {
635 | "_defaultOrder": 8,
636 | "_isFastLaunch": false,
637 | "category": "General purpose",
638 | "gpuNum": 0,
639 | "hideHardwareSpecs": false,
640 | "memoryGiB": 128,
641 | "name": "ml.m5.8xlarge",
642 | "vcpuNum": 32
643 | },
644 | {
645 | "_defaultOrder": 9,
646 | "_isFastLaunch": false,
647 | "category": "General purpose",
648 | "gpuNum": 0,
649 | "hideHardwareSpecs": false,
650 | "memoryGiB": 192,
651 | "name": "ml.m5.12xlarge",
652 | "vcpuNum": 48
653 | },
654 | {
655 | "_defaultOrder": 10,
656 | "_isFastLaunch": false,
657 | "category": "General purpose",
658 | "gpuNum": 0,
659 | "hideHardwareSpecs": false,
660 | "memoryGiB": 256,
661 | "name": "ml.m5.16xlarge",
662 | "vcpuNum": 64
663 | },
664 | {
665 | "_defaultOrder": 11,
666 | "_isFastLaunch": false,
667 | "category": "General purpose",
668 | "gpuNum": 0,
669 | "hideHardwareSpecs": false,
670 | "memoryGiB": 384,
671 | "name": "ml.m5.24xlarge",
672 | "vcpuNum": 96
673 | },
674 | {
675 | "_defaultOrder": 12,
676 | "_isFastLaunch": false,
677 | "category": "General purpose",
678 | "gpuNum": 0,
679 | "hideHardwareSpecs": false,
680 | "memoryGiB": 8,
681 | "name": "ml.m5d.large",
682 | "vcpuNum": 2
683 | },
684 | {
685 | "_defaultOrder": 13,
686 | "_isFastLaunch": false,
687 | "category": "General purpose",
688 | "gpuNum": 0,
689 | "hideHardwareSpecs": false,
690 | "memoryGiB": 16,
691 | "name": "ml.m5d.xlarge",
692 | "vcpuNum": 4
693 | },
694 | {
695 | "_defaultOrder": 14,
696 | "_isFastLaunch": false,
697 | "category": "General purpose",
698 | "gpuNum": 0,
699 | "hideHardwareSpecs": false,
700 | "memoryGiB": 32,
701 | "name": "ml.m5d.2xlarge",
702 | "vcpuNum": 8
703 | },
704 | {
705 | "_defaultOrder": 15,
706 | "_isFastLaunch": false,
707 | "category": "General purpose",
708 | "gpuNum": 0,
709 | "hideHardwareSpecs": false,
710 | "memoryGiB": 64,
711 | "name": "ml.m5d.4xlarge",
712 | "vcpuNum": 16
713 | },
714 | {
715 | "_defaultOrder": 16,
716 | "_isFastLaunch": false,
717 | "category": "General purpose",
718 | "gpuNum": 0,
719 | "hideHardwareSpecs": false,
720 | "memoryGiB": 128,
721 | "name": "ml.m5d.8xlarge",
722 | "vcpuNum": 32
723 | },
724 | {
725 | "_defaultOrder": 17,
726 | "_isFastLaunch": false,
727 | "category": "General purpose",
728 | "gpuNum": 0,
729 | "hideHardwareSpecs": false,
730 | "memoryGiB": 192,
731 | "name": "ml.m5d.12xlarge",
732 | "vcpuNum": 48
733 | },
734 | {
735 | "_defaultOrder": 18,
736 | "_isFastLaunch": false,
737 | "category": "General purpose",
738 | "gpuNum": 0,
739 | "hideHardwareSpecs": false,
740 | "memoryGiB": 256,
741 | "name": "ml.m5d.16xlarge",
742 | "vcpuNum": 64
743 | },
744 | {
745 | "_defaultOrder": 19,
746 | "_isFastLaunch": false,
747 | "category": "General purpose",
748 | "gpuNum": 0,
749 | "hideHardwareSpecs": false,
750 | "memoryGiB": 384,
751 | "name": "ml.m5d.24xlarge",
752 | "vcpuNum": 96
753 | },
754 | {
755 | "_defaultOrder": 20,
756 | "_isFastLaunch": false,
757 | "category": "General purpose",
758 | "gpuNum": 0,
759 | "hideHardwareSpecs": true,
760 | "memoryGiB": 0,
761 | "name": "ml.geospatial.interactive",
762 | "supportedImageNames": [
763 | "sagemaker-geospatial-v1-0"
764 | ],
765 | "vcpuNum": 0
766 | },
767 | {
768 | "_defaultOrder": 21,
769 | "_isFastLaunch": true,
770 | "category": "Compute optimized",
771 | "gpuNum": 0,
772 | "hideHardwareSpecs": false,
773 | "memoryGiB": 4,
774 | "name": "ml.c5.large",
775 | "vcpuNum": 2
776 | },
777 | {
778 | "_defaultOrder": 22,
779 | "_isFastLaunch": false,
780 | "category": "Compute optimized",
781 | "gpuNum": 0,
782 | "hideHardwareSpecs": false,
783 | "memoryGiB": 8,
784 | "name": "ml.c5.xlarge",
785 | "vcpuNum": 4
786 | },
787 | {
788 | "_defaultOrder": 23,
789 | "_isFastLaunch": false,
790 | "category": "Compute optimized",
791 | "gpuNum": 0,
792 | "hideHardwareSpecs": false,
793 | "memoryGiB": 16,
794 | "name": "ml.c5.2xlarge",
795 | "vcpuNum": 8
796 | },
797 | {
798 | "_defaultOrder": 24,
799 | "_isFastLaunch": false,
800 | "category": "Compute optimized",
801 | "gpuNum": 0,
802 | "hideHardwareSpecs": false,
803 | "memoryGiB": 32,
804 | "name": "ml.c5.4xlarge",
805 | "vcpuNum": 16
806 | },
807 | {
808 | "_defaultOrder": 25,
809 | "_isFastLaunch": false,
810 | "category": "Compute optimized",
811 | "gpuNum": 0,
812 | "hideHardwareSpecs": false,
813 | "memoryGiB": 72,
814 | "name": "ml.c5.9xlarge",
815 | "vcpuNum": 36
816 | },
817 | {
818 | "_defaultOrder": 26,
819 | "_isFastLaunch": false,
820 | "category": "Compute optimized",
821 | "gpuNum": 0,
822 | "hideHardwareSpecs": false,
823 | "memoryGiB": 96,
824 | "name": "ml.c5.12xlarge",
825 | "vcpuNum": 48
826 | },
827 | {
828 | "_defaultOrder": 27,
829 | "_isFastLaunch": false,
830 | "category": "Compute optimized",
831 | "gpuNum": 0,
832 | "hideHardwareSpecs": false,
833 | "memoryGiB": 144,
834 | "name": "ml.c5.18xlarge",
835 | "vcpuNum": 72
836 | },
837 | {
838 | "_defaultOrder": 28,
839 | "_isFastLaunch": false,
840 | "category": "Compute optimized",
841 | "gpuNum": 0,
842 | "hideHardwareSpecs": false,
843 | "memoryGiB": 192,
844 | "name": "ml.c5.24xlarge",
845 | "vcpuNum": 96
846 | },
847 | {
848 | "_defaultOrder": 29,
849 | "_isFastLaunch": true,
850 | "category": "Accelerated computing",
851 | "gpuNum": 1,
852 | "hideHardwareSpecs": false,
853 | "memoryGiB": 16,
854 | "name": "ml.g4dn.xlarge",
855 | "vcpuNum": 4
856 | },
857 | {
858 | "_defaultOrder": 30,
859 | "_isFastLaunch": false,
860 | "category": "Accelerated computing",
861 | "gpuNum": 1,
862 | "hideHardwareSpecs": false,
863 | "memoryGiB": 32,
864 | "name": "ml.g4dn.2xlarge",
865 | "vcpuNum": 8
866 | },
867 | {
868 | "_defaultOrder": 31,
869 | "_isFastLaunch": false,
870 | "category": "Accelerated computing",
871 | "gpuNum": 1,
872 | "hideHardwareSpecs": false,
873 | "memoryGiB": 64,
874 | "name": "ml.g4dn.4xlarge",
875 | "vcpuNum": 16
876 | },
877 | {
878 | "_defaultOrder": 32,
879 | "_isFastLaunch": false,
880 | "category": "Accelerated computing",
881 | "gpuNum": 1,
882 | "hideHardwareSpecs": false,
883 | "memoryGiB": 128,
884 | "name": "ml.g4dn.8xlarge",
885 | "vcpuNum": 32
886 | },
887 | {
888 | "_defaultOrder": 33,
889 | "_isFastLaunch": false,
890 | "category": "Accelerated computing",
891 | "gpuNum": 4,
892 | "hideHardwareSpecs": false,
893 | "memoryGiB": 192,
894 | "name": "ml.g4dn.12xlarge",
895 | "vcpuNum": 48
896 | },
897 | {
898 | "_defaultOrder": 34,
899 | "_isFastLaunch": false,
900 | "category": "Accelerated computing",
901 | "gpuNum": 1,
902 | "hideHardwareSpecs": false,
903 | "memoryGiB": 256,
904 | "name": "ml.g4dn.16xlarge",
905 | "vcpuNum": 64
906 | },
907 | {
908 | "_defaultOrder": 35,
909 | "_isFastLaunch": false,
910 | "category": "Accelerated computing",
911 | "gpuNum": 1,
912 | "hideHardwareSpecs": false,
913 | "memoryGiB": 61,
914 | "name": "ml.p3.2xlarge",
915 | "vcpuNum": 8
916 | },
917 | {
918 | "_defaultOrder": 36,
919 | "_isFastLaunch": false,
920 | "category": "Accelerated computing",
921 | "gpuNum": 4,
922 | "hideHardwareSpecs": false,
923 | "memoryGiB": 244,
924 | "name": "ml.p3.8xlarge",
925 | "vcpuNum": 32
926 | },
927 | {
928 | "_defaultOrder": 37,
929 | "_isFastLaunch": false,
930 | "category": "Accelerated computing",
931 | "gpuNum": 8,
932 | "hideHardwareSpecs": false,
933 | "memoryGiB": 488,
934 | "name": "ml.p3.16xlarge",
935 | "vcpuNum": 64
936 | },
937 | {
938 | "_defaultOrder": 38,
939 | "_isFastLaunch": false,
940 | "category": "Accelerated computing",
941 | "gpuNum": 8,
942 | "hideHardwareSpecs": false,
943 | "memoryGiB": 768,
944 | "name": "ml.p3dn.24xlarge",
945 | "vcpuNum": 96
946 | },
947 | {
948 | "_defaultOrder": 39,
949 | "_isFastLaunch": false,
950 | "category": "Memory Optimized",
951 | "gpuNum": 0,
952 | "hideHardwareSpecs": false,
953 | "memoryGiB": 16,
954 | "name": "ml.r5.large",
955 | "vcpuNum": 2
956 | },
957 | {
958 | "_defaultOrder": 40,
959 | "_isFastLaunch": false,
960 | "category": "Memory Optimized",
961 | "gpuNum": 0,
962 | "hideHardwareSpecs": false,
963 | "memoryGiB": 32,
964 | "name": "ml.r5.xlarge",
965 | "vcpuNum": 4
966 | },
967 | {
968 | "_defaultOrder": 41,
969 | "_isFastLaunch": false,
970 | "category": "Memory Optimized",
971 | "gpuNum": 0,
972 | "hideHardwareSpecs": false,
973 | "memoryGiB": 64,
974 | "name": "ml.r5.2xlarge",
975 | "vcpuNum": 8
976 | },
977 | {
978 | "_defaultOrder": 42,
979 | "_isFastLaunch": false,
980 | "category": "Memory Optimized",
981 | "gpuNum": 0,
982 | "hideHardwareSpecs": false,
983 | "memoryGiB": 128,
984 | "name": "ml.r5.4xlarge",
985 | "vcpuNum": 16
986 | },
987 | {
988 | "_defaultOrder": 43,
989 | "_isFastLaunch": false,
990 | "category": "Memory Optimized",
991 | "gpuNum": 0,
992 | "hideHardwareSpecs": false,
993 | "memoryGiB": 256,
994 | "name": "ml.r5.8xlarge",
995 | "vcpuNum": 32
996 | },
997 | {
998 | "_defaultOrder": 44,
999 | "_isFastLaunch": false,
1000 | "category": "Memory Optimized",
1001 | "gpuNum": 0,
1002 | "hideHardwareSpecs": false,
1003 | "memoryGiB": 384,
1004 | "name": "ml.r5.12xlarge",
1005 | "vcpuNum": 48
1006 | },
1007 | {
1008 | "_defaultOrder": 45,
1009 | "_isFastLaunch": false,
1010 | "category": "Memory Optimized",
1011 | "gpuNum": 0,
1012 | "hideHardwareSpecs": false,
1013 | "memoryGiB": 512,
1014 | "name": "ml.r5.16xlarge",
1015 | "vcpuNum": 64
1016 | },
1017 | {
1018 | "_defaultOrder": 46,
1019 | "_isFastLaunch": false,
1020 | "category": "Memory Optimized",
1021 | "gpuNum": 0,
1022 | "hideHardwareSpecs": false,
1023 | "memoryGiB": 768,
1024 | "name": "ml.r5.24xlarge",
1025 | "vcpuNum": 96
1026 | },
1027 | {
1028 | "_defaultOrder": 47,
1029 | "_isFastLaunch": false,
1030 | "category": "Accelerated computing",
1031 | "gpuNum": 1,
1032 | "hideHardwareSpecs": false,
1033 | "memoryGiB": 16,
1034 | "name": "ml.g5.xlarge",
1035 | "vcpuNum": 4
1036 | },
1037 | {
1038 | "_defaultOrder": 48,
1039 | "_isFastLaunch": false,
1040 | "category": "Accelerated computing",
1041 | "gpuNum": 1,
1042 | "hideHardwareSpecs": false,
1043 | "memoryGiB": 32,
1044 | "name": "ml.g5.2xlarge",
1045 | "vcpuNum": 8
1046 | },
1047 | {
1048 | "_defaultOrder": 49,
1049 | "_isFastLaunch": false,
1050 | "category": "Accelerated computing",
1051 | "gpuNum": 1,
1052 | "hideHardwareSpecs": false,
1053 | "memoryGiB": 64,
1054 | "name": "ml.g5.4xlarge",
1055 | "vcpuNum": 16
1056 | },
1057 | {
1058 | "_defaultOrder": 50,
1059 | "_isFastLaunch": false,
1060 | "category": "Accelerated computing",
1061 | "gpuNum": 1,
1062 | "hideHardwareSpecs": false,
1063 | "memoryGiB": 128,
1064 | "name": "ml.g5.8xlarge",
1065 | "vcpuNum": 32
1066 | },
1067 | {
1068 | "_defaultOrder": 51,
1069 | "_isFastLaunch": false,
1070 | "category": "Accelerated computing",
1071 | "gpuNum": 1,
1072 | "hideHardwareSpecs": false,
1073 | "memoryGiB": 256,
1074 | "name": "ml.g5.16xlarge",
1075 | "vcpuNum": 64
1076 | },
1077 | {
1078 | "_defaultOrder": 52,
1079 | "_isFastLaunch": false,
1080 | "category": "Accelerated computing",
1081 | "gpuNum": 4,
1082 | "hideHardwareSpecs": false,
1083 | "memoryGiB": 192,
1084 | "name": "ml.g5.12xlarge",
1085 | "vcpuNum": 48
1086 | },
1087 | {
1088 | "_defaultOrder": 53,
1089 | "_isFastLaunch": false,
1090 | "category": "Accelerated computing",
1091 | "gpuNum": 4,
1092 | "hideHardwareSpecs": false,
1093 | "memoryGiB": 384,
1094 | "name": "ml.g5.24xlarge",
1095 | "vcpuNum": 96
1096 | },
1097 | {
1098 | "_defaultOrder": 54,
1099 | "_isFastLaunch": false,
1100 | "category": "Accelerated computing",
1101 | "gpuNum": 8,
1102 | "hideHardwareSpecs": false,
1103 | "memoryGiB": 768,
1104 | "name": "ml.g5.48xlarge",
1105 | "vcpuNum": 192
1106 | },
1107 | {
1108 | "_defaultOrder": 55,
1109 | "_isFastLaunch": false,
1110 | "category": "Accelerated computing",
1111 | "gpuNum": 8,
1112 | "hideHardwareSpecs": false,
1113 | "memoryGiB": 1152,
1114 | "name": "ml.p4d.24xlarge",
1115 | "vcpuNum": 96
1116 | },
1117 | {
1118 | "_defaultOrder": 56,
1119 | "_isFastLaunch": false,
1120 | "category": "Accelerated computing",
1121 | "gpuNum": 8,
1122 | "hideHardwareSpecs": false,
1123 | "memoryGiB": 1152,
1124 | "name": "ml.p4de.24xlarge",
1125 | "vcpuNum": 96
1126 | },
1127 | {
1128 | "_defaultOrder": 57,
1129 | "_isFastLaunch": false,
1130 | "category": "Accelerated computing",
1131 | "gpuNum": 0,
1132 | "hideHardwareSpecs": false,
1133 | "memoryGiB": 32,
1134 | "name": "ml.trn1.2xlarge",
1135 | "vcpuNum": 8
1136 | },
1137 | {
1138 | "_defaultOrder": 58,
1139 | "_isFastLaunch": false,
1140 | "category": "Accelerated computing",
1141 | "gpuNum": 0,
1142 | "hideHardwareSpecs": false,
1143 | "memoryGiB": 512,
1144 | "name": "ml.trn1.32xlarge",
1145 | "vcpuNum": 128
1146 | },
1147 | {
1148 | "_defaultOrder": 59,
1149 | "_isFastLaunch": false,
1150 | "category": "Accelerated computing",
1151 | "gpuNum": 0,
1152 | "hideHardwareSpecs": false,
1153 | "memoryGiB": 512,
1154 | "name": "ml.trn1n.32xlarge",
1155 | "vcpuNum": 128
1156 | }
1157 | ],
1158 | "instance_type": "ml.g4dn.xlarge",
1159 | "kernelspec": {
1160 | "display_name": "Python 3 (ipykernel)",
1161 | "language": "python",
1162 | "name": "python3"
1163 | },
1164 | "language_info": {
1165 | "codemirror_mode": {
1166 | "name": "ipython",
1167 | "version": 3
1168 | },
1169 | "file_extension": ".py",
1170 | "mimetype": "text/x-python",
1171 | "name": "python",
1172 | "nbconvert_exporter": "python",
1173 | "pygments_lexer": "ipython3",
1174 | "version": "3.10.11"
1175 | }
1176 | },
1177 | "nbformat": 4,
1178 | "nbformat_minor": 5
1179 | }
1180 |
--------------------------------------------------------------------------------
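
The JSON block above is the instance-type catalog that SageMaker Studio embeds in a notebook's metadata, together with the kernel spec and the selected `instance_type` (here `ml.g4dn.xlarge`, the smallest fast-launch GPU option in the catalog). As a minimal sketch, assuming the notebook is read from the repository root, this is one way to inspect that metadata programmatically; the keys used (`availableInstances`, `instance_type`, `kernelspec`, `gpuNum`) are exactly the ones visible in the dump above:

```python
import json

# Load the example notebook from this repository (a .ipynb file is plain JSON).
with open("summariziation_example.ipynb", "r", encoding="utf-8") as f:
    nb = json.load(f)

meta = nb["metadata"]
print("Selected instance type:", meta["instance_type"])       # e.g. ml.g4dn.xlarge
print("Kernel:", meta["kernelspec"]["display_name"])          # e.g. Python 3 (ipykernel)

# Filter the embedded catalog down to GPU-backed choices only.
gpu_instances = [i["name"] for i in meta["availableInstances"] if i["gpuNum"] > 0]
print("GPU instances:", gpu_instances)
```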