├── .github
│   └── workflows
│       ├── cla.yml
│       ├── publish-to-test-pypi.yml
│       └── stale.yaml
├── README.md
├── cla.md
├── demo_helpers
│   ├── MANIFEST.in
│   ├── demo_helpers
│   │   ├── .gitignore
│   │   ├── __init__.py
│   │   ├── args.py
│   │   ├── compute_performance.py
│   │   ├── dataset.py
│   │   ├── datasets
│   │   │   └── README.md
│   │   ├── misc.py
│   │   ├── model_download.py
│   │   ├── models.py
│   │   ├── pretrained_models
│   │   │   ├── m5.pt
│   │   │   └── pointnet.pth
│   │   └── validate.py
│   └── setup.py
├── docs
│   ├── img
│   │   └── groqflow.gif
│   ├── install.md
│   ├── readme.md
│   ├── release_notes.md
│   ├── user_guide.md
│   └── versioning.md
├── examples
│   ├── hummingbird
│   │   ├── randomforest.py
│   │   └── xgbclassifier.py
│   ├── keras
│   │   └── hello_world.py
│   ├── onnx
│   │   └── hello_world.py
│   ├── pytorch
│   │   ├── assembler_flags.py
│   │   ├── benchmark.py
│   │   ├── benchmark_abunch.py
│   │   ├── build_name.py
│   │   ├── cache_dir.py
│   │   ├── compiler_flags.py
│   │   ├── estimate_performance.py
│   │   ├── groqview.py
│   │   ├── hello_world.py
│   │   ├── no_monitor.py
│   │   ├── num_chips.py
│   │   ├── quantization.py
│   │   ├── rebuild_always.py
│   │   ├── rebuild_never.py
│   │   ├── run_abunch.py
│   │   └── sequence.py
│   └── readme.md
├── groqflow
│   ├── __init__.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── build.py
│   │   ├── onnx_helpers.py
│   │   └── sdk_helpers.py
│   ├── groqmodel
│   │   ├── __init__.py
│   │   ├── execute.py
│   │   ├── groqmodel.py
│   │   └── remote.py
│   ├── justgroqit
│   │   ├── __init__.py
│   │   ├── assemble_multichip.py
│   │   ├── compile.py
│   │   ├── export.py
│   │   ├── groqit.py
│   │   └── ignition.py
│   └── version.py
├── license.md
├── proof_points
│   ├── README.md
│   ├── computer_vision
│   │   ├── deit
│   │   │   ├── README.md
│   │   │   ├── deit_tiny.py
│   │   │   └── requirements.txt
│   │   ├── googlenet
│   │   │   ├── README.md
│   │   │   ├── googlenet.py
│   │   │   └── requirements.txt
│   │   ├── mobilenetv2
│   │   │   ├── README.md
│   │   │   ├── mobilenetv2.py
│   │   │   └── requirements.txt
│   │   ├── resnet50
│   │   │   ├── README.md
│   │   │   ├── requirements.txt
│   │   │   └── resnet50.py
│   │   ├── squeezenet
│   │   │   ├── README.md
│   │   │   ├── requirements.txt
│   │   │   └── squeezenet.py
│   │   └── yolo
│   │       ├── README.md
│   │       ├── requirements.txt
│   │       └── yolov6_nano.py
│   ├── natural_language_processing
│   │   ├── bert
│   │   │   ├── README.md
│   │   │   ├── bert_base.py
│   │   │   ├── bert_quantize.py
│   │   │   ├── bert_tiny.py
│   │   │   └── requirements.txt
│   │   ├── distilbert
│   │   │   ├── README.md
│   │   │   ├── distilbert.py
│   │   │   └── requirements.txt
│   │   ├── electra
│   │   │   ├── README.md
│   │   │   ├── electra.py
│   │   │   └── requirements.txt
│   │   ├── minilm
│   │   │   ├── README.md
│   │   │   ├── minilmv2.py
│   │   │   └── requirements.txt
│   │   └── roberta
│   │       ├── README.md
│   │       ├── requirements.txt
│   │       └── roberta.py
│   └── speech
│       └── m5
│           ├── README.md
│           ├── m5.py
│           └── requirements.txt
├── pyproject.toml
└── setup.py
/.github/workflows/cla.yml: -------------------------------------------------------------------------------- 1 | name: "CLA Assistant" 2 | on: 3 | issue_comment: 4 | types: [created] 5 | pull_request_target: 6 | types: [opened, closed, synchronize] 7 | 8 | jobs: 9 | CLAAssistant: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: "CLA Assistant" 13 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' 14 | # Beta Release 15 | uses: contributor-assistant/github-action@v2.2.0 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | # the token below should have repo scope and must be manually added by you in the repository's secrets 19 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 20 | with: 21 | path-to-signatures: "groqflow/version1/cla.json" 22 | path-to-document: "https://github.com/groq/groqflow/cla.md" 23 | # branch should not be protected 24 | branch: "main" 25 |
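# accounts matching the patterns below (maintainers and bots) are exempt from signing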
allowlist: hozen-groq,MihailoMilenkovic,ataheridezfouli-groq,bot* 26 | remote-organization-name: groq 27 | remote-repository-name: cla 28 | 29 | # the following inputs are optional - if they are not given, default values are used 30 | #create-file-commit-message: 'For example: Creating file for storing CLA Signatures' 31 | #signed-commit-message: 'For example: $contributorName has signed the CLA in #$pullRequestNo' 32 | #custom-notsigned-prcomment: 'pull request comment with introductory message to ask new contributors to sign' 33 | #custom-pr-sign-comment: 'The signature to be committed in order to sign the CLA' 34 | #custom-allsigned-prcomment: 'pull request comment when all contributors have signed, defaults to **CLA Assistant Lite bot** All Contributors have signed the CLA.' 35 | #lock-pullrequest-aftermerge: false - if you don't want this bot to automatically lock the pull request after merging (default - true) 36 | #use-dco-flag: true - if you are using DCO instead of CLA 37 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-test-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@main 11 | - name: Set up Python 3.8 12 | uses: actions/setup-python@v3 13 | with: 14 | python-version: "3.8" 15 | - name: Install pypa/build 16 | run: >- 17 | python -m 18 | pip install 19 | build 20 | --user 21 | - name: Build a binary wheel and a source tarball 22 | run: >- 23 | python -m 24 | build 25 | --sdist 26 | --wheel 27 | --outdir dist/ 28 | . 29 | - name: Publish distribution 📦 to Test PyPI 30 | if: startsWith(github.ref, 'refs/tags') != true 31 | uses: pypa/gh-action-pypi-publish@release/v1 32 | with: 33 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 34 | repository_url: https://test.pypi.org/legacy/ 35 | - name: Publish distribution 📦 to PyPI 36 | if: startsWith(github.ref, 'refs/tags') 37 | uses: pypa/gh-action-pypi-publish@release/v1 38 | with: 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yaml: -------------------------------------------------------------------------------- 1 | ##################################### 2 | # DO NOT EDIT DIRECTLY. # 3 | # This file is managed by Terraform # 4 | ##################################### 5 | 6 | name: "Close stale PRs" 7 | on: 8 | schedule: 9 | - cron: "30 1 * * *" 10 | 11 | jobs: 12 | stale: 13 | runs-on: ubuntu-latest 14 | # Read repo and write to PRs 15 | permissions: 16 | contents: read 17 | pull-requests: write 18 | issues: write 19 | steps: 20 | - uses: actions/stale@v9 21 | with: 22 | stale-pr-message: "This PR is stale because it has been open for 30 days with no activity. Remove the stale label or comment, or this PR will be closed in 7 days." 23 | close-pr-message: "This PR was closed because it has been stalled for 7 days with no activity."
24 | days-before-pr-stale: 30 25 | days-before-pr-close: 7 26 | exempt-pr-labels: "dependencies,security" 27 | operations-per-run: 60 # Default is 30 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GroqFlow 🚀 2 | 3 | GroqFlow™ is the easiest way to get started with Groq's technology. GroqFlow provides an automated workflow for compiling Machine Learning, Artificial Intelligence, and High-Performance Computing workloads into Groq programs and executing those programs on the Groq Language Processing Unit™ (LPU). 4 | 5 | --- 6 | 7 | ## System Requirements 8 | 9 | To begin, we recommend that your system meets the following software and hardware requirements: 10 | 11 | - Ubuntu 22.04 or Rocky 8.4 Linux distribution. 12 | - 32GB RAM (or more) to build models. 13 | - 8 LPUs (especially for larger models) to run models. 14 | - GroqWare Suite™ version >=0.9.2.1 installation*: 15 | - Groq Developer Tools Package (groq-devtools) for building and compiling models. 16 | - Groq Runtime Package (groq-runtime) for running compiled models on Groq hardware. 17 | 18 | *For information on how to install GroqWare Suite on your system, create an account on our [portal](https://support.groq.com/) and view the [GroqWare Quick Start Guide](https://support.groq.com/#/downloads/view/groqware-qsg) for installation instructions. 19 | 20 | --- 21 | 22 | ## Navigating GroqFlow 23 | 24 | * [Documentation](docs/): All GroqFlow documentation, including the installation guide, user guide, release notes (with known issues), and versioning. 25 | 26 | * [Examples](examples/): Includes various GroqFlow examples. 27 | 28 | * [GroqFlow](groqflow/): The source code for the `groqflow` package. 29 | 30 | * [Proof Points](proof_points/): Machine learning proof points using GroqFlow. 31 | 32 | * [README.md](README.md): This README. 33 | 34 | --- 35 | 36 | ## Contributors 37 | 38 | GroqFlow development is primarily conducted within Groq's internal repo and is periodically synced to GitHub. This approach means that developer contributions are not immediately obvious in the commit log. 39 | 40 | This project follows the [all-contributors](https://allcontributors.org) specification. 41 | Contributions of any kind are welcome! 42 | -------------------------------------------------------------------------------- /cla.md: -------------------------------------------------------------------------------- 1 | ## Individual Contributor License Agreement (CLA) 2 | 3 | **Thank you for submitting your contributions to this project.** 4 | 5 | By signing this CLA, you agree that the following terms apply to all of your past, present and future contributions 6 | to the project. 7 | 8 | ### License. 9 | 10 | You hereby represent that all present, past and future contributions are governed by the 11 | [MIT License](https://opensource.org/licenses/MIT) 12 | copyright statement. 13 | 14 | This entails that to the extent possible under law, you transfer all copyright and related or neighboring rights 15 | of the code or documents you contribute to the project itself or its maintainers. 16 | Furthermore, you also represent that you have the authority to perform the above waiver 17 | with respect to the entirety of your contributions. 18 | 19 | ### Moral Rights.
20 | 21 | To the fullest extent permitted under applicable law, you hereby waive, and agree not to 22 | assert, all of your “moral rights” in or relating to your contributions for the benefit of the project. 23 | 24 | ### Third Party Content. 25 | 26 | If your Contribution includes or is based on any source code, object code, bug fixes, configuration changes, tools, 27 | specifications, documentation, data, materials, feedback, information or other works of authorship that were not 28 | authored by you (“Third Party Content”) or if you are aware of any third party intellectual property or proprietary 29 | rights associated with your Contribution (“Third Party Rights”), 30 | then you agree to include with the submission of your Contribution full details respecting such Third Party 31 | Content and Third Party Rights, including, without limitation, identification of which aspects of your 32 | Contribution contain Third Party Content or are associated with Third Party Rights, the owner/author of the 33 | Third Party Content and Third Party Rights, where you obtained the Third Party Content, and any applicable 34 | third party license terms or restrictions respecting the Third Party Content and Third Party Rights. For greater 35 | certainty, the foregoing obligations respecting the identification of Third Party Content and Third Party Rights 36 | do not apply to any portion of a Project that is incorporated into your Contribution to that same Project. 37 | 38 | ### Representations. 39 | 40 | You represent that, other than the Third Party Content and Third Party Rights identified by 41 | you in accordance with this Agreement, you are the sole author of your Contributions and are legally entitled 42 | to grant the foregoing licenses and waivers in respect of your Contributions. If your Contributions were 43 | created in the course of your employment with your past or present employer(s), you represent that such 44 | employer(s) has authorized you to make your Contributions on behalf of such employer(s) or such employer 45 | (s) has waived all of their right, title or interest in or to your Contributions. 46 | 47 | ### Disclaimer. 48 | 49 | To the fullest extent permitted under applicable law, your Contributions are provided on an "as is" 50 | basis, without any warranties or conditions, express or implied, including, without limitation, any implied 51 | warranties or conditions of non-infringement, merchantability or fitness for a particular purpose. You are not 52 | required to provide support for your Contributions, except to the extent you desire to provide support. 53 | 54 | ### No Obligation. 55 | 56 | You acknowledge that the maintainers of this project are under no obligation to use or incorporate your contributions 57 | into the project. The decision to use or incorporate your contributions into the project will be made at the 58 | sole discretion of the maintainers or their authorized delegates. 
59 | -------------------------------------------------------------------------------- /demo_helpers/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include demo_helpers/datasets/README.md 2 | include demo_helpers/pretrained_models/m5.pt 3 | include demo_helpers/pretrained_models/pointnet.pth 4 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/.gitignore: -------------------------------------------------------------------------------- 1 | datasets 2 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # Needed to make pip install work 2 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_args(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | "-b", 8 | "--build", 9 | action="store_true", 10 | dest="should_build", 11 | default=False, 12 | help="If specified, will build the model to be executed on GroqChip™ processor.", 13 | ) 14 | parser.add_argument( 15 | "-e", 16 | "--execute", 17 | action="store_true", 18 | dest="should_execute", 19 | default=False, 20 | help="If specified, will execute a pre-built model on GroqChip™ processor " 21 | "and print accuracy statistics.", 22 | ) 23 | args = parser.parse_args() 24 | 25 | should_build = args.should_build 26 | should_execute = args.should_execute 27 | 28 | # If neither set, perform both operations 29 | if not (should_build or should_execute): 30 | should_build = True 31 | should_execute = True 32 | 33 | return { 34 | "rebuild_policy": "if_needed" if should_build else "never", 35 | "should_execute": should_execute, 36 | } 37 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/compute_performance.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from dataclasses import dataclass, field 3 | from typing import List, Optional, Tuple 4 | import timeit 5 | 6 | import numpy as np 7 | import onnxruntime 8 | from prettytable import PrettyTable 9 | from tqdm import tqdm 10 | import torch 11 | 12 | from demo_helpers.dataset import Dataset, create_dataset 13 | from demo_helpers.validate import formatted_score, resolve_score_label 14 | 15 | 16 | @dataclass 17 | class PerformanceResult: 18 | name: str 19 | batch_size: int 20 | total_number_of_samples: int 21 | predictions: List = field(repr=False) 22 | 23 | on_chip_latency_ms: float = 0 24 | end_to_end_latency_ms: Optional[float] = None 25 | 26 | @property 27 | def on_chip_latency_s(self) -> float: 28 | return self.on_chip_latency_ms / 1000.0 if self.on_chip_latency_ms else None 29 | 30 | @property 31 | def on_chip_ips(self) -> float: 32 | return ( 33 | 1000.0 / self.on_chip_latency_ms * self.batch_size 34 | if self.on_chip_latency_ms 35 | else None 36 | ) 37 | 38 | @property 39 | def end_to_end_latency_s(self) -> float: 40 | return ( 41 | self.end_to_end_latency_ms / 1000.0 if self.end_to_end_latency_ms else None 42 | ) 43 | 44 | @property 45 | def end_to_end_ips(self) -> float: 46 | return ( 47 | 1000.0 / self.end_to_end_latency_ms * self.batch_size 48 | if self.end_to_end_latency_ms 49 | else None 50 | ) 51 | 
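# Worked example of the conversions above (illustrative values): with
# batch_size=1 and on_chip_latency_ms=2.0, on_chip_latency_s is 0.002 and
# on_chip_ips is 1000.0 / 2.0 * 1 = 500 inferences per second; the
# end_to_end_* properties apply the same formulas to end_to_end_latency_ms.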
52 | 53 | def generate_result_comparison_table( 54 | performance_result: List[PerformanceResult], 55 | dataset: Dataset, 56 | task: str, 57 | ) -> List[Tuple]: 58 | pretty_table = PrettyTable() 59 | row_data = [] 60 | 61 | score_label = resolve_score_label(task) 62 | 63 | pretty_table.field_names = [ 64 | "Source", 65 | score_label, 66 | "end-to-end latency (ms)", 67 | "end-to-end IPS", 68 | "on-chip latency (ms)", 69 | "on-chip IPS", 70 | ] 71 | 72 | for performance in performance_result: 73 | if isinstance(performance.predictions[0], torch.Tensor): 74 | prediction = torch.stack(performance.predictions).numpy() 75 | else: 76 | prediction = np.concatenate(performance.predictions, axis=0) 77 | score = formatted_score(prediction, dataset, task=task) 78 | 79 | on_chip_latency_ms = ( 80 | f"{performance.on_chip_latency_ms:.2f}" 81 | if performance.on_chip_latency_ms 82 | else "--" 83 | ) 84 | on_chip_ips = ( 85 | f"{performance.on_chip_ips:.2f}" if performance.on_chip_ips else "--" 86 | ) 87 | 88 | row_data.append( 89 | ( 90 | performance.name, 91 | score, 92 | f"{performance.end_to_end_latency_ms:.2f}", 93 | f"{performance.end_to_end_ips:.2f}", 94 | on_chip_latency_ms, 95 | on_chip_ips, 96 | ) 97 | ) 98 | 99 | for row in row_data: 100 | pretty_table.add_row(row) 101 | 102 | print(pretty_table) 103 | 104 | return row_data 105 | 106 | 107 | def compute_performance( 108 | groq_model, 109 | pytorch_model, 110 | dataset, 111 | tokenizer=None, 112 | max_seq_length=None, 113 | feature_extractor=None, 114 | task=None, 115 | ): 116 | print("Preprocessing data.") 117 | input_names = list(groq_model.state.expected_input_shapes.keys()) 118 | dataset = create_dataset( 119 | dataset, 120 | tokenizer=tokenizer, 121 | max_seq_length=max_seq_length, 122 | feature_extractor=feature_extractor, 123 | input_names=input_names, 124 | ) 125 | 126 | groq_performance_result = timed_inference_end_to_end_latency( 127 | dataset, 128 | groq_model, 129 | chip_type="groq", 130 | task=task, 131 | ) 132 | 133 | host_performance_result = timed_inference_end_to_end_latency( 134 | dataset, 135 | pytorch_model, 136 | chip_type="cpu", 137 | ) 138 | 139 | result_table = generate_result_comparison_table( 140 | [host_performance_result, groq_performance_result], 141 | dataset, 142 | task, 143 | ) 144 | return result_table 145 | 146 | 147 | def groq_model_inference(dataset, model, task: Optional[str] = None): 148 | print("Running inference on GroqChip.") 149 | pred = model.run_abunch(dataset.x) 150 | if isinstance(pred, torch.Tensor): 151 | pred = [pred] 152 | 153 | if isinstance(pred[0], tuple): 154 | if task == "sentence_similarity": 155 | pred = [p[0] for p in pred] 156 | else: 157 | pred = list(map(torch.vstack, pred)) 158 | 159 | return dataset.postprocess(pred) 160 | 161 | 162 | def onnx_model_inference(dataset, model): 163 | print("Running inference on CPU (ONNX).") 164 | session = onnxruntime.InferenceSession(model) 165 | result = [] 166 | 167 | for inputs in tqdm(dataset.x): 168 | out = session.run(None, inputs) 169 | if len(out) == 1: 170 | result.append(torch.tensor(out[0])) 171 | else: 172 | result.append(tuple([torch.tensor(out[i]) for i in range(len(out))])) 173 | 174 | return dataset.postprocess(result) 175 | 176 | 177 | def pytorch_model_inference(dataset, model): 178 | with torch.no_grad(): 179 | print("Running inference using PyTorch model (CPU).") 180 | pred = [] 181 | for inputs in tqdm(dataset.x): 182 | out = model(**inputs) 183 | 184 | if not isinstance(out, torch.Tensor): 185 | if isinstance(out, tuple): 186 | 
if len(out) == 1: 187 | out = out[0] 188 | else: 189 | raise ValueError("Cannot handle tuple with len", len(out)) 190 | elif isinstance(out, dict): 191 | if "logits" in out: 192 | out = out.logits 193 | elif "start_logits" in out and "end_logits" in out: 194 | out = torch.vstack((out["start_logits"], out["end_logits"])) 195 | elif "last_hidden_state" in out: 196 | out = out.last_hidden_state 197 | else: 198 | raise ValueError( 199 | "Unknown output key. List of keys:", list(out.keys()) 200 | ) 201 | else: 202 | raise ValueError("Unknown output type", type(out)) 203 | pred.append(out) 204 | 205 | return dataset.postprocess(pred) 206 | 207 | 208 | def timed_inference_end_to_end_latency( 209 | dataset, 210 | model, 211 | chip_type: str, 212 | task: Optional[str] = None, 213 | ) -> PerformanceResult: 214 | result = [] 215 | if chip_type == "groq": 216 | t = timeit.Timer( 217 | lambda: result.append(groq_model_inference(dataset, model, task)) 218 | ) 219 | 220 | on_chip_latency_ms = model.estimate_performance().compute_latency * 1000 221 | production_system_end_to_end_s = model.benchmark().latency 222 | 223 | elif chip_type == "cpu": 224 | if isinstance(model, str): # ONNX 225 | t = timeit.Timer( 226 | lambda: result.append(onnx_model_inference(dataset, model)) 227 | ) 228 | else: 229 | t = timeit.Timer( 230 | lambda: result.append(pytorch_model_inference(dataset, model)) 231 | ) 232 | on_chip_latency_ms = None 233 | 234 | latency_s = t.timeit(number=1) / len(dataset.x) 235 | 236 | # for groq chip, use the expected production system latency. 237 | if chip_type == "groq": 238 | latency_s = production_system_end_to_end_s 239 | 240 | return PerformanceResult( 241 | name=chip_type, 242 | batch_size=1, 243 | total_number_of_samples=len(dataset.x), 244 | predictions=result[0], 245 | on_chip_latency_ms=on_chip_latency_ms, 246 | end_to_end_latency_ms=latency_s * 1000, 247 | ) 248 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/datasets/README.md: -------------------------------------------------------------------------------- 1 | Place manually downloaded datasets here. 
2 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/misc.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | import pkg_resources 7 | 8 | 9 | @contextmanager 10 | def suppress_stdout(): 11 | with open(os.devnull, "w", encoding="utf-8") as devnull: 12 | old_stdout = sys.stdout 13 | sys.stdout = devnull 14 | try: 15 | yield 16 | finally: 17 | sys.stdout = old_stdout 18 | 19 | 20 | def check_deps(script_filepath): 21 | dir_path = os.path.dirname(os.path.realpath(script_filepath)) 22 | reqs_filepath = os.path.join(dir_path, "requirements.txt") 23 | with open(reqs_filepath, "r", encoding="utf-8") as f: 24 | reqs = pkg_resources.parse_requirements(f) 25 | str_reqs = [str(req) for req in reqs] 26 | try: 27 | with suppress_stdout(): 28 | for req in str_reqs: 29 | pkg_resources.require(str(req)) 30 | except pkg_resources.DistributionNotFound as e: 31 | print("Some required packages below are missing:\n") 32 | # the requirement strings were already parsed above; list them for the user 33 | for req in str_reqs: 34 | print(str(req)) 35 | print() 36 | reply = None 37 | question = "Install missing packages (y/n): " 38 | while reply not in ["y", "n"]: 39 | reply = str(input(question)).lower().strip() 40 | if reply == "n": 41 | raise e 42 | subprocess.check_call(["pip", "install", "-r", reqs_filepath]) 43 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/model_download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | from datasets.utils.file_utils import cached_path 5 | from groqflow.common.build import DEFAULT_CACHE_DIR 6 | 7 | 8 | YOLOV6N_MODEL = "yolov6n_model" 9 | YOLOV6N_SOURCE = "yolov6n_source" 10 | 11 | 12 | DATA_URLS = { 13 | YOLOV6N_MODEL: "https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6n.pt", 14 | YOLOV6N_SOURCE: "https://github.com/meituan/YOLOv6/archive/refs/tags/0.4.0.zip", 15 | } 16 | 17 | 18 | DST_PATHS = { 19 | YOLOV6N_MODEL: "pytorch_models/yolov6_nano/yolov6n.pt", 20 | YOLOV6N_SOURCE: "pytorch_models/yolov6_nano/YOLOv6", 21 | } 22 | 23 | 24 | def download_model(model): 25 | dst_path = os.path.join(DEFAULT_CACHE_DIR, DST_PATHS[model]) 26 | if os.path.exists(dst_path): 27 | return dst_path 28 | 29 | os.makedirs(os.path.dirname(dst_path), exist_ok=True) 30 | url = DATA_URLS[model] 31 | download_path = cached_path(url) 32 | os.symlink(download_path, dst_path) 33 | return dst_path 34 | 35 | 36 | def download_source(source): 37 | dst_path = os.path.join(DEFAULT_CACHE_DIR, DST_PATHS[source]) 38 | if os.path.exists(dst_path): 39 | return dst_path 40 | 41 | os.makedirs(os.path.dirname(dst_path), exist_ok=True) 42 | url = DATA_URLS[source] 43 | download_path = cached_path(url) 44 | with zipfile.ZipFile(download_path, "r") as zip_ref: 45 | extracted_dir = os.path.dirname(dst_path) 46 | zip_ref.extractall(extracted_dir) 47 | os.rename(os.path.join(extracted_dir, zip_ref.infolist()[0].filename), dst_path) 48 | return dst_path 49 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from
demo_helpers.model_download import ( 10 | YOLOV6N_MODEL, 11 | YOLOV6N_SOURCE, 12 | download_model, 13 | download_source, 14 | ) 15 | 16 | 17 | class M5(nn.Module): 18 | def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32): 19 | super().__init__() 20 | self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride) 21 | self.bn1 = nn.BatchNorm1d(n_channel) 22 | self.pool1 = nn.MaxPool1d(4) 23 | self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3) 24 | self.bn2 = nn.BatchNorm1d(n_channel) 25 | self.pool2 = nn.MaxPool1d(4) 26 | self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3) 27 | self.bn3 = nn.BatchNorm1d(2 * n_channel) 28 | self.pool3 = nn.MaxPool1d(4) 29 | self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3) 30 | self.bn4 = nn.BatchNorm1d(2 * n_channel) 31 | self.pool4 = nn.MaxPool1d(4) 32 | self.avg_pool1 = nn.AvgPool1d(3) 33 | self.fc1 = nn.Linear(2 * n_channel, n_output) 34 | 35 | def forward(self, x): 36 | x = self.conv1(x) 37 | x = F.relu(self.bn1(x)) 38 | x = self.pool1(x) 39 | x = self.conv2(x) 40 | x = F.relu(self.bn2(x)) 41 | x = self.pool2(x) 42 | x = self.conv3(x) 43 | x = F.relu(self.bn3(x)) 44 | x = self.pool3(x) 45 | x = self.conv4(x) 46 | x = F.relu(self.bn4(x)) 47 | x = self.pool4(x) 48 | x = torch.mean(x, 2, keepdim=True) 49 | x = x.permute(0, 2, 1) 50 | x = self.fc1(x) 51 | return F.log_softmax(x, dim=2) 52 | 53 | 54 | class Tnet(nn.Module): 55 | def __init__(self, k=3): 56 | super().__init__() 57 | self.k = k 58 | self.conv1 = nn.Conv1d(k, 64, 1) 59 | self.conv2 = nn.Conv1d(64, 128, 1) 60 | self.conv3 = nn.Conv1d(128, 1024, 1) 61 | self.fc1 = nn.Linear(1024, 512) 62 | self.fc2 = nn.Linear(512, 256) 63 | self.fc3 = nn.Linear(256, k * k) 64 | 65 | self.bn1 = nn.BatchNorm1d(64) 66 | self.bn2 = nn.BatchNorm1d(128) 67 | self.bn3 = nn.BatchNorm1d(1024) 68 | self.bn4 = nn.BatchNorm1d(512) 69 | self.bn5 = nn.BatchNorm1d(256) 70 | 71 | def forward(self, input): 72 | # input.shape == (bs,n,3) 73 | bs = input.size(0) 74 | xb = F.relu(self.bn1(self.conv1(input))) 75 | xb = F.relu(self.bn2(self.conv2(xb))) 76 | xb = F.relu(self.bn3(self.conv3(xb))) 77 | pool_size = int(xb.size(-1)) 78 | pool = nn.MaxPool1d(pool_size)(xb) 79 | flat = nn.Flatten(1)(pool) 80 | xb = F.relu(self.bn4(self.fc1(flat))) 81 | xb = F.relu(self.bn5(self.fc2(xb))) 82 | 83 | # initialize as identity 84 | init = torch.eye(self.k, requires_grad=True).repeat(bs, 1, 1) 85 | if xb.is_cuda: 86 | init = init.cuda() 87 | matrix = self.fc3(xb).view(-1, self.k, self.k) + init 88 | return matrix 89 | 90 | 91 | class Transform(nn.Module): 92 | def __init__(self): 93 | super().__init__() 94 | self.input_transform = Tnet(k=3) 95 | self.feature_transform = Tnet(k=64) 96 | self.conv1 = nn.Conv1d(3, 64, 1) 97 | 98 | self.conv2 = nn.Conv1d(64, 128, 1) 99 | self.conv3 = nn.Conv1d(128, 1024, 1) 100 | 101 | self.bn1 = nn.BatchNorm1d(64) 102 | self.bn2 = nn.BatchNorm1d(128) 103 | self.bn3 = nn.BatchNorm1d(1024) 104 | 105 | def forward(self, input): 106 | matrix3x3 = self.input_transform(input) 107 | # batch matrix multiplication 108 | xb = torch.bmm(torch.transpose(input, 1, 2), matrix3x3).transpose(1, 2) 109 | 110 | xb = F.relu(self.bn1(self.conv1(xb))) 111 | 112 | matrix64x64 = self.feature_transform(xb) 113 | xb = torch.bmm(torch.transpose(xb, 1, 2), matrix64x64).transpose(1, 2) 114 | 115 | xb = F.relu(self.bn2(self.conv2(xb))) 116 | xb = self.bn3(self.conv3(xb)) 117 | xb = nn.MaxPool1d(int(xb.size(-1)))(xb) 118 | output = nn.Flatten(1)(xb) 119 | return output, matrix3x3, 
matrix64x64 120 | 121 | 122 | class PointNet(nn.Module): 123 | def __init__(self, classes=10): 124 | super().__init__() 125 | self.transform = Transform() 126 | self.fc1 = nn.Linear(1024, 512) 127 | self.fc2 = nn.Linear(512, 256) 128 | self.fc3 = nn.Linear(256, classes) 129 | 130 | self.bn1 = nn.BatchNorm1d(512) 131 | self.bn2 = nn.BatchNorm1d(256) 132 | self.dropout = nn.Dropout(p=0.3) 133 | self.logsoftmax = nn.LogSoftmax(dim=1) 134 | 135 | def forward(self, input): 136 | xb, _, _ = self.transform(input) 137 | xb = F.relu(self.bn1(self.fc1(xb))) 138 | xb = F.relu(self.bn2(self.dropout(self.fc2(xb)))) 139 | output = self.fc3(xb) 140 | return self.logsoftmax(output) 141 | 142 | 143 | def get_yolov6n_model(): 144 | weights = download_model(YOLOV6N_MODEL) 145 | source = download_source(YOLOV6N_SOURCE) 146 | export_script = os.path.join(source, "deploy/ONNX/export_onnx.py") 147 | 148 | cmd = [ 149 | sys.executable, 150 | export_script, 151 | "--weights", 152 | weights, 153 | "--img", 154 | "640", 155 | "--batch", 156 | "1", 157 | "--simplify", 158 | ] 159 | p = subprocess.Popen( 160 | cmd, cwd=source, stdout=subprocess.PIPE, stderr=subprocess.PIPE 161 | ) 162 | p.communicate() 163 | if p.returncode != 0: 164 | raise RuntimeError("Unable to get ONNX model") 165 | 166 | onnx_file = weights.replace(".pt", ".onnx") 167 | return onnx_file 168 | 169 | 170 | def load_pretrained(model_name): 171 | """Loads a pre-trained model 172 | 173 | :param model_name: The name of the model that needs to be loaded. 174 | :type model_name: `str` 175 | 176 | :return: The pre-trained torch model. 177 | :rtype: `torch.nn.Module` 178 | """ 179 | if model_name == "m5": 180 | # create model 181 | model = M5() 182 | 183 | # create absolute path to the pretrained weights 184 | model_filename = os.path.join( 185 | os.path.dirname(__file__), f"pretrained_models/{model_name}.pt" 186 | ) 187 | # load model's state dict. 188 | model.load_state_dict(torch.load(model_filename)) 189 | 190 | return model 191 | elif model_name == "pointnet": 192 | model = PointNet() 193 | model_filename = os.path.join( 194 | os.path.dirname(__file__), f"pretrained_models/{model_name}.pth" 195 | ) 196 | 197 | # load model's state dict.
198 | model.load_state_dict( 199 | torch.load(model_filename, map_location=torch.device("cpu")) 200 | ) 201 | 202 | return model 203 | else: 204 | raise ValueError("Unknown model: " + model_name) 205 | -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/pretrained_models/m5.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groq/groqflow/32740b44aea43d4ecf5d2fa4a2ce3d0f040e8bf0/demo_helpers/demo_helpers/pretrained_models/m5.pt -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/pretrained_models/pointnet.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groq/groqflow/32740b44aea43d4ecf5d2fa4a2ce3d0f040e8bf0/demo_helpers/demo_helpers/pretrained_models/pointnet.pth -------------------------------------------------------------------------------- /demo_helpers/demo_helpers/validate.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | from collections import Counter 4 | from typing import List 5 | from datasets import load_metric 6 | import numpy as np 7 | import torch 8 | import torch.nn.functional as F 9 | from sklearn.metrics.pairwise import paired_cosine_distances 10 | from scipy.stats import spearmanr 11 | 12 | from demo_helpers.misc import suppress_stdout 13 | 14 | 15 | def formatted_score(pred, dataset, ids=None, tokenizer=None, task="classification"): 16 | sc = score(pred, dataset, ids=ids, tokenizer=tokenizer, task=task) 17 | if task in ["classification", "qa", "ner", "keyword_spotting"]: 18 | sc = f"{sc:.2%}" 19 | elif task in ["regression", "sentence_similarity", "coco_map"]: 20 | sc = f"{sc:.4f}" 21 | elif task == "semantic_segmentation": 22 | sc = sc["mean_iou"] 23 | sc = f"{sc:.4f}" 24 | else: 25 | raise Exception(f"unrecognized task: {task}") 26 | 27 | return sc 28 | 29 | 30 | def normalize_answer(s): 31 | """ 32 | Lower text and remove punctuation, articles and extra whitespace. 33 | From official SQuAD evaluation script. 
34 | """ 35 | 36 | def remove_articles(text): 37 | return re.sub(r"\b(a|an|the)\b", " ", text) 38 | 39 | def white_space_fix(text): 40 | return " ".join(text.split()) 41 | 42 | def remove_punc(text): 43 | exclude = set(string.punctuation) 44 | return "".join(ch for ch in text if ch not in exclude) 45 | 46 | def lower(text): 47 | return text.lower() 48 | 49 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 50 | 51 | 52 | def f1_score(prediction, ground_truth): 53 | """From official SQuAD evaluation script.""" 54 | prediction_tokens = normalize_answer(prediction).split() 55 | ground_truth_tokens = normalize_answer(ground_truth).split() 56 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 57 | num_same = sum(common.values()) 58 | if num_same == 0: 59 | return 0 60 | precision = 1.0 * num_same / len(prediction_tokens) 61 | recall = 1.0 * num_same / len(ground_truth_tokens) 62 | f1 = (2 * precision * recall) / (precision + recall) 63 | return f1 64 | 65 | 66 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 67 | """From official SQuAD evaluation script.""" 68 | scores_for_ground_truths = [] 69 | for ground_truth in ground_truths: 70 | score = metric_fn(prediction, ground_truth) 71 | scores_for_ground_truths.append(score) 72 | return max(scores_for_ground_truths) 73 | 74 | 75 | def score(pred, dataset, ids=None, tokenizer=None, task="classification"): 76 | inputs, test = dataset.x, dataset.y 77 | if task == "classification": 78 | sc = np.mean(pred.argmax(axis=-1).reshape(test.shape) == test) 79 | elif task == "keyword_spotting": 80 | sc = np.equal(pred.argmax(axis=-1).ravel(), test).mean() 81 | elif task == "ner": 82 | # unroll gt labels across time steps 83 | flat_test = np.array(test).ravel() 84 | 85 | # get best label for each time step 86 | pred_labels = np.argmax(pred, -1) 87 | # unroll pred labels across time steps 88 | flat_preds = pred_labels.ravel() 89 | 90 | # all samples are padded to max_seq_len. 
reduce to valid 91 | # time steps only 92 | valid_indices = flat_test >= 0 93 | flat_test, flat_preds = flat_test[valid_indices], flat_preds[valid_indices] 94 | 95 | # calculate score 96 | sc = np.equal(flat_preds, flat_test).mean() 97 | elif task == "regression": 98 | sc = np.mean(np.square(test - pred)) 99 | elif task == "qa": 100 | pred = pred.argmax(axis=-1) 101 | 102 | def answers(y): 103 | return [ 104 | tokenizer.decode(id[start:end]) 105 | for (id, start, end) in zip(ids, y[:, 0], y[:, 1]) 106 | ] 107 | 108 | pred = answers(pred) 109 | 110 | sc = np.mean( 111 | [ 112 | metric_max_over_ground_truths(f1_score, p, t) 113 | for (p, t) in zip(pred, test) 114 | ] 115 | ) 116 | elif task == "semantic_segmentation": 117 | sc = calculate_miou_score(pred, test) 118 | elif task == "sentence_similarity": 119 | sc = calculate_spearman_correlation(pred, test, inputs) 120 | elif task == "coco_map": 121 | # pylint: disable=import-error 122 | from pycocotools.coco import COCO 123 | from pycocotools.cocoeval import COCOeval 124 | 125 | with suppress_stdout(): 126 | anno = COCO(dataset.anno_path) 127 | pred = anno.loadRes(pred) 128 | cocoEval = COCOeval(anno, pred, "bbox") 129 | cocoEval.evaluate() 130 | cocoEval.accumulate() 131 | cocoEval.summarize() 132 | sc = cocoEval.stats[0] 133 | else: 134 | raise Exception(f"Unrecognized task: {task}") 135 | return sc 136 | 137 | 138 | def resolve_score_label(task: str) -> str: 139 | if task in ["classification", "ner", "keyword_spotting"]: 140 | label = "Accuracy" 141 | elif task == "regression": 142 | label = "MSE" 143 | elif task == "qa": 144 | label = "F1 Score" 145 | elif task == "semantic_segmentation": 146 | label = "Mean IoU" 147 | elif task == "sentence_similarity": 148 | label = "Spearman Rank Correlation Coefficient" 149 | elif task == "coco_map": 150 | label = "mAP @ 0.5:0.95" 151 | else: 152 | raise Exception(f"Unrecognized task: {task}") 153 | return label 154 | 155 | 156 | def calculate_miou_score(pred: List, test: List): 157 | metric = load_metric("mean_iou") 158 | 159 | upsample_size = test[0].shape[-2:] 160 | num_labels = pred[0].shape[1] 161 | 162 | for p, t in zip(pred, test): 163 | p = _upsample_logits(torch.tensor(p), upsample_size).squeeze() 164 | t = t.squeeze() 165 | metric.add(prediction=p, reference=t) 166 | 167 | score = metric.compute( 168 | num_labels=num_labels, 169 | ignore_index=255, 170 | reduce_labels=False, 171 | ) 172 | return score 173 | 174 | 175 | def calculate_spearman_correlation(pred, test, encoded_input): 176 | sentence_1_embeddings = [] 177 | sentence_2_embeddings = [] 178 | for p, i in zip(pred, encoded_input): 179 | p = torch.tensor(p) 180 | 181 | sentence_embeddings = _mean_pooling(p, i["attention_mask"]) 182 | sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) 183 | sentence_1_embeddings.append(sentence_embeddings[0].reshape(1, -1)) 184 | sentence_2_embeddings.append(sentence_embeddings[1].reshape(1, -1)) 185 | 186 | cosine_scores = 1 - ( 187 | paired_cosine_distances( 188 | torch.stack(sentence_1_embeddings).squeeze(), 189 | torch.stack(sentence_2_embeddings).squeeze(), 190 | ) 191 | ) 192 | 193 | spearman_cosine, _ = spearmanr(test, cosine_scores) 194 | 195 | return spearman_cosine 196 | 197 | 198 | def _upsample_logits(logits, size): 199 | return F.interpolate( 200 | logits.double(), 201 | size=size, 202 | mode="bilinear", 203 | align_corners=False, 204 | ).argmax(dim=1) 205 | 206 | 207 | def _mean_pooling(model_output, attention_mask): 208 | input_mask_expanded = ( 209 | 
attention_mask.unsqueeze(-1).expand(model_output.shape).float() 210 | ) 211 | 212 | return torch.sum(model_output * input_mask_expanded, 1) / torch.clamp( 213 | input_mask_expanded.sum(1), min=1e-9 214 | ) 215 | 216 | 217 | def formatted_ips(ips): 218 | return f"{ips:.2f}" 219 | -------------------------------------------------------------------------------- /demo_helpers/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="groqflow_demo_helpers", 5 | version="0.2.0", 6 | description="Helper functions to run GroqFlow demos and proof points", 7 | author="Groq", 8 | author_email="sales@groq.com", 9 | license="groq-license", 10 | packages=find_packages( 11 | exclude=["*.__pycache__.*"], 12 | ), 13 | include_package_data=True, 14 | install_requires=[ 15 | "charset-normalizer==3.3.2", 16 | "transformers>=4.20.0", 17 | "datasets>=2.3.2", 18 | "prettytable>=3.3.0", 19 | "wget>=3.2", 20 | "setuptools==57.2.0", 21 | "torchvision==0.16.0", 22 | "torchaudio==2.1.0", 23 | "path>=16.4.0", 24 | ], 25 | classifiers=[], 26 | entry_points={}, 27 | ) 28 | -------------------------------------------------------------------------------- /docs/img/groqflow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/groq/groqflow/32740b44aea43d4ecf5d2fa4a2ce3d0f040e8bf0/docs/img/groqflow.gif -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # GroqFlow™ Installation Guide 2 | 3 | The following describes how to install GroqFlow. These instructions enable users to build models for Groq hardware, as well as execute those builds in systems that have GroqCard™ accelerators physically installed. 4 | 5 | ## Prerequisites 6 | 7 | ### Check your versions 8 | 9 | - Ensure that you are using one of the following Linux distributions: Ubuntu 22.04 or Rocky 8.4. 10 | - Download and install the GroqWare™ Suite version >=0.9.2.1. 11 | - For more information, see the GroqWare Quick Start Guide at [support.groq.com](https://support.groq.com). 12 | - To compile your model for Groq hardware, GroqFlow requires the Groq Developer Tools Package (groq-devtools). To run your compiled model on hardware, GroqFlow requires the Groq Runtime Package (groq-runtime). 13 | 14 | Make sure that your combination of GroqWare™ Suite version, OS version, and Python version is compatible. Our supported matrix of versions is: 15 | 16 | | GroqWare | OS | Python Version | 17 | |-----------|--------------|----------------| 18 | | 0.9.2.1 | Ubuntu 22.04 | 3.10 | 19 | | 0.9.3 | Ubuntu 18.04 | 3.8 | 20 | | 0.9.3 | Ubuntu 22.04 | 3.8 | 21 | | 0.9.3 | Rocky 8.4 | 3.8 | 22 | | 0.10.0 | Ubuntu 22.04 | 3.10 | 23 | | 0.10.0 | Rocky 8.4 | 3.8 | 24 | 25 | ### Install GroqWare 26 | 27 | Download and install the GroqWare Suite version >=0.9.2.1. 28 | - For more information, see the GroqWare Quick Start Guide at [support.groq.com](https://support.groq.com). 29 | - To compile your model for Groq hardware, GroqFlow requires the Groq Developer Tools Package (groq-devtools). To run your compiled model on hardware, GroqFlow requires the Groq Runtime Package (groq-runtime).
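If you want to confirm that the GroqWare packages are present before proceeding, a quick query (a sketch assuming the package names above, `groq-devtools` and `groq-runtime`, are the installed package names; use `rpm -qa` instead of `dpkg -l` on Rocky Linux) is:

```
dpkg -l | grep -E "groq-(devtools|runtime)"
```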
30 | 31 | ## Trying out GroqFlow 32 | 33 | If you want to try out GroqFlow by running the [examples](https://github.com/groq/groqflow/tree/main/examples) and [proof points](https://github.com/groq/groqflow/tree/main/proof_points), we recommend that you take the following steps. If you want to use GroqFlow with your own environment and model, we suggest skipping ahead to [Developing with GroqFlow](#developing-with-groqflow). 34 | 35 | ### Step 1: Create and activate a virtual environment 36 | 37 | First, download and install Miniconda, then create and activate a virtual environment. 38 | 39 | ``` 40 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 41 | bash Miniconda3-latest-Linux-x86_64.sh 42 | conda create -n groqflow python=$GF_PYTHON_VERSION 43 | conda deactivate 44 | conda activate groqflow 45 | ``` 46 | 47 | Where `$GF_PYTHON_VERSION` is the version of Python corresponding to your OS and GroqWare version in the [compatibility chart](#check-your-versions) above. 48 | 49 | > _Note_: it is important to deactivate your base conda environment when first setting up a new groqflow environment. This helps to prevent conda from making unwanted changes in the PATHs of your environments. 50 | 51 | ### Step 2: Pip install GroqFlow 52 | 53 | Install the `groqflow` package into your virtual environment: 54 | 55 | ``` 56 | git clone https://github.com/groq/groqflow.git 57 | pip install --upgrade pip 58 | cd groqflow 59 | pip install . 60 | ``` 61 | 62 | where `groqflow` is the directory created when you cloned the GroqFlow repo in the previous command. 63 | 64 | _Optional_: if you want to use GroqFlow with TensorFlow, use this install command instead of `pip install .`: 65 | 66 | ``` 67 | pip install .[tensorflow] 68 | ``` 69 | 70 | ### Step 3: Add GroqWare Suite to Python Path 71 | 72 | This adds the Groq tools to your path: 73 | 74 | ``` 75 | conda env config vars set PYTHONPATH="/opt/groq/runtime/site-packages:$PYTHONPATH" 76 | ``` 77 | 78 | **Note:** you will need to reactivate your conda environment for this to take effect. 79 | 80 | **Note:** if you encounter errors later that say GroqFlow is unable to find a tool from the GroqWare suite (Groq API, Groq Runtime, Groq DevTools, Groq Compiler, etc.) it usually means either: 81 | - You forgot to complete this step. 82 | - Your GroqWare Suite installation failed and you should attempt to re-install the GroqWare Suite. 83 | 84 | ### Step 4: Rock-It with groqit() 85 | 86 | To confirm that you're set up correctly, run the `hello_world.py` example found in the `keras`, `onnx`, or `pytorch` subfolder of `groqflow/examples/`, depending on your preferred framework. For example, with PyTorch: 87 | 88 | ``` 89 | cd groqflow/examples/pytorch 90 | python hello_world.py 91 | ``` 92 | 93 | ### Step 5: Take-off with a Proof Point 94 | 95 | The `groqflow/proof_points` directory includes multiple examples of machine learning and linear algebra workloads. To run these proof points, the `groqflow/demo_helpers` package must be installed in your groqflow environment. 96 | 97 | ``` 98 | cd groqflow/demo_helpers/ 99 | pip install -e . 100 | ``` 101 | 102 | Then you can learn about how to run proof points [here](https://github.com/groq/groqflow/tree/main/proof_points). 103 | 104 | ## Developing with GroqFlow 105 | 106 | When you are ready to try out your own model with GroqFlow, we recommend taking the following steps: 107 | 108 | 1. Activate the conda virtual environment where you are able to run your model 109 | 1.
Install the GroqFlow package from PyPI: 110 | - If you are developing a PyTorch, ONNX, or Hummingbird model, use `pip install groqflow` 111 | - If you are developing a Keras model, use `pip install groqflow[tensorflow]` 112 | 1. Follow steps 3 and 4 in [Trying out GroqFlow](#trying-out-groqflow) to complete setup 113 | 1. Import `groqflow` into the script where you are running your model and call `groqit(model, inputs)` to build your model (see the [examples](https://github.com/groq/groqflow/tree/main/examples) to learn more about calling `groqit()`) 114 | 115 | **Note:** The supported Python/OS combinations in [Check your Versions](#check-your-versions) apply here as well. 116 | 117 | **Note:** We recommend using separate conda environments for PyTorch/ONNX/Hummingbird development vs. TensorFlow development. The reason we make TensorFlow support optional in GroqFlow is to help you avoid dependency conflicts between the TensorFlow package and the other Groq/GroqFlow dependencies. Do not `pip install groqflow[tensorflow]` into an environment where you already did `pip install groqflow`, as this will cause errors. 118 | -------------------------------------------------------------------------------- /docs/readme.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | The following are links to GroqFlow documentation: 4 | 5 | - [Install Guide](install.md): Instructions on how to install GroqFlow. 6 | 7 | - [User Guide](user_guide.md): Overview and examples for all of GroqFlow's methods, flags, and options. 8 | 9 | - [Release Notes](release_notes.md): Release notes for each GroqFlow version, including currently known issues. 10 | 11 | - [Versioning](versioning.md): Explanation of GroqFlow's versioning scheme. 12 | 13 | - [README.md](readme.md): This README. 14 | -------------------------------------------------------------------------------- /docs/release_notes.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | ## v4.3.1 4 | 5 | ### Changes 6 | 7 | * Support for SDK 0.11. 8 | * Add beta support for the groq-torch-importer front end. 9 | * Clean up package dependencies. 10 | * Various bug fixes. 11 | 12 | ### Known Issues 13 | 14 | * Yolo V6 proof point downloads the PyTorch weights and invokes the export script to get the ONNX file. 15 | * Pip install of GroqFlow may complain about an incompatible protobuf version. 16 | 17 | ## v4.2.1 18 | 19 | ### Known Issues 20 | 21 | * Runtime errors due to mismatches in tensor sizes may occur even though GroqFlow checks the data shape. (G14148) 22 | * Whacky terminal line wrapping when printing groqit error messages. (G13235) 23 | * GroqFlow requires both the runtime and developer package to be installed. (G18283, G18284) 24 | * GroqFlow BERT Quantization Proof Point fails to compile in SDK0.9.3 due to a scheduling error. (G16739) 25 | * Yolo v6 Proof Point fails to run the evaluation after compilation in SDK0.9.2.1. (G18209) 26 | -------------------------------------------------------------------------------- /docs/versioning.md: -------------------------------------------------------------------------------- 1 | # GroqFlow Versioning Policy 2 | 3 | The `groqflow` package applies semantic versioning for its 3-digit version number. The version number is stored in `groqflow/version.py`.
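For example, you can read the installed version at runtime (a minimal sketch, assuming `groqflow/version.py` exposes the conventional `__version__` attribute):

```
from groqflow import version

print(version.__version__)
```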
4 | 5 | The 3 digits correspond to MAJOR.MINOR.PATCH, which can be interpreted as follows: 6 | * MAJOR: changes indicate breaking API changes that may require the user to change their own code 7 | * MINOR: changes indicate that builds against a previous minor version may not be compatible, and the user may need to rebuild those models 8 | * PATCH: no user action required when the patch number changes 9 | -------------------------------------------------------------------------------- /examples/hummingbird/randomforest.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example trains a RandomForestClassifier against random data 3 | then compares the sklearn result to GroqChip executed via GroqFlow. 4 | """ 5 | 6 | import numpy as np 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.metrics import classification_report 10 | from groqflow import groqit 11 | 12 | batch_size = 320 13 | 14 | # Generate random points in a 10-dimensional space with binary labels 15 | np.random.seed(0) 16 | x = np.random.rand(1000, 10).astype(np.float32) 17 | y = np.random.randint(2, size=1000) 18 | 19 | # Perform a test/train split of the (random) dataset 20 | x_train, x_test, y_train, y_test = train_test_split( 21 | x, y, test_size=batch_size, random_state=0 22 | ) 23 | 24 | # Fit the model using standard sklearn patterns 25 | skl_model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0) 26 | skl_model.fit(x_train, y_train) 27 | 28 | # Build the model 29 | groq_model = groqit(skl_model, {"input_0": x_test}) 30 | 31 | # Display a report of standard classifier statistics 32 | print("SKLearn classification report") 33 | print(classification_report(y_test, skl_model.predict(x_test))) 34 | print("Groq classification report") 35 | print(classification_report(y_test, groq_model.predict(x_test))) 36 | 37 | print("Example randomforest.py finished") 38 | -------------------------------------------------------------------------------- /examples/hummingbird/xgbclassifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example trains an XGBClassifier against random data 3 | then compares the xgboost result to GroqChip executed via GroqFlow. 
4 | """ 5 | 6 | import numpy as np 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import classification_report 9 | from xgboost import XGBClassifier # pylint: disable=import-error 10 | from groqflow import groqit 11 | 12 | batch_size = 320 13 | 14 | # Generate random points in a 10-dimensional space with binary labels 15 | np.random.seed(0) 16 | x = np.random.rand(1000, 10).astype(np.float32) 17 | y = np.random.randint(2, size=1000) 18 | 19 | # Perform a test/train split of the (random) dataset 20 | x_train, x_test, y_train, y_test = train_test_split( 21 | x, y, test_size=batch_size, random_state=0 22 | ) 23 | 24 | # Fit the model using standard sklearn patterns 25 | xgb_model = XGBClassifier( 26 | n_estimators=10, max_depth=5, random_state=0, objective="binary:logistic" 27 | ) 28 | xgb_model.fit(x_train, y_train) 29 | 30 | # Build the model 31 | groq_model = groqit(xgb_model, {"input_0": x_test}) 32 | 33 | # Display a report of standard classifier statistics 34 | print("XGBoost classification report") 35 | print(classification_report(y_test, xgb_model.predict(x_test))) 36 | print("Groq classification report") 37 | print(classification_report(y_test, groq_model.predict(x_test))) 38 | 39 | print("Example xgbclassifier.py finished") 40 | -------------------------------------------------------------------------------- /examples/keras/hello_world.py: -------------------------------------------------------------------------------- 1 | """ 2 | Hello ** Keras ** World! 3 | 4 | This example uses a small model to carry out a single vector matrix 5 | multiplication to demonstrate building and running a Keras model 6 | with GroqFlow. 7 | 8 | This example will help identify what you should expect from each groqit() 9 | Keras build. You can find the build results in the cache directory at 10 | ~/.cache/groqflow/hello_keras_world/ (unless otherwise specified). 11 | """ 12 | 13 | import tensorflow as tf 14 | from groqflow import groqit 15 | 16 | tf.random.set_seed(0) 17 | 18 | # Define model class 19 | class SmallKerasModel(tf.keras.Model): # pylint: disable=abstract-method 20 | def __init__(self, output_size): 21 | super(SmallKerasModel, self).__init__() 22 | self.dense = tf.keras.layers.Dense(output_size, activation="relu") 23 | 24 | def call(self, x): # pylint: disable=arguments-differ 25 | output = self.dense(x) 26 | return output 27 | 28 | 29 | # Instantiate model and generate inputs 30 | batch_size = 1 31 | input_size = 10 32 | output_size = 5 33 | keras_model = SmallKerasModel(output_size) 34 | keras_model.build(input_shape=(batch_size, input_size)) 35 | inputs = {"x": tf.random.uniform((batch_size, input_size), dtype=tf.float32)} 36 | 37 | # Build model 38 | groq_model = groqit(keras_model, inputs, build_name="hello_keras_world") 39 | 40 | # Compute Keras and Groq results 41 | keras_outputs = keras_model(**inputs) 42 | groq_outputs = groq_model(**inputs) 43 | 44 | # Print Keras and Groq results 45 | print(f"Keras_outputs: {keras_outputs}") 46 | print(f"Groq_outputs: {groq_outputs}") 47 | 48 | print("Example hello_world.py finished") 49 | -------------------------------------------------------------------------------- /examples/onnx/hello_world.py: -------------------------------------------------------------------------------- 1 | """ 2 | Hello ** ONNX ** World! 3 | 4 | This example uses a small model to carry out a single vector matrix 5 | multiplication to demonstrate building and running an ONNX model 6 | with GroqFlow. 
7 | 8 | This example will help identify what you should expect from each groqit() 9 | ONNX build. You can find the build results in the cache directory at 10 | ~/.cache/groqflow/hello_onnx_world/ (unless otherwise specified). 11 | """ 12 | 13 | import os 14 | import torch 15 | from groqflow import groqit 16 | import onnxruntime as ort 17 | 18 | torch.manual_seed(0) 19 | 20 | # Start from a PyTorch model so you can generate an ONNX 21 | # file to pass into groqit(). 22 | class SmallModel(torch.nn.Module): 23 | def __init__(self, input_size, output_size): 24 | super(SmallModel, self).__init__() 25 | self.fc = torch.nn.Linear(input_size, output_size) 26 | 27 | def forward(self, x): 28 | output = self.fc(x) 29 | return output 30 | 31 | 32 | # Instantiate PyTorch model and generate inputs 33 | input_size = 10 34 | output_size = 5 35 | pytorch_model = SmallModel(input_size, output_size) 36 | onnx_model = "small_onnx_model.onnx" 37 | input_tensor = torch.rand(input_size) 38 | inputs = {"input": input_tensor} 39 | 40 | # Export PyTorch Model to ONNX 41 | torch.onnx.export( 42 | pytorch_model, 43 | input_tensor, 44 | onnx_model, 45 | opset_version=14, 46 | input_names=["input"], 47 | output_names=["output"], 48 | ) 49 | 50 | # You can use numpy arrays as inputs to our ONNX model 51 | def to_numpy(tensor): 52 | return ( 53 | tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 54 | ) 55 | 56 | 57 | # Setup OnnxRuntime session for ONNX model so that you can 58 | # present a CPU baseline for the ONNX model inference 59 | ort_sess = ort.InferenceSession(onnx_model) 60 | input_name = ort_sess.get_inputs()[0].name 61 | numpy_inputs = to_numpy(input_tensor) 62 | 63 | # Build ONNX model 64 | groq_model = groqit(onnx_model, inputs, build_name="hello_onnx_world") 65 | 66 | # Remove intermediate onnx file so that you don't pollute your disk 67 | if os.path.exists(onnx_model): 68 | os.remove(onnx_model) 69 | 70 | # Compute ONNX and Groq results 71 | onnx_outputs = ort_sess.run(None, {input_name: numpy_inputs}) 72 | groq_outputs = groq_model.run(inputs) 73 | 74 | # Print ONNX and Groq results 75 | print(f"Groq_outputs: {groq_outputs}") 76 | print(f"Onnx_outputs: {onnx_outputs}") 77 | 78 | print("Example hello_world.py finished") 79 | -------------------------------------------------------------------------------- /examples/pytorch/assembler_flags.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example shows how to build a small model with 3 | a list of assembler flags. Valid assembler flags can be found 4 | in the Compiler User Guide on the customer portal at 5 | support.groq.com 6 | 7 | If a list of assembler flags is provided to groqit(), then the 8 | default flags are not used. Any of the default flags needed 9 | should also be provided. 10 | 11 | To check the assembler flags used in a build, you can either print the 12 | value of the 'gmodel.state.info.assembler_command' or view the yaml file 13 | in the cache directory for your build. 
14 | """
15 | 
16 | import torch
17 | from groqflow import groqit
18 | 
19 | torch.manual_seed(0)
20 | 
21 | # Define model class
22 | class SmallModel(torch.nn.Module):
23 |     def __init__(self, input_size, output_size):
24 |         super(SmallModel, self).__init__()
25 |         self.fc = torch.nn.Linear(input_size, output_size)
26 | 
27 |     def forward(self, x):
28 |         output = self.fc(x)
29 |         return output
30 | 
31 | 
32 | # Instantiate model and generate inputs
33 | input_size = 10
34 | output_size = 5
35 | pytorch_model = SmallModel(input_size, output_size)
36 | inputs = {"x": torch.rand(input_size)}
37 | user_provided_assembler_flags = ["--ifetch-from-self", "--no-metrics"]
38 | 
39 | # Build model with user-provided assembler flags
40 | # Note that assembler_flags are only allowed when num_chips=1
41 | gmodel = groqit(
42 |     pytorch_model, inputs, assembler_flags=user_provided_assembler_flags, num_chips=1
43 | )
44 | 
45 | # Print the user-provided flags and the Groq Assembler command
46 | # to verify your flags were applied.
47 | print(f"\nUser-provided flags: {user_provided_assembler_flags}")
48 | print(f"Groq Assembler command: {gmodel.state.info.assembler_command}")
49 | 
50 | print("Example assembler_flags.py finished")
51 | 
--------------------------------------------------------------------------------
/examples/pytorch/benchmark.py:
--------------------------------------------------------------------------------
1 | """
2 | This example illustrates how to get benchmarked performance of your build on a GroqNode
3 | system using the method `GroqModel.benchmark()`. You can read the details of
4 | `benchmark()` in the Benchmark section in docs/user_guide.md.
5 | """
6 | 
7 | import torch
8 | from groqflow import groqit
9 | 
10 | torch.manual_seed(0)
11 | 
12 | # Define model class
13 | class SmallModel(torch.nn.Module):
14 |     def __init__(self, input_size, output_size):
15 |         super(SmallModel, self).__init__()
16 |         self.fc = torch.nn.Linear(input_size, output_size)
17 | 
18 |     def forward(self, x):
19 |         output = self.fc(x)
20 |         return output
21 | 
22 | 
23 | # Instantiate model and generate inputs
24 | input_size = 10
25 | output_size = 5
26 | pytorch_model = SmallModel(input_size, output_size)
27 | inputs = {"x": torch.rand(input_size)}
28 | 
29 | # Build model
30 | gmodel = groqit(pytorch_model, inputs, groqview=True)
31 | 
32 | # Get benchmarked performance in terms of latency and throughput
33 | performance = gmodel.benchmark()
34 | print("Your build's benchmarked performance is:")
35 | print(f"{performance.latency:.7f} {performance.latency_units}")
36 | print(f"{performance.throughput:.1f} {performance.throughput_units}")
37 | 
38 | print("Example benchmark.py finished")
39 | 
--------------------------------------------------------------------------------
/examples/pytorch/benchmark_abunch.py:
--------------------------------------------------------------------------------
1 | """
2 | This example illustrates how to get benchmarked performance of your build on a GroqNode
3 | system using the method `GroqModel.benchmark_abunch()`. You can read the details of
4 | `benchmark_abunch()` in the Benchmark section in docs/user_guide.md.
5 | """
6 | 
7 | import torch
8 | from groqflow import groqit
9 | 
10 | torch.manual_seed(0)
11 | 
12 | # Define model class
13 | class SmallModel(torch.nn.Module):
14 |     def __init__(self, input_size, output_size):
15 |         super(SmallModel, self).__init__()
16 |         self.fc = torch.nn.Linear(input_size, output_size)
17 | 
18 |     def forward(self, x):
19 |         output = self.fc(x)
20 |         return output
21 | 
22 | 
23 | # Instantiate model and generate inputs
24 | input_size = 10
25 | output_size = 5
26 | pytorch_model = SmallModel(input_size, output_size)
27 | inputs = {"x": torch.rand(input_size)}
28 | 
29 | # Compile model
30 | gmodel = groqit(pytorch_model, inputs)
31 | 
32 | # Create a bunch of inputs
33 | num_inputs = 10
34 | abunch_o_inputs = [{"x": torch.rand(input_size)} for _ in range(num_inputs)]
35 | 
36 | # Get benchmarked performance in terms of latency and throughput
37 | performance = gmodel.benchmark_abunch(input_collection=abunch_o_inputs)
38 | print("Your build's benchmarked performance is:")
39 | print(f"{performance.latency:.7f} {performance.latency_units}")
40 | print(f"{performance.throughput:.1f} {performance.throughput_units}")
41 | 
42 | print("Example benchmark_abunch.py finished")
43 | 
--------------------------------------------------------------------------------
/examples/pytorch/build_name.py:
--------------------------------------------------------------------------------
1 | """
2 | This example demonstrates changing the directory name within the cache directory
3 | (~/.cache/groqflow) where all the logs, artifacts, and the state file will be written.
4 | 
5 | To change the directory name, use the build_name argument with a unique name.
6 | 
7 | The directory for each build defaults to the name of the file it was built in;
8 | 'build_name' would be the default for this file.
9 | 
10 | Note: If a single script is used to build multiple models (or if a build_name
11 | matches a build directory within the cache already), then a unique build_name will
12 | need to be defined, or the subsequent build(s) will overwrite (or load) the
13 | previous build found in ~/.cache/groqflow/{non_unique_build_name}.
14 | See docs/user_guide.md for more information.
15 | """
16 | 
17 | import torch
18 | from groqflow import groqit
19 | 
20 | torch.manual_seed(0)
21 | 
22 | 
23 | # Define model class
24 | class SmallModel(torch.nn.Module):
25 |     def __init__(self, input_size, output_size):
26 |         super(SmallModel, self).__init__()
27 |         self.fc = torch.nn.Linear(input_size, output_size)
28 | 
29 |     def forward(self, x):
30 |         output = self.fc(x)
31 |         return output
32 | 
33 | 
34 | # Create two different model instances, each with a different output
35 | # size. You can check the build artifacts to verify that both models
36 | # are built and stored separately.
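# A minimal sketch of how a cached build can later be retrieved by name
# (assuming the "Thing_1" build below has already completed):
#
#     from groqflow import load_state
#     state = load_state(build_name="Thing_1")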
37 | input_size = 10
38 | output_size_1 = 5
39 | output_size_2 = 8
40 | 
41 | pytorch_model_1 = SmallModel(input_size, output_size_1)
42 | pytorch_model_2 = SmallModel(input_size, output_size_2)
43 | inputs = {"x": torch.rand(input_size)}
44 | 
45 | # Build pytorch_model_1 and write build files to ~/.cache/groqflow/Thing_1
46 | groq_model_1 = groqit(pytorch_model_1, inputs, build_name="Thing_1")
47 | 
48 | # Build pytorch_model_2 and write build files to ~/.cache/groqflow/Thing_2
49 | groq_model_2 = groqit(pytorch_model_2, inputs, build_name="Thing_2")
50 | 
51 | print("\nNote that each build is saved to its own build directory,")
52 | print("as indicated at the completion of each build above.")
53 | 
54 | print("Example build_name.py finished")
55 | 
--------------------------------------------------------------------------------
/examples/pytorch/cache_dir.py:
--------------------------------------------------------------------------------
1 | """
2 | This example demonstrates how to set the location of the GroqFlow build cache
3 | directory, using groqit()'s cache_dir argument. The default value for
4 | cache_dir is `~/.cache/groqflow`.
5 | 
6 | To specify a different cache directory than the default, set cache_dir to
7 | the location of your choice.
8 | 
9 | Note 1: To change the cache directory for every build, a global default can be
10 | set with the `GROQFLOW_CACHE_DIR` environment variable:
11 | export GROQFLOW_CACHE_DIR=/path_of_your_choosing
12 | 
13 | Note 2: Setting the cache_dir argument within groqit() will override the
14 | `GROQFLOW_CACHE_DIR` setting.
15 | """
16 | 
17 | import torch
18 | from groqflow import groqit
19 | 
20 | torch.manual_seed(0)
21 | 
22 | 
23 | # Define model class
24 | class SmallModel(torch.nn.Module):
25 |     def __init__(self, input_size, output_size):
26 |         super(SmallModel, self).__init__()
27 |         self.fc = torch.nn.Linear(input_size, output_size)
28 | 
29 |     def forward(self, x):
30 |         output = self.fc(x)
31 |         return output
32 | 
33 | 
34 | # Instantiate PyTorch model and generate inputs
35 | input_size = 10
36 | output_size = 5
37 | pytorch_model = SmallModel(input_size, output_size)
38 | inputs = {"x": torch.rand(input_size)}
39 | 
40 | # Build pytorch_model and set the cache_dir
41 | # We also set the build_name to make the build easy to identify
42 | my_local_cache = "local_cache"
43 | groqit(pytorch_model, inputs, cache_dir=my_local_cache, build_name="my_cache_dir_build")
44 | 
45 | print(
46 |     f"\nCheck out the cache created in the local directory by running 'ls {my_local_cache}'"
47 | )
48 | 
49 | print("Example cache_dir.py finished")
50 | 
--------------------------------------------------------------------------------
/examples/pytorch/compiler_flags.py:
--------------------------------------------------------------------------------
1 | """
2 | This example shows how to build a small model with
3 | a list of compiler flags. Valid compiler flags can be found
4 | in the Compiler User Guide on the customer portal at
5 | support.groq.com
6 | 
7 | If a list of compiler flags is provided to groqit(), then the
8 | default flags are not used. Any of the default flags needed
9 | should also be provided.
10 | 
11 | To check the compiler flags used in a build, you can either print the
12 | value of 'gmodel.state.info.compiler_command' or view the yaml file
13 | in the cache directory for your build.
14 | """
15 | 
16 | import torch
17 | from groqflow import groqit
18 | 
19 | torch.manual_seed(0)
20 | 
21 | # Define model class
22 | class SmallModel(torch.nn.Module):
23 |     def __init__(self, input_size, output_size):
24 |         super(SmallModel, self).__init__()
25 |         self.fc = torch.nn.Linear(input_size, output_size)
26 | 
27 |     def forward(self, x):
28 |         output = self.fc(x)
29 |         return output
30 | 
31 | 
32 | # Instantiate model and generate inputs
33 | input_size = 10
34 | output_size = 5
35 | pytorch_model = SmallModel(input_size, output_size)
36 | inputs = {"x": torch.rand(input_size)}
37 | user_provided_compiler_flags = ["--no-print-stats", "--disableAddressCompaction"]
38 | 
39 | # Build model with user-provided compiler flags
40 | gmodel = groqit(pytorch_model, inputs, compiler_flags=user_provided_compiler_flags)
41 | 
42 | # Print the user-provided flags and the Groq Compiler command
43 | # to verify your flags were applied.
44 | print(f"\nUser-provided flags: {user_provided_compiler_flags}")
45 | print(f"Groq Compiler command: {gmodel.state.info.compiler_command}")
46 | 
47 | print("Example compiler_flags.py finished")
48 | 
--------------------------------------------------------------------------------
/examples/pytorch/estimate_performance.py:
--------------------------------------------------------------------------------
1 | """
2 | This example illustrates how to get the estimated performance of your build using the
3 | method `GroqModel.estimate_performance()`. You can read the details of
4 | `estimate_performance()` in the Performance Estimation section in docs/user_guide.md.
5 | """
6 | 
7 | import torch
8 | from groqflow import groqit
9 | 
10 | torch.manual_seed(0)
11 | 
12 | # Define model class
13 | class SmallModel(torch.nn.Module):
14 |     def __init__(self, input_size, output_size):
15 |         super(SmallModel, self).__init__()
16 |         self.fc = torch.nn.Linear(input_size, output_size)
17 | 
18 |     def forward(self, x):
19 |         output = self.fc(x)
20 |         return output
21 | 
22 | 
23 | # Instantiate model and generate inputs
24 | input_size = 10
25 | output_size = 5
26 | pytorch_model = SmallModel(input_size, output_size)
27 | inputs = {"x": torch.rand(input_size)}
28 | 
29 | # Build model
30 | gmodel = groqit(pytorch_model, inputs, groqview=True)
31 | 
32 | # Get performance estimates in terms of latency and throughput
33 | estimate = gmodel.estimate_performance()
34 | print("Your build's estimated performance is:")
35 | print(f"{estimate.latency:.7f} {estimate.latency_units}")
36 | print(f"{estimate.throughput:.1f} {estimate.throughput_units}")
37 | 
38 | print("Example estimate_performance.py finished")
39 | 
--------------------------------------------------------------------------------
/examples/pytorch/groqview.py:
--------------------------------------------------------------------------------
1 | """
2 | This example shows how to build a small model and collect the data necessary
3 | to visualize and profile a model using GroqView. When you run the
4 | `GroqModel.groqview()` method, the visualizer is opened in a web browser.
5 | See the GroqView User Guide at support.groq.com to read all about it.
6 | """ 7 | 8 | import torch 9 | from groqflow import groqit 10 | 11 | torch.manual_seed(0) 12 | 13 | # Define model class 14 | class SmallModel(torch.nn.Module): 15 | def __init__(self, input_size, output_size): 16 | super(SmallModel, self).__init__() 17 | self.fc = torch.nn.Linear(input_size, output_size) 18 | 19 | def forward(self, x): 20 | output = self.fc(x) 21 | return output 22 | 23 | 24 | # Instantiate model and generate inputs 25 | input_size = 10 26 | output_size = 5 27 | pytorch_model = SmallModel(input_size, output_size) 28 | inputs = {"x": torch.rand(input_size)} 29 | 30 | # Build model 31 | gmodel = groqit(pytorch_model, inputs, groqview=True) 32 | 33 | # Open GroqView 34 | gmodel.groqview() 35 | 36 | print("Example groqview.py finished") 37 | -------------------------------------------------------------------------------- /examples/pytorch/hello_world.py: -------------------------------------------------------------------------------- 1 | """ 2 | Hello ** PyTorch ** World! 3 | 4 | This example uses a small model to carry out a single vector matrix 5 | multiplication to demonstrate building and running a PyTorch model 6 | with GroqFlow. 7 | 8 | This example will help identify what you should expect from each groqit() 9 | PyTorch build. You can find the build results in the cache directory at 10 | ~/.cache/groqflow/hello_pytorch_world/ (unless otherwise specified). 11 | """ 12 | 13 | import torch 14 | from groqflow import groqit 15 | 16 | torch.manual_seed(0) 17 | 18 | # Define model class 19 | class SmallModel(torch.nn.Module): 20 | def __init__(self, input_size, output_size): 21 | super(SmallModel, self).__init__() 22 | self.fc = torch.nn.Linear(input_size, output_size) 23 | 24 | def forward(self, x): 25 | output = self.fc(x) 26 | return output 27 | 28 | 29 | # Instantiate model and generate inputs 30 | input_size = 10 31 | output_size = 5 32 | pytorch_model = SmallModel(input_size, output_size) 33 | inputs = {"x": torch.rand(input_size)} 34 | 35 | # Build model 36 | groq_model = groqit(pytorch_model, inputs, build_name="hello_pytorch_world") 37 | 38 | # Compute Pytorch and Groq results 39 | pytorch_outputs = pytorch_model(**inputs) 40 | groq_outputs = groq_model(**inputs) 41 | 42 | # Print Pytorch and Groq results 43 | print(f"Pytorch_outputs: {pytorch_outputs}") 44 | print(f"Groq_outputs: {groq_outputs}") 45 | 46 | print("Example hello_world.py finished") 47 | -------------------------------------------------------------------------------- /examples/pytorch/no_monitor.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example demonstrates the difference between the groqit() argument, 3 | monitor, when set to "True" (its default value) and then "False". 
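The monitor is the build-progress display that groqit() prints to the terminal
while it works; disabling it is typically useful when redirecting output to a
log file or running in CI.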
4 | """ 5 | 6 | import torch 7 | from groqflow import groqit 8 | 9 | torch.manual_seed(0) 10 | 11 | # Define model class 12 | class SmallModel(torch.nn.Module): 13 | def __init__(self, input_size, output_size): 14 | super(SmallModel, self).__init__() 15 | self.fc = torch.nn.Linear(input_size, output_size) 16 | 17 | def forward(self, x): 18 | output = self.fc(x) 19 | return output 20 | 21 | 22 | # Instantiate model and generate inputs 23 | input_size = 10 24 | output_size = 5 25 | pytorch_model = SmallModel(input_size, output_size) 26 | inputs = {"x": torch.rand(input_size)} 27 | 28 | # Build pytorch_model with `monitor` explicitly set to True 29 | print("\ngroqit() will now build the model with the monitor enabled...") 30 | groq_model = groqit(pytorch_model, inputs, monitor=True, build_name="monitor_enabled") 31 | 32 | # Rebuild pytorch_model with the monitor disabled 33 | print("\ngroqit() will now build the model with the monitor disabled...") 34 | groq_model = groqit(pytorch_model, inputs, monitor=False, build_name="monitor_disabled") 35 | 36 | print("Example no_monitor.py finished") 37 | -------------------------------------------------------------------------------- /examples/pytorch/num_chips.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example shows how to specify the number of GroqChip processors 3 | used in your build. 4 | 5 | You will need to be able to put at least one layer on each chip. So, the 6 | small model here will have two layers. 7 | 8 | To check the number of chips used in a build, you can either print the 9 | value of the 'gmodel.state.num_chips_used' or view the yaml file 10 | in the cache directory for your build. 11 | 12 | You can read more about the `num_chips` argument and multi-chip builds 13 | in the Multi-Chip section in the docs/user_guide.md. 14 | """ 15 | 16 | import torch 17 | from groqflow import groqit 18 | 19 | torch.manual_seed(0) 20 | 21 | # Define model class 22 | class TwoLayerModel(torch.nn.Module): 23 | def __init__(self, input_size, output_size): 24 | super(TwoLayerModel, self).__init__() 25 | self.fc1 = torch.nn.Linear(input_size, output_size) 26 | self.fc2 = torch.nn.Linear(output_size, output_size) 27 | 28 | def forward(self, x): 29 | output = self.fc1(x) 30 | output = self.fc2(output) 31 | return output 32 | 33 | 34 | # Create model and inputs 35 | input_size = 10 36 | output_size = 5 37 | pytorch_model = TwoLayerModel(input_size, output_size) 38 | inputs = {"x": torch.rand(input_size)} 39 | 40 | # Build model for 2 chips 41 | gmodel = groqit(pytorch_model, inputs, num_chips=2) 42 | 43 | print( 44 | "\nThe number of GroqChip processors required to run the build is " 45 | f"{gmodel.state.num_chips_used}." 46 | ) 47 | 48 | print("Example num_chips.py finished") 49 | -------------------------------------------------------------------------------- /examples/pytorch/quantization.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example shows how to specify the data samples to be used to 3 | perform post training quantization on the equivalent ONNX model 4 | before compiling and assembling the model into a GroqModel. 5 | 6 | You can read more about the `quantization_samples` argument 7 | in the corresponding section in the docs/user_guide.md. 
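Each quantization sample in this example is a float32 numpy array shaped like
the model's input; as noted in the code below, the datatypes of the model
inputs and the quantization samples must match.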
8 | """
9 | 
10 | import torch
11 | import numpy as np
12 | from groqflow import groqit
13 | 
14 | torch.manual_seed(0)
15 | 
16 | # Define model class
17 | class TwoLayerModel(torch.nn.Module):
18 |     def __init__(self, input_size, output_size):
19 |         super(TwoLayerModel, self).__init__()
20 |         self.fc1 = torch.nn.Linear(input_size, output_size)
21 |         self.fc2 = torch.nn.Linear(output_size, output_size)
22 | 
23 |     def forward(self, x):
24 |         output = self.fc1(x)
25 |         output = self.fc2(output)
26 |         return output
27 | 
28 | 
29 | if __name__ == "__main__":
30 | 
31 |     # Create model and inputs
32 |     input_size, output_size = 10, 5
33 |     pytorch_model = TwoLayerModel(input_size, output_size)
34 |     torch_tensor = torch.rand(input_size)
35 |     inputs = {"x": torch_tensor}
36 | 
37 |     # Prepare quantization data
38 |     # The datatype should be the same for the model inputs, the model's expected
39 |     # inputs, and the quantization samples
40 |     sample_size = 100
41 |     quantization_data = [
42 |         (np.array([np.random.rand(input_size)], dtype=np.float32))
43 |         for _ in range(sample_size)
44 |     ]
45 | 
46 |     # Convert the PyTorch model into ONNX, quantize the ONNX model, and
47 |     # convert the quantized ONNX model into a GroqModel
48 |     gmodel = groqit(
49 |         pytorch_model,
50 |         inputs,
51 |         rebuild="always",
52 |         quantization_samples=quantization_data,
53 |     )
54 | 
55 |     # Run inference on both the PyTorch model and the quantized GroqModel
56 |     simple_pytorch_dataset = [
57 |         inputs,
58 |         inputs,
59 |     ]
60 |     groq_outputs = gmodel.run_abunch(simple_pytorch_dataset)
61 |     with torch.no_grad():
62 |         torch_outputs = [pytorch_model(**example) for example in simple_pytorch_dataset]
63 | 
64 |     # See if inference results match
65 |     value_pass = all(
66 |         [
67 |             np.allclose(torch_outputs[i], groq_outputs[i], rtol=0.01, atol=0.001)
68 |             for i in range(len(simple_pytorch_dataset))
69 |         ]
70 |     )
71 |     match_str = "" if value_pass else "not "
72 |     print(
73 |         "Results of PyTorch model and quantized GroqModel do {}match.".format(match_str)
74 |     )
75 | 
76 |     print("Example quantization.py finished")
77 | 
--------------------------------------------------------------------------------
/examples/pytorch/rebuild_always.py:
--------------------------------------------------------------------------------
1 | """
2 | This example is built to demonstrate groqit()'s rebuild = "always" setting.
3 | 
4 | groqit() will always rebuild the model, even when a build of that model is
5 | found in the GroqFlow build cache, when the `rebuild` argument is set to
6 | "always".
7 | 
8 | You can demonstrate the functionality for rebuild="always" by running this
9 | script twice and seeing that the model still gets rebuilt even when the model
10 | is cached and there are no changes to the model.
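Note: a default rebuild policy can also be set globally with the
GROQIT_REBUILD_POLICY environment variable, which accepts the same
"if_needed", "always", and "never" values.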
11 | """
12 | 
13 | import torch
14 | from groqflow import groqit
15 | 
16 | torch.manual_seed(0)
17 | 
18 | # Define model class
19 | class SmallModel(torch.nn.Module):
20 |     def __init__(self, input_size, output_size):
21 |         super(SmallModel, self).__init__()
22 |         self.fc = torch.nn.Linear(input_size, output_size)
23 | 
24 |     def forward(self, x):
25 |         output = self.fc(x)
26 |         return output
27 | 
28 | 
29 | # Instantiate model and generate inputs
30 | input_size = 10
31 | output_size = 5
32 | pytorch_model = SmallModel(input_size, output_size)
33 | inputs = {"x": torch.rand(input_size)}
34 | 
35 | # Build/Rebuild model
36 | groq_model = groqit(pytorch_model, inputs, rebuild="always")
37 | 
38 | print("Example rebuild_always.py finished")
39 | 
--------------------------------------------------------------------------------
/examples/pytorch/rebuild_never.py:
--------------------------------------------------------------------------------
1 | """
2 | This example is built to demonstrate groqit()'s rebuild = "never" setting.
3 | 
4 | When rebuild is set to "never", groqit() will look within the cache
5 | for a build with a matching build_name and load it, if it exists.
6 | You will see a warning printed to stdout if the model has changed, but the
7 | existing build will be loaded regardless of functionality or correctness.
8 | 
9 | Try the following experiment.
10 | 1. Run this script to build and save the model in the cache.
11 | 2. Run the script again, and observe the warning printed when the
12 |    cached model is loaded even though there is a detected change.
13 | 
14 | Note: To make sure the model changes, the random seed is not set
15 | for this example.
16 | """
17 | 
18 | import torch
19 | from groqflow import groqit
20 | 
21 | # Define model class
22 | class SmallModel(torch.nn.Module):
23 |     def __init__(self, input_size, output_size):
24 |         super(SmallModel, self).__init__()
25 |         self.fc = torch.nn.Linear(input_size, output_size)
26 | 
27 |     def forward(self, x):
28 |         output = self.fc(x)
29 |         return output
30 | 
31 | 
32 | # Instantiate model and generate inputs
33 | input_size = 10
34 | output_size = 5
35 | pytorch_model = SmallModel(input_size, output_size)
36 | inputs = {"x": torch.rand(input_size)}
37 | 
38 | # Build or load the model with rebuild="never" applied
39 | groq_model = groqit(pytorch_model, inputs, rebuild="never")
40 | 
41 | print("Example rebuild_never.py finished")
42 | 
--------------------------------------------------------------------------------
/examples/pytorch/run_abunch.py:
--------------------------------------------------------------------------------
1 | """
2 | Hello World, again!
3 | 
4 | This example uses the same small model as the hello_world example,
5 | but this time we are going to run a bunch of inferences with the
6 | GroqModel.run_abunch() method.
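run_abunch() accepts an input_collection, a list of input dictionaries of the
same form passed to groqit(), and returns one output per input.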
7 | """ 8 | 9 | import torch 10 | from groqflow import groqit 11 | 12 | torch.manual_seed(0) 13 | 14 | # Define model class 15 | class SmallModel(torch.nn.Module): 16 | def __init__(self, input_size, output_size): 17 | super(SmallModel, self).__init__() 18 | self.fc = torch.nn.Linear(input_size, output_size) 19 | 20 | def forward(self, x): 21 | output = self.fc(x) 22 | return output 23 | 24 | 25 | # Instantiate model and generate inputs 26 | input_size = 10 27 | output_size = 5 28 | pytorch_model = SmallModel(input_size, output_size) 29 | inputs = {"x": torch.rand(input_size)} 30 | 31 | # Compile model 32 | groq_model = groqit(pytorch_model, inputs) 33 | 34 | # Create a bunch of inputs 35 | num_inputs = 10 36 | abunch_o_inputs = [{"x": torch.rand(input_size)} for _ in range(num_inputs)] 37 | 38 | print(f"Calculating the results of the {num_inputs} inputs!") 39 | 40 | # Run groq_model computations on abunch_o_inputs 41 | abunch_o_outputs = groq_model.run_abunch(input_collection=abunch_o_inputs) 42 | 43 | # Print abunch of outputs 44 | for count, output in enumerate(abunch_o_outputs): 45 | print(f"output {count}: {list(output.numpy())}") 46 | 47 | print("Example run_abunch.py finished") 48 | -------------------------------------------------------------------------------- /examples/pytorch/sequence.py: -------------------------------------------------------------------------------- 1 | """ This example uses GroqFlow features recommended for power users only. 2 | 3 | By default, GroqFlow completes the following steps: 4 | > Convert to ONNX 5 | > Optimize ONNX file 6 | > Check op support 7 | > Convert to FP16 8 | > Compile Model 9 | > Assemble Model 10 | 11 | This example illustrates how to alter the default sequence of steps. In this 12 | example, the conversion to FP16 is skipped. 
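With ConvertOnnxToFp16 removed from the sequence, the model is compiled
directly from the optimized FP32 ONNX file rather than being cast to FP16
first.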
13 | """
14 | 
15 | import torch
16 | from groqflow import groqit
17 | import onnxflow.justbuildit.export as of_export
18 | import onnxflow.justbuildit.stage as stage
19 | import groqflow.justgroqit.compile as compile
20 | import groqflow.justgroqit.export as gf_export
21 | 
22 | 
23 | torch.manual_seed(0)
24 | 
25 | # Define model class
26 | class SmallModel(torch.nn.Module):
27 |     def __init__(self, input_size, output_size):
28 |         super(SmallModel, self).__init__()
29 |         self.fc = torch.nn.Linear(input_size, output_size)
30 | 
31 |     def forward(self, x):
32 |         output = self.fc(x)
33 |         return output
34 | 
35 | 
36 | # Instantiate model and generate inputs
37 | input_size = 10
38 | output_size = 5
39 | 
40 | pytorch_model = SmallModel(input_size, output_size)
41 | inputs = {"x": torch.rand(input_size, dtype=torch.float32)}
42 | 
43 | onnx_sequence = stage.Sequence(
44 |     "onnx_sequence",
45 |     "Building ONNX Model without fp16 conversion",
46 |     [
47 |         of_export.ExportPytorchModel(),
48 |         of_export.OptimizeOnnxModel(),
49 |         gf_export.CheckOnnxCompatibility(),
50 |         # of_export.ConvertOnnxToFp16(), #<-- This is the step we want to skip
51 |         compile.CompileOnnx(),
52 |         compile.Assemble(),
53 |     ],
54 |     enable_model_validation=True,
55 | )
56 | 
57 | # Build model
58 | groq_model = groqit(pytorch_model, inputs, sequence=onnx_sequence)
59 | 
60 | # Compute Pytorch and Groq results
61 | pytorch_outputs = pytorch_model(**inputs)
62 | groq_outputs = groq_model(**inputs)
63 | 
64 | # Print Pytorch and Groq results
65 | print(f"Pytorch_outputs: {pytorch_outputs}")
66 | print(f"Groq_outputs: {groq_outputs}")
67 | 
--------------------------------------------------------------------------------
/examples/readme.md:
--------------------------------------------------------------------------------
1 | # GroqFlow™ Examples
2 | 
3 | This folder contains examples that demonstrate the use of `groqit()` arguments and `GroqModel` methods.
4 | 
5 | You can learn more about the concepts demonstrated in the examples by referencing the GroqFlow User Guide at `docs/user_guide.md`.
6 | 
7 | ## Table Of Contents
8 | 
9 | - [Groq Tool Requirements](#groq-tool-requirements)
10 | - [Understanding Examples](#understanding-examples)
11 | - [Running Examples](#running-examples)
12 | - [Hello Worlds](#hello-worlds)
13 | - [Hummingbird Examples](#hummingbird-examples)
14 | - [Additional PyTorch Examples](#additional-pytorch-examples)
15 | 
16 | ## Groq Tool Requirements
17 | 
18 | The Groq tools packages and the **Quick Start Guide** can be found at the [Groq Customer Portal](https://support.groq.com/).
19 | 
20 | - To build a `GroqModel`, the `groq-devtools` package should be installed.
21 | - To run a `GroqModel` on hardware, the `groq-runtime` package should be installed.
22 | - Both Groq packages should be installed to enable building and running on hardware
23 |   from the same script.
24 | 
25 | ## Understanding Examples
26 | 
27 | Here are some properties shared by all of the examples:
28 | 
29 | - Each example will create a build directory in the GroqFlow build cache, which is located at `~/.cache/groqflow` by default.
30 |   - **Note**: Most builds will load from this cache after the first time you run them, as opposed to rebuilding, unless otherwise specified in the example (check out the `rebuild` argument and its examples to change this behavior).
31 |   - **Note**: Most examples set `torch.manual_seed(0)` or `tf.random.set_seed(0)` so that the randomly generated weights in the example do not change between runs, unless otherwise specified in the example.
32 | - The build directory will be named after the example unless the example specifies a name change with the `build_name` argument (see the `build_name.py` example).
33 | - The model being built in each example is a small one- or two-layer fully-connected graph.
34 | 
35 | ## Running Examples
36 | 
37 | To run any of the examples, open a terminal and type the following command:
38 | 
39 | ```bash
40 | python /path/to/example/example_name.py
41 | ```
42 | 
43 | ## Hello Worlds
44 | 
45 | | **Example Name** | **Demonstrates** |
46 | |:--------|:-----------|
47 | | `pytorch/hello_world.py` | building and running a model defined in PyTorch |
48 | | `keras/hello_world.py` | building and running a model defined in Keras |
49 | | `onnx/hello_world.py` | building and running a model defined as an ONNX file |
50 | 
51 | ## Hummingbird Examples
52 | 
53 | | **Example Name** | **Demonstrates** |
54 | |:--------|:-----------|
55 | | `hummingbird/randomforest.py` | building and running a Hummingbird RandomForestClassifier against random data |
56 | | `hummingbird/xgbclassifier.py` | building and running a Hummingbird XGBClassifier against random data |
57 | 
58 | ## Additional PyTorch Examples
59 | 
60 | | **Example Name** | **Demonstrates** |
61 | |:--------|:-----------|
62 | | `pytorch/assembler_flags.py` | the `assembler_flags` argument to `groqit()` |
63 | | `pytorch/benchmark.py` | the `benchmark()` method of `GroqModel` |
64 | | `pytorch/benchmark_abunch.py` | the `benchmark_abunch()` method of `GroqModel` |
65 | | `pytorch/build_name.py` | the `build_name` argument to `groqit()` |
66 | | `pytorch/cache_dir.py` | the `cache_dir` argument to `groqit()` |
67 | | `pytorch/compiler_flags.py` | the `compiler_flags` argument to `groqit()` |
68 | | `pytorch/estimate_performance.py` | the performance estimation feature of GroqFlow |
69 | | `pytorch/groqview.py` | how to create and open a GroqView visualization using GroqFlow |
70 | | `pytorch/no_monitor.py` | the `monitor` argument to `groqit()` |
71 | | `pytorch/num_chips.py` | the `num_chips` argument to `groqit()` |
72 | | `pytorch/rebuild_always.py` | `groqit()`'s caching behavior when the `rebuild` argument is set to "always" |
73 | | `pytorch/rebuild_never.py` | `groqit()`'s caching behavior when the `rebuild` argument is set to "never" |
74 | | `pytorch/run_abunch.py` | running multiple inputs at a time with the `run_abunch()` method |
75 | | `pytorch/sequence.py` | the `sequence` argument for changing the default GroqFlow steps for porting your model |
76 | 
--------------------------------------------------------------------------------
/groqflow/__init__.py:
--------------------------------------------------------------------------------
1 | from groqflow.version import __version__
2 | 
3 | from groqflow.common.build import load_state
4 | 
5 | from groqflow.justgroqit.groqit import (
6 |     groqit,
7 | )
8 | 
--------------------------------------------------------------------------------
/groqflow/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/groq/groqflow/32740b44aea43d4ecf5d2fa4a2ce3d0f040e8bf0/groqflow/common/__init__.py
--------------------------------------------------------------------------------
/groqflow/common/build.py:
-------------------------------------------------------------------------------- 1 | import os 2 | import enum 3 | import math 4 | from typing import Optional, List, Dict 5 | import dataclasses 6 | import onnxflow.common.build as of_build 7 | from groqflow.version import __version__ as groqflow_version 8 | 9 | 10 | DEFAULT_ONNX_OPSET = 16 11 | MINIMUM_ONNX_OPSET = 13 12 | 13 | # Identifiers for specific GroqCard Accelerators 14 | GROQCARD_A14 = "A1.4" 15 | 16 | # Identifiers for specific chip topologies 17 | DRAGONFLY = "Dragonfly" 18 | ROTATIONAL = "Rotational" 19 | 20 | # WARNING: The "internal" env var may cause unexpected behavior if enabled 21 | # outside of the internal Groq dev environment. 22 | environment_variables = { 23 | "cache_dir": "GROQFLOW_CACHE_DIR", 24 | "rebuild": "GROQIT_REBUILD_POLICY", 25 | "dont_use_sdk": "GROQFLOW_BAKE_SDK", 26 | "debug": "GROQFLOW_DEBUG", 27 | "internal": "GROQFLOW_INTERNAL_FEATURES", 28 | "torch_importer": "GROQFLOW_USE_TORCH_IMPORTER", 29 | } 30 | 31 | # Allow an environment variable to override the default 32 | # location for the GroqFlow build cache 33 | if os.environ.get(environment_variables["cache_dir"]): 34 | DEFAULT_CACHE_DIR = os.environ.get(environment_variables["cache_dir"]) 35 | else: 36 | DEFAULT_CACHE_DIR = os.path.expanduser("~/.cache/groqflow") 37 | 38 | # Allow an environment variable to override the default 39 | # rebuild policy 40 | if os.environ.get(environment_variables["rebuild"]): 41 | DEFAULT_REBUILD_POLICY = os.environ.get(environment_variables["rebuild"]) 42 | rebuild_allowed_values = ["if_needed", "always", "never"] 43 | if DEFAULT_REBUILD_POLICY not in rebuild_allowed_values: 44 | raise ValueError( 45 | f'Environment variable set for {environment_variables["rebuild"]} has ' 46 | f"value {DEFAULT_REBUILD_POLICY}, which is not one of the following allowed " 47 | f"values: {rebuild_allowed_values} " 48 | ) 49 | else: 50 | DEFAULT_REBUILD_POLICY = "if_needed" 51 | 52 | # Allow an environment variable to tell groqit to build an SDK 53 | # with bake instead of using an installed copy of the SDK (only 54 | # useful for internal Groq developers) 55 | if os.environ.get(environment_variables["dont_use_sdk"]) == "True": 56 | USE_SDK = False 57 | else: 58 | USE_SDK = True 59 | 60 | # Direct builds to target the default GroqCard A1.4 accelerators. 61 | GROQCARD = GROQCARD_A14 62 | 63 | # By default, choose the dragonfly topology. Users can change this by passing in 64 | # the topology argument to groqit(). 65 | TOPOLOGY = DRAGONFLY 66 | 67 | # Allow users to use the Torch Importer and bypass ONNX. Only applicable for 68 | # Torch models, has no other effect on other model types. 69 | if os.environ.get(environment_variables["torch_importer"]): 70 | USE_TORCH_IMPORTER = True 71 | else: 72 | USE_TORCH_IMPORTER = False 73 | 74 | 75 | class Backend(enum.Enum): 76 | AUTO = "auto" 77 | LOCAL = "local" 78 | CLOUD = "cloud" 79 | REMOTE = "remote" 80 | 81 | 82 | def supported_topology(groqcard: str, topology: str) -> Dict[int, str]: 83 | """ 84 | Return a map of the number of chips to the topology string, given a groqcard 85 | and connection topology. Only groqcard value of GROQCARD_A14 and topologies 86 | of value DRAGONFLY, ROTATIONAL are currently supported. 
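For example (doctest-style illustration based on the tables below):

    >>> supported_topology(GROQCARD_A14, DRAGONFLY)[4]
    'DF_A14_4_CHIP'
    >>> max_chips(GROQCARD_A14, DRAGONFLY)
    64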
87 | """ 88 | 89 | topo_df_a14 = { 90 | 2: "DF_A14_2_CHIP", 91 | 4: "DF_A14_4_CHIP", 92 | 8: "DF_A14_8_CHIP", 93 | 16: "DF_A14_16_CHIP", 94 | 32: "DF_A14_32_CHIP", 95 | 64: "DF_A14_64_CHIP", 96 | } 97 | topo_rt_a14 = { 98 | 16: "RT09_A14_16_CHIP", 99 | 32: "RT09_A14_32_CHIP", 100 | 40: "RT09_A14_40_CHIP", 101 | 48: "RT09_A14_48_CHIP", 102 | 56: "RT09_A14_56_CHIP", 103 | 64: "RT09_A14_64_CHIP", 104 | 72: "RT09_A14_72_CHIP", 105 | } 106 | 107 | if groqcard != GROQCARD_A14: 108 | return {} 109 | 110 | if topology == DRAGONFLY: 111 | return topo_df_a14 112 | elif topology == ROTATIONAL: 113 | return topo_rt_a14 114 | else: 115 | return {} 116 | 117 | 118 | def max_chips(groqcard: str, topology: str): 119 | chips = list(supported_topology(groqcard, topology).keys()) 120 | if len(chips) == 0: 121 | raise ValueError( 122 | f"Could not find the number of chips for groqcard {groqcard}, " 123 | f"topology {topology}." 124 | ) 125 | return chips[-1] 126 | 127 | 128 | # Each chip can hold approximately 50M parameters 129 | # Number of chips need to be either 1, 2, 4, 8, 16, 32 or 64 130 | def calculate_num_chips(num_parameters, estimate=False): 131 | if num_parameters is not None: 132 | if num_parameters == 0: 133 | return 1 134 | else: 135 | x = math.ceil(num_parameters / 50000000) 136 | if estimate: 137 | return x 138 | else: 139 | return 2 ** (x - 1).bit_length() 140 | else: 141 | return None 142 | 143 | 144 | @dataclasses.dataclass(frozen=True) 145 | class GroqConfig(of_build.Config): 146 | """ 147 | User-provided build configuration. GroqFlow is not allowed 148 | to change instances of Config once they have been 149 | instantiated (frozen=True enforces this). 150 | 151 | Inherits `build_name`, `auto_name`, `onnx_opset`, and `sequence` from onnxflow. 152 | 153 | Note: modifying this struct can create a breaking change that 154 | requires users to rebuild their models. Increment the minor 155 | version number of the groqflow package if you do make a build- 156 | breaking change. 157 | """ 158 | 159 | compiler_flags: Optional[List[str]] = None 160 | assembler_flags: Optional[List[str]] = None 161 | groqview: bool = False 162 | groqcard: str = GROQCARD 163 | topology: str = TOPOLOGY 164 | num_chips: Optional[int] = None 165 | 166 | 167 | @dataclasses.dataclass 168 | class GroqInfo(of_build.Info): 169 | """ 170 | Information about a build that may be useful for analysis 171 | or debugging purposes. 172 | 173 | Note: GroqFlow does not guarantee that members of this class will 174 | have non-None values at the end of a build. GroqFlow code must 175 | not take a dependence on any member of this class. 
176 | """ 177 | 178 | num_parameters: Optional[int] = None 179 | opt_onnx_unsupported_ops: Optional[List[str]] = None 180 | opt_onnx_all_ops_supported: Optional[bool] = None 181 | torch_script_exported: Optional[bool] = None 182 | torch_importer_success: Optional[bool] = None 183 | torch_importer_command: Optional[str] = None 184 | compiler_success: Optional[bool] = None 185 | compiler_command: Optional[str] = None 186 | assembler_success: Optional[bool] = None 187 | assembler_command: Optional[str] = None 188 | measured_latency: Optional[float] = None 189 | measured_throughput: Optional[float] = None 190 | estimated_pcie_input_latency: Optional[float] = None 191 | deterministic_compute_latency: Optional[float] = None 192 | estimated_pcie_output_latency: Optional[float] = None 193 | estimated_throughput: Optional[float] = None 194 | estimated_latency: Optional[float] = None 195 | compiled_model_input_bytes: Optional[int] = None 196 | compiled_model_output_bytes: Optional[int] = None 197 | compiler_ram_bytes: Optional[float] = None 198 | 199 | 200 | @dataclasses.dataclass 201 | class GroqState(of_build.State): 202 | # User-provided args that influence the generated model 203 | config: GroqConfig = None 204 | 205 | # User-provided args that do not influence the generated model 206 | use_sdk: bool = False 207 | 208 | # Optional information about the build 209 | info: GroqInfo = GroqInfo() 210 | 211 | # All of the following are critical aspects of the build, 212 | # including properties of GroqFlow and choices made by GroqFlow 213 | # while building the model, which determine the outcome of the build. 214 | # NOTE: adding or changing a member name in this struct can create 215 | # a breaking change that requires users to rebuild their models. 216 | # Increment the minor version number of the groqflow package if you 217 | # do make a build-breaking change. 
218 | 219 | groqflow_version: str = groqflow_version 220 | num_chips_used: Optional[int] = None 221 | 222 | @property 223 | def original_inputs_file(self): 224 | return os.path.join( 225 | of_build.output_dir(self.cache_dir, self.config.build_name), 226 | "inputs_original.npy", 227 | ) 228 | 229 | @property 230 | def execution_inputs_file(self): 231 | return os.path.join( 232 | of_build.output_dir(self.cache_dir, self.config.build_name), "inputs.npy" 233 | ) 234 | 235 | @property 236 | def outputs_file(self): 237 | return os.path.join( 238 | of_build.output_dir(self.cache_dir, self.config.build_name), "outputs.npy" 239 | ) 240 | 241 | @property 242 | def latency_file(self): 243 | return os.path.join( 244 | of_build.output_dir(self.cache_dir, self.config.build_name), "latency.npy" 245 | ) 246 | 247 | @property 248 | def torch_script_dir(self): 249 | return os.path.join( 250 | of_build.output_dir(self.cache_dir, self.config.build_name), "torchscript" 251 | ) 252 | 253 | @property 254 | def torch_script_file(self): 255 | return os.path.join( 256 | self.torch_script_dir, 257 | f"{self.config.build_name}.pt", 258 | ) 259 | 260 | @property 261 | def compile_dir(self): 262 | return os.path.join( 263 | of_build.output_dir(self.cache_dir, self.config.build_name), "compile" 264 | ) 265 | 266 | @property 267 | def stats_file(self): 268 | return os.path.join(self.compile_dir, "stats.json") 269 | 270 | @property 271 | def groqview_file(self): 272 | return os.path.join(self.compile_dir, "output_bind") 273 | 274 | @property 275 | def topology(self): 276 | topology = supported_topology(self.config.groqcard, self.config.topology) 277 | if self.num_chips_used in topology.keys(): 278 | return topology[self.num_chips_used] 279 | else: 280 | return "Unknown" 281 | 282 | def prepare_file_system(self): 283 | super().prepare_file_system() 284 | os.makedirs(self.compile_dir, exist_ok=True) 285 | 286 | 287 | def load_state( 288 | cache_dir=DEFAULT_CACHE_DIR, build_name=None, state_path=None 289 | ) -> GroqState: 290 | 291 | return of_build.load_state( 292 | cache_dir=cache_dir, 293 | build_name=build_name, 294 | state_path=state_path, 295 | state_type=GroqState, 296 | ) 297 | -------------------------------------------------------------------------------- /groqflow/common/onnx_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for dealing with ONNX files and ONNX models 3 | """ 4 | 5 | import subprocess 6 | import ast 7 | import onnxflow.common.printing as printing 8 | import groqflow.common.sdk_helpers as sdk 9 | 10 | 11 | def check_ops(input_onnx, use_sdk=False): 12 | 13 | print("Checking unsupported ops...") 14 | 15 | # Select either bake or SDK 16 | if use_sdk: 17 | cmd = sdk.find_tool("onnxmodelanalyzer") 18 | else: 19 | cmd = [ 20 | "bake", 21 | "r", 22 | "//Groq/Compiler:OnnxModelAnalyze", 23 | ] 24 | cmd = cmd + ["-u", "-i", input_onnx] 25 | 26 | # Run process and decode outputs 27 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 28 | out, _ = p.communicate() 29 | out = out.decode("utf-8").split("\n") 30 | all_ops = ast.literal_eval(out[-4]) 31 | unsupported_ops = ast.literal_eval(out[-2]) 32 | 33 | # print results accordingly 34 | num_ops = len(all_ops) 35 | num_unsupported = len(unsupported_ops) 36 | num_supported = num_ops - num_unsupported 37 | if num_unsupported == 0: 38 | printing.logn("\t\tDONE", printing.Colors.OKGREEN) 39 | printing.logn( 40 | "\t" + f"{num_supported}/{num_ops} ops supported", 
printing.Colors.OKGREEN 41 | ) 42 | else: 43 | printing.logn("\t\tDONE", printing.Colors.OKGREEN) 44 | printing.logn( 45 | "\t" + f"{num_supported}/{num_ops} ops supported", printing.Colors.WARNING 46 | ) 47 | printing.logn( 48 | "\tUnsupported ops: " + ", ".join(unsupported_ops), 49 | printing.Colors.WARNING, 50 | ) 51 | return all_ops, unsupported_ops 52 | -------------------------------------------------------------------------------- /groqflow/common/sdk_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for interfacing with the GroqWare SDK 3 | """ 4 | 5 | import os 6 | import enum 7 | import subprocess 8 | import shutil 9 | from typing import Type, Union 10 | from pkg_resources import parse_version 11 | import onnxflow.common.exceptions as exp 12 | import groqflow.common.build as build 13 | 14 | 15 | MIN_RELEASE_VERSION = "0.9.2.1" 16 | 17 | 18 | class OS(enum.Enum): 19 | UBUNTU = "Ubuntu" 20 | ROCKY = "Rocky Linux" 21 | 22 | 23 | def get_num_chips_available(pci_devices=None): 24 | 25 | # The location of lspci may vary according to the OS used 26 | if shutil.which("lspci"): 27 | lspci = shutil.which("lspci") 28 | # This is important to ensure that CI works 29 | elif os.path.isfile("/usr/bin/lspci"): 30 | lspci = "/usr/bin/lspci" 31 | else: 32 | raise exp.EnvError("lspci not found") 33 | 34 | # Capture the list of pci devices on the system using the linux lspci utility 35 | if pci_devices is None: 36 | pci_devices = ( 37 | subprocess.check_output([lspci, "-n"], stderr=subprocess.DEVNULL) 38 | .decode("utf-8") 39 | .split("\n") 40 | ) 41 | 42 | # Unique registered vendor id: 1de0, and device id: "0000" 43 | groq_card_id = "1de0:0000" 44 | 45 | # number of chips per device: "1de0:0000":1 46 | chips_per_card = 1 47 | 48 | # Sum the number of GroqCards in the list of devices 49 | num_cards = 0 50 | for device in pci_devices: 51 | if groq_card_id in device: 52 | num_cards += 1 53 | 54 | # Calculate total number of chips 55 | num_chips_available = num_cards * chips_per_card 56 | 57 | return num_chips_available 58 | 59 | 60 | def find_tool(tool, soft_fail=False): 61 | if shutil.which(tool): 62 | return [tool] 63 | elif os.path.isfile(f"/usr/local/groq/bin/{tool}"): 64 | return [f"/usr/local/groq/bin/{tool}"] 65 | elif soft_fail: 66 | return False 67 | else: 68 | raise exp.StageError(f"{tool} not found") 69 | 70 | 71 | def _installed_package_version(package: str, os_version: OS) -> Union[bool, str]: 72 | """ 73 | This function is a simple wrapper around "apt-cache policy" that 74 | avoids a dependency on python-apt. It returns the installed version 75 | of the package when installed or "False" when not installed. 
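On Ubuntu it wraps `apt-cache policy <package>` and parses the "Installed:"
line; on Rocky Linux it wraps `dnf info <package>` and parses the "Version :"
line.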
76 |     """
77 |     if os_version == OS.UBUNTU:
78 |         # Get package info
79 |         try:
80 |             cmd = ["apt-cache", "policy", package]
81 |             package_info = (
82 |                 subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
83 |                 .decode("utf-8")
84 |                 .split("\n")
85 |             )
86 |         except (FileNotFoundError, subprocess.CalledProcessError) as e:
87 |             raise exp.Error("apt-cache policy command failed") from e
88 | 
89 |         # Return False if package was not found
90 |         if len(package_info) == 1:
91 |             return False
92 | 
93 |         # Return version number
94 |         # package_info[1] has the format "Installed: <version_number>"
95 |         return package_info[1].split(":")[1].replace(" ", "")
96 |     elif os_version == OS.ROCKY:
97 |         # Get package info
98 |         cmd = ["dnf", "info", package]
99 |         try:
100 |             package_info = (
101 |                 subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
102 |                 .decode("utf-8")
103 |                 .split("\n")
104 |             )
105 |         except FileNotFoundError as e:
106 |             raise exp.Error("dnf info command failed") from e
107 |         except subprocess.CalledProcessError as e:
108 |             # Return False if package was not found
109 |             return False
110 | 
111 |         # Return version number
112 |         # package_info[3] has the format "Version : <version_number>"
113 |         return package_info[3].split(":")[1].replace(" ", "")
114 |     else:
115 |         # The following exception will only be raised if a GroqFlow dev forgets to update
116 |         # _installed_package_version() when adding support for a new OS
117 |         raise exp.EnvError(
118 |             f"_installed_package_version not implemented for {os_version}"
119 |         )
120 | 
121 | 
122 | def version_a_less_than_b(version_a, version_b: str):
123 |     """
124 |     Return True if version_a < version_b, following the scheme:
125 |     major.minor.patch.patchpatch~release_candidate_number
126 | 
127 |     The release_candidate_number should be ignored.
128 |     """
129 | 
130 |     # Strip the release candidate number, if any
131 |     clean_version_a = version_a.split("~")[0]
132 |     clean_version_b = version_b.split("~")[0]
133 | 
134 |     return parse_version(clean_version_a) < parse_version(clean_version_b)
135 | 
136 | 
137 | def version_is_valid(
138 |     sdkv: Union[str, bool],
139 |     required: bool,
140 |     requirement_name: str,
141 |     exception_type: Type[Exception] = exp.EnvError,
142 |     hint: str = "",
143 | ):
144 |     """
145 |     Raise an exception if the required version number is not installed
146 |     """
147 | 
148 |     msg = (
149 |         f"{requirement_name}>={MIN_RELEASE_VERSION} is a required dependency "
150 |         "for this part of GroqFlow"
151 |     )
152 | 
153 |     # Package not found
154 |     if not sdkv and required:
155 |         msg = msg + f". However, {requirement_name} was not found. "
156 |         raise exception_type(msg + hint)
157 | 
158 |     # Package found, but version is not acceptable
159 |     elif version_a_less_than_b(sdkv, MIN_RELEASE_VERSION) and required:
160 |         msg = msg + f" ({sdkv} is installed). "
161 |         raise exception_type(msg + hint)
162 | 
163 | 
164 | def validate_os_version() -> OS:
165 | 
166 |     supported_os_names = [x.value for x in OS]
167 |     unsupported_os_msg = (
168 |         "Your OS must be one of the following Linux distributions: "
169 |         f"{', '.join(supported_os_names)}. Please refer to our installation "
170 |         "guide for more details on supported versions."
171 | ) 172 | 173 | # Check if this is a linux-based OS 174 | if not os.path.isfile("/etc/os-release"): 175 | raise exp.EnvError(unsupported_os_msg) 176 | 177 | # Parse OS-release data 178 | with open("/etc/os-release", encoding="utf-8") as f: 179 | os_release = {} 180 | for line in f: 181 | k, v = line.rstrip().split("=") 182 | os_release[k] = v.replace('"', "") 183 | 184 | # Check if OS is supported 185 | if os_release["NAME"] not in supported_os_names: 186 | raise exp.EnvError(unsupported_os_msg) 187 | 188 | return OS(os_release["NAME"]) 189 | 190 | 191 | def validate_devtools( 192 | os_version: OS, 193 | required=False, 194 | exception_type: Type[Exception] = exp.EnvError, 195 | ): 196 | version = _installed_package_version("groq-devtools", os_version) 197 | hint = "Please contact sales@groq.com to get access to groq-devtools." 198 | version_is_valid(version, required, "groq-devtools", exception_type, hint) 199 | 200 | 201 | def validate_runtime( 202 | os_version: OS, 203 | required=False, 204 | exception_type: Type[Exception] = exp.EnvError, 205 | ): 206 | version = _installed_package_version("groq-runtime", os_version) 207 | hint = "Please contact sales@groq.com to get access to groq-runtime." 208 | version_is_valid(version, required, "groq-runtime", exception_type, hint) 209 | 210 | 211 | # Returns the root directory of the current git repo and any associated 212 | # error from running the git command 213 | def get_repo_root(): 214 | p = subprocess.Popen( 215 | ["git", "rev-parse", "--show-toplevel"], 216 | stdout=subprocess.PIPE, 217 | stderr=subprocess.PIPE, 218 | ) 219 | out, err = p.communicate() 220 | repo = out.decode("utf-8") 221 | repo = repo.rstrip("\n") 222 | err = err.decode("utf-8") 223 | return repo, err 224 | 225 | 226 | def validate_bake(): 227 | if not shutil.which("bake"): 228 | raise exp.EnvError( 229 | ( 230 | "Bake must be available when the env var " 231 | f'{build.environment_variables["dont_use_sdk"]} is set to True' 232 | ) 233 | ) 234 | 235 | # bake commands require Groq to be current git repo 236 | repo, err = get_repo_root() 237 | groq_root = repo.split("/")[-1] == "Groq" 238 | 239 | if err: 240 | raise exp.EnvError( 241 | ( 242 | "You must be inside the Groq repo when the env var " 243 | f'{build.environment_variables["dont_use_sdk"]} is set to True. ' 244 | f"groqit() returned with error {err}" 245 | ) 246 | ) 247 | 248 | elif not groq_root: 249 | raise exp.EnvError( 250 | ( 251 | "You must be inside the Groq repo when the env var " 252 | f'{build.environment_variables["dont_use_sdk"]} is set to True. 
' 253 | f"groqit() detected you are inside repo {repo}" 254 | ) 255 | ) 256 | 257 | 258 | def check_dependencies( 259 | require_devtools: bool = False, 260 | require_runtime: bool = False, 261 | exception_type: Type[Exception] = exp.EnvError, 262 | ): 263 | 264 | # Skip dependency check if necessary 265 | if os.environ.get("GROQFLOW_SKIP_SDK_CHECK") == "True": 266 | return True 267 | 268 | # Check for bake if SDK is not being used 269 | if not build.USE_SDK: 270 | validate_bake() 271 | # Check for the different SDK components when using the SDK 272 | # Skip all checks if using CI 273 | else: 274 | os_version = validate_os_version() 275 | 276 | # Only check for the package that is required 277 | if require_devtools: 278 | validate_devtools( 279 | os_version=os_version, 280 | required=require_devtools, 281 | exception_type=exception_type, 282 | ) 283 | 284 | # Only check for the package that is required 285 | if require_runtime: 286 | validate_runtime( 287 | os_version=os_version, 288 | required=require_runtime, 289 | exception_type=exception_type, 290 | ) 291 | -------------------------------------------------------------------------------- /groqflow/groqmodel/__init__.py: -------------------------------------------------------------------------------- 1 | from .groqmodel import GroqModel 2 | from .groqmodel import load 3 | -------------------------------------------------------------------------------- /groqflow/groqmodel/execute.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following script is used to get the latency and outputs of a given run on the GroqChip. 3 | This script doesn't depend on GroqFlow to be executed. 4 | """ 5 | # pylint: disable = no-name-in-module 6 | # pylint: disable = import-error 7 | import argparse 8 | from timeit import Timer 9 | from typing import Tuple, List 10 | import numpy as np 11 | import groq.api as g 12 | import groq.runner.tsp as tsp 13 | 14 | 15 | def get_multi_tsp_runner( 16 | compile_dir: str, topology: str, bringup_topology: bool = False 17 | ) -> tsp.local_runner.MultichipTSPRunner: 18 | 19 | # FIXME: topo_config is defined in two files, both assembler_multichip.py 20 | # and execute.py. If you modify this code, make sure to modify it in 21 | # both places. We will remove this code replication when we are able to 22 | # import the groqit.misc package into execute.py. 
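# Note: only the 2-, 4-, and 8-chip Dragonfly topologies are mapped in
# topo_config below, so multi-chip execution through this script assumes
# one of those configurations.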
23 | 
24 |     # Declare different topologies
25 |     topo_config = {
26 |         "DF_A14_2_CHIP": g.TopologyConfig.DF_A14_2_CHIP,
27 |         "DF_A14_4_CHIP": g.TopologyConfig.DF_A14_4_CHIP,
28 |         "DF_A14_8_CHIP": g.TopologyConfig.DF_A14_8_CHIP,
29 |     }
30 | 
31 |     if bringup_topology:
32 |         print("Bringup C2C topology...")
33 |         tsp.bringup_topology(user_config=topo_config[topology])
34 | 
35 |     program_name = "output"
36 |     tsp_runner = tsp.create_multi_tsp_runner(
37 |         program_name,
38 |         compile_dir,
39 |         program_name,
40 |         user_config=topo_config[topology],
41 |     )
42 |     return tsp_runner
43 | 
44 | 
45 | def rtime(func, num_times: int, *args, **kwargs) -> Tuple[float, List]:
46 |     """
47 |     Time a given function over num_times executions and return the total
48 |     elapsed time in seconds, along with the function's output
49 |     """
50 |     output_container = []
51 | 
52 |     def wrapper():
53 |         output_container.append(func(*args, **kwargs))
54 | 
55 |     timer = Timer(wrapper)
56 |     delta = timer.timeit(num_times)
57 |     return delta, output_container.pop()
58 | 
59 | 
60 | def run(
61 |     input_batch: np.ndarray,
62 |     num_chips: int,
63 |     output_dir: str,
64 |     topology: str,
65 |     bringup_topology: bool,
66 |     repetitions=1,
67 | ) -> Tuple[float, List]:
68 | 
69 |     # Get tsp_runner
70 |     if num_chips == 1:
71 |         iop_file = f"{output_dir}/compile/output.iop"
72 |         tsp_runner = tsp.create_tsp_runner(iop_file)
73 |     else:
74 |         compile_dir = f"{output_dir}/compile"
75 |         tsp_runner = get_multi_tsp_runner(compile_dir, topology, bringup_topology)
76 | 
77 |     # The multi-TSP runner pipelines inputs through the entire topology of
78 |     # the program one chip at a time, so to get the actual output from the
79 |     # entire graph we need to invoke the runner `num_chips` times
80 |     def forward_multichip(example):
81 |         for _ in range(num_chips):
82 |             output = tsp_runner(**example)
83 |         return output
84 | 
85 |     # Forward function for models compiled for a single chip
86 |     def forward_singlechip(example):
87 |         return tsp_runner(**example)
88 | 
89 |     forward = forward_singlechip if num_chips == 1 else forward_multichip
90 |     batch_size = len(input_batch)
91 |     output_batch = []
92 |     total_latency = 0.0
93 |     for idx in range(batch_size):
94 |         example = input_batch[idx]
95 |         latency, output = rtime(forward, repetitions, example)
96 |         total_latency += latency
97 |         output_batch.append(output)
98 | 
99 |     total_latency = total_latency / repetitions / batch_size
100 | 
101 |     return total_latency, output_batch
102 | 
103 | 
104 | if __name__ == "__main__":
105 | 
106 |     # Disabling lint warning for using pickle
107 |     # pylint: disable = unexpected-keyword-arg
108 | 
109 |     # Terminology:
110 |     # This script receives a batch of inputs (input_batch)
111 |     # Each element of this batch is called an "example"
112 |     # Each example may contain one or more arguments
113 | 
114 |     # Parse Inputs
115 |     parser = argparse.ArgumentParser(description="Execute models built by GroqFlow")
116 |     parser.add_argument(
117 |         "num_chips",
118 |         type=int,
119 |         help="Number of chips used to build the model",
120 |     )
121 |     parser.add_argument("output_dir", help="Path where the build files are stored")
122 |     parser.add_argument("outputs_file", help="File in which the outputs will be saved")
123 |     parser.add_argument("latency_file", help="File in which the latency will be saved")
124 |     parser.set_defaults(bringup_topology=False)
125 |     parser.add_argument("topology", help="GroqChip topology used when building model")
126 |     parser.add_argument(
127 |         "repetitions",
128 |         type=int,
129 |         help="Number of times to execute the received inputs",
130 |     )
131 | 
parser.add_argument(
132 | "--bringup_topology",
133 | help="Describes whether or not the topology should be initialized",
134 | action="store_true",
135 | )
136 | args = vars(parser.parse_args())
137 | 
138 | # Read inputs
139 | input_file = f"{args['output_dir']}/inputs.npy"
140 | input_batch = np.load(input_file, allow_pickle=True)
141 | 
142 | # Get latency/output_data
143 | latency, output_data = run(
144 | input_batch,
145 | args["num_chips"],
146 | args["output_dir"],
147 | args["topology"],
148 | args["bringup_topology"],
149 | repetitions=args["repetitions"],
150 | )
151 | 
152 | # Save results to file
153 | with open(args["outputs_file"], "wb") as f:
154 | np.save(f, output_data)
155 | with open(args["latency_file"], "wb") as f:
156 | np.save(f, latency)
157 | 
-------------------------------------------------------------------------------- /groqflow/groqmodel/remote.py: --------------------------------------------------------------------------------
1 | import base64
2 | import hashlib
3 | import io
4 | import os
5 | from typing import Any, Collection, Dict, List, Tuple
6 | from dataclasses import dataclass
7 | import requests
8 | import numpy as np
9 | import groqflow.common.build as build
10 | 
11 | 
12 | @dataclass
13 | class UploadUrls:
14 | iops: List[str]
15 | inputs: List[Dict[str, str]]
16 | use_cached_iop: bool
17 | 
18 | 
19 | @dataclass
20 | class DownloadUrls:
21 | outputs: List[Dict[str, str]]
22 | 
23 | 
24 | # NOTE: frozen=True because mutations between upload and run wouldn't be caught
25 | # otherwise (objects are passed by reference)
26 | @dataclass(frozen=True)
27 | class RemoteGroqModel:
28 | user_name: str
29 | build_name: str
30 | iop_path: str
31 | num_chips: int
32 | num_iterations: int
33 | input_batch: List[Dict[str, np.ndarray]]
34 | input_names: List[str]
35 | post_check_remote_cache_endpoint: str
36 | post_upload_urls_endpoint: str
37 | post_execute_endpoint: str
38 | 
39 | def _serialize(self, data: np.ndarray) -> bytes:
40 | buffer = io.BytesIO()
41 | np.save(buffer, data, allow_pickle=False)
42 | buffer.seek(0)
43 | return buffer.read()
44 | 
45 | def _deserialize(self, data_raw: bytes) -> np.ndarray:
46 | buffer = io.BytesIO(data_raw)
47 | buffer.seek(0)
48 | return np.load(buffer, allow_pickle=False)
49 | 
50 | def _upload_helper(self, url: str, data_raw: bytes) -> None:
51 | # TODO:(epatrick): error handling
52 | _response = requests.put(
53 | url,
54 | headers={"Content-Type": "application/octet-stream"},
55 | data=data_raw,
56 | )
57 | 
58 | def _download_helper(self, url: str) -> bytes:
59 | response = requests.get(
60 | url,
61 | headers={"Content-Type": "application/octet-stream"},
62 | )
63 | return response.content
64 | 
65 | def _calc_file_md5(self, file: str) -> str:
66 | with open(file, "rb") as iop:
67 | data_bytes = iop.read()
68 | md5_bytes = hashlib.md5(data_bytes).digest()
69 | return base64.b64encode(md5_bytes).decode("utf-8")
70 | 
71 | def _calc_iop_md5s(self) -> List[str]:
72 | iop_files = (
73 | [f"{self.iop_path}/output.iop"]
74 | if self.num_chips == 1
75 | else [f"{self.iop_path}/output.{i}.iop" for i in range(self.num_chips)]
76 | )
77 | return list(map(self._calc_file_md5, iop_files))
78 | 
79 | def check_remote_cache(self) -> bool:
80 | iop_md5s = self._calc_iop_md5s()
81 | response = requests.post(
82 | self.post_check_remote_cache_endpoint,
83 | json={
84 | "user_name": self.user_name,
85 | "build_name": self.build_name,
86 | "num_chips": self.num_chips,
87 | "iop_md5s": iop_md5s,
88 | },
89 | )
90 | 91 | if not response.ok: 92 | # NOTE: we may choose to ignore the exception and pretend the cache 93 | # returned false but that should be done at the callsite 94 | raise Exception( 95 | f"error status code: {response.status_code}, message: {response.text}" 96 | ) 97 | 98 | body = response.json() 99 | cache_hit: bool = body["cache_hit"] 100 | return cache_hit 101 | 102 | def get_upload_urls(self, skip_iop_urls: bool = False) -> UploadUrls: 103 | response = requests.post( 104 | self.post_upload_urls_endpoint, 105 | json={ 106 | "user_name": self.user_name, 107 | "build_name": self.build_name, 108 | "num_chips": self.num_chips, 109 | "input_names": self.input_names, 110 | "batch_size": len(self.input_batch), 111 | "skip_iop_urls": skip_iop_urls, 112 | }, 113 | ) 114 | 115 | if not response.ok: 116 | raise Exception( 117 | f"error status code: {response.status_code}, message: {response.text}" 118 | ) 119 | 120 | body = response.json() 121 | 122 | input_urls: List[Dict[str, str]] = body["input_urls"] 123 | iop_urls: List[str] = [] if skip_iop_urls else body["iop_urls"] 124 | 125 | return UploadUrls( 126 | iops=iop_urls, inputs=input_urls, use_cached_iop=skip_iop_urls 127 | ) 128 | 129 | def _upload_batch( 130 | self, input_batch: Dict[str, np.ndarray], upload_urls_batch: Dict[str, str] 131 | ) -> None: 132 | for input_name, input_data in input_batch.items(): 133 | input_url = upload_urls_batch[input_name] 134 | input_raw = self._serialize(input_data) 135 | self._upload_helper(input_url, input_raw) 136 | 137 | def upload(self, upload_urls: UploadUrls) -> None: 138 | # TODO: error handling 139 | if not upload_urls.use_cached_iop: 140 | if self.num_chips == 1: 141 | iop_files = [f"{self.iop_path}/output.iop"] 142 | else: 143 | iop_files = [ 144 | f"{self.iop_path}/output.{i}.iop" for i in range(self.num_chips) 145 | ] 146 | 147 | for iop_file, iop_url in zip(iop_files, upload_urls.iops): 148 | with open(iop_file, "rb") as iop: 149 | self._upload_helper(iop_url, iop) 150 | 151 | for batch_index, input_batch in enumerate(self.input_batch): 152 | self._upload_batch(input_batch, upload_urls.inputs[batch_index]) 153 | 154 | def _execute(self) -> Tuple[DownloadUrls, Dict[str, Any]]: 155 | response = requests.post( 156 | self.post_execute_endpoint, 157 | json={ 158 | "user_name": self.user_name, 159 | "build_name": self.build_name, 160 | "num_chips": self.num_chips, 161 | "input_names": self.input_names, 162 | "batch_size": len(self.input_batch), 163 | "num_iterations": self.num_iterations, 164 | }, 165 | ) 166 | 167 | if not response.ok: 168 | raise Exception( 169 | f"error status code: {response.status_code}, message: {response.text}" 170 | ) 171 | 172 | body = response.json() 173 | output_urls = body["output_urls"] 174 | stats = body["stats"] 175 | 176 | return DownloadUrls(outputs=output_urls), stats 177 | 178 | def _download(self, download_urls: DownloadUrls) -> List[Dict[str, np.ndarray]]: 179 | outputs = [] 180 | for output_urls in download_urls.outputs: 181 | output = {} 182 | for output_name, output_url in output_urls.items(): 183 | output_raw = self._download_helper(output_url) 184 | output[output_name] = self._deserialize(output_raw) 185 | outputs.append(output) 186 | return outputs 187 | 188 | def run(self) -> Tuple[List[Dict[str, np.ndarray]], Dict[str, Any]]: 189 | """ 190 | Invokes this remote groq model. 
191 | 192 | Returns: (output_batch, stats) 193 | An output_batch where output_batch[i] corresponds to input_batch[i] 194 | A dictionary of stats for how the model ran on TSPs 195 | """ 196 | download_urls, stats = self._execute() 197 | output_batch = self._download(download_urls) 198 | return output_batch, stats 199 | 200 | 201 | class RemoteClient: 202 | """ 203 | A client for running TSP models using remote backend 204 | """ 205 | 206 | # Backend URL is the IP of where the remote server is hosted 207 | # TODO: Replace backend_url by a hostname 208 | def __init__(self, backend_url: str = "http://34.125.159.215"): 209 | self.post_check_remote_cache_endpoint = f"{backend_url}/storage/cache/check" 210 | self.post_upload_urls_endpoint = f"{backend_url}/storage/upload-urls" 211 | self.post_execute_endpoint = f"{backend_url}/execute" 212 | self.user_name = os.getlogin() 213 | 214 | def upload( 215 | self, 216 | user_name: str, 217 | build_name: str, 218 | compile_dir: str, 219 | num_chips: int, 220 | input_batch: Collection[Dict[str, np.ndarray]], 221 | num_iterations: int = 1, 222 | ) -> RemoteGroqModel: 223 | """ 224 | A lower level interface to upload a remote groq model ahead of time. You may 225 | invoke the remote groq model with the returned RemoteGroqModel object. 226 | 227 | You should also use this interface if you want to combine the functionality of 228 | benchmark and run_abunch. 229 | 230 | Args: 231 | user_name: Username of the caller 232 | build_name: Name of the build 233 | compile_dir: Full path to the directory containing the IOP file(s) 234 | num_chips: Number of chips for the remote groq model 235 | input_batch: Data used as input for the remote groq model. Execution 236 | will be done once per batch 237 | num_iterations: How many executions the statistics should be averaged over 238 | (default = 1) 239 | 240 | Returns: 241 | A RemoteGroqModel object that can be used to invoke the uploaded model 242 | """ 243 | 244 | input_names = [] if len(input_batch) == 0 else list(list(input_batch)[0].keys()) 245 | remote_gm = RemoteGroqModel( 246 | user_name, 247 | build_name, 248 | compile_dir, 249 | num_chips, 250 | num_iterations, 251 | input_batch, 252 | input_names, 253 | self.post_check_remote_cache_endpoint, 254 | self.post_upload_urls_endpoint, 255 | self.post_execute_endpoint, 256 | ) 257 | cache_hit = remote_gm.check_remote_cache() 258 | upload_urls = remote_gm.get_upload_urls(skip_iop_urls=cache_hit) 259 | remote_gm.upload(upload_urls) 260 | return remote_gm 261 | 262 | def execute( 263 | self, 264 | state: build.GroqState, 265 | repetitions: int, 266 | ): 267 | """ 268 | Executes a build on the given inputs and saves results to disk. 
269 | 
270 | Args:
271 | state: State of the build being executed
272 | repetitions: Number of times to execute a build
273 | """
274 | inputs_file = state.execution_inputs_file
275 | inputs_data = np.load(inputs_file, allow_pickle=True)
276 | latency_file = state.latency_file
277 | outputs_file = state.outputs_file
278 | remote_gm = self.upload(
279 | self.user_name,
280 | state.config.build_name,
281 | state.compile_dir,
282 | state.num_chips_used,
283 | inputs_data,
284 | repetitions,
285 | )
286 | output_batch, stats = remote_gm.run()
287 | latency_avg = stats["exec_time_seconds"]["mean"]
288 | np.save(latency_file, latency_avg)
289 | outputs_data = output_batch
290 | np.save(outputs_file, outputs_data, allow_pickle=True)
291 | 
-------------------------------------------------------------------------------- /groqflow/justgroqit/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/groq/groqflow/32740b44aea43d4ecf5d2fa4a2ce3d0f040e8bf0/groqflow/justgroqit/__init__.py
-------------------------------------------------------------------------------- /groqflow/justgroqit/assemble_multichip.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import groq.api as g
3 | 
4 | 
5 | def assembler_multichip(topology, compile_dir, is_large_program=False):
6 | 
7 | # FIXME: topo_config is defined in two files, both assemble_multichip.py
8 | # and execute.py. If you modify this code, make sure to modify it in
9 | # both places. We will remove this code replication when we are able to
10 | # import groqflow packages into these files.
11 | 
12 | # Identify the topology. The topology specified with
13 | # groq-compiler should match the one configured here.
14 | topo_config = {
15 | "DF_A14_2_CHIP": g.TopologyConfig.DF_A14_2_CHIP,
16 | "DF_A14_4_CHIP": g.TopologyConfig.DF_A14_4_CHIP,
17 | "DF_A14_8_CHIP": g.TopologyConfig.DF_A14_8_CHIP,
18 | "DF_A14_16_CHIP": g.TopologyConfig.DF_A14_16_CHIP,
19 | "DF_A14_32_CHIP": g.TopologyConfig.DF_A14_32_CHIP,
20 | "DF_A14_64_CHIP": g.TopologyConfig.DF_A14_64_CHIP,
21 | "RT09_A14_16_CHIP": g.TopologyConfig.RT09_A14_16_CHIP,
22 | "RT09_A14_32_CHIP": g.TopologyConfig.RT09_A14_32_CHIP,
23 | "RT09_A14_40_CHIP": g.TopologyConfig.RT09_A14_40_CHIP,
24 | "RT09_A14_48_CHIP": g.TopologyConfig.RT09_A14_48_CHIP,
25 | "RT09_A14_56_CHIP": g.TopologyConfig.RT09_A14_56_CHIP,
26 | "RT09_A14_64_CHIP": g.TopologyConfig.RT09_A14_64_CHIP,
27 | "RT09_A14_72_CHIP": g.TopologyConfig.RT09_A14_72_CHIP,
28 | }
29 | 
30 | # Select topology
31 | topo = g.configure_topology(config=topo_config[topology])
32 | 
33 | # Initiate the program package object with package name and output directory
34 | md_pgm_pkg = g.ProgramPackage(name="output", output_dir=compile_dir)
35 | 
36 | # Assign the name and topology to create the program context
37 | pgm_ctx = md_pgm_pkg.create_program_context("output", topo)
38 | 
39 | # Add the .aa files created by groq-compiler to the program package
40 | md_pgm_pkg.add_precompiled_program(pgm_ctx, compile_dir, "output")
41 | 
42 | # If any extra instruction memory slices were defined
43 | # during groq-compiler, add them here.
44 | if is_large_program:
45 | extra_slices = [
46 | "West 18",
47 | "West 19",
48 | "East 17",
49 | "East 18",
50 | "East 19",
51 | "East 38",
52 | ]
53 | else:
54 | extra_slices = []
55 | 
56 | # The .assemble method takes all the files and topologies and
57 | # assembles the multi chip program package.
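    # Example invocation of this script (the build path is hypothetical; the
    # -t and -d flags are defined in the argparse section below):
    #
    #   python assemble_multichip.py -t DF_A14_4_CHIP \
    #       -d ~/.cache/groqflow/my_build/compile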
58 | print("Starting multi-chip assembling process", flush=True) 59 | md_pgm_pkg.assemble(extra_ifetch_slices=extra_slices, ifetch_from_self=True) 60 | 61 | 62 | if __name__ == "__main__": 63 | 64 | # Parse command line arguments 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument( 67 | "-t", 68 | dest="topology", 69 | help="GroqCard topology for multi-chip assembly", 70 | required=True, 71 | ) 72 | parser.add_argument( 73 | "-d", 74 | dest="compile_dir", 75 | help="Directory for inputs and outputs", 76 | required=True, 77 | ) 78 | parser.add_argument( 79 | "-l", 80 | dest="is_large_program", 81 | help="If compiler uses --large-program the set to True", 82 | required=False, 83 | default=None, 84 | ) 85 | 86 | args = parser.parse_args() 87 | 88 | # Run script 89 | assembler_multichip(args.topology, args.compile_dir, args.is_large_program) 90 | -------------------------------------------------------------------------------- /groqflow/justgroqit/export.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | import sys 4 | import warnings 5 | import torch 6 | import onnxflow.justbuildit.stage as stage 7 | import onnxflow.common.exceptions as exp 8 | import onnxflow.common.tensor_helpers as tensor_helpers 9 | import groqflow.common.build as build 10 | import groqflow.common.onnx_helpers as onnx_helpers 11 | import groqflow.common.sdk_helpers as sdk 12 | 13 | 14 | def _warn_to_stdout(message, category, filename, line_number, _, line): 15 | sys.stdout.write( 16 | warnings.formatwarning(message, category, filename, line_number, line) 17 | ) 18 | 19 | 20 | class CheckOnnxCompatibility(stage.Stage): 21 | """ 22 | Stage that takes an ONNX file, checks whether it is compatible 23 | with Groq Compiler, and raises an exception if the ONNX file is 24 | not compatible. 25 | 26 | Expected inputs: 27 | - state.intermediate_results contains a single .onnx file 28 | 29 | Outputs: 30 | - The same ONNX file as the input 31 | """ 32 | 33 | def __init__(self): 34 | super().__init__( 35 | unique_name="check_compatibility", 36 | monitor_message="Checking for Op support", 37 | ) 38 | 39 | def fire(self, state: build.GroqState): 40 | 41 | sdk.check_dependencies(require_devtools=True, exception_type=exp.StageError) 42 | 43 | # TODO: validate this input 44 | # https://git.groq.io/code/Groq/-/issues/13947 45 | input_onnx = state.intermediate_results[0] 46 | 47 | ( 48 | state.info.opt_onnx_ops, 49 | state.info.opt_onnx_unsupported_ops, 50 | ) = onnx_helpers.check_ops(input_onnx, state.use_sdk) 51 | print(f"Model has {len(state.info.opt_onnx_unsupported_ops)} unsupported ops") 52 | 53 | state.info.opt_onnx_all_ops_supported = ( 54 | len(state.info.opt_onnx_unsupported_ops) == 0 55 | and len(state.info.opt_onnx_ops) != 0 56 | ) 57 | 58 | if not state.info.opt_onnx_all_ops_supported: 59 | ops = ", ".join(state.info.opt_onnx_unsupported_ops) 60 | msg = f""" 61 | You model contains ONNX operation(s) that are not supported by Groq Compiler: 62 | **{ops}** 63 | Please replace these operation(s) in your model or contact 64 | sales@groq.com to request improved operation support in Groq Compiler. 65 | """ 66 | raise exp.StageError(msg) 67 | 68 | return state 69 | 70 | 71 | class ExportPytorchToTorchScript(stage.Stage): 72 | """ 73 | Stage that takes a Pytorch module and exports it to TorchScript using 74 | torch.jit API. 
75 | 
76 | Expected inputs:
77 | - state.model is a torch.nn.Module or torch.jit.ScriptModule
78 | - state.inputs is a dict that represents valid kwargs to the forward
79 | function of state.model
80 | 
81 | Outputs:
82 | - A *.pt file that implements state.model given state.inputs
83 | """
84 | 
85 | def __init__(self):
86 | super().__init__(
87 | unique_name="export_pytorch_to_torch_script",
88 | monitor_message="Exporting PyTorch to TorchScript",
89 | )
90 | 
91 | @staticmethod
92 | def _check_model(torch_script_file, success_message, fail_message) -> bool:
93 | if os.path.isfile(torch_script_file):
94 | print(success_message)
95 | return True
96 | else:
97 | print(fail_message)
98 | return False
99 | 
100 | def fire(self, state: build.GroqState):
101 | if not isinstance(state.model, (torch.nn.Module, torch.jit.ScriptModule)):
102 | msg = f"""
103 | The current stage (ExportPytorchToTorchScript) is only compatible
104 | with models of type torch.nn.Module or torch.jit.ScriptModule,
105 | however the stage received a model of type {type(state.model)}.
106 | """
107 | raise exp.StageError(msg)
108 | 
109 | if isinstance(state.model, torch.nn.Module):
110 | # Validate user provided args
111 | all_args = list(inspect.signature(state.model.forward).parameters.keys())
112 | 
113 | for inp in list(state.inputs.keys()):
114 | if inp not in all_args:
115 | msg = f"""
116 | Input name {inp} not found in the model's forward method. Available
117 | input names are: {all_args}
118 | """
119 | raise ValueError(msg)
120 | 
121 | # Send torch export warnings to stdout (and therefore the log file)
122 | # so that they don't fill up the command line
123 | default_warnings = warnings.showwarning
124 | warnings.showwarning = _warn_to_stdout
125 | 
126 | # Export the model to TorchScript
127 | jit_module = torch.jit.trace(
128 | state.model,
129 | example_kwarg_inputs=state.inputs,
130 | )
131 | 
132 | # Save model to disk
133 | os.makedirs(state.torch_script_dir, exist_ok=True)
134 | jit_module.save(state.torch_script_file)
135 | 
136 | # Save output names to ensure we are preserving the order of the outputs.
137 | # We have to re-load the torchscript module because the output names
138 | # will change during serialization.
139 | loaded_jit_module = torch.jit.load(state.torch_script_file)
140 | state.expected_output_names = [
141 | output.debugName() for output in loaded_jit_module.graph.outputs()
142 | ]
143 | 
144 | # Restore default warnings behavior
145 | warnings.showwarning = default_warnings
146 | 
147 | tensor_helpers.save_inputs(
148 | [state.inputs], state.original_inputs_file, downcast=False
149 | )
150 | 
151 | # Check if the base model has been exported successfully
152 | success_msg = "\tSuccess exporting model to TorchScript"
153 | fail_msg = "\tFailed exporting model to TorchScript"
154 | state.info.torch_script_exported = self._check_model(
155 | state.torch_script_file, success_msg, fail_msg
156 | )
157 | 
158 | if state.info.torch_script_exported:
159 | state.intermediate_results = [state.torch_script_file]
160 | else:
161 | msg = f"""
162 | Unable to export model to TorchScript using Torch's jit exporter.
163 | We recommend that you modify your model until it is
164 | compatible with this third-party software, then re-run.
165 | More information may be available in the log file at **{self.logfile_path}** 166 | """ 167 | raise exp.StageError(msg) 168 | 169 | return state 170 | -------------------------------------------------------------------------------- /groqflow/justgroqit/groqit.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Dict, Any 2 | from collections.abc import Collection 3 | import onnxflow.common.printing as printing 4 | import onnxflow.justbuildit.stage as stage 5 | import onnxflow.common.build as of_build 6 | import groqflow.justgroqit.ignition as ignition 7 | import groqflow.groqmodel as groqmodel 8 | import groqflow.common.build as build 9 | 10 | 11 | def groqit( 12 | model: of_build.UnionValidModelInstanceTypes = None, 13 | inputs: Optional[Dict[str, Any]] = None, 14 | build_name: Optional[str] = None, 15 | cache_dir: Optional[str] = build.DEFAULT_CACHE_DIR, 16 | monitor: bool = True, 17 | rebuild: Optional[str] = None, 18 | compiler_flags: Optional[List[str]] = None, 19 | assembler_flags: Optional[List[str]] = None, 20 | num_chips: Optional[int] = None, 21 | topology: str = build.TOPOLOGY, 22 | groqview: bool = False, 23 | sequence: Optional[stage.Sequence] = None, 24 | quantization_samples: Collection = None, 25 | ) -> groqmodel.GroqModel: 26 | 27 | """Use GroqFlow to build a model instance into a GroqModel 28 | object that can be executed on GroqChip processors. 29 | 30 | Args: 31 | model: Model to be mapped to a GroqModel, which can be a PyTorch 32 | model instance, Keras model instance, a path to an ONNX file, or 33 | a path to a Python script that follows the GroqFlow model.py template. 34 | inputs: Example inputs to the user's model. The GroqModel will be 35 | compiled to handle inputs with the same static shape only. Argument 36 | is not required if the model input is a GroqFlow model.py file. 37 | build_name: Unique name for the model that will be 38 | used to store the GroqModel and build state on disk. Defaults to the 39 | name of the file that calls groqit(). 40 | cache_dir: Directory to use as the GroqFlow cache for this build. Output files 41 | from this build will be stored at cache_dir/build_name/ 42 | Defaults to ~/.cache/groqflow 43 | monitor: Display a monitor on the command line that 44 | tracks the progress of groqit as it builds the GroqModel. 45 | rebuild: determines whether to rebuild or load a cached build. Options: 46 | - "if_needed" (default): overwrite invalid cached builds with a rebuild 47 | - "always": overwrite valid cached builds with a rebuild 48 | - "never": load cached builds without checking validity, with no guarantee 49 | of functionality or correctness 50 | - None: Falls back to default 51 | compiler_flags: Override groqit's default compiler flags with a list 52 | of user-specified flags. 53 | assembler_flags: Override groqit's default assembler flags with a 54 | list of user-specified flags. 55 | num_chips: Override the default number of GroqChip processors to be 56 | used instead of letting groqit decide automatically. Power users 57 | only. 58 | topology: Override the default topology when num_chips > 1. Power users 59 | only. 60 | groqview: If set, creates a GroqView file for the model during the 61 | build process. Defaults to false because this option uses up 62 | significant time and compute/RAM resources. 63 | sequence: Override groqit's default sequence of build stages. Power 64 | users only. 
65 | quantization_samples: If set, performs post-training quantization
66 | on the ONNX model using the provided samples, then generates a
67 | GroqModel from the quantized model. If the previous build used samples
68 | that are different from the samples used in the current build, the "rebuild"
69 | argument needs to be manually set to "always" in the current build
70 | in order to create a new GroqModel.
71 | """
72 | 
73 | # Validate and lock in the groqit() config (user arguments that
74 | # configure the build) that will be used by the rest of groqit()
75 | config = ignition.lock_config(
76 | model=model,
77 | build_name=build_name,
78 | compiler_flags=compiler_flags,
79 | assembler_flags=assembler_flags,
80 | groqview=groqview,
81 | groqcard=build.GROQCARD,
82 | num_chips=num_chips,
83 | topology=topology,
84 | sequence=sequence,
85 | )
86 | 
87 | # Analyze the user's model argument and lock in the model, inputs,
88 | # and sequence that will be used by the rest of groqit()
89 | (model_locked, inputs_locked, sequence_locked, model_type) = ignition.model_intake(
90 | model,
91 | inputs,
92 | sequence,
93 | config,
94 | user_quantization_samples=quantization_samples,
95 | )
96 | 
97 | # Get the state of the model from the GroqFlow cache if a valid build is available
98 | state = ignition.load_or_make_state(
99 | config=config,
100 | cache_dir=cache_dir,
101 | rebuild=rebuild or build.DEFAULT_REBUILD_POLICY,
102 | model_type=model_type,
103 | monitor=monitor,
104 | use_sdk=build.USE_SDK,
105 | model=model_locked,
106 | inputs=inputs_locked,
107 | quantization_samples=quantization_samples,
108 | )
109 | 
110 | # Return a cached build if possible, otherwise prepare the model State for
111 | # a build
112 | if state.build_status == of_build.Status.SUCCESSFUL_BUILD:
113 | # Successful builds can be loaded from cache and returned with
114 | # no additional steps
115 | additional_msg = " (build_name auto-selected)" if config.auto_name else ""
116 | printing.log_success(
117 | f' Build "{config.build_name}"{additional_msg} found in cache. Loading it!',
118 | )
119 | 
120 | return groqmodel.load(config.build_name, state.cache_dir)
121 | 
122 | state.quantization_samples = quantization_samples
123 | 
124 | sequence_locked.show_monitor(config, state.monitor)
125 | state = sequence_locked.launch(state)
126 | 
127 | if state.build_status == of_build.Status.SUCCESSFUL_BUILD:
128 | printing.log_success(
129 | f"\n Saved to **{of_build.output_dir(state.cache_dir, config.build_name)}**"
130 | )
131 | 
132 | return groqmodel.load(config.build_name, state.cache_dir)
133 | 
134 | else:
135 | printing.log_success(
136 | f"Build Sequence {sequence_locked.unique_name} completed successfully"
137 | )
138 | msg = """
139 | groqit() only returns a GroqModel instance if the Sequence includes a Stage
140 | that sets state.build_status=onnxflow.build.Status.SUCCESSFUL_BUILD.
141 | """ 142 | printing.log_warning(msg) 143 | return None 144 | -------------------------------------------------------------------------------- /groqflow/justgroqit/ignition.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Tuple, Union, Dict, Any 2 | from collections.abc import Collection 3 | from typeguard import typechecked 4 | 5 | import onnxflow.common.build as of_build 6 | import onnxflow.common.exceptions as exp 7 | import onnxflow.justbuildit.export as of_export 8 | import onnxflow.justbuildit.hummingbird as hummingbird 9 | import onnxflow.justbuildit.stage as stage 10 | import onnxflow.justbuildit.ignition as of_ignition 11 | 12 | import groqflow.common.build as build 13 | import groqflow.justgroqit.compile as compile 14 | import groqflow.justgroqit.export as gf_export 15 | 16 | 17 | from groqflow.version import __version__ as groqflow_version 18 | 19 | default_pytorch_export_sequence = stage.Sequence( 20 | "default_pytorch_export_sequence", 21 | "Exporting PyTorch Model", 22 | [ 23 | of_export.ExportPytorchModel(), 24 | of_export.OptimizeOnnxModel(), 25 | gf_export.CheckOnnxCompatibility(), 26 | of_export.ConvertOnnxToFp16(), 27 | ], 28 | ) 29 | 30 | default_pytorch_sequence = stage.Sequence( 31 | "default_pytorch_sequence", 32 | "Building PyTorch Model", 33 | [ 34 | default_pytorch_export_sequence, 35 | compile.CompileOnnx(), 36 | compile.Assemble(), 37 | ], 38 | ) 39 | 40 | pytorch_export_sequence_with_quantization = stage.Sequence( 41 | "pytorch_export_sequence_with_quantization", 42 | "Exporting PyTorch Model and Quantizing Exported ONNX", 43 | [ 44 | of_export.ExportPytorchModel(), 45 | of_export.OptimizeOnnxModel(), 46 | gf_export.CheckOnnxCompatibility(), 47 | of_export.QuantizeONNXModel(), 48 | ], 49 | ) 50 | 51 | pytorch_sequence_with_quantization = stage.Sequence( 52 | "pytorch_sequence_with_quantization", 53 | "Building PyTorch Model", 54 | [ 55 | pytorch_export_sequence_with_quantization, 56 | compile.CompileOnnx(), 57 | compile.Assemble(), 58 | ], 59 | ) 60 | 61 | 62 | pytorch_export_torch_script_sequence = stage.Sequence( 63 | "pytorch_export_torch_script", 64 | "Export Pytorch Model to TorchScript", 65 | [ 66 | gf_export.ExportPytorchToTorchScript(), 67 | ], 68 | ) 69 | 70 | pytorch_torch_importer_sequence = stage.Sequence( 71 | "pytorch_torch_importer", 72 | "Build PyTorch Model using Torch Importer Front-end", 73 | [ 74 | pytorch_export_torch_script_sequence, 75 | compile.CompileTorchScript(), 76 | compile.Assemble(), 77 | ], 78 | ) 79 | 80 | default_keras_export_sequence = stage.Sequence( 81 | "default_keras_export_sequence", 82 | "Exporting Keras Model", 83 | [ 84 | of_export.ExportKerasModel(), 85 | of_export.OptimizeOnnxModel(), 86 | gf_export.CheckOnnxCompatibility(), 87 | of_export.ConvertOnnxToFp16(), 88 | ], 89 | ) 90 | 91 | default_keras_sequence = stage.Sequence( 92 | "default_keras_sequence", 93 | "Building Keras Model", 94 | [ 95 | default_keras_export_sequence, 96 | compile.CompileOnnx(), 97 | compile.Assemble(), 98 | ], 99 | ) 100 | 101 | 102 | default_onnx_sequence = stage.Sequence( 103 | "default_onnx_sequence", 104 | "Building ONNX Model", 105 | [ 106 | of_export.ReceiveOnnxModel(), 107 | of_export.OptimizeOnnxModel(), 108 | gf_export.CheckOnnxCompatibility(), 109 | of_export.ConvertOnnxToFp16(), 110 | compile.CompileOnnx(), 111 | compile.Assemble(), 112 | ], 113 | ) 114 | 115 | default_hummingbird_sequence = stage.Sequence( 116 | "default_hummingbird_sequence", 117 | 
"Building Hummingbird Model", 118 | [ 119 | hummingbird.ConvertHummingbirdModel(), 120 | of_export.OptimizeOnnxModel(), 121 | gf_export.CheckOnnxCompatibility(), 122 | compile.CompileOnnx(), 123 | compile.Assemble(), 124 | ], 125 | ) 126 | 127 | default_compiler_flags: List[str] = [] 128 | 129 | default_assembler_flags = [ 130 | "--ifetch-from-self", 131 | "--ifetch-slice-ordering=round-robin", 132 | ] 133 | 134 | 135 | @typechecked 136 | def _validate_args( # pylint: disable = unused-argument 137 | build_name: Optional[str] = None, 138 | compiler_flags: Optional[List[str]] = None, 139 | assembler_flags: Optional[List[str]] = None, 140 | groqview: bool = False, 141 | groqcard: Optional[str] = build.GROQCARD_A14, 142 | num_chips: Optional[int] = None, 143 | topology: Optional[str] = build.TOPOLOGY, 144 | ): 145 | 146 | if groqcard is not build.GROQCARD_A14: 147 | msg = f""" 148 | You set groqit()'s groqcard argument to {groqcard}, which is not a supported value. The 149 | currently supported value is: {build.GROQCARD_A14}. 150 | """ 151 | raise exp.ArgError(msg) 152 | 153 | if num_chips is not None and num_chips > 1: 154 | if topology is not build.DRAGONFLY and topology is not build.ROTATIONAL: 155 | msg = f""" 156 | You set groqit()'s topology argument to {topology} 157 | for build {build_name}, which is not a supported value. Choose from the 158 | currently supported values: {build.DRAGONFLY}, {build.ROTATIONAL}. 159 | """ 160 | raise exp.ArgError(msg) 161 | 162 | supported_topology = build.supported_topology(groqcard, topology) 163 | if num_chips not in supported_topology.keys(): 164 | msg = f""" 165 | You set groqit()'s num_chips argument to {num_chips} with topology {topology} 166 | for build {build_name}, which is not a supported value. Choose from the 167 | currently supported chip counts: {supported_topology.keys()}. 168 | """ 169 | raise exp.ArgError(msg) 170 | 171 | if compiler_flags: 172 | if "--auto-asm" in compiler_flags: 173 | if assembler_flags: 174 | msg = """ 175 | The --auto-asm compiler flag is mutually exclusive with the assembler_flags argument 176 | argument to groqit(). Either set assembler_flags=None or do not use --auto-asm. 177 | """ 178 | raise exp.ArgError(msg) 179 | 180 | # groqit() may automatically apply certain Groq Compiler flags to each build 181 | # This check makes sure the user isn't creating a collision by also applying 182 | # any of these flags 183 | disallowed_compiler_flags = [ 184 | "--multichip", 185 | "--groqview", 186 | "--save-stats", 187 | "-o", 188 | ] 189 | for user_flag in compiler_flags: 190 | for disallowed_flag in disallowed_compiler_flags: 191 | if user_flag.startswith(disallowed_flag): 192 | msg = f""" 193 | The following compiler flags are reserved by groqit() and cannot be used 194 | in the groqit(compiler_flags=...) argument: {disallowed_compiler_flags}. 195 | However, your compiler_flags argument includes {user_flag}. 196 | """ 197 | raise exp.ArgError(msg) 198 | 199 | if assembler_flags and num_chips != 1: 200 | msg = """ 201 | The assembler_flags argument is incompatible with multi-chip models. 202 | Either set num_chips=1 or do not use assembler_flags. 
203 | """ 204 | raise exp.ArgError(msg) 205 | 206 | 207 | def lock_config( 208 | model, 209 | build_name: Optional[str] = None, 210 | compiler_flags: Optional[List[str]] = None, 211 | assembler_flags: Optional[List[str]] = None, 212 | groqview: bool = False, 213 | groqcard: Optional[str] = build.GROQCARD_A14, 214 | num_chips: Optional[int] = None, 215 | topology: Optional[str] = build.TOPOLOGY, 216 | sequence: stage.Sequence = None, 217 | ) -> build.GroqConfig: 218 | 219 | """ 220 | Process the user's configuration arguments to groqit(): 221 | 1. Raise exceptions for illegal arguments 222 | 2. Replace unset arguments with default values 223 | 3. Lock the configuration into an immutable object 224 | """ 225 | 226 | _validate_args( 227 | build_name=build_name, 228 | compiler_flags=compiler_flags, 229 | assembler_flags=assembler_flags, 230 | groqview=groqview, 231 | groqcard=groqcard, 232 | num_chips=num_chips, 233 | topology=topology, 234 | ) 235 | 236 | # Override the onnxflow default opset with GroqFlow's default 237 | of_build.DEFAULT_ONNX_OPSET = build.DEFAULT_ONNX_OPSET 238 | 239 | of_config = of_ignition.lock_config( 240 | model=model, 241 | build_name=build_name, 242 | sequence=sequence, 243 | ) 244 | 245 | # Use default compiler flags if no flags were specified 246 | if compiler_flags is None: 247 | compiler_flags = default_compiler_flags 248 | 249 | # Use default assembler flags if no flags were specified 250 | if assembler_flags is None: 251 | assembler_flags = default_assembler_flags 252 | 253 | # Store the args that should be immutable 254 | config = build.GroqConfig( # pylint: disable=unexpected-keyword-arg 255 | build_name=of_config.build_name, 256 | auto_name=of_config.auto_name, 257 | compiler_flags=compiler_flags, 258 | assembler_flags=assembler_flags, 259 | groqview=groqview, 260 | groqcard=groqcard, 261 | topology=topology, 262 | num_chips=num_chips, 263 | sequence=of_config.sequence, 264 | onnx_opset=of_config.onnx_opset, 265 | ) 266 | 267 | return config 268 | 269 | 270 | def _validate_cached_model( 271 | config: build.GroqConfig, 272 | model_type: of_build.ModelType, 273 | state: build.GroqState, 274 | model: of_build.UnionValidModelInstanceTypes = None, 275 | inputs: Optional[Dict[str, Any]] = None, 276 | ) -> List[str]: 277 | """ 278 | Verify whether anything in the call to groqit changed 279 | We require the user to resolve the discrepancy when such a 280 | change occurs, so the purpose of this function is simply to 281 | detect these conditions and raise an appropriate error. 282 | If this function returns without raising an exception then 283 | the cached model is valid to use in the build. 284 | """ 285 | 286 | result = of_ignition.validate_cached_model( 287 | config=config, 288 | model_type=model_type, 289 | state=state, 290 | model=model, 291 | inputs=inputs, 292 | ) 293 | 294 | current_version_decoded = of_ignition.decode_version_number(groqflow_version) 295 | state_version_decoded = of_ignition.decode_version_number(state.groqflow_version) 296 | 297 | out_of_date: Union[str, bool] = False 298 | if current_version_decoded["major"] > state_version_decoded["major"]: 299 | out_of_date = "major" 300 | elif current_version_decoded["minor"] > state_version_decoded["minor"]: 301 | out_of_date = "minor" 302 | 303 | if out_of_date: 304 | msg = ( 305 | f"Your build {state.config.build_name} was previously built against " 306 | f"GroqFlow version {state.groqflow_version}, " 307 | f"however you are now using GroqFlow version {groqflow_version}. 
The previous build is " 308 | f"incompatible with this version of GroqFlow, as indicated by the {out_of_date} " 309 | "version number changing. See **docs/versioning.md** for details." 310 | ) 311 | result.append(msg) 312 | 313 | return result 314 | 315 | 316 | def load_or_make_state( 317 | config: build.GroqConfig, 318 | cache_dir: str, 319 | rebuild: str, 320 | model_type: of_build.ModelType, 321 | monitor: bool, 322 | use_sdk: bool, 323 | model: of_build.UnionValidModelInstanceTypes = None, 324 | inputs: Optional[Dict[str, Any]] = None, 325 | quantization_samples: Optional[Collection] = None, 326 | ) -> build.GroqState: 327 | """ 328 | Decide whether we can load the model from the GroqFlow model cache 329 | (return a valid State instance) or whether we need to rebuild it (return 330 | a new State instance). 331 | """ 332 | 333 | return of_ignition.load_or_make_state( 334 | config=config, 335 | cache_dir=cache_dir, 336 | rebuild=rebuild, 337 | model_type=model_type, 338 | monitor=monitor, 339 | model=model, 340 | inputs=inputs, 341 | quantization_samples=quantization_samples, 342 | state_type=build.GroqState, 343 | cache_validation_func=_validate_cached_model, 344 | extra_state_args={"use_sdk": use_sdk}, 345 | ) 346 | 347 | 348 | groq_model_type_to_sequence = { 349 | of_build.ModelType.PYTORCH: default_pytorch_sequence, 350 | of_build.ModelType.KERAS: default_keras_sequence, 351 | of_build.ModelType.ONNX_FILE: default_onnx_sequence, 352 | of_build.ModelType.HUMMINGBIRD: default_hummingbird_sequence, 353 | } 354 | 355 | groq_model_type_to_sequence_with_quantization = { 356 | of_build.ModelType.PYTORCH: pytorch_sequence_with_quantization, 357 | } 358 | 359 | groq_model_type_torch_importer_override_to_sequence = { 360 | of_build.ModelType.PYTORCH: pytorch_torch_importer_sequence, 361 | of_build.ModelType.KERAS: default_keras_sequence, 362 | of_build.ModelType.ONNX_FILE: default_onnx_sequence, 363 | of_build.ModelType.HUMMINGBIRD: default_hummingbird_sequence, 364 | } 365 | 366 | 367 | def model_intake( 368 | user_model, 369 | user_inputs, 370 | user_sequence: Optional[stage.Sequence], 371 | config: build.GroqConfig, 372 | user_quantization_samples: Optional[Collection] = None, 373 | ) -> Tuple[Any, Any, stage.Sequence, of_build.ModelType]: 374 | 375 | override_sequence_map = groq_model_type_to_sequence 376 | if build.USE_TORCH_IMPORTER: 377 | override_sequence_map = groq_model_type_torch_importer_override_to_sequence 378 | 379 | model, inputs, sequence, model_type = of_ignition.model_intake( 380 | user_model=user_model, 381 | user_inputs=user_inputs, 382 | user_sequence=user_sequence, 383 | user_quantization_samples=user_quantization_samples, 384 | override_quantization_sequence_map=groq_model_type_to_sequence_with_quantization, 385 | override_sequence_map=override_sequence_map, 386 | ) 387 | 388 | if "--auto-asm" in config.compiler_flags: 389 | sequence.stages = [ 390 | stage 391 | for stage in sequence.stages 392 | if not isinstance(stage, compile.Assemble) 393 | ] 394 | 395 | return (model, inputs, sequence, model_type) 396 | -------------------------------------------------------------------------------- /groqflow/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "4.3.1" 2 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | Copyright 2023 Groq Inc. 
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 
-------------------------------------------------------------------------------- /proof_points/README.md: --------------------------------------------------------------------------------
1 | # GroqFlow Proof Points
2 | 
3 | Proof points demonstrate how GroqFlow is able to successfully build and execute a model on Groq hardware, while maintaining model accuracy. The models are organized by category.
4 | 
5 | - Computer Vision (CV)
6 | - Natural Language Processing (NLP)
7 | - Speech Processing
8 | 
9 | ## Table of Contents
10 | 
11 | - [Prerequisites](#prerequisites)
12 | - [Support Matrix](#support-matrix)
13 | - [Computer Vision](#computer-vision)
14 | - [Natural Language Processing](#natural-language-processing)
15 | - [Speech Processing](#speech-processing)
16 | - [Running a Script](#running-a-script)
17 | - [Build and Evaluate on a Single Machine](#build-and-evaluate-on-a-single-machine)
18 | - [Build and Evaluate on Separate Machines](#build-and-evaluate-on-separate-machines)
19 | 
20 | ## Prerequisites
21 | 
22 | The following tasks are required to enable running proof point scripts:
23 | 
24 | - Download and install the GroqWare Suite packages from the [Groq Customer Portal](https://support.groq.com/):
25 | - `groq-devtools` package, for model development and builds
26 | - `groq-runtime` package, for running computations on hardware (Groq hardware must be present to install)
27 | - If building and executing a proof point on the same host machine, download and install both of the above packages.
28 | - Clone the [GroqFlow Repository](https://github.com/groq/groqflow)
29 | - Set up and activate a `groqflow` environment
30 | - Follow the [GroqFlow Installation Guide](https://github.com/groq/groqflow/blob/main/docs/install.md)
31 | - Pip install the helper files for the proof points
32 | - `pip install -e {path_to}/groqflow/demo_helpers`
33 | 
34 | ## Support Matrix
35 | 
36 | The following tables relate each proof point model to the versions of the GroqWare Suite (SDK) in which it is supported.
37 | 
38 | ### Computer Vision
39 | 
40 | | Proof Point Model | Supported SDK Version |
41 | |:------------------|:------------------------|
42 | | [DeiT-tiny](computer_vision/deit/) | >=0.9.2.1 |
43 | | [GoogLeNet](computer_vision/googlenet/) | >=0.9.2.1 |
44 | | [MobileNetV2](computer_vision/mobilenetv2/) | >=0.9.2.1 |
45 | | [ResNet50](computer_vision/resnet50/) | >=0.9.2.1 |
46 | | [SqueezeNet](computer_vision/squeezenet/) | >=0.9.2.1 |
47 | | [Yolo v6](computer_vision/yolo/) | >=0.11.0 |
48 | 
49 | ### Natural Language Processing
50 | 
51 | | Proof Point Model | Supported SDK Version(s) |
52 | |:------------------|:------------------------|
53 | | [Bert Tiny](natural_language_processing/bert/) | >=0.9.2.1 |
54 | | [Bert Base](natural_language_processing/bert/) | >=0.9.2.1 |
55 | | [Bert Quantized](natural_language_processing/bert/) | 0.9.2.1 |
56 | | [DistilBERT](natural_language_processing/distilbert/) | >=0.9.2.1 |
57 | | [ELECTRA](natural_language_processing/electra/) | >=0.9.2.1 |
58 | | [MiniLM v2](natural_language_processing/minilm/) | >=0.9.2.1 |
59 | | [RoBERTa](natural_language_processing/roberta/) | >=0.9.2.1 |
60 | 
61 | ### Speech Processing
62 | 
63 | | Proof Point Model | Supported SDK Version |
64 | |:------------------|:------------------------|
65 | | [M5](speech/m5/) | >=0.9.2.1 |
66 | 
67 | ## Running a Script
68 | 
69 | Each proof point will first build a GroqModel, and then evaluate the model on both a CPU and Groq hardware. If access to Groq hardware is available, the build and evaluation can be completed in a single run. However, a two-step process has also been provided in case resource management requires that the build and evaluation steps be carried out on separate machines. Provided here are the general steps to run a script; each proof point also has a README that describes any requirements and features specific to that model.
70 | 
71 | **Note**: Builds for large models can take several minutes. To avoid surprises, the expected build time is included in the README for each proof point.
72 | 
73 | ### Build and Evaluate on a Single Machine
74 | 
75 | Navigate to the folder containing the proof point and read the model's details in the `README`.
76 | 
77 | - Install the dependencies listed in the `requirements.txt` file.
78 | 
79 | ```bash
80 | pip install -r requirements.txt
81 | ```
82 | 
83 | - Build and evaluate the proof point:
84 | 
85 | ```bash
86 | python {proof_point_name}.py
87 | ```
88 | 
89 | ### Build and Evaluate on Separate Machines
90 | 
91 | Navigate to the folder containing the proof point and read the model's details in the `README`.
92 | 
93 | - Install the dependencies listed in the `requirements.txt` file.
94 | 
95 | ```bash
96 | pip install -r requirements.txt
97 | ```
98 | 
99 | - Build the model by running the command with the `--build` flag as shown below:
100 | 
101 | ```bash
102 | python {proof_point_name}.py --build
103 | ```
104 | 
105 | - If the model already exists in cache, it will not be rebuilt unless the model code or build changes.
106 | - Transfer the proof point script and the `.iop` files to the machine connected to Groq hardware.
107 | - The resulting build artifacts will be located in the GroqFlow cache directory for the proof point, `~/.cache/groqflow/{proof_point_name}`. These artifacts include log files, ONNX files, inputs, the YAML state file, and the compile folder.
108 | - The `.iop` files can be found within the compile folder in the cache directory. There will be a file for each card used to execute the model on Groq hardware.
Copy these files to the same location on the second machine:
109 | 
110 | `~/.cache/groqflow/{proof_point_name}/compile/*.iop`
111 | 
112 | - Once the `.iop` files are copied into the same cache directory on the second machine and the initial prerequisites are met, the script can be run a second time with the `--execute` flag to evaluate the model.
113 | 
114 | ```bash
115 | python {proof_point_name}.py --execute
116 | ```
117 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/deit/README.md: --------------------------------------------------------------------------------
1 | # DeiT
2 | 
3 | The [DeiT](https://arxiv.org/abs/2012.12877) (Data-efficient image Transformer) is a convolution-free transformer model designed for computer vision. DeiT models are efficiently trained for image classification using a novel token distillation process that can learn more from a convolutional teacher model than a transformer teacher. DeiT models also require less data to train than the original Vision Transformers [(ViT)](https://arxiv.org/abs/2010.11929v2).
4 | 
5 | This proof point obtains a [pre-trained DeiT-tiny](https://huggingface.co/facebook/deit-tiny-patch16-224) from Hugging Face for the task of image classification. The model implementations are evaluated using the 10-class [Imagenette dataset](https://github.com/fastai/imagenette), which is a sampling from the [ImageNet dataset](https://www.image-net.org/). The tiny version of the DeiT model illustrates the ability of GroqFlow™ and the GroqWare™ Suite to support all of the necessary operations used to build and run the DeiT models.
6 | 
7 | ## Prerequisites
8 | 
9 | - Ensure you've completed the install prerequisites:
10 | - Installed GroqWare™ Suite
11 | - Installed GroqFlow
12 | - Installed Groq Demo Helpers
13 | - For more information on these steps, see the [Proof Points README](../../README.md).
14 | - Install the python dependencies from the requirements.txt file included with this proof point using the following command:
15 | 
16 | ```bash
17 | pip install -r requirements.txt
18 | ```
19 | 
20 | ## Build and Evaluate
21 | 
22 | To build and evaluate DeiT-tiny:
23 | 
24 | ```bash
25 | python deit_tiny.py
26 | ```
27 | 
28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines.
29 | 
30 | ## Expected Results
31 | 
32 | It takes approximately 4 minutes for DeiT-tiny to build and about 2 minutes to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation on a GroqCard™ accelerator.
33 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/deit/deit_tiny.py: --------------------------------------------------------------------------------
1 | """
2 | The following example downloads a pre-trained DeiT tiny from Hugging
3 | Face (https://huggingface.co/facebook/deit-tiny-patch16-224) and
4 | executes against Imagenette, the 10-class, sampled ImageNet
5 | dataset (https://github.com/fastai/imagenette) on CPU and
6 | GroqChip™ processor by using the GroqFlow toolchain.
7 | """ 8 | import torch 9 | from transformers import ViTForImageClassification 10 | 11 | from groqflow import groqit 12 | from demo_helpers.compute_performance import compute_performance 13 | from demo_helpers.args import parse_args 14 | 15 | 16 | def evaluate_deit_tiny(rebuild_policy=None, should_execute=True): 17 | # load torch model 18 | model = ViTForImageClassification.from_pretrained( 19 | "facebook/deit-tiny-patch16-224", torchscript=True 20 | ) 21 | model.eval() 22 | 23 | # create dummy inputs to prime groq model 24 | dummy_inputs = {"pixel_values": torch.ones([1, 3, 224, 224])} 25 | 26 | # generate groq model 27 | groq_model = groqit(model, dummy_inputs, rebuild=rebuild_policy) 28 | 29 | # compute performance on CPU and GroqChip 30 | if should_execute: 31 | compute_performance( 32 | groq_model, 33 | model, 34 | dataset="sampled_imagenet", 35 | task="classification", 36 | ) 37 | print(f"Proof point {__file__} finished!") 38 | 39 | 40 | if __name__ == "__main__": 41 | evaluate_deit_tiny(**parse_args()) 42 | -------------------------------------------------------------------------------- /proof_points/computer_vision/deit/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | transformers>=4.20.0 3 | -------------------------------------------------------------------------------- /proof_points/computer_vision/googlenet/README.md: -------------------------------------------------------------------------------- 1 | # GoogLeNet 2 | 3 | [GoogLeNet](https://arxiv.org/abs/1409.4842v1) is the convolutional neural network (CNN) based on the Inception architecture that received top marks in the ImageNet Large-Scale Visual Recognition Challenge 2014 ([ILSVRC 2014](https://www.image-net.org/challenges/LSVRC/2014/)). The stacked Inception modules applied multiple convolutional filter sizes (1x1, 3x3, & 5x5) before aggregating the results so that the next stage could simultaneously extract features of varying scale. The number of parameters and computational complexity were kept in check by using 1x1 convolution layers before the larger filters to reduce the layer dimensions before convolving over large patch sizes. 4 | 5 | In this proof point, GoogleNet is used for the task of image classification and evaluated using the Imagenette [dataset](https://github.com/fastai/imagenette), a 10 class, sampled version of the ImageNet [dataset](https://www.image-net.org/). The model weights are downloaded from the [PyTorch website](https://pytorch.org/hub/pytorch_vision_googlenet/). 6 | 7 | ## Prerequisites 8 | 9 | - Ensure you've completed the install prerequisites: 10 | - Installed GroqWare™ Suite 11 | - Installed GroqFlow 12 | - Installed Groq Demo Helpers 13 | - For more information on these steps, see the [Proof Points README](../../README.md). 14 | - Install the python dependencies using the requirements.txt file included with this proof point using the following command: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Build and Evaluate 21 | 22 | To build and evaluate GoogLeNet: 23 | 24 | ```bash 25 | python googlenet.py 26 | ``` 27 | 28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 29 | 30 | ## Expected Results 31 | 32 | It takes approximately 6 minutes for GoogLeNet to build and about 2 minutes to evaluate the implementation accuracies. 
The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation on a GroqCard™ accelerator.
33 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/googlenet/googlenet.py: --------------------------------------------------------------------------------
1 | """
2 | The following example takes a pre-trained GoogLeNet model
3 | (https://pytorch.org/hub/pytorch_vision_googlenet/) and
4 | executes against Imagenette, the 10-class, sampled ImageNet
5 | dataset (https://github.com/fastai/imagenette) on CPU and
6 | GroqChip™ processor by using the GroqFlow toolchain.
7 | """
8 | import torch
9 | 
10 | from demo_helpers.compute_performance import compute_performance
11 | from demo_helpers.args import parse_args
12 | from groqflow import groqit
13 | 
14 | 
15 | def evaluate_googlenet(rebuild_policy=None, should_execute=None):
16 | # set seed for consistency
17 | torch.manual_seed(0)
18 | 
19 | # load torch model
20 | torch_model = torch.hub.load(
21 | "pytorch/vision:v0.10.0", "googlenet", weights="GoogLeNet_Weights.DEFAULT"
22 | )
23 | torch_model.eval() # disable training specific layers
24 | 
25 | # create dummy input to prime groq model
26 | dummy_inputs = torch.randn((1, 3, 224, 224), dtype=torch.float32)
27 | 
28 | # generate groq model
29 | build_name = "googlenet"
30 | groq_model = groqit(
31 | torch_model,
32 | {"x": dummy_inputs},
33 | rebuild=rebuild_policy,
34 | build_name=build_name,
35 | )
36 | 
37 | # compute performance on CPU and GroqChip
38 | if should_execute:
39 | compute_performance(
40 | groq_model, torch_model, "sampled_imagenet", task="classification"
41 | )
42 | 
43 | print(f"Proof point {__file__} finished!")
44 | 
45 | 
46 | if __name__ == "__main__":
47 | evaluate_googlenet(**parse_args())
48 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/googlenet/requirements.txt: --------------------------------------------------------------------------------
1 | torch>=1.12.0
2 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/mobilenetv2/README.md: --------------------------------------------------------------------------------
1 | # MobileNetV2
2 | 
3 | [MobileNetV2](https://arxiv.org/abs/1801.04381) is a CNN model that was designed to perform well on mobile devices. The architecture makes use of an inverted residual structure where the residual connections are between the bottleneck layers. The intermediate expansion layer uses lightweight depthwise convolutions to filter features as a source of non-linearity and to reduce the memory footprint of the model.
4 | 
5 | This proof point uses a [MobileNet V2 model](https://pytorch.org/hub/pytorch_vision_mobilenet_v2/) pre-trained on the [ImageNet dataset](https://www.image-net.org/) and downloaded from PyTorch's model hub. The model is evaluated on the sampled, 10 class version of the ImageNet dataset, [Imagenette](https://github.com/fastai/imagenette).
6 | 
7 | ## Prerequisites
8 | 
9 | - Ensure you've completed the install prerequisites:
10 | - Installed the GroqWare™ Suite
11 | - Installed GroqFlow
12 | - Installed Groq Demo Helpers
13 | - For more information on these steps, see the [Proof Points README](../../README.md).
14 | - Install the python dependencies from the requirements.txt file included with this proof point using the following command:
15 | 
16 | ```bash
17 | pip install -r requirements.txt
18 | ```
19 | 
20 | ## Build and Evaluate
21 | 
22 | To build and evaluate MobileNetV2:
23 | 
24 | ```bash
25 | python mobilenetv2.py
26 | ```
27 | 
28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines.
29 | 
30 | ## Expected Results
31 | 
32 | It takes approximately 12 minutes for MobileNetV2 to build and about 2 minutes to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation using a single GroqCard™ accelerator.
33 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/mobilenetv2/mobilenetv2.py: --------------------------------------------------------------------------------
1 | """
2 | The following example takes a pre-trained MobileNetV2 model from
3 | https://pytorch.org/hub/pytorch_vision_mobilenet_v2/, and
4 | executes against the 10-class, sampled ImageNet dataset, Imagenette
5 | (https://github.com/fastai/imagenette) on CPU and GroqChip™
6 | processor by using the GroqFlow toolchain.
7 | """
8 | 
9 | import torch
10 | 
11 | from demo_helpers.compute_performance import compute_performance
12 | from demo_helpers.args import parse_args
13 | from groqflow import groqit
14 | 
15 | 
16 | def evaluate_mobilenetv2(rebuild_policy=None, should_execute=None):
17 | # set seed for consistency
18 | torch.manual_seed(0)
19 | 
20 | # load torch model
21 | torch_model = torch.hub.load(
22 | "pytorch/vision:v0.10.0",
23 | "mobilenet_v2",
24 | weights="MobileNet_V2_Weights.IMAGENET1K_V1",
25 | )
26 | torch_model.eval() # disable normalization and dropout layers
27 | 
28 | # create dummy input to prime groq model
29 | dummy_inputs = torch.randn((1, 3, 224, 224), dtype=torch.float32)
30 | 
31 | # generate groq model
32 | build_name = "mobilenetv2"
33 | groq_model = groqit(
34 | torch_model,
35 | {"x": dummy_inputs},
36 | rebuild=rebuild_policy,
37 | build_name=build_name,
38 | )
39 | 
40 | # compute performance on CPU and GroqChip
41 | if should_execute:
42 | compute_performance(
43 | groq_model, torch_model, "sampled_imagenet", task="classification"
44 | )
45 | 
46 | print(f"Proof point {__file__} finished!")
47 | 
48 | 
49 | if __name__ == "__main__":
50 | evaluate_mobilenetv2(**parse_args())
51 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/mobilenetv2/requirements.txt: --------------------------------------------------------------------------------
1 | torch>=1.12.0
2 | 
-------------------------------------------------------------------------------- /proof_points/computer_vision/resnet50/README.md: --------------------------------------------------------------------------------
1 | # ResNet50
2 | 
3 | ResNet50 is a Convolutional Neural Network (CNN) model used for image classification. Kaiming He, et al. first introduced ResNet models and the revolutionary residual connection (also known as a skip connection) in their 2015 paper, [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). The residual connection enables easier optimization and better accuracy while training deep models.
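The idea is easy to see in code. Below is a minimal, illustrative sketch (not taken from this repository) of a residual block: the block's input is added back to the output of its convolution layers, so the layers only need to learn a residual correction on top of the identity.

```python
import torch
import torch.nn as nn


class ResidualBlock(nn.Module):
    """A simplified residual block: output = relu(F(x) + x)."""

    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x  # the skip connection carries the input forward
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + residual)  # add the input back in
```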
4 | 5 | This proof point uses a [ResNet50 model](https://pytorch.org/hub/pytorch_vision_resnet/) pre-trained on the [ImageNet dataset](https://www.image-net.org/) and downloaded from PyTorch's model hub. The model is evaluated on the sampled, 10-class version of the ImageNet dataset, [Imagenette](https://github.com/fastai/imagenette). 6 | 7 | ## Prerequisites 8 | 9 | - Ensure you've completed the install prerequisites: 10 | - Installed the GroqWare™ Suite 11 | - Installed GroqFlow 12 | - Installed Groq Demo Helpers 13 | - For more information on these steps, see the [Proof Points README](../../README.md). 14 | - Install the python dependencies for this proof point with the following: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Build and Evaluate 21 | 22 | To build and evaluate ResNet50: 23 | 24 | ```bash 25 | python resnet50.py 26 | ``` 27 | 28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 29 | 30 | ## Expected Results 31 | 32 | It takes approximately 18 minutes for ResNet50 to build and about 3 minutes to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation using a single GroqCard™ accelerator. 33 | -------------------------------------------------------------------------------- /proof_points/computer_vision/resnet50/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | -------------------------------------------------------------------------------- /proof_points/computer_vision/resnet50/resnet50.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained ResNet-50 from torchvision and executes it against 3 | Imagenette, the sampled ImageNet dataset, on CPU and GroqChip1 through GroqFlow. 4 | """ 5 | 6 | from groqflow import groqit 7 | from demo_helpers.args import parse_args 8 | from demo_helpers.compute_performance import compute_performance 9 | 10 | import torch 11 | 12 | 13 | def get_model(): 14 | """PyTorch Model setup.""" 15 | pytorch_model = torch.hub.load( 16 | "pytorch/vision:v0.10.0", "resnet50", weights="ResNet50_Weights.IMAGENET1K_V1" 17 | ) 18 | return pytorch_model.eval() 19 | 20 | 21 | def evaluate_resnet50(rebuild_policy=None, should_execute=True): 22 | pytorch_model = get_model() 23 | dummy_inputs = {"x": torch.ones([1, 3, 224, 224])} 24 | 25 | # Get Groq Model using groqit 26 | groq_model = groqit(pytorch_model, dummy_inputs, rebuild=rebuild_policy) 27 | 28 | # Execute PyTorch model on CPU, Groq Model and print accuracy 29 | if should_execute: 30 | compute_performance( 31 | groq_model, pytorch_model, "sampled_imagenet", task="classification" 32 | ) 33 | 34 | print(f"Proof point {__file__} finished!") 35 | 36 | 37 | if __name__ == "__main__": 38 | evaluate_resnet50(**parse_args()) 39 | -------------------------------------------------------------------------------- /proof_points/computer_vision/squeezenet/README.md: -------------------------------------------------------------------------------- 1 | # SqueezeNet 2 | 3 | [SqueezeNet](https://arxiv.org/abs/1602.07360?context=cs) is advertised as a small convolutional neural network (CNN) that achieves "AlexNet level accuracy on ImageNet with 50x fewer parameters" as quoted in the linked paper. SqueezeNet models are highly efficient in terms of size and speed while providing relatively good accuracies.
This makes them ideal for platforms with strict constraints on size. 4 | 5 | In this proof point, SqueezeNet performs image classification. It is evaluated on the [Imagenette dataset](https://github.com/fastai/imagenette), a sampled, 10-class version of the [ImageNet dataset](https://www.image-net.org/). The model weights will be downloaded from the [PyTorch website](https://pytorch.org/hub/pytorch_vision_squeezenet/). 6 | 7 | ## Prerequisites 8 | 9 | - Ensure you've completed the install prerequisites: 10 | - Installed the GroqWare™ Suite 11 | - Installed GroqFlow 12 | - Installed Groq Demo Helpers 13 | - For more information on these steps, see the [Proof Points README](../../README.md). 14 | - Install the python dependencies for this proof point with the following: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Build and Evaluate 21 | 22 | To build and evaluate SqueezeNet: 23 | 24 | ```bash 25 | python squeezenet.py 26 | ``` 27 | 28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 29 | 30 | ## Expected Results 31 | 32 | It takes approximately 5 minutes for SqueezeNet to build and about 1 minute to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation using a single GroqCard™ accelerator. 33 | -------------------------------------------------------------------------------- /proof_points/computer_vision/squeezenet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.1 2 | -------------------------------------------------------------------------------- /proof_points/computer_vision/squeezenet/squeezenet.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained SqueezeNet model and 3 | executes against the Imagenette dataset on a CPU and GroqChip™ processor 4 | by using the GroqFlow toolchain.
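
Note: groqit matches the keys of the input dictionary to the argument names of the model's forward() method, which is why the dummy input below is passed as {"x": dummy_inputs}; SqueezeNet's forward() takes a single tensor argument named x.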
5 | """ 6 | 7 | import torch 8 | 9 | from demo_helpers.compute_performance import compute_performance 10 | from demo_helpers.args import parse_args 11 | from groqflow import groqit 12 | 13 | 14 | def evaluate_squeezenet(rebuild_policy=None, should_execute=None): 15 | # set seed for consistency 16 | torch.manual_seed(0) 17 | 18 | # load torch model 19 | torch_model = torch.hub.load( 20 | "pytorch/vision:v0.10.0", 21 | "squeezenet1_0", 22 | weights="SqueezeNet1_0_Weights.DEFAULT", 23 | ).eval()  # disable dropout layers 24 | 25 | # create dummy inputs to prime groq model 26 | dummy_inputs = torch.randn((1, 3, 224, 224), dtype=torch.float32) 27 | 28 | # generate groq model 29 | build_name = "squeezenet" 30 | groq_model = groqit( 31 | torch_model, 32 | {"x": dummy_inputs}, 33 | rebuild=rebuild_policy, 34 | build_name=build_name, 35 | ) 36 | 37 | # compute performance on CPU and GroqChip 38 | if should_execute: 39 | compute_performance( 40 | groq_model, torch_model, "sampled_imagenet", task="classification" 41 | ) 42 | 43 | print(f"Proof point {__file__} finished!") 44 | 45 | 46 | if __name__ == "__main__": 47 | evaluate_squeezenet(**parse_args()) 48 | -------------------------------------------------------------------------------- /proof_points/computer_vision/yolo/README.md: -------------------------------------------------------------------------------- 1 | # YOLO v6 2 | 3 | YOLOv6 is a Convolutional Neural Network (CNN) model used for [Object Detection](https://en.wikipedia.org/wiki/Object_detection). It is an extension of the original YOLO model developed by [Joseph Redmon](https://pjreddie.com/), et al. in their 2015 paper, [You Only Look Once: Unified, Real-Time Object Detection](https://arxiv.org/abs/1506.02640). The key innovation of YOLO is its improved inference speed and computational efficiency compared to other object detection models. YOLO locates the objects in an image and classifies them in a single "look". Other state-of-the-art object detection models use a multi-module approach, which requires separate steps: one to identify possible objects and another to classify the located objects. Redmon argued that this required multiple "looks" at an image, and while it could achieve good results, the resulting models were larger, more computationally intensive, and therefore slower. 4 | 5 | This variation of YOLO was released by the Meituan Vision AI Department and [published on GitHub](https://github.com/meituan/YOLOv6) in different sizes ranging from YOLOv6-nano at 4.3M parameters to YOLOv6-large at 58.5M parameters. This proof point compiles the YOLOv6-nano model for an input size of 640 x 640 pixels. 6 | 7 | This proof point evaluates YOLOv6-nano on the [COCO dataset](https://cocodataset.org/). The success of the model is measured using the "mAP @ 0.5:0.95" metric, which computes an average mAP (Mean Average Precision) using different IoU (Intersection over Union) thresholds varying from 0.5 to 0.95. An explanation of this evaluation method can also be found at the COCO website under the [Evaluate tab](https://cocodataset.org/#detection-eval). 8 | 9 | ## Prerequisites 10 | 11 | - Ensure you've completed the install prerequisites: 12 | - Installed the GroqWare™ Suite 13 | - Installed GroqFlow 14 | - Installed Groq Demo Helpers 15 | - For more information on these steps, see the [Proof Points README](../../README.md).
16 | - Install the python dependencies for this proof point with the following: 17 | 18 | ```bash 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | ## Build and Evaluate 23 | 24 | To build and evaluate YOLOv6-nano: 25 | 26 | ```bash 27 | python yolov6_nano.py 28 | ``` 29 | 30 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 31 | 32 | ## Expected Results 33 | 34 | It takes approximately 60 minutes for YOLOv6 to build and about 10 minutes to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation using a single GroqCard™ accelerator. 35 | -------------------------------------------------------------------------------- /proof_points/computer_vision/yolo/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python>=4.1.2 2 | pycocotools>=2.0 3 | -------------------------------------------------------------------------------- /proof_points/computer_vision/yolo/yolov6_nano.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained YOLOv6 model 3 | (https://github.com/meituan/YOLOv6) and executes against 4 | the COCO dataset (https://cocodataset.org/) on CPU and 5 | GroqChip™ processor using the GroqFlow toolchain. 6 | """ 7 | import torch 8 | 9 | from groqflow import groqit 10 | from demo_helpers.args import parse_args 11 | from demo_helpers.compute_performance import compute_performance 12 | from demo_helpers.models import get_yolov6n_model 13 | from demo_helpers.misc import check_deps 14 | 15 | 16 | def evaluate_yolov6n(rebuild_policy=None, should_execute=True): 17 | check_deps(__file__) 18 | model = get_yolov6n_model() 19 | dummy_inputs = {"images": torch.ones([1, 3, 640, 640])} 20 | 21 | # Get Groq Model using groqit 22 | groq_model = groqit( 23 | model, 24 | dummy_inputs, 25 | rebuild=rebuild_policy, 26 | compiler_flags=["--effort=high"], 27 | ) 28 | if should_execute: 29 | compute_performance(groq_model, model, "coco", task="coco_map") 30 | 31 | print(f"Proof point {__file__} finished!") 32 | 33 | 34 | if __name__ == "__main__": 35 | evaluate_yolov6n(**parse_args()) 36 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/bert/README.md: -------------------------------------------------------------------------------- 1 | # BERT 2 | 3 | This folder contains proof points that demonstrate two variants of the Natural Language Processing (NLP) model [BERT](https://arxiv.org/pdf/1810.04805.pdf): BERT-tiny and BERT-base. BERT is a bidirectional transformer architecture pretrained using Masked Language Modeling. The success of these proof points illustrates the ability of GroqFlow and the GroqWare™ Suite to support both the operations and size of the classic transformer architecture used by BERT models. 4 | 5 | BERT-tiny is a small (tiny, even!) variant of the BERT architecture. The paper, [Well-Read Students Learn Better: On the Importance of Pre-training Compact Models](https://arxiv.org/pdf/1908.08962.pdf), introduces BERT-tiny along with other BERT variants of reduced size: BERT-mini, BERT-small, and BERT-medium.
They are studied further in the paper [Generalization in NLI: Ways (Not) To Go Beyond Simple Heuristics](https://arxiv.org/pdf/2110.01518.pdf). 6 | 7 | The BERT-tiny proof point uses a model fine-tuned on the [Stanford Sentiment Treebank (SST) dataset](https://paperswithcode.com/dataset/sst), loaded from [Huggingface](https://huggingface.co/M-FAC/bert-tiny-finetuned-sst2), to perform [Sentiment Classification](https://paperswithcode.com/task/sentiment-analysis). 8 | 9 | The BERT-base proof point also uses a pre-trained model that is fine-tuned on the SST dataset for Sentiment Classification. [Huggingface](https://huggingface.co/howey/bert-base-uncased-sst2) provides the BERT-base model. 10 | 11 | The BERT-quantize proof point performs post-training quantization on the BERT-base model specified above. 1000 data samples from the SST Sentiment Classification dataset are chosen at random to determine the quantization parameters. 12 | 13 | ## Prerequisites 14 | 15 | - Ensure you've completed the install prerequisites: 16 | - Installed GroqWare™ Suite 17 | - Installed GroqFlow 18 | - Installed Groq Demo Helpers 19 | - For more information on these steps, see the [Proof Points README](../../README.md). 20 | - Install the python dependencies with the requirements.txt file included with this proof point, using the following command: 21 | 22 | ```bash 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ## Build and Evaluate 27 | 28 | To build and evaluate BERT-tiny: 29 | 30 | ```bash 31 | python bert_tiny.py 32 | ``` 33 | 34 | To build and evaluate BERT-base: 35 | 36 | ```bash 37 | python bert_base.py 38 | ``` 39 | 40 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 41 | 42 | ## Expected Results 43 | 44 | Each script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation. The table below details the approximate time to run each part of the script and the required number of GroqCard™ accelerators. 45 | 46 | | Proof Point Model | Approx Build Time | Approx Evaluation Time | Num of GroqCard™ Accelerators | 47 | |:-----------|:--------|:---------|:----------| 48 | | BERT-tiny | 1 min | 30 sec | 1 | 49 | | BERT-base | 15 min | 4 min | 4 | 50 | | BERT-quantize | 17 min | 4 min | 4 | 51 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/bert/bert_base.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained BERT model from Hugging Face 3 | (https://huggingface.co/howey/bert-base-uncased-sst2) and 4 | executes against the SST dataset (https://paperswithcode.com/dataset/sst) 5 | on CPU and GroqCard™ accelerators using the GroqFlow toolchain.
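
Note that groqit builds the model from fixed-shape dummy tensors (batch_size=1, max_seq_length=128), so real inputs must be padded or truncated to that same shape. A tokenizer call that produces matching tensors might look like this (an illustrative sketch, not a line from this script):

    inputs = tokenizer(
        "a movie review",
        padding="max_length",
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )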
6 | """ 7 | import os 8 | import numpy as np 9 | import torch 10 | import transformers 11 | from groqflow import groqit 12 | 13 | from demo_helpers.compute_performance import compute_performance 14 | from demo_helpers.args import parse_args 15 | 16 | 17 | def get_model(): 18 | """PyTorch Model setup.""" 19 | pretrained_model_name = "howey/bert-base-uncased-sst2" 20 | 21 | tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name) 22 | pytorch_model = transformers.AutoModelForSequenceClassification.from_pretrained( 23 | pretrained_model_name, torchscript=True 24 | ) 25 | 26 | return pytorch_model.eval(), tokenizer 27 | 28 | 29 | def evaluate_bert(rebuild_policy=None, should_execute=True): 30 | # set seed for consistency 31 | np.random.seed(1) 32 | torch.manual_seed(0) 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | # load pre-trained torch model 36 | pytorch_model, tokenizer = get_model() 37 | 38 | # dummy inputs to generate the groq model 39 | batch_size = 1 40 | max_seq_length = 128 41 | dummy_inputs = { 42 | "input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long), 43 | "attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.bool), 44 | } 45 | 46 | # generate groq model 47 | groq_model = groqit(pytorch_model, dummy_inputs, rebuild=rebuild_policy) 48 | 49 | # compute performance on CPU and GroqChip 50 | if should_execute: 51 | compute_performance( 52 | groq_model, 53 | pytorch_model, 54 | dataset="sst", 55 | tokenizer=tokenizer, 56 | max_seq_length=max_seq_length, 57 | task="classification", 58 | ) 59 | 60 | print(f"Proof point {__file__} finished!") 61 | 62 | 63 | if __name__ == "__main__": 64 | evaluate_bert(**parse_args()) 65 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/bert/bert_quantize.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained BERT model from Hugging Face 3 | (https://huggingface.co/howey/bert-base-uncased-sst2), performs 4 | post-training quantization on the exported ONNX model, and 5 | executes against the SST dataset (https://paperswithcode.com/dataset/sst) 6 | on CPU and GroqCard™ accelerator using the GroqFlow toolchain.
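
The calibration samples are drawn from the SST training data (see get_sst_quantization_samples below) and passed to groqit through its quantization_samples argument. Note that input_ids is created with dtype torch.int32 here, rather than the torch.long used by bert_base.py, and evaluation correspondingly uses the "sst-int32" variant of the dataset.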
7 | """ 8 | 9 | import os 10 | import numpy as np 11 | import torch 12 | import transformers 13 | from groqflow import groqit 14 | 15 | from demo_helpers.compute_performance import compute_performance 16 | from demo_helpers.args import parse_args 17 | from demo_helpers.dataset import get_sst_quantization_samples 18 | 19 | from datasets import logging 20 | 21 | logging.set_verbosity(logging.ERROR) 22 | 23 | 24 | def get_model(): 25 | """PyTorch Model setup.""" 26 | pretrained_model_name = "howey/bert-base-uncased-sst2" 27 | 28 | tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name) 29 | pytorch_model = transformers.AutoModelForSequenceClassification.from_pretrained( 30 | pretrained_model_name 31 | ) 32 | 33 | return pytorch_model.eval(), tokenizer 34 | 35 | 36 | def evaluate_bert(rebuild_policy=None, should_execute=True): 37 | # set seed for consistency 38 | np.random.seed(1) 39 | torch.manual_seed(0) 40 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 41 | 42 | # load pre-trained torch model 43 | pytorch_model, tokenizer = get_model() 44 | 45 | # dummy inputs to generate the groq model 46 | batch_size = 1 47 | max_seq_length = 128 48 | dummy_inputs = { 49 | "input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.int32), 50 | "attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.bool), 51 | } 52 | 53 | # process quantization sample data 54 | x_train = get_sst_quantization_samples() 55 | 56 | # generate groq model 57 | groq_model = groqit( 58 | pytorch_model, 59 | dummy_inputs, 60 | rebuild=rebuild_policy, 61 | quantization_samples=x_train, 62 | compiler_flags=["--large-program"], 63 | ) 64 | 65 | if should_execute: 66 | compute_performance( 67 | groq_model, 68 | pytorch_model, 69 | dataset="sst-int32", 70 | tokenizer=tokenizer, 71 | max_seq_length=max_seq_length, 72 | task="classification", 73 | ) 74 | 75 | print(f"Proof point {__file__} finished!") 76 | 77 | 78 | if __name__ == "__main__": 79 | evaluate_bert(**parse_args()) 80 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/bert/bert_tiny.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained BERT-tiny model from Hugging Face 3 | (https://huggingface.co/M-FAC/bert-tiny-finetuned-sst2) and 4 | executes against the SST dataset (https://paperswithcode.com/dataset/sst) 5 | on CPU and a GroqCard™ accelerator using the GroqFlow toolchain.
6 | """ 7 | import os 8 | import numpy as np 9 | import torch 10 | import transformers 11 | from groqflow import groqit 12 | 13 | from demo_helpers.compute_performance import compute_performance 14 | from demo_helpers.args import parse_args 15 | 16 | 17 | def get_model(): 18 | """PyTorch Model setup.""" 19 | pretrained_model_name = "M-FAC/bert-tiny-finetuned-sst2" 20 | 21 | tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name) 22 | pytorch_model = transformers.AutoModelForSequenceClassification.from_pretrained( 23 | pretrained_model_name, torchscript=True 24 | ) 25 | 26 | return pytorch_model.eval(), tokenizer 27 | 28 | 29 | def evaluate_bert_tiny(rebuild_policy=None, should_execute=True): 30 | # set seed for consistency 31 | np.random.seed(1) 32 | torch.manual_seed(0) 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | # load pre-trained torch model 36 | pytorch_model, tokenizer = get_model() 37 | 38 | # dummy inputs to generate the groq model 39 | batch_size = 1 40 | max_seq_length = 128 41 | dummy_inputs = { 42 | "input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long), 43 | "attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.bool), 44 | } 45 | 46 | # generate groq model 47 | groq_model = groqit(pytorch_model, dummy_inputs, rebuild=rebuild_policy) 48 | 49 | # compute performance on CPU and GroqChip 50 | if should_execute: 51 | compute_performance( 52 | groq_model, 53 | pytorch_model, 54 | dataset="sst", 55 | tokenizer=tokenizer, 56 | max_seq_length=max_seq_length, 57 | task="classification", 58 | ) 59 | 60 | print(f"Proof point {__file__} finished!") 61 | 62 | 63 | if __name__ == "__main__": 64 | evaluate_bert_tiny(**parse_args()) 65 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/bert/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.21.6 2 | torch>=1.12.1 3 | transformers>=4.20.0 4 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/distilbert/README.md: -------------------------------------------------------------------------------- 1 | # DistilBERT 2 | 3 | [DistilBERT](https://arxiv.org/pdf/1910.01108.pdf) is a [distilled model](https://arxiv.org/pdf/1503.02531.pdf) using the [BERT model](https://arxiv.org/abs/1810.04805) as the teacher. DistilBERT has the same general architecture as BERT, except that half the layers are removed, along with the pooler and the token-type embeddings. This reduction in size allows the model to train faster and requires much less memory and power to run. DistilBERT boasts that it retains 97% of the BERT model's scores with 40% fewer parameters. 4 | 5 | In this proof point, DistilBERT performs the task of [Sentiment Classification](https://paperswithcode.com/task/sentiment-analysis) and is evaluated using the Stanford Sentiment Treebank [(SST) dataset](https://paperswithcode.com/dataset/sst). The model weights are downloaded from the [Hugging Face website](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english). 6 | 7 | ## Prerequisites 8 | 9 | - Ensure you've completed the install prerequisites: 10 | - Installed GroqWare™ Suite 11 | - Installed GroqFlow 12 | - Installed Groq Demo Helpers 13 | - For more information on these steps, see the [Proof Points README](../../README.md).
14 | - Install the python dependencies with the requirements.txt file included with this proof point, using the following command: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Build and Evaluate 21 | 22 | To build and evaluate DistilBERT: 23 | 24 | ```bash 25 | python distilbert.py 26 | ``` 27 | 28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 29 | 30 | ## Expected Results 31 | 32 | It takes approximately 8 minutes for DistilBERT to build and about 2 minutes to evaluate the model's accuracy. The example returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation on 4 GroqCard™ accelerators within a GroqNode™ server. 33 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/distilbert/distilbert.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from transformers import AutoTokenizer, DistilBertForSequenceClassification 4 | import torch 5 | from demo_helpers.compute_performance import compute_performance 6 | from demo_helpers.args import parse_args 7 | 8 | from groqflow import groqit 9 | 10 | 11 | def evaluate_distilbert(rebuild_policy=None, should_execute=True): 12 | # set seed for consistency 13 | np.random.seed(1) 14 | torch.manual_seed(0) 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | 17 | # load pre-trained torch model 18 | pretrained_model = "distilbert-base-uncased-finetuned-sst-2-english" 19 | tokenizer = AutoTokenizer.from_pretrained(pretrained_model) 20 | pytorch_model = DistilBertForSequenceClassification.from_pretrained( 21 | pretrained_model, torchscript=True 22 | ).eval()  # disable dropout layers 23 | 24 | # dummy inputs to generate the groq model 25 | batch_size = 1 26 | max_seq_length = 128 27 | 28 | dummy_inputs = { 29 | "input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long), 30 | "attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.bool), 31 | } 32 | 33 | # generate groq model 34 | build_name = "distilbert" 35 | groq_model = groqit( 36 | pytorch_model, 37 | dummy_inputs, 38 | rebuild=rebuild_policy, 39 | build_name=build_name, 40 | num_chips=4, 41 | compiler_flags=["--partition-mode=group-fit"], 42 | ) 43 | 44 | # compute performance on CPU and GroqChip 45 | if should_execute: 46 | compute_performance( 47 | groq_model, 48 | pytorch_model, 49 | dataset="sst", 50 | tokenizer=tokenizer, 51 | max_seq_length=max_seq_length, 52 | task="classification", 53 | ) 54 | 55 | print(f"Proof point {__file__} finished!") 56 | 57 | 58 | if __name__ == "__main__": 59 | evaluate_distilbert(**parse_args()) 60 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/distilbert/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | transformers>=4.20.0 3 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/electra/README.md: -------------------------------------------------------------------------------- 1 | # ELECTRA 2 | 3 | [ELECTRA](https://openreview.net/pdf?id=r1xMH1BtvB) uses a self-supervised pre-training method for language representation learning that is similar to a [Generative Adversarial Network (GAN)](https://en.wikipedia.org/wiki/Generative_adversarial_network), without the adversarial
part. During pre-training, instead of masking input tokens and learning to predict them, as many other NLP models do, a small generative network replaces a few input tokens with tokens of similar meaning. Then, the edited input is fed into a discriminator network that learns to differentiate between the original and replacement tokens. After training, the generator network is discarded and the discriminator network is used for inference. With this architecture and training method, ELECTRA boasts that it learns more efficiently and matches or outperforms, in terms of accuracy, models that learn only the masked tokens. 4 | 5 | In this proof point, an ELECTRA model fine-tuned on the [Stanford Sentiment Treebank (SST) dataset](https://paperswithcode.com/dataset/sst), loaded from [Huggingface](https://huggingface.co/howey/electra-base-sst2), performs the task of [Sentiment Classification](https://paperswithcode.com/task/sentiment-analysis). 6 | 7 | ## Prerequisites 8 | 9 | - Ensure you've completed the install prerequisites: 10 | - Installed the GroqWare™ Suite 11 | - Installed GroqFlow 12 | - Installed Groq Demo Helpers 13 | - For more information on these steps, see the [Proof Points README](../../README.md). 14 | - Install the python dependencies with the requirements.txt file included with this proof point, using the following command: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Build and Evaluate 21 | 22 | To build and evaluate ELECTRA: 23 | 24 | ```bash 25 | python electra.py 26 | ``` 27 | 28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 29 | 30 | ## Expected Results 31 | 32 | It takes approximately 15 minutes for ELECTRA to build and about 4 minutes to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation on 4 GroqCard™ accelerators within a GroqNode™ server. 33 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/electra/electra.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained ELECTRA base model from the 3 | huggingface models repository and executes against the SST dataset on CPU 4 | and GroqChip1 through GroqFlow.
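
The model is loaded with torchscript=True below so that it returns plain tuples instead of transformers' ModelOutput objects, which is friendlier to the tracing that groqit performs when exporting the model.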
5 | """ 6 | import os 7 | import transformers 8 | from groqflow import groqit 9 | import torch 10 | import numpy as np 11 | 12 | from demo_helpers.compute_performance import compute_performance 13 | from demo_helpers.args import parse_args 14 | 15 | 16 | def evaluate_electra(rebuild_policy=None, should_execute=True): 17 | # set seed for consistency 18 | np.random.seed(1) 19 | torch.manual_seed(0) 20 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 21 | 22 | # load pre-trained torch model 23 | pretrained_model_name = "howey/electra-base-sst2" 24 | 25 | tokenizer = transformers.ElectraTokenizerFast.from_pretrained(pretrained_model_name) 26 | pytorch_model = transformers.ElectraForSequenceClassification.from_pretrained( 27 | pretrained_model_name, torchscript=True 28 | ) 29 | pytorch_model.eval() 30 | 31 | # dummy inputs to generate the groq model 32 | batch_size = 1 33 | max_seq_length = 128 34 | dummy_inputs = { 35 | "input_ids": torch.ones(batch_size, max_seq_length, dtype=torch.long), 36 | "attention_mask": torch.ones(batch_size, max_seq_length, dtype=torch.bool), 37 | } 38 | 39 | # generate groq model 40 | groq_model = groqit(pytorch_model, dummy_inputs, rebuild=rebuild_policy) 41 | 42 | # compute performance on CPU and GroqChip 43 | if should_execute: 44 | compute_performance( 45 | groq_model, 46 | pytorch_model, 47 | dataset="sst", 48 | tokenizer=tokenizer, 49 | max_seq_length=max_seq_length, 50 | task="classification", 51 | ) 52 | 53 | print(f"Proof point {__file__} finished!") 54 | 55 | 56 | if __name__ == "__main__": 57 | evaluate_electra(**parse_args()) 58 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/electra/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | transformers>=4.20.0 3 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/minilm/README.md: -------------------------------------------------------------------------------- 1 | # MiniLM v2 2 | 3 | [MiniLM v2](https://arxiv.org/abs/2012.15828) is a [distilled model](https://arxiv.org/pdf/1503.02531.pdf) that employs a generalization of the deep self-attention distillation method that the authors of the linked paper introduced in their first paper, [MiniLM](https://arxiv.org/abs/2002.10957). The distillation is generalized by employing multi-head self-attention distillation. 4 | 5 | In this proof point, MiniLM v2 is used for the task of [sentence similarity](https://huggingface.co/tasks/sentence-similarity) and evaluated using the [machine-translated multilingual](https://github.com/PhilipMay/stsb-multi-mt) version of the Semantic Textual Similarity [(STS) benchmark dataset](https://ixa2.si.ehu.eus/stswiki/index.php/STSbenchmark). Both the [model](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and the [dataset](https://huggingface.co/datasets/stsb_multi_mt#citation-information) are downloaded from Hugging Face. 6 | 7 | ## Prerequisites 8 | 9 | - Ensure you've completed the install prerequisites: 10 | - Installed the GroqWare™ Suite 11 | - Installed GroqFlow 12 | - Installed Groq Demo Helpers 13 | - For more information on these steps, see the [Proof Points README](../../README.md).
14 | - Install the python dependencies with the requirements.txt file included with this proof point, using the following command: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Build and Evaluate 21 | 22 | To build and evaluate MiniLM v2: 23 | 24 | ```bash 25 | python minilmv2.py 26 | ``` 27 | 28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 29 | 30 | ## Expected Results 31 | 32 | It takes approximately 10 minutes for MiniLM v2 to build and about 1 minute to evaluate the [Spearman Rank Correlation Coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) for both implementations. The script returns the Spearman Rank Correlation Coefficients for both the PyTorch implementation on a CPU and the Groq implementation using a single GroqCard™ accelerator. 33 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/minilm/minilmv2.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained MiniLM v2 model from the 3 | huggingface models repository and executes against the STS benchmark dataset 4 | on CPU and GroqChip1 through GroqFlow. 5 | """ 6 | import os 7 | from transformers import AutoTokenizer, AutoModel 8 | import torch 9 | from demo_helpers.compute_performance import compute_performance 10 | from demo_helpers.args import parse_args 11 | 12 | from groqflow import groqit 13 | 14 | 15 | def evaluate_minilm(rebuild_policy=None, should_execute=True): 16 | # set seed for consistency 17 | torch.manual_seed(0) 18 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 19 | 20 | # load pre-trained torch model 21 | tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") 22 | model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").eval() 23 | 24 | # dummy inputs to generate the groq model 25 | max_seq_length = 128 26 | dummy_inputs = { 27 | "input_ids": torch.ones((2, max_seq_length), dtype=torch.long), 28 | "token_type_ids": torch.ones((2, max_seq_length), dtype=torch.long), 29 | "attention_mask": torch.ones((2, max_seq_length), dtype=torch.bool), 30 | } 31 | 32 | # generate groq model 33 | groq_model = groqit(model, dummy_inputs, rebuild=rebuild_policy) 34 | 35 | # compute performance on CPU and GroqChip 36 | if should_execute: 37 | compute_performance( 38 | groq_model, 39 | model, 40 | dataset="stsb_multi_mt", 41 | tokenizer=tokenizer, 42 | max_seq_length=max_seq_length, 43 | task="sentence_similarity", 44 | ) 45 | 46 | print(f"Proof point {__file__} finished!") 47 | 48 | 49 | if __name__ == "__main__": 50 | evaluate_minilm(**parse_args()) 51 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/minilm/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | transformers>=4.20.0 3 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/roberta/README.md: -------------------------------------------------------------------------------- 1 | # RoBERTa 2 | 3 | [RoBERTa](https://arxiv.org/abs/1907.11692) is one of many derivatives of the [BERT model](https://arxiv.org/abs/1810.04805). Its name is an acronym created from the phrase, "Robustly optimized BERT approach".
RoBERTa improves on BERT by hyperparameter tuning and altering the training recipe. Optimizations employed by RoBERTa include longer training with larger batch sizes, more data, longer sequence lengths, and dynamically changing masking patterns. As with many of the other BERT model variations, RoBERTa also removes the next sentence prediction (NSP) loss from the loss function. 4 | 5 | In this proof point, RoBERTa is used for the task of [Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) and evaluated using the [CoNLL-2003 dataset](https://paperswithcode.com/dataset/conll-2003). The model weights are downloaded from the [Hugging Face website](https://huggingface.co/dominiqueblok/roberta-base-finetuned-ner). 6 | 7 | ## Prerequisites 8 | 9 | - Ensure you've completed the install prerequisites: 10 | - Installed the GroqWare™ Suite 11 | - Installed GroqFlow 12 | - Installed Groq Demo Helpers 13 | - For more information on these steps, see the [Proof Points README](../../README.md). 14 | - Install the python dependencies with the requirements.txt file included with this proof point, using the following command: 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Build and Evaluate 21 | 22 | To build and evaluate RoBERTa: 23 | 24 | ```bash 25 | python roberta.py 26 | ``` 27 | 28 | **Note:** The Proof Points directory [readme.md](../../README.md) details how to build and execute on two machines. 29 | 30 | ## Expected Results 31 | 32 | It takes approximately 15 minutes for RoBERTa to build and about 5 minutes to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation on 4 GroqCard™ accelerators within a GroqNode™ server. 33 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/roberta/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.0 2 | transformers>=4.20.0 3 | -------------------------------------------------------------------------------- /proof_points/natural_language_processing/roberta/roberta.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained RoBERTa model and executes it 3 | against the CoNLL-2003 dataset on CPU and GroqChip™ processor by using 4 | the GroqFlow toolchain.
The fine-tuned model can be downloaded 5 | here: https://huggingface.co/dominiqueblok/roberta-base-finetuned-ner 6 | """ 7 | 8 | import os 9 | 10 | import torch 11 | 12 | from demo_helpers.compute_performance import compute_performance 13 | from demo_helpers.args import parse_args 14 | from groqflow import groqit 15 | from transformers import RobertaForTokenClassification, RobertaTokenizerFast 16 | 17 | 18 | def evaluate_roberta(rebuild_policy=None, should_execute=None): 19 | # set seed for consistency 20 | torch.manual_seed(0) 21 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 22 | 23 | # load pre-trained torch model 24 | model_path = "dominiqueblok/roberta-base-finetuned-ner" 25 | tokenizer = RobertaTokenizerFast.from_pretrained(model_path) 26 | torch_model = RobertaForTokenClassification.from_pretrained( 27 | model_path, torchscript=True 28 | ).eval()  # disable dropout layers 29 | 30 | # dummy inputs to generate the groq model 31 | batch_size, max_seq_length = 1, 128 32 | dummy_inputs = { 33 | "input_ids": torch.ones((batch_size, max_seq_length), dtype=torch.long), 34 | "attention_mask": torch.ones((batch_size, max_seq_length), dtype=torch.float), 35 | } 36 | 37 | # generate groq model 38 | build_name = "roberta" 39 | groq_model = groqit( 40 | torch_model, 41 | dummy_inputs, 42 | compiler_flags=["--large-program"], 43 | rebuild=rebuild_policy, 44 | build_name=build_name, 45 | ) 46 | 47 | # compute performance on CPU and GroqChip 48 | if should_execute: 49 | compute_performance( 50 | groq_model, 51 | torch_model, 52 | dataset="conll2003", 53 | tokenizer=tokenizer, 54 | max_seq_length=max_seq_length, 55 | task="ner", 56 | ) 57 | 58 | print(f"Proof point {__file__} finished!") 59 | 60 | 61 | if __name__ == "__main__": 62 | evaluate_roberta(**parse_args()) 63 | -------------------------------------------------------------------------------- /proof_points/speech/m5/README.md: -------------------------------------------------------------------------------- 1 | # M5 2 | 3 | [M5](https://arxiv.org/abs/1610.00087) is a convolutional neural network (CNN) that works directly on the raw audio waveform. Since M5 accepts raw data, there is no need to generate frequency spectra, a required pre-processing step used by many audio/acoustic models. 4 | 5 | This proof point uses the M5 model on the task of [Keyword Spotting](https://en.wikipedia.org/wiki/Keyword_spotting). The M5 adaptation for this task replaces the global average pool in the original M5 model with a fully connected layer; the architecture definition can be viewed in the [demo_helpers folder](../../../demo_helpers/models.py). 6 | 7 | M5's Keyword Spotting accuracy is evaluated using the [SpeechCommands dataset](https://arxiv.org/abs/1804.03209) from PyTorch's `torchaudio.datasets` library. 8 | 9 | ## Prerequisites 10 | 11 | - Ensure you've completed the install prerequisites: 12 | - Installed the GroqWare™ Suite 13 | - Installed GroqFlow 14 | - Installed Groq Demo Helpers 15 | - For more information on these steps, see the [Proof Points README](../../README.md). 16 | - Install the python dependencies with the requirements.txt file included with this proof point, using the following command: 17 | 18 | ```bash 19 | pip install -r requirements.txt 20 | ``` 21 | 22 | - Since this proof point uses audio files, the audio libraries often must be installed on the system:
23 | - For Ubuntu OS: 24 | 25 | ```bash 26 | sudo apt install libsox-dev 27 | ``` 28 | 29 | - For Rocky OS: 30 | 31 | ```bash 32 | sudo dnf install sox-devel 33 | ``` 34 | 35 | ## Build and Evaluate 36 | 37 | To build and evaluate M5: 38 | 39 | ```bash 40 | python m5.py 41 | ``` 42 | 43 | **Note:** The [Proof Points directory README](../../README.md) details how to build and execute on two machines. 44 | 45 | ## Expected Results 46 | 47 | It takes approximately 5 minutes for M5 to build and about 1 minute to evaluate the implementation accuracies. The script returns the accuracies for both the PyTorch implementation on a CPU and the Groq implementation using a single GroqCard™ accelerator. 48 | -------------------------------------------------------------------------------- /proof_points/speech/m5/m5.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following example takes a pre-trained M5 model and executes it against 3 | the SpeechCommands dataset on CPU and GroqChip™ processor using GroqFlow. 4 | """ 5 | 6 | import torch 7 | 8 | from demo_helpers.compute_performance import compute_performance 9 | from demo_helpers.models import load_pretrained 10 | from demo_helpers.args import parse_args 11 | from groqflow import groqit 12 | 13 | 14 | def evaluate_m5(rebuild_policy=None, should_execute=True): 15 | # set seed for consistency 16 | torch.manual_seed(0) 17 | 18 | # load pre-trained torch model 19 | torch_model = load_pretrained("m5") 20 | torch_model.eval() 21 | 22 | # dummy inputs to generate groq model 23 | dummy_input = torch.randn([1, 1, 16000]) 24 | 25 | # generate groq_model 26 | build_name = "m5" 27 | groq_model = groqit( 28 | torch_model, {"x": dummy_input}, rebuild=rebuild_policy, build_name=build_name 29 | ) 30 | 31 | # compute performance on CPU, GroqChip 32 | if should_execute: 33 | compute_performance( 34 | groq_model, 35 | torch_model, 36 | dataset="speechcommands", 37 | task="keyword_spotting", 38 | ) 39 | 40 | print(f"Proof point {__file__} finished!") 41 | 42 | 43 | if __name__ == "__main__": 44 | evaluate_m5(**parse_args()) 45 | -------------------------------------------------------------------------------- /proof_points/speech/m5/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.12.1 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("groqflow/version.py", encoding="utf-8") as fp: 4 | version = fp.read().split('"')[1]  # version.py holds a single double-quoted version string 5 | 6 | setup( 7 | name="groqflow", 8 | version=version, 9 | description="GroqFlow toolchain library", 10 | url="https://github.com/groq/groqflow", 11 | author="Groq", 12 | author_email="sales@groq.com", 13 | license="MIT", 14 | packages=find_packages( 15 | exclude=["*.__pycache__.*"], 16 | ), 17 | install_requires=[ 18 | "mlagility==3.3.1", 19 | "onnx==1.14.0", 20 | "onnxruntime==1.15.1", 21 | "protobuf==3.20.3", 22 | "scikit-learn==1.1.1", 23 | "torch==2.1.0", 24 | "typeguard==4.0.0", 25 | ], 26 | extras_require={ 27 | "tensorflow": ["tensorflow-cpu>=2.8.1", "tf2onnx>=1.12.0"], 28 | }, 29
| classifiers=[], 30 | python_requires=">=3.8, <3.11", 31 | long_description=open("README.md", "r", encoding="utf-8").read(), 32 | long_description_content_type="text/markdown", 33 | ) 34 | --------------------------------------------------------------------------------