├── docs └── source │ ├── _toctree.yml │ ├── installation.mdx │ ├── index.mdx │ ├── quickstart.mdx │ └── guides │ ├── contribute.mdx │ └── export.mdx ├── .github └── workflows │ ├── upload_pr_documentation.yml │ ├── quality.yml │ ├── build_pr_documentation.yml │ ├── build_documentation.yml │ └── test_models.yml ├── tests ├── __init__.py └── models │ ├── __init__.py │ ├── test_modeling_gptneox.py │ ├── test_modeling_gpt2.py │ ├── test_modeling_starcoder2.py │ ├── test_modeling_phi.py │ ├── test_modeling_mistral.py │ ├── test_modeling_glm.py │ ├── test_modeling_granite.py │ ├── test_modeling_gptj.py │ ├── test_modeling_codegen.py │ ├── test_modeling_gptneoxjapanese.py │ ├── test_modeling_cvt.py │ ├── test_modeling_pvt.py │ ├── test_modeling_dit.py │ ├── test_modeling_focalnet.py │ ├── test_modeling_swin.py │ ├── test_modeling_deit.py │ ├── test_modeling_mobilevit.py │ ├── test_modeling_albert.py │ ├── test_modeling_roberta.py │ ├── test_modeling_mobilevit2.py │ ├── test_modeling_distilbert.py │ ├── test_modeling_efficientnet.py │ ├── test_modeling_vit.py │ ├── test_modeling_qwen3_embedding.py │ ├── test_modeling_smollm3.py │ ├── test_modeling_granite_speech.py │ ├── test_modeling_bert.py │ ├── test_modeling_qwen2.py │ ├── test_modeling_gemma.py │ ├── test_modeling_gemma2.py │ ├── test_modeling_whisper.py │ └── test_modeling_llama.py ├── optimum ├── executorch │ ├── version.py │ ├── passes │ │ └── remove_padding_idx_embedding_pass.py │ └── __init__.py ├── exporters │ └── executorch │ │ ├── recipes │ │ ├── __init__.py │ │ ├── cuda-windows.py │ │ ├── portable.py │ │ ├── xnnpack.py │ │ ├── metal.py │ │ ├── cuda.py │ │ └── coreml.py │ │ ├── tasks │ │ ├── __init__.py │ │ ├── image_classification.py │ │ ├── masked_lm.py │ │ ├── seq2seq_lm.py │ │ └── asr.py │ │ ├── __init__.py │ │ ├── task_registry.py │ │ ├── recipe_registry.py │ │ ├── README.md │ │ └── convert.py └── commands │ └── register │ └── register_export.py ├── Makefile ├── install_dev.py ├── pyproject.toml ├── .gitignore └── CONTRIBUTING.MD /docs/source/_toctree.yml: -------------------------------------------------------------------------------- 1 | - sections: 2 | - local: index 3 | title: 🤗 Optimum ExecuTorch 4 | - local: installation 5 | title: Installation 6 | - local: quickstart 7 | title: Quickstart 8 | - sections: 9 | - local: guides/export 10 | title: Export 11 | - local: guides/contribute 12 | title: Contribution 13 | title: How-To Guides 14 | title: Optimum ExecuTorch 15 | isExpanded: true 16 | -------------------------------------------------------------------------------- /.github/workflows/upload_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: optimum-executorch 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /optimum/executorch/version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | __version__ = "0.2.0.dev0" 16 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/recipes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import xnnpack 16 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import causal_lm, image_classification, masked_lm, multimodal_text_to_text, seq2seq_lm 16 | -------------------------------------------------------------------------------- /docs/source/installation.mdx: -------------------------------------------------------------------------------- 1 | 12 | 13 | # Installation 14 | 15 | 16 | To install Optimum ExecuTorch, you can do: 17 | 18 | ```bash 19 | git clone https://github.com/huggingface/optimum-executorch.git 20 | cd optimum-executorch 21 | pip install . 22 | ``` 23 | -------------------------------------------------------------------------------- /optimum/commands/register/register_export.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from optimum.commands.export.base import ExportCommand 16 | from optimum.commands.export.executorch import ExecuTorchExportCommand 17 | 18 | 19 | REGISTER_COMMANDS = [(ExecuTorchExportCommand, ExportCommand)] 20 | -------------------------------------------------------------------------------- /.github/workflows/quality.yml: -------------------------------------------------------------------------------- 1 | name: Code Quality 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - v*-release 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | quality: 17 | runs-on: ubuntu-22.04 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v4 22 | 23 | - name: Setup Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: 3.9 27 | 28 | - name: Install dependencies 29 | run: | 30 | pip install --upgrade pip 31 | pip install "black~=23.1" "ruff==0.4.4" 32 | 33 | - name: Check style with black 34 | run: | 35 | black --check . 36 | 37 | - name: Check style with ruff 38 | run: | 39 | ruff check . 
-------------------------------------------------------------------------------- /optimum/executorch/passes/remove_padding_idx_embedding_pass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from executorch.exir.dialects._ops import ops as exir_ops 3 | from executorch.exir.pass_base import ExportPass, PassResult 4 | 5 | 6 | class RemovePaddingIdxEmbeddingPass(ExportPass): 7 | """ 8 | An ExportPass that removes the `padding_idx` keyword argument 9 | from all aten.embedding.default operator calls. 10 | """ 11 | 12 | def __init__(self) -> None: 13 | super().__init__() 14 | 15 | def call(self, graph_module: torch.fx.GraphModule) -> PassResult: 16 | for node in graph_module.graph.nodes: 17 | if node.op == "call_function" and node.target == exir_ops.edge.aten.embedding.default: 18 | # node.args[2] is the padding_idx 19 | if len(node.args) == 3: 20 | node.args = (node.args[0], node.args[1]) 21 | graph_module.recompile() 22 | return PassResult(graph_module, True) 23 | -------------------------------------------------------------------------------- /docs/source/index.mdx: -------------------------------------------------------------------------------- 1 | 16 | 17 | # 🤗 Optimum ExecuTorch 18 | 19 | Optimum ExecuTorch enables efficient deployment of transformer models using Meta's ExecuTorch framework. It provides: 20 | 21 | * 🔄 Easy conversion of Hugging Face models to ExecuTorch format 22 | 23 | * ⚡ Optimized inference with hardware-specific optimizations 24 | 25 | * 🤝 Seamless integration with Hugging Face Transformers 26 | 27 | * Efficient deployment on various devices -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | SHELL := /bin/bash 15 | CURRENT_DIR = $(shell pwd) 16 | DEFAULT_CLONE_URL := https://github.com/huggingface/optimum-executorch.git 17 | # If CLONE_URL is empty, revert to DEFAULT_CLONE_URL 18 | REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) 19 | 20 | .PHONY: style test 21 | 22 | # Run code quality checks 23 | style_check: 24 | black --check . 25 | ruff check . 26 | 27 | style: 28 | black . 29 | ruff check . --fix 30 | 31 | # Run tests for the library 32 | test: 33 | python -m pytest tests 34 | 35 | # Utilities to release to PyPi 36 | build_dist_install_tools: 37 | pip install build 38 | pip install twine 39 | 40 | build_dist: 41 | rm -rf build 42 | rm -rf dist 43 | python -m build 44 | 45 | pypi_upload: build_dist 46 | python -m twine upload dist/* -------------------------------------------------------------------------------- /optimum/executorch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from transformers.utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "modeling": [ 22 | "ExecuTorchModelForCausalLM", 23 | "ExecuTorchModelForImageClassification", 24 | "ExecuTorchModelForMaskedLM", 25 | "ExecuTorchModelForSeq2SeqLM", 26 | "ExecuTorchModelForSpeechSeq2Seq", 27 | "ExecuTorchModelForMultiModalToText", 28 | ], 29 | } 30 | 31 | if TYPE_CHECKING: 32 | from .modeling import ( 33 | ExecuTorchModelForCausalLM, 34 | ExecuTorchModelForImageClassification, 35 | ExecuTorchModelForMaskedLM, 36 | ExecuTorchModelForMultiModalToText, 37 | ExecuTorchModelForSeq2SeqLM, 38 | ExecuTorchModelForSpeechSeq2Seq, 39 | ) 40 | else: 41 | import sys 42 | 43 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 44 | -------------------------------------------------------------------------------- /docs/source/quickstart.mdx: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Quickstart 18 | 19 | ## Export 20 | 21 | You can export your 🤗 Transformers models to ExecuTorch easily: 22 | 23 | ```bash 24 | optimum-cli export executorch --model meta-llama/Llama-3.2-1B --recipe xnnpack --output_dir meta_llama3_2_1b_executorch 25 | ``` 26 | 27 | 28 | ## Inference 29 | 30 | To load a model and run inference, you can just replace your `AutoModelForCausalLM` class with the corresponding `ExecuTorchModelForCausalLM` class. You can also load a PyTorch checkpoint and convert it to ExecuTorch on-the-fly when loading your model. 31 | 32 | ```diff 33 | - from transformers import AutoModelForCausalLM 34 | + from optimum.executorch import ExecuTorchModelForCausalLM 35 | from transformers import AutoTokenizer 36 | 37 | model_id = "meta-llama/Llama-3.2-1B" 38 | tokenizer = AutoTokenizer.from_pretrained(model_id) 39 | - model = AutoModelForCausalLM.from_pretrained(model_id) 40 | + model = ExecuTorchModelForCausalLM.from_pretrained(model_id) 41 | ``` -------------------------------------------------------------------------------- /optimum/exporters/executorch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from transformers.utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "convert": [ 22 | "export_to_executorch", 23 | ], 24 | "recipe_registry": [ 25 | "discover_recipes", 26 | "register_recipe", 27 | ], 28 | "task_registry": [ 29 | "discover_tasks", 30 | "register_task", 31 | ], 32 | "tasks": [ 33 | "causal_lm", 34 | "seq2seq_lm", 35 | ], 36 | "recipes": [ 37 | "xnnpack", 38 | ], 39 | "utils": [ 40 | "save_config_to_constant_methods", 41 | ], 42 | "integrations": [ 43 | "Seq2SeqLMExportableModule", 44 | ], 45 | "__main__": ["main_export"], 46 | } 47 | 48 | if TYPE_CHECKING: 49 | from .__main__ import main_export 50 | from .convert import export_to_executorch 51 | else: 52 | import sys 53 | 54 | sys.modules[__name__] = _LazyModule( 55 | __name__, 56 | globals()["__file__"], 57 | _import_structure, 58 | module_spec=__spec__, 59 | ) 60 | -------------------------------------------------------------------------------- /.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | build_documentation: 14 | runs-on: ubuntu-22.04 15 | env: 16 | COMMIT_SHA: ${{ github.event.pull_request.head.sha }} 17 | PR_NUMBER: ${{ github.event.number }} 18 | EVENT_CONTEXT: ${{ toJSON(github.event) }} 19 | PR_CLONE_URL: ${{ github.event.pull_request.head.repo.clone_url }} 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - uses: actions/setup-node@v4 24 | with: 25 | node-version: '18' 26 | cache-dependency-path: "kit/package-lock.json" 27 | 28 | - name: Set up Python 29 | uses: actions/setup-python@v4 30 | with: 31 | python-version: '3.11' 32 | 33 | - name: Setup environment 34 | run: | 35 | pip install --upgrade pip 36 | pip install git+https://github.com/huggingface/doc-builder 37 | pip install .[quality] 38 | 39 | - name: Make documentation 40 | shell: bash 41 | run: | 42 | doc-builder build optimum.executorch docs/source/ \ 43 | --repo_name optimum-executorch \ 44 | --build_dir executorch-doc-build/ \ 45 | --version pr_${{ env.PR_NUMBER }} \ 46 | --version_tag_suffix "" \ 47 | --html \ 48 | --clean \ 49 | 50 | - name: Save commit_sha & pr_number 51 | run: | 52 | sudo chmod -R ugo+rwx executorch-doc-build 53 | cd executorch-doc-build 54 | sudo mv optimum.executorch optimum-executorch 55 | echo ${{ env.COMMIT_SHA }} > ./commit_sha 56 | echo ${{ env.PR_NUMBER }} > ./pr_number 57 | 58 | - uses: actions/upload-artifact@v4 59 | with: 60 | name: doc-build-artifact 61 | path: executorch-doc-build/ 62 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/tasks/image_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from transformers import AutoModelForImageClassification
16 | 
17 | from ..integrations import VisionEncoderExportableModule
18 | from ..task_registry import register_task
19 | 
20 | 
21 | # NOTE: It’s important to map the registered task name to the pipeline name in https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py.
22 | # This will streamline using inferred task names and make exporting models to Hugging Face pipelines easier.
23 | @register_task("image-classification")
24 | def load_image_classification_model(model_name_or_path: str, **kwargs) -> VisionEncoderExportableModule:
25 |     """
26 |     Loads a vision model for image classification and registers it under the task
27 |     'image-classification' using Hugging Face's `AutoModelForImageClassification`.
28 | 
29 |     Args:
30 |         model_name_or_path (str):
31 |             Model ID on huggingface.co or path on disk to the model repository to export. For example:
32 |             `model_name_or_path="google/vit-base-patch16-224"` or `model_name_or_path="/path/to/model_folder"`
33 |         **kwargs:
34 |             Additional configuration options for the model.
35 | 
36 |     Returns:
37 |         VisionEncoderExportableModule:
38 |             An instance of `VisionEncoderExportableModule` for exporting and lowering to ExecuTorch.
39 |     """
40 | 
41 |     eager_model = AutoModelForImageClassification.from_pretrained(model_name_or_path, **kwargs).to("cpu").eval()
42 |     return VisionEncoderExportableModule(eager_model)
43 | 
--------------------------------------------------------------------------------
/optimum/exporters/executorch/task_registry.py:
--------------------------------------------------------------------------------
1 | # Copyright 2025 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import importlib
16 | import logging
17 | import pkgutil
18 | 
19 | 
20 | logger = logging.getLogger(__name__)
21 | 
22 | task_registry = {}
23 | 
24 | package_name = "optimum.exporters.executorch.tasks"
25 | 
26 | 
27 | def register_task(task_name):
28 |     """
29 |     Decorator to register a task under a specific name.
30 | 
31 |     Args:
32 |         task_name (`str`):
33 |             The name of the task to associate with a callable task.
34 | 
35 |     Returns:
36 |         `Callable`:
37 |             The original function wrapped as a registered task.
38 | 
39 |     Example:
40 |         ```python
41 |         @register_task("my_new_task")
42 |         def my_new_task(...):
43 |             ...
44 | ``` 45 | """ 46 | 47 | def decorator(func): 48 | task_registry[task_name] = func 49 | return func 50 | 51 | return decorator 52 | 53 | 54 | def discover_tasks(): 55 | """ 56 | Dynamically discovers and imports all task modules within the `optimum.exporters.executorch.tasks` package. 57 | 58 | Ensures tasks under `./tasks` directory are dynamically loaded without requiring manual imports. 59 | 60 | Notes: 61 | New tasks **must** be added to the `./tasks` directory to be discovered and used by `main_export`. 62 | Failure to do so will prevent dynamic discovery and registration. Tasks must also use the 63 | `@register_task` decorator to be properly registered in the `task_registry`. 64 | """ 65 | package = importlib.import_module(package_name) 66 | package_path = package.__path__ 67 | 68 | for _, module_name, _ in pkgutil.iter_modules(package_path): 69 | logger.info(f"Importing {package_name}.{module_name}") 70 | importlib.import_module(f"{package_name}.{module_name}") 71 | -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - 'v[0-9]+.[0-9]+.[0-9]+' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build_documentation: 13 | runs-on: ubuntu-22.04 14 | env: 15 | COMMIT_SHA: ${{ github.event.pull_request.head.sha }} 16 | PR_NUMBER: ${{ github.event.number }} 17 | EVENT_CONTEXT: ${{ toJSON(github.event) }} 18 | PR_CLONE_URL: ${{ github.event.pull_request.head.repo.clone_url }} 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: actions/setup-node@v4 23 | with: 24 | node-version: '18' 25 | cache-dependency-path: "kit/package-lock.json" 26 | 27 | - name: Set up Python 28 | uses: actions/setup-python@v4 29 | with: 30 | python-version: '3.11' 31 | 32 | - name: Set environment variables 33 | run: | 34 | cd optimum 35 | version=`echo "$(grep '^__version__ =' executorch/version.py | cut -d '=' -f 2- | xargs)"` 36 | 37 | if [[ $version == *.dev0 ]] 38 | then 39 | echo "VERSION=main" >> $GITHUB_ENV 40 | else 41 | echo "VERSION=v$version" >> $GITHUB_ENV 42 | fi 43 | 44 | cd .. 45 | 46 | - name: Setup environment 47 | run: | 48 | python -m pip install --upgrade pip 49 | python -m ensurepip --upgrade 50 | python -m pip install --upgrade setuptools 51 | python -m pip install git+https://github.com/huggingface/doc-builder 52 | python -m pip install .[quality] 53 | 54 | - name: Make documentation 55 | shell: bash 56 | run: | 57 | doc-builder build optimum.executorch docs/source/ \ 58 | --repo_name optimum-executorch \ 59 | --build_dir executorch-doc-build/ \ 60 | --version ${{ env.VERSION }} \ 61 | --version_tag_suffix "" \ 62 | --html \ 63 | --clean \ 64 | 65 | - name: Push documentation 66 | run: | 67 | sudo chmod -R ugo+rwx executorch-doc-build 68 | cd executorch-doc-build 69 | sudo mv optimum.executorch optimum-executorch 70 | doc-builder push optimum-executorch --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit $COMMIT_SHA See: https://github.com/huggingface/optimum-executorch/commit/$COMMIT_SHA" --n_retries 5 71 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/recipe_registry.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import importlib 16 | import logging 17 | import pkgutil 18 | 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | recipe_registry = {} 23 | 24 | package_name = "optimum.exporters.executorch.recipes" 25 | 26 | 27 | def register_recipe(recipe_name): 28 | """ 29 | Decorator to register a recipe for exporting and lowering an ExecuTorch model under a specific name. 30 | 31 | Args: 32 | recipe_name (`str`): 33 | The name of the recipe to associate with a callable recipe. 34 | 35 | Returns: 36 | `Callable`: 37 | The original function wrapped as a registered recipe. 38 | 39 | Example: 40 | ```python 41 | @register_recipe("my_new_recipe") 42 | def my_new_recipe(...): 43 | ... 44 | ``` 45 | """ 46 | 47 | def decorator(func): 48 | recipe_registry[recipe_name] = func 49 | return func 50 | 51 | return decorator 52 | 53 | 54 | def discover_recipes(): 55 | """ 56 | Dynamically discovers and imports all recipe modules within the `optimum.exporters.executorch.recipes` package. 57 | 58 | Ensures recipes under `./recipes` directory are dynamically loaded without requiring manual imports. 59 | 60 | Notes: 61 | New recipes **must** be added to the `./recipes` directory to be discovered and used by `main_export`. 62 | Failure to do so will prevent dynamic discovery and registration. Recipes must also use the 63 | `@register_recipe` decorator to be properly registered in the `recipe_registry`. 64 | """ 65 | package = importlib.import_module(package_name) 66 | package_path = package.__path__ 67 | 68 | for _, module_name, _ in pkgutil.iter_modules(package_path): 69 | logger.info(f"Importing {package_name}.{module_name}") 70 | importlib.import_module(f"{package_name}.{module_name}") 71 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/tasks/masked_lm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 
15 | from transformers import AutoModelForMaskedLM
16 | 
17 | from ..integrations import MaskedLMExportableModule
18 | from ..quantization import quantize_model_
19 | from ..task_registry import register_task
20 | 
21 | 
22 | # NOTE: It’s important to map the registered task name to the pipeline name in https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py.
23 | # This will streamline using inferred task names and make exporting models to Hugging Face pipelines easier.
24 | @register_task("fill-mask")
25 | def load_masked_lm_model(model_name_or_path: str, **kwargs) -> MaskedLMExportableModule:
26 |     """
27 |     Loads a masked language model for fill-mask prediction and registers it under the task
28 |     'fill-mask' using Hugging Face's `AutoModelForMaskedLM`.
29 | 
30 |     Args:
31 |         model_name_or_path (str):
32 |             Model ID on huggingface.co or path on disk to the model repository to export. For example:
33 |             `model_name_or_path="google-bert/bert-base-uncased"` or `model_name_or_path="/path/to/model_folder"`
34 |         **kwargs:
35 |             Additional configuration options for the model.
36 | 
37 |     Returns:
38 |         MaskedLMExportableModule:
39 |             An instance of `MaskedLMExportableModule` for exporting and lowering to ExecuTorch.
40 |     """
41 | 
42 |     eager_model = AutoModelForMaskedLM.from_pretrained(model_name_or_path).to("cpu").eval()
43 | 
44 |     qlinear_config = kwargs.get("qlinear", None)
45 |     qlinear_packing_format = kwargs.get("qlinear_packing_format", None)
46 |     qembedding_config = kwargs.get("qembedding", None)
47 |     quantize_model_(
48 |         eager_model,
49 |         qlinear_config=qlinear_config,
50 |         qlinear_packing_format=qlinear_packing_format,
51 |         qembedding_config=qembedding_config,
52 |     )
53 | 
54 |     return MaskedLMExportableModule(eager_model)
55 | 
--------------------------------------------------------------------------------
/install_dev.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import subprocess
3 | import sys
4 | 
5 | 
6 | def install_torch_nightly_deps():
7 |     """Install torch related dependencies from pinned nightly"""
8 |     EXECUTORCH_NIGHTLY_VERSION = "dev20251104"
9 |     TORCHAO_NIGHTLY_VERSION = "dev20251104"
10 |     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/torch_pin.py#L2
11 |     TORCH_NIGHTLY_VERSION = "dev20251104"
12 |     subprocess.check_call(
13 |         [
14 |             sys.executable,
15 |             "-m",
16 |             "pip",
17 |             "install",
18 |             "--no-cache-dir",  # Prevent cached CUDA packages
19 |             f"executorch==1.1.0.{EXECUTORCH_NIGHTLY_VERSION}",
20 |             f"torch==2.10.0.{TORCH_NIGHTLY_VERSION}",
21 |             f"torchvision==0.25.0.{TORCH_NIGHTLY_VERSION}",
22 |             f"torchaudio==2.10.0.{TORCH_NIGHTLY_VERSION}",
23 |             f"torchao==0.15.0.{TORCHAO_NIGHTLY_VERSION}",
24 |             "--extra-index-url",
25 |             "https://download.pytorch.org/whl/nightly/cpu",
26 |         ]
27 |     )
28 | 
29 | 
30 | def install_dep_from_source():
31 |     """Install deps from source at pinned commits"""
32 |     subprocess.check_call(
33 |         [
34 |             sys.executable,
35 |             "-m",
36 |             "pip",
37 |             "install",
38 |             "git+https://github.com/huggingface/transformers@bdc85cb85c8772d37aa29ce447860b44d7fad6ef#egg=transformers",  # v5.0.0rc0
39 |         ]
40 |     )
41 |     subprocess.check_call(
42 |         [
43 |             sys.executable,
44 |             "-m",
45 |             "pip",
46 |             "install",
47 |             "git+https://github.com/pytorch-labs/tokenizers@3aada3fe28c945d14d5ec62254eb56ccdf10eb11#egg=pytorch-tokenizers",
48 |         ]
49 |     )
50 | 
51 | 
52 | def main():
53 |     """Install optimum-executorch in dev mode with nightly dependencies"""
54 |     parser = argparse.ArgumentParser()
55 |     parser.add_argument(
56 |         "--skip_override_torch",
57 |         action="store_true",
58 |         help="Skip installation of nightly executorch and torch dependencies",
59 |     )
60 |     args = parser.parse_args()
61 | 
62 |     # Install nightly torch dependencies FIRST to avoid pulling CUDA versions
63 |     if not args.skip_override_torch:
64 |         install_torch_nightly_deps()
65 | 
66 |     # Install package with dev extras
67 |     subprocess.check_call([sys.executable, "-m", "pip", "install", ".[dev]"])
68 | 
69 |     # Install source dependencies
70 |     install_dep_from_source()
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     main()
75 | 
--------------------------------------------------------------------------------
/optimum/exporters/executorch/recipes/cuda-windows.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import Union
16 | 
17 | from ..integrations import (
18 |     CausalLMExportableModule,
19 |     MaskedLMExportableModule,
20 |     MultiModalTextToTextExportableModule,
21 |     Seq2SeqLMExportableModule,
22 | )
23 | from ..recipe_registry import register_recipe
24 | from .cuda import lower_to_executorch
25 | 
26 | 
27 | @register_recipe("cuda-windows")
28 | def export_to_executorch_with_cuda_windows(
29 |     model: Union[
30 |         CausalLMExportableModule,
31 |         MaskedLMExportableModule,
32 |         Seq2SeqLMExportableModule,
33 |         MultiModalTextToTextExportableModule,
34 |     ],
35 |     **kwargs,
36 | ):
37 |     """
38 |     Export a PyTorch model to ExecuTorch with delegation to the CUDA backend.
39 |     This function also writes the metadata required by the ExecuTorch runtime to the .pte file.
40 |     Args:
41 |         model (Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule, MultiModalTextToTextExportableModule]):
42 |             The PyTorch model to be exported to ExecuTorch.
43 |         **kwargs:
44 |             Additional keyword arguments for recipe-specific configurations, e.g. exporting with different example inputs, or different compile/backend configs.
45 |     Returns:
46 |         Dict[str, ExecutorchProgram]:
47 |             A map of exported and optimized programs for ExecuTorch.
48 |             For encoder-decoder models or multimodal models, it may generate multiple programs.
49 |     """
50 |     if (
51 |         model.config._attn_implementation == "custom_sdpa"
52 |         or model.config._attn_implementation == "custom_sdpa_ring_kv_cache"
53 |     ):
54 |         raise NotImplementedError(
55 |             "Custom SDPA implementation is not supported for CUDA yet. Please use 'flash_attention' instead."
56 |         )
57 | 
58 |     exported_progs = model.export()
59 | 
60 |     return lower_to_executorch(
61 |         exported_progs, model.metadata, is_windows=True, model_config=getattr(model, "config", None)
62 |     )
63 | 
--------------------------------------------------------------------------------
/docs/source/guides/contribute.mdx:
--------------------------------------------------------------------------------
1 | 
12 | 
13 | # Adding support for an unsupported architecture
14 | 
15 | We welcome contributions to extend the functionality of ExecuTorch export. This guide provides high-level instructions for contributors who want to:
16 | 
17 | 1. Export a new model that is not currently supported.
18 | 2. Add new recipes or support a new task for export.
19 | 
20 | ---
21 | 
22 | ## Exporting a New Model
23 | 
24 | If you want to export a model that is not already supported by the library, follow these steps:
25 | 
26 | ### Step 1: Export and Test the Model
27 | 1. Attempt to export and lower the model using an existing task and recipe. On success, the exported model is stored in a `.pte` file.
28 | 2. Add a test case for the model in the appropriate test suite.
29 |    - For example, you can make sure tests pass for the new `my_new_model` by running:
30 |      ```bash
31 |      pytest tests/executorch/export/test_*.py -k "test_my_new_model" # doctest: +SKIP
32 |      pytest tests/executorch/runtime/test_*.py -k "test_my_new_model" # doctest: +SKIP
33 |      ```
34 | 
35 | ### Step 2: Handle Export Failures
36 | 1. If the export fails in Step 1, report the issue by opening a GitHub issue.
37 | 2. If the issue requires changes to the model’s architecture or its Hugging Face implementation, these modifications may be made upstream in the Hugging Face Transformers library.
38 | 
39 | ---
40 | 
41 | ## Adding New Recipes or Tasks
42 | 
43 | To extend ExecuTorch with new recipes or tasks, follow these guidelines:
44 | 
45 | ### Registering a New Recipe
46 | You can add a custom recipe to define specific optimizations or configurations for exporting models. Below is an example:
47 | 
48 | ```python
49 | from optimum.exporters.executorch import register_recipe
50 | 
51 | @register_recipe("my_custom_recipe")
52 | def export_with_custom_recipe(model, config, *args, **kwargs):
53 |     ...  # Example: apply a custom quantization or lowering configuration
54 | ```
55 | 
56 | ### Registering a Task
57 | The task registration process is the same as adding a recipe. In addition, you may need to implement a new `ExecuTorchModelForXXX` class.
58 | 
--------------------------------------------------------------------------------
/tests/models/test_modeling_gptneox.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2024 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_gpt2neox_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 47 | model_id = "EleutherAI/pythia-14m" 48 | prompt = "My favorite food is" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | model = ExecuTorchModelForCausalLM.from_pretrained( 51 | model_id, 52 | recipe="xnnpack", 53 | attn_implementation="custom_sdpa", 54 | use_custom_kv_cache=True, 55 | **{"qlinear": "8da4w", "qembeeding": "8w"}, 56 | ) 57 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 58 | self.assertIsInstance(model.model, ExecuTorchModule) 59 | generated_text = model.text_generation( 60 | tokenizer=tokenizer, 61 | prompt=prompt, 62 | max_seq_len=64, 63 | ) 64 | logging.info(f"\nGenerated text:\n\t{generated_text}") 65 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 66 | 67 | # Free memory before loading eager for quality check 68 | del model 69 | del tokenizer 70 | gc.collect() 71 | 72 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 73 | -------------------------------------------------------------------------------- /tests/models/test_modeling_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_gpt2sw3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 47 | model_id = "AI-Sweden-Models/gpt-sw3-126m" 48 | prompt = "Träd är fina för att" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | model = ExecuTorchModelForCausalLM.from_pretrained( 51 | model_id, 52 | recipe="xnnpack", 53 | attn_implementation="custom_sdpa", 54 | use_custom_kv_cache=True, 55 | **{"qlinear": "8da4w", "qembeeding": "8w"}, 56 | ) 57 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 58 | self.assertIsInstance(model.model, ExecuTorchModule) 59 | generated_text = model.text_generation( 60 | tokenizer=tokenizer, 61 | prompt=prompt, 62 | max_seq_len=64, 63 | ) 64 | logging.info(f"\nGenerated text:\n\t{generated_text}") 65 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 66 | 67 | # Free memory before loading eager for quality check 68 | del model 69 | del tokenizer 70 | gc.collect() 71 | 72 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 73 | -------------------------------------------------------------------------------- /tests/models/test_modeling_starcoder2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_starcoder2_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 47 | model_id = "bigcode/starcoder2-3b" 48 | prompt = "def hello_world():" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | model = ExecuTorchModelForCausalLM.from_pretrained( 51 | model_id, 52 | recipe="xnnpack", 53 | attn_implementation="custom_sdpa", 54 | use_custom_kv_cache=True, 55 | **{"qlinear": "8da4w", "qembeeding": "8w"}, 56 | ) 57 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 58 | self.assertIsInstance(model.model, ExecuTorchModule) 59 | generated_text = model.text_generation( 60 | tokenizer=tokenizer, 61 | prompt=prompt, 62 | max_seq_len=64, 63 | ) 64 | logging.info(f"\nGenerated text:\n\t{generated_text}") 65 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 66 | 67 | # Free memory before loading eager for quality check 68 | del model 69 | del tokenizer 70 | gc.collect() 71 | 72 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 73 | -------------------------------------------------------------------------------- /tests/models/test_modeling_phi.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_phi_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 47 | model_id = "johnsnowlabs/JSL-MedPhi2-2.7B" 48 | prompt = "What is a large language model?" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | model = ExecuTorchModelForCausalLM.from_pretrained( 51 | model_id, 52 | recipe="xnnpack", 53 | attn_implementation="custom_sdpa", 54 | use_custom_kv_cache=True, 55 | **{"qlinear": "8da4w", "qembeeding": "8w"}, 56 | ) 57 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 58 | self.assertIsInstance(model.model, ExecuTorchModule) 59 | generated_text = model.text_generation( 60 | tokenizer=tokenizer, 61 | prompt=prompt, 62 | max_seq_len=64, 63 | ) 64 | logging.info(f"\nGenerated text:\n\t{generated_text}") 65 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 66 | 67 | # Free memory before loading eager for quality check 68 | del model 69 | del tokenizer 70 | gc.collect() 71 | 72 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 73 | -------------------------------------------------------------------------------- /tests/models/test_modeling_mistral.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_mistral_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 47 | model_id = "ministral/Ministral-3b-instruct" 48 | prompt = "My favourite condiment is " 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | model = ExecuTorchModelForCausalLM.from_pretrained( 51 | model_id, 52 | recipe="xnnpack", 53 | attn_implementation="custom_sdpa", 54 | use_custom_kv_cache=True, 55 | **{"qlinear": "8da4w", "qembeeding": "8w"}, 56 | ) 57 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 58 | self.assertIsInstance(model.model, ExecuTorchModule) 59 | generated_text = model.text_generation( 60 | tokenizer=tokenizer, 61 | prompt=prompt, 62 | max_seq_len=64, 63 | ) 64 | logging.info(f"\nGenerated text:\n\t{generated_text}") 65 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 66 | 67 | # Free memory before loading eager for quality check 68 | del model 69 | del tokenizer 70 | gc.collect() 71 | 72 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 73 | -------------------------------------------------------------------------------- /tests/models/test_modeling_glm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_glm_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 47 | model_id = "THUDM/glm-edge-1.5b-chat" 48 | prompt = "hello!" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | model = ExecuTorchModelForCausalLM.from_pretrained( 51 | model_id, 52 | task="text-generation", 53 | recipe="xnnpack", 54 | attn_implementation="custom_sdpa", 55 | use_custom_kv_cache=True, 56 | **{"qlinear": "8da4w", "qembeeding": "8w"}, 57 | ) 58 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 59 | self.assertIsInstance(model.model, ExecuTorchModule) 60 | generated_text = model.text_generation( 61 | tokenizer=tokenizer, 62 | prompt=prompt, 63 | max_seq_len=64, 64 | ) 65 | logging.info(f"\nGenerated text:\n\t{generated_text}") 66 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 67 | 68 | # Free memory before loading eager for quality check 69 | del model 70 | del tokenizer 71 | gc.collect() 72 | 73 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 74 | -------------------------------------------------------------------------------- /.github/workflows/test_models.yml: -------------------------------------------------------------------------------- 1 | name: ExecuTorch E2E / Python - Test 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | discover-tests: 15 | runs-on: ubuntu-22.04 16 | outputs: 17 | model_names: ${{ steps.set-matrix.outputs.model_names }} 18 | steps: 19 | - uses: actions/checkout@v3 20 | - name: Find model tests 21 | id: set-matrix 22 | run: | 23 | # Find all test files and extract model names correctly 24 | MODEL_NAMES=$(find tests/models -name "test_modeling_*.py" -type f | sed 's|tests/models/test_modeling_||' | sed 's|\.py$||' | paste -sd "," -) 25 | echo "model_names=[\"${MODEL_NAMES//,/\",\"}\"]" >> $GITHUB_OUTPUT 26 | 27 | # Display all discovered models 28 | echo "Discovered models:" 29 | echo "$MODEL_NAMES" | tr ',' '\n' | sort | awk '{print "- " $0}' 30 | 31 | run-tests: 32 | needs: discover-tests 33 | strategy: 34 | fail-fast: false 35 | matrix: 36 | test-modeling: ${{ fromJson(needs.discover-tests.outputs.model_names) }} 37 | executorch-version: ['1.0.0', 'nightly'] 38 | python-version: ['3.11'] 39 | # os: [macos-15, ubuntu-22.04] # TODO(#122): Re-enable the mac tests after fixing seg fault. 
40 | os: [ubuntu-22.04] 41 | 42 | # Custom job name 43 | name: ${{ matrix.test-modeling }} (et=${{ matrix.executorch-version }}, py=${{ matrix.python-version }}, ${{ matrix.os }}) 44 | runs-on: ${{ matrix.os }} 45 | env: 46 | MODEL_NAME: ${{ matrix.test-modeling }} 47 | steps: 48 | - uses: actions/checkout@v2 49 | - name: Setup Python ${{ matrix.python-version }} 50 | uses: actions/setup-python@v2 51 | with: 52 | python-version: ${{ matrix.python-version }} 53 | - name: Install dependencies for ExecuTorch 54 | run: | 55 | # Clean up cache to save space 56 | pip cache purge || true 57 | rm -rf ~/.cache/huggingface/hub/* || true 58 | 59 | if [ "${{ matrix.executorch-version }}" == "nightly" ]; then 60 | python install_dev.py 61 | else 62 | # Use CPU-only torch to avoid CUDA dependencies (saves ~5GB) 63 | pip install --no-cache-dir '.[dev]' \ 64 | --extra-index-url https://download.pytorch.org/whl/cpu 65 | pip install --no-cache-dir executorch==${{ matrix.executorch-version }} 66 | fi 67 | pip list 68 | - name: Run tests 69 | run: | 70 | RUN_SLOW=1 pytest tests/models/test_modeling_${{ matrix.test-modeling }}.py -s -vvvv --durations=0 --log-cli-level=INFO 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "optimum-executorch" 3 | dynamic = ["version"] 4 | description = "Optimum ExecuTorch is an interface between the Hugging Face libraries and ExecuTorch" 5 | readme = { file = "README.md", content-type = "text/markdown" } 6 | license = { text = "Apache" } 7 | authors = [ 8 | { name = "HuggingFace Inc. Special Ops Team", email = "hardware@huggingface.co" }, 9 | ] 10 | requires-python = ">=3.10.0" 11 | keywords = ["transformers", "quantization", "inference", "executorch"] 12 | classifiers = [ 13 | "Development Status :: 2 - Pre-Alpha", 14 | "License :: OSI Approved :: Apache Software License", 15 | "Intended Audience :: Developers", 16 | "Intended Audience :: Education", 17 | "Intended Audience :: Science/Research", 18 | "Operating System :: OS Independent", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 24 | ] 25 | 26 | dependencies = [ 27 | "optimum~=2.0.0", 28 | "executorch>=1.0.0", 29 | "transformers==5.0.0rc1", 30 | "pytorch-tokenizers>=1.0.1", 31 | "accelerate>=0.26.0", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "accelerate>=0.26.0", 37 | "coremltools>=8.2.0", 38 | "datasets==3.6.0", 39 | "parameterized", 40 | "pytest", 41 | "safetensors", 42 | "sentencepiece", 43 | "numba!=0.58.0", 44 | "librosa", 45 | "soundfile", 46 | "tiktoken", 47 | "black~=23.1", 48 | "ruff==0.4.4", 49 | ] 50 | 51 | [project.urls] 52 | Homepage = "https://github.com/huggingface/optimum-executorch" 53 | 54 | # ---- setuptools config ---- 55 | 56 | [tool.setuptools] 57 | # Equivalent of include_package_data=True 58 | include-package-data = true 59 | 60 | [tool.setuptools.packages.find] 61 | # Mirrors find_namespace_packages(include=["optimum*"]) 62 | include = ["optimum*"] 63 | namespaces = true 64 | 65 | [tool.setuptools.dynamic] 66 | # Pull version from the Python attribute 67 | version = { attr = "optimum.executorch.version.__version__" } 68 | 69 | # ---- tool configs ----
70 | 71 | [tool.black] 72 | line-length = 119 73 | # Match the supported Python versions: 74 | target-version = ["py310", "py311", "py312"] 75 | 76 | [tool.ruff] 77 | ignore = ["C901", "E501", "E741", "W605"] 78 | select = ["C", "E", "F", "I", "W"] 79 | line-length = 119 80 | 81 | [tool.ruff.per-file-ignores] 82 | "__init__.py" = ["E402", "F401", "F403", "F811"] 83 | 84 | [tool.ruff.isort] 85 | lines-after-imports = 2 86 | known-first-party = ["optimum"] 87 | 88 | [tool.pytest.ini_options] 89 | markers = [ 90 | "run_slow", 91 | "portable", 92 | ] 93 | 94 | [build-system] 95 | requires = ["setuptools >= 77.0.3", "wheel"] 96 | build-backend = "setuptools.build_meta" 97 | -------------------------------------------------------------------------------- /tests/models/test_modeling_granite.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_granite_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 47 | model_id = "ibm-granite/granite-3.3-2b-instruct" 48 | prompt = "Take a current environmental issue and work backward to devise an innovative prevention strategy" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | model = ExecuTorchModelForCausalLM.from_pretrained( 51 | model_id, 52 | recipe="xnnpack", 53 | attn_implementation="custom_sdpa", 54 | use_custom_kv_cache=True, 55 | **{"qlinear": "8da4w", "qembedding": "8w"}, 56 | ) 57 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 58 | self.assertIsInstance(model.model, ExecuTorchModule) 59 | generated_text = model.text_generation( 60 | tokenizer=tokenizer, 61 | prompt=prompt, 62 | max_seq_len=64, 63 | ) 64 | logging.info(f"\nGenerated text:\n\t{generated_text}") 65 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 66 | 67 | # Free memory before loading eager for quality check 68 | del model 69 | del tokenizer 70 | gc.collect() 71 | 72 | self.assertTrue(check_causal_lm_output_quality(model_id, 
generated_tokens)) 73 | -------------------------------------------------------------------------------- /tests/models/test_modeling_gptj.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoConfig, AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_gptj_text_generation_with_8da4w_8we(self): 47 | model_id = "Milos/slovak-gpt-j-405M" 48 | prompt = "Tradičné jedlo na Orave sú" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | config = AutoConfig.from_pretrained(model_id) 51 | config.bos_token_id = tokenizer.bos_token_id 52 | config.eos_token_id = tokenizer.eos_token_id 53 | model = ExecuTorchModelForCausalLM.from_pretrained( 54 | model_id, 55 | config=config, 56 | recipe="xnnpack", 57 | **{"qlinear": "8da4w", "qembedding": "8w"}, 58 | ) 59 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 60 | self.assertIsInstance(model.model, ExecuTorchModule) 61 | generated_text = model.text_generation( 62 | tokenizer=tokenizer, 63 | prompt=prompt, 64 | max_seq_len=64, 65 | ) 66 | logging.info(f"\nGenerated text:\n\t{generated_text}") 67 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 68 | 69 | # Free memory before loading eager for quality check 70 | del model 71 | del tokenizer 72 | gc.collect() 73 | 74 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 75 | -------------------------------------------------------------------------------- /tests/models/test_modeling_codegen.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | import torchao 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoConfig, AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | @pytest.mark.skipif( 43 | parse(torchao.__version__) < parse("0.11.0"), 44 | reason="Quantization is only available on torchao >= 0.11.0.", 45 | ) 46 | def test_codegen_text_generation_with_8da4w_8we(self): 47 | model_id = "Salesforce/codegen-350M-mono" 48 | prompt = "def hello_world():" 49 | tokenizer = AutoTokenizer.from_pretrained(model_id) 50 | config = AutoConfig.from_pretrained(model_id) 51 | config.bos_token_id = tokenizer.bos_token_id 52 | config.eos_token_id = tokenizer.eos_token_id 53 | model = ExecuTorchModelForCausalLM.from_pretrained( 54 | model_id, 55 | config=config, 56 | recipe="xnnpack", 57 | **{"qlinear": "8da4w", "qembedding": "8w"}, 58 | ) 59 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 60 | self.assertIsInstance(model.model, ExecuTorchModule) 61 | generated_text = model.text_generation( 62 | tokenizer=tokenizer, 63 | prompt=prompt, 64 | max_seq_len=64, 65 | ) 66 | logging.info(f"\nGenerated text:\n\t{generated_text}") 67 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 68 | 69 | # Free memory before loading eager for quality check 70 | del model 71 | del tokenizer 72 | gc.collect() 73 | 74 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 75 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/tasks/seq2seq_lm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | from transformers import AutoModelForSeq2SeqLM 16 | 17 | from ..integrations import Seq2SeqLMExportableModule 18 | from ..task_registry import register_task 19 | 20 | 21 | # NOTE: It’s important to map the registered task name to the pipeline name in https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py. 22 | # This will streamline using inferred task names and make exporting models to Hugging Face pipelines easier. 23 | @register_task("text2text-generation") 24 | def load_seq2seq_lm_model(model_name_or_path: str, **kwargs) -> Seq2SeqLMExportableModule: 25 | """ 26 | Loads a seq2seq language model for conditional text generation and registers it under the task 27 | 'text2text-generation' using Hugging Face's `AutoModelForSeq2SeqLM`. 28 | 29 | Args: 30 | model_name_or_path (str): 31 | Model ID on huggingface.co or path on disk to the model repository to export. For example: 32 | `model_name_or_path="google-t5/t5-small"` or `model_name_or_path="/path/to/model_folder"` 33 | **kwargs: 34 | Additional configuration options for the model: 35 | - dtype (str, optional): 36 | Data type for model weights (default: "float32"). 37 | Options include "float16" and "bfloat16". 38 | - max_hidden_seq_len (int, optional): 39 | Maximum hidden sequence length (default: 4096). 40 | - max_seq_len (int, optional): 41 | Maximum sequence length for generation (default: 1024). 42 | 43 | Returns: 44 | Seq2SeqLMExportableModule: 45 | An instance of `Seq2SeqLMExportableModule` for exporting and lowering to ExecuTorch. 46 | """ 47 | device = "cpu" 48 | batch_size = 1 49 | max_hidden_seq_len = kwargs.get("max_hidden_seq_len", 4096) 50 | max_seq_len = kwargs.get("max_seq_len", 1024) 51 | 52 | full_model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path).to(device).eval() 53 | return Seq2SeqLMExportableModule( 54 | full_model, 55 | batch_size=batch_size, 56 | max_seq_len=max_seq_len, 57 | max_hidden_seq_len=max_hidden_seq_len, 58 | ) 59 | -------------------------------------------------------------------------------- /tests/models/test_modeling_gptneoxjapanese.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | import gc 17 | import logging 18 | import os 19 | import sys 20 | import unittest 21 | 22 | import pytest 23 | import torchao 24 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 25 | from packaging.version import parse 26 | from transformers import AutoConfig, AutoTokenizer 27 | from transformers.testing_utils import slow 28 | 29 | from optimum.executorch import ExecuTorchModelForCausalLM 30 | 31 | from ..utils import check_causal_lm_output_quality 32 | 33 | 34 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 35 | is_ci = os.environ.get("GITHUB_ACTIONS") == "true" 36 | is_linux_ci = sys.platform.startswith("linux") and is_ci 37 | 38 | 39 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 40 | def __init__(self, *args, **kwargs): 41 | super().__init__(*args, **kwargs) 42 | 43 | @slow 44 | @pytest.mark.run_slow 45 | @pytest.mark.skipif( 46 | is_linux_ci or parse(torchao.__version__) < parse("0.11.0"), 47 | reason="Quantization requires torchao >= 0.11.0; the test is also skipped on Linux CI.", 48 | ) 49 | def test_gptneoxjapanese_text_generation_with_8da4w_8we(self): 50 | model_id = "abeja/gpt-neox-japanese-2.7b" 51 | prompt = "人とAIが協調するためには、" 52 | tokenizer = AutoTokenizer.from_pretrained(model_id) 53 | config = AutoConfig.from_pretrained(model_id) 54 | config.bos_token_id = tokenizer.bos_token_id 55 | config.eos_token_id = tokenizer.eos_token_id 56 | model = ExecuTorchModelForCausalLM.from_pretrained( 57 | model_id, 58 | config=config, 59 | recipe="xnnpack", 60 | **{"qlinear": "8da4w", "qembedding": "8w"}, 61 | ) 62 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 63 | self.assertIsInstance(model.model, ExecuTorchModule) 64 | generated_text = model.text_generation( 65 | tokenizer=tokenizer, 66 | prompt=prompt, 67 | max_seq_len=64, 68 | ) 69 | logging.info(f"\nGenerated text:\n\t{generated_text}") 70 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 71 | # Free memory before loading eager for quality check 72 | del model 73 | del tokenizer 74 | gc.collect() 75 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 76 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/README.md: -------------------------------------------------------------------------------- 1 | # Exporting Transformers Models to ExecuTorch 2 | 3 | Optimum ExecuTorch enables exporting models from Transformers to ExecuTorch. 4 | The models supported by Optimum ExecuTorch are listed [here](../../../README.md#-supported-models). 5 | 6 | ### LLMs (Large Language Models) 7 | LLMs can be exported using the `text-generation` task like so: 8 | ``` 9 | optimum-cli export executorch \ 10 | --model <model_id> \ 11 | --task text-generation \ 12 | --recipe xnnpack \ 13 | --use_custom_sdpa \ 14 | --use_custom_kv_cache \ 15 | --qlinear 8da4w \ 16 | --qembedding 8w 17 | ...etc... 18 | ``` 19 | 20 | The export will produce a `.pte` with a single forward method for the decoder: `model`.
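The same export can also be driven from Python, and the resulting program can be run in-process through `ExecuTorchModelForCausalLM`, mirroring how the test suite in this repository exercises LLMs. A minimal sketch follows; the model id is only a placeholder, substitute any supported decoder-only checkpoint:

```
from transformers import AutoTokenizer

from optimum.executorch import ExecuTorchModelForCausalLM


model_id = "HuggingFaceTB/SmolLM2-135M"  # placeholder; any supported LLM works
tokenizer = AutoTokenizer.from_pretrained(model_id)

# from_pretrained exports and lowers the model on the fly with the chosen recipe,
# applying the same options as the CLI flags above.
model = ExecuTorchModelForCausalLM.from_pretrained(
    model_id,
    recipe="xnnpack",
    attn_implementation="custom_sdpa",
    use_custom_kv_cache=True,
    **{"qlinear": "8da4w", "qembedding": "8w"},
)

generated_text = model.text_generation(
    tokenizer=tokenizer,
    prompt="Simply put, the theory of relativity states that",
    max_seq_len=64,
)
print(generated_text)
```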
21 | 22 | Note that most of the arguments here are only applicable to LLMs (multimodal included): 23 | ``` 24 | --use_custom_sdpa \ 25 | --use_custom_kv_cache \ 26 | --qlinear 8da4w \ 27 | --qembedding 8w 28 | ``` 29 | 30 | ### Multimodal LLMs 31 | Multimodal LLMs can be exported using the `multimodal-text-to-text` task like so: 32 | ``` 33 | optimum-cli export executorch \ 34 | --model mistralai/Voxtral-Mini-3B-2507 \ 35 | --task multimodal-text-to-text \ 36 | --recipe xnnpack \ 37 | --use_custom_sdpa \ 38 | --use_custom_kv_cache \ 39 | --qlinear 8da4w \ 40 | --qembedding 8w 41 | ...etc... 42 | ``` 43 | 44 | The export will produce a `.pte` with the following methods: 45 | - `text_decoder`: the text decoder or language model backbone 46 | - `audio_encoder` or `vision_encoder`: the encoder which feeds into the decoder 47 | - `token_embedding`: the embedding layer of the language model backbone 48 | - This is needed in order to cleanly separate the entire multimodal model into subgraphs. The text decoder subgraph will take in token embeddings, so multimodal input will be processed into embeddings by the encoder while text input will be processed into embeddings by this method. 49 | 50 | ### Seq2Seq 51 | Seq2Seq models can be exported using the `text2text-generation` task like so: 52 | ``` 53 | optimum-cli export executorch \ 54 | --model google-t5/t5-small \ 55 | --task text2text-generation \ 56 | --recipe xnnpack 57 | ``` 58 | 59 | The export will produce a `.pte` with the following methods: 60 | - `text_decoder`: the decoder half of the Seq2Seq model 61 | - `encoder`: the encoder half of the Seq2Seq model. This encoder can support a variety of modalities, such as text for T5 and audio for Whisper. 62 | 63 | ### Image classification 64 | Image classification models can be exported using the `image-classification` task like so: 65 | ``` 66 | optimum-cli export executorch \ 67 | --model google/vit-base-patch16-224 \ 68 | --task image-classification \ 69 | --recipe xnnpack 70 | ``` 71 | 72 | The export will produce a `.pte` with a single forward method: `model`. 73 | 74 | ### ASR (Automatic speech recognition) 75 | ASR is a special case of Seq2Seq that uses the base Seq2Seq exportable modules. It can be exported using the `automatic-speech-recognition` task like so: 76 | ``` 77 | optimum-cli export executorch \ 78 | --model openai/whisper-tiny \ 79 | --task automatic-speech-recognition \ 80 | --recipe xnnpack 81 | ``` 82 | 83 | The export will produce a `.pte` with the following methods: 84 | - `text_decoder`: the decoder half of the Seq2Seq model 85 | - `encoder`: the encoder half of the Seq2Seq model. 86 | -------------------------------------------------------------------------------- /tests/models/test_modeling_cvt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_cvt_export_to_executorch(self): 39 | model_id = "microsoft/cvt-13" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_cvt_image_classification(self, recipe: str): 51 | model_id = "microsoft/cvt-13" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_cvt_image_classification(self): 76 | self._helper_cvt_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_cvt_image_classification_portable(self): 82 | self._helper_cvt_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_pvt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_pvt_export_to_executorch(self): 39 | model_id = "Zetatech/pvt-tiny-224" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_pvt_image_classification(self, recipe: str): 51 | model_id = "Zetatech/pvt-tiny-224" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_pvt_image_classification(self): 76 | self._helper_pvt_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_pvt_image_classification_portable(self): 82 | self._helper_pvt_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_dit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_dit_export_to_executorch(self): 39 | model_id = "microsoft/dit-base-finetuned-rvlcdip" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_dit_image_classification(self, recipe: str): 51 | model_id = "microsoft/dit-base-finetuned-rvlcdip" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_dit_image_classification(self): 76 | self._helper_dit_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_dit_image_classification_portable(self): 82 | self._helper_dit_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_focalnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_focalnet_export_to_executorch(self): 39 | model_id = "microsoft/focalnet-tiny" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_focalnet_image_classification(self, recipe: str): 51 | model_id = "microsoft/focalnet-tiny" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_focalnet_image_classification(self): 76 | self._helper_focalnet_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_focalnet_image_classification_portable(self): 82 | self._helper_focalnet_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_swin.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_swin_export_to_executorch(self): 39 | model_id = "microsoft/swin-tiny-patch4-window7-224" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_swin_image_classification(self, recipe: str): 51 | model_id = "microsoft/swin-tiny-patch4-window7-224" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_swin_image_classification(self): 76 | self._helper_swin_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_swin_image_classification_portable(self): 82 | self._helper_swin_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_deit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_deit_export_to_executorch(self): 39 | model_id = "facebook/deit-base-distilled-patch16-224" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_deit_image_classification(self, recipe: str): 51 | model_id = "facebook/deit-base-distilled-patch16-224" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_deit_image_classification(self): 76 | self._helper_deit_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_deit_image_classification_portable(self): 82 | self._helper_deit_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_mobilevit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_mobilevit_export_to_executorch(self): 39 | model_id = "apple/mobilevit-xx-small" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_mobilevit_image_classification(self, recipe: str): 51 | model_id = "apple/mobilevit-xx-small" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_mobilevit_image_classification(self): 76 | self._helper_mobilevit_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_mobilevit_image_classification_portable(self): 82 | self._helper_mobilevit_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_albert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | import subprocess 19 | import tempfile 20 | import unittest 21 | 22 | import pytest 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoTokenizer 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForMaskedLM 28 | 29 | 30 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 31 | def __init__(self, *args, **kwargs): 32 | super().__init__(*args, **kwargs) 33 | 34 | @slow 35 | @pytest.mark.run_slow 36 | def test_albert_export_to_executorch(self): 37 | model_id = "albert/albert-base-v2" 38 | task = "fill-mask" 39 | recipe = "xnnpack" 40 | with tempfile.TemporaryDirectory() as tempdir: 41 | subprocess.run( 42 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 43 | shell=True, 44 | check=True, 45 | ) 46 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 47 | 48 | def _helper_albert_fill_mask(self, recipe: str): 49 | model_id = "albert/albert-base-v2" 50 | tokenizer = AutoTokenizer.from_pretrained(model_id) 51 | 52 | # Test fetching and lowering the model to ExecuTorch 53 | model = ExecuTorchModelForMaskedLM.from_pretrained(model_id=model_id, recipe=recipe) 54 | self.assertIsInstance(model, ExecuTorchModelForMaskedLM) 55 | self.assertIsInstance(model.model, ExecuTorchModule) 56 | 57 | input_text = f"Paris is the {tokenizer.mask_token} of France." 58 | inputs = tokenizer( 59 | input_text, 60 | return_tensors="pt", 61 | padding="max_length", 62 | max_length=10, 63 | ) 64 | 65 | # Test inference using ExecuTorch model 66 | exported_outputs = model.forward(inputs["input_ids"], inputs["attention_mask"]) 67 | predicted_masks = tokenizer.decode(exported_outputs[0, 4].topk(5).indices) 68 | logging.info(f"\nInput text:\n\t{input_text}\nPredicted masks:\n\t{predicted_masks}") 69 | self.assertTrue( 70 | any(word in predicted_masks for word in ["capital", "center", "heart", "birthplace"]), 71 | f"Exported model predictions {predicted_masks} don't contain any of the most common expected words", 72 | ) 73 | 74 | @slow 75 | @pytest.mark.run_slow 76 | def test_albert_fill_mask(self): 77 | self._helper_albert_fill_mask("xnnpack") 78 | 79 | @slow 80 | @pytest.mark.run_slow 81 | @pytest.mark.portable 82 | def test_albert_fill_mask_portable(self): 83 | self._helper_albert_fill_mask("portable") 84 | -------------------------------------------------------------------------------- /tests/models/test_modeling_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | import subprocess 19 | import tempfile 20 | import unittest 21 | 22 | import pytest 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoTokenizer 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForMaskedLM 28 | 29 | 30 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 31 | def __init__(self, *args, **kwargs): 32 | super().__init__(*args, **kwargs) 33 | 34 | @slow 35 | @pytest.mark.run_slow 36 | def test_roberta_export_to_executorch(self): 37 | model_id = "FacebookAI/xlm-roberta-base" 38 | task = "fill-mask" 39 | recipe = "xnnpack" 40 | with tempfile.TemporaryDirectory() as tempdir: 41 | subprocess.run( 42 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 43 | shell=True, 44 | check=True, 45 | ) 46 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 47 | 48 | def _helper_roberta_fill_mask(self, recipe: str): 49 | model_id = "FacebookAI/xlm-roberta-base" 50 | tokenizer = AutoTokenizer.from_pretrained(model_id) 51 | 52 | # Test fetching and lowering the model to ExecuTorch 53 | model = ExecuTorchModelForMaskedLM.from_pretrained(model_id=model_id, recipe=recipe) 54 | self.assertIsInstance(model, ExecuTorchModelForMaskedLM) 55 | self.assertIsInstance(model.model, ExecuTorchModule) 56 | 57 | input_text = f"Paris is the {tokenizer.mask_token} of France." 58 | inputs = tokenizer( 59 | input_text, 60 | return_tensors="pt", 61 | padding="max_length", 62 | max_length=10, 63 | ) 64 | 65 | # Test inference using ExecuTorch model 66 | exported_outputs = model.forward(inputs["input_ids"], inputs["attention_mask"]) 67 | predicted_masks = tokenizer.decode(exported_outputs[0, 4].topk(5).indices) 68 | logging.info(f"\nInput text:\n\t{input_text}\nPredicted masks:\n\t{predicted_masks}") 69 | self.assertTrue( 70 | any(word in predicted_masks for word in ["capital", "center", "heart", "birthplace"]), 71 | f"Exported model predictions {predicted_masks} don't contain any of the most common expected words", 72 | ) 73 | 74 | @slow 75 | @pytest.mark.run_slow 76 | def test_roberta_fill_mask(self): 77 | self._helper_roberta_fill_mask(recipe="xnnpack") 78 | 79 | @slow 80 | @pytest.mark.run_slow 81 | @pytest.mark.portable 82 | def test_roberta_fill_mask_portable(self): 83 | self._helper_roberta_fill_mask(recipe="portable") 84 | -------------------------------------------------------------------------------- /tests/models/test_modeling_mobilevit2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoConfig, AutoModelForImageClassification 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForImageClassification 28 | 29 | from ..utils import check_close_recursively 30 | 31 | 32 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 33 | def __init__(self, *args, **kwargs): 34 | super().__init__(*args, **kwargs) 35 | 36 | @slow 37 | @pytest.mark.run_slow 38 | def test_mobilevit2_export_to_executorch(self): 39 | model_id = "apple/mobilevitv2-1.0-imagenet1k-256" 40 | task = "image-classification" 41 | recipe = "xnnpack" 42 | with tempfile.TemporaryDirectory() as tempdir: 43 | subprocess.run( 44 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 45 | shell=True, 46 | check=True, 47 | ) 48 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 49 | 50 | def _helper_mobilevit2_image_classification(self, recipe: str): 51 | model_id = "apple/mobilevitv2-1.0-imagenet1k-256" 52 | 53 | config = AutoConfig.from_pretrained(model_id) 54 | batch_size = 1 55 | num_channels = config.num_channels 56 | height = config.image_size 57 | width = config.image_size 58 | pixel_values = torch.rand(batch_size, num_channels, height, width) 59 | 60 | # Test fetching and lowering the model to ExecuTorch 61 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 62 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 63 | self.assertIsInstance(et_model.model, ExecuTorchModule) 64 | 65 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 66 | with torch.no_grad(): 67 | eager_output = eager_model(pixel_values) 68 | et_output = et_model.forward(pixel_values) 69 | 70 | # Compare with eager outputs 71 | self.assertTrue(check_close_recursively(eager_output.logits, et_output, atol=1e-3, rtol=1e-3)) 72 | 73 | @slow 74 | @pytest.mark.run_slow 75 | def test_mobilevit2_image_classification(self): 76 | self._helper_mobilevit2_image_classification(recipe="xnnpack") 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | @pytest.mark.portable 81 | def test_mobilevit2_image_classification_portable(self): 82 | self._helper_mobilevit2_image_classification(recipe="portable") 83 | -------------------------------------------------------------------------------- /tests/models/test_modeling_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | import os 18 | import subprocess 19 | import tempfile 20 | import unittest 21 | 22 | import pytest 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoTokenizer 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForMaskedLM 28 | 29 | 30 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 31 | def __init__(self, *args, **kwargs): 32 | super().__init__(*args, **kwargs) 33 | 34 | @slow 35 | @pytest.mark.run_slow 36 | def test_distilbert_export_to_executorch(self): 37 | model_id = "distilbert/distilbert-base-uncased" 38 | task = "fill-mask" 39 | recipe = "xnnpack" 40 | with tempfile.TemporaryDirectory() as tempdir: 41 | subprocess.run( 42 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 43 | shell=True, 44 | check=True, 45 | ) 46 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 47 | 48 | def _helper_distilbert_fill_mask(self, recipe: str): 49 | model_id = "distilbert/distilbert-base-uncased" 50 | tokenizer = AutoTokenizer.from_pretrained(model_id) 51 | 52 | # Test fetching and lowering the model to ExecuTorch 53 | model = ExecuTorchModelForMaskedLM.from_pretrained(model_id=model_id, recipe=recipe) 54 | self.assertIsInstance(model, ExecuTorchModelForMaskedLM) 55 | self.assertIsInstance(model.model, ExecuTorchModule) 56 | 57 | input_text = f"Paris is the {tokenizer.mask_token} of France." 58 | inputs = tokenizer( 59 | input_text, 60 | return_tensors="pt", 61 | padding="max_length", 62 | max_length=10, 63 | ) 64 | 65 | # Test inference using ExecuTorch model 66 | exported_outputs = model.forward(inputs["input_ids"], inputs["attention_mask"]) 67 | predicted_masks = tokenizer.decode(exported_outputs[0, 4].topk(5).indices) 68 | logging.info(f"\nInput text:\n\t{input_text}\nPredicted masks:\n\t{predicted_masks}") 69 | self.assertTrue( 70 | any(word in predicted_masks for word in ["capital", "center", "heart", "birthplace"]), 71 | f"Exported model predictions {predicted_masks} don't contain any of the most common expected words", 72 | ) 73 | 74 | @slow 75 | @pytest.mark.run_slow 76 | def test_distilbert_fill_mask(self): 77 | self._helper_distilbert_fill_mask(recipe="xnnpack") 78 | 79 | @slow 80 | @pytest.mark.run_slow 81 | @pytest.mark.portable 82 | def test_distilbert_fill_mask_portable(self): 83 | self._helper_distilbert_fill_mask(recipe="portable") 84 | -------------------------------------------------------------------------------- /tests/models/test_modeling_efficientnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import os 17 | import subprocess 18 | import tempfile 19 | import unittest 20 | 21 | import pytest 22 | import torch 23 | from executorch import version 24 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 25 | from transformers import AutoConfig, AutoModelForImageClassification 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForImageClassification 29 | 30 | from ..utils import check_close_recursively 31 | 32 | 33 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 34 | def __init__(self, *args, **kwargs): 35 | super().__init__(*args, **kwargs) 36 | 37 | @slow 38 | @pytest.mark.run_slow 39 | def test_efficientnet_export_to_executorch(self): 40 | model_id = "google/efficientnet-b7" # ~66M params 41 | task = "image-classification" 42 | recipe = "xnnpack" 43 | with tempfile.TemporaryDirectory() as tempdir: 44 | subprocess.run( 45 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 46 | shell=True, 47 | check=True, 48 | ) 49 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 50 | 51 | def _helper_efficientnet_image_classification(self, recipe: str): 52 | model_id = "google/efficientnet-b0" # ~5.3M params 53 | 54 | config = AutoConfig.from_pretrained(model_id) 55 | batch_size = 1 56 | num_channels = config.num_channels 57 | height = config.image_size 58 | width = config.image_size 59 | pixel_values = torch.rand(batch_size, num_channels, height, width) 60 | 61 | # Test fetching and lowering the model to ExecuTorch 62 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 63 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 64 | self.assertIsInstance(et_model.model, ExecuTorchModule) 65 | 66 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 67 | with torch.no_grad(): 68 | eager_output = eager_model(pixel_values) 69 | et_output = et_model.forward(pixel_values) 70 | 71 | # Compare with eager outputs 72 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 73 | 74 | @slow 75 | @pytest.mark.run_slow 76 | @pytest.mark.skipif( 77 | version.__version__ < "0.6.0", 78 | reason="The fix in XNNPACK is cherry-picked in 0.6.0 release", 79 | ) 80 | def test_efficientnet_image_classification(self): 81 | self._helper_efficientnet_image_classification(recipe="xnnpack") 82 | 83 | @slow 84 | @pytest.mark.run_slow 85 | @pytest.mark.portable 86 | def test_efficientnet_image_classification_portable(self): 87 | self._helper_efficientnet_image_classification(recipe="portable") 88 | -------------------------------------------------------------------------------- /tests/models/test_modeling_vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import os 17 | import subprocess 18 | import sys 19 | import tempfile 20 | import unittest 21 | 22 | import pytest 23 | import torch 24 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 25 | from transformers import AutoConfig, AutoModelForImageClassification 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForImageClassification 29 | 30 | from ..utils import check_close_recursively 31 | 32 | 33 | is_not_macos = sys.platform != "darwin" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | def test_vit_export_to_executorch(self): 43 | model_id = "google/vit-base-patch16-224" 44 | task = "image-classification" 45 | recipe = "xnnpack" 46 | with tempfile.TemporaryDirectory() as tempdir: 47 | subprocess.run( 48 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 49 | shell=True, 50 | check=True, 51 | ) 52 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 53 | 54 | def _helper_vit_image_classification(self, recipe: str): 55 | model_id = "google/vit-base-patch16-224" 56 | 57 | config = AutoConfig.from_pretrained(model_id) 58 | batch_size = 1 59 | num_channels = config.num_channels 60 | height = config.image_size 61 | width = config.image_size 62 | pixel_values = torch.rand(batch_size, num_channels, height, width) 63 | 64 | # Test fetching and lowering the model to ExecuTorch 65 | et_model = ExecuTorchModelForImageClassification.from_pretrained(model_id=model_id, recipe=recipe) 66 | self.assertIsInstance(et_model, ExecuTorchModelForImageClassification) 67 | self.assertIsInstance(et_model.model, ExecuTorchModule) 68 | 69 | eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu") 70 | with torch.no_grad(): 71 | eager_output = eager_model(pixel_values) 72 | et_output = et_model.forward(pixel_values) 73 | 74 | # Compare with eager outputs 75 | self.assertTrue(check_close_recursively(eager_output.logits, et_output)) 76 | 77 | @slow 78 | @pytest.mark.run_slow 79 | def test_vit_image_classification(self): 80 | self._helper_vit_image_classification(recipe="xnnpack") 81 | 82 | @slow 83 | @pytest.mark.run_slow 84 | @pytest.mark.portable 85 | def test_vit_image_classification_portable(self): 86 | self._helper_vit_image_classification(recipe="portable") 87 | 88 | @slow 89 | @pytest.mark.run_slow 90 | @pytest.mark.skipif(is_not_macos, reason="Only runs on MacOS") 91 | def test_vit_image_classification_coreml_fp32_cpu(self): 92 | self._helper_vit_image_classification(recipe="coreml_fp32_cpu") 93 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/recipes/portable.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, Union 16 | 17 | from torch.export import ExportedProgram 18 | 19 | from executorch.exir import ( 20 | EdgeCompileConfig, 21 | ExecutorchProgram, 22 | to_edge_transform_and_lower, 23 | ) 24 | from optimum.executorch.passes.remove_padding_idx_embedding_pass import RemovePaddingIdxEmbeddingPass 25 | 26 | from ..integrations import ( 27 | CausalLMExportableModule, 28 | MaskedLMExportableModule, 29 | Seq2SeqLMExportableModule, 30 | ) 31 | from ..recipe_registry import register_recipe 32 | 33 | 34 | @register_recipe("portable") 35 | def export_to_executorch_with_portable( 36 | model: Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule], 37 | **kwargs, 38 | ): 39 | """ 40 | Export a PyTorch model to ExecuTorch with Portable kernels. 41 | 42 | This function also writes metadata required by the ExecuTorch runtime to the model. 43 | 44 | Args: 45 | model (Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule]): 46 | The PyTorch model to be exported to ExecuTorch. 47 | **kwargs: 48 | Additional keyword arguments for recipe-specific configurations, e.g. export using different example inputs, or different compile/backend configs. 49 | 50 | Returns: 51 | Dict[str, ExecutorchProgram]: 52 | A map of exported and optimized program for ExecuTorch. 53 | For encoder-decoder models or multimodal models, it may generate multiple programs. 54 | """ 55 | 56 | def _lower_to_executorch( 57 | exported_programs: Dict[str, ExportedProgram], 58 | metadata=None, 59 | ) -> Dict[str, ExecutorchProgram]: 60 | # If just one exported program, the method name in the .pte for it should be "forward". 61 | if len(exported_programs) == 1: 62 | exported_programs = {"forward": next(iter(exported_programs.values()))} 63 | 64 | et_prog = to_edge_transform_and_lower( 65 | exported_programs, 66 | partitioner=[], 67 | compile_config=EdgeCompileConfig( 68 | _check_ir_validity=False, 69 | _skip_dim_order=True, 70 | ), 71 | constant_methods=metadata, 72 | transform_passes=[RemovePaddingIdxEmbeddingPass()], 73 | ).to_executorch() 74 | pte_name = "model" 75 | return {pte_name: et_prog} 76 | 77 | exported_progs = model.export() 78 | 79 | if ( 80 | model.config._attn_implementation == "custom_sdpa" 81 | or model.config._attn_implementation == "custom_sdpa_ring_kv_cache" 82 | ): 83 | # Sanity check to make sure the exported program contains the custom sdpa operator. 84 | if not any( 85 | node.op == "call_function" and "custom_sdpa" in str(node.target) 86 | for exported_program in exported_progs.values() 87 | for node in exported_program.graph_module.graph.nodes 88 | ): 89 | raise ValueError("'custom_sdpa' not found in the graph.") 90 | 91 | return _lower_to_executorch(exported_progs, model.metadata) 92 | -------------------------------------------------------------------------------- /tests/models/test_modeling_qwen3_embedding.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import unittest 20 | 21 | import pytest 22 | from executorch import version 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from packaging.version import parse 25 | from transformers import AutoTokenizer 26 | from transformers.testing_utils import slow 27 | 28 | from optimum.executorch import ExecuTorchModelForCausalLM 29 | 30 | from ..utils import check_causal_lm_output_quality 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | def test_qwen3_embedding_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 43 | model_id = "Qwen/Qwen3-Embedding-0.6B" 44 | prompt = "Explain gravity" 45 | tokenizer = AutoTokenizer.from_pretrained(model_id) 46 | model = ExecuTorchModelForCausalLM.from_pretrained( 47 | model_id, 48 | task="text-generation", 49 | recipe="xnnpack", 50 | attn_implementation="custom_sdpa", 51 | use_custom_kv_cache=True, 52 | **{"qlinear": "8da4w", "qembedding": "8w"}, 53 | ) 54 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 55 | self.assertIsInstance(model.model, ExecuTorchModule) 56 | generated_text = model.text_generation( 57 | tokenizer=tokenizer, 58 | prompt=prompt, 59 | max_seq_len=64, 60 | ) 61 | logging.info(f"\nGenerated text:\n\t{generated_text}") 62 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 63 | 64 | # Free memory before loading eager for quality check 65 | del model 66 | del tokenizer 67 | gc.collect() 68 | 69 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 70 | 71 | @slow 72 | @pytest.mark.run_slow 73 | @pytest.mark.portable 74 | @pytest.mark.skipif( 75 | parse(version.__version__) < parse("0.7.0"), 76 | reason="Fixed on executorch >= 0.7.0", 77 | ) 78 | def test_qwen3_embedding_text_generation_portable(self): 79 | model_id = "Qwen/Qwen3-Embedding-0.6B" 80 | prompt = "Explain gravity" 81 | tokenizer = AutoTokenizer.from_pretrained(model_id) 82 | model = ExecuTorchModelForCausalLM.from_pretrained(model_id, task="text-generation", recipe="portable") 83 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 84 | self.assertIsInstance(model.model, ExecuTorchModule) 85 | generated_text = model.text_generation( 86 | tokenizer=tokenizer, 87 | prompt=prompt, 88 | max_seq_len=64, 89 | ) 90 | logging.info(f"\nGenerated text:\n\t{generated_text}") 91 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 92 | 93 | # Free memory before loading eager for quality check 94 | del model 95 | del tokenizer 96 | gc.collect() 97 | 98 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 99 | 
-------------------------------------------------------------------------------- /tests/models/test_modeling_smollm3.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import sys 20 | import unittest 21 | 22 | import pytest 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoTokenizer 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForCausalLM 28 | 29 | from ..utils import check_causal_lm_output_quality 30 | 31 | 32 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 33 | is_ci = os.environ.get("GITHUB_ACTIONS") == "true" 34 | is_linux_ci = sys.platform.startswith("linux") and is_ci 35 | 36 | 37 | @pytest.mark.skipif(is_linux_ci, reason="Runner OOM") 38 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 39 | def __init__(self, *args, **kwargs): 40 | super().__init__(*args, **kwargs) 41 | 42 | @slow 43 | @pytest.mark.run_slow 44 | def test_smollm3_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 45 | model_id = "HuggingFaceTB/SmolLM3-3B" 46 | prompt = "Give me a brief explanation of gravity in simple terms." 47 | tokenizer = AutoTokenizer.from_pretrained(model_id) 48 | model = ExecuTorchModelForCausalLM.from_pretrained( 49 | model_id, 50 | recipe="xnnpack", 51 | attn_implementation="custom_sdpa", 52 | use_custom_kv_cache=True, 53 | **{"qlinear": "8da4w", "qembedding": "8w"}, 54 | ) 55 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 56 | self.assertIsInstance(model.model, ExecuTorchModule) 57 | generated_text = model.text_generation( 58 | tokenizer=tokenizer, 59 | prompt=prompt, 60 | max_seq_len=64, 61 | ) 62 | logging.info(f"\nGenerated text:\n\t{generated_text}") 63 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 64 | 65 | # Free memory before loading eager for quality check 66 | del model 67 | del tokenizer 68 | gc.collect() 69 | 70 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 71 | 72 | @slow 73 | @pytest.mark.run_slow 74 | @pytest.mark.portable 75 | @pytest.mark.skipif(is_ci, reason="Runner OOM") 76 | def test_smollm3_text_generation_portable(self): 77 | model_id = "HuggingFaceTB/SmolLM3-3B" 78 | prompt = "Give me a brief explanation of gravity in simple terms." 
79 | tokenizer = AutoTokenizer.from_pretrained(model_id) 80 | model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="portable") 81 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 82 | self.assertIsInstance(model.model, ExecuTorchModule) 83 | generated_text = model.text_generation( 84 | tokenizer=tokenizer, 85 | prompt=prompt, 86 | max_seq_len=64, 87 | ) 88 | logging.info(f"\nGenerated text:\n\t{generated_text}") 89 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 90 | 91 | # Free memory before loading eager for quality check 92 | del model 93 | del tokenizer 94 | gc.collect() 95 | 96 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 97 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/convert.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ExecuTorch model check and export functions.""" 16 | 17 | import logging 18 | import os 19 | from pathlib import Path 20 | from typing import Union 21 | 22 | from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, AttentionMaskInterface 23 | from transformers.modeling_utils import AttentionInterface 24 | 25 | from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward 26 | 27 | from .recipe_registry import discover_recipes, recipe_registry 28 | 29 | 30 | AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward) 31 | AttentionMaskInterface.register("custom_sdpa", ALL_MASK_ATTENTION_FUNCTIONS["sdpa"]) 32 | 33 | 34 | def export_to_executorch( 35 | model, 36 | task: str, 37 | recipe: str, 38 | output_dir: Union[str, Path], 39 | **kwargs, 40 | ): 41 | """ 42 | Export a pre-trained PyTorch model to the ExecuTorch format using a specified recipe. 43 | 44 | This function facilitates the transformation of a PyTorch model into an optimized ExecuTorch program. 45 | 46 | Args: 47 | model (`Union["PreTrainedModel", "TorchExportableModuleWithStaticCache"]`): 48 | A PyTorch model to be exported. This can be a standard HuggingFace `PreTrainedModel` or a wrapped 49 | module like `TorchExportableModuleWithStaticCache` for text generation task. 50 | task (`str`): 51 | The specific task the exported model will perform, e.g., "text-generation". 52 | recipe (`str`): 53 | The recipe to guide the export process, e.g., "xnnpack". Recipes define the optimization and lowering steps. 54 | Will raise an exception if the specified recipe is not registered in the recipe registry. 55 | output_dir (`Union[str, Path]`): 56 | Path to the directory where the resulting ExecuTorch model will be saved. 57 | **kwargs: 58 | Additional configuration options passed to the recipe. 59 | 60 | Returns: 61 | `ExecuTorchProgram`: 62 | The lowered ExecuTorch program object. 
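    Example (a minimal sketch; it assumes the `openai/whisper-tiny` checkpoint is reachable, that the output
    directory already exists, and it drives the export through the ASR task loader defined in `tasks/asr.py`):

        from optimum.exporters.executorch.convert import export_to_executorch
        from optimum.exporters.executorch.tasks.asr import load_seq2seq_speech_model

        # Wrap the model in its task-specific exportable module, then lower it with the "xnnpack" recipe.
        module = load_seq2seq_speech_model("openai/whisper-tiny")
        progs = export_to_executorch(
            model=module,
            task="automatic-speech-recognition",
            recipe="xnnpack",
            output_dir="whisper_executorch",
        )
        # A `model.pte` file is written under `whisper_executorch/`.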
63 | 64 | Notes: 65 | - The function uses a dynamic recipe discovery mechanism to identify and import the specified recipe. 66 | - The exported model is stored in the specified output directory with the fixed filename `model.pte`. 67 | - The resulting ExecuTorch program is serialized and saved to the output directory. 68 | """ 69 | 70 | # Dynamically discover and import registered recipes 71 | discover_recipes() 72 | 73 | # Export and lower the model to ExecuTorch with the recipe 74 | try: 75 | recipe_func = recipe_registry.get(recipe) 76 | except KeyError as e: 77 | raise RuntimeError(f"The recipe '{recipe}' isn't registered. Detailed error: {e}") 78 | 79 | executorch_progs = recipe_func(model, **kwargs) 80 | 81 | for name, prog in executorch_progs.items(): 82 | full_path = os.path.join(f"{output_dir}", f"{name}.pte") 83 | with open(full_path, "wb") as f: 84 | prog.write_to_file(f) 85 | logging.info( 86 | f"Saved exported program to {full_path} ({os.path.getsize(full_path) / (1024 * 1024):.2f} MB)" 87 | ) 88 | prog.write_tensor_data_to_file(output_dir) 89 | 90 | return executorch_progs 91 | -------------------------------------------------------------------------------- /tests/models/test_modeling_granite_speech.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import sys 20 | import unittest 21 | 22 | import pytest 23 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 24 | from transformers import AutoProcessor, AutoTokenizer 25 | from transformers.testing_utils import slow 26 | 27 | from optimum.executorch import ExecuTorchModelForMultiModalToText 28 | 29 | from ..utils import check_multimodal_output_quality 30 | 31 | 32 | is_linux_ci = sys.platform.startswith("linux") and os.environ.get("GITHUB_ACTIONS") == "true" 33 | 34 | logging.basicConfig(level=logging.DEBUG) 35 | 36 | 37 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 38 | def __init__(self, *args, **kwargs): 39 | super().__init__(*args, **kwargs) 40 | 41 | @slow 42 | @pytest.mark.run_slow 43 | @pytest.mark.skipif(is_linux_ci, reason="OOM") 44 | def test_granite_audio_text_to_text_generation_with_custom_sdpa_kv_cache_8da4w_8we_pte(self): 45 | model_id = "ibm-granite/granite-speech-3.3-2b" 46 | tokenizer = AutoTokenizer.from_pretrained(model_id) 47 | processor = AutoProcessor.from_pretrained(model_id) 48 | system_prompt = "Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant" 49 | user_prompt = "<|audio|>can you transcribe the speech into a written format?" 
50 | conversation = [ 51 | {"role": "system", "content": system_prompt}, 52 | {"role": "user", "content": user_prompt}, 53 | { 54 | "role": "user", 55 | "type": "audio", 56 | "content": "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav", 57 | }, 58 | ] 59 | 60 | model = ExecuTorchModelForMultiModalToText.from_pretrained( 61 | model_id, 62 | # "/home/jackzhxng/models/granite/granite_1", 63 | recipe="xnnpack", 64 | attn_implementation="custom_sdpa", 65 | use_custom_kv_cache=True, 66 | **{ 67 | "qlinear": "8da4w", 68 | "qlinear_encoder": "8da4w", 69 | "qembedding": "4w", 70 | "qembedding_group_size": 32, 71 | "task": "multimodal-text-to-text", 72 | }, 73 | ) 74 | self.assertIsInstance(model, ExecuTorchModelForMultiModalToText) 75 | self.assertIsInstance(model.model, ExecuTorchModule) 76 | 77 | generated_text = model.text_generation( 78 | processor=processor, 79 | tokenizer=tokenizer, 80 | input_conversation=conversation, 81 | max_seq_len=64, 82 | ) 83 | logging.info(f"\nGenerated text:\n\t{generated_text}") 84 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 85 | 86 | del model 87 | del tokenizer 88 | gc.collect() 89 | 90 | # Should be something like: 'Certainly! Here's the transcribed written format of Timothy's actions and thoughts: 91 | # After his nap, Timothy leisurely stretched, first one gray velvet foot, then the other. He then slowly rolled, 92 | # indolently, to his plate.' 93 | self.assertTrue("Timothy" in generated_text) 94 | self.assertTrue("nap" in generated_text) 95 | self.assertTrue("stretch" in generated_text) 96 | self.assertTrue( 97 | check_multimodal_output_quality(model_id, generated_tokens, conversation, max_perplexity_threshold=5) 98 | ) 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | -------------------------------------------------------------------------------- /tests/models/test_modeling_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | import os 18 | import subprocess 19 | import tempfile 20 | import unittest 21 | 22 | import pytest 23 | import torchao 24 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 25 | from packaging.version import parse 26 | from transformers import AutoTokenizer 27 | from transformers.testing_utils import slow 28 | 29 | from optimum.executorch import ExecuTorchModelForMaskedLM 30 | 31 | 32 | @pytest.mark.skipif( 33 | parse(torchao.__version__) < parse("0.11.0.dev0"), 34 | reason="Only available on torchao >= 0.11.0.dev0", 35 | ) 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | @slow 41 | @pytest.mark.run_slow 42 | def test_bert_export_to_executorch(self): 43 | model_id = "google-bert/bert-base-uncased" 44 | task = "fill-mask" 45 | recipe = "xnnpack" 46 | with tempfile.TemporaryDirectory() as tempdir: 47 | subprocess.run( 48 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 49 | shell=True, 50 | check=True, 51 | ) 52 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 53 | 54 | @slow 55 | @pytest.mark.run_slow 56 | def test_bert_export_to_executorch_quantized(self): 57 | model_id = "google-bert/bert-base-uncased" 58 | task = "fill-mask" 59 | recipe = "xnnpack" 60 | with tempfile.TemporaryDirectory() as tempdir: 61 | subprocess.run( 62 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --qlinear 8da4w --output_dir {tempdir}/executorch", 63 | shell=True, 64 | check=True, 65 | ) 66 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 67 | 68 | def _helper_bert_fill_mask(self, recipe: str): 69 | model_id = "google-bert/bert-base-uncased" 70 | tokenizer = AutoTokenizer.from_pretrained(model_id) 71 | 72 | # Test fetching and lowering the model to ExecuTorch 73 | model = ExecuTorchModelForMaskedLM.from_pretrained(model_id=model_id, recipe=recipe) 74 | self.assertIsInstance(model, ExecuTorchModelForMaskedLM) 75 | self.assertIsInstance(model.model, ExecuTorchModule) 76 | 77 | input_text = f"Paris is the {tokenizer.mask_token} of France." 
78 | inputs = tokenizer( 79 | input_text, 80 | return_tensors="pt", 81 | padding="max_length", 82 | max_length=10, 83 | ) 84 | 85 | # Test inference using ExecuTorch model 86 | exported_outputs = model.forward(inputs["input_ids"], inputs["attention_mask"]) 87 | predicted_masks = tokenizer.decode(exported_outputs[0, 4].topk(5).indices) 88 | logging.info(f"\nInput text:\n\t{input_text}\nPredicted masks:\n\t{predicted_masks}") 89 | self.assertTrue( 90 | any(word in predicted_masks for word in ["capital", "center", "heart", "birthplace"]), 91 | f"Exported model predictions {predicted_masks} don't contain any of the most common expected words", 92 | ) 93 | 94 | @slow 95 | @pytest.mark.run_slow 96 | def test_bert_fill_mask(self): 97 | self._helper_bert_fill_mask("xnnpack") 98 | 99 | @slow 100 | @pytest.mark.run_slow 101 | @pytest.mark.portable 102 | def test_bert_fill_mask_portable(self): 103 | self._helper_bert_fill_mask("portable") 104 | -------------------------------------------------------------------------------- /tests/models/test_modeling_qwen2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import gc 17 | import logging 18 | import os 19 | import subprocess 20 | import tempfile 21 | import unittest 22 | 23 | import pytest 24 | from executorch import version 25 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 26 | from packaging.version import parse 27 | from transformers import AutoTokenizer 28 | from transformers.testing_utils import slow 29 | 30 | from optimum.executorch import ExecuTorchModelForCausalLM 31 | 32 | from ..utils import check_causal_lm_output_quality 33 | 34 | 35 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 36 | 37 | 38 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 39 | def __init__(self, *args, **kwargs): 40 | super().__init__(*args, **kwargs) 41 | 42 | @slow 43 | @pytest.mark.run_slow 44 | def test_qwen2_5_export_to_executorch(self): 45 | model_id = "Qwen/Qwen2.5-0.5B" 46 | task = "text-generation" 47 | recipe = "xnnpack" 48 | with tempfile.TemporaryDirectory() as tempdir: 49 | subprocess.run( 50 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 51 | shell=True, 52 | check=True, 53 | ) 54 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 55 | 56 | def _helper_qwen2_5_text_generation(self, recipe: str): 57 | model_id = "Qwen/Qwen2.5-0.5B" 58 | model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe=recipe) 59 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 60 | self.assertIsInstance(model.model, ExecuTorchModule) 61 | 62 | tokenizer = AutoTokenizer.from_pretrained(model_id) 63 | generated_text = model.text_generation( 64 | tokenizer=tokenizer, 65 | prompt="My favourite condiment is ", 66 | max_seq_len=32, 67 | ) 68 | logging.info(f"\nGenerated text:\n\t{generated_text}") 69 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 70 | 71 | # Free memory before loading eager for quality check 72 | del model 73 | del tokenizer 74 | gc.collect() 75 | 76 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 77 | 78 | @slow 79 | @pytest.mark.run_slow 80 | def test_qwen2_5_text_generation(self): 81 | self._helper_qwen2_5_text_generation(recipe="xnnpack") 82 | 83 | @slow 84 | @pytest.mark.run_slow 85 | @pytest.mark.portable 86 | @pytest.mark.skipif( 87 | parse(version.__version__) < parse("0.7.0"), 88 | reason="Fixed on executorch >= 0.7.0", 89 | ) 90 | def test_qwen2_5_text_generation_portable(self): 91 | self._helper_qwen2_5_text_generation(recipe="portable") 92 | 93 | @slow 94 | @pytest.mark.run_slow 95 | def test_qwen2_5_text_generation_with_custom_sdpa(self): 96 | model_id = "Qwen/Qwen2.5-0.5B" 97 | prompt = "My favourite condiment is " 98 | max_seq_len = 32 99 | tokenizer = AutoTokenizer.from_pretrained(model_id) 100 | 101 | # ExecuTorch model + custom sdpa 102 | model = ExecuTorchModelForCausalLM.from_pretrained( 103 | model_id, 104 | recipe="xnnpack", 105 | attn_implementation="custom_sdpa", 106 | ) 107 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 108 | self.assertIsInstance(model.model, ExecuTorchModule) 109 | generated_text = model.text_generation( 110 | tokenizer=tokenizer, 111 | prompt=prompt, 112 | max_seq_len=max_seq_len, 113 | ) 114 | logging.info(f"\nGenerated text:\n\t{generated_text}") 115 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 116 | 117 | # Free memory before loading eager for quality check 118 | del model 119 | del tokenizer 120 | gc.collect() 121 | 122 | 
self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 123 | -------------------------------------------------------------------------------- /tests/models/test_modeling_gemma.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import subprocess 20 | import tempfile 21 | import unittest 22 | 23 | import pytest 24 | import torchao 25 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 26 | from packaging.version import parse 27 | from transformers import AutoTokenizer 28 | from transformers.testing_utils import slow 29 | 30 | from optimum.executorch import ExecuTorchModelForCausalLM 31 | 32 | 33 | is_ci = os.environ.get("GITHUB_ACTIONS") == "true" 34 | 35 | 36 | @pytest.mark.skipif( 37 | parse(torchao.__version__) < parse("0.11.0.dev0"), 38 | reason="Only available on torchao >= 0.11.0.dev0", 39 | ) 40 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 41 | def __init__(self, *args, **kwargs): 42 | super().__init__(*args, **kwargs) 43 | 44 | @slow 45 | @pytest.mark.run_slow 46 | def test_gemma_export_to_executorch(self): 47 | model_id = "weqweasdas/RM-Gemma-2B" 48 | task = "text-generation" 49 | recipe = "xnnpack" 50 | with tempfile.TemporaryDirectory() as tempdir: 51 | out_dir = f"{tempdir}/executorch" 52 | subprocess.run( 53 | f"optimum-cli export executorch \ 54 | --model {model_id} \ 55 | --task {task} \ 56 | --recipe {recipe} \ 57 | --output_dir {tempdir}/executorch \ 58 | --use_custom_sdpa \ 59 | --qlinear 8da4w \ 60 | --qembedding 8w", 61 | shell=True, 62 | check=True, 63 | ) 64 | pte_full_path = f"{out_dir}/model.pte" 65 | self.assertTrue(os.path.exists(pte_full_path)) 66 | 67 | # Explicitly delete the PTE file to free up disk space 68 | if os.path.exists(pte_full_path): 69 | os.remove(pte_full_path) 70 | gc.collect() 71 | 72 | @slow 73 | @pytest.mark.run_slow 74 | def test_gemma_text_generation_with_custom_sdpa_8da4w_8we(self): 75 | # TODO: Switch to use google/gemma-2b once https://github.com/huggingface/optimum/issues/2127 is fixed 76 | # model_id = "google/gemma-2b" 77 | model_id = "weqweasdas/RM-Gemma-2B" 78 | # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization 79 | kwargs = {"qlinear": "8da4w", "qembedding": "8w"} 80 | model = ExecuTorchModelForCausalLM.from_pretrained( 81 | model_id, 82 | task="text-generation", 83 | recipe="xnnpack", 84 | attn_implementation="custom_sdpa", 85 | **kwargs, 86 | ) 87 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 88 | self.assertIsInstance(model.model, ExecuTorchModule) 89 | 90 | tokenizer = AutoTokenizer.from_pretrained(model_id) 91 | generated_text = model.text_generation( 92 | tokenizer=tokenizer, 93 | prompt="Hello I am doing", 94 | max_seq_len=21, 95 | ) 96 | 
logging.info(f"\nGenerated text:\n\t{generated_text}") 97 | 98 | @slow 99 | @pytest.mark.run_slow 100 | @pytest.mark.portable 101 | @pytest.mark.skipif(is_ci, reason="Too big for CI runners") 102 | def test_gemma_text_generation_portable(self): 103 | # TODO: Switch to use google/gemma-2b once https://github.com/huggingface/optimum/issues/2127 is fixed 104 | # model_id = "google/gemma-2b" 105 | model_id = "weqweasdas/RM-Gemma-2B" 106 | model = ExecuTorchModelForCausalLM.from_pretrained(model_id, task="text-generation", recipe="portable") 107 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 108 | self.assertIsInstance(model.model, ExecuTorchModule) 109 | 110 | tokenizer = AutoTokenizer.from_pretrained(model_id) 111 | generated_text = model.text_generation( 112 | tokenizer=tokenizer, 113 | prompt="Hello I am doing", 114 | max_seq_len=21, 115 | ) 116 | logging.info(f"\nGenerated text:\n\t{generated_text}") 117 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/tasks/asr.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import torchao 15 | from transformers import AutoModelForSpeechSeq2Seq 16 | 17 | from ..integrations import Seq2SeqLMExportableModule 18 | from ..quantization import quantize_model_ 19 | from ..task_registry import register_task 20 | 21 | 22 | # NOTE: It’s important to map the registered task name to the pipeline name in https://github.com/huggingface/transformers/blob/main/utils/update_metadata.py. 23 | # This will streamline using inferred task names and make exporting models to Hugging Face pipelines easier. 24 | @register_task("automatic-speech-recognition") 25 | def load_seq2seq_speech_model(model_name_or_path: str, **kwargs) -> Seq2SeqLMExportableModule: 26 | """ 27 | Loads a model for speech seq2seq and registers it under the task 28 | 'automatic-speech-recognition' using Hugging Face's `AutoModelForSpeechSeq2Seq`. 29 | 30 | Args: 31 | model_name_or_path (str): 32 | Model ID on huggingface.co or path on disk to the model repository to export. For example: 33 | `model_name_or_path="openai/whisper-tiny"` or `model_name_or_path="/path/to/model_folder"` 34 | **kwargs: 35 | Additional configuration options for the model: 36 | - dtype (str, optional): 37 | Data type for model weights (default: "float32"). 38 | Options include "float16" and "bfloat16". 39 | - max_hidden_seq_len (int, optional): 40 | Maximum hidden sequence length (default: 4096). 41 | - max_seq_len (int, optional): 42 | Maximum sequence length for generation (default: 1024). 43 | 44 | Returns: 45 | Seq2SeqLMExportableModule: 46 | An instance of `Seq2SeqLMExportableModule` for exporting and lowering to ExecuTorch. 
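        Example (a minimal sketch; assumes the `openai/whisper-tiny` checkpoint is available and that the
        torchao configs named below are supported in your environment):

            from optimum.exporters.executorch.tasks.asr import load_seq2seq_speech_model

            # Load Whisper-tiny and quantize decoder and encoder linear layers to 8da4w before export.
            module = load_seq2seq_speech_model(
                "openai/whisper-tiny",
                dtype="float32",
                max_seq_len=1024,
                qlinear="8da4w",
                qlinear_encoder="8da4w",
            )
            exported_programs = module.export()  # dict mapping program names to torch.export.ExportedProgram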
47 | """ 48 | device = kwargs.get("device", "cpu") 49 | batch_size = 1 50 | max_hidden_seq_len = kwargs.get("max_hidden_seq_len", 4096) 51 | max_seq_len = kwargs.get("max_seq_len", 1024) 52 | dtype = kwargs.get("dtype", "float32") 53 | 54 | full_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, dtype=dtype, device_map=device).eval() 55 | 56 | for param in full_model.parameters(): 57 | if isinstance(param, torchao.utils.TorchAOBaseTensor): 58 | param.requires_grad = False 59 | 60 | qlinear_config = kwargs.get("qlinear", None) 61 | qlinear_group_size = kwargs.get("qlinear_group_size", None) 62 | qlinear_packing_format = kwargs.get("qlinear_packing_format", None) 63 | qlinear_encoder_config = kwargs.get("qlinear_encoder", None) 64 | qlinear_encoder_group_size = kwargs.get("qlinear_encoder_group_size", None) 65 | qlinear_encoder_packing_format = kwargs.get("qlinear_encoder_packing_format", None) 66 | qembedding_config = kwargs.get("qembedding", None) 67 | qembedding_group_size = kwargs.get("qembedding_group_size", None) 68 | 69 | # Quantize decoder linear weights. 70 | quantize_decoder_kwargs = { 71 | "eager_model": getattr(full_model.model, "decoder"), 72 | "qlinear_config": qlinear_config, 73 | } 74 | if qlinear_group_size is not None: 75 | quantize_decoder_kwargs["qlinear_group_size"] = qlinear_group_size 76 | if qlinear_packing_format is not None: 77 | quantize_decoder_kwargs["qlinear_packing_format"] = qlinear_packing_format 78 | quantize_model_(**quantize_decoder_kwargs) 79 | 80 | # Quantize encoder linear weights. 81 | quantize_encoder_kwargs = { 82 | "eager_model": getattr(full_model.model, "encoder"), 83 | "qlinear_config": qlinear_encoder_config, 84 | } 85 | if qlinear_encoder_group_size is not None: 86 | quantize_encoder_kwargs["qlinear_group_size"] = qlinear_encoder_group_size 87 | if qlinear_encoder_packing_format is not None: 88 | quantize_encoder_kwargs["qlinear_packing_format"] = qlinear_encoder_packing_format 89 | quantize_model_(**quantize_encoder_kwargs) 90 | 91 | # Quantize decoder embeddings. 92 | quantize_decoder_embedding_kwargs = { 93 | "eager_model": full_model, 94 | "qembedding_config": qembedding_config, 95 | } 96 | if qembedding_group_size is not None: 97 | quantize_decoder_embedding_kwargs["qembedding_group_size"] = qembedding_group_size 98 | quantize_model_(**quantize_decoder_embedding_kwargs) 99 | 100 | return Seq2SeqLMExportableModule( 101 | full_model, 102 | batch_size=batch_size, 103 | max_seq_len=max_seq_len, 104 | max_hidden_seq_len=max_hidden_seq_len, 105 | ) 106 | -------------------------------------------------------------------------------- /CONTRIBUTING.MD: -------------------------------------------------------------------------------- 1 | Thank you for your interest in contributing to Optimum ExecuTorch! 2 | 3 | ## Developing Optimum ExecuTorch 4 | 5 | ### Setting up the development environment 6 | To install Optimum ExecuTorch for development: 7 | ``` 8 | python install_dev.py 9 | ``` 10 | 11 | ### Testing local changes 12 | Optimum ExecuTorch does not have an editable install at the moment, so to test your local changes, you will need to reinstall. 13 | To prevent the reinstall from overwriting other dependencies, some of which you may have modified, you can run the following ahead of your test: 14 | ``` 15 | pip install --no-deps --no-build-isolation . 16 | ``` 17 | 18 | An example command for testing local changes to Gemma3: 19 | ``` 20 | pip install --no-deps --no-build-isolation . 
21 | RUN_SLOW=1 python -m pytest tests/models/test_modeling_gemma3.py -s -k test_gemma3_image_vision_with_custom_sdpa_kv_cache_8da4w_8we --log-cli-level=INFO 22 | ``` 23 | 24 | To run tests marked with `@slow`, just set `RUN_SLOW=1`. 25 | 26 | ## Enabling a new model on Optimum 27 | 28 | Our design philosophy is to have as little model-specific code as possible, which means all optimizations, export code, etc. are model-agnostic. 29 | This allows us to theoretically export any new model straight from the source, with a few caveats which will be explained later. 30 | For example, most Large Language Models should be able to be exported using this library. 31 | 32 | ### 💡 How to "enable" a model on Optimum 33 | ❓ Currently, the [homepage README](README.md?tab=readme-ov-file#-supported-models) lists all of the "supported" models. What does this mean, and what about models not on this list? 34 | 35 | 👉 These supported models all have a test file associated with them, such as [Gemma3](https://github.com/huggingface/optimum-executorch/blob/main/tests/models/test_modeling_gemma3.py), which has been used to validate the model end-to-end (export + running a generation loop on the exported artifact). 36 | The test file is then used in CI to guard against potential regressions. 37 | Once you have a PR up for adding the test to the repo, feel free to edit the homepage README to include the new model. 38 | 39 | As an example, in the Gemma3 test file, we have validated that the model is able to export and returns correct output to a test prompt for different export configurations - now other users will know that Gemma3 works and are able to export the model like so: 40 | ``` 41 | optimum-cli export executorch \ 42 | --model google/gemma-3-1b-it \ 43 | --task text-generation \ 44 | --recipe xnnpack \ 45 | --use_custom_sdpa \ 46 | --use_custom_kv_cache \ 47 | --qlinear 8da4w \ 48 | --qembedding 8w 49 | ``` 50 | 51 | However, there are many models without test files in Optimum that probably still work - just that no one has gone through the trouble of validating them. 52 | This is where you come in - feel free to contribute if there is a model you are interested in that does not yet have a test file! 53 | 54 | If you run into any issues, they will most likely stem from the following: 55 | - ❓ How much model-specific code is in Transformers for this model? 56 | - ❓ Do we already have the model type supported in Optimum? 57 | - ❓ Is the model itself torch.exportable? 58 | 59 | ### ❌ Model-specific code is in Transformers 60 | To address this issue, we will need to upstream changes to the Transformers library, or update our code to match. 61 | For instance, if hypothetically Transformers introduced a new type of cache, and this cache is used in a new LLM, we would need to handle this new cache type in Optimum. 62 | Or, hypothetically if we are expecting a certain attribute in a Transformers model and it exists instead with a slightly different name, this may be an opportunity to upstream some naming standardization changes to Transformers. 63 | [Here](https://github.com/huggingface/transformers/pull/40919) is an example of one such standardization. 
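Whichever of the issues above you run into, the end goal is the same: a test file that exercises export and generation for the new model. The sketch below is a minimal starting point (the model id and prompt are placeholders, and the right `ExecuTorchModelFor*` class depends on the task; mirror an existing file under `tests/models/` for the real thing):
```
# tests/models/test_modeling_mynewmodel.py (illustrative skeleton)
import unittest

import pytest
from executorch.extension.pybindings.portable_lib import ExecuTorchModule
from transformers import AutoTokenizer
from transformers.testing_utils import slow

from optimum.executorch import ExecuTorchModelForCausalLM


class ExecuTorchModelIntegrationTest(unittest.TestCase):
    @slow
    @pytest.mark.run_slow
    def test_mynewmodel_text_generation(self):
        model_id = "my-org/my-new-model"  # placeholder
        # Export + lower the model to ExecuTorch, then sanity-check the artifact type.
        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack")
        self.assertIsInstance(model, ExecuTorchModelForCausalLM)
        self.assertIsInstance(model.model, ExecuTorchModule)

        # Run a short generation loop on the exported artifact.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        generated_text = model.text_generation(
            tokenizer=tokenizer,
            prompt="My favourite condiment is ",
            max_seq_len=32,
        )
        self.assertTrue(len(generated_text) > 0)
```
Run it the same way as the Gemma3 example above, e.g. `RUN_SLOW=1 python -m pytest tests/models/test_modeling_mynewmodel.py -s --log-cli-level=INFO`.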
64 | 65 | ### ❌ Model type is not supported in Optimum 66 | All of the supported model types are in [integrations.py](https://github.com/huggingface/optimum-executorch/blob/main/optimum/exporters/executorch/integrations.py), which contains wrapper classes that facilitate torch.exporting a model: 67 | - `CausalLMExportableModule` - LLMs (Large Language Models) 68 | - `MultiModalTextToTextExportableModule` - Multimodal LLMs (Large Language Models with support for audio/image input) 69 | - `VisionEncoderExportableModule` - Vision Encoder backbones (such as DiT or MobileViT) 70 | - `MaskedLMExportableModule` - Masked language models (for predicting masked tokens) 71 | - `Seq2SeqLMExportableModule` - General Seq2Seq encoder-decoder models (such as T5 and Whisper) 72 | 73 | This is where most of the complexity around "enabling" a model on Optimum comes from, since after torch.export() every model follows the same per-backend flow for transforming the torch.export() artifact into an ExecuTorch `.pte` artifact. 74 | If the model type doesn't exist in Optimum then we will need to write a new class for it. 75 | 76 | ### ❌ Model is not torch.exportable 77 | To address this issue, we will need to upstream changes to the model's modeling file in Transformers to make the model exportable. 78 | After doing this, it's a good idea to add a torch.export test to guard against future regressions (which tend to happen frequently since Transformers moves fast). 79 | [Here](https://github.com/huggingface/transformers/blob/87f38dbfcec48027d4bf2ea7ec8b8eecd5a7bc85/tests/models/smollm3/test_modeling_smollm3.py#L175) is an example. 80 | -------------------------------------------------------------------------------- /tests/models/test_modeling_gemma2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import gc 17 | import logging 18 | import os 19 | import subprocess 20 | import sys 21 | import tempfile 22 | import unittest 23 | 24 | import pytest 25 | import torchao 26 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 27 | from packaging.version import parse 28 | from transformers import AutoTokenizer 29 | from transformers.testing_utils import slow 30 | 31 | from optimum.executorch import ExecuTorchModelForCausalLM 32 | 33 | from ..utils import check_causal_lm_output_quality 34 | 35 | 36 | is_ci = os.environ.get("GITHUB_ACTIONS") == "true" 37 | is_linux_ci = sys.platform.startswith("linux") and is_ci 38 | 39 | 40 | @pytest.mark.skipif( 41 | parse(torchao.__version__) < parse("0.11.0.dev0"), 42 | reason="Only available on torchao >= 0.11.0.dev0", 43 | ) 44 | @pytest.mark.skipif(is_linux_ci, reason="OOM on linux runner") 45 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 46 | def __init__(self, *args, **kwargs): 47 | super().__init__(*args, **kwargs) 48 | 49 | @slow 50 | @pytest.mark.run_slow 51 | def test_gemma2_export_to_executorch(self): 52 | model_id = "unsloth/gemma-2-2b-it" 53 | task = "text-generation" 54 | recipe = "xnnpack" 55 | with tempfile.TemporaryDirectory() as tempdir: 56 | out_dir = f"{tempdir}/executorch" 57 | subprocess.run( 58 | f"optimum-cli export executorch \ 59 | --model {model_id} \ 60 | --task {task} \ 61 | --recipe {recipe} \ 62 | --output_dir {tempdir}/executorch \ 63 | --use_custom_sdpa \ 64 | --qlinear 8da4w \ 65 | --qembedding 8w", 66 | shell=True, 67 | check=True, 68 | ) 69 | pte_full_path = f"{out_dir}/model.pte" 70 | self.assertTrue(os.path.exists(pte_full_path)) 71 | 72 | # Explicitly delete the PTE file to free up disk space 73 | if os.path.exists(pte_full_path): 74 | os.remove(pte_full_path) 75 | gc.collect() 76 | 77 | @slow 78 | @pytest.mark.run_slow 79 | def test_gemma2_text_generation_with_custom_sdpa_8da4w_8we(self): 80 | # TODO: Switch to use google/gemma-2-2b once https://github.com/huggingface/optimum/issues/2127 is fixed 81 | # model_id = "google/gemma-2-2b" 82 | model_id = "unsloth/gemma-2-2b-it" 83 | # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization 84 | kwargs = {"qlinear": "8da4w", "qembedding": "8w"} 85 | model = ExecuTorchModelForCausalLM.from_pretrained( 86 | model_id, 87 | recipe="xnnpack", 88 | attn_implementation="custom_sdpa", 89 | **kwargs, 90 | ) 91 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 92 | self.assertIsInstance(model.model, ExecuTorchModule) 93 | 94 | tokenizer = AutoTokenizer.from_pretrained(model_id) 95 | generated_text = model.text_generation( 96 | tokenizer=tokenizer, 97 | prompt="Hello I am doing a project", 98 | max_seq_len=12, 99 | ) 100 | logging.info(f"\nGenerated text:\n\t{generated_text}") 101 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 102 | 103 | # Free memory before loading eager for quality check 104 | del model 105 | del tokenizer 106 | gc.collect() 107 | 108 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 109 | 110 | @slow 111 | @pytest.mark.run_slow 112 | @pytest.mark.portable 113 | @pytest.mark.skipif(is_ci, reason="Too big for CI runners") 114 | def test_gemma2_text_generation_portable(self): 115 | # TODO: Switch to use google/gemma-2-2b once https://github.com/huggingface/optimum/issues/2127 is fixed 116 | # model_id = "google/gemma-2-2b" 117 | model_id = "unsloth/gemma-2-2b-it" 118 | model = 
ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="portable") 119 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 120 | self.assertIsInstance(model.model, ExecuTorchModule) 121 | 122 | tokenizer = AutoTokenizer.from_pretrained(model_id) 123 | generated_text = model.text_generation( 124 | tokenizer=tokenizer, 125 | prompt="Hello I am doing a project", 126 | max_seq_len=12, 127 | ) 128 | logging.info(f"\nGenerated text:\n\t{generated_text}") 129 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 130 | 131 | # Free memory before loading eager for quality check 132 | del model 133 | del tokenizer 134 | gc.collect() 135 | 136 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 137 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/recipes/xnnpack.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from typing import Dict, Union 17 | 18 | from packaging.version import parse 19 | from tabulate import tabulate 20 | from torch import __version__ as torch_version 21 | from torch.export import ExportedProgram 22 | from torchao.utils import unwrap_tensor_subclass 23 | 24 | from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner 25 | from executorch.devtools.backend_debug import get_delegation_info 26 | from executorch.exir import ( 27 | EdgeCompileConfig, 28 | ExecutorchBackendConfig, 29 | ExecutorchProgram, 30 | to_edge_transform_and_lower, 31 | ) 32 | from executorch.exir.passes import MemoryPlanningPass 33 | from optimum.executorch.passes.remove_padding_idx_embedding_pass import RemovePaddingIdxEmbeddingPass 34 | 35 | from ..integrations import ( 36 | CausalLMExportableModule, 37 | MaskedLMExportableModule, 38 | MultiModalTextToTextExportableModule, 39 | Seq2SeqLMExportableModule, 40 | ) 41 | from ..recipe_registry import register_recipe 42 | 43 | 44 | @register_recipe("xnnpack") 45 | def export_to_executorch_with_xnnpack( 46 | model: Union[ 47 | CausalLMExportableModule, 48 | MaskedLMExportableModule, 49 | Seq2SeqLMExportableModule, 50 | MultiModalTextToTextExportableModule, 51 | ], 52 | **kwargs, 53 | ): 54 | """ 55 | Export a PyTorch model to ExecuTorch w/ delegation to XNNPACK backend. 56 | 57 | This function also write metadata required by the ExecuTorch runtime to the model. 58 | 59 | Args: 60 | model (Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule, MultiModalTextToTextExportableModule]): 61 | The PyTorch model to be exported to ExecuTorch. 62 | **kwargs: 63 | Additional keyword arguments for recipe-specific configurations, e.g. export using different example inputs, or different compile/bechend configs. 
64 | 65 | Returns: 66 | Dict[str, ExecutorchProgram]: 67 | A map of exported and optimized program for ExecuTorch. 68 | For encoder-decoder models or multimodal models, it may generate multiple programs. 69 | """ 70 | 71 | def _lower_to_executorch( 72 | exported_programs: Dict[str, ExportedProgram], 73 | metadata=None, 74 | ) -> Dict[str, ExecutorchProgram]: 75 | backend_config_dict = { 76 | "extract_delegate_segments": True, 77 | "memory_planning_pass": MemoryPlanningPass(alloc_graph_input=False), 78 | } 79 | backend_config_dict["do_quant_fusion_and_const_prop"] = True 80 | logging.debug(f"\nExported program: {exported_programs}") 81 | 82 | # If just one exported program, the method name in the .pte for it should be "forward". 83 | if len(exported_programs) == 1: 84 | exported_programs = {"forward": next(iter(exported_programs.values()))} 85 | 86 | et_prog = to_edge_transform_and_lower( 87 | exported_programs, 88 | partitioner=[XnnpackPartitioner()], 89 | compile_config=EdgeCompileConfig( 90 | _check_ir_validity=False, 91 | _skip_dim_order=True, 92 | ), 93 | constant_methods=metadata, 94 | transform_passes=[RemovePaddingIdxEmbeddingPass()], 95 | ) 96 | et_prog = et_prog.to_executorch( 97 | config=ExecutorchBackendConfig(**backend_config_dict), 98 | ) 99 | pte_name = "model" 100 | for method in et_prog.methods: 101 | logging.debug(f"---------------------- Method: {method} ----------------------") 102 | logging.debug(f"\nExecuTorch program for {pte_name}.pte: {et_prog.exported_program(method).graph_module}") 103 | delegation_info = get_delegation_info(et_prog.exported_program(method).graph_module) 104 | logging.debug(f"\nDelegation info Summary for {pte_name}.pte: {delegation_info.get_summary()}") 105 | logging.debug( 106 | f"\nDelegation info for {pte_name}.pte: {tabulate(delegation_info.get_operator_delegation_dataframe(), headers='keys', tablefmt='fancy_grid')}" 107 | ) 108 | return {pte_name: et_prog} 109 | 110 | # TODO: remove after ExecuTorch dep on Torch >= 2.10.0. 111 | if parse(torch_version) < parse("2.10.0.dev20251104"): 112 | model = unwrap_tensor_subclass(model) 113 | exported_progs = model.export() 114 | 115 | if ( 116 | model.config._attn_implementation == "custom_sdpa" 117 | or model.config._attn_implementation == "custom_sdpa_ring_kv_cache" 118 | ): 119 | # Sanity check to make sure the exported program contains the custom sdpa operator. 120 | if not any( 121 | node.op == "call_function" and "custom_sdpa" in str(node.target) 122 | for exported_program in exported_progs.values() 123 | for node in exported_program.graph_module.graph.nodes 124 | ): 125 | raise ValueError("'custom_sdpa' not found in the graph.") 126 | 127 | return _lower_to_executorch(exported_progs, model.metadata) 128 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/recipes/metal.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from typing import Dict, Union 17 | 18 | from packaging.version import parse 19 | 20 | from executorch import version as executorch_version 21 | 22 | 23 | EXECUTORCH_VERSION = parse(executorch_version.__version__) 24 | METAL_BACKEND_AVAILABLE = EXECUTORCH_VERSION >= parse("1.1.0.dev20251017") 25 | 26 | if METAL_BACKEND_AVAILABLE: 27 | try: 28 | from executorch.backends.apple.metal.metal_backend import MetalBackend 29 | from executorch.backends.apple.metal.metal_partitioner import MetalPartitioner 30 | except ImportError: 31 | METAL_BACKEND_AVAILABLE = False 32 | 33 | if METAL_BACKEND_AVAILABLE: 34 | from tabulate import tabulate 35 | from torch.export import ExportedProgram 36 | 37 | from executorch.backends.apple.metal.metal_backend import MetalBackend 38 | from executorch.backends.apple.metal.metal_partitioner import MetalPartitioner 39 | from executorch.devtools.backend_debug import get_delegation_info 40 | from executorch.exir import ( 41 | EdgeCompileConfig, 42 | ExecutorchProgram, 43 | to_edge_transform_and_lower, 44 | ) 45 | from optimum.executorch.passes.remove_padding_idx_embedding_pass import RemovePaddingIdxEmbeddingPass 46 | 47 | from ..integrations import ( 48 | CausalLMExportableModule, 49 | MaskedLMExportableModule, 50 | MultiModalTextToTextExportableModule, 51 | Seq2SeqLMExportableModule, 52 | ) 53 | from ..recipe_registry import register_recipe 54 | 55 | @register_recipe("metal") 56 | def export_to_executorch_with_metal( 57 | model: Union[ 58 | CausalLMExportableModule, 59 | MaskedLMExportableModule, 60 | Seq2SeqLMExportableModule, 61 | MultiModalTextToTextExportableModule, 62 | ], 63 | **kwargs, 64 | ): 65 | """ 66 | Export a PyTorch model to ExecuTorch w/ delegation to Metal backend. 67 | 68 | This function also write metadata required by the ExecuTorch runtime to the model. 69 | 70 | Args: 71 | model (Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule, MultiModalTextToTextExportableModule]): 72 | The PyTorch model to be exported to ExecuTorch. 73 | **kwargs: 74 | Additional keyword arguments for recipe-specific configurations, e.g. export using different example inputs, or different compile/bechend configs. 75 | 76 | Returns: 77 | Dict[str, ExecutorchProgram]: 78 | A map of exported and optimized program for ExecuTorch. 79 | For encoder-decoder models or multimodal models, it may generate multiple programs. 80 | """ 81 | 82 | def _lower_to_executorch( 83 | exported_programs: Dict[str, ExportedProgram], 84 | metadata=None, 85 | ) -> Dict[str, ExecutorchProgram]: 86 | logging.debug(f"\nExported program: {exported_programs}") 87 | 88 | # If just one exported program, the method name in the .pte for it should be "forward". 
89 | if len(exported_programs) == 1: 90 | exported_programs = {"forward": next(iter(exported_programs.values()))} 91 | 92 | partitioners = { 93 | key: [MetalPartitioner([MetalBackend.generate_method_name_compile_spec(key)])] 94 | for key in exported_programs.keys() 95 | } 96 | 97 | et_prog = to_edge_transform_and_lower( 98 | exported_programs, 99 | partitioner=partitioners, 100 | compile_config=EdgeCompileConfig( 101 | _check_ir_validity=False, 102 | _skip_dim_order=True, 103 | ), 104 | constant_methods=metadata, 105 | transform_passes=[RemovePaddingIdxEmbeddingPass()], 106 | ) 107 | et_prog = et_prog.to_executorch() 108 | pte_name = "model" 109 | for method in et_prog.methods: 110 | logging.debug(f"---------------------- Method: {method} ----------------------") 111 | logging.debug( 112 | f"\nExecuTorch program for {pte_name}.pte: {et_prog.exported_program(method).graph_module}" 113 | ) 114 | delegation_info = get_delegation_info(et_prog.exported_program(method).graph_module) 115 | logging.debug(f"\nDelegation info Summary for {pte_name}.pte: {delegation_info.get_summary()}") 116 | logging.debug( 117 | f"\nDelegation info for {pte_name}.pte: {tabulate(delegation_info.get_operator_delegation_dataframe(), headers='keys', tablefmt='fancy_grid')}" 118 | ) 119 | return {pte_name: et_prog} 120 | 121 | if ( 122 | model.config._attn_implementation == "custom_sdpa" 123 | or model.config._attn_implementation == "custom_sdpa_ring_kv_cache" 124 | ): 125 | raise NotImplementedError("Custom SDPA implementation is not supported for Metal.") 126 | 127 | exported_progs = model.export() 128 | 129 | return _lower_to_executorch(exported_progs, model.metadata) 130 | -------------------------------------------------------------------------------- /docs/source/guides/export.mdx: -------------------------------------------------------------------------------- 1 | 12 | 13 | # Export a model to ExecuTorch with optimum.exporters.executorch 14 | 15 | If you need to deploy 🤗 Transformers models for on-device use cases, we recommend 16 | exporting them to a serialized format that can be distributed and executed on specialized 17 | runtimes and hardware. In this guide, we'll show you how to export these 18 | models to [ExecuTorch](https://pytorch.org/executorch/main/intro-overview.html). 19 | 20 | 21 | ## Why ExecuTorch? 22 | 23 | ExecuTorch is the ideal solution for deploying PyTorch models on edge devices, offering a streamlined process from 24 | export to deployment without leaving PyTorch ecosystem. 25 | 26 | Supporting on-device AI presents unique challenges with diverse hardware, critical power requirements, low/no internet 27 | connectivity, and realtime processing needs. These constraints have historically prevented or slowed down the creation 28 | of scalable and performant on-device AI solutions. We designed ExecuTorch, backed by our industry partners like Meta, 29 | Arm, Apple, Qualcomm, MediaTek, etc. to be highly portable and provide superior developer productivity without losing on 30 | performance. 
31 | 32 | 33 | ## Summary 34 | 35 | Exporting a PyTorch model to ExecuTorch is as simple as 36 | 37 | ```bash 38 | optimum-cli export executorch \ 39 | --model HuggingFaceTB/SmolLM2-135M \ 40 | --task text-generation \ 41 | --recipe xnnpack \ 42 | --output_dir hf_smollm2 \ 43 | --use_custom_sdpa 44 | ``` 45 | 46 | Check out the help for more options: 47 | 48 | ```bash 49 | optimum-cli export executorch --help 50 | ``` 51 | 52 | 53 | ## Exporting a model to ExecuTorch using the CLI 54 | 55 | The Optimum ExecuTorch export can be used through the Optimum command line: 56 | 57 | ```bash 58 | optimum-cli export executorch --help 59 | 60 | usage: optimum-cli export executorch [-h] -m MODEL [-o OUTPUT_DIR] [--task TASK] [--recipe RECIPE] 61 | 62 | options: 63 | -h, --help show this help message and exit 64 | 65 | Required arguments: 66 | -m MODEL, --model MODEL 67 | Model ID on huggingface.co or path on disk to load model from. 68 | -o OUTPUT_DIR, --output_dir OUTPUT_DIR 69 | Path indicating the directory where to store the generated ExecuTorch model. 70 | --task TASK The task to export the model for. Available tasks depend on the model, but are among: ['audio-classification', 'feature-extraction', 'image-to-text', 71 | 'sentence-similarity', 'depth-estimation', 'image-segmentation', 'audio-frame-classification', 'masked-im', 'semantic-segmentation', 'text-classification', 72 | 'audio-xvector', 'mask-generation', 'question-answering', 'text-to-audio', 'automatic-speech-recognition', 'image-to-image', 'multiple-choice', 'image- 73 | classification', 'text2text-generation', 'token-classification', 'object-detection', 'zero-shot-object-detection', 'zero-shot-image-classification', 'text- 74 | generation', 'fill-mask']. 75 | --recipe RECIPE Pre-defined recipes for export to ExecuTorch. Defaults to "xnnpack". 76 | --use_custom_sdpa For decoder-only models to use custom sdpa with static kv cache to boost performance. Defaults to False. 77 | 78 | ``` 79 | 80 | You should see a `model.pte` file stored under "./hf_smollm2/": 81 | 82 | ```bash 83 | hf_smollm2/ 84 | └── model.pte 85 | ``` 86 | 87 | This will fetch the model from the Hub and export the PyTorch model with the specialized recipe. The resulting `model.pte` file can then be run on the [XNNPACK backend](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering.html), or on many 88 | other ExecuTorch-supported backends if exported with a different recipe, e.g. Apple's [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) or [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [Qualcomm's SoCs](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html), [ARM's Ethos-U](https://pytorch.org/executorch/main/executorch-arm-delegate-tutorial.html), [Xtensa HiFi4 DSP](https://pytorch.org/executorch/main/build-run-xtensa.html), [Vulkan GPU](https://pytorch.org/executorch/main/build-run-vulkan.html), [MediaTek](https://pytorch.org/executorch/main/build-run-mediatek-backend.html), etc.
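To target one of those backends you only need to swap the recipe. As a rough sketch, the command below re-exports the same model with one of the Core ML recipes registered in `optimum/exporters/executorch/recipes/coreml.py` (included later in this repository); the `coreml_fp16` recipe name follows that registration code, the output directory name is just an example, and the export assumes `coremltools` and the ExecuTorch Core ML backend are installed:

```bash
# Re-export with a Core ML recipe instead of XNNPACK (fp16 precision, all compute units).
optimum-cli export executorch \
  --model HuggingFaceTB/SmolLM2-135M \
  --task text-generation \
  --recipe coreml_fp16 \
  --output_dir hf_smollm2_coreml
```

Everything else about the command (model, task, output layout) stays the same as in the XNNPACK export above.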
89 | 90 | For example, we can load and run the model with the [ExecuTorch Runtime](https://pytorch.org/executorch/main/runtime-overview.html) using the `optimum.executorch` package as follows: 91 | 92 | ```python 93 | from transformers import AutoTokenizer 94 | from optimum.executorch import ExecuTorchModelForCausalLM 95 | 96 | tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M") 97 | model = ExecuTorchModelForCausalLM.from_pretrained("hf_smollm2/") 98 | prompt = "Simply put, the theory of relativity states that" 99 | print(f"\nGenerated texts:\n\t{model.text_generation(tokenizer=tokenizer, prompt=prompt, max_seq_len=45)}") 100 | ``` 101 | 102 | As you can see, converting a model to ExecuTorch does not mean leaving the Hugging Face ecosystem. You end up with a similar API to regular 🤗 Transformers models! 103 | 104 | If your model wasn't already exported to ExecuTorch, it can also be converted on the fly when loading it: 105 | 106 | ```python 107 | from optimum.executorch import ExecuTorchModelForCausalLM 108 | 109 | model = ExecuTorchModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M", recipe="xnnpack", attn_implementation="custom_sdpa") 110 | ``` 111 | -------------------------------------------------------------------------------- /tests/models/test_modeling_whisper.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | import logging 17 | import os 18 | import subprocess 19 | import tempfile 20 | import unittest 21 | 22 | import pytest 23 | from datasets import load_dataset 24 | from executorch import version 25 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 26 | from packaging.version import parse 27 | from transformers import AutoProcessor, AutoTokenizer 28 | from transformers.testing_utils import slow 29 | 30 | from optimum.executorch import ExecuTorchModelForSpeechSeq2Seq 31 | 32 | 33 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 34 | 35 | 36 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | 40 | # @slow 41 | # @pytest.mark.run_slow 42 | def test_whisper_export_to_executorch(self): 43 | model_id = "openai/whisper-tiny" 44 | task = "automatic-speech-recognition" 45 | recipe = "xnnpack" 46 | with tempfile.TemporaryDirectory() as tempdir: 47 | subprocess.run( 48 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", 49 | shell=True, 50 | check=True, 51 | ) 52 | self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) 53 | model = ExecuTorchModelForSpeechSeq2Seq.from_pretrained(f"{tempdir}/executorch") 54 | self._test_whisper_transcription(model_id, model) 55 | 56 | def _test_whisper_transcription(self, model_id: str, model: ExecuTorchModelForSpeechSeq2Seq): 57 | tokenizer = AutoTokenizer.from_pretrained(model_id) 58 | processor = AutoProcessor.from_pretrained(model_id) 59 | 60 | self.assertIsInstance(model, ExecuTorchModelForSpeechSeq2Seq) 61 | self.assertTrue(hasattr(model, "model")) 62 | self.assertIsInstance(model.model, ExecuTorchModule) 63 | 64 | dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation") 65 | sample = dataset[0]["audio"] 66 | 67 | input_features = processor( 68 | sample["array"], 69 | return_tensors="pt", 70 | truncation=False, 71 | sampling_rate=sample["sampling_rate"], 72 | ).input_features 73 | # Current implementation of the transcibe method accepts up to 30 seconds of audio, therefore I trim the audio here. 74 | input_features_trimmed = input_features[:, :, :3000].contiguous() 75 | 76 | generated_transcription = model.transcribe(tokenizer, input_features_trimmed) 77 | expected_text = " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins work is really Greek after all, and can discover that." 
78 | logging.info( 79 | f"\nExpected transcription:\n\t{expected_text}\nGenerated transcription:\n\t{generated_transcription}" 80 | ) 81 | self.assertEqual(generated_transcription, expected_text) 82 | 83 | def _helper_whisper_transcription(self, recipe: str): 84 | model_id = "openai/whisper-tiny" 85 | model = ExecuTorchModelForSpeechSeq2Seq.from_pretrained(model_id, recipe=recipe) 86 | self._test_whisper_transcription(model_id, model) 87 | 88 | @slow 89 | @pytest.mark.run_slow 90 | def test_whisper_transcription(self): 91 | self._helper_whisper_transcription(recipe="xnnpack") 92 | 93 | @slow 94 | @pytest.mark.run_slow 95 | @pytest.mark.portable 96 | @pytest.mark.skipif( 97 | parse(version.__version__) < parse("0.7.0"), 98 | reason="Fixed on executorch >= 0.7.0", 99 | ) 100 | def test_whisper_transcription_portable(self): 101 | self._helper_whisper_transcription(recipe="portable") 102 | 103 | @slow 104 | @pytest.mark.run_slow 105 | def test_whisper_large_v3_turbo_export_bfloat16(self): 106 | """Test exporting whisper-large-v3-turbo with bfloat16 and check file size is ~1.6GB""" 107 | model_id = "openai/whisper-large-v3-turbo" 108 | task = "automatic-speech-recognition" 109 | recipe = "xnnpack" 110 | dtype = "bfloat16" 111 | with tempfile.TemporaryDirectory() as tempdir: 112 | subprocess.run( 113 | f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch --dtype {dtype}", 114 | shell=True, 115 | check=True, 116 | ) 117 | 118 | # Check that model.pte exists 119 | model_path = os.path.join(tempdir, "executorch", "model.pte") 120 | self.assertTrue(os.path.exists(model_path), f"model.pte not found at {model_path}") 121 | 122 | # Check file size is approximately 1.6GB (allow 10% tolerance) 123 | file_size_bytes = os.path.getsize(model_path) 124 | file_size_gb = file_size_bytes / (1024**3) 125 | expected_size_gb = 1.6 126 | tolerance = 0.1 # 10% tolerance 127 | 128 | logging.info(f"model.pte size: {file_size_gb:.2f} GB") 129 | self.assertAlmostEqual( 130 | file_size_gb, 131 | expected_size_gb, 132 | delta=expected_size_gb * tolerance, 133 | msg=f"Expected file size ~{expected_size_gb}GB, but got {file_size_gb:.2f}GB", 134 | ) 135 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/recipes/cuda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import logging 16 | from typing import Dict, Union 17 | 18 | import torch 19 | from tabulate import tabulate 20 | from torch.export import ExportedProgram 21 | 22 | from executorch.devtools.backend_debug import get_delegation_info 23 | from executorch.exir import ( 24 | EdgeCompileConfig, 25 | ExecutorchBackendConfig, 26 | ExecutorchProgram, 27 | to_edge_transform_and_lower, 28 | ) 29 | from executorch.exir.backend.compile_spec_schema import CompileSpec 30 | from executorch.exir.passes import MemoryPlanningPass 31 | from optimum.executorch.passes.remove_padding_idx_embedding_pass import ( 32 | RemovePaddingIdxEmbeddingPass, 33 | ) 34 | 35 | from ..integrations import ( 36 | CausalLMExportableModule, 37 | MaskedLMExportableModule, 38 | MultiModalTextToTextExportableModule, 39 | Seq2SeqLMExportableModule, 40 | ) 41 | from ..recipe_registry import register_recipe 42 | 43 | 44 | aten = torch.ops.aten 45 | 46 | 47 | def lower_to_executorch( 48 | exported_programs: Dict[str, ExportedProgram], 49 | metadata=None, 50 | is_windows: bool = False, 51 | model_config=None, 52 | ) -> Dict[str, ExecutorchProgram]: 53 | # Import here to avoid version conflicts. 54 | from torch._inductor.decomposition import conv1d_to_conv2d 55 | 56 | from executorch.backends.cuda.cuda_backend import CudaBackend 57 | from executorch.backends.cuda.cuda_partitioner import CudaPartitioner 58 | 59 | logging.debug(f"\nExported program: {exported_programs}") 60 | 61 | # If just one exported program, the method name in the .pte for it should be "forward". 62 | if len(exported_programs) == 1: 63 | exported_programs = {"forward": next(iter(exported_programs.values()))} 64 | 65 | # Check if this is a Gemma3 model 66 | model_type = getattr(model_config, "model_type", None) if model_config else None 67 | 68 | # CUDA backend compile spec with method name. 69 | partitioners = {} 70 | for key in exported_programs.keys(): 71 | compile_specs = [CudaBackend.generate_method_name_compile_spec(key)] 72 | if is_windows: 73 | compile_specs.append(CompileSpec("platform", "windows".encode("utf-8"))) 74 | 75 | # Add Gemma3-specific compile spec if needed 76 | if model_type == "gemma3": 77 | compile_specs.append(CompileSpec(key="triton_kernel_mode", value=b"OFF")) 78 | 79 | partitioners[key] = [CudaPartitioner(compile_specs)] 80 | 81 | # Add decompositions for triton to generate kernels. 
82 | for key, ep in exported_programs.items(): 83 | exported_programs[key] = ep.run_decompositions( 84 | { 85 | aten.conv1d.default: conv1d_to_conv2d, 86 | } 87 | ) 88 | et_prog = to_edge_transform_and_lower( 89 | exported_programs, 90 | partitioner=partitioners, 91 | compile_config=EdgeCompileConfig( 92 | _check_ir_validity=False, 93 | _skip_dim_order=True, 94 | ), 95 | constant_methods=metadata, 96 | transform_passes=[RemovePaddingIdxEmbeddingPass()], 97 | ) 98 | et_prog = et_prog.to_executorch( 99 | ExecutorchBackendConfig( 100 | memory_planning_pass=MemoryPlanningPass( 101 | alloc_graph_input=False, 102 | ) 103 | ), 104 | ) 105 | pte_name = "model" 106 | for method in et_prog.methods: 107 | logging.debug(f"---------------------- Method: {method} ----------------------") 108 | logging.debug(f"\nExecuTorch program for {pte_name}.pte: {et_prog.exported_program(method).graph_module}") 109 | delegation_info = get_delegation_info(et_prog.exported_program(method).graph_module) 110 | logging.debug(f"\nDelegation info Summary for {pte_name}.pte: {delegation_info.get_summary()}") 111 | logging.debug( 112 | f"\nDelegation info for {pte_name}.pte: {tabulate(delegation_info.get_operator_delegation_dataframe(), headers='keys', tablefmt='fancy_grid')}" 113 | ) 114 | return {pte_name: et_prog} 115 | 116 | 117 | @register_recipe("cuda") 118 | def export_to_executorch_with_cuda( 119 | model: Union[ 120 | CausalLMExportableModule, 121 | MaskedLMExportableModule, 122 | Seq2SeqLMExportableModule, 123 | MultiModalTextToTextExportableModule, 124 | ], 125 | **kwargs, 126 | ): 127 | """ 128 | Export a PyTorch model to ExecuTorch w/ delegation to CUDA backend. 129 | This function also write metadata required by the ExecuTorch runtime to the .pte file. 130 | Args: 131 | model (Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule, MultiModalTextToTextExportableModule]): 132 | The PyTorch model to be exported to ExecuTorch. 133 | **kwargs: 134 | Additional keyword arguments for recipe-specific configurations, e.g. export using different example inputs, or different compile/bechend configs. 135 | Returns: 136 | Dict[str, ExecutorchProgram]: 137 | A map of exported and optimized program for ExecuTorch. 138 | For encoder-decoder models or multimodal models, it may generate multiple programs. 139 | """ 140 | if ( 141 | model.config._attn_implementation == "custom_sdpa" 142 | or model.config._attn_implementation == "custom_sdpa_ring_kv_cache" 143 | ): 144 | raise NotImplementedError( 145 | "Custom SDPA implementation is not supported for CUDA yet. Please use 'flash_attention' instead." 146 | ) 147 | 148 | exported_progs = model.export() 149 | 150 | return lower_to_executorch(exported_progs, model.metadata, model_config=getattr(model, "config", None)) 151 | -------------------------------------------------------------------------------- /optimum/exporters/executorch/recipes/coreml.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from itertools import product 17 | from typing import Any, Dict, Union 18 | 19 | from tabulate import tabulate 20 | from torch.export import ExportedProgram 21 | 22 | from executorch.devtools.backend_debug import get_delegation_info 23 | from executorch.exir import ( 24 | EdgeCompileConfig, 25 | ExecutorchBackendConfig, 26 | ExecutorchProgram, 27 | to_edge_transform_and_lower, 28 | ) 29 | 30 | from ..integrations import ( 31 | CausalLMExportableModule, 32 | MaskedLMExportableModule, 33 | Seq2SeqLMExportableModule, 34 | ) 35 | from ..recipe_registry import register_recipe 36 | 37 | 38 | def _export_to_executorch( 39 | model: Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule], 40 | **kwargs, 41 | ): 42 | """ 43 | Export a PyTorch model to ExecuTorch w/ delegation to CoreML backend. 44 | 45 | This function also write metadata required by the ExecuTorch runtime to the model. 46 | 47 | Args: 48 | model (Union[CausalLMExportableModule, MaskedLMExportableModule, Seq2SeqLMExportableModule]): 49 | The PyTorch model to be exported to ExecuTorch. 50 | **kwargs: 51 | Additional keyword arguments for recipe-specific configurations, e.g. export using different example inputs, or different compile/bechend configs. 52 | 53 | Returns: 54 | Dict[str, ExecutorchProgram]: 55 | A map of exported and optimized program for ExecuTorch. 56 | For encoder-decoder models or multimodal models, it may generate multiple programs. 
57 | """ 58 | # Import here because coremltools might not be available in all environments 59 | import coremltools as ct 60 | 61 | from executorch.backends.apple.coreml.compiler import CoreMLBackend 62 | from executorch.backends.apple.coreml.partition import CoreMLPartitioner 63 | 64 | def _lower_to_executorch( 65 | exported_programs: Dict[str, ExportedProgram], 66 | metadata, 67 | compute_unit, 68 | minimum_deployment_target, 69 | compute_precision, 70 | ) -> Dict[str, ExecutorchProgram]: 71 | et_progs = {} 72 | backend_config_dict = {} 73 | for pte_name, exported_program in exported_programs.items(): 74 | logging.debug(f"\nExported program for {pte_name}.pte: {exported_program}") 75 | et_progs[pte_name] = to_edge_transform_and_lower( 76 | exported_program, 77 | partitioner=[ 78 | CoreMLPartitioner( 79 | compile_specs=CoreMLBackend.generate_compile_specs( 80 | compute_unit=compute_unit, 81 | minimum_deployment_target=minimum_deployment_target, 82 | compute_precision=compute_precision, 83 | model_type=CoreMLBackend.MODEL_TYPE.MODEL, 84 | ), 85 | take_over_mutable_buffer=(minimum_deployment_target >= ct.target.iOS18), 86 | ) 87 | ], 88 | compile_config=EdgeCompileConfig( 89 | _check_ir_validity=False, 90 | # In ET 0.7, we can set _skip_dim_order=False 91 | _skip_dim_order=True, 92 | ), 93 | constant_methods=metadata, 94 | ).to_executorch( 95 | config=ExecutorchBackendConfig(**backend_config_dict), 96 | ) 97 | logging.debug( 98 | f"\nExecuTorch program for {pte_name}.pte: {et_progs[pte_name].exported_program().graph_module}" 99 | ) 100 | delegation_info = get_delegation_info(et_progs[pte_name].exported_program().graph_module) 101 | logging.debug(f"\nDelegation info Summary for {pte_name}.pte: {delegation_info.get_summary()}") 102 | logging.debug( 103 | f"\nDelegation info for {pte_name}.pte: {tabulate(delegation_info.get_operator_delegation_dataframe(), headers='keys', tablefmt='fancy_grid')}" 104 | ) 105 | return et_progs 106 | 107 | exported_progs = model.export() 108 | return _lower_to_executorch(exported_progs, model.metadata, **kwargs) 109 | 110 | 111 | def _get_recipe_kwargs(dtype: str, compute_unit: str) -> Dict[str, Any]: 112 | import coremltools as ct 113 | 114 | compute_precision = { 115 | "fp16": ct.precision.FLOAT16, 116 | "fp32": ct.precision.FLOAT32, 117 | }[dtype] 118 | 119 | compute_unit = { 120 | "cpu": ct.ComputeUnit.CPU_ONLY, 121 | "gpu": ct.ComputeUnit.CPU_AND_GPU, 122 | "ne": ct.ComputeUnit.CPU_AND_NE, 123 | "all": ct.ComputeUnit.ALL, 124 | }[compute_unit] 125 | 126 | recipe_kwargs = { 127 | "compute_precision": compute_precision, 128 | "compute_unit": compute_unit, 129 | "minimum_deployment_target": ct.target.iOS18, 130 | } 131 | return recipe_kwargs 132 | 133 | 134 | def _make_recipe(recipe_name, recipe_kwargs): 135 | @register_recipe(recipe_name) 136 | def recipe_fn(exported_programs: Dict[str, ExportedProgram], **kwargs): 137 | return _export_to_executorch( 138 | exported_programs, 139 | **recipe_kwargs, 140 | ) 141 | 142 | return recipe_fn 143 | 144 | 145 | # Register recipes for CoreML backend 146 | for dtype, compute_unit in product(["fp32", "fp16"], ["cpu", "gpu", "ne", "all"]): 147 | recipe_name = f"coreml_{dtype}" 148 | if compute_unit != "all": 149 | recipe_name += f"_{compute_unit}" 150 | recipe_kwargs = _get_recipe_kwargs(dtype=dtype, compute_unit=compute_unit) 151 | _make_recipe(recipe_name, recipe_kwargs) 152 | -------------------------------------------------------------------------------- /tests/models/test_modeling_llama.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2024 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import gc 17 | import logging 18 | import os 19 | import subprocess 20 | import tempfile 21 | import unittest 22 | 23 | import pytest 24 | import torchao 25 | from executorch.extension.pybindings.portable_lib import ExecuTorchModule 26 | from packaging.version import parse 27 | from transformers import AutoTokenizer 28 | from transformers.testing_utils import slow 29 | 30 | from optimum.executorch import ExecuTorchModelForCausalLM 31 | 32 | from ..utils import check_causal_lm_output_quality 33 | 34 | 35 | @pytest.mark.skipif( 36 | parse(torchao.__version__) < parse("0.11.0.dev0"), 37 | reason="Only available on torchao >= 0.11.0.dev0", 38 | ) 39 | class ExecuTorchModelIntegrationTest(unittest.TestCase): 40 | def __init__(self, *args, **kwargs): 41 | super().__init__(*args, **kwargs) 42 | 43 | @slow 44 | @pytest.mark.run_slow 45 | def test_llama3_2_1b_export_to_executorch(self): 46 | model_id = "NousResearch/Llama-3.2-1B" 47 | task = "text-generation" 48 | recipe = "xnnpack" 49 | with tempfile.TemporaryDirectory() as tempdir: 50 | out_dir = f"{tempdir}/executorch" 51 | subprocess.run( 52 | f"optimum-cli export executorch \ 53 | --model {model_id} \ 54 | --task {task} \ 55 | --recipe {recipe} \ 56 | --use_custom_sdpa \ 57 | --use_custom_kv_cache \ 58 | --qlinear 8da4w \ 59 | --qembedding 8w \ 60 | --output_dir {tempdir}/executorch", 61 | shell=True, 62 | check=True, 63 | ) 64 | pte_full_path = f"{out_dir}/model.pte" 65 | self.assertTrue(os.path.exists(pte_full_path)) 66 | 67 | # Explicitly delete the PTE file to free up disk space 68 | if os.path.exists(pte_full_path): 69 | os.remove(pte_full_path) 70 | gc.collect() 71 | 72 | @slow 73 | @pytest.mark.run_slow 74 | def test_llama_text_generation_with_custom_sdpa_8da4w_8we(self): 75 | # ExecuTorch model + custom sdpa + 8da4w linear quantization + int8 embedding quantization 76 | model_id = "NousResearch/Llama-3.2-1B" 77 | kwargs = {"qlinear": "8da4w", "qembedding": "8w"} 78 | model = ExecuTorchModelForCausalLM.from_pretrained( 79 | model_id, 80 | recipe="xnnpack", 81 | attn_implementation="custom_sdpa", 82 | **kwargs, 83 | ) 84 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 85 | self.assertIsInstance(model.model, ExecuTorchModule) 86 | tokenizer = AutoTokenizer.from_pretrained(model_id) 87 | generated_text = model.text_generation( 88 | tokenizer=tokenizer, 89 | prompt="Simply put, the theory of relativity states that", 90 | max_seq_len=32, 91 | ) 92 | logging.info(f"\nGenerated text:\n\t{generated_text}") 93 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 94 | 95 | # Free memory before loading eager for quality check 96 | del model 97 | del tokenizer 98 | gc.collect() 99 | 100 | self.assertTrue(check_causal_lm_output_quality(model_id, 
generated_tokens)) 101 | 102 | @slow 103 | @pytest.mark.run_slow 104 | def test_llama_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self): 105 | model_id = "NousResearch/Llama-3.2-1B" 106 | tokenizer = AutoTokenizer.from_pretrained(model_id) 107 | model = ExecuTorchModelForCausalLM.from_pretrained( 108 | model_id, 109 | recipe="xnnpack", 110 | attn_implementation="custom_sdpa", 111 | use_custom_kv_cache=True, 112 | **{"qlinear": "8da4w", "qembedding": "8w"}, 113 | ) 114 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 115 | self.assertIsInstance(model.model, ExecuTorchModule) 116 | generated_text = model.text_generation( 117 | tokenizer=tokenizer, 118 | prompt="Simply put, the theory of relativity states that", 119 | max_seq_len=32, 120 | ) 121 | logging.info(f"\nGenerated text:\n\t{generated_text}") 122 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 123 | 124 | # Free memory before loading eager for quality check 125 | del model 126 | del tokenizer 127 | gc.collect() 128 | 129 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 130 | 131 | @slow 132 | @pytest.mark.run_slow 133 | @pytest.mark.portable 134 | def test_llama_text_generation_portable(self): 135 | model_id = "NousResearch/Llama-3.2-1B" 136 | model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="portable") 137 | self.assertIsInstance(model, ExecuTorchModelForCausalLM) 138 | self.assertIsInstance(model.model, ExecuTorchModule) 139 | tokenizer = AutoTokenizer.from_pretrained(model_id) 140 | generated_text = model.text_generation( 141 | tokenizer=tokenizer, 142 | prompt="Simply put, the theory of relativity states that", 143 | max_seq_len=32, 144 | ) 145 | logging.info(f"\nGenerated text:\n\t{generated_text}") 146 | generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids 147 | 148 | # Free memory before loading eager for quality check 149 | del model 150 | del tokenizer 151 | gc.collect() 152 | 153 | self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens)) 154 | --------------------------------------------------------------------------------