├── .github └── pull_request_template.md ├── .gitignore ├── ACKNOWLEDGEMENTS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.md ├── Package.swift ├── README.md ├── assets ├── a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space │ ├── randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_123456789_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_123456789_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_123456789_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_93_computeUnit_ALL_modelVersion_stabilityai_stable-diffusion-2-base.png │ ├── randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png │ └── randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png ├── a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space │ ├── randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png │ ├── randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png │ ├── randomSeed_13_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_13_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_13_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_93_computeUnit_ALL_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png │ └── randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png ├── controlnet_readme_reel.png ├── float16_cpuandne_readmereel.png ├── float16_gpu_readmereel.png ├── mbp │ ├── a_high_quality_photo_of_a_surfing_dog.7667.final_3.41-bits.png │ ├── a_high_quality_photo_of_a_surfing_dog.7667.final_4.50-bits.png │ ├── a_high_quality_photo_of_a_surfing_dog.7667.final_6.55-bits.png │ ├── a_high_quality_photo_of_a_surfing_dog.7667.final_float16_original.png │ ├── runwayml_stable-diffusion-v1-5_psnr_vs_size.png │ ├── stabilityai_stable-diffusion-2-1-base_psnr_vs_size.png │ └── stabilityai_stable-diffusion-xl-base-1.0_psnr_vs_size.png ├── palette6_cpuandne_readmereel.png └── readme_reel.png ├── python_coreml_stable_diffusion ├── __init__.py ├── _version.py ├── activation_quantization.py ├── attention.py ├── chunk_mlprogram.py ├── controlnet.py ├── coreml_model.py ├── layer_norm.py ├── mixed_bit_compression_apply.py ├── mixed_bit_compression_pre_analysis.py ├── multilingual_projection.py ├── pipeline.py ├── torch2coreml.py └── unet.py ├── requirements.txt ├── setup.py ├── swift ├── StableDiffusion │ ├── pipeline │ │ ├── CGImage+vImage.swift │ │ ├── ControlNet.swift │ │ ├── DPMSolverMultistepScheduler.swift │ │ ├── Decoder.swift │ │ ├── DiscreteFlowScheduler.swift │ │ ├── Encoder.swift │ │ ├── ManagedMLModel.swift │ │ ├── MultiModalDiffusionTransformer.swift │ │ ├── MultilingualTextEncoder.swift │ │ ├── NumPyRandomSource.swift │ │ ├── NvRandomSource.swift │ │ ├── RandomSource.swift │ │ ├── ResourceManaging.swift │ │ ├── SafetyChecker.swift │ │ ├── SampleTimer.swift │ │ ├── Scheduler.swift │ │ ├── StableDiffusion3Pipeline+Resources.swift │ │ ├── StableDiffusion3Pipeline.swift │ │ ├── StableDiffusionPipeline+Resources.swift │ │ ├── StableDiffusionPipeline.Configuration.swift │ │ ├── StableDiffusionPipeline.swift │ │ 
├── StableDiffusionXL+Resources.swift │ │ ├── StableDiffusionXLPipeline.swift │ │ ├── TextEncoder.swift │ │ ├── TextEncoderT5.swift │ │ ├── TextEncoderXL.swift │ │ ├── TorchRandomSource.swift │ │ └── Unet.swift │ └── tokenizer │ │ ├── BPETokenizer+Reading.swift │ │ ├── BPETokenizer.swift │ │ └── T5Tokenizer.swift ├── StableDiffusionCLI │ └── main.swift └── StableDiffusionTests │ ├── Resources │ ├── merges.txt │ └── vocab.json │ └── StableDiffusionTests.swift └── tests ├── __init__.py └── test_stable_diffusion.py /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Thank you for your interest in contributing to Core ML Stable Diffusion! Please review [CONTRIBUTING.md](../CONTRIBUTING.md) first. We appreciate your interest in the project! 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | 3 | # Swift Package 4 | .DS_Store 5 | /.build 6 | /Packages 7 | /*.xcodeproj 8 | .swiftpm 9 | .vscode 10 | .*.sw? 11 | *.docc-build 12 | *.vs 13 | Package.resolved 14 | 15 | # Byte-compiled / optimized / DLL files 16 | __pycache__/ 17 | *.py[cod] 18 | *$py.class 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | pip-wheel-metadata/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | local_settings.py 75 | db.sqlite3 76 | db.sqlite3-journal 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # IPython 95 | profile_default/ 96 | ipython_config.py 97 | 98 | # pyenv 99 | .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 109 | __pypackages__/ 110 | 111 | # Celery stuff 112 | celerybeat-schedule 113 | celerybeat.pid 114 | 115 | # SageMath parsed files 116 | *.sage.py 117 | 118 | # Environments 119 | .env 120 | .venv 121 | env/ 122 | venv/ 123 | ENV/ 124 | env.bak/ 125 | venv.bak/ 126 | 127 | # Spyder project settings 128 | .spyderproject 129 | .spyproject 130 | 131 | # Rope project settings 132 | .ropeproject 133 | 134 | # mkdocs documentation 135 | /site 136 | 137 | # mypy 138 | .mypy_cache/ 139 | .dmypy.json 140 | dmypy.json 141 | 142 | # Pyre type checker 143 | .pyre/ 144 | 145 | # macOS filesystem 146 | *.DS_Store 147 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 
54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, 71 | available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | Thank you for your interest in contributing to Core ML Stable Diffusion! This project was released for system demonstration purposes and there are limited plans for future development of the repository. While we welcome new pull requests and issues please note that our response may be limited. 4 | 5 | 6 | ## Submitting a Pull Request 7 | 8 | The project is licensed under the MIT license. By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the MIT license. 9 | 10 | ## Code of Conduct 11 | 12 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). 13 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Apple Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.8 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "stable-diffusion", 8 | platforms: [ 9 | .macOS(.v13), 10 | .iOS(.v16), 11 | ], 12 | products: [ 13 | .library( 14 | name: "StableDiffusion", 15 | targets: ["StableDiffusion"]), 16 | .executable( 17 | name: "StableDiffusionSample", 18 | targets: ["StableDiffusionCLI"]) 19 | ], 20 | dependencies: [ 21 | .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.2.3"), 22 | .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.8"), 23 | ], 24 | targets: [ 25 | .target( 26 | name: "StableDiffusion", 27 | dependencies: [ 28 | .product(name: "Transformers", package: "swift-transformers"), 29 | ], 30 | path: "swift/StableDiffusion"), 31 | .executableTarget( 32 | name: "StableDiffusionCLI", 33 | dependencies: [ 34 | "StableDiffusion", 35 | .product(name: "ArgumentParser", package: "swift-argument-parser")], 36 | path: "swift/StableDiffusionCLI"), 37 | .testTarget( 38 | name: "StableDiffusionTests", 39 | dependencies: ["StableDiffusion"], 40 | path: "swift/StableDiffusionTests", 41 | resources: [ 42 | .copy("Resources/vocab.json"), 43 | .copy("Resources/merges.txt") 44 | ]), 45 | ] 46 | ) 47 | -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_ALL_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_ALL_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_ALL_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_ALL_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/controlnet_readme_reel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/controlnet_readme_reel.png -------------------------------------------------------------------------------- /assets/float16_cpuandne_readmereel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/float16_cpuandne_readmereel.png -------------------------------------------------------------------------------- /assets/float16_gpu_readmereel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/float16_gpu_readmereel.png -------------------------------------------------------------------------------- /assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_3.41-bits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_3.41-bits.png -------------------------------------------------------------------------------- /assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_4.50-bits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_4.50-bits.png -------------------------------------------------------------------------------- /assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_6.55-bits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_6.55-bits.png -------------------------------------------------------------------------------- /assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_float16_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/mbp/a_high_quality_photo_of_a_surfing_dog.7667.final_float16_original.png -------------------------------------------------------------------------------- /assets/mbp/runwayml_stable-diffusion-v1-5_psnr_vs_size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/mbp/runwayml_stable-diffusion-v1-5_psnr_vs_size.png -------------------------------------------------------------------------------- /assets/mbp/stabilityai_stable-diffusion-2-1-base_psnr_vs_size.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/mbp/stabilityai_stable-diffusion-2-1-base_psnr_vs_size.png -------------------------------------------------------------------------------- /assets/mbp/stabilityai_stable-diffusion-xl-base-1.0_psnr_vs_size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/mbp/stabilityai_stable-diffusion-xl-base-1.0_psnr_vs_size.png -------------------------------------------------------------------------------- /assets/palette6_cpuandne_readmereel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/palette6_cpuandne_readmereel.png -------------------------------------------------------------------------------- /assets/readme_reel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apple/ml-stable-diffusion/e5d960c41a6a4ab200b8db379194127607b1c590/assets/readme_reel.png -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.0" 2 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/attention.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | logger.setLevel(logging.INFO) 5 | 6 | import torch 7 | import math 8 | 9 | SPLIT_SOFTMAX = False 10 | 11 | def softmax(x, dim): 12 | # Reduction max 13 | max_x = x.max(dim=dim, keepdim=True).values 14 | # EW sub 15 | x -= max_x 16 | # Scale for EXP to EXP2, Activation EXP2 17 | scaled_x = x * (1 / math.log(2)) 18 | exp_act = torch.exp2(scaled_x) 19 | # Reduction Sum + Inv 20 | exp_sum_inv = 1 / exp_act.sum(dim=dim, keepdims=True) 21 | # EW Mult 22 | return exp_act * exp_sum_inv 23 | 24 | def split_einsum(q, k, v, mask, heads, dim_head): 25 | """ Attention Implementation backing AttentionImplementations.SPLIT_EINSUM 26 | 27 | - Implements https://machinelearning.apple.com/research/neural-engine-transformers 28 | - Recommended for ANE 29 | - Marginally slower on GPU 30 | """ 31 | mh_q = [ 32 | q[:, head_idx * dim_head:(head_idx + 1) * 33 | dim_head, :, :] for head_idx in range(heads) 34 | ] # (bs, dim_head, 1, max_seq_length) * heads 35 | 36 | k = k.transpose(1, 3) 37 | mh_k = [ 38 | k[:, :, :, 39 | head_idx * dim_head:(head_idx + 1) * dim_head] 40 | for head_idx in range(heads) 41 | ] # (bs, max_seq_length, 1, dim_head) * heads 42 | 43 | mh_v = [ 44 | v[:, head_idx * dim_head:(head_idx + 1) * 45 | dim_head, :, :] for head_idx in range(heads) 46 | ] # (bs, dim_head, 1, max_seq_length) * heads 47 | 48 | attn_weights = [ 49 | torch.einsum("bchq,bkhc->bkhq", [qi, ki]) * (dim_head**-0.5) 50 | for qi, ki in zip(mh_q, mh_k) 51 | ] # (bs, max_seq_length, 1, max_seq_length) * heads 52 | 53 | if mask is not None: 54 | for head_idx in range(heads): 55 
| attn_weights[head_idx] = attn_weights[head_idx] + mask 56 | 57 | if SPLIT_SOFTMAX: 58 | attn_weights = [ 59 | softmax(aw, dim=1) for aw in attn_weights 60 | ] # (bs, max_seq_length, 1, max_seq_length) * heads 61 | else: 62 | attn_weights = [ 63 | aw.softmax(dim=1) for aw in attn_weights 64 | ] # (bs, max_seq_length, 1, max_seq_length) * heads 65 | 66 | attn = [ 67 | torch.einsum("bkhq,bchk->bchq", wi, vi) 68 | for wi, vi in zip(attn_weights, mh_v) 69 | ] # (bs, dim_head, 1, max_seq_length) * heads 70 | 71 | attn = torch.cat(attn, dim=1) # (bs, dim, 1, max_seq_length) 72 | return attn 73 | 74 | 75 | CHUNK_SIZE = 512 76 | 77 | def split_einsum_v2(q, k, v, mask, heads, dim_head): 78 | """ Attention Implementation backing AttentionImplementations.SPLIT_EINSUM_V2 79 | 80 | - Implements https://machinelearning.apple.com/research/neural-engine-transformers 81 | - Recommended for ANE 82 | - Marginally slower on GPU 83 | - Chunks the query sequence to avoid large intermediate tensors and improves ANE performance 84 | """ 85 | query_seq_length = q.size(3) 86 | num_chunks = query_seq_length // CHUNK_SIZE 87 | 88 | if num_chunks == 0: 89 | logger.info( 90 | "AttentionImplementations.SPLIT_EINSUM_V2: query sequence too short to chunk " 91 | f"({query_seq_length}<{CHUNK_SIZE}), fall back to AttentionImplementations.SPLIT_EINSUM (safe to ignore)") 92 | return split_einsum(q, k, v, mask, heads, dim_head) 93 | 94 | logger.info( 95 | "AttentionImplementations.SPLIT_EINSUM_V2: Splitting query sequence length of " 96 | f"{query_seq_length} into {num_chunks} chunks") 97 | 98 | mh_q = [ 99 | q[:, head_idx * dim_head:(head_idx + 1) * 100 | dim_head, :, :] for head_idx in range(heads) 101 | ] # (bs, dim_head, 1, max_seq_length) * heads 102 | 103 | # Chunk the query sequence for each head 104 | mh_q_chunked = [ 105 | [h_q[..., chunk_idx * CHUNK_SIZE:(chunk_idx + 1) * CHUNK_SIZE] for chunk_idx in range(num_chunks)] 106 | for h_q in mh_q 107 | ] # ((bs, dim_head, 1, QUERY_SEQ_CHUNK_SIZE) * num_chunks) * heads 108 | 109 | k = k.transpose(1, 3) 110 | mh_k = [ 111 | k[:, :, :, 112 | head_idx * dim_head:(head_idx + 1) * dim_head] 113 | for head_idx in range(heads) 114 | ] # (bs, max_seq_length, 1, dim_head) * heads 115 | 116 | mh_v = [ 117 | v[:, head_idx * dim_head:(head_idx + 1) * 118 | dim_head, :, :] for head_idx in range(heads) 119 | ] # (bs, dim_head, 1, max_seq_length) * heads 120 | 121 | attn_weights = [ 122 | [ 123 | torch.einsum("bchq,bkhc->bkhq", [qi_chunk, ki]) * (dim_head**-0.5) 124 | for qi_chunk in h_q_chunked 125 | ] for h_q_chunked, ki in zip(mh_q_chunked, mh_k) 126 | ] # ((bs, max_seq_length, 1, chunk_size) * num_chunks) * heads 127 | 128 | attn_weights = [ 129 | [aw_chunk.softmax(dim=1) for aw_chunk in aw_chunked] 130 | for aw_chunked in attn_weights 131 | ] # ((bs, max_seq_length, 1, chunk_size) * num_chunks) * heads 132 | 133 | attn = [ 134 | [ 135 | torch.einsum("bkhq,bchk->bchq", wi_chunk, vi) 136 | for wi_chunk in wi_chunked 137 | ] for wi_chunked, vi in zip(attn_weights, mh_v) 138 | ] # ((bs, dim_head, 1, chunk_size) * num_chunks) * heads 139 | 140 | attn = torch.cat([ 141 | torch.cat(attn_chunked, dim=3) for attn_chunked in attn 142 | ], dim=1) # (bs, dim, 1, max_seq_length) 143 | 144 | return attn 145 | 146 | 147 | def original(q, k, v, mask, heads, dim_head): 148 | """ Attention Implementation backing AttentionImplementations.ORIGINAL 149 | 150 | - Not recommended for ANE 151 | - Recommended for GPU 152 | """ 153 | bs = q.size(0) 154 | mh_q = q.view(bs, heads, dim_head, -1) 155 | mh_k = 
k.view(bs, heads, dim_head, -1) 156 | mh_v = v.view(bs, heads, dim_head, -1) 157 | 158 | attn_weights = torch.einsum("bhcq,bhck->bhqk", [mh_q, mh_k]) 159 | attn_weights.mul_(dim_head**-0.5) 160 | 161 | if mask is not None: 162 | attn_weights = attn_weights + mask 163 | 164 | attn_weights = attn_weights.softmax(dim=3) 165 | 166 | attn = torch.einsum("bhqk,bhck->bhcq", [attn_weights, mh_v]) 167 | attn = attn.contiguous().view(bs, heads * dim_head, 1, -1) 168 | return attn 169 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/controlnet.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE.md file. 3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | from diffusers.configuration_utils import ConfigMixin, register_to_config 7 | from diffusers import ModelMixin 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | from .unet import Timesteps, TimestepEmbedding, get_down_block, UNetMidBlock2DCrossAttn, linear_to_conv2d_map 14 | 15 | class ControlNetConditioningEmbedding(nn.Module): 16 | 17 | def __init__( 18 | self, 19 | conditioning_embedding_channels, 20 | conditioning_channels=3, 21 | block_out_channels=(16, 32, 96, 256), 22 | ): 23 | super().__init__() 24 | 25 | self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) 26 | 27 | self.blocks = nn.ModuleList([]) 28 | 29 | for i in range(len(block_out_channels) - 1): 30 | channel_in = block_out_channels[i] 31 | channel_out = block_out_channels[i + 1] 32 | self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1)) 33 | self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) 34 | 35 | self.conv_out = nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1) 36 | 37 | def forward(self, conditioning): 38 | embedding = self.conv_in(conditioning) 39 | embedding = F.silu(embedding) 40 | 41 | for block in self.blocks: 42 | embedding = block(embedding) 43 | embedding = F.silu(embedding) 44 | 45 | embedding = self.conv_out(embedding) 46 | 47 | return embedding 48 | 49 | class ControlNetModel(ModelMixin, ConfigMixin): 50 | 51 | @register_to_config 52 | def __init__( 53 | self, 54 | in_channels=4, 55 | flip_sin_to_cos=True, 56 | freq_shift=0, 57 | down_block_types=( 58 | "CrossAttnDownBlock2D", 59 | "CrossAttnDownBlock2D", 60 | "CrossAttnDownBlock2D", 61 | "DownBlock2D", 62 | ), 63 | only_cross_attention=False, 64 | block_out_channels=(320, 640, 1280, 1280), 65 | layers_per_block=2, 66 | downsample_padding=1, 67 | mid_block_scale_factor=1, 68 | act_fn="silu", 69 | norm_num_groups=32, 70 | norm_eps=1e-5, 71 | cross_attention_dim=1280, 72 | transformer_layers_per_block=1, 73 | attention_head_dim=8, 74 | use_linear_projection=False, 75 | upcast_attention=False, 76 | resnet_time_scale_shift="default", 77 | conditioning_embedding_out_channels=(16, 32, 96, 256), 78 | **kwargs, 79 | ): 80 | super().__init__() 81 | 82 | # Check inputs 83 | if len(block_out_channels) != len(down_block_types): 84 | raise ValueError( 85 | f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." 
86 | ) 87 | 88 | if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): 89 | raise ValueError( 90 | f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." 91 | ) 92 | 93 | if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): 94 | raise ValueError( 95 | f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." 96 | ) 97 | 98 | self._register_load_state_dict_pre_hook(linear_to_conv2d_map) 99 | 100 | # input 101 | conv_in_kernel = 3 102 | conv_in_padding = (conv_in_kernel - 1) // 2 103 | self.conv_in = nn.Conv2d( 104 | in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding 105 | ) 106 | 107 | # time 108 | time_embed_dim = block_out_channels[0] * 4 109 | 110 | self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) 111 | timestep_input_dim = block_out_channels[0] 112 | 113 | self.time_embedding = TimestepEmbedding( 114 | timestep_input_dim, 115 | time_embed_dim, 116 | ) 117 | 118 | # control net conditioning embedding 119 | self.controlnet_cond_embedding = ControlNetConditioningEmbedding( 120 | conditioning_embedding_channels=block_out_channels[0], 121 | block_out_channels=conditioning_embedding_out_channels, 122 | ) 123 | 124 | self.down_blocks = nn.ModuleList([]) 125 | self.controlnet_down_blocks = nn.ModuleList([]) 126 | 127 | if isinstance(only_cross_attention, bool): 128 | only_cross_attention = [only_cross_attention] * len(down_block_types) 129 | 130 | if isinstance(attention_head_dim, int): 131 | attention_head_dim = (attention_head_dim,) * len(down_block_types) 132 | 133 | if isinstance(transformer_layers_per_block, int): 134 | transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) 135 | 136 | # down 137 | output_channel = block_out_channels[0] 138 | 139 | controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) 140 | self.controlnet_down_blocks.append(controlnet_block) 141 | 142 | for i, down_block_type in enumerate(down_block_types): 143 | input_channel = output_channel 144 | output_channel = block_out_channels[i] 145 | is_final_block = i == len(block_out_channels) - 1 146 | 147 | down_block = get_down_block( 148 | down_block_type, 149 | transformer_layers_per_block=transformer_layers_per_block[i], 150 | num_layers=layers_per_block, 151 | in_channels=input_channel, 152 | out_channels=output_channel, 153 | temb_channels=time_embed_dim, 154 | resnet_eps=norm_eps, 155 | resnet_act_fn=act_fn, 156 | cross_attention_dim=cross_attention_dim, 157 | attn_num_head_channels=attention_head_dim[i], 158 | downsample_padding=downsample_padding, 159 | add_downsample=not is_final_block, 160 | ) 161 | self.down_blocks.append(down_block) 162 | 163 | for _ in range(layers_per_block): 164 | controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) 165 | self.controlnet_down_blocks.append(controlnet_block) 166 | 167 | if not is_final_block: 168 | controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) 169 | self.controlnet_down_blocks.append(controlnet_block) 170 | 171 | # mid 172 | mid_block_channel = block_out_channels[-1] 173 | 174 | controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1) 175 | self.controlnet_mid_block = 
controlnet_block 176 | 177 | self.mid_block = UNetMidBlock2DCrossAttn( 178 | in_channels=mid_block_channel, 179 | temb_channels=time_embed_dim, 180 | resnet_eps=norm_eps, 181 | resnet_act_fn=act_fn, 182 | output_scale_factor=mid_block_scale_factor, 183 | resnet_time_scale_shift=resnet_time_scale_shift, 184 | cross_attention_dim=cross_attention_dim, 185 | attn_num_head_channels=attention_head_dim[-1], 186 | resnet_groups=norm_num_groups, 187 | use_linear_projection=use_linear_projection, 188 | upcast_attention=upcast_attention, 189 | ) 190 | 191 | def get_num_residuals(self): 192 | num_res = 2 # initial sample + mid block 193 | for down_block in self.down_blocks: 194 | num_res += len(down_block.resnets) 195 | if hasattr(down_block, "downsamplers") and down_block.downsamplers is not None: 196 | num_res += len(down_block.downsamplers) 197 | return num_res 198 | 199 | def forward( 200 | self, 201 | sample, 202 | timestep, 203 | encoder_hidden_states, 204 | controlnet_cond, 205 | ): 206 | # 1. time 207 | t_emb = self.time_proj(timestep) 208 | emb = self.time_embedding(t_emb) 209 | 210 | # 2. pre-process 211 | sample = self.conv_in(sample) 212 | 213 | controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) 214 | 215 | sample += controlnet_cond 216 | 217 | # 3. down 218 | down_block_res_samples = (sample,) 219 | for downsample_block in self.down_blocks: 220 | if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None: 221 | sample, res_samples = downsample_block( 222 | hidden_states=sample, 223 | temb=emb, 224 | encoder_hidden_states=encoder_hidden_states, 225 | ) 226 | else: 227 | sample, res_samples = downsample_block(hidden_states=sample, temb=emb) 228 | 229 | down_block_res_samples += res_samples 230 | 231 | # 4. mid 232 | if self.mid_block is not None: 233 | sample = self.mid_block( 234 | sample, 235 | emb, 236 | encoder_hidden_states=encoder_hidden_states, 237 | ) 238 | 239 | # 5. Control net blocks 240 | controlnet_down_block_res_samples = () 241 | 242 | for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): 243 | down_block_res_sample = controlnet_block(down_block_res_sample) 244 | controlnet_down_block_res_samples += (down_block_res_sample,) 245 | 246 | down_block_res_samples = controlnet_down_block_res_samples 247 | 248 | mid_block_res_sample = self.controlnet_mid_block(sample) 249 | 250 | return down_block_res_samples, mid_block_res_sample -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/coreml_model.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE.md file. 3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | import coremltools as ct 7 | 8 | import logging 9 | import json 10 | 11 | logging.basicConfig() 12 | logger = logging.getLogger(__name__) 13 | logger.setLevel(logging.INFO) 14 | 15 | import numpy as np 16 | 17 | import os 18 | import time 19 | import subprocess 20 | import sys 21 | 22 | 23 | def _macos_version(): 24 | """ 25 | Returns macOS version as a tuple of integers. On non-Macs, returns an empty tuple. 
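For example, macOS 14.4.1 (reported by `sw_vers -productVersion` as "14.4.1") is parsed and returned as (14, 4, 1).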
26 | """ 27 | if sys.platform == "darwin": 28 | try: 29 | ver_str = subprocess.run(["sw_vers", "-productVersion"], stdout=subprocess.PIPE).stdout.decode('utf-8').strip('\n') 30 | return tuple([int(v) for v in ver_str.split(".")]) 31 | except: 32 | raise Exception("Unable to determine the macOS version") 33 | return () 34 | 35 | 36 | class CoreMLModel: 37 | """ Wrapper for running CoreML models using coremltools 38 | """ 39 | 40 | def __init__(self, model_path, compute_unit, sources='packages', optimization_hints=None): 41 | 42 | logger.info(f"Loading {model_path}") 43 | 44 | start = time.time() 45 | if sources == 'packages': 46 | assert os.path.exists(model_path) and model_path.endswith(".mlpackage") 47 | 48 | self.model = ct.models.MLModel( 49 | model_path, 50 | compute_units=ct.ComputeUnit[compute_unit], 51 | optimization_hints=optimization_hints, 52 | ) 53 | DTYPE_MAP = { 54 | 65552: np.float16, 55 | 65568: np.float32, 56 | 131104: np.int32, 57 | } 58 | self.expected_inputs = { 59 | input_tensor.name: { 60 | "shape": tuple(input_tensor.type.multiArrayType.shape), 61 | "dtype": DTYPE_MAP[input_tensor.type.multiArrayType.dataType], 62 | } 63 | for input_tensor in self.model._spec.description.input 64 | } 65 | elif sources == 'compiled': 66 | assert os.path.exists(model_path) and model_path.endswith(".mlmodelc") 67 | 68 | self.model = ct.models.CompiledMLModel( 69 | model_path, 70 | compute_units=ct.ComputeUnit[compute_unit], 71 | optimization_hints=optimization_hints, 72 | ) 73 | 74 | # Grab expected inputs from metadata.json 75 | with open(os.path.join(model_path, 'metadata.json'), 'r') as f: 76 | config = json.load(f)[0] 77 | 78 | self.expected_inputs = { 79 | input_tensor['name']: { 80 | "shape": tuple(eval(input_tensor['shape'])), 81 | "dtype": np.dtype(input_tensor['dataType'].lower()), 82 | } 83 | for input_tensor in config['inputSchema'] 84 | } 85 | else: 86 | raise ValueError(f'Expected `packages` or `compiled` for sources, received {sources}') 87 | 88 | load_time = time.time() - start 89 | logger.info(f"Done. Took {load_time:.1f} seconds.") 90 | 91 | if load_time > LOAD_TIME_INFO_MSG_TRIGGER: 92 | logger.info( 93 | "Loading a CoreML model through coremltools triggers compilation every time. " 94 | "The Swift package we provide uses precompiled Core ML models (.mlmodelc) to avoid compile-on-load." 95 | ) 96 | 97 | def _verify_inputs(self, **kwargs): 98 | for k, v in kwargs.items(): 99 | if k in self.expected_inputs: 100 | if not isinstance(v, np.ndarray): 101 | raise TypeError( 102 | f"Expected numpy.ndarray, got {v} for input: {k}") 103 | 104 | expected_dtype = self.expected_inputs[k]["dtype"] 105 | if not v.dtype == expected_dtype: 106 | raise TypeError( 107 | f"Expected dtype {expected_dtype}, got {v.dtype} for input: {k}" 108 | ) 109 | 110 | expected_shape = self.expected_inputs[k]["shape"] 111 | if not v.shape == expected_shape: 112 | raise TypeError( 113 | f"Expected shape {expected_shape}, got {v.shape} for input: {k}" 114 | ) 115 | else: 116 | raise ValueError(f"Received unexpected input kwarg: {k}") 117 | 118 | def __call__(self, **kwargs): 119 | self._verify_inputs(**kwargs) 120 | return self.model.predict(kwargs) 121 | 122 | 123 | LOAD_TIME_INFO_MSG_TRIGGER = 10 # seconds 124 | 125 | 126 | def get_resource_type(resources_dir: str) -> str: 127 | """ 128 | Detect resource type based on filepath extensions. 
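Scans the immediate subdirectories of `resources_dir` and infers from their extensions whether CoreMLModel should load .mlpackage bundles (via ct.models.MLModel) or precompiled .mlmodelc bundles (via ct.models.CompiledMLModel).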
129 | returns: 130 | `packages`: for .mlpackage resources 131 | 'compiled`: for .mlmodelc resources 132 | """ 133 | directories = [f for f in os.listdir(resources_dir) if os.path.isdir(os.path.join(resources_dir, f))] 134 | 135 | # consider directories ending with extension 136 | extensions = set([os.path.splitext(e)[1] for e in directories if os.path.splitext(e)[1]]) 137 | 138 | # if one extension present we may be able to infer sources type 139 | if len(set(extensions)) == 1: 140 | extension = extensions.pop() 141 | else: 142 | raise ValueError(f'Multiple file extensions found at {resources_dir}.' 143 | f'Cannot infer resource type from contents.') 144 | 145 | if extension == '.mlpackage': 146 | sources = 'packages' 147 | elif extension == '.mlmodelc': 148 | sources = 'compiled' 149 | else: 150 | raise ValueError(f'Did not find .mlpackage or .mlmodelc at {resources_dir}') 151 | 152 | return sources 153 | 154 | 155 | def _load_mlpackage(submodule_name, 156 | mlpackages_dir, 157 | model_version, 158 | compute_unit, 159 | sources=None): 160 | """ 161 | Load Core ML (mlpackage) models from disk (As exported by torch2coreml.py) 162 | 163 | """ 164 | 165 | # if sources not provided, attempt to infer `packages` or `compiled` from the 166 | # resources directory 167 | if sources is None: 168 | sources = get_resource_type(mlpackages_dir) 169 | 170 | if sources == 'packages': 171 | logger.info(f"Loading {submodule_name} mlpackage") 172 | fname = f"Stable_Diffusion_version_{model_version}_{submodule_name}.mlpackage".replace( 173 | "/", "_") 174 | mlpackage_path = os.path.join(mlpackages_dir, fname) 175 | 176 | if not os.path.exists(mlpackage_path): 177 | raise FileNotFoundError( 178 | f"{submodule_name} CoreML model doesn't exist at {mlpackage_path}") 179 | 180 | elif sources == 'compiled': 181 | logger.info(f"Loading {submodule_name} mlmodelc") 182 | 183 | # FixMe: Submodule names and compiled resources names differ. Can change if names match in the future. 184 | submodule_names = ["text_encoder", "text_encoder_2", "unet", "vae_decoder", "vae_encoder", "safety_checker"] 185 | compiled_names = ['TextEncoder', 'TextEncoder2', 'Unet', 'VAEDecoder', 'VAEEncoder', 'SafetyChecker'] 186 | name_map = dict(zip(submodule_names, compiled_names)) 187 | 188 | cname = name_map[submodule_name] + '.mlmodelc' 189 | mlpackage_path = os.path.join(mlpackages_dir, cname) 190 | 191 | if not os.path.exists(mlpackage_path): 192 | raise FileNotFoundError( 193 | f"{submodule_name} CoreML model doesn't exist at {mlpackage_path}") 194 | 195 | # On macOS 15+, set fast prediction optimization hint for the unet. 
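# SpecializationStrategy.FastPrediction trades longer one-time model specialization at
# load time for lower per-call latency; that pays off for the unet, which is invoked
# once per denoising step, while the other submodules keep the default strategy.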
196 | optimization_hints = None 197 | if submodule_name == "unet" and _macos_version() >= (15, 0): 198 | optimization_hints = {"specializationStrategy": ct.SpecializationStrategy.FastPrediction} 199 | 200 | return CoreMLModel(mlpackage_path, 201 | compute_unit, 202 | sources=sources, 203 | optimization_hints=optimization_hints) 204 | 205 | 206 | def _load_mlpackage_controlnet(mlpackages_dir, model_version, compute_unit): 207 | """ Load Core ML (mlpackage) models from disk (As exported by torch2coreml.py) 208 | """ 209 | model_name = model_version.replace("/", "_") 210 | 211 | logger.info(f"Loading controlnet_{model_name} mlpackage") 212 | 213 | fname = f"ControlNet_{model_name}.mlpackage" 214 | 215 | mlpackage_path = os.path.join(mlpackages_dir, fname) 216 | 217 | if not os.path.exists(mlpackage_path): 218 | raise FileNotFoundError( 219 | f"controlnet_{model_name} CoreML model doesn't exist at {mlpackage_path}") 220 | 221 | return CoreMLModel(mlpackage_path, compute_unit) 222 | 223 | 224 | def get_available_compute_units(): 225 | return tuple(cu for cu in ct.ComputeUnit._member_names_) 226 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/layer_norm.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE.md file. 3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved. 4 | # 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | # Reference: https://github.com/apple/ml-ane-transformers/blob/main/ane_transformers/reference/layer_norm.py 11 | class LayerNormANE(nn.Module): 12 | """ LayerNorm optimized for Apple Neural Engine (ANE) execution 13 | 14 | Note: This layer only supports normalization over the final dim. It expects `num_channels` 15 | as an argument and not `normalized_shape` which is used by `torch.nn.LayerNorm`. 16 | """ 17 | 18 | def __init__(self, 19 | num_channels, 20 | clip_mag=None, 21 | eps=1e-5, 22 | elementwise_affine=True): 23 | """ 24 | Args: 25 | num_channels: Number of channels (C) where the expected input data format is BC1S. S stands for sequence length. 26 | clip_mag: Optional float value to use for clamping the input range before layer norm is applied. 27 | If specified, helps reduce risk of overflow. 
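Overflow is a real risk because the ANE computes in float16 (largest finite value 65504), so the intermediate sum of squares used for the variance can saturate for large activations.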
28 | eps: Small value to avoid dividing by zero 29 | elementwise_affine: If true, adds learnable channel-wise shift (bias) and scale (weight) parameters 30 | """ 31 | super().__init__() 32 | # Principle 1: Picking the Right Data Format (machinelearning.apple.com/research/apple-neural-engine) 33 | self.expected_rank = len("BC1S") 34 | 35 | self.num_channels = num_channels 36 | self.eps = eps 37 | self.clip_mag = clip_mag 38 | self.elementwise_affine = elementwise_affine 39 | 40 | if self.elementwise_affine: 41 | self.weight = nn.Parameter(torch.Tensor(num_channels)) 42 | self.bias = nn.Parameter(torch.Tensor(num_channels)) 43 | 44 | self._reset_parameters() 45 | 46 | def _reset_parameters(self): 47 | if self.elementwise_affine: 48 | nn.init.ones_(self.weight) 49 | nn.init.zeros_(self.bias) 50 | 51 | def forward(self, inputs): 52 | input_rank = len(inputs.size()) 53 | 54 | # Principle 1: Picking the Right Data Format (machinelearning.apple.com/research/apple-neural-engine) 55 | # Migrate the data format from BSC to BC1S (most conducive to ANE) 56 | if input_rank == 3 and inputs.size(2) == self.num_channels: 57 | inputs = inputs.transpose(1, 2).unsqueeze(2) 58 | input_rank = len(inputs.size()) 59 | 60 | assert input_rank == self.expected_rank 61 | assert inputs.size(1) == self.num_channels 62 | 63 | if self.clip_mag is not None: 64 | inputs.clamp_(-self.clip_mag, self.clip_mag) 65 | 66 | channels_mean = inputs.mean(dim=1, keepdims=True) 67 | 68 | zero_mean = inputs - channels_mean 69 | 70 | zero_mean_sq = zero_mean * zero_mean 71 | 72 | denom = (zero_mean_sq.mean(dim=1, keepdims=True) + self.eps).rsqrt() 73 | 74 | out = zero_mean * denom 75 | 76 | if self.elementwise_affine: 77 | out = (out + self.bias.view(1, self.num_channels, 1, 1) 78 | ) * self.weight.view(1, self.num_channels, 1, 1) 79 | 80 | return out 81 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/mixed_bit_compression_apply.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gc 3 | import json 4 | import logging 5 | import os 6 | 7 | import coremltools as ct 8 | import coremltools.optimize.coreml as cto 9 | import numpy as np 10 | 11 | from python_coreml_stable_diffusion.torch2coreml import get_pipeline 12 | from python_coreml_stable_diffusion.mixed_bit_compression_pre_analysis import ( 13 | NBITS, 14 | PALETTIZE_MIN_SIZE as MIN_SIZE 15 | ) 16 | 17 | 18 | logging.basicConfig() 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(logging.INFO) 21 | 22 | 23 | def main(args): 24 | # Load Core ML model 25 | coreml_model = ct.models.MLModel(args.mlpackage_path, compute_units=ct.ComputeUnit.CPU_ONLY) 26 | logger.info(f"Loaded {args.mlpackage_path}") 27 | 28 | # Load palettization recipe 29 | with open(args.pre_analysis_json_path, 'r') as f: 30 | pre_analysis = json.load(f) 31 | 32 | if args.selected_recipe not in list(pre_analysis["recipes"]): 33 | raise KeyError( 34 | f"--selected-recipe ({args.selected_recipe}) not found in " 35 | f"--pre-analysis-json-path ({args.pre_analysis_json_path}). " 36 | f" Available recipes: {list(pre_analysis['recipes'])}" 37 | ) 38 | 39 | 40 | recipe = pre_analysis["recipes"][args.selected_recipe] 41 | assert all(nbits in NBITS + [16] for nbits in recipe.values()), \ 42 | f"Some nbits values in the recipe are illegal. 
Allowed values: {NBITS}" 43 | 44 | # Hash tensors to be able to match torch tensor names to mil tensors 45 | def get_tensor_hash(tensor): 46 | assert tensor.dtype == np.float16 47 | return tensor.ravel()[0] + np.prod(tensor.shape) 48 | 49 | args.model_version = pre_analysis["model_version"] 50 | pipe = get_pipeline(args) 51 | torch_model = pipe.unet 52 | 53 | hashed_recipe = {} 54 | for torch_module_name, nbits in recipe.items(): 55 | tensor = [ 56 | tensor.cpu().numpy().astype(np.float16) for name,tensor in torch_model.named_parameters() 57 | if name == torch_module_name + '.weight' 58 | ][0] 59 | hashed_recipe[get_tensor_hash(tensor)] = nbits 60 | 61 | del pipe 62 | gc.collect() 63 | 64 | op_name_configs = {} 65 | weight_metadata = cto.get_weights_metadata(coreml_model, weight_threshold=MIN_SIZE) 66 | hashes = np.array(list(hashed_recipe)) 67 | for name, metadata in weight_metadata.items(): 68 | # Look up target bits for this weight 69 | tensor_hash = get_tensor_hash(metadata.val) 70 | pdist = np.abs(hashes - tensor_hash) 71 | assert(pdist.min() < 0.01) 72 | matched = pdist.argmin() 73 | target_nbits = hashed_recipe[hashes[matched]] 74 | 75 | if target_nbits == 16: 76 | continue 77 | 78 | op_name_configs[name] = cto.OpPalettizerConfig( 79 | mode="kmeans", 80 | nbits=target_nbits, 81 | weight_threshold=int(MIN_SIZE) 82 | ) 83 | 84 | config = ct.optimize.coreml.OptimizationConfig(op_name_configs=op_name_configs) 85 | coreml_model = ct.optimize.coreml.palettize_weights(coreml_model, config) 86 | 87 | coreml_model.save(args.o) 88 | 89 | 90 | if __name__ == "__main__": 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument( 93 | "-o", 94 | required=True, 95 | help="Output directory to save the custom palettized model" 96 | ) 97 | parser.add_argument( 98 | "--mlpackage-path", 99 | required=True, 100 | help="Path to .mlpackage model to be palettized" 101 | ) 102 | parser.add_argument( 103 | "--pre-analysis-json-path", 104 | required=True, 105 | type=str, 106 | help=("The JSON file generated by mixed_bit_compression_pre_analysis.py" 107 | )) 108 | parser.add_argument( 109 | "--selected-recipe", 110 | required=True, 111 | type=str, 112 | help=("The string key into --pre-analysis-json-path's baselines dict" 113 | )) 114 | parser.add_argument( 115 | "--custom-vae-version", 116 | type=str, 117 | default=None, 118 | help= 119 | ("Custom VAE checkpoint to override the pipeline's built-in VAE. " 120 | "If specified, the specified VAE will be converted instead of the one associated to the `--model-version` checkpoint. " 121 | "No precision override is applied when using a custom VAE." 
122 | )) 123 | 124 | args = parser.parse_args() 125 | 126 | if not os.path.exists(args.mlpackage_path): 127 | raise FileNotFoundError(args.mlpackage_path) 128 | if not os.path.exists(args.pre_analysis_json_path): 129 | raise FileNotFoundError(args.pre_analysis_json_path) 130 | if not args.pre_analysis_json_path.endswith('.json'): 131 | raise ValueError("--pre-analysis-json-path should end with '.json'") 132 | 133 | main(args) 134 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/multilingual_projection.py: -------------------------------------------------------------------------------- 1 | from python_coreml_stable_diffusion.torch2coreml import _compile_coreml_model 2 | 3 | import argparse 4 | import coremltools as ct 5 | import numpy as np 6 | import os 7 | import torch 8 | import torch.nn as nn 9 | 10 | # TODO: Read these values off of the NLContextualEmbedding API to enforce dimensions and track API versioning 11 | MAX_SEQUENCE_LENGTH = 256 12 | EMBED_DIM = 512 13 | BATCH_SIZE = 1 14 | 15 | def main(args): 16 | # Layer that was trained to map NLContextualEmbedding to your text_encoder.hidden_size dimensionality 17 | text_encoder_projection = torch.jit.load(args.input_path) 18 | 19 | # Prepare random inputs for tracing the network before conversion 20 | random_input = torch.randn(BATCH_SIZE, MAX_SEQUENCE_LENGTH, EMBED_DIM) 21 | 22 | # Create a class to bake in the reshape operations required to fit the existing model interface 23 | class TextEncoderProjection(nn.Module): 24 | def __init__(self, proj): 25 | super().__init__() 26 | self.proj = proj 27 | 28 | def forward(self, x): 29 | return self.proj(x).transpose(1, 2).unsqueeze(2) # BSC -> BC1S 30 | 31 | # Trace the torch model 32 | text_encoder_projection = torch.jit.trace(TextEncoderProjection(text_encoder_projection), (random_input,)) 33 | 34 | # Convert the model to Core ML 35 | mlpackage_path = os.path.join(args.output_dir, "MultilingualTextEncoderProjection.mlpackage") 36 | ct.convert( 37 | text_encoder_projection, 38 | inputs=[ct.TensorType('nlcontextualembeddings_output', shape=(1, MAX_SEQUENCE_LENGTH, EMBED_DIM), dtype=np.float32)], 39 | outputs=[ct.TensorType('encoder_hidden_states', dtype=np.float32)], 40 | minimum_deployment_target=ct.target.macOS14, # NLContextualEmbedding minimum availability build 41 | convert_to='mlprogram', 42 | ).save(mlpackage_path) 43 | 44 | # Compile the model and save it under the specified directory 45 | _compile_coreml_model(mlpackage_path, args.output_dir, final_name="MultilingualTextEncoderProjection") 46 | 47 | 48 | if __name__ == "__main__": 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument( 51 | "--input-path", 52 | help="Path to the torchscript file that contains the projection layer" 53 | ) 54 | parser.add_argument( 55 | "--output-dir", 56 | help="Output directory in which the Core ML model should be saved", 57 | ) 58 | args = parser.parse_args() 59 | 60 | main(args) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | coremltools>=8.0 2 | diffusers[torch]==0.30.2 3 | diffusionkit==0.4.0 4 | torch 5 | transformers==4.44.2 6 | scipy 7 | scikit-learn 8 | pytest 9 | invisible-watermark 10 | safetensors 11 | matplotlib 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from
python_coreml_stable_diffusion._version import __version__ 4 | 5 | with open('README.md') as f: 6 | readme = f.read() 7 | 8 | setup( 9 | name='python_coreml_stable_diffusion', 10 | version=__version__, 11 | url='https://github.com/apple/ml-stable-diffusion', 12 | description="Run Stable Diffusion on Apple Silicon with Core ML (Python and Swift)", 13 | long_description=readme, 14 | long_description_content_type='text/markdown', 15 | author='Apple Inc.', 16 | install_requires=[ 17 | "coremltools>=8.0", 18 | "diffusers[torch]==0.30.2", 19 | "torch", 20 | "transformers==4.44.2", 21 | "huggingface-hub==0.24.6", 22 | "scipy", 23 | "numpy<1.24", 24 | "pytest", 25 | "scikit-learn", 26 | "invisible-watermark", 27 | "safetensors", 28 | "matplotlib", 29 | "diffusionkit==0.4.0", 30 | ], 31 | packages=find_packages(), 32 | classifiers=[ 33 | "Development Status :: 4 - Beta", 34 | "Intended Audience :: Developers", 35 | "Operating System :: MacOS :: MacOS X", 36 | "Programming Language :: Python :: 3", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | "Programming Language :: Python :: 3.9", 40 | "Topic :: Artificial Intelligence", 41 | "Topic :: Scientific/Engineering", 42 | "Topic :: Software Development", 43 | ], 44 | ) 45 | -------------------------------------------------------------------------------- /swift/StableDiffusion/pipeline/CGImage+vImage.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 3 | 4 | import Foundation 5 | import Accelerate 6 | import CoreML 7 | import CoreGraphics 8 | 9 | @available(iOS 16.0, macOS 13.0, *) 10 | extension CGImage { 11 | 12 | typealias PixelBufferPFx1 = vImage.PixelBuffer 13 | typealias PixelBufferP8x3 = vImage.PixelBuffer 14 | typealias PixelBufferIFx3 = vImage.PixelBuffer 15 | typealias PixelBufferI8x3 = vImage.PixelBuffer 16 | 17 | public enum ShapedArrayError: String, Swift.Error { 18 | case wrongNumberOfChannels 19 | case incorrectFormatsConvertingToShapedArray 20 | case vImageConverterNotInitialized 21 | } 22 | 23 | public static func fromShapedArray(_ array: MLShapedArray) throws -> CGImage { 24 | 25 | // array is [N,C,H,W], where C==3 26 | let channelCount = array.shape[1] 27 | guard channelCount == 3 else { 28 | throw ShapedArrayError.wrongNumberOfChannels 29 | } 30 | 31 | let height = array.shape[2] 32 | let width = array.shape[3] 33 | 34 | // Normalize each channel into a float between 0 and 1.0 35 | let floatChannels = (0.. [0.0 1.0] 46 | cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut) 47 | } 48 | return cOut 49 | } 50 | 51 | // Convert to interleaved and then to UInt8 52 | let floatImage = PixelBufferIFx3(planarBuffers: floatChannels) 53 | let uint8Image = PixelBufferI8x3(width: width, height: height) 54 | floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips 55 | 56 | // Convert to uint8x3 to RGB CGImage (no alpha) 57 | let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue) 58 | let cgImage = uint8Image.makeCGImage(cgImageFormat: 59 | .init(bitsPerComponent: 8, 60 | bitsPerPixel: 3*8, 61 | colorSpace: CGColorSpace(name: CGColorSpace.sRGB) ?? CGColorSpaceCreateDeviceRGB(), 62 | bitmapInfo: bitmapInfo)!)! 
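// At this point the model's planar float channels have been mapped from [-1.0, 1.0]
// into [0.0, 1.0], interleaved, and quantized to 8 bits per component, so `cgImage`
// is an 8-bit-per-channel RGB image with no alpha (sRGB when available, device RGB
// otherwise). The force unwraps above assume this fixed 8-bit RGB format is always
// accepted by CoreGraphics.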
63 | 64 | return cgImage 65 | } 66 | 67 | public func planarRGBShapedArray(minValue: Float, maxValue: Float) 68 | throws -> MLShapedArray { 69 | guard 70 | var sourceFormat = vImage_CGImageFormat(cgImage: self), 71 | var mediumFormat = vImage_CGImageFormat( 72 | bitsPerComponent: 8 * MemoryLayout.size, 73 | bitsPerPixel: 8 * MemoryLayout.size * 4, 74 | colorSpace: CGColorSpaceCreateDeviceRGB(), 75 | bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.first.rawValue)), 76 | let width = vImagePixelCount(exactly: self.width), 77 | let height = vImagePixelCount(exactly: self.height) 78 | else { 79 | throw ShapedArrayError.incorrectFormatsConvertingToShapedArray 80 | } 81 | 82 | var sourceImageBuffer = try vImage_Buffer(cgImage: self) 83 | 84 | var mediumDestination = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: mediumFormat.bitsPerPixel) 85 | 86 | let converter = vImageConverter_CreateWithCGImageFormat( 87 | &sourceFormat, 88 | &mediumFormat, 89 | nil, 90 | vImage_Flags(kvImagePrintDiagnosticsToConsole), 91 | nil) 92 | 93 | guard let converter = converter?.takeRetainedValue() else { 94 | throw ShapedArrayError.vImageConverterNotInitialized 95 | } 96 | 97 | vImageConvert_AnyToAny(converter, &sourceImageBuffer, &mediumDestination, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole)) 98 | 99 | var destinationA = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) 100 | var destinationR = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) 101 | var destinationG = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) 102 | var destinationB = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout.size)) 103 | 104 | var minFloat: [Float] = Array(repeating: minValue, count: 4) 105 | var maxFloat: [Float] = Array(repeating: maxValue, count: 4) 106 | 107 | vImageConvert_ARGB8888toPlanarF(&mediumDestination, &destinationA, &destinationR, &destinationG, &destinationB, &maxFloat, &minFloat, .zero) 108 | 109 | let destAPtr = destinationA.data.assumingMemoryBound(to: Float.self) 110 | let destRPtr = destinationR.data.assumingMemoryBound(to: Float.self) 111 | let destGPtr = destinationG.data.assumingMemoryBound(to: Float.self) 112 | let destBPtr = destinationB.data.assumingMemoryBound(to: Float.self) 113 | 114 | for i in 0..(data: imageData, shape: [1, 3, self.height, self.width]) 129 | 130 | return shapedArray 131 | } 132 | 133 | private func normalizePixelValues(pixel: UInt8) -> Float { 134 | return (Float(pixel) / 127.5) - 1.0 135 | } 136 | 137 | public func toRGBShapedArray(minValue: Float, maxValue: Float) 138 | throws -> MLShapedArray { 139 | let image = self 140 | let width = image.width 141 | let height = image.height 142 | let alphaMaskValue: Float = minValue 143 | 144 | guard let colorSpace = CGColorSpace(name: CGColorSpace.sRGB), 145 | let context = CGContext(data: nil, width: width, height: height, bitsPerComponent: 8, bytesPerRow: 4 * width, space: colorSpace, bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue), 146 | let ptr = context.data?.bindMemory(to: UInt8.self, capacity: width * height * 4) else { 147 | return [] 148 | } 149 | 150 | context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) 151 | 152 | var redChannel = [Float](repeating: 0, count: width * height) 153 | var greenChannel = [Float](repeating: 0, count: width * height) 154 | var blueChannel 
= [Float](repeating: 0, count: width * height) 155 | 156 | for y in 0..(scalars: redChannel, shape: colorShape) 174 | let greenShapedArray = MLShapedArray(scalars: greenChannel, shape: colorShape) 175 | let blueShapedArray = MLShapedArray(scalars: blueChannel, shape: colorShape) 176 | 177 | let shapedArray = MLShapedArray(concatenating: [redShapedArray, greenShapedArray, blueShapedArray], alongAxis: 1) 178 | 179 | return shapedArray 180 | } 181 | } 182 | 183 | extension vImage_Buffer { 184 | func unpaddedData() -> Data { 185 | let bytesPerPixel = self.rowBytes / Int(self.width) 186 | let bytesPerRow = Int(self.width) * bytesPerPixel 187 | 188 | var contiguousPixelData = Data(capacity: bytesPerRow * Int(self.height)) 189 | for row in 0..