├── tests ├── __init__.py └── test_stable_diffusion.py ├── python_coreml_stable_diffusion ├── _version.py ├── __init__.py ├── layer_norm.py ├── coreml_model.py ├── chunk_mlprogram.py └── pipeline.py ├── requirements.txt ├── assets ├── readme_reel.png ├── a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space │ ├── randomSeed_13_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_93_computeUnit_ALL_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_13_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_13_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png │ └── randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png └── a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space │ ├── randomSeed_123456789_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_93_computeUnit_ALL_modelVersion_stabilityai_stable-diffusion-2-base.png │ ├── randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png │ ├── randomSeed_123456789_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png │ ├── randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png │ ├── randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png │ └── randomSeed_123456789_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png ├── CONTRIBUTING.md ├── setup.py ├── swift ├── StableDiffusion │ ├── tokenizer │ │ ├── BPETokenizer+Reading.swift │ │ └── BPETokenizer.swift │ └── pipeline │ │ ├── SampleTimer.swift │ │ ├── TextEncoder.swift │ │ ├── StableDiffusionPipeline+Resources.swift │ │ ├── Decoder.swift │ │ ├── Random.swift │ │ ├── Unet.swift │ │ ├── SafetyChecker.swift │ │ ├── StableDiffusionPipeline.swift │ │ └── Scheduler.swift ├── StableDiffusionTests │ └── StableDiffusionTests.swift └── StableDiffusionCLI │ └── main.swift ├── Package.swift ├── LICENSE.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── README.md └── ACKNOWLEDGEMENTS /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | coremltools 2 | diffusers[torch] 3 | torch 4 | transformers 5 | scipy -------------------------------------------------------------------------------- /assets/readme_reel.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/readme_reel.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_ALL_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_ALL_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_ALL_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_ALL_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_ALL_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png -------------------------------------------------------------------------------- /assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/justjake/ml-stable-diffusion/HEAD/assets/a_high_quality_photo_of_an_astronaut_riding_a_dragon_in_space/randomSeed_123456789_computeUnit_CPU_AND_GPU_modelVersion_CompVis_stable-diffusion-v1-4.png -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guide 2 | 3 | Thanks for your interest in contributing. This project was released for system demonstration purposes and there are limited plans for future development of the repository. 4 | 5 | While we welcome new pull requests and issues please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged. 6 | 7 | ## Before you get started 8 | 9 | By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE). 10 | 11 | We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md). 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from python_coreml_stable_diffusion._version import __version__ 4 | 5 | with open('README.md') as f: 6 | readme = f.read() 7 | 8 | setup( 9 | name='python_coreml_stable_diffusion', 10 | version=__version__, 11 | url='https://github.com/apple/ml-stable-diffusion', 12 | description="Run Stable Diffusion on Apple Silicon with Core ML (Python and Swift)", 13 | long_description=readme, 14 | long_description_content_type='text/markdown', 15 | author='Apple Inc.', 16 | install_requires=[ 17 | "coremltools>=6.1", 18 | "diffusers[torch]", 19 | "torch", 20 | "transformers", 21 | "scipy", 22 | ], 23 | packages=find_packages(), 24 | classifiers=[ 25 | "Development Status :: 4 - Beta", 26 | "Intended Audience :: Developers", 27 | "Operating System :: MacOS :: MacOS X", 28 | "Programming Language :: Python :: 3", 29 | "Programming Language :: Python :: 3.7", 30 | "Programming Language :: Python :: 3.8", 31 | "Programming Language :: Python :: 3.9", 32 | "Topic :: Artificial Intelligence", 33 | "Topic :: Scientific/Engineering", 34 | "Topic :: Software Development", 35 | ], 36 | ) 37 | -------------------------------------------------------------------------------- /swift/StableDiffusion/tokenizer/BPETokenizer+Reading.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 3 | 4 | import Foundation 5 | 6 | extension BPETokenizer { 7 | enum FileReadError: Error { 8 | case invalidMergeFileLine(Int) 9 | } 10 | 11 | /// Read vocab.json file at URL into a dictionary mapping a String to its Int token id 12 | static func readVocabulary(url: URL) throws -> [String: Int] { 13 | let content = try Data(contentsOf: url) 14 | return try JSONDecoder().decode([String: Int].self, from: content) 15 | } 16 | 17 | /// Read merges.txt file at URL into a dictionary mapping bigrams to the line number/rank/priority 18 | static func readMerges(url: URL) throws -> [TokenPair: Int] { 19 | let content = try String(contentsOf: url) 20 | let lines = content.split(separator: "\n") 21 | 22 | let merges: [(TokenPair, Int)] = try lines.enumerated().compactMap { (index, line) in 23 | if line.hasPrefix("#") { 24 | return nil 25 | } 26 | let pair = line.split(separator: " ") 27 | if pair.count != 2 { 28 | throw FileReadError.invalidMergeFileLine(index+1) 29 | } 30 | return (TokenPair(String(pair[0]), String(pair[1])),index) 31 | } 32 | return [TokenPair : Int](uniqueKeysWithValues: merges) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.7 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 
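// Note (added for illustration, not part of the original manifest): a client
// package would typically consume the "StableDiffusion" library product that
// this manifest declares below. The dependency URL and branch here are
// assumptions, shown only as a sketch:
//
//     dependencies: [
//         .package(url: "https://github.com/apple/ml-stable-diffusion", branch: "main")
//     ],
//     targets: [
//         .executableTarget(
//             name: "MyApp",
//             dependencies: [
//                 .product(name: "StableDiffusion", package: "ml-stable-diffusion")
//             ])
//     ]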
3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "stable-diffusion", 8 | platforms: [ 9 | .macOS(.v13), 10 | .iOS(.v16), 11 | ], 12 | products: [ 13 | .library( 14 | name: "StableDiffusion", 15 | targets: ["StableDiffusion"]), 16 | .executable( 17 | name: "StableDiffusionSample", 18 | targets: ["StableDiffusionCLI"]) 19 | ], 20 | dependencies: [ 21 | .package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.2.0") 22 | ], 23 | targets: [ 24 | .target( 25 | name: "StableDiffusion", 26 | dependencies: [], 27 | path: "swift/StableDiffusion"), 28 | .executableTarget( 29 | name: "StableDiffusionCLI", 30 | dependencies: [ 31 | "StableDiffusion", 32 | .product(name: "ArgumentParser", package: "swift-argument-parser")], 33 | path: "swift/StableDiffusionCLI"), 34 | .testTarget( 35 | name: "StableDiffusionTests", 36 | dependencies: ["StableDiffusion"], 37 | path: "swift/StableDiffusionTests", 38 | resources: [ 39 | .copy("Resources/vocab.json"), 40 | .copy("Resources/merges.txt") 41 | ]), 42 | ] 43 | ) 44 | -------------------------------------------------------------------------------- /swift/StableDiffusion/pipeline/SampleTimer.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 3 | 4 | import Foundation 5 | 6 | /// A utility for timing events and tracking time statistics 7 | /// 8 | /// Typical usage 9 | /// ``` 10 | /// let timer: SampleTimer 11 | /// 12 | /// for i in 0... Double { 37 | guard let startTime = startTime else { 38 | return 0 39 | } 40 | 41 | let elapsed = CFAbsoluteTimeGetCurrent() - startTime 42 | sum += elapsed 43 | sumOfSquares += elapsed * elapsed 44 | count += 1 45 | samples.append(elapsed) 46 | return elapsed 47 | } 48 | 49 | /// Mean of all sampled times 50 | public var mean: Double { sum / Double(count) } 51 | 52 | /// Variance of all sampled times 53 | public var variance: Double { 54 | guard count > 1 else { 55 | return 0.0 56 | } 57 | return sumOfSquares / Double(count - 1) - mean * mean 58 | } 59 | 60 | /// Standard deviation of all sampled times 61 | public var stdev: Double { variance.squareRoot() } 62 | 63 | /// Median of all sampled times 64 | public var median: Double { 65 | let sorted = samples.sorted() 66 | let (q, r) = sorted.count.quotientAndRemainder(dividingBy: 2) 67 | if r == 0 { 68 | return (sorted[q] + sorted[q - 1]) / 2.0 69 | } else { 70 | return Double(sorted[q]) 71 | } 72 | } 73 | 74 | public var allSamples: [Double] { 75 | samples 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (C) 2022 Apple Inc. All Rights Reserved. 2 | 3 | IMPORTANT: This Apple software is supplied to you by Apple 4 | Inc. ("Apple") in consideration of your agreement to the following 5 | terms, and your use, installation, modification or redistribution of 6 | this Apple software constitutes acceptance of these terms. If you do 7 | not agree with these terms, please do not use, install, modify or 8 | redistribute this Apple software. 
9 | 10 | In consideration of your agreement to abide by the following terms, and 11 | subject to these terms, Apple grants you a personal, non-exclusive 12 | license, under Apple's copyrights in this original Apple software (the 13 | "Apple Software"), to use, reproduce, modify and redistribute the Apple 14 | Software, with or without modifications, in source and/or binary forms; 15 | provided that if you redistribute the Apple Software in its entirety and 16 | without modifications, you must retain this notice and the following 17 | text and disclaimers in all such redistributions of the Apple Software. 18 | Neither the name, trademarks, service marks or logos of Apple Inc. may 19 | be used to endorse or promote products derived from the Apple Software 20 | without specific prior written permission from Apple. Except as 21 | expressly stated in this notice, no other rights or licenses, express or 22 | implied, are granted by Apple herein, including but not limited to any 23 | patent rights that may be infringed by your derivative works or by other 24 | works in which the Apple Software may be incorporated. 25 | 26 | The Apple Software is provided by Apple on an "AS IS" basis. APPLE 27 | MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION 28 | THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS 29 | FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND 30 | OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS. 31 | 32 | IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL 33 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 | INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, 36 | MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED 37 | AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE), 38 | STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE 39 | POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /swift/StableDiffusionTests/StableDiffusionTests.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 
3 | 4 | import XCTest 5 | import CoreML 6 | @testable import StableDiffusion 7 | 8 | final class StableDiffusionTests: XCTestCase { 9 | 10 | var vocabFileInBundleURL: URL { 11 | let fileName = "vocab" 12 | guard let url = Bundle.module.url(forResource: fileName, withExtension: "json") else { 13 | fatalError("BPE tokenizer vocabulary file is missing from bundle") 14 | } 15 | return url 16 | } 17 | 18 | var mergesFileInBundleURL: URL { 19 | let fileName = "merges" 20 | guard let url = Bundle.module.url(forResource: fileName, withExtension: "txt") else { 21 | fatalError("BPE tokenizer merges file is missing from bundle") 22 | } 23 | return url 24 | } 25 | 26 | func testBPETokenizer() throws { 27 | 28 | let tokenizer = try BPETokenizer(mergesAt: mergesFileInBundleURL, vocabularyAt: vocabFileInBundleURL) 29 | 30 | func testPrompt(prompt: String, expectedIds: [Int]) { 31 | 32 | let (tokens, ids) = tokenizer.tokenize(input: prompt) 33 | 34 | print("Tokens = \(tokens)\n") 35 | print("Expected tokens = \(expectedIds.map({ tokenizer.token(id: $0) }))") 36 | print("ids = \(ids)\n") 37 | print("Expected Ids = \(expectedIds)\n") 38 | 39 | XCTAssertEqual(ids,expectedIds) 40 | } 41 | 42 | testPrompt(prompt: "a photo of an astronaut riding a horse on mars", 43 | expectedIds: [49406, 320, 1125, 539, 550, 18376, 6765, 320, 4558, 525, 7496, 49407]) 44 | 45 | testPrompt(prompt: "Apple CoreML developer tools on a Macbook Air are fast", 46 | expectedIds: [49406, 3055, 19622, 5780, 10929, 5771, 525, 320, 20617, 47 | 1922, 631, 1953, 49407]) 48 | } 49 | 50 | func test_randomNormalValues_matchNumPyRandom() { 51 | var random = NumPyRandomSource(seed: 12345) 52 | let samples = random.normalArray(count: 10_000) 53 | let last5 = samples.suffix(5) 54 | 55 | // numpy.random.seed(12345); print(numpy.random.randn(10000)[-5:]) 56 | let expected = [-0.86285345, 2.15229409, -0.00670556, -1.21472309, 0.65498866] 57 | 58 | for (value, expected) in zip(last5, expected) { 59 | XCTAssertEqual(value, expected, accuracy: .ulpOfOne.squareRoot()) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Swift Package 2 | .DS_Store 3 | /.build 4 | /Packages 5 | /*.xcodeproj 6 | .swiftpm 7 | .vscode 8 | .*.sw? 9 | *.docc-build 10 | *.vs 11 | Package.resolved 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | pip-wheel-metadata/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # macOS filesystem 144 | *.DS_Store 145 | -------------------------------------------------------------------------------- /swift/StableDiffusion/pipeline/TextEncoder.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 3 | 4 | import Foundation 5 | import CoreML 6 | 7 | /// A model for encoding text 8 | public struct TextEncoder { 9 | 10 | /// Text tokenizer 11 | var tokenizer: BPETokenizer 12 | 13 | /// Embedding model 14 | var model: MLModel 15 | 16 | /// Creates text encoder which embeds a tokenized string 17 | /// 18 | /// - Parameters: 19 | /// - tokenizer: Tokenizer for input text 20 | /// - model: Model for encoding tokenized text 21 | public init(tokenizer: BPETokenizer, model: MLModel) { 22 | self.tokenizer = tokenizer 23 | self.model = model 24 | } 25 | 26 | /// Encode input text/string 27 | /// 28 | /// - Parameters: 29 | /// - text: Input text to be tokenized and then embedded 30 | /// - Returns: Embedding representing the input text 31 | public func encode(_ text: String) throws -> MLShapedArray { 32 | 33 | // Get models expected input length 34 | let inputLength = inputShape.last! 
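        // Added note (not in the original source): for Stable Diffusion's CLIP-based
        // text encoder this is typically 77 tokens, but the value is read from the
        // compiled model's input description rather than hard-coded.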
35 | 36 | // Tokenize, padding to the expected length 37 | var (tokens, ids) = tokenizer.tokenize(input: text, minCount: inputLength) 38 | 39 | // Truncate if necessary 40 | if ids.count > inputLength { 41 | tokens = tokens.dropLast(tokens.count - inputLength) 42 | ids = ids.dropLast(ids.count - inputLength) 43 | let truncated = tokenizer.decode(tokens: tokens) 44 | print("Needed to truncate input '\(text)' to '\(truncated)'") 45 | } 46 | 47 | // Use the model to generate the embedding 48 | return try encode(ids: ids) 49 | } 50 | 51 | /// Prediction queue 52 | let queue = DispatchQueue(label: "textencoder.predict") 53 | 54 | func encode(ids: [Int]) throws -> MLShapedArray { 55 | let inputName = inputDescription.name 56 | let inputShape = inputShape 57 | 58 | let floatIds = ids.map { Float32($0) } 59 | let inputArray = MLShapedArray(scalars: floatIds, shape: inputShape) 60 | let inputFeatures = try! MLDictionaryFeatureProvider( 61 | dictionary: [inputName: MLMultiArray(inputArray)]) 62 | 63 | let result = try queue.sync { try model.prediction(from: inputFeatures) } 64 | let embeddingFeature = result.featureValue(for: "last_hidden_state") 65 | return MLShapedArray(converting: embeddingFeature!.multiArrayValue!) 66 | } 67 | 68 | var inputDescription: MLFeatureDescription { 69 | model.modelDescription.inputDescriptionsByName.first!.value 70 | } 71 | 72 | var inputShape: [Int] { 73 | inputDescription.multiArrayConstraint!.shape.map { $0.intValue } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 
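//
// Usage sketch (added for illustration, not part of the original file). It
// assumes `resourceURL` points at a directory containing the compiled models
// (TextEncoder.mlmodelc, Unet.mlmodelc or UnetChunk1/2.mlmodelc, VAEDecoder.mlmodelc,
// optionally SafetyChecker.mlmodelc) plus vocab.json and merges.txt:
//
//     let config = MLModelConfiguration()
//     config.computeUnits = .cpuAndNeuralEngine
//     let pipeline = try StableDiffusionPipeline(resourcesAt: resourceURL,
//                                                configuration: config)
//     let images = try pipeline.generateImages(prompt: "a photo of an astronaut riding a horse on mars",
//                                              imageCount: 1,
//                                              stepCount: 50,
//                                              seed: 93) { _ in true }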
3 | 
4 | import Foundation
5 | import CoreML
6 | 
7 | public extension StableDiffusionPipeline {
8 | 
9 |     /// Create stable diffusion pipeline using model resources at a
10 |     /// specified URL
11 |     ///
12 |     /// - Parameters:
13 |     ///    - baseURL: URL pointing to directory holding all model
14 |     ///               and tokenization resources
15 |     ///    - configuration: The configuration to load model resources with
16 |     ///    - disableSafety: Disables the safety checker at load time to save memory
17 |     /// - Returns:
18 |     ///  Pipeline ready for image generation if all necessary resources loaded
19 |     init(resourcesAt baseURL: URL,
20 |          configuration config: MLModelConfiguration = .init(),
21 |          disableSafety: Bool = false) throws {
22 | 
23 |         // Expected URL of each resource
24 |         let textEncoderURL = baseURL.appending(path: "TextEncoder.mlmodelc")
25 |         let unetURL = baseURL.appending(path: "Unet.mlmodelc")
26 |         let unetChunk1URL = baseURL.appending(path: "UnetChunk1.mlmodelc")
27 |         let unetChunk2URL = baseURL.appending(path: "UnetChunk2.mlmodelc")
28 |         let decoderURL = baseURL.appending(path: "VAEDecoder.mlmodelc")
29 |         let safetyCheckerURL = baseURL.appending(path: "SafetyChecker.mlmodelc")
30 |         let vocabURL = baseURL.appending(path: "vocab.json")
31 |         let mergesURL = baseURL.appending(path: "merges.txt")
32 | 
33 |         // Text tokenizer and encoder
34 |         let tokenizer = try BPETokenizer(mergesAt: mergesURL, vocabularyAt: vocabURL)
35 |         let textEncoderModel = try MLModel(contentsOf: textEncoderURL, configuration: config)
36 |         let textEncoder = TextEncoder(tokenizer: tokenizer, model: textEncoderModel)
37 | 
38 |         // Unet model
39 |         let unet: Unet
40 |         if FileManager.default.fileExists(atPath: unetChunk1URL.path) &&
41 |             FileManager.default.fileExists(atPath: unetChunk2URL.path) {
42 |             let chunk1 = try MLModel(contentsOf: unetChunk1URL, configuration: config)
43 |             let chunk2 = try MLModel(contentsOf: unetChunk2URL, configuration: config)
44 |             unet = Unet(chunks: [chunk1, chunk2])
45 |         } else {
46 |             let unetModel = try MLModel(contentsOf: unetURL, configuration: config)
47 |             unet = Unet(model: unetModel)
48 |         }
49 | 
50 |         // Image Decoder
51 |         let decoderModel = try MLModel(contentsOf: decoderURL, configuration: config)
52 |         let decoder = Decoder(model: decoderModel)
53 | 
54 |         // Optional safety checker
55 |         var safetyChecker: SafetyChecker? = nil
56 |         if !disableSafety &&
57 |             FileManager.default.fileExists(atPath: safetyCheckerURL.path) {
58 |             let checkerModel = try MLModel(contentsOf: safetyCheckerURL, configuration: config)
59 |             safetyChecker = SafetyChecker(model: checkerModel)
60 |         }
61 | 
62 |         // Construct pipeline
63 |         self.init(textEncoder: textEncoder,
64 |                   unet: unet,
65 |                   decoder: decoder,
66 |                   safetyChecker: safetyChecker)
67 |     }
68 | }
69 | 
--------------------------------------------------------------------------------
/python_coreml_stable_diffusion/layer_norm.py:
--------------------------------------------------------------------------------
1 | #
2 | # For licensing see accompanying LICENSE.md file.
3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved.
4 | #
5 | 
6 | import torch
7 | import torch.nn as nn
8 | 
9 | 
10 | # Reference: https://github.com/apple/ml-ane-transformers/blob/main/ane_transformers/reference/layer_norm.py
11 | class LayerNormANE(nn.Module):
12 |     """ LayerNorm optimized for Apple Neural Engine (ANE) execution
13 | 
14 |     Note: This layer only supports normalization over the final dim. It expects `num_channels`
15 |     as an argument and not `normalized_shape` which is used by `torch.nn.LayerNorm`.
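
    Example (illustrative, added; not from the original file):

        ln = LayerNormANE(num_channels=320)
        x = torch.randn(1, 320, 1, 77)   # (B, C, 1, S) data format
        y = ln(x)                        # same shape, normalized across C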
16 | """ 17 | 18 | def __init__(self, 19 | num_channels, 20 | clip_mag=None, 21 | eps=1e-5, 22 | elementwise_affine=True): 23 | """ 24 | Args: 25 | num_channels: Number of channels (C) where the expected input data format is BC1S. S stands for sequence length. 26 | clip_mag: Optional float value to use for clamping the input range before layer norm is applied. 27 | If specified, helps reduce risk of overflow. 28 | eps: Small value to avoid dividing by zero 29 | elementwise_affine: If true, adds learnable channel-wise shift (bias) and scale (weight) parameters 30 | """ 31 | super().__init__() 32 | # Principle 1: Picking the Right Data Format (machinelearning.apple.com/research/apple-neural-engine) 33 | self.expected_rank = len("BC1S") 34 | 35 | self.num_channels = num_channels 36 | self.eps = eps 37 | self.clip_mag = clip_mag 38 | self.elementwise_affine = elementwise_affine 39 | 40 | if self.elementwise_affine: 41 | self.weight = nn.Parameter(torch.Tensor(num_channels)) 42 | self.bias = nn.Parameter(torch.Tensor(num_channels)) 43 | 44 | self._reset_parameters() 45 | 46 | def _reset_parameters(self): 47 | if self.elementwise_affine: 48 | nn.init.ones_(self.weight) 49 | nn.init.zeros_(self.bias) 50 | 51 | def forward(self, inputs): 52 | input_rank = len(inputs.size()) 53 | 54 | # Principle 1: Picking the Right Data Format (machinelearning.apple.com/research/apple-neural-engine) 55 | # Migrate the data format from BSC to BC1S (most conducive to ANE) 56 | if input_rank == 3 and inputs.size(2) == self.num_channels: 57 | inputs = inputs.transpose(1, 2).unsqueeze(2) 58 | input_rank = len(inputs.size()) 59 | 60 | assert input_rank == self.expected_rank 61 | assert inputs.size(1) == self.num_channels 62 | 63 | if self.clip_mag is not None: 64 | inputs.clamp_(-self.clip_mag, self.clip_mag) 65 | 66 | channels_mean = inputs.mean(dim=1, keepdims=True) 67 | 68 | zero_mean = inputs - channels_mean 69 | 70 | zero_mean_sq = zero_mean * zero_mean 71 | 72 | denom = (zero_mean_sq.mean(dim=1, keepdims=True) + self.eps).rsqrt() 73 | 74 | out = zero_mean * denom 75 | 76 | if self.elementwise_affine: 77 | out = (out + self.bias.view(1, self.num_channels, 1, 1) 78 | ) * self.weight.view(1, self.num_channels, 1, 1) 79 | 80 | return out 81 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, 71 | available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html) -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/coreml_model.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE.md file. 3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved. 
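#
# Usage sketch (added for illustration; the directory, model version, submodule
# name, and compute unit below are assumptions, not values taken from this file):
#
#     unet = _load_mlpackage("unet",
#                            mlpackages_dir="./mlpackages",
#                            model_version="CompVis/stable-diffusion-v1-4",
#                            compute_unit="CPU_AND_NE")
#     noise_pred = unet(sample=latents, timestep=t, encoder_hidden_states=embeddings)
#
# Keyword inputs must be numpy arrays whose dtype and shape exactly match the
# model's input description; see CoreMLModel._verify_inputs below.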
4 | #
5 | 
6 | import coremltools as ct
7 | 
8 | import logging
9 | 
10 | logging.basicConfig()
11 | logger = logging.getLogger(__name__)
12 | logger.setLevel(logging.INFO)
13 | 
14 | import numpy as np
15 | 
16 | import os
17 | import time
18 | 
19 | 
20 | class CoreMLModel:
21 |     """ Wrapper for running CoreML models using coremltools
22 |     """
23 | 
24 |     def __init__(self, model_path, compute_unit):
25 |         assert os.path.exists(model_path) and model_path.endswith(".mlpackage")
26 | 
27 |         logger.info(f"Loading {model_path}")
28 | 
29 |         start = time.time()
30 |         self.model = ct.models.MLModel(
31 |             model_path, compute_units=ct.ComputeUnit[compute_unit])
32 |         load_time = time.time() - start
33 |         logger.info(f"Done. Took {load_time:.1f} seconds.")
34 | 
35 |         if load_time > LOAD_TIME_INFO_MSG_TRIGGER:
36 |             logger.info(
37 |                 "Loading a CoreML model through coremltools triggers compilation every time. "
38 |                 "The Swift package we provide uses precompiled Core ML models (.mlmodelc) to avoid compile-on-load."
39 |             )
40 | 
41 | 
42 |         DTYPE_MAP = {
43 |             65552: np.float16,
44 |             65568: np.float32,
45 |             131104: np.int32,
46 |         }
47 | 
48 |         self.expected_inputs = {
49 |             input_tensor.name: {
50 |                 "shape": tuple(input_tensor.type.multiArrayType.shape),
51 |                 "dtype": DTYPE_MAP[input_tensor.type.multiArrayType.dataType],
52 |             }
53 |             for input_tensor in self.model._spec.description.input
54 |         }
55 | 
56 |     def _verify_inputs(self, **kwargs):
57 |         for k, v in kwargs.items():
58 |             if k in self.expected_inputs:
59 |                 if not isinstance(v, np.ndarray):
60 |                     raise TypeError(
61 |                         f"Expected numpy.ndarray, got {v} for input: {k}")
62 | 
63 |                 expected_dtype = self.expected_inputs[k]["dtype"]
64 |                 if not v.dtype == expected_dtype:
65 |                     raise TypeError(
66 |                         f"Expected dtype {expected_dtype}, got {v.dtype} for input: {k}"
67 |                     )
68 | 
69 |                 expected_shape = self.expected_inputs[k]["shape"]
70 |                 if not v.shape == expected_shape:
71 |                     raise TypeError(
72 |                         f"Expected shape {expected_shape}, got {v.shape} for input: {k}"
73 |                     )
74 |             else:
75 |                 raise ValueError(f"Received unexpected input kwarg: {k}")
76 | 
77 |     def __call__(self, **kwargs):
78 |         self._verify_inputs(**kwargs)
79 |         return self.model.predict(kwargs)
80 | 
81 | 
82 | LOAD_TIME_INFO_MSG_TRIGGER = 10 # seconds
83 | 
84 | 
85 | def _load_mlpackage(submodule_name, mlpackages_dir, model_version,
86 |                     compute_unit):
87 |     """ Load Core ML (mlpackage) models from disk (as exported by torch2coreml.py)
88 |     """
89 |     logger.info(f"Loading {submodule_name} mlpackage")
90 | 
91 |     fname = f"Stable_Diffusion_version_{model_version}_{submodule_name}.mlpackage".replace(
92 |         "/", "_")
93 |     mlpackage_path = os.path.join(mlpackages_dir, fname)
94 | 
95 |     if not os.path.exists(mlpackage_path):
96 |         raise FileNotFoundError(
97 |             f"{submodule_name} CoreML model doesn't exist at {mlpackage_path}")
98 | 
99 |     return CoreMLModel(mlpackage_path, compute_unit)
100 | 
101 | def get_available_compute_units():
102 |     return tuple(cu for cu in ct.ComputeUnit._member_names_)
103 | 
--------------------------------------------------------------------------------
/swift/StableDiffusion/pipeline/Decoder.swift:
--------------------------------------------------------------------------------
1 | // For licensing see accompanying LICENSE.md file.
2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved.
3 | 4 | import Foundation 5 | import CoreML 6 | import Accelerate 7 | 8 | /// A decoder model which produces RGB images from latent samples 9 | public struct Decoder { 10 | 11 | /// VAE decoder model 12 | var model: MLModel 13 | 14 | /// Create decoder from Core ML model 15 | /// 16 | /// - Parameters 17 | /// - model: Core ML model for VAE decoder 18 | public init(model: MLModel) { 19 | self.model = model 20 | } 21 | 22 | /// Prediction queue 23 | let queue = DispatchQueue(label: "decoder.predict") 24 | 25 | /// Batch decode latent samples into images 26 | /// 27 | /// - Parameters: 28 | /// - latents: Batch of latent samples to decode 29 | /// - Returns: decoded images 30 | public func decode(_ latents: [MLShapedArray]) throws -> [CGImage] { 31 | 32 | // Form batch inputs for model 33 | let inputs: [MLFeatureProvider] = try latents.map { sample in 34 | // Reference pipeline scales the latent samples before decoding 35 | let sampleScaled = MLShapedArray( 36 | scalars: sample.scalars.map { $0 / 0.18215 }, 37 | shape: sample.shape) 38 | 39 | let dict = [inputName: MLMultiArray(sampleScaled)] 40 | return try MLDictionaryFeatureProvider(dictionary: dict) 41 | } 42 | let batch = MLArrayBatchProvider(array: inputs) 43 | 44 | // Batch predict with model 45 | let results = try queue.sync { try model.predictions(fromBatch: batch) } 46 | 47 | // Transform the outputs to CGImages 48 | let images: [CGImage] = (0..(output)) 54 | } 55 | 56 | return images 57 | } 58 | 59 | var inputName: String { 60 | model.modelDescription.inputDescriptionsByName.first!.key 61 | } 62 | 63 | typealias PixelBufferPFx1 = vImage.PixelBuffer 64 | typealias PixelBufferP8x3 = vImage.PixelBuffer 65 | typealias PixelBufferIFx3 = vImage.PixelBuffer 66 | typealias PixelBufferI8x3 = vImage.PixelBuffer 67 | 68 | func toRGBCGImage(_ array: MLShapedArray) -> CGImage { 69 | 70 | // array is [N,C,H,W], where C==3 71 | let channelCount = array.shape[1] 72 | assert(channelCount == 3, 73 | "Decoding model output has \(channelCount) channels, expected 3") 74 | let height = array.shape[2] 75 | let width = array.shape[3] 76 | 77 | // Normalize each channel into a float between 0 and 1.0 78 | let floatChannels = (0.. [0.0 1.0] 89 | cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut) 90 | } 91 | return cOut 92 | } 93 | 94 | // Convert to interleaved and then to UInt8 95 | let floatImage = PixelBufferIFx3(planarBuffers: floatChannels) 96 | let uint8Image = PixelBufferI8x3(width: width, height: height) 97 | floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips 98 | 99 | // Convert to uint8x3 to RGB CGImage (no alpha) 100 | let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue) 101 | let cgImage = uint8Image.makeCGImage(cgImageFormat: 102 | .init(bitsPerComponent: 8, 103 | bitsPerPixel: 3*8, 104 | colorSpace: CGColorSpaceCreateDeviceRGB(), 105 | bitmapInfo: bitmapInfo)!)! 106 | 107 | return cgImage 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /swift/StableDiffusion/pipeline/Random.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 
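//
// Usage sketch (added for illustration; mirrors the call pattern in
// StableDiffusionTests). The latent shape below is an assumption:
//
//     var random = NumPyRandomSource(seed: 93)
//     let noise = random.normalShapedArray([1, 4, 64, 64])
//     // Matches numpy.random.seed(93) followed by numpy.random.randn(...) element for element.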
3 | 4 | import Foundation 5 | import CoreML 6 | 7 | /// A random source consistent with NumPy 8 | /// 9 | /// This implementation matches: 10 | /// [NumPy's older randomkit.c](https://github.com/numpy/numpy/blob/v1.0/numpy/random/mtrand/randomkit.c) 11 | /// 12 | struct NumPyRandomSource: RandomNumberGenerator { 13 | 14 | struct State { 15 | var key = [UInt32](repeating: 0, count: 624) 16 | var pos: Int = 0 17 | var nextGauss: Double? = nil 18 | } 19 | 20 | var state: State 21 | 22 | /// Initialize with a random seed 23 | /// 24 | /// - Parameters 25 | /// - seed: Seed for underlying Mersenne Twister 19937 generator 26 | /// - Returns random source 27 | init(seed: UInt32) { 28 | state = .init() 29 | var s = seed & 0xffffffff 30 | for i in 0 ..< state.key.count { 31 | state.key[i] = s 32 | s = UInt32((UInt64(1812433253) * UInt64(s ^ (s >> 30)) + UInt64(i) + 1) & 0xffffffff) 33 | } 34 | state.pos = state.key.count 35 | state.nextGauss = nil 36 | } 37 | 38 | /// Generate next UInt32 using fast 32bit Mersenne Twister 39 | mutating func nextUInt32() -> UInt32 { 40 | let n = 624 41 | let m = 397 42 | let matrixA: UInt64 = 0x9908b0df 43 | let upperMask: UInt32 = 0x80000000 44 | let lowerMask: UInt32 = 0x7fffffff 45 | 46 | var y: UInt32 47 | if state.pos == state.key.count { 48 | for i in 0 ..< (n - m) { 49 | y = (state.key[i] & upperMask) | (state.key[i + 1] & lowerMask) 50 | state.key[i] = state.key[i + m] ^ (y >> 1) ^ UInt32((UInt64(~(y & 1)) + 1) & matrixA) 51 | } 52 | for i in (n - m) ..< (n - 1) { 53 | y = (state.key[i] & upperMask) | (state.key[i + 1] & lowerMask) 54 | state.key[i] = state.key[i + (m - n)] ^ (y >> 1) ^ UInt32((UInt64(~(y & 1)) + 1) & matrixA) 55 | } 56 | y = (state.key[n - 1] & upperMask) | (state.key[0] & lowerMask) 57 | state.key[n - 1] = state.key[m - 1] ^ (y >> 1) ^ UInt32((UInt64(~(y & 1)) + 1) & matrixA) 58 | state.pos = 0 59 | } 60 | y = state.key[state.pos] 61 | state.pos += 1 62 | 63 | y ^= (y >> 11) 64 | y ^= (y << 7) & 0x9d2c5680 65 | y ^= (y << 15) & 0xefc60000 66 | y ^= (y >> 18) 67 | 68 | return y 69 | } 70 | 71 | mutating func next() -> UInt64 { 72 | let low = nextUInt32() 73 | let high = nextUInt32() 74 | return (UInt64(high) << 32) | UInt64(low) 75 | } 76 | 77 | /// Generate next random double value 78 | mutating func nextDouble() -> Double { 79 | let a = Double(nextUInt32() >> 5) 80 | let b = Double(nextUInt32() >> 6) 81 | return (a * 67108864.0 + b) / 9007199254740992.0 82 | } 83 | 84 | /// Generate next random value from a standard normal 85 | mutating func nextGauss() -> Double { 86 | if let nextGauss = state.nextGauss { 87 | state.nextGauss = nil 88 | return nextGauss 89 | } 90 | var x1, x2, r2: Double 91 | repeat { 92 | x1 = 2.0 * nextDouble() - 1.0 93 | x2 = 2.0 * nextDouble() - 1.0 94 | r2 = x1 * x1 + x2 * x2 95 | } while r2 >= 1.0 || r2 == 0.0 96 | 97 | // Box-Muller transform 98 | let f = sqrt(-2.0 * log(r2) / r2) 99 | state.nextGauss = f * x1 100 | return f * x2 101 | } 102 | 103 | /// Generates a random value from a normal distribution with given mean and standard deviation. 104 | mutating func nextNormal(mean: Double = 0.0, stdev: Double = 1.0) -> Double { 105 | nextGauss() * stdev + mean 106 | } 107 | 108 | /// Generates an array of random values from a normal distribution with given mean and standard deviation. 
109 | mutating func normalArray(count: Int, mean: Double = 0.0, stdev: Double = 1.0) -> [Double] { 110 | (0 ..< count).map { _ in nextNormal(mean: mean, stdev: stdev) } 111 | } 112 | 113 | /// Generate a shaped array with scalars from a normal distribution with given mean and standard deviation. 114 | mutating func normalShapedArray(_ shape: [Int], mean: Double = 0.0, stdev: Double = 1.0) -> MLShapedArray { 115 | let count = shape.reduce(1, *) 116 | return .init(scalars: normalArray(count: count, mean: mean, stdev: stdev), shape: shape) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /swift/StableDiffusion/pipeline/Unet.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 3 | 4 | import Foundation 5 | import CoreML 6 | 7 | /// U-Net noise prediction model for stable diffusion 8 | public struct Unet { 9 | 10 | /// Model used to predict noise residuals given an input, diffusion time step, and conditional embedding 11 | /// 12 | /// It can be in the form of a single model or multiple stages 13 | var models: [MLModel] 14 | 15 | /// Creates a U-Net noise prediction model 16 | /// 17 | /// - Parameters: 18 | /// - model: U-Net held in single Core ML model 19 | /// - Returns: Ready for prediction 20 | public init(model: MLModel) { 21 | self.models = [model] 22 | } 23 | 24 | /// Creates a U-Net noise prediction model 25 | /// 26 | /// - Parameters: 27 | /// - chunks: U-Net held chunked into multiple Core ML models 28 | /// - Returns: Ready for prediction 29 | public init(chunks: [MLModel]) { 30 | self.models = chunks 31 | } 32 | 33 | var latentSampleDescription: MLFeatureDescription { 34 | models.first!.modelDescription.inputDescriptionsByName["sample"]! 
35 | } 36 | 37 | /// The expected shape of the models latent sample input 38 | public var latentSampleShape: [Int] { 39 | latentSampleDescription.multiArrayConstraint!.shape.map { $0.intValue } 40 | } 41 | 42 | /// Batch prediction noise from latent samples 43 | /// 44 | /// - Parameters: 45 | /// - latents: Batch of latent samples in an array 46 | /// - timeStep: Current diffusion timestep 47 | /// - hiddenStates: Hidden state to condition on 48 | /// - Returns: Array of predicted noise residuals 49 | func predictNoise( 50 | latents: [MLShapedArray], 51 | timeStep: Int, 52 | hiddenStates: MLShapedArray 53 | ) throws -> [MLShapedArray] { 54 | 55 | // Match time step batch dimension to the model / latent samples 56 | let t = MLShapedArray(scalars:[Float(timeStep), Float(timeStep)],shape:[2]) 57 | 58 | // Form batch input to model 59 | let inputs = try latents.map { 60 | let dict: [String: Any] = [ 61 | "sample" : MLMultiArray($0), 62 | "timestep" : MLMultiArray(t), 63 | "encoder_hidden_states": MLMultiArray(hiddenStates) 64 | ] 65 | return try MLDictionaryFeatureProvider(dictionary: dict) 66 | } 67 | let batch = MLArrayBatchProvider(array: inputs) 68 | 69 | // Make predictions 70 | let results = try predictions(from: batch) 71 | 72 | // Pull out the results in Float32 format 73 | let noise = (0..(fp32Noise) 89 | } 90 | 91 | return noise 92 | } 93 | 94 | /// Prediction queue 95 | let queue = DispatchQueue(label: "unet.predict") 96 | 97 | func predictions(from batch: MLBatchProvider) throws -> MLBatchProvider { 98 | 99 | var results = try queue.sync { 100 | try models.first!.predictions(fromBatch: batch) 101 | } 102 | 103 | if models.count == 1 { 104 | return results 105 | } 106 | 107 | // Manual pipeline batch prediction 108 | let inputs = batch.arrayOfFeatureValueDictionaries 109 | for stage in models.dropFirst() { 110 | 111 | // Combine the original inputs with the outputs of the last stage 112 | let next = try results.arrayOfFeatureValueDictionaries 113 | .enumerated().map { (index, dict) in 114 | let nextDict = dict.merging(inputs[index]) { (out, _) in out } 115 | return try MLDictionaryFeatureProvider(dictionary: nextDict) 116 | } 117 | let nextBatch = MLArrayBatchProvider(array: next) 118 | 119 | // Predict 120 | results = try queue.sync { 121 | try stage.predictions(fromBatch: nextBatch) 122 | } 123 | } 124 | 125 | return results 126 | } 127 | } 128 | 129 | extension MLFeatureProvider { 130 | var featureValueDictionary: [String : MLFeatureValue] { 131 | self.featureNames.reduce(into: [String : MLFeatureValue]()) { result, name in 132 | result[name] = self.featureValue(for: name) 133 | } 134 | } 135 | } 136 | 137 | extension MLBatchProvider { 138 | var arrayOfFeatureValueDictionaries: [[String : MLFeatureValue]] { 139 | (0.. 
27 | typealias PixelBufferP8x1 = vImage.PixelBuffer 28 | typealias PixelBufferPFx3 = vImage.PixelBuffer 29 | typealias PixelBufferP8x3 = vImage.PixelBuffer 30 | typealias PixelBufferIFx3 = vImage.PixelBuffer 31 | typealias PixelBufferI8x3 = vImage.PixelBuffer 32 | typealias PixelBufferI8x4 = vImage.PixelBuffer 33 | 34 | enum SafetyCheckError: Error { 35 | case imageResizeFailure 36 | case imageToFloatFailure 37 | case modelInputFailure 38 | case unexpectedModelOutput 39 | } 40 | 41 | /// Check if image is safe 42 | /// 43 | /// - Parameters: 44 | /// - image: Image to check 45 | /// - Returns: Whether the model considers the image to be safe 46 | public func isSafe(_ image: CGImage) throws -> Bool { 47 | 48 | let inputName = "clip_input" 49 | let adjustmentName = "adjustment" 50 | let imagesNames = "images" 51 | 52 | let inputInfo = model.modelDescription.inputDescriptionsByName 53 | let inputShape = inputInfo[inputName]!.multiArrayConstraint!.shape 54 | 55 | let width = inputShape[2].intValue 56 | let height = inputShape[3].intValue 57 | 58 | let resizedImage = try resizeToRGBA(image, width: width, height: height) 59 | 60 | let bufferP8x3 = try getRGBPlanes(of: resizedImage) 61 | 62 | let arrayPFx3 = normalizeToFloatShapedArray(bufferP8x3) 63 | 64 | guard let input = try? MLDictionaryFeatureProvider( 65 | dictionary:[ 66 | // Input that is analyzed for safety 67 | inputName : MLMultiArray(arrayPFx3), 68 | // No adjustment, use default threshold 69 | adjustmentName : MLMultiArray(MLShapedArray(scalars: [0], shape: [1])), 70 | // Supplying dummy images to be filtered (will be ignored) 71 | imagesNames : MLMultiArray(shape:[1, 512, 512, 3], dataType: .float16) 72 | ] 73 | ) else { 74 | throw SafetyCheckError.modelInputFailure 75 | } 76 | 77 | let result = try queue.sync { try model.prediction(from: input) } 78 | 79 | let output = result.featureValue(for: "has_nsfw_concepts") 80 | 81 | guard let unsafe = output?.multiArrayValue?[0].boolValue else { 82 | throw SafetyCheckError.unexpectedModelOutput 83 | } 84 | 85 | return !unsafe 86 | } 87 | 88 | func resizeToRGBA(_ image: CGImage, 89 | width: Int, height: Int) throws -> CGImage { 90 | 91 | guard let context = CGContext( 92 | data: nil, 93 | width: width, 94 | height: height, 95 | bitsPerComponent: 8, 96 | bytesPerRow: width*4, 97 | space: CGColorSpaceCreateDeviceRGB(), 98 | bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue) else { 99 | throw SafetyCheckError.imageResizeFailure 100 | } 101 | 102 | context.interpolationQuality = .high 103 | context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) 104 | guard let resizedImage = context.makeImage() else { 105 | throw SafetyCheckError.imageResizeFailure 106 | } 107 | 108 | return resizedImage 109 | } 110 | 111 | func getRGBPlanes(of rgbaImage: CGImage) throws -> PixelBufferP8x3 { 112 | // Reference as interleaved 8 bit vImage PixelBuffer 113 | var emptyFormat = vImage_CGImageFormat() 114 | guard let bufferI8x4 = try? 
PixelBufferI8x4( 115 | cgImage: rgbaImage, 116 | cgImageFormat:&emptyFormat) else { 117 | throw SafetyCheckError.imageToFloatFailure 118 | } 119 | 120 | // Drop the alpha channel, keeping RGB 121 | let bufferI8x3 = PixelBufferI8x3(width: rgbaImage.width, height:rgbaImage.height) 122 | bufferI8x4.convert(to: bufferI8x3, channelOrdering: .RGBA) 123 | 124 | // De-interleave into 8-bit planes 125 | return PixelBufferP8x3(interleavedBuffer: bufferI8x3) 126 | } 127 | 128 | func normalizeToFloatShapedArray(_ bufferP8x3: PixelBufferP8x3) -> MLShapedArray<Float32> { 129 | let width = bufferP8x3.width 130 | let height = bufferP8x3.height 131 | 132 | let means = [0.485, 0.456, 0.406] as [Float] 133 | let stds = [0.229, 0.224, 0.225] as [Float] 134 | 135 | // Convert to normalized float 1x3xWxH input (planar) 136 | let arrayPFx3 = MLShapedArray<Float32>(repeating: 0.0, shape: [1, 3, width, height]) 137 | for c in 0..<3 { 138 | arrayPFx3[0][c].withUnsafeShapedBufferPointer { ptr, _, strides in 139 | let floatChannel = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!), 140 | width: width, height: height, 141 | byteCountPerRow: strides[0]*4) 142 | 143 | bufferP8x3.withUnsafePixelBuffer(at: c) { uint8Channel in 144 | uint8Channel.convert(to: floatChannel) // maps [0 255] -> [0 1] 145 | floatChannel.multiply(by: 1.0/stds[c], 146 | preBias: -means[c], 147 | postBias: 0.0, 148 | destination: floatChannel) 149 | } 150 | } 151 | } 152 | return arrayPFx3 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /swift/StableDiffusionCLI/main.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved.
3 | 4 | import ArgumentParser 5 | import CoreGraphics 6 | import CoreML 7 | import Foundation 8 | import StableDiffusion 9 | import UniformTypeIdentifiers 10 | 11 | struct StableDiffusionSample: ParsableCommand { 12 | 13 | static let configuration = CommandConfiguration( 14 | abstract: "Run stable diffusion to generate images guided by a text prompt", 15 | version: "0.1" 16 | ) 17 | 18 | @Argument(help: "Input string prompt") 19 | var prompt: String 20 | 21 | @Option( 22 | help: ArgumentHelp( 23 | "Path to stable diffusion resources.", 24 | discussion: "The resource directory should contain\n" + 25 | " - *compiled* models: {TextEncoder,Unet,VAEDecoder}.mlmodelc\n" + 26 | " - tokenizer info: vocab.json, merges.txt", 27 | valueName: "directory-path" 28 | ) 29 | ) 30 | var resourcePath: String = "./" 31 | 32 | @Option(help: "Number of images to sample / generate") 33 | var imageCount: Int = 1 34 | 35 | @Option(help: "Number of diffusion steps to perform") 36 | var stepCount: Int = 50 37 | 38 | @Option( 39 | help: ArgumentHelp( 40 | "How often to save samples at intermediate steps", 41 | discussion: "Set to 0 to only save the final sample" 42 | ) 43 | ) 44 | var saveEvery: Int = 0 45 | 46 | @Option(help: "Output path") 47 | var outputPath: String = "./" 48 | 49 | @Option(help: "Random seed") 50 | var seed: Int = 93 51 | 52 | @Option(help: "Compute units to load model with {all,cpuOnly,cpuAndGPU,cpuAndNeuralEngine}") 53 | var computeUnits: ComputeUnits = .all 54 | 55 | @Flag(help: "Disable safety checking") 56 | var disableSafety: Bool = false 57 | 58 | mutating func run() throws { 59 | guard FileManager.default.fileExists(atPath: resourcePath) else { 60 | throw RunError.resources("Resource path does not exist \(resourcePath)") 61 | } 62 | 63 | let config = MLModelConfiguration() 64 | config.computeUnits = computeUnits.asMLComputeUnits 65 | let resourceURL = URL(filePath: resourcePath) 66 | 67 | log("Loading resources and creating pipeline\n") 68 | log("(Note: This can take a while the first time using these resources)\n") 69 | let pipeline = try StableDiffusionPipeline(resourcesAt: resourceURL, 70 | configuration: config, 71 | disableSafety: disableSafety) 72 | 73 | log("Sampling ...\n") 74 | let sampleTimer = SampleTimer() 75 | sampleTimer.start() 76 | 77 | let images = try pipeline.generateImages( 78 | prompt: prompt, 79 | imageCount: imageCount, 80 | stepCount: stepCount, 81 | seed: seed 82 | ) { progress in 83 | sampleTimer.stop() 84 | handleProgress(progress,sampleTimer) 85 | if progress.stepCount != progress.step { 86 | sampleTimer.start() 87 | } 88 | return true 89 | } 90 | 91 | _ = try saveImages(images, logNames: true) 92 | } 93 | 94 | func handleProgress( 95 | _ progress: StableDiffusionPipeline.Progress, 96 | _ sampleTimer: SampleTimer 97 | ) { 98 | log("\u{1B}[1A\u{1B}[K") 99 | log("Step \(progress.step) of \(progress.stepCount) ") 100 | log(" [") 101 | log(String(format: "mean: %.2f, ", 1.0/sampleTimer.mean)) 102 | log(String(format: "median: %.2f, ", 1.0/sampleTimer.median)) 103 | log(String(format: "last %.2f", 1.0/sampleTimer.allSamples.last!)) 104 | log("] step/sec") 105 | 106 | if saveEvery > 0, progress.step % saveEvery == 0 { 107 | let saveCount = (try? saveImages(progress.currentImages, step: progress.step)) ?? 0 108 | log(" saved \(saveCount) image\(saveCount != 1 ? "s" : "")") 109 | } 110 | log("\n") 111 | } 112 | 113 | func saveImages( 114 | _ images: [CGImage?], 115 | step: Int? 
= nil, 116 | logNames: Bool = false 117 | ) throws -> Int { 118 | let url = URL(filePath: outputPath) 119 | var saved = 0 120 | for i in 0 ..< images.count { 121 | 122 | guard let image = images[i] else { 123 | if logNames { 124 | log("Image \(i) failed safety check and was not saved") 125 | } 126 | continue 127 | } 128 | 129 | let name = imageName(i, step: step) 130 | let fileURL = url.appending(path:name) 131 | 132 | guard let dest = CGImageDestinationCreateWithURL(fileURL as CFURL, UTType.png.identifier as CFString, 1, nil) else { 133 | throw RunError.saving("Failed to create destination for \(fileURL)") 134 | } 135 | CGImageDestinationAddImage(dest, image, nil) 136 | if !CGImageDestinationFinalize(dest) { 137 | throw RunError.saving("Failed to save \(fileURL)") 138 | } 139 | if logNames { 140 | log("Saved \(name)\n") 141 | } 142 | saved += 1 143 | } 144 | return saved 145 | } 146 | 147 | func imageName(_ sample: Int, step: Int? = nil) -> String { 148 | var name = prompt.replacingOccurrences(of: " ", with: "_") 149 | if imageCount != 1 { 150 | name += ".\(sample)" 151 | } 152 | 153 | name += ".\(seed)" 154 | 155 | if let step = step { 156 | name += ".\(step)" 157 | } else { 158 | name += ".final" 159 | } 160 | name += ".png" 161 | return name 162 | } 163 | 164 | func log(_ str: String, term: String = "") { 165 | print(str, terminator: term) 166 | } 167 | } 168 | 169 | enum RunError: Error { 170 | case resources(String) 171 | case saving(String) 172 | } 173 | 174 | enum ComputeUnits: String, ExpressibleByArgument, CaseIterable { 175 | case all, cpuAndGPU, cpuOnly, cpuAndNeuralEngine 176 | var asMLComputeUnits: MLComputeUnits { 177 | switch self { 178 | case .all: return .all 179 | case .cpuAndGPU: return .cpuAndGPU 180 | case .cpuOnly: return .cpuOnly 181 | case .cpuAndNeuralEngine: return .cpuAndNeuralEngine 182 | } 183 | } 184 | } 185 | 186 | StableDiffusionSample.main() 187 | -------------------------------------------------------------------------------- /swift/StableDiffusion/tokenizer/BPETokenizer.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 3 | 4 | import Foundation 5 | 6 | /// A tokenizer based on byte pair encoding. 7 | public struct BPETokenizer { 8 | /// A dictionary that maps pairs of tokens to the rank/order of the merge. 9 | let merges: [TokenPair : Int] 10 | 11 | /// A dictionary from of tokens to identifiers. 12 | let vocabulary: [String: Int] 13 | 14 | /// The start token. 15 | let startToken: String = "<|startoftext|>" 16 | 17 | /// The end token. 18 | let endToken: String = "<|endoftext|>" 19 | 20 | /// The token used for padding 21 | let padToken: String = "<|endoftext|>" 22 | 23 | /// The unknown token. 24 | let unknownToken: String = "<|endoftext|>" 25 | 26 | var unknownTokenID: Int { 27 | vocabulary[unknownToken, default: 0] 28 | } 29 | 30 | /// Creates a tokenizer. 31 | /// 32 | /// - Parameters: 33 | /// - merges: A dictionary that maps pairs of tokens to the rank/order of the merge. 34 | /// - vocabulary: A dictionary from of tokens to identifiers. 35 | public init(merges: [TokenPair: Int], vocabulary: [String: Int]) { 36 | self.merges = merges 37 | self.vocabulary = vocabulary 38 | } 39 | 40 | /// Creates a tokenizer by loading merges and vocabulary from URLs. 41 | /// 42 | /// - Parameters: 43 | /// - mergesURL: The URL of a text file containing merges. 
44 | /// - vocabularyURL: The URL of a JSON file containing the vocabulary. 45 | public init(mergesAt mergesURL: URL, vocabularyAt vocabularyURL: URL) throws { 46 | self.merges = try Self.readMerges(url: mergesURL) 47 | self.vocabulary = try! Self.readVocabulary(url: vocabularyURL) 48 | } 49 | 50 | /// Tokenizes an input string. 51 | /// 52 | /// - Parameters: 53 | /// - input: A string. 54 | /// - minCount: The minimum number of tokens to return. 55 | /// - Returns: An array of tokens and an array of token identifiers. 56 | public func tokenize(input: String, minCount: Int? = nil) -> (tokens: [String], tokenIDs: [Int]) { 57 | var tokens: [String] = [] 58 | 59 | tokens.append(startToken) 60 | tokens.append(contentsOf: encode(input: input)) 61 | tokens.append(endToken) 62 | 63 | // Pad if there was a min length specified 64 | if let minLen = minCount, minLen > tokens.count { 65 | tokens.append(contentsOf: repeatElement(padToken, count: minLen - tokens.count)) 66 | } 67 | 68 | let ids = tokens.map({ vocabulary[$0, default: unknownTokenID] }) 69 | return (tokens: tokens, tokenIDs: ids) 70 | } 71 | 72 | /// Returns the token identifier for a token. 73 | public func tokenID(for token: String) -> Int? { 74 | vocabulary[token] 75 | } 76 | 77 | /// Returns the token for a token identifier. 78 | public func token(id: Int) -> String? { 79 | vocabulary.first(where: { $0.value == id })?.key 80 | } 81 | 82 | /// Decodes a sequence of tokens into a fully formed string 83 | public func decode(tokens: [String]) -> String { 84 | String(tokens.joined()) 85 | .replacingOccurrences(of: "", with: " ") 86 | .replacingOccurrences(of: startToken, with: "") 87 | .replacingOccurrences(of: endToken, with: "") 88 | } 89 | 90 | /// Encode an input string to a sequence of tokens 91 | func encode(input: String) -> [String] { 92 | let normalized = input.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() 93 | let words = normalized.split(separator: " ") 94 | return words.flatMap({ encode(word: $0) }) 95 | } 96 | 97 | /// Encode a single word into a sequence of tokens 98 | func encode(word: Substring) -> [String] { 99 | var tokens = word.map { String($0) } 100 | if let last = tokens.indices.last { 101 | tokens[last] = tokens[last] + "" 102 | } 103 | 104 | while true { 105 | let pairs = pairs(for: tokens) 106 | let canMerge = pairs.filter { merges[$0] != nil } 107 | 108 | if canMerge.isEmpty { 109 | break 110 | } 111 | 112 | // If multiple merges are found, use the one with the lowest rank 113 | let shouldMerge = canMerge.min { merges[$0]! < merges[$1]! }! 114 | tokens = update(tokens, merging: shouldMerge) 115 | } 116 | return tokens 117 | } 118 | 119 | /// Get the set of adjacent pairs / bigrams from a sequence of tokens 120 | func pairs(for tokens: [String]) -> Set { 121 | guard tokens.count > 1 else { 122 | return Set() 123 | } 124 | 125 | var pairs = Set(minimumCapacity: tokens.count - 1) 126 | var prev = tokens.first! 127 | for current in tokens.dropFirst() { 128 | pairs.insert(TokenPair(prev, current)) 129 | prev = current 130 | } 131 | return pairs 132 | } 133 | 134 | /// Update the sequence of tokens by greedily merging instance of a specific bigram 135 | func update(_ tokens: [String], merging bigram: TokenPair) -> [String] { 136 | guard tokens.count > 1 else { 137 | return [] 138 | } 139 | 140 | var newTokens = [String]() 141 | newTokens.reserveCapacity(tokens.count - 1) 142 | 143 | var index = 0 144 | while index < tokens.count { 145 | let remainingTokens = tokens[index...] 
146 | if let startMatchIndex = remainingTokens.firstIndex(of: bigram.first) { 147 | // Found a possible match, append everything before it 148 | newTokens.append(contentsOf: tokens[index.. Bool = { _ in true } 72 | ) throws -> [CGImage?] { 73 | 74 | // Encode the input prompt as well as a blank unconditioned input 75 | let promptEmbedding = try textEncoder.encode(prompt) 76 | let blankEmbedding = try textEncoder.encode("") 77 | 78 | // Convert to Unet hidden state representation 79 | let concatEmbedding = MLShapedArray( 80 | concatenating: [blankEmbedding, promptEmbedding], 81 | alongAxis: 0 82 | ) 83 | 84 | let hiddenStates = toHiddenStates(concatEmbedding) 85 | 86 | /// Setup schedulers 87 | let scheduler = (0..(concatenating: [$0, $0], alongAxis: 0) 100 | } 101 | 102 | // Predict noise residuals from latent samples 103 | // and current time step conditioned on hidden states 104 | var noise = try unet.predictNoise( 105 | latents: latentUnetInput, 106 | timeStep: t, 107 | hiddenStates: hiddenStates 108 | ) 109 | 110 | noise = performGuidance(noise, guidanceScale: guidanceScale) 111 | 112 | // Have the scheduler compute the previous (t-1) latent 113 | // sample given the predicted noise and current sample 114 | for i in 0.. [MLShapedArray] { 142 | var sampleShape = unet.latentSampleShape 143 | sampleShape[0] = 1 144 | 145 | var random = NumPyRandomSource(seed: UInt32(seed)) 146 | let samples = (0..( 148 | converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev))) 149 | } 150 | return samples 151 | } 152 | 153 | func toHiddenStates(_ embedding: MLShapedArray) -> MLShapedArray { 154 | // Unoptimized manual transpose [0, 2, None, 1] 155 | // e.g. From [2, 77, 768] to [2, 768, 1, 77] 156 | let fromShape = embedding.shape 157 | let stateShape = [fromShape[0],fromShape[2], 1, fromShape[1]] 158 | var states = MLShapedArray(repeating: 0.0, shape: stateShape) 159 | for i0 in 0..], guidanceScale: Float) -> [MLShapedArray] { 170 | noise.map { performGuidance($0, guidanceScale: guidanceScale) } 171 | } 172 | 173 | func performGuidance(_ noise: MLShapedArray, guidanceScale: Float) -> MLShapedArray { 174 | 175 | let blankNoiseScalars = noise[0].scalars 176 | let textNoiseScalars = noise[1].scalars 177 | 178 | var resultScalars = blankNoiseScalars 179 | 180 | for i in 0..(scalars: resultScalars, shape: shape) 188 | } 189 | 190 | func decodeToImages(_ latents: [MLShapedArray], 191 | disableSafety: Bool) throws -> [CGImage?] { 192 | 193 | 194 | let images = try decoder.decode(latents) 195 | 196 | // If safety is disabled return what was decoded 197 | if disableSafety { 198 | return images 199 | } 200 | 201 | // If there is no safety checker return what was decoded 202 | guard let safetyChecker = safetyChecker else { 203 | return images 204 | } 205 | 206 | // Otherwise change images which are not safe to nil 207 | let safeImages = try images.map { image in 208 | try safetyChecker.isSafe(image) ? image : nil 209 | } 210 | 211 | return safeImages 212 | } 213 | 214 | } 215 | 216 | extension StableDiffusionPipeline { 217 | /// Sampling progress details 218 | public struct Progress { 219 | public let pipeline: StableDiffusionPipeline 220 | public let prompt: String 221 | public let step: Int 222 | public let stepCount: Int 223 | public let currentLatentSamples: [MLShapedArray] 224 | public let isSafetyEnabled: Bool 225 | public var currentImages: [CGImage?] { 226 | try! 
pipeline.decodeToImages( 227 | currentLatentSamples, 228 | disableSafety: !isSafetyEnabled) 229 | } 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /swift/StableDiffusion/pipeline/Scheduler.swift: -------------------------------------------------------------------------------- 1 | // For licensing see accompanying LICENSE.md file. 2 | // Copyright (C) 2022 Apple Inc. All Rights Reserved. 3 | 4 | import CoreML 5 | 6 | /// A scheduler used to compute a de-noised image 7 | /// 8 | /// This implementation matches: 9 | /// [Hugging Face Diffusers PNDMScheduler](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py) 10 | /// 11 | /// It uses the pseudo linear multi-step (PLMS) method only, skipping pseudo Runge-Kutta (PRK) steps 12 | public final class Scheduler { 13 | /// Number of diffusion steps performed during training 14 | public let trainStepCount: Int 15 | 16 | /// Number of inference steps to be performed 17 | public let inferenceStepCount: Int 18 | 19 | /// Training diffusion time steps index by inference time step 20 | public let timeSteps: [Int] 21 | 22 | /// Schedule of betas which controls the amount of noise added at each timestep 23 | public let betas: [Float] 24 | 25 | /// 1 - betas 26 | let alphas: [Float] 27 | 28 | /// Cached cumulative product of alphas 29 | let alphasCumProd: [Float] 30 | 31 | /// Standard deviation of the initial noise distribution 32 | public let initNoiseSigma: Float 33 | 34 | // Internal state 35 | var counter: Int 36 | var ets: [MLShapedArray] 37 | var currentSample: MLShapedArray? 38 | 39 | /// Create a scheduler that uses a pseudo linear multi-step (PLMS) method 40 | /// 41 | /// - Parameters: 42 | /// - stepCount: Number of inference steps to schedule 43 | /// - trainStepCount: Number of training diffusion steps 44 | /// - betaSchedule: Method to schedule betas from betaStart to betaEnd 45 | /// - betaStart: The starting value of beta for inference 46 | /// - betaEnd: The end value for beta for inference 47 | /// - Returns: A scheduler ready for its first step 48 | public init( 49 | stepCount: Int = 50, 50 | trainStepCount: Int = 1000, 51 | betaSchedule: BetaSchedule = .scaledLinear, 52 | betaStart: Float = 0.00085, 53 | betaEnd: Float = 0.012 54 | ) { 55 | self.trainStepCount = trainStepCount 56 | self.inferenceStepCount = stepCount 57 | 58 | switch betaSchedule { 59 | case .linear: 60 | self.betas = linspace(betaStart, betaEnd, trainStepCount) 61 | case .scaledLinear: 62 | self.betas = linspace(pow(betaStart, 0.5), pow(betaEnd, 0.5), trainStepCount).map({ $0 * $0 }) 63 | } 64 | 65 | self.alphas = betas.map({ 1.0 - $0 }) 66 | self.initNoiseSigma = 1.0 67 | var alphasCumProd = self.alphas 68 | for i in 1.., 102 | timeStep t: Int, 103 | sample s: MLShapedArray 104 | ) -> MLShapedArray { 105 | 106 | var timeStep = t 107 | let stepInc = (trainStepCount / inferenceStepCount) 108 | var prevStep = timeStep - stepInc 109 | var modelOutput = output 110 | var sample = s 111 | 112 | if counter != 1 { 113 | if ets.count > 3 { 114 | ets = Array(ets[(ets.count - 3)..]) -> MLShapedArray { 161 | assert(weights.count > 1 && values.count == weights.count) 162 | assert(values.allSatisfy({$0.scalarCount == values.first!.scalarCount})) 163 | var w = Float(weights.first!) 
164 | var scalars = values.first!.scalars.map({ $0 * w }) 165 | for next in 1 ..< values.count { 166 | w = Float(weights[next]) 167 | let nextScalars = values[next].scalars 168 | for i in 0 ..< scalars.count { 169 | scalars[i] += w * nextScalars[i] 170 | } 171 | } 172 | return MLShapedArray(scalars: scalars, shape: values.first!.shape) 173 | } 174 | 175 | /// Compute sample (denoised image) at previous step given a current time step 176 | /// 177 | /// - Parameters: 178 | /// - sample: The current input to the model x_t 179 | /// - timeStep: The current time step t 180 | /// - prevStep: The previous time step t−δ 181 | /// - modelOutput: Predicted noise residual the current time step e_θ(x_t, t) 182 | /// - Returns: Computes previous sample x_(t−δ) 183 | func previousSample( 184 | _ sample: MLShapedArray, 185 | _ timeStep: Int, 186 | _ prevStep: Int, 187 | _ modelOutput: MLShapedArray 188 | ) -> MLShapedArray { 189 | 190 | // Compute x_(t−δ) using formula (9) from 191 | // "Pseudo Numerical Methods for Diffusion Models on Manifolds", 192 | // Luping Liu, Yi Ren, Zhijie Lin & Zhou Zhao. 193 | // ICLR 2022 194 | // 195 | // Notation: 196 | // 197 | // alphaProdt α_t 198 | // alphaProdtPrev α_(t−δ) 199 | // betaProdt (1 - α_t) 200 | // betaProdtPrev (1 - α_(t−δ)) 201 | let alphaProdt = alphasCumProd[timeStep] 202 | let alphaProdtPrev = alphasCumProd[max(0,prevStep)] 203 | let betaProdt = 1 - alphaProdt 204 | let betaProdtPrev = 1 - alphaProdtPrev 205 | 206 | // sampleCoeff = (α_(t−δ) - α_t) divided by 207 | // denominator of x_t in formula (9) and plus 1 208 | // Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) = 209 | // sqrt(α_(t−δ)) / sqrt(α_t)) 210 | let sampleCoeff = sqrt(alphaProdtPrev / alphaProdt) 211 | 212 | // Denominator of e_θ(x_t, t) in formula (9) 213 | let modelOutputDenomCoeff = alphaProdt * sqrt(betaProdtPrev) 214 | + sqrt(alphaProdt * betaProdt * alphaProdtPrev) 215 | 216 | // full formula (9) 217 | let modelCoeff = -(alphaProdtPrev - alphaProdt)/modelOutputDenomCoeff 218 | let prevSample = weightedSum( 219 | [Double(sampleCoeff), Double(modelCoeff)], 220 | [sample, modelOutput] 221 | ) 222 | 223 | return prevSample 224 | } 225 | } 226 | 227 | extension Scheduler { 228 | /// How to map a beta range to a sequence of betas to step over 229 | public enum BetaSchedule { 230 | /// Linear stepping between start and end 231 | case linear 232 | /// Steps using linspace(sqrt(start),sqrt(end))^2 233 | case scaledLinear 234 | } 235 | } 236 | 237 | /// Evenly spaced floats between specified interval 238 | /// 239 | /// - Parameters: 240 | /// - start: Start of the interval 241 | /// - end: End of the interval 242 | /// - count: The number of floats to return between [*start*, *end*] 243 | /// - Returns: Float array with *count* elements evenly spaced between at *start* and *end* 244 | func linspace(_ start: Float, _ end: Float, _ count: Int) -> [Float] { 245 | let scale = (end - start) / Float(count - 1) 246 | return (0.. Element { 252 | return self[index(endIndex, offsetBy: -i)] 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/chunk_mlprogram.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE.md file. 3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | import argparse 7 | from collections import OrderedDict 8 | 9 | import coremltools as ct 10 | from coremltools.converters.mil import Block, Program, Var 11 | from coremltools.converters.mil.frontend.milproto.load import load as _milproto_to_pymil 12 | from coremltools.converters.mil.mil import Builder as mb 13 | from coremltools.converters.mil.mil import Placeholder 14 | from coremltools.converters.mil.mil import types as types 15 | from coremltools.converters.mil.mil.passes.helper import block_context_manager 16 | from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY 17 | from coremltools.converters.mil.testing_utils import random_gen_input_feature_type 18 | 19 | import gc 20 | 21 | import logging 22 | 23 | logging.basicConfig() 24 | logger = logging.getLogger(__name__) 25 | logger.setLevel(logging.INFO) 26 | 27 | import numpy as np 28 | import os 29 | from python_coreml_stable_diffusion import torch2coreml 30 | import shutil 31 | import time 32 | 33 | 34 | def _verify_output_correctness_of_chunks(full_model, first_chunk_model, 35 | second_chunk_model): 36 | """ Verifies the end-to-end output correctness of full (original) model versus chunked models 37 | """ 38 | # Generate inputs for first chunk and full model 39 | input_dict = {} 40 | for input_desc in full_model._spec.description.input: 41 | input_dict[input_desc.name] = random_gen_input_feature_type(input_desc) 42 | 43 | # Generate outputs for first chunk and full model 44 | outputs_from_full_model = full_model.predict(input_dict) 45 | outputs_from_first_chunk_model = first_chunk_model.predict(input_dict) 46 | 47 | # Prepare inputs for second chunk model from first chunk's outputs and regular inputs 48 | second_chunk_input_dict = {} 49 | for input_desc in second_chunk_model._spec.description.input: 50 | if input_desc.name in outputs_from_first_chunk_model: 51 | second_chunk_input_dict[ 52 | input_desc.name] = outputs_from_first_chunk_model[ 53 | input_desc.name] 54 | else: 55 | second_chunk_input_dict[input_desc.name] = input_dict[ 56 | input_desc.name] 57 | 58 | # Generate output for second chunk model 59 | outputs_from_second_chunk_model = second_chunk_model.predict( 60 | second_chunk_input_dict) 61 | 62 | # Verify correctness across all outputs from second chunk and full model 63 | for out_name in outputs_from_full_model.keys(): 64 | torch2coreml.report_correctness( 65 | original_outputs=outputs_from_full_model[out_name], 66 | final_outputs=outputs_from_second_chunk_model[out_name], 67 | log_prefix=f"{out_name}") 68 | 69 | 70 | def _load_prog_from_mlmodel(model): 71 | """ Load MIL Program from an MLModel 72 | """ 73 | model_spec = model.get_spec() 74 | start_ = time.time() 75 | logger.info( 76 | "Loading MLModel object into a MIL Program object (including the weights).." 
77 | ) 78 | prog = _milproto_to_pymil( 79 | model_spec=model_spec, 80 | specification_version=model_spec.specificationVersion, 81 | file_weights_dir=model.weights_dir, 82 | ) 83 | logger.info(f"Program loaded in {time.time() - start_:.1f} seconds") 84 | 85 | return prog 86 | 87 | 88 | def _get_op_idx_split_location(prog: Program): 89 | """ Find the op that approximately bisects the graph as measure by weights size on each side 90 | """ 91 | main_block = prog.functions["main"] 92 | total_size_in_mb = 0 93 | 94 | for op in main_block.operations: 95 | if op.op_type == "const" and isinstance(op.val.val, np.ndarray): 96 | size_in_mb = op.val.val.size * op.val.val.itemsize / (1024 * 1024) 97 | total_size_in_mb += size_in_mb 98 | half_size = total_size_in_mb / 2 99 | 100 | # Find the first non const op (single child), where the total cumulative size exceeds 101 | # the half size for the first time 102 | cumulative_size_in_mb = 0 103 | for op in main_block.operations: 104 | if op.op_type == "const" and isinstance(op.val.val, np.ndarray): 105 | size_in_mb = op.val.val.size * op.val.val.itemsize / (1024 * 1024) 106 | cumulative_size_in_mb += size_in_mb 107 | 108 | if (cumulative_size_in_mb > half_size and op.op_type != "const" 109 | and len(op.outputs) == 1 110 | and len(op.outputs[0].child_ops) == 1): 111 | op_idx = main_block.operations.index(op) 112 | return op_idx, cumulative_size_in_mb, total_size_in_mb 113 | 114 | 115 | def _get_first_chunk_outputs(block, op_idx): 116 | # Get the list of all vars that go across from first program (all ops from 0 to op_idx (inclusive)) 117 | # to the second program (all ops from op_idx+1 till the end). These all vars need to be made the output 118 | # of the first program and the input of the second program 119 | boundary_vars = set() 120 | for i in range(op_idx + 1): 121 | op = block.operations[i] 122 | for var in op.outputs: 123 | if var.val is None: # only consider non const vars 124 | for child_op in var.child_ops: 125 | child_op_idx = block.operations.index(child_op) 126 | if child_op_idx > op_idx: 127 | boundary_vars.add(var) 128 | return list(boundary_vars) 129 | 130 | 131 | @block_context_manager 132 | def _add_fp32_casts(block, boundary_vars): 133 | new_boundary_vars = [] 134 | for var in boundary_vars: 135 | if var.dtype != types.fp16: 136 | new_boundary_vars.append(var) 137 | else: 138 | fp32_var = mb.cast(x=var, dtype="fp32", name=var.name) 139 | new_boundary_vars.append(fp32_var) 140 | return new_boundary_vars 141 | 142 | 143 | def _make_first_chunk_prog(prog, op_idx): 144 | """ Build first chunk by declaring early outputs and removing unused subgraph 145 | """ 146 | block = prog.functions["main"] 147 | boundary_vars = _get_first_chunk_outputs(block, op_idx) 148 | 149 | # Due to possible numerical issues, cast any fp16 var to fp32 150 | new_boundary_vars = _add_fp32_casts(block, boundary_vars) 151 | 152 | block.outputs.clear() 153 | block.set_outputs(new_boundary_vars) 154 | PASS_REGISTRY["common::dead_code_elimination"](prog) 155 | return prog 156 | 157 | 158 | def _make_second_chunk_prog(prog, op_idx): 159 | """ Build second chunk by rebuilding a pristine MIL Program from MLModel 160 | """ 161 | block = prog.functions["main"] 162 | block.opset_version = ct.target.iOS16 163 | 164 | # First chunk outputs are second chunk inputs (e.g. skip connections) 165 | boundary_vars = _get_first_chunk_outputs(block, op_idx) 166 | 167 | # This op will not be included in this program. 
Its output var will be made into an input 168 | boundary_op = block.operations[op_idx] 169 | 170 | # Add all boundary ops as inputs 171 | with block: 172 | for var in boundary_vars: 173 | new_placeholder = Placeholder( 174 | sym_shape=var.shape, 175 | dtype=var.dtype if var.dtype != types.fp16 else types.fp32, 176 | name=var.name, 177 | ) 178 | 179 | block._input_dict[ 180 | new_placeholder.outputs[0].name] = new_placeholder.outputs[0] 181 | 182 | block.function_inputs = tuple(block._input_dict.values()) 183 | new_var = None 184 | if var.dtype == types.fp16: 185 | new_var = mb.cast(x=new_placeholder.outputs[0], 186 | dtype="fp16", 187 | before_op=var.op) 188 | else: 189 | new_var = new_placeholder.outputs[0] 190 | 191 | block.replace_uses_of_var_after_op( 192 | anchor_op=boundary_op, 193 | old_var=var, 194 | new_var=new_var, 195 | ) 196 | 197 | PASS_REGISTRY["common::dead_code_elimination"](prog) 198 | 199 | # Remove any unused inputs 200 | new_input_dict = OrderedDict() 201 | for k, v in block._input_dict.items(): 202 | if len(v.child_ops) > 0: 203 | new_input_dict[k] = v 204 | block._input_dict = new_input_dict 205 | block.function_inputs = tuple(block._input_dict.values()) 206 | 207 | return prog 208 | 209 | 210 | def main(args): 211 | os.makedirs(args.o, exist_ok=True) 212 | 213 | # Check filename extension 214 | mlpackage_name = os.path.basename(args.mlpackage_path) 215 | name, ext = os.path.splitext(mlpackage_name) 216 | assert ext == ".mlpackage", f"`--mlpackage-path` (args.mlpackage_path) is not an .mlpackage file" 217 | 218 | # Load CoreML model 219 | logger.info("Loading model from {}".format(args.mlpackage_path)) 220 | start_ = time.time() 221 | model = ct.models.MLModel( 222 | args.mlpackage_path, 223 | compute_units=ct.ComputeUnit.CPU_ONLY, 224 | ) 225 | logger.info( 226 | f"Loading {args.mlpackage_path} took {time.time() - start_:.1f} seconds" 227 | ) 228 | 229 | # Load the MIL Program from MLModel 230 | prog = _load_prog_from_mlmodel(model) 231 | 232 | # Compute the incision point by bisecting the program based on weights size 233 | op_idx, first_chunk_weights_size, total_weights_size = _get_op_idx_split_location( 234 | prog) 235 | main_block = prog.functions["main"] 236 | incision_op = main_block.operations[op_idx] 237 | logger.info(f"{args.mlpackage_path} will chunked into two pieces.") 238 | logger.info( 239 | f"The incision op: name={incision_op.name}, type={incision_op.op_type}, index={op_idx}/{len(main_block.operations)}" 240 | ) 241 | logger.info(f"First chunk size = {first_chunk_weights_size:.2f} MB") 242 | logger.info( 243 | f"Second chunk size = {total_weights_size - first_chunk_weights_size:.2f} MB" 244 | ) 245 | 246 | # Build first chunk (in-place modifies prog by declaring early exits and removing unused subgraph) 247 | prog_chunk1 = _make_first_chunk_prog(prog, op_idx) 248 | 249 | # Build the second chunk 250 | prog_chunk2 = _make_second_chunk_prog(_load_prog_from_mlmodel(model), 251 | op_idx) 252 | 253 | if not args.check_output_correctness: 254 | # Original model no longer needed in memory 255 | del model 256 | gc.collect() 257 | 258 | # Convert the MIL Program objects into MLModels 259 | logger.info("Converting the two programs") 260 | model_chunk1 = ct.convert( 261 | prog_chunk1, 262 | convert_to="mlprogram", 263 | compute_units=ct.ComputeUnit.CPU_ONLY, 264 | minimum_deployment_target=ct.target.iOS16, 265 | ) 266 | del prog_chunk1 267 | gc.collect() 268 | logger.info("Conversion of first chunk done.") 269 | 270 | model_chunk2 = ct.convert( 271 | 
prog_chunk2, 272 | convert_to="mlprogram", 273 | compute_units=ct.ComputeUnit.CPU_ONLY, 274 | minimum_deployment_target=ct.target.iOS16, 275 | ) 276 | del prog_chunk2 277 | gc.collect() 278 | logger.info("Conversion of second chunk done.") 279 | 280 | # Verify output correctness 281 | if args.check_output_correctness: 282 | logger.info("Verifying output correctness of chunks") 283 | _verify_output_correctness_of_chunks( 284 | full_model=model, 285 | first_chunk_model=model_chunk1, 286 | second_chunk_model=model_chunk2, 287 | ) 288 | 289 | # Remove original (non-chunked) model if requested 290 | if args.remove_original: 291 | logger.info( 292 | "Removing original (non-chunked) model at {args.mlpackage_path}") 293 | shutil.rmtree(args.mlpackage_path) 294 | logger.info("Done.") 295 | 296 | # Save the chunked models to disk 297 | out_path_chunk1 = os.path.join(args.o, name + "_chunk1.mlpackage") 298 | out_path_chunk2 = os.path.join(args.o, name + "_chunk2.mlpackage") 299 | 300 | logger.info( 301 | f"Saved chunks in {args.o} with the suffix _chunk1.mlpackage and _chunk2.mlpackage" 302 | ) 303 | model_chunk1.save(out_path_chunk1) 304 | model_chunk2.save(out_path_chunk2) 305 | logger.info("Done.") 306 | 307 | 308 | if __name__ == "__main__": 309 | parser = argparse.ArgumentParser() 310 | parser.add_argument( 311 | "--mlpackage-path", 312 | required=True, 313 | help= 314 | "Path to the mlpackage file to be split into two mlpackages of approximately same file size.", 315 | ) 316 | parser.add_argument( 317 | "-o", 318 | required=True, 319 | help= 320 | "Path to output directory where the two model chunks should be saved.", 321 | ) 322 | parser.add_argument( 323 | "--remove-original", 324 | action="store_true", 325 | help= 326 | "If specified, removes the original (non-chunked) model to avoid duplicating storage." 327 | ) 328 | parser.add_argument( 329 | "--check-output-correctness", 330 | action="store_true", 331 | help= 332 | ("If specified, compares the outputs of original Core ML model with that of pipelined CoreML model chunks and reports PSNR in dB. ", 333 | "Enabling this feature uses more memory. Disable it if your machine runs out of memory." 334 | )) 335 | 336 | args = parser.parse_args() 337 | main(args) 338 | -------------------------------------------------------------------------------- /tests/test_stable_diffusion.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE.md file. 3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | import argparse 7 | import contextlib 8 | import coremltools as ct 9 | from diffusers import StableDiffusionPipeline 10 | import json 11 | import logging 12 | import numpy as np 13 | import os 14 | import unittest 15 | from PIL import Image 16 | from statistics import median 17 | import tempfile 18 | import time 19 | 20 | import torch 21 | 22 | torch.set_grad_enabled(False) 23 | 24 | from python_coreml_stable_diffusion import torch2coreml, pipeline, coreml_model 25 | 26 | logger = logging.getLogger(__name__) 27 | logger.setLevel("INFO") 28 | 29 | # Testing configuration 30 | TEST_SEED = 93 31 | TEST_PROMPT = "a high quality photo of an astronaut riding a horse in space" 32 | TEST_COMPUTE_UNIT = ["CPU_AND_GPU", "ALL", "CPU_AND_NE"] 33 | TEST_PSNR_THRESHOLD = 35 # dB 34 | TEST_ABSOLUTE_MAX_LATENCY = 90 # seconds 35 | TEST_WARMUP_INFERENCE_STEPS = 3 36 | TEST_TEXT_TO_IMAGE_SPEED_REPEATS = 3 37 | TEST_MINIMUM_PROMPT_TO_IMAGE_CLIP_COSINE_SIMILARITY = 0.3 # in range [0.,1.] 38 | 39 | 40 | class TestStableDiffusionForTextToImage(unittest.TestCase): 41 | """ Test Stable Diffusion text-to-image pipeline for: 42 | 43 | - PyTorch to CoreML conversion via coremltools 44 | - Speed of CoreML runtime across several compute units 45 | - Integration with `diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py` 46 | - Efficacy of the safety_checker 47 | - Affinity of the generated image with the original prompt via CLIP score 48 | - The bridge between Python and Swift CLI 49 | - The signal parity of Swift CLI generated image with that of Python CLI 50 | """ 51 | cli_args = None 52 | 53 | @classmethod 54 | def setUpClass(cls): 55 | cls.pytorch_pipe = StableDiffusionPipeline.from_pretrained( 56 | cls.cli_args.model_version, 57 | use_auth_token=True, 58 | ) 59 | 60 | # To be initialized after test_torch_to_coreml_conversion is run 61 | cls.coreml_pipe = None 62 | cls.active_compute_unit = None 63 | 64 | @classmethod 65 | def tearDownClass(cls): 66 | cls.pytorch_pipe = None 67 | cls.coreml_pipe = None 68 | cls.active_compute_unit = None 69 | 70 | def test_torch_to_coreml_conversion(self): 71 | """ Tests: 72 | - PyTorch to CoreML conversion via coremltools 73 | """ 74 | with self.subTest(model="vae_decoder"): 75 | logger.info("Converting vae_decoder") 76 | torch2coreml.convert_vae_decoder(self.pytorch_pipe, self.cli_args) 77 | logger.info("Successfuly converted vae_decoder") 78 | 79 | with self.subTest(model="unet"): 80 | logger.info("Converting unet") 81 | torch2coreml.convert_unet(self.pytorch_pipe, self.cli_args) 82 | logger.info("Successfuly converted unet") 83 | 84 | with self.subTest(model="text_encoder"): 85 | logger.info("Converting text_encoder") 86 | torch2coreml.convert_text_encoder(self.pytorch_pipe, self.cli_args) 87 | logger.info("Successfuly converted text_encoder") 88 | 89 | with self.subTest(model="safety_checker"): 90 | logger.info("Converting safety_checker") 91 | torch2coreml.convert_safety_checker(self.pytorch_pipe, 92 | self.cli_args) 93 | logger.info("Successfuly converted safety_checker") 94 | 95 | def test_end_to_end_image_generation_speed(self): 96 | """ Tests: 97 | - Speed of CoreML runtime across several compute units 98 | - Integration with `diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py` 99 | """ 100 | latency = { 101 | compute_unit: 102 | self._coreml_text_to_image_with_compute_unit(compute_unit) 103 | for compute_unit in TEST_COMPUTE_UNIT 104 | } 105 | latency["num_repeats_for_median"] = TEST_TEXT_TO_IMAGE_SPEED_REPEATS 106 | 107 | 
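# The JSON written below maps each tested compute unit to its median end-to-end
# latency in seconds, plus the repeat count used for the median, e.g.
# {"CPU_AND_GPU": ..., "ALL": ..., "CPU_AND_NE": ..., "num_repeats_for_median": 3};
# the actual values depend entirely on the host machine.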
json_path = os.path.join(self.cli_args.o, "benchmark.json") 108 | logger.info(f"Saving inference benchmark results to {json_path}") 109 | with open(json_path, "w") as f: 110 | json.dump(latency, f) 111 | 112 | for compute_unit in TEST_COMPUTE_UNIT: 113 | with self.subTest(compute_unit=compute_unit): 114 | self.assertGreater(TEST_ABSOLUTE_MAX_LATENCY, 115 | latency[compute_unit]) 116 | 117 | def test_image_to_prompt_clip_score(self): 118 | """ Tests: 119 | Affinity of the generated image with the original prompt via CLIP score 120 | """ 121 | logger.warning( 122 | "This test will download the CLIP ViT-B/16 model (approximately 600 MB) from Hugging Face" 123 | ) 124 | 125 | from transformers import CLIPProcessor, CLIPModel 126 | 127 | model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16") 128 | processor = CLIPProcessor.from_pretrained( 129 | "openai/clip-vit-base-patch16") 130 | 131 | for compute_unit in TEST_COMPUTE_UNIT: 132 | with self.subTest(compute_unit=compute_unit): 133 | image_path = pipeline.get_image_path(self.cli_args, 134 | prompt=TEST_PROMPT, 135 | compute_unit=compute_unit) 136 | image = Image.open(image_path) 137 | 138 | # Preprocess images and text for inference with CLIP 139 | inputs = processor(text=[TEST_PROMPT], 140 | images=image, 141 | return_tensors="pt", 142 | padding=True) 143 | outputs = model(**inputs) 144 | 145 | # Compute cosine similarity between image and text embeddings 146 | image_text_cosine_similarity = outputs.image_embeds @ outputs.text_embeds.T 147 | logger.info( 148 | f"Image ({image_path}) to text ({TEST_PROMPT}) CLIP score: {image_text_cosine_similarity[0].item():.2f}" 149 | ) 150 | 151 | # Ensure that the minimum cosine similarity threshold is achieved 152 | self.assertGreater( 153 | image_text_cosine_similarity, 154 | TEST_MINIMUM_PROMPT_TO_IMAGE_CLIP_COSINE_SIMILARITY, 155 | ) 156 | 157 | def test_safety_checker_efficacy(self): 158 | """ Tests: 159 | - Efficacy of the safety_checker 160 | """ 161 | self._init_coreml_pipe(compute_unit=self.active_compute_unit) 162 | 163 | safety_checker_test_prompt = "NSFW" 164 | image = self.coreml_pipe(safety_checker_test_prompt) 165 | 166 | # Image must have been erased by the safety checker 167 | self.assertEqual(np.array(image["images"][0]).sum(), 0.) 
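# The safety checker replaces flagged images with black (all-zero) images,
# so a zero pixel sum confirms the NSFW output was filtered rather than returned.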
168 | self.assertTrue(image["nsfw_content_detected"].any()) 169 | 170 | def test_swift_cli_image_generation(self): 171 | """ Tests: 172 | - The bridge between Python and Swift CLI 173 | - The signal parity of Swift CLI generated image with that of Python CLI 174 | """ 175 | # coremltools to Core ML compute unit mapping 176 | compute_unit_map = { 177 | "ALL": "all", 178 | "CPU_AND_GPU": "cpuAndGPU", 179 | "CPU_AND_NE": "cpuAndNeuralEngine" 180 | } 181 | 182 | # Prepare resources for Swift CLI 183 | resources_dir = torch2coreml.bundle_resources_for_swift_cli( 184 | self.cli_args) 185 | logger.info("Bundled resources for Swift CLI") 186 | 187 | # Execute image generation with Swift CLI 188 | # Note: First time takes ~5 minutes due to project building and so on 189 | cmd = " ".join([ 190 | f"swift run StableDiffusionSample \"{TEST_PROMPT}\"", 191 | f"--resource-path {resources_dir}", 192 | f"--seed {TEST_SEED}", 193 | f"--output-path {self.cli_args.o}", 194 | f"--compute-units {compute_unit_map[TEST_COMPUTE_UNIT[-1]]}" 195 | ]) 196 | logger.info(f"Executing `{cmd}`") 197 | os.system(cmd) 198 | logger.info(f"Image generation with Swift CLI is complete") 199 | 200 | # Load Swift CLI generated image 201 | swift_cli_image = Image.open( 202 | os.path.join( 203 | self.cli_args.o, "_".join(TEST_PROMPT.rsplit(" ")) + "." + 204 | str(TEST_SEED) + ".final.png")) 205 | 206 | # Load Python CLI (pipeline.py) generated image 207 | python_cli_image = Image.open(pipeline.get_image_path(self.cli_args, 208 | prompt=TEST_PROMPT, 209 | compute_unit=TEST_COMPUTE_UNIT[-1])) 210 | 211 | # Compute signal parity 212 | swift2torch_psnr = torch2coreml.report_correctness( 213 | np.array(swift_cli_image.convert("RGB")), 214 | np.array(python_cli_image.convert("RGB")), 215 | "Swift CLI and Python CLI generated images") 216 | self.assertGreater(swift2torch_psnr, torch2coreml.ABSOLUTE_MIN_PSNR) 217 | 218 | def _init_coreml_pipe(self, compute_unit): 219 | """ Initializes CoreML pipe for the requested compute_unit 220 | """ 221 | assert compute_unit in ct.ComputeUnit._member_names_, f"Not a valid coremltools.ComputeUnit: {compute_unit}" 222 | 223 | if self.active_compute_unit == compute_unit: 224 | logger.info( 225 | "self.coreml_pipe matches requested compute_unit, skipping reinitialization" 226 | ) 227 | assert \ 228 | isinstance(self.coreml_pipe, pipeline.CoreMLStableDiffusionPipeline), \ 229 | type(self.coreml_pipe) 230 | else: 231 | self.active_compute_unit = compute_unit 232 | self.coreml_pipe = pipeline.get_coreml_pipe( 233 | pytorch_pipe=self.pytorch_pipe, 234 | mlpackages_dir=self.cli_args.o, 235 | model_version=self.cli_args.model_version, 236 | compute_unit=self.active_compute_unit,) 237 | 238 | 239 | def _coreml_text_to_image_with_compute_unit(self, compute_unit): 240 | """ Benchmark end-to-end text-to-image generation with the requested compute_unit 241 | """ 242 | self._init_coreml_pipe(compute_unit) 243 | 244 | # Warm up (not necessary in all settings but improves consistency for benchmarking) 245 | logger.info( 246 | f"Warmup image generation with {TEST_WARMUP_INFERENCE_STEPS} inference steps" 247 | ) 248 | image = self.coreml_pipe( 249 | TEST_PROMPT, num_inference_steps=TEST_WARMUP_INFERENCE_STEPS) 250 | 251 | # Test end-to-end speed 252 | logger.info( 253 | f"Run full image generation {TEST_TEXT_TO_IMAGE_SPEED_REPEATS} times and report median" 254 | ) 255 | 256 | def test_coreml_text_to_image_speed(): 257 | """ Execute Core ML based image generation 258 | """ 259 | _reset_seed() 260 | image = 
self.coreml_pipe(TEST_PROMPT)["images"][0] 261 | out_path = pipeline.get_image_path(self.cli_args, 262 | prompt=TEST_PROMPT, 263 | compute_unit=compute_unit) 264 | logger.info(f"Saving generated image to {out_path}") 265 | image.save(out_path) 266 | 267 | def collect_timings(callable, n): 268 | """ Collect user latency for callable 269 | """ 270 | user_latencies = [] 271 | for _ in range(n): 272 | s = time.time() 273 | callable() 274 | user_latencies.append(float(f"{time.time() - s:.2f}")) 275 | return user_latencies 276 | 277 | coreml_latencies = collect_timings( 278 | callable=test_coreml_text_to_image_speed, 279 | n=TEST_TEXT_TO_IMAGE_SPEED_REPEATS) 280 | coreml_median_latency = median(coreml_latencies) 281 | 282 | logger.info( 283 | f"End-to-end latencies with coremltools.ComputeUnit.{compute_unit}: median={coreml_median_latency:.2f}" 284 | ) 285 | 286 | return coreml_median_latency 287 | 288 | 289 | def _reset_seed(): 290 | """ Reset RNG state in order to reproduce the results across multiple runs 291 | """ 292 | torch.manual_seed(TEST_SEED) 293 | np.random.seed(TEST_SEED) 294 | 295 | 296 | def _get_test_artifacts_dir(args): 297 | if cli_args.persistent_test_artifacts_dir is not None: 298 | os.makedirs(cli_args.persistent_test_artifacts_dir, exist_ok=True) 299 | return contextlib.nullcontext( 300 | enter_result=cli_args.persistent_test_artifacts_dir) 301 | else: 302 | return tempfile.TemporaryDirectory( 303 | prefix="python_coreml_stable_diffusion_tests") 304 | 305 | 306 | def _extend_parser(parser): 307 | parser.add_argument( 308 | "--persistent-test-artifacts-dir", 309 | type=str, 310 | default=None, 311 | help= 312 | ("If specified, test artifacts such as Core ML models and generated images are saved in this directory. ", 313 | "Otherwise, all artifacts are erased after the test program terminates." 314 | )) 315 | parser.add_argument( 316 | "--fast", 317 | action="store_true", 318 | help= 319 | "If specified, runs fewer repeats for `test_end_to_end_image_generation_speed`" 320 | ) 321 | parser.add_argument( 322 | "--test-image-to-prompt-clip-score-opt-in", 323 | action="store_true", 324 | help= 325 | ("If specified, enables `test_image_to_prompt_clip_score` to verify the relevance of the " 326 | "generated image content to the original text prompt. This test is an opt-in " 327 | "test because it involves an additional one time 600MB model download." 
328 | )) 329 | parser.add_argument( 330 | "--test-swift-cli-opt-in", 331 | action="store_true", 332 | help= 333 | ("If specified, compiles all models and builds the Swift CLI to run image generation and compares " 334 | "results across Python and Swift runtime")) 335 | parser.add_argument( 336 | "--test-safety-checker-efficacy-opt-in", 337 | action="store_true", 338 | help= 339 | ("If specified, generates a potentially NSFW image to check whether the `safety_checker` " 340 | "accurately detects and removes the content")) 341 | return parser 342 | 343 | 344 | if __name__ == "__main__": 345 | # Reproduce the CLI of the original pipeline 346 | parser = torch2coreml.parser_spec() 347 | parser = _extend_parser(parser) 348 | cli_args = parser.parse_args() 349 | 350 | cli_args.check_output_correctness = True 351 | cli_args.prompt = TEST_PROMPT 352 | cli_args.seed = TEST_SEED 353 | cli_args.compute_unit = TEST_COMPUTE_UNIT[0] 354 | cli_args.scheduler = None # use default 355 | torch2coreml.ABSOLUTE_MIN_PSNR = TEST_PSNR_THRESHOLD 356 | 357 | if cli_args.fast: 358 | logger.info( 359 | "`--fast` detected: Image generation will be run once " \ 360 | f"(instead of {TEST_TEXT_TO_IMAGE_SPEED_REPEATS } times) " \ 361 | "with ComputeUnit.ALL (other compute units are skipped)" \ 362 | " (median can not be reported)") 363 | TEST_TEXT_TO_IMAGE_SPEED_REPEATS = 1 364 | TEST_COMPUTE_UNIT = ["ALL"] 365 | 366 | logger.info("`--fast` detected: Skipping `--check-output-correctness` tests") 367 | cli_args.check_output_correctness = False 368 | elif cli_args.attention_implementation == "ORIGINAL": 369 | TEST_COMPUTE_UNIT = ["CPU_AND_GPU", "ALL"] 370 | elif cli_args.attention_implementation == "SPLIT_EINSUM": 371 | TEST_COMPUTE_UNIT = ["ALL", "CPU_AND_NE"] 372 | 373 | logger.info(f"Testing compute units: {TEST_COMPUTE_UNIT}") 374 | 375 | 376 | # Save CoreML model files and generated images into the artifacts dir 377 | with _get_test_artifacts_dir(cli_args) as test_artifacts_dir: 378 | cli_args.o = test_artifacts_dir 379 | logger.info(f"Test artifacts will be saved under {test_artifacts_dir}") 380 | 381 | TestStableDiffusionForTextToImage.cli_args = cli_args 382 | 383 | # Run the following tests in sequential order 384 | suite = unittest.TestSuite() 385 | suite.addTest( 386 | TestStableDiffusionForTextToImage( 387 | "test_torch_to_coreml_conversion")) 388 | suite.addTest( 389 | TestStableDiffusionForTextToImage( 390 | "test_end_to_end_image_generation_speed")) 391 | 392 | if cli_args.test_safety_checker_efficacy_opt_in: 393 | suite.addTest( 394 | TestStableDiffusionForTextToImage("test_safety_checker_efficacy")) 395 | 396 | if cli_args.test_image_to_prompt_clip_score_opt_in: 397 | suite.addTest( 398 | TestStableDiffusionForTextToImage( 399 | "test_image_to_prompt_clip_score")) 400 | 401 | if cli_args.test_swift_cli_opt_in: 402 | suite.addTest( 403 | TestStableDiffusionForTextToImage( 404 | "test_swift_cli_image_generation")) 405 | 406 | if os.getenv("DEBUG", False): 407 | suite.debug() 408 | else: 409 | runner = unittest.TextTestRunner() 410 | runner.run(suite) 411 | -------------------------------------------------------------------------------- /python_coreml_stable_diffusion/pipeline.py: -------------------------------------------------------------------------------- 1 | # 2 | # For licensing see accompanying LICENSE.md file. 3 | # Copyright (C) 2022 Apple Inc. All Rights Reserved. 
4 | # 5 | 6 | import argparse 7 | 8 | from diffusers.pipeline_utils import DiffusionPipeline 9 | from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput 10 | from diffusers.schedulers import ( 11 | DDIMScheduler, 12 | DPMSolverMultistepScheduler, 13 | EulerAncestralDiscreteScheduler, 14 | EulerDiscreteScheduler, 15 | LMSDiscreteScheduler, 16 | PNDMScheduler, 17 | ) 18 | from diffusers.schedulers.scheduling_utils import SchedulerMixin 19 | 20 | import gc 21 | import inspect 22 | 23 | import logging 24 | 25 | logging.basicConfig() 26 | logger = logging.getLogger(__name__) 27 | logger.setLevel(logging.INFO) 28 | 29 | import numpy as np 30 | import os 31 | 32 | from python_coreml_stable_diffusion.coreml_model import ( 33 | CoreMLModel, 34 | _load_mlpackage, 35 | get_available_compute_units, 36 | ) 37 | 38 | import time 39 | import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()` 40 | from transformers import CLIPFeatureExtractor, CLIPTokenizer 41 | from typing import Union, Optional 42 | 43 | 44 | class CoreMLStableDiffusionPipeline(DiffusionPipeline): 45 | """ Core ML version of 46 | `diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline` 47 | """ 48 | 49 | def __init__( 50 | self, 51 | text_encoder: CoreMLModel, 52 | unet: CoreMLModel, 53 | vae_decoder: CoreMLModel, 54 | feature_extractor: CLIPFeatureExtractor, 55 | safety_checker: Optional[CoreMLModel], 56 | scheduler: Union[DDIMScheduler, 57 | DPMSolverMultistepScheduler, 58 | EulerAncestralDiscreteScheduler, 59 | EulerDiscreteScheduler, 60 | LMSDiscreteScheduler, 61 | PNDMScheduler], 62 | tokenizer: CLIPTokenizer, 63 | ): 64 | super().__init__() 65 | 66 | # Register non-Core ML components of the pipeline similar to the original pipeline 67 | self.register_modules( 68 | tokenizer=tokenizer, 69 | scheduler=scheduler, 70 | feature_extractor=feature_extractor, 71 | ) 72 | 73 | if safety_checker is None: 74 | # Reproduce original warning: 75 | # https://github.com/huggingface/diffusers/blob/v0.9.0/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L119 76 | logger.warning( 77 | f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" 78 | " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" 79 | " results in services or applications open to the public. Both the diffusers team and Hugging Face" 80 | " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" 81 | " it only for use-cases that involve analyzing network behavior or auditing its results. For more" 82 | " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
83 | ) 84 | 85 | # Register Core ML components of the pipeline 86 | self.safety_checker = safety_checker 87 | self.text_encoder = text_encoder 88 | self.unet = unet 89 | self.unet.in_channels = self.unet.expected_inputs["sample"]["shape"][1] 90 | 91 | self.vae_decoder = vae_decoder 92 | 93 | VAE_DECODER_UPSAMPLE_FACTOR = 8 94 | 95 | # In PyTorch, users can determine the tensor shapes dynamically by default 96 | # In CoreML, tensors have static shapes unless flexible shapes were used during export 97 | # See https://coremltools.readme.io/docs/flexible-inputs 98 | latent_h, latent_w = self.unet.expected_inputs["sample"]["shape"][2:] 99 | self.height = latent_h * VAE_DECODER_UPSAMPLE_FACTOR 100 | self.width = latent_w * VAE_DECODER_UPSAMPLE_FACTOR 101 | 102 | logger.info( 103 | f"Stable Diffusion configured to generate {self.height}x{self.width} images" 104 | ) 105 | 106 | def _encode_prompt(self, prompt, num_images_per_prompt, 107 | do_classifier_free_guidance, negative_prompt): 108 | batch_size = len(prompt) if isinstance(prompt, list) else 1 109 | 110 | text_inputs = self.tokenizer( 111 | prompt, 112 | padding="max_length", 113 | max_length=self.tokenizer.model_max_length, 114 | return_tensors="np", 115 | ) 116 | text_input_ids = text_inputs.input_ids 117 | 118 | if text_input_ids.shape[-1] > self.tokenizer.model_max_length: 119 | removed_text = self.tokenizer.batch_decode( 120 | text_input_ids[:, self.tokenizer.model_max_length:]) 121 | logger.warning( 122 | "The following part of your input was truncated because CLIP can only handle sequences up to" 123 | f" {self.tokenizer.model_max_length} tokens: {removed_text}") 124 | text_input_ids = text_input_ids[:, :self.tokenizer. 125 | model_max_length] 126 | 127 | text_embeddings = self.text_encoder( 128 | input_ids=text_input_ids.astype(np.float32))["last_hidden_state"] 129 | 130 | if do_classifier_free_guidance: 131 | uncond_tokens: List[str] 132 | if negative_prompt is None: 133 | uncond_tokens = [""] * batch_size 134 | elif type(prompt) is not type(negative_prompt): 135 | raise TypeError( 136 | "`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" 137 | " {type(prompt)}.") 138 | elif isinstance(negative_prompt, str): 139 | uncond_tokens = [negative_prompt] * batch_size 140 | elif batch_size != len(negative_prompt): 141 | raise ValueError( 142 | f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" 143 | f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" 144 | " the batch size of `prompt`.") 145 | else: 146 | uncond_tokens = negative_prompt 147 | 148 | max_length = text_input_ids.shape[-1] 149 | uncond_input = self.tokenizer( 150 | uncond_tokens, 151 | padding="max_length", 152 | max_length=max_length, 153 | truncation=True, 154 | return_tensors="np", 155 | ) 156 | 157 | uncond_embeddings = self.text_encoder( 158 | input_ids=uncond_input.input_ids.astype( 159 | np.float32))["last_hidden_state"] 160 | 161 | # For classifier free guidance, we need to do two forward passes. 
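# The unconditional branch is what enables the guidance step later in __call__:
#   noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# so both embeddings must be produced here even though only one prompt was given.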
162 | # Here we concatenate the unconditional and text embeddings into a single batch 163 | # to avoid doing two forward passes 164 | text_embeddings = np.concatenate( 165 | [uncond_embeddings, text_embeddings]) 166 | 167 | text_embeddings = text_embeddings.transpose(0, 2, 1)[:, :, None, :] 168 | 169 | return text_embeddings 170 | 171 | def run_safety_checker(self, image): 172 | if self.safety_checker is not None: 173 | safety_checker_input = self.feature_extractor( 174 | self.numpy_to_pil(image), 175 | return_tensors="np", 176 | ) 177 | 178 | safety_checker_outputs = self.safety_checker( 179 | clip_input=safety_checker_input.pixel_values.astype( 180 | np.float16), 181 | images=image.astype(np.float16), 182 | adjustment=np.array([0.]).astype( 183 | np.float16), # defaults to 0 in original pipeline 184 | ) 185 | 186 | # Unpack dict 187 | has_nsfw_concept = safety_checker_outputs["has_nsfw_concepts"] 188 | image = safety_checker_outputs["filtered_images"] 189 | concept_scores = safety_checker_outputs["concept_scores"] 190 | 191 | logger.info( 192 | f"Generated image has nsfw concept={has_nsfw_concept.any()}") 193 | else: 194 | has_nsfw_concept = None 195 | 196 | return image, has_nsfw_concept 197 | 198 | def decode_latents(self, latents): 199 | latents = 1 / 0.18215 * latents 200 | image = self.vae_decoder(z=latents.astype(np.float16))["image"] 201 | image = np.clip(image / 2 + 0.5, 0, 1) 202 | image = image.transpose((0, 2, 3, 1)) 203 | 204 | return image 205 | 206 | def prepare_latents(self, 207 | batch_size, 208 | num_channels_latents, 209 | height, 210 | width, 211 | latents=None): 212 | latents_shape = (batch_size, num_channels_latents, self.height // 8, 213 | self.width // 8) 214 | if latents is None: 215 | latents = np.random.randn(*latents_shape).astype(np.float16) 216 | elif latents.shape != latents_shape: 217 | raise ValueError( 218 | f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" 219 | ) 220 | 221 | latents = latents * self.scheduler.init_noise_sigma 222 | 223 | return latents 224 | 225 | def check_inputs(self, prompt, height, width, callback_steps): 226 | if height != self.height or width != self.width: 227 | logger.warning( 228 | "`height` and `width` dimensions (of the output image tensor) are fixed when exporting the Core ML models " \ 229 | "unless flexible shapes are used during export (https://coremltools.readme.io/docs/flexible-inputs). " \ 230 | "This pipeline was provided with Core ML models that generate {self.height}x{self.width} images (user requested {height}x{width})" 231 | ) 232 | 233 | if not isinstance(prompt, str) and not isinstance(prompt, list): 234 | raise ValueError( 235 | f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" 236 | ) 237 | 238 | if height % 8 != 0 or width % 8 != 0: 239 | raise ValueError( 240 | f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 241 | ) 242 | 243 | if (callback_steps is None) or (callback_steps is not None and 244 | (not isinstance(callback_steps, int) 245 | or callback_steps <= 0)): 246 | raise ValueError( 247 | f"`callback_steps` has to be a positive integer but is {callback_steps} of type" 248 | f" {type(callback_steps)}.") 249 | 250 | def prepare_extra_step_kwargs(self, eta): 251 | # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature 252 | # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
253 | # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 254 | # and should be between [0, 1] 255 | 256 | accepts_eta = "eta" in set( 257 | inspect.signature(self.scheduler.step).parameters.keys()) 258 | extra_step_kwargs = {} 259 | if accepts_eta: 260 | extra_step_kwargs["eta"] = eta 261 | 262 | return extra_step_kwargs 263 | 264 | def __call__( 265 | self, 266 | prompt, 267 | height=512, 268 | width=512, 269 | num_inference_steps=50, 270 | guidance_scale=7.5, 271 | negative_prompt=None, 272 | num_images_per_prompt=1, 273 | eta=0.0, 274 | latents=None, 275 | output_type="pil", 276 | return_dict=True, 277 | callback=None, 278 | callback_steps=1, 279 | **kwargs, 280 | ): 281 | # 1. Check inputs. Raise error if not correct 282 | self.check_inputs(prompt, height, width, callback_steps) 283 | 284 | # 2. Define call parameters 285 | batch_size = 1 if isinstance(prompt, str) else len(prompt) 286 | if batch_size > 1 or num_images_per_prompt > 1: 287 | raise NotImplementedError( 288 | "For batched generation of multiple images and/or multiple prompts, please refer to the Swift package." 289 | ) 290 | 291 | # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) 292 | # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` 293 | # corresponds to doing no classifier free guidance. 294 | do_classifier_free_guidance = guidance_scale > 1.0 295 | 296 | # 3. Encode input prompt 297 | text_embeddings = self._encode_prompt( 298 | prompt, 299 | num_images_per_prompt, 300 | do_classifier_free_guidance, 301 | negative_prompt, 302 | ) 303 | 304 | # 4. Prepare timesteps 305 | self.scheduler.set_timesteps(num_inference_steps) 306 | timesteps = self.scheduler.timesteps 307 | 308 | # 5. Prepare latent variables 309 | num_channels_latents = self.unet.in_channels 310 | latents = self.prepare_latents( 311 | batch_size * num_images_per_prompt, 312 | num_channels_latents, 313 | height, 314 | width, 315 | latents, 316 | ) 317 | 318 | # 6. Prepare extra step kwargs 319 | extra_step_kwargs = self.prepare_extra_step_kwargs(eta) 320 | 321 | # 7. Denoising loop 322 | for i, t in enumerate(self.progress_bar(timesteps)): 323 | # expand the latents if we are doing classifier free guidance 324 | latent_model_input = np.concatenate( 325 | [latents] * 2) if do_classifier_free_guidance else latents 326 | latent_model_input = self.scheduler.scale_model_input( 327 | latent_model_input, t) 328 | 329 | # predict the noise residual 330 | noise_pred = self.unet( 331 | sample=latent_model_input.astype(np.float16), 332 | timestep=np.array([t, t], np.float16), 333 | encoder_hidden_states=text_embeddings.astype(np.float16), 334 | )["noise_pred"] 335 | 336 | # perform guidance 337 | if do_classifier_free_guidance: 338 | noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) 339 | noise_pred = noise_pred_uncond + guidance_scale * ( 340 | noise_pred_text - noise_pred_uncond) 341 | 342 | # compute the previous noisy sample x_t -> x_t-1 343 | latents = self.scheduler.step(torch.from_numpy(noise_pred), 344 | t, 345 | torch.from_numpy(latents), 346 | **extra_step_kwargs, 347 | ).prev_sample.numpy() 348 | 349 | # call the callback, if provided 350 | if callback is not None and i % callback_steps == 0: 351 | callback(i, t, latents) 352 | 353 | # 8. Post-processing 354 | image = self.decode_latents(latents) 355 | 356 | # 9. Run safety checker 357 | image, has_nsfw_concept = self.run_safety_checker(image) 358 | 359 | # 10. 
Convert to PIL 360 | if output_type == "pil": 361 | image = self.numpy_to_pil(image) 362 | 363 | if not return_dict: 364 | return (image, has_nsfw_concept) 365 | 366 | return StableDiffusionPipelineOutput( 367 | images=image, nsfw_content_detected=has_nsfw_concept) 368 | 369 | 370 | def get_available_schedulers(): 371 | schedulers = {} 372 | for scheduler in [DDIMScheduler, 373 | DPMSolverMultistepScheduler, 374 | EulerAncestralDiscreteScheduler, 375 | EulerDiscreteScheduler, 376 | LMSDiscreteScheduler, 377 | PNDMScheduler]: 378 | schedulers[scheduler().__class__.__name__.replace("Scheduler", "")] = scheduler 379 | return schedulers 380 | 381 | SCHEDULER_MAP = get_available_schedulers() 382 | 383 | def get_coreml_pipe(pytorch_pipe, 384 | mlpackages_dir, 385 | model_version, 386 | compute_unit, 387 | delete_original_pipe=True, 388 | scheduler_override=None): 389 | """ Initializes and returns a `CoreMLStableDiffusionPipeline` from an original 390 | diffusers PyTorch pipeline 391 | """ 392 | # Ensure `scheduler_override` object is of correct type if specified 393 | if scheduler_override is not None: 394 | assert isinstance(scheduler_override, SchedulerMixin) 395 | logger.warning( 396 | "Overriding scheduler in pipeline: " 397 | f"Default={pytorch_pipe.scheduler}, Override={scheduler_override}") 398 | 399 | # Gather configured tokenizer and scheduler attributes from the original pipe 400 | coreml_pipe_kwargs = { 401 | "tokenizer": pytorch_pipe.tokenizer, 402 | "scheduler": pytorch_pipe.scheduler if scheduler_override is None else scheduler_override, 403 | "feature_extractor": pytorch_pipe.feature_extractor, 404 | } 405 | 406 | model_names_to_load = ["text_encoder", "unet", "vae_decoder"] 407 | if getattr(pytorch_pipe, "safety_checker", None) is not None: 408 | model_names_to_load.append("safety_checker") 409 | else: 410 | logger.warning( 411 | f"Original diffusers pipeline for {model_version} does not have a safety_checker, " 412 | "Core ML pipeline will mirror this behavior.") 413 | coreml_pipe_kwargs["safety_checker"] = None 414 | 415 | if delete_original_pipe: 416 | del pytorch_pipe 417 | gc.collect() 418 | logger.info("Removed PyTorch pipe to reduce peak memory consumption") 419 | 420 | # Load Core ML models 421 | logger.info(f"Loading Core ML models in memory from {mlpackages_dir}") 422 | coreml_pipe_kwargs.update({ 423 | model_name: _load_mlpackage( 424 | model_name, 425 | mlpackages_dir, 426 | model_version, 427 | compute_unit, 428 | ) 429 | for model_name in model_names_to_load 430 | }) 431 | logger.info("Done.") 432 | 433 | logger.info("Initializing Core ML pipe for image generation") 434 | coreml_pipe = CoreMLStableDiffusionPipeline(**coreml_pipe_kwargs) 435 | logger.info("Done.") 436 | 437 | return coreml_pipe 438 | 439 | 440 | def get_image_path(args, **override_kwargs): 441 | """ mkdir output folder and encode metadata in the filename 442 | """ 443 | out_folder = os.path.join(args.o, "_".join(args.prompt.replace("/", "_").rsplit(" "))) 444 | os.makedirs(out_folder, exist_ok=True) 445 | 446 | out_fname = f"randomSeed_{override_kwargs.get('seed', None) or args.seed}" 447 | out_fname += f"_computeUnit_{override_kwargs.get('compute_unit', None) or args.compute_unit}" 448 | out_fname += f"_modelVersion_{override_kwargs.get('model_version', None) or args.model_version.replace('/', '_')}" 449 | 450 | if args.scheduler is not None: 451 | out_fname += f"_customScheduler_{override_kwargs.get('scheduler', None) or args.scheduler}" 452 | out_fname += 
f"_numInferenceSteps{override_kwargs.get('num_inference_steps', None) or args.num_inference_steps}" 453 | 454 | return os.path.join(out_folder, out_fname + ".png") 455 | 456 | 457 | def main(args): 458 | logger.info(f"Setting random seed to {args.seed}") 459 | np.random.seed(args.seed) 460 | 461 | logger.info("Initializing PyTorch pipe for reference configuration") 462 | from diffusers import StableDiffusionPipeline 463 | pytorch_pipe = StableDiffusionPipeline.from_pretrained(args.model_version, 464 | use_auth_token=True) 465 | 466 | user_specified_scheduler = None 467 | if args.scheduler is not None: 468 | user_specified_scheduler = SCHEDULER_MAP[ 469 | args.scheduler].from_config(pytorch_pipe.scheduler.config) 470 | 471 | coreml_pipe = get_coreml_pipe(pytorch_pipe=pytorch_pipe, 472 | mlpackages_dir=args.i, 473 | model_version=args.model_version, 474 | compute_unit=args.compute_unit, 475 | scheduler_override=user_specified_scheduler) 476 | 477 | logger.info("Beginning image generation.") 478 | image = coreml_pipe( 479 | prompt=args.prompt, 480 | height=coreml_pipe.height, 481 | width=coreml_pipe.width, 482 | num_inference_steps=args.num_inference_steps, 483 | ) 484 | 485 | out_path = get_image_path(args) 486 | logger.info(f"Saving generated image to {out_path}") 487 | image["images"][0].save(out_path) 488 | 489 | 490 | if __name__ == "__main__": 491 | parser = argparse.ArgumentParser() 492 | 493 | parser.add_argument( 494 | "--prompt", 495 | required=True, 496 | help="The text prompt to be used for text-to-image generation.") 497 | parser.add_argument( 498 | "-i", 499 | required=True, 500 | help=("Path to input directory with the .mlpackage files generated by " 501 | "python_coreml_stable_diffusion.torch2coreml")) 502 | parser.add_argument("-o", required=True) 503 | parser.add_argument("--seed", 504 | "-s", 505 | default=93, 506 | type=int, 507 | help="Random seed to be able to reproduce results") 508 | parser.add_argument( 509 | "--model-version", 510 | default="CompVis/stable-diffusion-v1-4", 511 | help= 512 | ("The pre-trained model checkpoint and configuration to restore. " 513 | "For available versions: https://huggingface.co/models?search=stable-diffusion" 514 | )) 515 | parser.add_argument( 516 | "--compute-unit", 517 | choices=get_available_compute_units(), 518 | default="ALL", 519 | help=("The compute units to be used when executing Core ML models. " 520 | f"Options: {get_available_compute_units()}")) 521 | parser.add_argument( 522 | "--scheduler", 523 | choices=tuple(SCHEDULER_MAP.keys()), 524 | default=None, 525 | help=("The scheduler to use for running the reverse diffusion process. 
" 526 | "If not specified, the default scheduler from the diffusers pipeline is utilized")) 527 | parser.add_argument( 528 | "--num-inference-steps", 529 | default=50, 530 | type=int, 531 | help="The number of iterations the unet model will be executed throughout the reverse diffusion process") 532 | 533 | args = parser.parse_args() 534 | main(args) 535 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Core ML Stable Diffusion 2 | 3 | Run Stable Diffusion on Apple Silicon with Core ML 4 | 5 | 6 | 7 | This repository comprises: 8 | 9 | - `python_coreml_stable_diffusion`, a Python package for converting PyTorch models to Core ML format and performing image generation with Hugging Face [diffusers](https://github.com/huggingface/diffusers) in Python 10 | - `StableDiffusion`, a Swift package that developers can add to their Xcode projects as a dependency to deploy image generation capabilities in their apps. The Swift package relies on the Core ML model files generated by `python_coreml_stable_diffusion` 11 | 12 | If you run into issues during installation or runtime, please refer to the [FAQ](#FAQ) section. 13 | 14 | 15 | ## Example Results 16 | 17 | There are numerous versions of Stable Diffusion available on the [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion). Here are example results from three of those models: 18 | 19 | `--model-version` | [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) | [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) | [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) | 20 | :------:|:------:|:------:|:------: 21 | Output | ![](assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_11_computeUnit_CPU_AND_GPU_modelVersion_stabilityai_stable-diffusion-2-base.png) | ![](assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_13_computeUnit_CPU_AND_NE_modelVersion_CompVis_stable-diffusion-v1-4.png) | ![](assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png) 22 | M1 iPad Pro 8GB Latency (s) | 29 | 38 | 38 | 23 | M1 MacBook Pro 16GB Latency (s) | 24 | 35 | 35 | 24 | M2 MacBook Air 8GB Latency (s) | 18 | 23 | 23 | 25 | 26 | Please see [Important Notes on Performance Benchmarks](#important-notes-on-performance-benchmarks) section for details. 27 | 28 | 29 | ## Converting Models to Core ML 30 | 31 |
32 | Click to expand 33 | 34 | **Step 1:** Create a Python environment and install dependencies: 35 | 36 | ```bash 37 | conda create -n coreml_stable_diffusion python=3.8 -y 38 | conda activate coreml_stable_diffusion 39 | cd /path/to/cloned/ml-stable-diffusion/repository 40 | pip install -e . 41 | ``` 42 | 43 | **Step 2:** Log in to or register for your [Hugging Face account](https://huggingface.co), generate a [User Access Token](https://huggingface.co/settings/tokens) and use this token to set up Hugging Face API access by running `huggingface-cli login` in a Terminal window. 44 | 45 | **Step 3:** Navigate to the version of Stable Diffusion that you would like to use on [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion) and accept its Terms of Use. The default model version is [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4). The model version may be changed by the user as described in the next step. 46 | 47 | **Step 4:** Execute the following command from the Terminal to generate Core ML model files (`.mlpackage`) 48 | 49 | ```shell 50 | python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --convert-text-encoder --convert-vae-decoder --convert-safety-checker -o <output-mlpackages-directory> 51 | ``` 52 | 53 | **WARNING:** This command will download several GB worth of PyTorch checkpoints from Hugging Face. 54 | 55 | This generally takes 15-20 minutes on an M1 MacBook Pro. Upon successful execution, the 4 neural network models that comprise Stable Diffusion will have been converted from PyTorch to Core ML (`.mlpackage`) and saved into the specified `<output-mlpackages-directory>`. Some additional notable arguments: 56 | 57 | - `--model-version`: The model version defaults to [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4). Developers may specify other versions that are available on [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion), e.g. [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) & [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5). 58 | 59 | 60 | - `--bundle-resources-for-swift-cli`: Compiles all 4 models and bundles them along with necessary resources for text tokenization into `<output-mlpackages-directory>/Resources` which should be provided as input to the Swift package. This flag is not necessary for the diffusers-based Python pipeline. 61 | 62 | - `--chunk-unet`: Splits the Unet model into two approximately equal chunks (each with less than 1GB of weights) for mobile-friendly deployment. This is **required** for ANE deployment on iOS and iPadOS. This is not required for macOS. The Swift CLI is able to consume both the chunked and regular versions of the Unet model but prioritizes the former. Note that the chunked unet is not compatible with the Python pipeline because the Python pipeline is intended for macOS only. Chunking is for on-device deployment with Swift only. 63 | 64 | - `--attention-implementation`: Defaults to `SPLIT_EINSUM` which is the implementation described in [Deploying Transformers on the Apple Neural Engine](https://machinelearning.apple.com/research/neural-engine-transformers). `--attention-implementation ORIGINAL` will switch to an alternative that should be used for non-ANE deployment. Please refer to the [Performance Benchmark](#performance-benchmark) section for further guidance. 65 | 66 | - `--check-output-correctness`: Compares original PyTorch model's outputs to final Core ML model's outputs.
This flag increases RAM consumption significantly so it is recommended only for debugging purposes. 67 | 68 |
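Before moving on to image generation, it can help to sanity-check that the converted `.mlpackage` files load and to inspect the static shapes they were exported with. The sketch below is illustrative only and not part of the repository; the path is a placeholder for a file inside your `-o` directory, and the exact file names depend on the `--model-version` you converted.

```python
import coremltools as ct

# Placeholder: point this at one of the .mlpackage files inside your -o directory
pkg_path = "<output-mlpackages-directory>/<converted-unet>.mlpackage"

# Loading compiles the model for the requested compute unit, which can take a
# few minutes for the unet (see the FAQ below)
model = ct.models.MLModel(pkg_path, compute_units=ct.ComputeUnit.CPU_AND_NE)

# The input/output shapes are static unless flexible shapes were used at export
print(model.input_description)
print(model.output_description)
```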
69 | 70 | ## Image Generation with Python 71 | 72 |
73 | Click to expand 74 | 75 | Run text-to-image generation using the example Python pipeline based on [diffusers](https://github.com/huggingface/diffusers): 76 | 77 | ```shell 78 | python -m python_coreml_stable_diffusion.pipeline --prompt "a photo of an astronaut riding a horse on mars" -i <output-mlpackages-directory> -o <output-dir> --compute-unit ALL --seed 93 79 | ``` 80 | Please refer to the help menu for all available arguments: `python -m python_coreml_stable_diffusion.pipeline -h`. Some notable arguments: 81 | 82 | - `-i`: Should point to the `-o` directory from Step 4 of the [Converting Models to Core ML](#converting-models-to-core-ml) section above. 83 | - `--model-version`: If you overrode the default model version while converting models to Core ML, you will need to specify the same model version here. 84 | - `--compute-unit`: Note that the most performant compute unit for this particular implementation may differ across different hardware. `CPU_AND_GPU` or `CPU_AND_NE` may be faster than `ALL`. Please refer to the [Performance Benchmark](#performance-benchmark) section for further guidance. 85 | - `--scheduler`: If you would like to experiment with different schedulers, you may specify it here. For available options, please see the help menu. You may also specify a custom number of inference steps via `--num-inference-steps`, which defaults to 50. 86 | 87 |
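The CLI above is a thin wrapper around `main()` in `python_coreml_stable_diffusion/pipeline.py`. If you would rather drive generation from your own Python code, a minimal sketch that mirrors what `main()` does is shown below; the mlpackages path is a placeholder, and your Hugging Face token must already be configured as described in Step 2 of the previous section.

```python
import numpy as np
from diffusers import StableDiffusionPipeline
from python_coreml_stable_diffusion.pipeline import get_coreml_pipe

np.random.seed(93)  # the initial latents are drawn from numpy's RNG

# Reference PyTorch pipeline, used for its tokenizer, scheduler and feature extractor
pytorch_pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", use_auth_token=True)

coreml_pipe = get_coreml_pipe(
    pytorch_pipe=pytorch_pipe,
    mlpackages_dir="<output-mlpackages-directory>",  # placeholder
    model_version="CompVis/stable-diffusion-v1-4",
    compute_unit="ALL",
)

result = coreml_pipe(
    prompt="a photo of an astronaut riding a horse on mars",
    height=coreml_pipe.height,
    width=coreml_pipe.width,
    num_inference_steps=50,
)
result["images"][0].save("astronaut.png")
```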
88 | 89 | ## Image Generation with Swift 90 | 91 |
92 | Click to expand 93 | 94 | ### System Requirements 95 | Building the Swift projects requires: 96 | - macOS 13 or newer 97 | - Xcode 14.1 or newer with command line tools installed. Please check [developer.apple.com](https://developer.apple.com/download/all/?q=xcode) for the latest version. 98 | - Core ML models and tokenization resources. Please see `--bundle-resources-for-swift-cli` from the [Converting Models to Core ML](#converting-models-to-core-ml) section above. 99 | 100 | If deploying this model to: 101 | - iPhone 102 | - iOS 16.2 or newer 103 | - iPhone 12 or newer 104 | - iPad 105 | - iPadOS 16.2 or newer 106 | - M1 or newer 107 | - Mac 108 | - macOS 13.1 or newer 109 | - M1 or newer 110 | 111 | ### Example CLI Usage 112 | ```shell 113 | swift run StableDiffusionSample "a photo of an astronaut riding a horse on mars" --resource-path <output-mlpackages-directory>/Resources/ --seed 93 --output-path <output-dir> 114 | ``` 115 | The output will be named based on the prompt and random seed: 116 | e.g. `<output-dir>/a_photo_of_an_astronaut_riding_a_horse_on_mars.93.final.png` 117 | 118 | Please use the `--help` flag to learn about batched generation and more. 119 | 120 | ### Example Library Usage 121 | 122 | ```swift 123 | import StableDiffusion 124 | ... 125 | let pipeline = try StableDiffusionPipeline(resourcesAt: resourceURL) 126 | let image = try pipeline.generateImages(prompt: prompt, seed: seed).first 127 | ``` 128 | 129 | ### Swift Package Details 130 | 131 | This Swift package contains two products: 132 | 133 | - `StableDiffusion` library 134 | - `StableDiffusionSample` command-line tool 135 | 136 | Both of these products require the Core ML models and tokenization resources to be supplied. When specifying resources via a directory path, that directory must contain the following: 137 | 138 | - `TextEncoder.mlmodelc` (text embedding model) 139 | - `Unet.mlmodelc` or `UnetChunk1.mlmodelc` & `UnetChunk2.mlmodelc` (denoising autoencoder model) 140 | - `VAEDecoder.mlmodelc` (image decoder model) 141 | - `vocab.json` (tokenizer vocabulary file) 142 | - `merges.txt` (merges for byte pair encoding file) 143 | 144 | Optionally, it may also include the safety checker model that some versions of Stable Diffusion include: 145 | 146 | - `SafetyChecker.mlmodelc` 147 | 148 | Note that the chunked version of Unet is checked for first. Only if it is not present will the full `Unet.mlmodelc` be loaded. Chunking is required for iOS and iPadOS and not necessary for macOS. 149 | 150 |
151 | 152 | ## Performance Benchmark 153 | 154 |
155 | Click to expand 156 | 157 | Standard [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) Benchmark 158 | 159 | | Device | `--compute-unit`| `--attention-implementation` | Latency (seconds) | 160 | | ---------------------------------- | -------------- | ---------------------------- | ----------------- | 161 | | Mac Studio (M1 Ultra, 64-core GPU) | `CPU_AND_GPU` | `ORIGINAL` | 9 | 162 | | Mac Studio (M1 Ultra, 48-core GPU) | `CPU_AND_GPU` | `ORIGINAL` | 13 | 163 | | MacBook Pro (M1 Max, 32-core GPU) | `CPU_AND_GPU` | `ORIGINAL` | 18 | 164 | | MacBook Pro (M1 Max, 24-core GPU) | `CPU_AND_GPU` | `ORIGINAL` | 20 | 165 | | MacBook Pro (M1 Pro, 16-core GPU) | `ALL` | `SPLIT_EINSUM (default)` | 26 | 166 | | MacBook Pro (M2) | `CPU_AND_NE` | `SPLIT_EINSUM (default)` | 23 | 167 | | MacBook Pro (M1) | `CPU_AND_NE` | `SPLIT_EINSUM (default)` | 35 | 168 | | iPad Pro (5th gen, M1) | `CPU_AND_NE` | `SPLIT_EINSUM (default)` | 38 | 169 | 170 | 171 | Please see [Important Notes on Performance Benchmarks](#important-notes-on-performance-benchmarks) section for details. 172 | 173 |
174 | 175 | ## Important Notes on Performance Benchmarks 176 | 177 |
178 | Click to expand 179 | 180 | - This benchmark was conducted by Apple using public beta versions of iOS 16.2, iPadOS 16.2 and macOS 13.1 in November 2022. 181 | - The executed program is `python_coreml_stable_diffusion.pipeline` for macOS devices and a minimal Swift test app built on the `StableDiffusion` Swift package for iOS and iPadOS devices. 182 | - The median value across 3 end-to-end executions is reported. 183 | - Performance may materially differ across different versions of Stable Diffusion due to architecture changes in the model itself. Each reported number is specific to the model version mentioned in that context. 184 | - The image generation procedure follows the standard configuration: 50 inference steps, 512x512 output image resolution, 77 text token sequence length, classifier-free guidance (batch size of 2 for unet). 185 | - The actual prompt length does not impact performance because the Core ML model is converted with a static shape that computes the forward pass for all of the 77 elements (`tokenizer.model_max_length`) in the text token sequence regardless of the actual length of the input text. 186 | - Pipelining across the 4 models is not optimized and these performance numbers are subject to variance under increased system load from other applications. Given these factors, we do not report sub-second variance in latency. 187 | - Weights and activations are in float16 precision for both the GPU and the ANE. 188 | - The Swift CLI program consumes a peak memory of approximately 2.6GB (without the safety checker), 2.1GB of which is model weights in float16 precision. We applied [8-bit weight quantization](https://coremltools.readme.io/docs/compressing-ml-program-weights#use-affine-quantization) to reduce peak memory consumption by approximately 1GB. However, we observed that it had an adverse effect on generated image quality and we rolled it back. We encourage developers to experiment with other advanced weight compression techniques such as [palettization](https://coremltools.readme.io/docs/compressing-ml-program-weights#use-a-lookup-table) and/or [pruning](https://coremltools.readme.io/docs/compressing-ml-program-weights#use-sparse-representation) which may yield better results. 189 | - In the [benchmark table](#performance-benchmark), we report the best performing `--compute-unit` and `--attention-implementation` values per device. The former does not modify the Core ML model and can be applied during runtime. The latter modifies the Core ML model. Note that the best performing compute unit is model version and hardware-specific. 190 | 191 |
192 | 193 | 194 | ## Results with Different Compute Units 195 | 196 |
197 | Click to expand 198 | 199 | It is highly probable that there will be slight differences across generated images using different compute units. 200 | 201 | The following images were generated on an M1 MacBook Pro and macOS 13.1 with the prompt *"a photo of an astronaut riding a horse on mars"* using the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) model version. The random seed was set to 93: 202 | 203 | CPU_AND_NE | CPU_AND_GPU | ALL | 204 | :------------:|:-------------:|:------: 205 | ![](assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_NE_modelVersion_runwayml_stable-diffusion-v1-5.png) | ![](assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_CPU_AND_GPU_modelVersion_runwayml_stable-diffusion-v1-5.png) | ![](assets/a_high_quality_photo_of_an_astronaut_riding_a_horse_in_space/randomSeed_93_computeUnit_ALL_modelVersion_runwayml_stable-diffusion-v1-5.png) | 206 | 207 | Differences may be less or more pronounced for different inputs. Please see the [FAQ](#faq) Q8 for a detailed explanation. 208 | 209 |
210 | 211 | ## FAQ 212 | 213 |
214 | Click to expand 215 |
216 | 217 | 218 | Q1: ERROR: Failed building wheel for tokenizers or error: can't find Rust compiler 219 | 220 | A1: Please review this [potential solution](https://github.com/huggingface/transformers/issues/2831#issuecomment-592724471). 221 |
222 | 223 | 224 |
225 | Q2: RuntimeError: {NSLocalizedDescription = "Error computing NN outputs." 226 | 227 | A2: There are many potential causes for this error. In this context, it is highly likely to be encountered when your system is under increased memory pressure from other applications. Reducing memory utilization of other applications is likely to help alleviate the issue. 228 |
229 | 230 |
231 | Q3: My Mac has 8GB RAM and I am converting models to Core ML using the example command. The process is getting killed because of memory issues. How do I fix this issue? 232 | 233 | A3: In order to minimize the memory impact of the model conversion process, please execute the following command instead: 234 | 235 | ```bash 236 | python -m python_coreml_stable_diffusion.torch2coreml --convert-vae-decoder -o <output-mlpackages-directory> && \ 237 | python -m python_coreml_stable_diffusion.torch2coreml --convert-unet -o <output-mlpackages-directory> && \ 238 | python -m python_coreml_stable_diffusion.torch2coreml --convert-text-encoder -o <output-mlpackages-directory> && \ 239 | python -m python_coreml_stable_diffusion.torch2coreml --convert-safety-checker -o <output-mlpackages-directory> 240 | ``` 241 | 242 | If you need `--chunk-unet`, you may run it in yet another independent command, which will reuse the previously exported Unet model and simply chunk it in place: 243 | 244 | ```bash 245 | python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --chunk-unet -o <output-mlpackages-directory> 246 | ``` 247 | 248 |
249 | 250 |
251 | Q4: My Mac has 8GB RAM, should image generation work on my machine? 252 | 253 | A4: Yes! Especially the `--compute-unit CPU_AND_NE` option should work under reasonable system load from other applications. Note that part of the [Example Results](#example-results) were generated using an M2 MacBook Air with 8GB RAM. 254 |
255 | 256 |
257 | Q5: Every time I generate an image using the Python pipeline, loading all the Core ML models takes 2-3 minutes. Is this expected? 258 | 259 | A5: Yes, and using the Swift library reduces this to just a few seconds. The reason is that `coremltools` loads Core ML models (`.mlpackage`) and each model is compiled to be run on the requested compute unit during load time. Because of the size and number of operations of the unet model, it takes around 2-3 minutes to compile it for Neural Engine execution. Other models should take at most a few seconds. Note that `coremltools` does not cache the compiled model for later loads, so each load takes equally long. In order to benefit from compilation caching, the `StableDiffusion` Swift package by default relies on compiled Core ML models (`.mlmodelc`), which will be compiled down for the requested compute unit upon first load but then the cache will be reused on subsequent loads until it is purged due to lack of use. 260 |
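If you want to see where that time goes, a quick hypothetical measurement with `coremltools` (the path below is a placeholder) makes the compile-at-load behavior visible:

```python
import time
import coremltools as ct

unet_path = "<output-mlpackages-directory>/<converted-unet>.mlpackage"  # placeholder

start = time.perf_counter()
# Compilation for the requested compute unit happens inside this load call
unet = ct.models.MLModel(unet_path, compute_units=ct.ComputeUnit.CPU_AND_NE)
print(f"unet load + compile: {time.perf_counter() - start:.1f} s")
```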
261 | 262 |
263 | Q6: I want to deploy StableDiffusion, the Swift package, in my mobile app. What should I be aware of? 264 | 265 | A6: [This section](#system-requirements) describes the minimum SDK and OS versions as well as the device models supported by this package. In addition to these requirements, for best practice, we recommend testing the package on the device with the least amount of RAM available among your deployment targets. This is because `StableDiffusion` consumes approximately 2.6GB of peak memory during runtime while using `.cpuAndNeuralEngine` (the Swift equivalent of `coremltools.ComputeUnit.CPU_AND_NE`). Other compute units may have a higher peak memory consumption, so `.cpuAndNeuralEngine` is recommended for iOS and iPadOS deployment (please refer to this [section](#system-requirements) for minimum device model requirements). If your app crashes during image generation, please try adding the [Increased Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) capability to your Xcode project, which should significantly increase your app's memory limit. 266 |
267 | 268 |
269 | Q7: How do I generate images with different resolutions using the same Core ML models? 270 | 271 | A7: The current version of `python_coreml_stable_diffusion` does not support single-model multi-resolution out of the box. However, developers may fork this project and leverage the [flexible shapes](https://coremltools.readme.io/docs/flexible-inputs) support from coremltools to extend the `torch2coreml` script by using `coremltools.EnumeratedShapes`. Note that, while the `text_encoder` is agnostic to the image resolution, the inputs and outputs of `vae_decoder` and `unet` models are dependent on the desired image resolution. 272 |
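As a sketch of what such a fork might do (the names and shapes below are illustrative, not part of this repository), the [flexible inputs documentation](https://coremltools.readme.io/docs/flexible-inputs) describes declaring a small set of supported shapes with `coremltools.EnumeratedShapes` and passing it to `ct.convert` via a `ct.TensorType` input:

```python
import coremltools as ct

# Latents are 1/8th of the output resolution; a batch of 2 covers classifier-free guidance.
# These two entries would correspond to 512x512 and 768x768 outputs.
sample_shapes = ct.EnumeratedShapes(
    shapes=[[2, 4, 64, 64], [2, 4, 96, 96]],
    default=[2, 4, 64, 64],
)

# This TensorType would replace the static "sample" input when calling ct.convert(...)
sample_input = ct.TensorType(name="sample", shape=sample_shapes)
```

The `unet` and `vae_decoder` inputs and outputs would both need this treatment, and the pipeline's `self.height`/`self.width` bookkeeping would have to account for multiple supported resolutions.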
273 | 274 |
275 | Q8: Are the Core ML and PyTorch generated images going to be identical? 276 | 277 | A8: If desired, the generated images across PyTorch and Core ML can be made approximately identical. However, it is not guaranteed by default. There are several factors that might lead to different images across PyTorch and Core ML: 278 | 279 | 280 | 1. Random Number Generator Behavior 281 | 282 | The main source of potentially different results across PyTorch and Core ML is the Random Number Generator ([RNG](https://en.wikipedia.org/wiki/Random_number_generation)) behavior. PyTorch and Numpy have different sources of randomness. `python_coreml_stable_diffusion` generally relies on Numpy for RNG (e.g. latents initialization) and the `StableDiffusion` Swift library reproduces this RNG behavior. However, PyTorch-based pipelines such as Hugging Face `diffusers` rely on PyTorch's RNG behavior. 283 | 284 | 2. PyTorch 285 | 286 | *"Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds."* ([source](https://pytorch.org/docs/stable/notes/randomness.html#reproducibility)). 287 | 288 | 3. Model Function Drift During Conversion 289 | 290 | A difference in outputs between corresponding PyTorch and Core ML models is another potential cause. The signal integrity is tested during the conversion process (enabled via the `--check-output-correctness` argument to `python_coreml_stable_diffusion.torch2coreml`) and it is verified to be above a minimum [PSNR](https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio) value as tested on random inputs. Note that this is simply a sanity check and does not guarantee this minimum PSNR across all possible inputs. Furthermore, the results are not guaranteed to be identical when executing the same Core ML models across different compute units. This is not expected to be a major source of difference as the sample visual results indicate in [this section](#results-with-different-compute-units). 291 | 292 | 4. Weights and Activations Data Type 293 | 294 | When quantizing models from float32 to lower-precision data types such as float16, the generated images are [known to vary slightly](https://lambdalabs.com/blog/inference-benchmark-stable-diffusion) in semantics even when using the same PyTorch model. Core ML models generated by coremltools have float16 weights and activations by default [unless explicitly overridden](https://github.com/apple/coremltools/blob/main/coremltools/converters/_converters_entry.py#L256). This is not expected to be a major source of difference. 295 | 296 |
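The snippet below makes factor 1 concrete: with the same seed, NumPy (used by this repository for the initial latents, as in `prepare_latents` above) and PyTorch (used by `diffusers`) produce different random latents, so the denoising trajectories diverge from the very first step. The shapes are illustrative.

```python
import numpy as np
import torch

seed = 93
np.random.seed(seed)
torch.manual_seed(seed)

np_latents = np.random.randn(1, 4, 64, 64)          # what this repo's Python pipeline does
torch_latents = torch.randn(1, 4, 64, 64).numpy()   # what diffusers does

# Different RNG streams: the initial latents (and therefore the images) differ
print(np.allclose(np_latents, torch_latents))  # prints False in practice
```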
297 | 298 |
299 | Q9: The model files are very large, how do I avoid a large binary for my App? 300 | 301 | A9: The recommended option is to prompt the user to download these assets upon first launch of the app. This keeps the app binary size independent of the Core ML models being deployed. Disclosing the size of the download to the user is extremely important as there could be data charges or storage impact that the user might not be comfortable with. 302 | 303 |
304 | 305 |
306 | -------------------------------------------------------------------------------- /ACKNOWLEDGEMENTS: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | Portions of this software may utilize the following copyrighted 3 | material, the use of which is hereby acknowledged. 4 | 5 | _____________________ 6 | The Hugging Face team (diffusers) 7 | Apache License 8 | Version 2.0, January 2004 9 | http://www.apache.org/licenses/ 10 | 11 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 12 | 13 | 1. Definitions. 14 | 15 | "License" shall mean the terms and conditions for use, reproduction, 16 | and distribution as defined by Sections 1 through 9 of this document. 17 | 18 | "Licensor" shall mean the copyright owner or entity authorized by 19 | the copyright owner that is granting the License. 20 | 21 | "Legal Entity" shall mean the union of the acting entity and all 22 | other entities that control, are controlled by, or are under common 23 | control with that entity. For the purposes of this definition, 24 | "control" means (i) the power, direct or indirect, to cause the 25 | direction or management of such entity, whether by contract or 26 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 27 | outstanding shares, or (iii) beneficial ownership of such entity. 28 | 29 | "You" (or "Your") shall mean an individual or Legal Entity 30 | exercising permissions granted by this License. 31 | 32 | "Source" form shall mean the preferred form for making modifications, 33 | including but not limited to software source code, documentation 34 | source, and configuration files. 35 | 36 | "Object" form shall mean any form resulting from mechanical 37 | transformation or translation of a Source form, including but 38 | not limited to compiled object code, generated documentation, 39 | and conversions to other media types. 40 | 41 | "Work" shall mean the work of authorship, whether in Source or 42 | Object form, made available under the License, as indicated by a 43 | copyright notice that is included in or attached to the work 44 | (an example is provided in the Appendix below). 45 | 46 | "Derivative Works" shall mean any work, whether in Source or Object 47 | form, that is based on (or derived from) the Work and for which the 48 | editorial revisions, annotations, elaborations, or other modifications 49 | represent, as a whole, an original work of authorship. For the purposes 50 | of this License, Derivative Works shall not include works that remain 51 | separable from, or merely link (or bind by name) to the interfaces of, 52 | the Work and Derivative Works thereof. 53 | 54 | "Contribution" shall mean any work of authorship, including 55 | the original version of the Work and any modifications or additions 56 | to that Work or Derivative Works thereof, that is intentionally 57 | submitted to Licensor for inclusion in the Work by the copyright owner 58 | or by an individual or Legal Entity authorized to submit on behalf of 59 | the copyright owner. 
For the purposes of this definition, "submitted" 60 | means any form of electronic, verbal, or written communication sent 61 | to the Licensor or its representatives, including but not limited to 62 | communication on electronic mailing lists, source code control systems, 63 | and issue tracking systems that are managed by, or on behalf of, the 64 | Licensor for the purpose of discussing and improving the Work, but 65 | excluding communication that is conspicuously marked or otherwise 66 | designated in writing by the copyright owner as "Not a Contribution." 67 | 68 | "Contributor" shall mean Licensor and any individual or Legal Entity 69 | on behalf of whom a Contribution has been received by Licensor and 70 | subsequently incorporated within the Work. 71 | 72 | 2. Grant of Copyright License. Subject to the terms and conditions of 73 | this License, each Contributor hereby grants to You a perpetual, 74 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 75 | copyright license to reproduce, prepare Derivative Works of, 76 | publicly display, publicly perform, sublicense, and distribute the 77 | Work and such Derivative Works in Source or Object form. 78 | 79 | 3. Grant of Patent License. Subject to the terms and conditions of 80 | this License, each Contributor hereby grants to You a perpetual, 81 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 82 | (except as stated in this section) patent license to make, have made, 83 | use, offer to sell, sell, import, and otherwise transfer the Work, 84 | where such license applies only to those patent claims licensable 85 | by such Contributor that are necessarily infringed by their 86 | Contribution(s) alone or by combination of their Contribution(s) 87 | with the Work to which such Contribution(s) was submitted. If You 88 | institute patent litigation against any entity (including a 89 | cross-claim or counterclaim in a lawsuit) alleging that the Work 90 | or a Contribution incorporated within the Work constitutes direct 91 | or contributory patent infringement, then any patent licenses 92 | granted to You under this License for that Work shall terminate 93 | as of the date such litigation is filed. 94 | 95 | 4. Redistribution. 
You may reproduce and distribute copies of the 96 | Work or Derivative Works thereof in any medium, with or without 97 | modifications, and in Source or Object form, provided that You 98 | meet the following conditions: 99 | 100 | (a) You must give any other recipients of the Work or 101 | Derivative Works a copy of this License; and 102 | 103 | (b) You must cause any modified files to carry prominent notices 104 | stating that You changed the files; and 105 | 106 | (c) You must retain, in the Source form of any Derivative Works 107 | that You distribute, all copyright, patent, trademark, and 108 | attribution notices from the Source form of the Work, 109 | excluding those notices that do not pertain to any part of 110 | the Derivative Works; and 111 | 112 | (d) If the Work includes a "NOTICE" text file as part of its 113 | distribution, then any Derivative Works that You distribute must 114 | include a readable copy of the attribution notices contained 115 | within such NOTICE file, excluding those notices that do not 116 | pertain to any part of the Derivative Works, in at least one 117 | of the following places: within a NOTICE text file distributed 118 | as part of the Derivative Works; within the Source form or 119 | documentation, if provided along with the Derivative Works; or, 120 | within a display generated by the Derivative Works, if and 121 | wherever such third-party notices normally appear. The contents 122 | of the NOTICE file are for informational purposes only and 123 | do not modify the License. You may add Your own attribution 124 | notices within Derivative Works that You distribute, alongside 125 | or as an addendum to the NOTICE text from the Work, provided 126 | that such additional attribution notices cannot be construed 127 | as modifying the License. 128 | 129 | You may add Your own copyright statement to Your modifications and 130 | may provide additional or different license terms and conditions 131 | for use, reproduction, or distribution of Your modifications, or 132 | for any such Derivative Works as a whole, provided Your use, 133 | reproduction, and distribution of the Work otherwise complies with 134 | the conditions stated in this License. 135 | 136 | 5. Submission of Contributions. Unless You explicitly state otherwise, 137 | any Contribution intentionally submitted for inclusion in the Work 138 | by You to the Licensor shall be under the terms and conditions of 139 | this License, without any additional terms or conditions. 140 | Notwithstanding the above, nothing herein shall supersede or modify 141 | the terms of any separate license agreement you may have executed 142 | with Licensor regarding such Contributions. 143 | 144 | 6. Trademarks. This License does not grant permission to use the trade 145 | names, trademarks, service marks, or product names of the Licensor, 146 | except as required for reasonable and customary use in describing the 147 | origin of the Work and reproducing the content of the NOTICE file. 148 | 149 | 7. Disclaimer of Warranty. Unless required by applicable law or 150 | agreed to in writing, Licensor provides the Work (and each 151 | Contributor provides its Contributions) on an "AS IS" BASIS, 152 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 153 | implied, including, without limitation, any warranties or conditions 154 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 155 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 156 | appropriateness of using or redistributing the Work and assume any 157 | risks associated with Your exercise of permissions under this License. 158 | 159 | 8. Limitation of Liability. In no event and under no legal theory, 160 | whether in tort (including negligence), contract, or otherwise, 161 | unless required by applicable law (such as deliberate and grossly 162 | negligent acts) or agreed to in writing, shall any Contributor be 163 | liable to You for damages, including any direct, indirect, special, 164 | incidental, or consequential damages of any character arising as a 165 | result of this License or out of the use or inability to use the 166 | Work (including but not limited to damages for loss of goodwill, 167 | work stoppage, computer failure or malfunction, or any and all 168 | other commercial damages or losses), even if such Contributor 169 | has been advised of the possibility of such damages. 170 | 171 | 9. Accepting Warranty or Additional Liability. While redistributing 172 | the Work or Derivative Works thereof, You may choose to offer, 173 | and charge a fee for, acceptance of support, warranty, indemnity, 174 | or other liability obligations and/or rights consistent with this 175 | License. However, in accepting such obligations, You may act only 176 | on Your own behalf and on Your sole responsibility, not on behalf 177 | of any other Contributor, and only if You agree to indemnify, 178 | defend, and hold each Contributor harmless for any liability 179 | incurred by, or claims asserted against, such Contributor by reason 180 | of your accepting any such warranty or additional liability. 181 | 182 | END OF TERMS AND CONDITIONS 183 | 184 | APPENDIX: How to apply the Apache License to your work. 185 | 186 | To apply the Apache License to your work, attach the following 187 | boilerplate notice, with the fields enclosed by brackets "[]" 188 | replaced with your own identifying information. (Don't include 189 | the brackets!) The text should be enclosed in the appropriate 190 | comment syntax for the file format. We also recommend that a 191 | file or class name and description of purpose be included on the 192 | same "printed page" as the copyright notice for easier 193 | identification within third-party archives. 194 | 195 | Copyright [yyyy] [name of copyright owner] 196 | 197 | Licensed under the Apache License, Version 2.0 (the "License"); 198 | you may not use this file except in compliance with the License. 199 | You may obtain a copy of the License at 200 | 201 | http://www.apache.org/licenses/LICENSE-2.0 202 | 203 | Unless required by applicable law or agreed to in writing, software 204 | distributed under the License is distributed on an "AS IS" BASIS, 205 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 206 | See the License for the specific language governing permissions and 207 | limitations under the License. 208 | 209 | 210 | The Hugging Face team (transformers) 211 | Copyright 2018- The Hugging Face team. All rights reserved. 212 | 213 | Apache License 214 | Version 2.0, January 2004 215 | http://www.apache.org/licenses/ 216 | 217 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 218 | 219 | 1. Definitions. 220 | 221 | "License" shall mean the terms and conditions for use, reproduction, 222 | and distribution as defined by Sections 1 through 9 of this document. 
223 | 224 | "Licensor" shall mean the copyright owner or entity authorized by 225 | the copyright owner that is granting the License. 226 | 227 | "Legal Entity" shall mean the union of the acting entity and all 228 | other entities that control, are controlled by, or are under common 229 | control with that entity. For the purposes of this definition, 230 | "control" means (i) the power, direct or indirect, to cause the 231 | direction or management of such entity, whether by contract or 232 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 233 | outstanding shares, or (iii) beneficial ownership of such entity. 234 | 235 | "You" (or "Your") shall mean an individual or Legal Entity 236 | exercising permissions granted by this License. 237 | 238 | "Source" form shall mean the preferred form for making modifications, 239 | including but not limited to software source code, documentation 240 | source, and configuration files. 241 | 242 | "Object" form shall mean any form resulting from mechanical 243 | transformation or translation of a Source form, including but 244 | not limited to compiled object code, generated documentation, 245 | and conversions to other media types. 246 | 247 | "Work" shall mean the work of authorship, whether in Source or 248 | Object form, made available under the License, as indicated by a 249 | copyright notice that is included in or attached to the work 250 | (an example is provided in the Appendix below). 251 | 252 | "Derivative Works" shall mean any work, whether in Source or Object 253 | form, that is based on (or derived from) the Work and for which the 254 | editorial revisions, annotations, elaborations, or other modifications 255 | represent, as a whole, an original work of authorship. For the purposes 256 | of this License, Derivative Works shall not include works that remain 257 | separable from, or merely link (or bind by name) to the interfaces of, 258 | the Work and Derivative Works thereof. 259 | 260 | "Contribution" shall mean any work of authorship, including 261 | the original version of the Work and any modifications or additions 262 | to that Work or Derivative Works thereof, that is intentionally 263 | submitted to Licensor for inclusion in the Work by the copyright owner 264 | or by an individual or Legal Entity authorized to submit on behalf of 265 | the copyright owner. For the purposes of this definition, "submitted" 266 | means any form of electronic, verbal, or written communication sent 267 | to the Licensor or its representatives, including but not limited to 268 | communication on electronic mailing lists, source code control systems, 269 | and issue tracking systems that are managed by, or on behalf of, the 270 | Licensor for the purpose of discussing and improving the Work, but 271 | excluding communication that is conspicuously marked or otherwise 272 | designated in writing by the copyright owner as "Not a Contribution." 273 | 274 | "Contributor" shall mean Licensor and any individual or Legal Entity 275 | on behalf of whom a Contribution has been received by Licensor and 276 | subsequently incorporated within the Work. 277 | 278 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 279 | this License, each Contributor hereby grants to You a perpetual, 280 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 281 | copyright license to reproduce, prepare Derivative Works of, 282 | publicly display, publicly perform, sublicense, and distribute the 283 | Work and such Derivative Works in Source or Object form. 284 | 285 | 3. Grant of Patent License. Subject to the terms and conditions of 286 | this License, each Contributor hereby grants to You a perpetual, 287 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 288 | (except as stated in this section) patent license to make, have made, 289 | use, offer to sell, sell, import, and otherwise transfer the Work, 290 | where such license applies only to those patent claims licensable 291 | by such Contributor that are necessarily infringed by their 292 | Contribution(s) alone or by combination of their Contribution(s) 293 | with the Work to which such Contribution(s) was submitted. If You 294 | institute patent litigation against any entity (including a 295 | cross-claim or counterclaim in a lawsuit) alleging that the Work 296 | or a Contribution incorporated within the Work constitutes direct 297 | or contributory patent infringement, then any patent licenses 298 | granted to You under this License for that Work shall terminate 299 | as of the date such litigation is filed. 300 | 301 | 4. Redistribution. You may reproduce and distribute copies of the 302 | Work or Derivative Works thereof in any medium, with or without 303 | modifications, and in Source or Object form, provided that You 304 | meet the following conditions: 305 | 306 | (a) You must give any other recipients of the Work or 307 | Derivative Works a copy of this License; and 308 | 309 | (b) You must cause any modified files to carry prominent notices 310 | stating that You changed the files; and 311 | 312 | (c) You must retain, in the Source form of any Derivative Works 313 | that You distribute, all copyright, patent, trademark, and 314 | attribution notices from the Source form of the Work, 315 | excluding those notices that do not pertain to any part of 316 | the Derivative Works; and 317 | 318 | (d) If the Work includes a "NOTICE" text file as part of its 319 | distribution, then any Derivative Works that You distribute must 320 | include a readable copy of the attribution notices contained 321 | within such NOTICE file, excluding those notices that do not 322 | pertain to any part of the Derivative Works, in at least one 323 | of the following places: within a NOTICE text file distributed 324 | as part of the Derivative Works; within the Source form or 325 | documentation, if provided along with the Derivative Works; or, 326 | within a display generated by the Derivative Works, if and 327 | wherever such third-party notices normally appear. The contents 328 | of the NOTICE file are for informational purposes only and 329 | do not modify the License. You may add Your own attribution 330 | notices within Derivative Works that You distribute, alongside 331 | or as an addendum to the NOTICE text from the Work, provided 332 | that such additional attribution notices cannot be construed 333 | as modifying the License. 
334 | 335 | You may add Your own copyright statement to Your modifications and 336 | may provide additional or different license terms and conditions 337 | for use, reproduction, or distribution of Your modifications, or 338 | for any such Derivative Works as a whole, provided Your use, 339 | reproduction, and distribution of the Work otherwise complies with 340 | the conditions stated in this License. 341 | 342 | 5. Submission of Contributions. Unless You explicitly state otherwise, 343 | any Contribution intentionally submitted for inclusion in the Work 344 | by You to the Licensor shall be under the terms and conditions of 345 | this License, without any additional terms or conditions. 346 | Notwithstanding the above, nothing herein shall supersede or modify 347 | the terms of any separate license agreement you may have executed 348 | with Licensor regarding such Contributions. 349 | 350 | 6. Trademarks. This License does not grant permission to use the trade 351 | names, trademarks, service marks, or product names of the Licensor, 352 | except as required for reasonable and customary use in describing the 353 | origin of the Work and reproducing the content of the NOTICE file. 354 | 355 | 7. Disclaimer of Warranty. Unless required by applicable law or 356 | agreed to in writing, Licensor provides the Work (and each 357 | Contributor provides its Contributions) on an "AS IS" BASIS, 358 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 359 | implied, including, without limitation, any warranties or conditions 360 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 361 | PARTICULAR PURPOSE. You are solely responsible for determining the 362 | appropriateness of using or redistributing the Work and assume any 363 | risks associated with Your exercise of permissions under this License. 364 | 365 | 8. Limitation of Liability. In no event and under no legal theory, 366 | whether in tort (including negligence), contract, or otherwise, 367 | unless required by applicable law (such as deliberate and grossly 368 | negligent acts) or agreed to in writing, shall any Contributor be 369 | liable to You for damages, including any direct, indirect, special, 370 | incidental, or consequential damages of any character arising as a 371 | result of this License or out of the use or inability to use the 372 | Work (including but not limited to damages for loss of goodwill, 373 | work stoppage, computer failure or malfunction, or any and all 374 | other commercial damages or losses), even if such Contributor 375 | has been advised of the possibility of such damages. 376 | 377 | 9. Accepting Warranty or Additional Liability. While redistributing 378 | the Work or Derivative Works thereof, You may choose to offer, 379 | and charge a fee for, acceptance of support, warranty, indemnity, 380 | or other liability obligations and/or rights consistent with this 381 | License. However, in accepting such obligations, You may act only 382 | on Your own behalf and on Your sole responsibility, not on behalf 383 | of any other Contributor, and only if You agree to indemnify, 384 | defend, and hold each Contributor harmless for any liability 385 | incurred by, or claims asserted against, such Contributor by reason 386 | of your accepting any such warranty or additional liability. 387 | 388 | END OF TERMS AND CONDITIONS 389 | 390 | APPENDIX: How to apply the Apache License to your work. 
391 | 392 | To apply the Apache License to your work, attach the following 393 | boilerplate notice, with the fields enclosed by brackets "[]" 394 | replaced with your own identifying information. (Don't include 395 | the brackets!) The text should be enclosed in the appropriate 396 | comment syntax for the file format. We also recommend that a 397 | file or class name and description of purpose be included on the 398 | same "printed page" as the copyright notice for easier 399 | identification within third-party archives. 400 | 401 | Copyright [yyyy] [name of copyright owner] 402 | 403 | Licensed under the Apache License, Version 2.0 (the "License"); 404 | you may not use this file except in compliance with the License. 405 | You may obtain a copy of the License at 406 | 407 | http://www.apache.org/licenses/LICENSE-2.0 408 | 409 | Unless required by applicable law or agreed to in writing, software 410 | distributed under the License is distributed on an "AS IS" BASIS, 411 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 412 | See the License for the specific language governing permissions and 413 | limitations under the License. 414 | 415 | 416 | Facebook, Inc (PyTorch) 417 | From PyTorch: 418 | 419 | Copyright (c) 2016- Facebook, Inc (Adam Paszke) 420 | Copyright (c) 2014- Facebook, Inc (Soumith Chintala) 421 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 422 | Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) 423 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 424 | Copyright (c) 2011-2013 NYU (Clement Farabet) 425 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 426 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 427 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 428 | 429 | From Caffe2: 430 | 431 | Copyright (c) 2016-present, Facebook Inc. All rights reserved. 432 | 433 | All contributions by Facebook: 434 | Copyright (c) 2016 Facebook Inc. 435 | 436 | All contributions by Google: 437 | Copyright (c) 2015 Google Inc. 438 | All rights reserved. 439 | 440 | All contributions by Yangqing Jia: 441 | Copyright (c) 2015 Yangqing Jia 442 | All rights reserved. 443 | 444 | All contributions by Kakao Brain: 445 | Copyright 2019-2020 Kakao Brain 446 | 447 | All contributions by Cruise LLC: 448 | Copyright (c) 2022 Cruise LLC. 449 | All rights reserved. 450 | 451 | All contributions from Caffe: 452 | Copyright(c) 2013, 2014, 2015, the respective contributors 453 | All rights reserved. 454 | 455 | All other contributions: 456 | Copyright(c) 2015, 2016 the respective contributors 457 | All rights reserved. 458 | 459 | Caffe2 uses a copyright model similar to Caffe: each contributor holds 460 | copyright over their contributions to Caffe2. The project versioning records 461 | all such contribution and copyright details. If a contributor wants to further 462 | mark their specific copyright on a particular contribution, they should 463 | indicate their copyright solely in the commit message of the change when it is 464 | committed. 465 | 466 | All rights reserved. 467 | 468 | Redistribution and use in source and binary forms, with or without 469 | modification, are permitted provided that the following conditions are met: 470 | 471 | 1. Redistributions of source code must retain the above copyright 472 | notice, this list of conditions and the following disclaimer. 473 | 474 | 2. 
Redistributions in binary form must reproduce the above copyright 475 | notice, this list of conditions and the following disclaimer in the 476 | documentation and/or other materials provided with the distribution. 477 | 478 | 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America 479 | and IDIAP Research Institute nor the names of its contributors may be 480 | used to endorse or promote products derived from this software without 481 | specific prior written permission. 482 | 483 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 484 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 485 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 486 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 487 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 488 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 489 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 490 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 491 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 492 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 493 | POSSIBILITY OF SUCH DAMAGE. 494 | 495 | NumPy (RandomKit 1.3) 496 | 497 | Copyright (c) 2003-2005, Jean-Sebastien Roy (js@jeannot.org) 498 | 499 | The rk_random and rk_seed functions algorithms and the original design of 500 | the Mersenne Twister RNG: 501 | 502 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 503 | All rights reserved. 504 | 505 | Redistribution and use in source and binary forms, with or without 506 | modification, are permitted provided that the following conditions 507 | are met: 508 | 509 | 1. Redistributions of source code must retain the above copyright 510 | notice, this list of conditions and the following disclaimer. 511 | 512 | 2. Redistributions in binary form must reproduce the above copyright 513 | notice, this list of conditions and the following disclaimer in the 514 | documentation and/or other materials provided with the distribution. 515 | 516 | 3. The names of its contributors may not be used to endorse or promote 517 | products derived from this software without specific prior written 518 | permission. 519 | 520 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 521 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 522 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 523 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 524 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 525 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 526 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 527 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 528 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 529 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 530 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 531 | 532 | Original algorithm for the implementation of rk_interval function from 533 | Richard J. Wagner's implementation of the Mersenne Twister RNG, optimised by 534 | Magnus Jonsson. 535 | 536 | Constants used in the rk_double implementation by Isaku Wada. 
537 | 538 | Permission is hereby granted, free of charge, to any person obtaining a 539 | copy of this software and associated documentation files (the 540 | "Software"), to deal in the Software without restriction, including 541 | without limitation the rights to use, copy, modify, merge, publish, 542 | distribute, sublicense, and/or sell copies of the Software, and to 543 | permit persons to whom the Software is furnished to do so, subject to 544 | the following conditions: 545 | 546 | The above copyright notice and this permission notice shall be included 547 | in all copies or substantial portions of the Software. 548 | 549 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 550 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 551 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 552 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 553 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 554 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 555 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 556 | --------------------------------------------------------------------------------