├── .gitignore
├── .gitattributes
├── model_signing
    ├── install
    │   ├── requirements_test.in
    │   ├── requirements.in
    │   ├── requirements_test_Darwin.txt
    │   ├── requirements_test_Linux.txt
    │   └── requirements_test_Windows.txt
    ├── images
    │   └── sigstore-model-diagram.png
    ├── __init__.py
    ├── hashing
    │   ├── __init__.py
    │   ├── precomputed.py
    │   ├── precomputed_test.py
    │   ├── memory.py
    │   ├── memory_test.py
    │   ├── hashing.py
    │   ├── file.py
    │   └── file_test.py
    ├── main.py
    ├── benchmarks
    │   └── run.sh
    ├── model.py
    ├── README.md
    └── serialize.py
├── CODEOWNERS
├── slsa_for_models
    ├── install
    │   └── requirements.in
    ├── images
    │   ├── slsa_results.png
    │   └── slsa_trigger.png
    ├── kubeflow
    │   ├── images
    │   │   ├── clone
    │   │   │   ├── Dockerfile
    │   │   │   └── clone.sh
    │   │   ├── build_model
    │   │   │   ├── Dockerfile
    │   │   │   └── build.sh
    │   │   └── upload_model
    │   │   │   ├── Dockerfile
    │   │   │   └── upload.sh
    │   ├── README.md
    │   └── model_transparency.py
    ├── gcp
    │   ├── pipelinerun.yml
    │   ├── tasks
    │   │   ├── build-model.yml
    │   │   ├── upload-model.yml
    │   │   └── git-clone.yml
    │   ├── pipeline.yml
    │   └── README.md
    ├── main.py
    ├── github_actions.md
    ├── README.md
    ├── tensorflow_cifar10.py
    └── pytorch_cifar10.py
├── .github
    ├── workflows
    │   ├── scripts
    │   │   └── venv_activate.sh
    │   ├── dependency_review.yml
    │   ├── unit_tests.yml
    │   ├── lint.yml
    │   ├── validate_deps.yml
    │   ├── codeql.yml
    │   ├── scorecard.yml
    │   ├── slsa_for_ml.yml
    │   └── pin_deps.yml
    └── dependabot.yml
├── CONTRIBUTING.md
├── README.md
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 | 


--------------------------------------------------------------------------------
/model_signing/install/requirements_test.in:
--------------------------------------------------------------------------------
1 | pytest
2 | 


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @sigstore/model-transparency-codeowners
2 | 


--------------------------------------------------------------------------------
/model_signing/install/requirements.in:
--------------------------------------------------------------------------------
1 | psutil
2 | sigstore
3 | 


--------------------------------------------------------------------------------
/slsa_for_models/install/requirements.in:
--------------------------------------------------------------------------------
1 | tensorflow
2 | tensorflow-datasets
3 | torch
4 | torchvision
5 | 


--------------------------------------------------------------------------------
/slsa_for_models/images/slsa_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/font/model-transparency/main/slsa_for_models/images/slsa_results.png


--------------------------------------------------------------------------------
/slsa_for_models/images/slsa_trigger.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/font/model-transparency/main/slsa_for_models/images/slsa_trigger.png


--------------------------------------------------------------------------------
/model_signing/images/sigstore-model-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/font/model-transparency/main/model_signing/images/sigstore-model-diagram.png


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/images/clone/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu
2 | ARG DEBIAN_FRONTEND=noninteractive
3 | RUN apt update && apt install git-all -y
4 | COPY . /src
5 | WORKDIR /src
6 | RUN chmod +x /src/clone.sh
7 | 


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/images/build_model/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/python:3.11
2 | ARG DEBIAN_FRONTEND=noninteractive
3 | RUN apt-get update && apt-get install coreutils -y
4 | COPY . /src
5 | WORKDIR /src
6 | RUN chmod +x /src/build.sh
7 | 


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/images/upload_model/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/google.com/cloudsdktool/cloud-sdk:379.0.0-slim@sha256:d844877c7aaa06a0072979230c68417ddb0f27087277f29747c7169d6ed0d2b9
2 | ARG DEBIAN_FRONTEND=noninteractive
3 | RUN apt-get update && apt-get install coreutils -y
4 | COPY . /src
5 | WORKDIR /src
6 | RUN chmod +x /src/upload.sh
7 | 


--------------------------------------------------------------------------------
/.github/workflows/scripts/venv_activate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # shellcheck source=/dev/null
 4 | if [[ -f venv/bin/activate ]]; then
 5 |   source venv/bin/activate
 6 | elif [[ -f venv/Scripts/activate ]]; then
 7 |   source venv/Scripts/activate
 8 | else
 9 |   echo "Cannot activate venv sandbox. Failing"
10 |   exit 1
11 | fi
12 | 
13 | echo "Successfully activated venv sandbox. Python is at `which python`"
14 | 


--------------------------------------------------------------------------------
/model_signing/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/slsa_for_models/gcp/pipelinerun.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: tekton.dev/v1
 2 | kind: PipelineRun
 3 | metadata:
 4 |   generateName: slsa-for-models-
 5 | spec:
 6 |   params:
 7 |     - name: model-name
 8 |       value: 'pytorch_model.pth'
 9 |     - name: model-storage
10 |       value:
11 |         package: 'pytorch-model'
12 |         location: 'us'
13 |         repository: 'ml-artifacts'
14 |   pipelineRef:
15 |     name: slsa-for-models
16 |   workspaces:
17 |     - name: shared
18 |       volumeClaimTemplate:
19 |         spec:
20 |           accessModes:
21 |             - ReadWriteOnce
22 |           resources:
23 |             requests:
24 |               storage: 1Gi
25 | 


--------------------------------------------------------------------------------
/model_signing/hashing/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 


--------------------------------------------------------------------------------
/.github/workflows/dependency_review.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | name: 'Dependency Review'
16 | on:
17 |   pull_request:
18 |     branches: [main]
19 |     types: [opened, synchronize]
20 | 
21 | permissions:
22 |   contents: read
23 | 
24 | jobs:
25 |   dependency-review:
26 |     name: License and Vulnerability Scan
27 |     uses: sigstore/community/.github/workflows/reusable-dependency-review.yml@8cc8d600fbf3012b9d9d84a499423fa96afa3765
28 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # Copyright Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | version: 2
16 | # See https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file.
17 | updates:
18 |   - package-ecosystem: "pip"
19 |     directory: "/"
20 |     schedule:
21 |       interval: "weekly"
22 |     groups:
23 |       all:
24 |         patterns:
25 |         - "*"
26 |   - package-ecosystem: "github-actions"
27 |     directory: "/"
28 |     schedule:
29 |       interval: "weekly"
30 |     groups:
31 |       all:
32 |         patterns:
33 |         - "*"
34 | 


--------------------------------------------------------------------------------
/model_signing/install/requirements_test_Darwin.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is autogenerated by pip-compile with Python 3.11
 3 | # by the following command:
 4 | #
 5 | #    pip-compile --generate-hashes --output-file=model_signing/install/requirements_test_Darwin.txt --strip-extras model_signing/install/requirements_test.in
 6 | #
 7 | iniconfig==2.0.0 \
 8 |     --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \
 9 |     --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374
10 |     # via pytest
11 | packaging==24.0 \
12 |     --hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \
13 |     --hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9
14 |     # via pytest
15 | pluggy==1.5.0 \
16 |     --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \
17 |     --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669
18 |     # via pytest
19 | pytest==8.2.1 \
20 |     --hash=sha256:5046e5b46d8e4cac199c373041f26be56fdb81eb4e67dc11d4e10811fc3408fd \
21 |     --hash=sha256:faccc5d332b8c3719f40283d0d44aa5cf101cec36f88cde9ed8f2bc0538612b1
22 |     # via -r model_signing/install/requirements_test.in
23 | 


--------------------------------------------------------------------------------
/model_signing/install/requirements_test_Linux.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is autogenerated by pip-compile with Python 3.11
 3 | # by the following command:
 4 | #
 5 | #    pip-compile --generate-hashes --output-file=model_signing/install/requirements_test_Linux.txt --strip-extras model_signing/install/requirements_test.in
 6 | #
 7 | iniconfig==2.0.0 \
 8 |     --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \
 9 |     --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374
10 |     # via pytest
11 | packaging==24.0 \
12 |     --hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \
13 |     --hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9
14 |     # via pytest
15 | pluggy==1.5.0 \
16 |     --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \
17 |     --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669
18 |     # via pytest
19 | pytest==8.2.1 \
20 |     --hash=sha256:5046e5b46d8e4cac199c373041f26be56fdb81eb4e67dc11d4e10811fc3408fd \
21 |     --hash=sha256:faccc5d332b8c3719f40283d0d44aa5cf101cec36f88cde9ed8f2bc0538612b1
22 |     # via -r model_signing/install/requirements_test.in
23 | 


--------------------------------------------------------------------------------
/slsa_for_models/gcp/tasks/build-model.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: tekton.dev/v1
 2 | kind: Task
 3 | metadata:
 4 |   name: build-model
 5 | spec:
 6 |   workspaces:
 7 |     - name: source
 8 |   params:
 9 |     - name: tool-versions
10 |       properties:
11 |         python: { }
12 |         bash: { }
13 |       default:
14 |         python: '3.11'
15 |         bash: 'latest'
16 |     - name: model-source
17 |       properties:
18 |         requirements-path: {}
19 |         main-path: {}
20 |     - name: model-name
21 |       enum:
22 |         - 'tensorflow_model.keras'
23 |         - 'tensorflow_hdf5_model.h5'
24 |         - 'tensorflow_hdf5.weights.h5'
25 |         - 'pytorch_model.pth'
26 |         - 'pytorch_full_model.pth'
27 |         - 'pytorch_jitted_model.pt'
28 |   results:
29 |     - name: digest
30 |   steps:
31 |     - name: run-script
32 |       image: docker.io/python:$(params.tool-versions.python)
33 |       workingDir: $(workspaces.source.path)
34 |       script: |
35 |         python -m pip install --require-hashes -r $(params.model-source.requirements-path)
36 |         python $(params.model-source.main-path) $(params.model-name)
37 |     - name: compute-digest
38 |       image: bash:$(params.tool-versions.bash)
39 |       workingDir: $(workspaces.source.path)
40 |       script:
41 |         sha256sum $(params.model-name) | awk '{print $1}' | tr -d '\n' | tee $(results.digest.path)
42 | 


--------------------------------------------------------------------------------
/model_signing/install/requirements_test_Windows.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # This file is autogenerated by pip-compile with Python 3.11
 3 | # by the following command:
 4 | #
 5 | #    pip-compile --generate-hashes --output-file=model_signing/install/requirements_test_Windows.txt --strip-extras model_signing/install/requirements_test.in
 6 | #
 7 | colorama==0.4.6 \
 8 |     --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \
 9 |     --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6
10 |     # via pytest
11 | iniconfig==2.0.0 \
12 |     --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \
13 |     --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374
14 |     # via pytest
15 | packaging==24.0 \
16 |     --hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \
17 |     --hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9
18 |     # via pytest
19 | pluggy==1.5.0 \
20 |     --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \
21 |     --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669
22 |     # via pytest
23 | pytest==8.2.1 \
24 |     --hash=sha256:5046e5b46d8e4cac199c373041f26be56fdb81eb4e67dc11d4e10811fc3408fd \
25 |     --hash=sha256:faccc5d332b8c3719f40283d0d44aa5cf101cec36f88cde9ed8f2bc0538612b1
26 |     # via -r model_signing/install/requirements_test.in
27 | 


--------------------------------------------------------------------------------
/model_signing/hashing/precomputed.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Precomputed digests.
16 | 
17 | In order to support digests computed by external tooling, we provide trivial
18 | `HashEngine` instances that just wrap around the digest.
19 | 
20 | Example usage:
21 | ```python
22 | >>> hasher = PrecomputedDigest("short-hash", b"abcd")
23 | >>> digest = hasher.compute()
24 | >>> digest.digest_hex
25 | '61626364'
26 | >>> digest.algorithm
27 | 'short-hash'
28 | ```
29 | """
30 | 
31 | from dataclasses import dataclass
32 | from typing_extensions import override
33 | 
34 | from model_signing.hashing import hashing
35 | 
36 | 
37 | @dataclass(frozen=True)
38 | class PrecomputedDigest(hashing.HashEngine):
39 |     """A wrapper around digests computed by external tooling."""
40 | 
41 |     _digest_type: str
42 |     _digest_value: bytes
43 | 
44 |     @override
45 |     def compute(self) -> hashing.Digest:
46 |         return hashing.Digest(self._digest_type, self._digest_value)
47 | 
48 |     @override
49 |     @property
50 |     def digest_name(self) -> str:
51 |         return self._digest_type
52 | 


--------------------------------------------------------------------------------
/slsa_for_models/main.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import argparse
16 | 
17 | import tensorflow_cifar10 as tf
18 | import pytorch_cifar10 as pt
19 | 
20 | 
21 | def readOptions():
22 |     parser = argparse.ArgumentParser('Train CIFAR10 models with TF/PT')
23 |     model_formats = list(tf.supported_models().keys())
24 |     model_formats += list(pt.supported_models().keys())
25 |     parser.add_argument('model', choices=model_formats,
26 |                         help='Model to generate (name implies framework)')
27 |     return parser.parse_args()
28 | 
29 | 
30 | def main(args):
31 |     model_formats = list(tf.supported_models().keys())
32 |     for model_format in model_formats:
33 |         if args.model == model_format:
34 |             return tf.model_pipeline(args.model)
35 | 
36 |     model_formats = list(pt.supported_models().keys())
37 |     for model_format in model_formats:
38 |         if args.model == model_format:
39 |             return pt.model_pipeline(args.model)
40 | 
41 |     # we should not reach this case in the normal flow, but cover all corners
42 |     raise ValueError("Model format not supported")
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     args = readOptions()
47 |     main(args)
48 | 


--------------------------------------------------------------------------------
/.github/workflows/unit_tests.yml:
--------------------------------------------------------------------------------
 1 | name: Run unit tests
 2 | on:
 3 |   pull_request:
 4 |     branches: [main]
 5 |     types: [opened, synchronize]
 6 |     paths-ignore:
 7 |       - '**/*.md'
 8 |       - '*.md'
 9 | 
10 | permissions: {}
11 | 
12 | defaults:
13 |   run:
14 |     shell: bash
15 | 
16 | jobs:
17 |   model-signing-unit-tests:
18 |     name: Run unit tests for signing
19 |     runs-on: ${{ matrix.os }}
20 |     strategy:
21 |       fail-fast: false # Don't cancel other jobs if one fails
22 |       matrix:
23 |         os: [ubuntu-latest, macos-latest, windows-latest]
24 |         include:
25 |           - os: macos-latest
26 |             os_family: Darwin
27 |           - os: ubuntu-latest
28 |             os_family: Linux
29 |           - os: windows-latest
30 |             os_family: Windows
31 |     steps:
32 |     - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
33 |     - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
34 |       with:
35 |         python-version: 3.11
36 |         cache: pip
37 |         cache-dependency-path: |
38 |           model_signing/install/requirements_${{ matrix.os_family }}.txt
39 |           model_signing/install/requirements_test_${{ matrix.os_family }}.txt
40 |     - name: Install dependencies
41 |       run: |
42 |         set -exuo pipefail
43 |         python -m venv venv
44 |         .github/workflows/scripts/venv_activate.sh
45 |         python -m pip install --require-hashes -r model_signing/install/requirements_${{ matrix.os_family }}.txt
46 |         python -m pip install --require-hashes -r model_signing/install/requirements_test_${{ matrix.os_family }}.txt
47 |     - name: Run unit tests
48 |       run: |
49 |         set -euo pipefail
50 |         .github/workflows/scripts/venv_activate.sh
51 |         # NOTE: option --full-trace may be useful for troubleshooting.
52 |         # TODO(#68): Remove the need to create this folder.
53 |         mkdir testdata
54 |         pytest -v .
55 | 


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/images/clone/clone.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | ############################################################
 3 | # Help                                                     #
 4 | ############################################################
 5 | Help()
 6 | {
 7 |    # Display Help
 8 |    echo "Add description of the script functions here."
 9 |    echo
10 |    echo "Syntax: scriptTemplate [h|u|p|c|t]"
11 |    echo "options:"
12 |    echo "u     Url of the repo to clone"
13 |    echo "t     Target of the repo to clone"
14 |    echo "p     Path to the url result"
15 |    echo "c     Path to commit result"
16 |    echo "h       Print this Help."
17 |    echo
18 | }
19 | 
20 | ############################################################
21 | ############################################################
22 | # Main program                                             #
23 | ############################################################
24 | ############################################################
25 | 
26 | url=""
27 | resultPathUrl=""
28 | resultPathCommit=""
29 | 
30 | ############################################################
31 | # Process the input options. Add options as needed.        #
32 | ############################################################
33 | # Get the options
34 | while getopts ":h:u:p:c:t:" option; do
35 |    case $option in
36 |       h) # display Help
37 |          Help
38 |          exit;;
39 |       u) # clone url
40 | 	 url=$OPTARG;;
41 |       p) # result path url
42 | 	 resultPathUrl=$OPTARG;;
43 |       c) # result path commit
44 | 	 resultPathCommit=$OPTARG;;
45 |       t) # result path commit
46 | 	 target=$OPTARG;;
47 |       \?) # Invalid option
48 |          echo "Error: Invalid option"
49 |          exit;;
50 |    esac
51 | done
52 | 
53 | echo "cloning $url into ${target}"
54 | git clone ${url} ${target}
55 | cd ${target}
56 | RESULT_SHA=$(git rev-parse HEAD)
57 | printf "%s" "${RESULT_SHA}" > ${resultPathCommit}
58 | printf "%s" "${url}" > ${resultPathUrl}
59 | 


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/images/build_model/build.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | ############################################################
 3 | # Help                                                     #
 4 | ############################################################
 5 | Help()
 6 | {
 7 |    # Display Help
 8 |    echo "Add description of the script functions here."
 9 |    echo
10 |    echo "Syntax: scriptTemplate [h|r|s|m|d]"
11 |    echo "options:"
12 |    echo "r     Requirements path"
13 |    echo "s     source code path"
14 |    echo "m     model name"
15 |    echo "d     Path to digest result"
16 |    echo "h       Print this Help."
17 |    echo
18 | }
19 | 
20 | ############################################################
21 | ############################################################
22 | # Main program                                             #
23 | ############################################################
24 | ############################################################
25 | 
26 | ############################################################
27 | # Process the input options. Add options as needed.        #
28 | ############################################################
29 | # Get the options
30 | while getopts ":h:r:w:s:m:d:" option; do
31 |    case $option in
32 |       h) # display Help
33 |          Help
34 |          exit;;
35 |       r) # requirements path
36 | 	 requirements=$OPTARG;;
37 |       w) # workingDir
38 | 	 workingDir=$OPTARG;;
39 |       s) # source code path
40 | 	 sourcePath=$OPTARG;;
41 |       d) # result path digest
42 | 	 resultPathDigest=$OPTARG;;
43 |       m) # model name
44 | 	 model=$OPTARG;;
45 |       \?) # Invalid option
46 |          echo "Error: Invalid option"
47 |          exit;;
48 |    esac
49 | done
50 | 
51 | cd ${workingDir}
52 | python -m pip install --require-hashes -r ${requirements}
53 | python ${sourcePath} ${model}
54 | sha256sum ${model} | awk '{print $1}' | tr -d '\n' | tee ${resultPathDigest}
55 | echo "done..."
56 | echo ${workingDir}
57 | ls -lh
58 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | name: Lint
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches: [main]
 6 |     types: [opened, synchronize]
 7 | 
 8 | permissions: read-all
 9 | 
10 | jobs:
11 |   flake8-lint:
12 |     runs-on: ubuntu-latest
13 |     name: Lint
14 |     steps:
15 |       - name: Check out source repository
16 |         uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
17 |       - name: Set up Python environment
18 |         uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
19 |         with:
20 |           python-version: "3.11"
21 |       - name: flake8 Lint
22 |         uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2.3.0
23 |         with:
24 |           max-line-length: "80"
25 |       - name: Detect empty lines at end of file and trailing whitespace
26 |         run: |
27 |           set -euxo pipefail  # No -x here!
28 |           failed=0
29 |           # First, check for empty files at end
30 |           for file in $(git ls-files --eol | grep 'i/[cr]*lf' | awk '{print $4}'); do
31 |             lines=$(tac "$file" | awk 'NF{exit};END{print NR?NR-1:0}')
32 |             if [[ $lines -ne 0 ]]; then
33 |               line=$(wc -l "$file" | cut -d' ' -f1)
34 |               echo "::error file=$file,line=$line::File $file has $lines empty lines at end. Please remove."
35 |               failed=$((failed + 1))
36 |             fi
37 |           done
38 |           # Next, check for files with whitespace at end of line. Remove CRLF files.
39 |           for file in $(git ls-files --eol | grep 'i/lf' | awk '{print $4}'); do
40 |             for line in $(grep -n '[[:space:]]$' "$file" | cut -d: -f1); do
41 |               echo "::error file=$file,line=$line::File $file has trailing whitespace at line $line. Please remove."
42 |               failed=$((failed + 1))
43 |             done
44 |           done
45 |           if [[ $failed -ne 0 ]]; then
46 |             echo "::error Found $failed whitespace errors, failing"
47 |             exit 1
48 |           fi
49 | 


--------------------------------------------------------------------------------
/model_signing/hashing/precomputed_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from model_signing.hashing import precomputed
16 | 
17 | 
18 | class TestPrecomputedDigest:
19 | 
20 |     def test_compute_does_not_change_hash(self):
21 |         hash_value = b"value"
22 |         hasher = precomputed.PrecomputedDigest("test", hash_value)
23 |         digest = hasher.compute()
24 |         assert digest.digest_value == hash_value
25 |         digest = hasher.compute()
26 |         assert digest.digest_value == hash_value
27 | 
28 |     def test_expected_hash_and_hex(self):
29 |         hash_value = b"abcd"
30 |         hash_hex_value = "61626364"
31 |         hasher = precomputed.PrecomputedDigest("test", hash_value)
32 |         digest = hasher.compute()
33 |         assert digest.digest_value == hash_value
34 |         assert digest.digest_hex == hash_hex_value
35 | 
36 |     def test_expected_hash_and_hex_unicode(self):
37 |         hash_value = "*哈¥эш希".encode("utf-8")
38 |         hash_hex_value = "2ae59388c2a5d18dd188e5b88c"
39 |         hasher = precomputed.PrecomputedDigest("test", hash_value)
40 |         digest = hasher.compute()
41 |         assert digest.digest_value == hash_value
42 |         assert digest.digest_hex == hash_hex_value
43 | 
44 |     def test_expected_hash_type(self):
45 |         hasher = precomputed.PrecomputedDigest("test", b"abcd")
46 |         assert hasher.digest_name == "test"
47 |         digest = hasher.compute()
48 |         assert digest.algorithm == "test"
49 | 


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/images/upload_model/upload.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | ############################################################
 3 | # Help                                                     #
 4 | ############################################################
 5 | Help()
 6 | {
 7 |    # Display Help
 8 |    echo "Add description of the script functions here."
 9 |    echo
10 |    echo "Syntax: scriptTemplate [h|r|s|m|d]"
11 |    echo "options:"
12 |    echo "r     Requirements path"
13 |    echo "s     source code path"
14 |    echo "m     model name"
15 |    echo "d     Path to digest result"
16 |    echo "h       Print this Help."
17 |    echo
18 | }
19 | 
20 | ############################################################
21 | ############################################################
22 | # Main program                                             #
23 | ############################################################
24 | ############################################################
25 | 
26 | ############################################################
27 | # Process the input options. Add options as needed.        #
28 | ############################################################
29 | # Get the options
30 | while getopts ":h:r:w:c:s:l:" option; do
31 |    case $option in
32 |       h) # display Help
33 |          Help
34 |          exit;;
35 |       r) # result path url
36 | 	 resultPathUrl=$OPTARG;;
37 |       w) # workingDir
38 | 	 workingDir=$OPTARG;;
39 |       c) # result path commit
40 | 	 resultPathCommit=$OPTARG;;
41 |       s) # source code path
42 | 	 SOURCE=$OPTARG;;
43 |       l) # result path digest
44 | 	 LOCATION=$OPTARG;;
45 |       \?) # Invalid option
46 |          echo "Error: Invalid option"
47 |          exit;;
48 |    esac
49 | done
50 | 
51 | echo ${workingDir}
52 | cd ${workingDir}
53 | ls -lh
54 | echo "source: ${SOURCE}"
55 | echo "location: ${LOCATION}"
56 | gsutil cp "${SOURCE}" "${LOCATION}"
57 | SHA256=$(sha256sum ${SOURCE} | awk '{print $1}' | tr -d '\n')
58 | printf "sha256:%s" "${SHA256}" > ${resultPathCommit}
59 | printf "md5:%s" "${LOCATION}" > ${resultPathUrl}
60 | 


--------------------------------------------------------------------------------
/slsa_for_models/gcp/tasks/upload-model.yml:
--------------------------------------------------------------------------------
 1 | apiVersion: tekton.dev/v1
 2 | kind: Task
 3 | metadata:
 4 |   name: upload-model
 5 | spec:
 6 |   workspaces:
 7 |     - name: shared
 8 |   params:
 9 |     - name: tool-versions
10 |       properties:
11 |         gcloud: { }
12 |       default:
13 |         gcloud: 'slim'
14 |     - name: config
15 |       properties:
16 |         package: {}
17 |         version: {}
18 |         source: {}
19 |         location: {}
20 |         repository: {}
21 |   results:
22 |     - name: output
23 |     - name: json
24 |     - name: model_ARTIFACT_OUTPUTS
25 |       properties:
26 |         uri: {}
27 |         digest: {}
28 |   steps:
29 |     - name: upload-to-generic-repo
30 |       image: gcr.io/google.com/cloudsdktool/cloud-sdk:$(params.tool-versions.gcloud)
31 |       workingDir: $(workspaces.shared.path)
32 |       script: |
33 |         gcloud $@
34 |       args:
35 |         - artifacts
36 |         - generic
37 |         - upload
38 |         - --package=$(params.config.package)
39 |         - --version=$(params.config.version)
40 |         - --source=$(params.config.source)
41 |         - --location=$(params.config.location)
42 |         - --repository=$(params.config.repository)
43 |       stdoutConfig:
44 |         path: $(results.output.path)
45 |     - name: convert-to-json
46 |       image: docker.io/stedolan/jq@sha256:a61ed0bca213081b64be94c5e1b402ea58bc549f457c2682a86704dd55231e09
47 |       script: |
48 |         jq -R -n -c '[inputs|split(": ")|{(.[0]):.[1]}] | add' $(results.output.path)
49 |       stdoutConfig:
50 |         path: $(results.json.path)
51 |     - name: type-hint
52 |       image: docker.io/stedolan/jq@sha256:a61ed0bca213081b64be94c5e1b402ea58bc549f457c2682a86704dd55231e09
53 |       script: |
54 |         FULL=$(cat $(results.json.path) | jq -rj '.name')
55 |         URI=$(echo $FULL | cut -d ":" -f 1)
56 |         DIGEST=$(echo $FULL | cut -d ":" -f 2)
57 |         cat <<EOF | tee $(results.model_ARTIFACT_OUTPUTS.path)
58 |         {
59 |           "uri": "${URI}",
60 |           "digest": "sha256:${DIGEST}"
61 |         }
62 |         EOF
63 | 


--------------------------------------------------------------------------------
/model_signing/hashing/memory.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Digests for memory objects.
16 | 
17 | These can only compute hashes of objects residing in memory, after they get
18 | converted to bytes.
19 | 
20 | Example usage:
21 | ```python
22 | >>> hasher = SHA256()
23 | >>> hasher.update(b"abcd")
24 | >>> digest = hasher.compute()
25 | >>> digest.digest_hex
26 | '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
27 | ```
28 | 
29 | Or, passing the data directly in the constructor:
30 | ```python
31 | >>> hasher = SHA256(b"abcd")
32 | >>> digest = hasher.compute()
33 | >>> digest.digest_hex
34 | '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
35 | ```
36 | """
37 | 
38 | import hashlib
39 | from typing_extensions import override
40 | 
41 | from model_signing.hashing import hashing
42 | 
43 | 
44 | class SHA256(hashing.StreamingHashEngine):
45 |     """A wrapper around `hashlib.sha256`."""
46 | 
47 |     def __init__(self, initial_data: bytes = b""):
48 |         self._hasher = hashlib.sha256(initial_data)
49 | 
50 |     @override
51 |     def update(self, data: bytes) -> None:
52 |         self._hasher.update(data)
53 | 
54 |     @override
55 |     def reset(self, data: bytes = b"") -> None:
56 |         self._hasher = hashlib.sha256(data)
57 | 
58 |     @override
59 |     def compute(self) -> hashing.Digest:
60 |         return hashing.Digest(self.digest_name, self._hasher.digest())
61 | 
62 |     @override
63 |     @property
64 |     def digest_name(self) -> str:
65 |         return "sha256"
66 | 


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/README.md:
--------------------------------------------------------------------------------
 1 | # Kubeflow Pipeline to generate ML model with attestations
 2 | 
 3 | ## Prerequisutes
 4 | - A kubernetes cluster has been set up and running.
 5 | - Tekton Pipelines and Chains have been installed and running on the cluster.
 6 | - If pushing to private storage/repository
 7 |   - Workflow Identity federation has been setup with the `default` KSA.
 8 | - kubectl has been installed on your system.
 9 | 
10 | ## Build the images
11 | For `clone`, `build-model` and `upload-model` `Tasks`, we need to build the images.
12 | The Dockerfiles and supporting scripts for each Task are available under `slsa_for_models/kubeflow/images/`.
13 | 
14 | ### Build clone image
15 | 
16 | ```bash
17 | cd slsa_for_models/kubeflow/images/clone
18 | IMAGE=<path to your registry>/git-clone # e.g. docker.io/chitrangpatel/git-clone
19 | docker buildx build -f Dockerfile -t ${IMAGE} .
20 | docker push ${IMAGE}
21 | ```
22 | 
23 | ### Build build-model image
24 | 
25 | ```bash
26 | cd slsa_for_models/kubeflow/images/build-model
27 | IMAGE=<path to your registry>/build-model # e.g. docker.io/chitrangpatel/build-model
28 | docker buildx build -f Dockerfile -t ${IMAGE} .
29 | docker push ${IMAGE}
30 | ```
31 | 
32 | ### Build upload-model image
33 | 
34 | ```bash
35 | cd slsa_for_models/kubeflow/images/upload-model
36 | IMAGE=<path to your registry>/upload-model # e.g. docker.io/chitrangpatel/upload-model
37 | docker buildx build -f Dockerfile -t ${IMAGE} .
38 | docker push ${IMAGE}
39 | ```
40 | 
41 | ## Install kubeflow
42 | For exact details see https://github.com/kubeflow/kfp-tekton/tree/master/sdk#installation.
43 | Requires `> python3.5`
44 | 
45 | ```bash
46 | python3 -m venv .venv
47 | source .venv/bin/activate
48 | pip install kfp-tekton
49 | ```
50 | 
51 | ## Compile the DSL to a yaml
52 | 
53 | The python DSL is shown in `model_transparency.py` file. Depending on the image you produced and tagged, you will have to update the `image` value in corresponding the `components`.
54 | To generate a yaml from it, run:
55 | 
56 | ```bash
57 | python3 model_transparency.py
58 | ```
59 | 
60 | This will update the `model_transparency.yaml` file.
61 | 
62 | ## Run the pipeline
63 | 
64 | ```bash
65 | kubectl apply -f model_transparency.yaml
66 | ```
67 | 


--------------------------------------------------------------------------------
/model_signing/hashing/memory_test.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from model_signing.hashing import memory
16 | 
17 | 
18 | class TestPrecomputedDigest:
19 | 
20 |     def test_hash_known_value(self):
21 |         hasher = memory.SHA256(b"Test string")
22 |         digest = hasher.compute()
23 |         expected = (
24 |             "a3e49d843df13c2e2a7786f6ecd7e0d184f45d718d1ac1a8a63e570466e489dd"
25 |         )
26 |         assert digest.digest_hex == expected
27 | 
28 |     def test_hash_update_twice_is_the_same_as_update_with_concatenation(self):
29 |         str1 = "Test "
30 |         str2 = "string"
31 | 
32 |         hasher1 = memory.SHA256()
33 |         hasher1.update(str1.encode("utf-8"))
34 |         hasher1.update(str2.encode("utf-8"))
35 |         digest1 = hasher1.compute()
36 | 
37 |         str_all = str1 + str2
38 |         hasher2 = memory.SHA256()
39 |         hasher2.update(str_all.encode("utf-8"))
40 |         digest2 = hasher2.compute()
41 | 
42 |         assert digest1.digest_hex == digest2.digest_hex
43 |         assert digest1.digest_value == digest2.digest_value
44 | 
45 |     def test_hash_update_empty(self):
46 |         hasher1 = memory.SHA256(b"Test string")
47 |         hasher1.update(b"")
48 |         digest1 = hasher1.compute()
49 | 
50 |         hasher2 = memory.SHA256(b"Test string")
51 |         digest2 = hasher2.compute()
52 | 
53 |         assert digest1.digest_hex == digest2.digest_hex
54 |         assert digest1.digest_value == digest2.digest_value
55 | 
56 |     def test_update_after_reset(self):
57 |         hasher = memory.SHA256(b"Test string")
58 |         digest1 = hasher.compute()
59 |         hasher.reset()
60 |         hasher.update(b"Test string")
61 |         digest2 = hasher.compute()
62 | 
63 |         assert digest1.digest_hex == digest2.digest_hex
64 |         assert digest1.digest_value == digest2.digest_value
65 | 


--------------------------------------------------------------------------------
/.github/workflows/validate_deps.yml:
--------------------------------------------------------------------------------
 1 | name: Validate all Python dependencies work together
 2 | on:
 3 |   push:
 4 |     branches: [main]
 5 |   pull_request:
 6 |     branches: [main]
 7 |     types: [opened, synchronize]
 8 |     paths-ignore:
 9 |       - '**/*.md'
10 |       - '*.md'
11 | 
12 | permissions: {}
13 | 
14 | defaults:
15 |   run:
16 |     shell: bash
17 | 
18 | jobs:
19 |   model-signing:
20 |     name: Test model signing dependencies
21 |     runs-on: ${{ matrix.os }}
22 |     strategy:
23 |       fail-fast: false # Don't cancel other jobs if one fails
24 |       matrix:
25 |         os: [ubuntu-latest, macos-latest, windows-latest]
26 |         include:
27 |           - os: macos-latest
28 |             os_family: Darwin
29 |           - os: ubuntu-latest
30 |             os_family: Linux
31 |           - os: windows-latest
32 |             os_family: Windows
33 |     steps:
34 |     - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
35 |     - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
36 |       with:
37 |         python-version: 3.11
38 |         cache: pip
39 |         cache-dependency-path: model_signing/install/requirements_${{ matrix.os_family }}.txt
40 |     - name: Install dependencies
41 |       run: |
42 |         set -exuo pipefail
43 |         python -m venv venv
44 |         .github/workflows/scripts/venv_activate.sh
45 |         python -m pip install --require-hashes -r model_signing/install/requirements_${{ matrix.os_family }}.txt
46 | 
47 |   slsa-for-ml:
48 |     name: Test SLSA for ML demo dependencies
49 |     runs-on: ${{ matrix.os }}
50 |     strategy:
51 |       fail-fast: false # Don't cancel other jobs if one fails
52 |       matrix:
53 |         os: [ubuntu-latest, macos-latest, windows-latest]
54 |         include:
55 |           - os: macos-latest
56 |             os_family: Darwin
57 |           - os: ubuntu-latest
58 |             os_family: Linux
59 |           - os: windows-latest
60 |             os_family: Windows
61 |     steps:
62 |     - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
63 |     - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
64 |       with:
65 |         python-version: 3.11
66 |         cache: pip
67 |         cache-dependency-path: slsa_for_models/install/requirements_${{ matrix.os_family }}.txt
68 |     - name: Install dependencies
69 |       run: |
70 |         set -exuo pipefail
71 |         python -m venv venv
72 |         .github/workflows/scripts/venv_activate.sh
73 |         python -m pip install --require-hashes -r slsa_for_models/install/requirements_${{ matrix.os_family }}.txt
74 | 


--------------------------------------------------------------------------------
/model_signing/hashing/hashing.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 The Sigstore Authors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Machinery for computing digests for a single object.
16 | 
17 | We define an abstract `HashEngine` class which can be used in type annotations
18 | and is at the root of the hashing classes hierarchy.
19 | 
20 | Since there are multiple hashing methods that we support, users should always
21 | specify the algorithm and the digest value.
22 | """
23 | 
24 | from abc import ABCMeta, abstractmethod
25 | from dataclasses import dataclass
26 | from typing import Protocol
27 | 
28 | 
29 | @dataclass(frozen=True)
30 | class Digest:
31 |     """A digest computed by a `HashEngine`."""
32 | 
33 |     algorithm: str
34 |     digest_value: bytes
35 | 
36 |     @property
37 |     def digest_hex(self) -> str:
38 |         """Hexadecimal, human readable, equivalent of `digest`."""
39 |         return self.digest_value.hex()
40 | 
41 | 
42 | class HashEngine(metaclass=ABCMeta):
43 |     """Generic hash engine."""
44 | 
45 |     @abstractmethod
46 |     def compute(self) -> Digest:
47 |         """Computes the digest of data passed to the engine."""
48 |         pass
49 | 
50 |     @property
51 |     @abstractmethod
52 |     def digest_name(self) -> str:
53 |         """The canonical name of the algorithm used to compute the hash.
54 | 
55 |         Subclasses MUST use the `digest_name()` method to record all parameters
56 |         that influence the hash output. For example, if a file is split into
57 |         shards which are hashed separately and the final digest value is
58 |         computed by aggregating these hashes, then the shard size must be given
59 |         in the output string.
60 |         """
61 |         pass
62 | 
63 | 
64 | class Streaming(Protocol):
65 |     """A protocol to support streaming data to `HashEngine` objects."""
66 | 
67 |     @abstractmethod
68 |     def update(self, data: bytes) -> None:
69 |         """Appends additional bytes to the data to be hashed."""
70 |         pass
71 | 
72 |     @abstractmethod
73 |     def reset(self, data: bytes = b"") -> None:
74 |         """Resets the data to be hashed to the passed argument."""
75 |         pass
76 | 
77 | 
78 | class StreamingHashEngine(Streaming, HashEngine):
79 |     """A `HashEngine` that can stream data to be hashed."""
80 | 
81 |     pass
82 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
 1 | name: "CodeQL"
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 |     # The branches below must be a subset of the branches above
 8 |     branches: [ "main" ]
 9 |   schedule:
10 |     - cron: '30 22 * * 4'
11 | 
12 | jobs:
13 |   analyze:
14 |     name: Analyze
15 |     # Runner size impacts CodeQL analysis time. To learn more, please see:
16 |     #   - https://gh.io/recommended-hardware-resources-for-running-codeql
17 |     #   - https://gh.io/supported-runners-and-hardware-resources
18 |     #   - https://gh.io/using-larger-runners
19 |     # Consider using larger runners for possible analysis time improvements.
20 |     runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
21 |     timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
22 |     permissions:
23 |       actions: read
24 |       contents: read
25 |       security-events: write
26 | 
27 |     strategy:
28 |       fail-fast: false
29 |       matrix:
30 |         language: [ 'python' ]
31 | 
32 |     steps:
33 |     - name: Checkout repository
34 |       uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
35 | 
36 |     # Initializes the CodeQL tools for scanning.
37 |     - name: Initialize CodeQL
38 |       uses: github/codeql-action/init@f9a7c6738f28efb36e31d49c53a201a9c5d6a476 # v2.14.2
39 |       with:
40 |         languages: ${{ matrix.language }}
41 |         # If you wish to specify custom queries, you can do so here or in a config file.
42 |         # By default, queries listed here will override any specified in a config file.
43 |         # Prefix the list here with "+" to use these queries and those in the config file.
44 | 
45 |         # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
46 |         # queries: security-extended,security-and-quality
47 | 
48 | 
49 |     # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
50 |     # If this step fails, then you should remove it and run the build manually (see below)
51 |     - name: Autobuild
52 |       uses: github/codeql-action/autobuild@f9a7c6738f28efb36e31d49c53a201a9c5d6a476 # v2.14.2
53 | 
54 |     # ℹ️ Command-line programs to run using the OS shell.
55 |     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
56 | 
57 |     #   If the Autobuild fails above, remove it and uncomment the following three lines.
58 |     #   modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
59 | 
60 |     # - run: |
61 |     #     echo "Run, Build Application using script"
62 |     #     ./location_of_script_within_repo/buildscript.sh
63 | 
64 |     - name: Perform CodeQL Analysis
65 |       uses: github/codeql-action/analyze@f9a7c6738f28efb36e31d49c53a201a9c5d6a476 # v2.14.2
66 |       with:
67 |         category: "/language:${{matrix.language}}"
68 | 


--------------------------------------------------------------------------------
/.github/workflows/scorecard.yml:
--------------------------------------------------------------------------------
 1 | # This workflow uses actions that are not certified by GitHub. They are provided
 2 | # by a third-party and are governed by separate terms of service, privacy
 3 | # policy, and support documentation.
 4 | 
 5 | name: Scorecard supply-chain security
 6 | on:
 7 |   # For Branch-Protection check. Only the default branch is supported. See
 8 |   # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
 9 |   branch_protection_rule:
10 |   # To guarantee Maintained check is occasionally updated. See
11 |   # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
12 |   schedule:
13 |     - cron: '37 21 * * 0'
14 |   push:
15 |     branches: [ "main" ]
16 | 
17 | # Declare default permissions as read only.
18 | permissions: read-all
19 | 
20 | jobs:
21 |   analysis:
22 |     name: Scorecard analysis
23 |     runs-on: ubuntu-latest
24 |     permissions:
25 |       # Needed to upload the results to code-scanning dashboard.
26 |       security-events: write
27 |       # Needed to publish results and get a badge (see publish_results below).
28 |       id-token: write
29 |       # Uncomment the permissions below if installing in a private repository.
30 |       # contents: read
31 |       # actions: read
32 | 
33 |     steps:
34 |       - name: "Checkout code"
35 |         uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
36 |         with:
37 |           persist-credentials: false
38 | 
39 |       - name: "Run analysis"
40 |         uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3
41 |         with:
42 |           results_file: results.sarif
43 |           results_format: sarif
44 |           # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
45 |           # - you want to enable the Branch-Protection check on a *public* repository, or
46 |           # - you are installing Scorecard on a *private* repository
47 |           # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
48 |           # repo_token: ${{ secrets.SCORECARD_TOKEN }}
49 | 
50 |           # Public repositories:
51 |           #   - Publish results to OpenSSF REST API for easy access by consumers
52 |           #   - Allows the repository to include the Scorecard badge.
53 |           #   - See https://github.com/ossf/scorecard-action#publishing-results.
54 |           # For private repositories:
55 |           #   - `publish_results` will always be set to `false`, regardless
56 |           #     of the value entered here.
57 |           publish_results: true
58 | 
59 |       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
60 |       # format to the repository Actions tab.
61 |       - name: "Upload artifact"
62 |         uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
63 |         with:
64 |           name: SARIF file
65 |           path: results.sarif
66 |           retention-days: 5
67 | 
68 |       # Upload the results to GitHub's code scanning dashboard.
69 |       - name: "Upload to code-scanning"
70 |         uses: github/codeql-action/upload-sarif@17573ee1cc1b9d061760f3a006fc4aac4f944fd5 # v2.2.4
71 |         with:
72 |           sarif_file: results.sarif
73 | 


--------------------------------------------------------------------------------
/slsa_for_models/github_actions.md:
--------------------------------------------------------------------------------
 1 | ## SLSA for Models in GitHub Actions
 2 | 
 3 | This example uses [SLSA L3 GitHub generator][slsa-generator] to generate SLSA
 4 | provenance for ML models in GitHub Actions. This happens during a
 5 | [workflow][workflow] which takes as input the format to save the model into.
 6 | 
 7 | When users download a given version of a model they can also check its
 8 | provenance by using [the SLSA verifier][slsa-verifier] repository.
 9 | 
10 | To test, fork this repository, then head over to the Actions tab and select the
11 | "SLSA for ML models example" workflow. Since the workflow has a
12 | `workflow_dispatch` trigger, it can be invoked on demand: click the `Run
13 | workflow` button, then select the value for the "Name of the model" argument.
14 | 
15 | ![Triggering a SLSA workflow](images/slsa_trigger.png)
16 | 
17 | The supported formats are:
18 | 
19 | | Workflow Argument            | Training Framework | Model format                    |
20 | |------------------------------|--------------------|---------------------------------|
21 | | `tensorflow_model.keras`     | TensorFlow         | Keras format (default)          |
22 | | `tensorflow_hdf5_model.h5`   | TensorFlow         | Legacy HDF5 format              |
23 | | `tensorflow_hdf5.weights.h5` | TensorFlow         | Legacy HDF5 weights only format |
24 | | `pytorch_model.pth`          | PyTorch            | PyTorch default format          |
25 | | `pytorch_full_model.pth`     | PyTorch            | PyTorch complete model format   |
26 | | `pytorch_jitted_model.pt`    | PyTorch            | PyTorch TorchScript format      |
27 | 
28 | After the workflow finishes execution, there will be two archives in the
29 | "Artifacts" section: one is the model that was trained and the other one is the
30 | SLSA provenance attached to the model.
31 | 
32 | ![Results of running a SLSA workflow](images/slsa_results.png)
33 | 
34 | To verify the provenance, download both archives, unzip each and then run
35 | `slsa-verifier`, making sure to replace the `--source-uri` argument with the
36 | _path to your fork_. For example, for a PyTorch model, which has been [built on
37 | this repository](https://github.com/sigstore/model-transparency/actions/runs/6646816974):
38 | 
39 | ```bash
40 | [...]$ slsa-verifier verify-artifact \
41 |        --provenance-path pytorch_model.pth.intoto.jsonl \
42 |        --source-uri github.com/sigstore/model-transparency \
43 |        pytorch_model.pth
44 | Verified signature against tlog entry index 45419124 at URL: https://rekor.sigstore.dev/api/v1/log/entries/24296fb24b8ad77a98dd03d23a78657e7f1efd3d9bea6988abbf23a72290a4ec7dc35c9edeab7ee1
45 | Verified build using builder "https://github.com/slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@refs/tags/v1.9.0" at commit ac26cbf66849cfec6f29747f4525180595c7eef0
46 | Verifying artifact pytorch_model.pth: PASSED
47 | 
48 | PASSED: Verified SLSA provenance
49 | ```
50 | 
51 | The verification of provenance can be done just before the model gets loaded in
52 | the serving pipeline.
53 | 
54 | [cifar10]: https://www.cs.toronto.edu/~kriz/cifar.html
55 | [slsa-generator]: https://github.com/slsa-framework/slsa-github-generator
56 | [slsa-verifier]: https://github.com/slsa-framework/slsa-verifier/
57 | [slsa]: https://slsa.dev
58 | [solarwinds]: https://www.techtarget.com/whatis/feature/SolarWinds-hack-explained-Everything-you-need-to-know
59 | [tekton-chains]: https://github.com/tektoncd/chains
60 | [tekton-kubeflow]: https://www.kubeflow.org/docs/components/pipelines/v1/sdk/pipelines-with-tekton/
61 | [workflow]: https://github.com/sigstore/model-transparency/blob/main/.github/workflows/slsa_for_ml.yml
62 | 


--------------------------------------------------------------------------------
/.github/workflows/slsa_for_ml.yml:
--------------------------------------------------------------------------------
 1 | name: SLSA for ML models example
 2 | on:
 3 |   workflow_dispatch:
 4 |     inputs:
 5 |       model_type:
 6 |         description: Name of the model (implies framework)
 7 |         required: true
 8 |         type: choice
 9 |         options:
10 |         - tensorflow_model.keras
11 |         - tensorflow_hdf5_model.h5
12 |         - tensorflow_hdf5.weights.h5
13 |         - pytorch_model.pth
14 |         - pytorch_full_model.pth
15 |         - pytorch_jitted_model.pt
16 |   pull_request:
17 |     branches: [main]
18 |     types: [opened, synchronize]
19 |     paths-ignore:
20 |       - '**/*.md'
21 |       - '*.md'
22 | 
23 | permissions: read-all
24 | 
25 | defaults:
26 |   run:
27 |     shell: bash
28 | 
29 | jobs:
30 |   train:
31 |     name: Train model
32 |     runs-on: ${{ matrix.os }}
33 |     strategy:
34 |       fail-fast: false # Don't cancel other jobs if one fails
35 |       matrix:
36 |         os: [ubuntu-latest, macos-latest, windows-latest]
37 |         include:
38 |           - os: macos-latest
39 |             os_family: Darwin
40 |           - os: ubuntu-latest
41 |             os_family: Linux
42 |           - os: windows-latest
43 |             os_family: Windows
44 |     outputs:
45 |       hash-ubuntu-latest: ${{ steps.hash.outputs.hash-ubuntu-latest }}
46 |       hash-macos-latest: ${{ steps.hash.outputs.hash-macos-latest }}
47 |       hash-windows-latest: ${{ steps.hash.outputs.hash-windows-latest }}
48 |     steps:
49 |     - run: git config --global core.autocrlf input
50 |     - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
51 |     - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
52 |       with:
53 |         python-version: 3.11
54 |         cache: pip
55 |         cache-dependency-path: slsa_for_models/install/requirements_${{ matrix.os_family }}.txt
56 |     - name: Install dependencies
57 |       run: |
58 |         set -exuo pipefail
59 |         python -m venv venv
60 |         .github/workflows/scripts/venv_activate.sh
61 |         python -m pip install --require-hashes -r slsa_for_models/install/requirements_${{ matrix.os_family }}.txt
62 |     - name: Build model
63 |       env:
64 |         MODEL_TYPE: ${{ github.event.inputs.model_type || 'pytorch_jitted_model.pt' }}
65 |       run: |
66 |         set -exuo pipefail
67 |         python -m venv venv
68 |         .github/workflows/scripts/venv_activate.sh
69 |         python slsa_for_models/main.py "$MODEL_TYPE"
70 |     - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
71 |       with:
72 |         path: ${{ github.event.inputs.model_type || 'pytorch_jitted_model.pt' }}
73 |         name: ${{ github.event.inputs.model_type || 'pytorch_jitted_model.pt' }}_${{ matrix.os_family }}
74 |         if-no-files-found: error
75 |     - id: hash
76 |       env:
77 |         MODEL: ${{ github.event.inputs.model_type || 'pytorch_jitted_model.pt' }}
78 |       run: |
79 |         set -euo pipefail
80 |         (sha256sum -t "$MODEL" || shasum -a 256 "$MODEL") > checksum
81 |         echo "hash-${{ matrix.os }}=$(base64 -w0 checksum || base64 checksum)" >> "${GITHUB_OUTPUT}"
82 | 
83 |   provenance:
84 |     # TODO(mihaimaruseac): Don't run on pull requests for now
85 |     if: ${{ github.event_name != 'pull_request' }}
86 |     needs: [train]
87 |     strategy:
88 |       fail-fast: false # Don't cancel other jobs if one fails
89 |       matrix:
90 |         os: [ubuntu-latest, macos-latest, windows-latest]
91 |     permissions:
92 |       actions: read
93 |       id-token: write
94 |       contents: write
95 |     uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
96 |     with:
97 |       base64-subjects: "${{ needs.train.outputs[format('hash-{0}', matrix.os)] }}"
98 |       upload-assets: true # NOTE: This does nothing unless 'upload-tag-name' parameter is also set to an existing tag
99 | 


--------------------------------------------------------------------------------
/slsa_for_models/gcp/pipeline.yml:
--------------------------------------------------------------------------------
  1 | apiVersion: tekton.dev/v1
  2 | kind: Pipeline
  3 | metadata:
  4 |   name: slsa-for-models
  5 | spec:
  6 |   workspaces:
  7 |     - name: shared
  8 |   params:
  9 |     - name: tool-versions
 10 |       properties:
 11 |         python: {}
 12 |         gcloud: {}
 13 |       default:
 14 |         python: '3.11'
 15 |         gcloud: 'slim'
 16 |     - name: model-source
 17 |       properties:
 18 |         url: {}
 19 |         revision: {}
 20 |         requirements-path: {}
 21 |         main-path: {}
 22 |       default:
 23 |         url: 'https://github.com/sigstore/model-transparency'
 24 |         revision: 'main'
 25 |         requirements-path: 'slsa_for_models/install/requirements_Linux.txt'
 26 |         main-path: 'slsa_for_models/main.py'
 27 |     - name: model-name
 28 |       enum:
 29 |         - 'tensorflow_model.keras'
 30 |         - 'tensorflow_hdf5_model.h5'
 31 |         - 'tensorflow_hdf5.weights.h5'
 32 |         - 'pytorch_model.pth'
 33 |         - 'pytorch_full_model.pth'
 34 |         - 'pytorch_jitted_model.pt'
 35 |     - name: model-storage
 36 |       properties:
 37 |         package: {}
 38 |         location: {}
 39 |         repository: {}
 40 |   results:
 41 |     - name: source_ARTIFACT_INPUTS
 42 |       value: $(tasks.git-clone.results.source_ARTIFACT_INPUTS[*])
 43 |     - name: model_ARTIFACT_OUTPUTS
 44 |       value: $(tasks.upload-model.results.model_ARTIFACT_OUTPUTS[*])
 45 |   tasks:
 46 |     - name: git-clone
 47 |       workspaces:
 48 |         - name: output
 49 |           workspace: shared
 50 |       params:
 51 |         - name: url
 52 |           value: $(params.model-source.url)
 53 |         - name: revision
 54 |           value: $(params.model-source.revision)
 55 |       taskRef:
 56 |         resolver: git
 57 |         params:
 58 |         - name: url
 59 |           value: https://github.com/sigstore/model-transparency.git
 60 |         - name: revision
 61 |           value: $(params.model-source.revision)
 62 |         - name: pathInRepo
 63 |           value: slsa_for_models/gcp/tasks/git-clone.yml
 64 |     - name: build-model
 65 |       runAfter:
 66 |         - git-clone
 67 |       workspaces:
 68 |         - name: source
 69 |           workspace: shared
 70 |       params:
 71 |         - name: model-name
 72 |           value: $(params.model-name)
 73 |         - name: model-source
 74 |           value:
 75 |             requirements-path: $(params.model-source.requirements-path)
 76 |             main-path: $(params.model-source.main-path)
 77 |         - name: python-version
 78 |           value: $(params.tool-versions.python)
 79 |       taskRef:
 80 |         resolver: git
 81 |         params:
 82 |         - name: url
 83 |           value: https://github.com/sigstore/model-transparency.git
 84 |         - name: revision
 85 |           value: $(params.model-source.revision)
 86 |         - name: pathInRepo
 87 |           value: slsa_for_models/gcp/tasks/build-model.yml
 88 |     - name: upload-model
 89 |       runAfter:
 90 |         - build-model
 91 |       workspaces:
 92 |         - name: shared
 93 |       params:
 94 |         - name: config
 95 |           value:
 96 |             package: $(params.model-storage.package)
 97 |             version: $(tasks.build-model.results.digest)
 98 |             source: $(params.model-name)
 99 |             location: $(params.model-storage.location)
100 |             repository: $(params.model-storage.repository)
101 |         - name: tool-versions
102 |           value:
103 |             gcloud: $(params.tool-versions.gcloud)
104 |       taskRef:
105 |         resolver: git
106 |         params:
107 |         - name: url
108 |           value: https://github.com/sigstore/model-transparency.git
109 |         - name: revision
110 |           value: $(params.model-source.revision)
111 |         - name: pathInRepo
112 |           value: slsa_for_models/gcp/tasks/upload-model.yml
113 | 


--------------------------------------------------------------------------------
/slsa_for_models/README.md:
--------------------------------------------------------------------------------
 1 | # SLSA for Models
 2 | 
 3 | This project shows how we can generate [SLSA][slsa] provenance for ML models
 4 | on [GitHub Actions][gha] and [Google Cloud Platform][gcp].
 5 | 
 6 | SLSA was originally developed for traditional software to protect against
 7 | tampering with builds, such as in the [Solarwinds attack][solarwinds], and
 8 | this project is a proof of concept that the _same supply chain protections
 9 | can be applied to ML_.
10 | 
11 | When users download a given version of a model they can also check its provenance.
12 | This can be integrated in the model hub and/or model serving platforms: for example
13 | the model serving pipeline could validate provenance for all new models before
14 | serving them. However, the verification can also be done manually, on demand.
15 | 
16 | As an additional benefit, having provenance for a model allows users to react
17 | to vulnerabilities in a training framework: they can quickly determine if a
18 | model needs to be retrained because it was created using a vulnerable version.
19 | 
20 | See the guides for [GitHub Actions][gha] and [Google Cloud Platform][gcp] for details.
21 | 
22 | ## Models
23 | 
24 | We support both TensorFlow and PyTorch models. The example repo trains a model
25 | on [CIFAR10][cifar10] dataset, saves it in one of the supported formats, and
26 | generates provenance for the output. The supported formats are:
27 | 
28 | | Workflow Argument            | Training Framework | Model format                    |
29 | |------------------------------|--------------------|---------------------------------|
30 | | `tensorflow_model.keras`     | TensorFlow         | Keras format (default)          |
31 | | `tensorflow_hdf5_model.h5`   | TensorFlow         | Legacy HDF5 format              |
32 | | `tensorflow_hdf5.weights.h5` | TensorFlow         | Legacy HDF5 weights only format |
33 | | `pytorch_model.pth`          | PyTorch            | PyTorch default format          |
34 | | `pytorch_full_model.pth`     | PyTorch            | PyTorch complete model format   |
35 | | `pytorch_jitted_model.pt`    | PyTorch            | PyTorch TorchScript format      |
36 | 
37 | While most of the ML models are currently too expensive to train, future work will
38 | cover the training of ML models that require access to accelerators (i.e., GPUs, TPUs)
39 | or that require multiple hours for training.
40 | 
41 | ## Future Work
42 | 
43 | ### Accelerators
44 | Future work will involve covering training ML models that require access to
45 | accelerators (i.e., GPUs, TPUs).
46 | 
47 | ### Platforms
48 | While our examples have targeted GitHub Actions and Tekton in GCP, we aim to bring
49 | support for other platforms (e.g., GCB and GitLab) and model training environments.
50 | 
51 | ### Directory Format
52 | TensorFlow also supports saving models in `SavedModel` format. This is
53 | a directory-based serialization format and currently we don't fully support
54 | this. We can generate SLSA provenance for all the files in the directory but
55 | there are caveats regarding verification. Furthermore, because there is a
56 | difference between the hashes generated by provenance and the hash generated
57 | during model signing, we have decided to add support for these model formats at
58 | a future time, after standardizing a way to generate and verify provenance in
59 | SLSA (in general, not just for ML).
60 | 
61 | [cifar10]: https://www.cs.toronto.edu/~kriz/cifar.html
62 | [slsa-generator]: https://github.com/slsa-framework/slsa-github-generator
63 | [slsa-verifier]: https://github.com/slsa-framework/slsa-verifier/
64 | [slsa]: https://slsa.dev
65 | [solarwinds]: https://www.techtarget.com/whatis/feature/SolarWinds-hack-explained-Everything-you-need-to-know
66 | [tekton-chains]: https://github.com/tektoncd/chains
67 | [tekton-kubeflow]: https://www.kubeflow.org/docs/components/pipelines/v1/sdk/pipelines-with-tekton/
68 | [workflow]: https://github.com/sigstore/model-transparency/blob/main/.github/workflows/slsa_for_ml.yml
69 | [gha]: github_actions.md
70 | [gcp]: gcp
71 | 


--------------------------------------------------------------------------------
/model_signing/main.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 The Sigstore Authors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import argparse
 16 | import sys
 17 | from pathlib import Path
 18 | import model
 19 | 
 20 | 
 21 | # https://github.com/sigstore/sigstore-python/issues/661
 22 | # contains the logic to start the web browser.
 23 | 
 24 | def readOptions():
 25 |     parser = argparse.ArgumentParser("CLI for signing AI models")
 26 |     subcommands = parser.add_subparsers(required=True, dest="subcommand")
 27 | 
 28 |     # TODO: option for a path to store the signature.
 29 |     # Sign group.
 30 |     sign = subcommands.add_parser(
 31 |         "sign", formatter_class=argparse.ArgumentDefaultsHelpFormatter
 32 |     )
 33 |     sign.add_argument("--path", required=True, help="The path to sign")
 34 |     sign.add_argument("--disable-ambient", required=False,
 35 |                       default=False, action='store_true',
 36 |                       help="Auto detect ambient authority")
 37 | 
 38 |     # Verify group.
 39 |     verify = subcommands.add_parser(
 40 |         "verify", formatter_class=argparse.ArgumentDefaultsHelpFormatter
 41 |     )
 42 |     verify.add_argument("--path", required=True,
 43 |                         help="The path to a file to verify")
 44 |     verify.add_argument("--identity", required=True,
 45 |                         help="The identity (email, workload identity) to " +
 46 |                         "verify")
 47 |     verify.add_argument("--identity-provider", required=True,
 48 |                         help="The OIDC provider to verify")
 49 | 
 50 |     args = parser.parse_args()
 51 |     return args
 52 | 
 53 | 
 54 | def signature_path(modelfn: Path) -> Path:
 55 |     if modelfn.is_file():
 56 |         return Path(modelfn.parent).joinpath(f"{modelfn.name}.sig")
 57 |     return modelfn.joinpath("model.sig")
 58 | 
 59 | 
 60 | def ignored_paths(modelfn: Path) -> [Path]:
 61 |     if modelfn.is_file():
 62 |         return []
 63 |     return [modelfn.joinpath(".git")]
 64 | 
 65 | 
 66 | # Sign function
 67 | def sign(modelfn: Path, disable_ambient: bool) -> model.SignatureResult:
 68 |     signer = model.SigstoreSigner(disable_ambient=disable_ambient)
 69 |     return signer.sign(modelfn, signature_path(modelfn),
 70 |                        ignored_paths(modelfn))
 71 | 
 72 | 
 73 | def verify(modelfn: Path, issuer: str, identity: str,
 74 |            offline=False) -> model.VerificationResult:
 75 |     verifier = model.SigstoreVerifier(oidc_provider=issuer, identity=identity)
 76 |     return verifier.verify(modelfn, signature_path(modelfn),
 77 |                            ignored_paths(modelfn), offline)
 78 | 
 79 | 
 80 | def main(args) -> int:
 81 |     if args.subcommand == "sign":
 82 |         result = sign(Path(args.path), disable_ambient=args.disable_ambient)
 83 |         if result:
 84 |             print("signature success")
 85 |         else:
 86 |             print(f"signature failure: {str(result)}")
 87 |             return -1
 88 |     elif args.subcommand == "verify":
 89 |         modelfn = Path(args.path)
 90 |         result = verify(modelfn=modelfn,
 91 |                         issuer=args.identity_provider,
 92 |                         identity=args.identity)
 93 |         if result:
 94 |             print("verification success")
 95 |         else:
 96 |             print(f"verification failure: {str(result)}")
 97 |             return -1
 98 |     return 0
 99 | 
100 | 
101 | if __name__ == '__main__':
102 |     args = readOptions()
103 | 
104 |     sys.exit(main(args))
105 | 


--------------------------------------------------------------------------------
/.github/workflows/pin_deps.yml:
--------------------------------------------------------------------------------
  1 | name: Pin dependencies
  2 | on:
  3 |   workflow_dispatch:
  4 |   schedule:
  5 |     - cron: '0 0 * * TUE' # run every Tuesday at midnight
  6 | 
  7 | permissions: {}
  8 | 
  9 | defaults:
 10 |   run:
 11 |     shell: bash
 12 | 
 13 | jobs:
 14 |   pin:
 15 |     name: Generate dependency lock
 16 |     runs-on: ${{ matrix.os }}
 17 |     strategy:
 18 |       fail-fast: false # Don't cancel other jobs if one fails
 19 |       matrix:
 20 |         os: [ubuntu-latest, macos-latest, windows-latest]
 21 |         include:
 22 |           - os: ubuntu-latest
 23 |             os_family: Linux
 24 |           - os: macos-latest
 25 |             os_family: Darwin
 26 |           - os: windows-latest
 27 |             os_family: Windows
 28 |     steps:
 29 |     - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
 30 |     - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
 31 |       with:
 32 |         python-version: 3.11
 33 |         cache: pip
 34 |         cache-dependency-path: |
 35 |           model_signing/install/requirements_${{ matrix.os_family }}.txt
 36 |           model_signing/install/requirements_test_${{ matrix.os_family }}.txt
 37 |           slsa_for_models/install/requirements_${{ matrix.os_family }}.txt
 38 |     - name: Create an empty virtualenv and install `pip-tools`
 39 |       run: |
 40 |         set -exuo pipefail
 41 |         python -m venv venv
 42 |         .github/workflows/scripts/venv_activate.sh
 43 |         pip install pip-tools
 44 |         pip list  # For debugging
 45 |     - name: Use `pip-compile` to generate all freeze files
 46 |       run: |
 47 |         set -exuo pipefail
 48 |         .github/workflows/scripts/venv_activate.sh
 49 |         pip-compile --upgrade --generate-hashes --strip-extras --output-file=model_signing/install/requirements_${{ matrix.os_family }}.txt model_signing/install/requirements.in
 50 |         pip-compile --upgrade --generate-hashes --strip-extras --output-file=model_signing/install/requirements_test_${{ matrix.os_family }}.txt model_signing/install/requirements_test.in
 51 |         pip-compile --upgrade --generate-hashes --strip-extras --output-file=slsa_for_models/install/requirements_${{ matrix.os_family }}.txt slsa_for_models/install/requirements.in
 52 |     - name: Test freeze file (for model signing)
 53 |       run: |
 54 |         set -exuo pipefail
 55 |         rm -rf venv  # Need clean sandbox
 56 |         python -m venv venv
 57 |         .github/workflows/scripts/venv_activate.sh
 58 |         pip install -r model_signing/install/requirements_${{ matrix.os_family }}.txt
 59 |         pip list  # For debugging
 60 |     - name: Test freeze file (for testing model signing)
 61 |       run: |
 62 |         set -exuo pipefail
 63 |         rm -rf venv  # Need clean sandbox
 64 |         python -m venv venv
 65 |         .github/workflows/scripts/venv_activate.sh
 66 |         pip install -r model_signing/install/requirements_test_${{ matrix.os_family }}.txt
 67 |         pip list  # For debugging
 68 |     - name: Test freeze file (for SLSA for models)
 69 |       run: |
 70 |         set -exuo pipefail
 71 |         rm -rf venv  # Need clean sandbox
 72 |         python -m venv venv
 73 |         .github/workflows/scripts/venv_activate.sh
 74 |         pip install -r slsa_for_models/install/requirements_${{ matrix.os_family }}.txt
 75 |         pip list  # For debugging
 76 |     - name: Upload freeze files
 77 |       uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
 78 |       with:
 79 |         name: freeze-files-${{ matrix.os }}
 80 |         path: ./*/install/requirements*${{ matrix.os_family }}*txt
 81 | 
 82 |   # Separate PR creation job to make sure it creates only one single PR with
 83 |   # all changed files, eliminate race-conditions and restrict permissions only
 84 |   # to this specific job.
 85 |   create-pr:
 86 |     needs: [pin]
 87 |     runs-on: ubuntu-latest
 88 |     permissions:
 89 |       contents: write
 90 |       pull-requests: write
 91 |     steps:
 92 |     - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
 93 |     - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
 94 |       with:
 95 |         path: .
 96 |         merge-multiple: true
 97 |     - name: Create dependent PR with dependency changes
 98 |       uses: peter-evans/create-pull-request@6d6857d36972b65feb161a90e484f2984215f83e # v6.0.5
 99 |       with:
100 |         title: "Update frozen python dependencies"
101 |         commit-message: "Bump frozen dependencies"
102 |         signoff: true
103 |         delete-branch: true
104 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | When contributing to a repository in the Sigstore organization, please first discuss the change you wish
  4 | to make via an issue in the repository.
  5 | 
  6 | ## Pull Request Process
  7 | 
  8 | 1. Create an issue in the repository outlining the fix or feature.
  9 | 2. Fork the repository to your own GitHub account and clone it locally.
 10 | 3. Complete and test the change.
 11 | 4. If relevant, update documentation with details of the change. This includes updates to an API, new environment
 12 |    variables, exposed ports, useful file locations, CLI parameters and
 13 |    new or changed configuration values.
 14 | 5. Correctly format your commit message - See [Commit Messages](#commit-message-guidelines)
 15 |    below.
 16 | 6. Sign off your commit.
 17 | 7. Ensure that CI passes. If it fails, fix the failures.
 18 | 8. Every pull request requires a review from the Sigstore subprojects MAINTAINERS.
 19 | 9. If your pull request consists of more than one commit, please squash your
 20 |    commits as described in [Squash Commits](#squash-commits), or the commits
 21 |    will be squashed on merge.
 22 | 
 23 | ## Commit Message Guidelines
 24 | 
 25 | We follow the commit formatting recommendations found on [Chris Beams' How to Write a Git Commit Message article](https://chris.beams.io/posts/git-commit/).
 26 | 
 27 | Well formed commit messages not only help reviewers understand the nature of
 28 | the Pull Request, but also assists the release process where commit messages
 29 | are used to generate release notes.
 30 | 
 31 | A good example of a commit message would be as follows:
 32 | 
 33 | ```
 34 | Summarize changes in around 50 characters or less
 35 | 
 36 | More detailed explanatory text, if necessary. Wrap it to about 72
 37 | characters or so. In some contexts, the first line is treated as the
 38 | subject of the commit and the rest of the text as the body. The
 39 | blank line separating the summary from the body is critical (unless
 40 | you omit the body entirely); various tools like `log`, `shortlog`
 41 | and `rebase` can get confused if you run the two together.
 42 | 
 43 | Explain the problem that this commit is solving. Focus on why you
 44 | are making this change as opposed to how (the code explains that).
 45 | Are there side effects or other unintuitive consequences of this
 46 | change? Here's the place to explain them.
 47 | 
 48 | Further paragraphs come after blank lines.
 49 | 
 50 |  - Bullet points are okay, too
 51 | 
 52 |  - Typically a hyphen or asterisk is used for the bullet, preceded
 53 |    by a single space, with blank lines in between, but conventions
 54 |    vary here
 55 | 
 56 | If you use an issue tracker, put references to them at the bottom,
 57 | like this:
 58 | 
 59 | Resolves: #123
 60 | See also: #456, #789
 61 | ```
 62 | 
 63 | Note the `Resolves #123` tag, this references the issue raised and allows us to
 64 | ensure issues are associated and closed when a pull request is merged.
 65 | 
 66 | Please refer to [the github help page on message types](https://help.github.com/articles/closing-issues-using-keywords/)
 67 | for a complete list of issue references.
 68 | 
 69 | ## Squash Commits
 70 | 
 71 | Should your pull request consist of more than one commit (perhaps due to
 72 | a change being requested during the review cycle), please perform a git squash
 73 | once a reviewer has approved your pull request.
 74 | 
 75 | A squash can be performed as follows. Let's say you have the following commits:
 76 | 
 77 |     initial commit
 78 |     second commit
 79 |     final commit
 80 | 
 81 | Run the command below with the number set to the total commits you wish to
 82 | squash (in our case 3 commits):
 83 | 
 84 |     git rebase -i HEAD~3
 85 | 
 86 | You default text editor will then open up and you will see the following::
 87 | 
 88 |     pick eb36612 initial commit
 89 |     pick 9ac8968 second commit
 90 |     pick a760569 final commit
 91 | 
 92 |     # Rebase eb1429f..a760569 onto eb1429f (3 commands)
 93 | 
 94 | We want to rebase on top of our first commit, so we change the other two commits
 95 | to `squash`:
 96 | 
 97 |     pick eb36612 initial commit
 98 |     squash 9ac8968 second commit
 99 |     squash a760569 final commit
100 | 
101 | After this, should you wish to update your commit message to better summarise
102 | all of your pull request, run:
103 | 
104 |     git commit --amend
105 | 
106 | You will then need to force push (assuming your initial commit(s) were posted
107 | to github):
108 | 
109 |     git push origin your-branch --force
110 | 
111 | Alternatively, a core member can squash your commits within Github.
112 | 
113 | ## Code of Conduct
114 | 
115 | Sigstore adheres to and enforces the [Contributor Covenant](http://contributor-covenant.org/version/1/4/) Code of Conduct.
116 | Please take a moment to read the [CODE_OF_CONDUCT.md](https://github.com/sigstore/community/blob/main/CODE_OF_CONDUCT.md) document.
117 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Model Transparency
  2 | 
  3 | <img align="right" src="https://slsa.dev/images/logo-mono.svg" width="140" height="140">
  4 | 
  5 | <!-- markdown-toc --bullets="-" -i README.md -->
  6 | 
  7 | <!-- toc -->
  8 | 
  9 | - [Overview](#overview)
 10 | - [Projects](#projects)
 11 |   - [Model Signing](#model-signing)
 12 |   - [SLSA for ML](#slsa-for-ml)
 13 | - [Status](#status)
 14 | - [Contributing](#contributing)
 15 | 
 16 | <!-- tocstop -->
 17 | 
 18 | ## Overview
 19 | 
 20 | There is currently significant growth in the number of ML-powered applications.
 21 | This brings benefits, but it also provides grounds for attackers to exploit
 22 | unsuspecting ML users. This is why Google launched the [Secure AI Framework
 23 | (SAIF)][saif] to establish industry standards for creating trustworthy and
 24 | responsible AI applications. The first principle of SAIF is to
 25 | 
 26 | > Expand strong security foundations to the AI ecosystem
 27 | 
 28 | Building on the work with [Open Source Security Foundation][openssf], we are
 29 | creating this repository to demonstrate how the ML supply chain can be
 30 | strengthened in _the same way_ as the traditional software supply chain.
 31 | 
 32 | This repository hosts a collection of utilities and examples related to the
 33 | security of machine learning pipelines. The focus is on providing *verifiable*
 34 | claims about the integrity and provenance of the resulting models, meaning users
 35 | can check for themselves that these claims are true rather than having to just
 36 | trust the model trainer.
 37 | 
 38 | ## Projects
 39 | 
 40 | Currently, there are two main projects in the repository: model signing (to
 41 | prevent tampering of models after publication to ML model hubs) and
 42 | [SLSA](https://slsa.dev/) (to prevent tampering of models during the build
 43 | process).
 44 | 
 45 | ### Model Signing
 46 | 
 47 | This project demonstrates how to protect the integrity of a model by signing it
 48 | with [Sigstore](https://www.sigstore.dev/), a tool for making code signatures
 49 | transparent without requiring management of cryptographic key material.
 50 | 
 51 | When users download a given version of a signed model they can check that the
 52 | signature comes from a known or trusted identity and thus that the model hasn't
 53 | been tampered with after training.
 54 | 
 55 | We are able to sign large models with very good performance, as the following
 56 | table shows:
 57 | 
 58 | | Model              | Size  |  Sign Time | Verify Time |
 59 | |--------------------|-------|:----------:|:-----------:|
 60 | | roberta-base-11    | 8K    | 1s         | 0.6s        |
 61 | | hustvl/YOLOP       | 215M  | 1s         | 1s          |
 62 | | bertseq2seq        | 2.8G  | 1.9s       | 1.4s        |
 63 | | bert-base-uncased  | 3.3G  | 1.6s       | 1.1s        |
 64 | | tiiuae/falcon-7b   | 14GB  | 2.1s       | 1.8s        |
 65 | 
 66 | See [model_signing/README.md](model_signing/README.md) for more information.
 67 | 
 68 | ### SLSA for ML
 69 | 
 70 | This project shows how we can generate [SLSA][slsa] provenance for ML models,
 71 | using either Github Actions or Google Cloud Platform.
 72 | 
 73 | SLSA was originally developed for traditional software to protect against
 74 | tampering with builds, such as in the [Solarwinds attack][solarwinds], and
 75 | this project is a proof of concept that the same supply chain protections
 76 | can be applied to ML.
 77 | 
 78 | We support both TensorFlow and PyTorch models. The examples train a model
 79 | on [CIFAR10][cifar10] dataset, save it in one of the supported formats, and
 80 | generate provenance for the output. The supported formats are:
 81 | 
 82 | | Workflow Argument            | Training Framework | Model format                    |
 83 | |------------------------------|--------------------|---------------------------------|
 84 | | `tensorflow_model.keras`     | TensorFlow         | Keras format (default)          |
 85 | | `tensorflow_hdf5_model.h5`   | TensorFlow         | Legacy HDF5 format              |
 86 | | `tensorflow_hdf5.weights.h5` | TensorFlow         | Legacy HDF5 weights only format |
 87 | | `pytorch_model.pth`          | PyTorch            | PyTorch default format          |
 88 | | `pytorch_full_model.pth`     | PyTorch            | PyTorch complete model format   |
 89 | | `pytorch_jitted_model.pt`    | PyTorch            | PyTorch TorchScript format      |
 90 | 
 91 | See [slsa_for_models/README.md](slsa_for_models/README.md) for more information.
 92 | 
 93 | ## Status
 94 | 
 95 | This project is currently experimental, not ready for all production use-cases.
 96 | We may make breaking changes until the first official release.
 97 | 
 98 | ## Contributing
 99 | 
100 | Please see the [Contributor Guide](CONTRIBUTING.md) for more information.
101 | 
102 | [slsa]: https://slsa.dev/
103 | [saif]: https://blog.google/technology/safety-security/introducing-googles-secure-ai-framework/
104 | [openssf]: https://openssf.org/
105 | [slsa-generator]: https://github.com/slsa-framework/slsa-github-generator
106 | [solarwinds]: https://www.techtarget.com/whatis/feature/SolarWinds-hack-explained-Everything-you-need-to-know
107 | 


--------------------------------------------------------------------------------
/slsa_for_models/kubeflow/model_transparency.py:
--------------------------------------------------------------------------------
  1 | import kfp_tekton
  2 | from kfp import dsl, components
  3 | from kubernetes.client.models import (
  4 |     V1PersistentVolumeClaimSpec,
  5 |     V1ResourceRequirements,
  6 | )
  7 | import json
  8 | 
  9 | 
 10 | def git_clone(url: str, target: str):
 11 |     return components.load_component_from_text(
 12 |         """
 13 |     name: git-clone
 14 |     description: Git clone
 15 |     inputs:
 16 |       - {name: url, type: String}
 17 |       - {name: target, type: Directory}
 18 |     outputs:
 19 |       - {name: CHAINS-GIT_COMMIT, type: String}
 20 |       - {name: CHAINS-GIT_URL, type: String}
 21 |     implementation:
 22 |       container:
 23 |         image: chitrangpatel/git-clone
 24 |         command:
 25 |           - ./clone.sh
 26 |         args:
 27 |           - -u
 28 |           - {inputValue: url}
 29 |           - -c
 30 |           - {outputPath: CHAINS-GIT_COMMIT}
 31 |           - -p
 32 |           - {outputPath: CHAINS-GIT_URL}
 33 |           - -t
 34 |           - {inputValue: target}
 35 |     """
 36 |     )(url=url, target=target)
 37 | 
 38 | 
 39 | def build_model(requirements: str, source: str, model: str, workDir: str):
 40 |     return components.load_component_from_text(
 41 |         """
 42 |     name: build-model
 43 |     description: Build Model
 44 |     inputs:
 45 |       - {name: requirements, type: String}
 46 |       - {name: source, type: String}
 47 |       - {name: model, type: String}
 48 |       - {name: work, type: String}
 49 |     outputs:
 50 |       - {name: digest, type: String}
 51 |     implementation:
 52 |       container:
 53 |         image: chitrangpatel/build-model
 54 |         command:
 55 |           - ./build.sh
 56 |         args:
 57 |           - -r
 58 |           - {inputValue: requirements}
 59 |           - -w
 60 |           - {inputValue: work}
 61 |           - -s
 62 |           - {inputValue: source}
 63 |           - -m
 64 |           - {inputValue: model}
 65 |           - -d
 66 |           - {outputPath: digest}
 67 |     """
 68 |     )(requirements=requirements, source=source, model=model, work=workDir)
 69 | 
 70 | 
 71 | def upload_model(location: str, source: str, workDir: str):
 72 |     return components.load_component_from_text(
 73 |         """
 74 |     name: upload-model
 75 |     description: Upload Model
 76 |     inputs:
 77 |       - {name: location, type: String}
 78 |       - {name: source, type: String}
 79 |       - {name: work, type: String}
 80 |     outputs:
 81 |       - {name: model_ARTIFACT_URI, type: String}
 82 |       - {name: model_ARTIFACT_DIGEST, type: String}
 83 |     implementation:
 84 |       container:
 85 |         image: chitrangpatel/upload-model
 86 |         command:
 87 |           - ./upload.sh
 88 |         args:
 89 |           - -r
 90 |           - {outputPath: model_ARTIFACT_URI}
 91 |           - -w
 92 |           - {inputValue: work}
 93 |           - -c
 94 |           - {outputPath: model_ARTIFACT_DIGEST}
 95 |           - -s
 96 |           - {inputValue: source}
 97 |           - -l
 98 |           - {inputValue: location}
 99 |     """
100 |     )(location=location, source=source, work=workDir)
101 | 
102 | 
103 | @dsl.pipeline(
104 |     name="clone-build-push-pipeline",
105 |     description="Clone the source code, build & upload the model to GCS.",
106 | )
107 | def clone_build_push(
108 |     url: str = "https://github.com/sigstore/model-transparency",
109 |     target: str = "source",
110 |     model: str = "pytorch_model.pth",
111 | ):
112 |     """A three-step pipeline with the first two steps running in parallel."""
113 | 
114 |     source_code = "$(workspaces.shared-ws.path)/source"
115 |     relative_main_path = "slsa_for_models/main.py"
116 |     relative_requirements = "slsa_for_models/install/requirements_Linux.txt"
117 |     gcs_path = "gs://chitrang-ml-models/pytorch_model.pth"
118 | 
119 |     clone_task = git_clone(url, source_code)
120 |     workspace_json = {"shared-ws": {}}
121 |     clone_task.add_pod_annotation("workspaces", json.dumps(workspace_json))
122 | 
123 |     build_task = build_model(
124 |         requirements=relative_requirements,
125 |         workDir=source_code,
126 |         source=relative_main_path,
127 |         model=model,
128 |     )
129 |     build_task.after(clone_task)
130 |     build_task.add_pod_annotation("workspaces", json.dumps(workspace_json))
131 | 
132 |     upload_task = upload_model(gcs_path, model, source_code)
133 |     upload_task.after(build_task)
134 |     upload_task.add_pod_annotation("workspaces", json.dumps(workspace_json))
135 | 
136 | 
137 | pipeline_conf = kfp_tekton.compiler.pipeline_utils.TektonPipelineConf()
138 | pipeline_conf.add_pipeline_workspace(
139 |     workspace_name="shared-ws",
140 |     volume_claim_template_spec=V1PersistentVolumeClaimSpec(
141 |         access_modes=["ReadWriteOnce"],
142 |         resources=V1ResourceRequirements(requests={"storage": "5Gi"}),
143 |     ),
144 | )
145 | pipeline_conf.set_generate_component_spec_annotations(False)
146 | 
147 | if __name__ == "__main__":
148 |     from kfp_tekton.compiler import TektonCompiler
149 | 
150 |     TektonCompiler().compile(
151 |         clone_build_push,
152 |         __file__.replace(".py", ".yaml"),
153 |         tekton_pipeline_conf=pipeline_conf,
154 |     )
155 | 


--------------------------------------------------------------------------------
/model_signing/benchmarks/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -euo pipefail
  3 | 
  4 | if [ "$#" -lt 2 ]; then
  5 |     echo "Usage: $0 identity-provider identity output_path <cleanup>"
  6 |     echo "Example: $0 https://accounts.google.com myemail@gmail.com"
  7 |     exit 1
  8 | fi
  9 | 
 10 | time_cmd() {
 11 |     local cmd="$1"
 12 |     local arguments="$2"
 13 |     # shellcheck disable=SC2086 # We want word splitting
 14 |     { time "${cmd}" ${arguments} >/dev/null; } 2>&1 | grep real | cut -f2
 15 | }
 16 | 
 17 | run() {
 18 |     local model_name="$1"
 19 |     local model_path="$2"
 20 |     local model_init="$3"
 21 | 
 22 |     echo "Initializing ${model_name} ..."
 23 |     eval "${model_init}"
 24 |     # Replace the '/' character.
 25 |     model_name="${model_name/\//_}"
 26 | 
 27 |     echo "Running sign / verify for ${model_name} ..."
 28 |     results["${model_name}[size]"]=$(du -hs "${model_path}" | cut -f1)
 29 |     results["${model_name}[sign_time]"]=$(time_cmd python3 "main.py sign --path ${model_path}")
 30 |     results["${model_name}[verify_time]"]=$(time_cmd python3 "main.py verify --path ${model_path} --identity-provider ${identity_provider} --identity ${identity}")
 31 |     if [[ "${cleanup}" == "true" ]]; then
 32 |         rm -rf "${model_path}" "${model_path}.sig" 2>/dev/null || true
 33 |     fi
 34 | }
 35 | 
 36 | # shellcheck disable=SC2317 # Called via model_init().
 37 | download_github_repository() {
 38 |     local repository="$1"
 39 |     local model_path="$2"
 40 | 
 41 |     # We download the zip which does _not_ contain the .git folder.
 42 |     wget "https://github.com/${repository}/archive/main.zip" -O "${model_path}".zip
 43 |     mkdir -p "${model_path}"
 44 |     shopt -s dotglob
 45 |     cd "${model_path}" && unzip ../"${model_path}".zip && rm ../"${model_path}".zip && mv "${model_path}"-main/* . && rmdir "${model_path}"-main/ && cd -
 46 |     shopt -u dotglob
 47 | }
 48 | 
 49 | # shellcheck disable=SC2317 # Called via model_init().
 50 | download_hf_repository() {
 51 |     local repository="$1"
 52 |     local model_path="$2"
 53 |     git clone --depth=1 "https://huggingface.co/${repository}" "${model_path}"
 54 |     # We delete the .git folder.
 55 |     rm -rf "${model_path}"/.git
 56 | }
 57 | 
 58 | # User inputs.
 59 | identity_provider="$1"
 60 | identity="$2"
 61 | cleanup=""
 62 | 
 63 | if [ "$#" -eq 3 ]; then
 64 |     cleanup="$3"
 65 | fi
 66 | 
 67 | echo
 68 | echo "INFO: Be patient, this will take a few minutes!"
 69 | echo
 70 | 
 71 | # Variable holding results.
 72 | declare -A results
 73 | 
 74 | # Init the environment.
 75 | if [[ ! -d "test_env/" ]]; then
 76 |     python3 -m venv test_env
 77 | fi
 78 | # shellcheck disable=SC1091 # We have access to source=test_env/bin/activate.
 79 | source test_env/bin/activate
 80 | python3 -m pip install --require-hashes -r install/requirements_Linux.txt
 81 | 
 82 | # =========================================
 83 | #               Warm up!
 84 | # =========================================
 85 | # We need to have the identity in the environment, so perform one signature.
 86 | file=$(mktemp)
 87 | python3 main.py sign --path "${file}"
 88 | python3 main.py verify --path "${file}" --identity-provider "${identity_provider}" --identity "${identity}"
 89 | rm "${file}" "${file}.sig"
 90 | 
 91 | # =========================================
 92 | #       PyTorch YOLOP model
 93 | # =========================================
 94 | model_name=hustvl/YOLOP
 95 | model_path=$(echo "${model_name}" | cut -d/ -f2)
 96 | # shellcheck disable=SC2317 # Reachable via run() call.
 97 | model_init() {
 98 |     if [[ ! -d "${model_path}" ]]; then
 99 |         download_github_repository "${model_name}" "${model_path}"
100 |     fi
101 | }
102 | run "${model_name}" "${model_path}" model_init
103 | 
104 | # =========================================
105 | #       ONNX Roberta-base-11 model
106 | # =========================================
107 | model_name=roberta-base-11
108 | model_path="${model_name}.onnx"
109 | # shellcheck disable=SC2317 # Reachable via run() call.
110 | model_init() {
111 |     if [[ ! -f "${model_path}" ]]; then
112 |         wget "https://github.com/onnx/models/tree/857a3434216bd6f2be1ea1ff045fb94a437cbe10/text/machine_comprehension/roberta/model/${model_name}.onnx"
113 |     fi
114 | }
115 | run "${model_name}" "${model_path}" model_init
116 | 
117 | # =========================================
118 | #       tfhub bertseq2seq model
119 | # =========================================
120 | model_name=bertseq2seq
121 | model_path="${model_name}"
122 | # shellcheck disable=SC2317 # Reachable via run() call.
123 | model_init() {
124 |     if [[ ! -d "${model_path}" ]]; then
125 |         wget "https://tfhub.dev/google/bertseq2seq/bert24_en_de/1?tf-hub-format=compressed" -O "${model_path}".tgz
126 |         mkdir -p "${model_path}"
127 |         cd "${model_path}" && tar xvzf ../"${model_path}".tgz && rm ../"${model_path}".tgz && cd -
128 |     fi
129 | }
130 | run "${model_name}" "${model_path}" model_init
131 | 
132 | # =========================================
133 | #       Huggingface bert base model
134 | #       (Tensorflow and PyTorch)
135 | # =========================================
136 | model_name=bert-base-uncased
137 | model_path="${model_name}"
138 | # shellcheck disable=SC2317 # Reachable via run() call.
139 | model_init() {
140 |     if [[ ! -d "${model_path}" ]]; then
141 |         download_hf_repository "${model_name}" "${model_path}"
142 |     fi
143 | }
144 | run "${model_name}" "${model_path}" model_init
145 | 
146 | # =========================================
147 | #           PyTorch falcon-7b model
148 | # =========================================
149 | model_name=tiiuae/falcon-7b
150 | model_path=$(echo "${model_name}" | cut -d/ -f2)
151 | # shellcheck disable=SC2317 # Reachable via run() call.
152 | model_init() {
153 |     if [[ ! -d "${model_path}" ]]; then
154 |         download_hf_repository "${model_name}" "${model_path}"
155 |     fi
156 | }
157 | run "${model_name}" "${model_path}" model_init
158 | 
159 | 
160 | echo
161 | echo "===== RESULTS ======"
162 | # NOTE: Requires bash >= 4.4.
163 | echo "results:" "${!results[@]}"
164 | mapfile -d '' sorted < <(printf '%s\0' "${!results[@]}" | sort -z)
165 | for key in "${sorted[@]}"; do
166 |     echo "$key = ${results[${key}]}"
167 | done
168 | 
169 | deactivate
170 | 


--------------------------------------------------------------------------------
/slsa_for_models/tensorflow_cifar10.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 The Sigstore Authors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | # We will do a lazy import for these 2 modules, exploiting Python's symbol
 17 | # resolution. The lazy import is needed to make sure we only import TensorFlow
 18 | # libraries only if we want to train a TensorFlow model.
 19 | tf = None
 20 | tfds = None
 21 | 
 22 | 
 23 | def pretraining():
 24 |     """Perform setup required before training.
 25 | 
 26 |     Does the lazy loading of TensorFlow too, to prevent compatibility issues
 27 |     with mixing TensorFlow and PyTorch imports.
 28 |     """
 29 |     global tf
 30 |     global tfds
 31 |     import tensorflow as tf
 32 |     import tensorflow_datasets as tfds
 33 |     # Also compile model using XLA for ~20% performance gain
 34 |     tf.config.optimizer.set_jit(True)
 35 | 
 36 | 
 37 | def load_data():
 38 |     """Load the CIFAR10 data.
 39 | 
 40 |     Obtains both the train and the test splits. According to
 41 |     https://www.cs.toronto.edu/~kriz/cifar.html, there should be 50000 training
 42 |     images and 10000 test ones. Each image is 32x32 RGB.
 43 | 
 44 |     Data is normalized to be in [0, 1). Labels are one-hot encoded.
 45 | 
 46 |     Returns train and test pairs. Each pair consists of features and labels
 47 |     vectors of similar size.
 48 |     """
 49 |     result = tfds.load('cifar10', batch_size=-1)
 50 |     x_train = result['train']['image']
 51 |     y_train = result['train']['label']
 52 |     x_test = result['test']['image']
 53 |     y_test = result['test']['label']
 54 | 
 55 |     # transform input
 56 |     x_train = x_train.numpy().astype('float32') / 256
 57 |     x_test = x_test.numpy().astype('float32') / 256
 58 |     y_train = tf.keras.utils.to_categorical(y_train, num_classes=10)
 59 |     y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)
 60 | 
 61 |     return (x_train, y_train), (x_test, y_test)
 62 | 
 63 | 
 64 | def create_model(in_shape):
 65 |     """Create a TensorFlow NN model.
 66 | 
 67 |     The model is taken from the tutorial at
 68 |     https://www.tensorflow.org/xla/tutorials/autoclustering_xla.
 69 | 
 70 |     We need to pass as argument the expected input shape.
 71 | 
 72 |     Returns the model.
 73 |     """
 74 |     x, _, c = in_shape
 75 |     return tf.keras.models.Sequential([
 76 |         tf.keras.layers.Conv2D(x, (c, c), padding='same',
 77 |                                input_shape=in_shape),
 78 |         tf.keras.layers.Activation('relu'),
 79 |         tf.keras.layers.Conv2D(x, (c, c)),
 80 |         tf.keras.layers.Activation('relu'),
 81 |         tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
 82 |         tf.keras.layers.Dropout(0.25),
 83 |         tf.keras.layers.Conv2D(2*x, (c, c), padding='same'),
 84 |         tf.keras.layers.Activation('relu'),
 85 |         tf.keras.layers.Conv2D(2*x, (c, c)),
 86 |         tf.keras.layers.Activation('relu'),
 87 |         tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
 88 |         tf.keras.layers.Dropout(0.25),
 89 |         tf.keras.layers.Flatten(),
 90 |         tf.keras.layers.Dense(512),
 91 |         tf.keras.layers.Activation('relu'),
 92 |         tf.keras.layers.Dropout(0.5),
 93 |         tf.keras.layers.Dense(10),
 94 |         tf.keras.layers.Activation('softmax'),
 95 |     ])
 96 | 
 97 | 
 98 | def prepare_model(model):
 99 |     """Prepare model for training with loss and optimizer."""
100 |     opt = tf.keras.optimizers.RMSprop(learning_rate=0.0001)
101 |     model.compile(loss='categorical_crossentropy',
102 |                   optimizer=opt,
103 |                   metrics=['accuracy'])
104 |     return model
105 | 
106 | 
107 | def train_model(model, train, test):
108 |     """Train a model on the training set.
109 | 
110 |     The test set is used for cross validation.
111 |     """
112 |     x, y = train
113 |     model.fit(x, y, batch_size=256, epochs=16,
114 |               validation_data=test, shuffle=True)
115 | 
116 | 
117 | def score_model(model, test):
118 |     """Score a trained model on the test set."""
119 |     x, y = test
120 |     scores = model.evaluate(x, y, verbose=1)
121 |     print(f'Test loss: {scores[0]}')
122 |     print(f'Test accuracy: {scores[1]}')
123 | 
124 | 
125 | def supported_models():
126 |     """Returns supported model types paired with method to save them."""
127 |     return {
128 |         # New Keras format
129 |         'tensorflow_model.keras': lambda m, p: m.save(p, save_format='keras'),
130 |         # TF SavedModel formats, full model and weights only
131 |         # TODO: Re-enable support for these when SLSA supports directories
132 |         # 'tensorflow_saved_model': lambda m, p: m.save(p, save_format='tf'),
133 |         # 'tensorflow_exported_model': lambda m, p: m.export(p),
134 |         # Legacy HDFS format, full model and weights only
135 |         'tensorflow_hdf5_model.h5': lambda m, p: m.save(p, save_format='h5'),
136 |         'tensorflow_hdf5.weights.h5': lambda m, p: m.save_weights(p),
137 |     }
138 | 
139 | 
140 | def save_model(model, model_format):
141 |     """Save the model after training to be transferred to production.
142 | 
143 |     Saves in the requested format, if supported by TensorFlow.
144 |     """
145 |     saver = supported_models().get(model_format, None)
146 |     if not saver:
147 |         raise ValueError(
148 |             'Requested a model format not supported by TensorFlow')
149 |     saver(model, './' + model_format)
150 | 
151 | 
152 | def model_pipeline(model_format):
153 |     """Train a model and save it in the requested format."""
154 |     pretraining()
155 |     data = load_data()
156 |     model = create_model(data[0][0].shape[1:])
157 |     model = prepare_model(model)
158 |     train_model(model, data[0], data[1])
159 |     score_model(model, data[1])
160 |     save_model(model, model_format)
161 | 


--------------------------------------------------------------------------------
/slsa_for_models/pytorch_cifar10.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 The Sigstore Authors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | # We will do a lazy import for these 7 modules, exploiting Python's symbol
 17 | # resolution. The lazy import is needed to make sure we only import PyTorch
 18 | # libraries only if we want to train a PyTorch model.
 19 | torch = None
 20 | nn = None
 21 | F = None
 22 | optim = None
 23 | torchvision = None
 24 | transforms = None
 25 | 
 26 | 
 27 | def pretraining():
 28 |     """Perform setup required before training.
 29 | 
 30 |     Does the lazy loading of TensorFlow too, to prevent compatibility issues
 31 |     with mixing TensorFlow and PyTorch imports.
 32 |     """
 33 |     global torch
 34 |     global nn
 35 |     global F
 36 |     global optim
 37 |     global torchvision
 38 |     global transforms
 39 |     import torch
 40 |     import torch.nn as nn
 41 |     import torch.nn.functional as F
 42 |     import torch.optim as optim
 43 |     import torchvision
 44 |     import torchvision.transforms as transforms
 45 | 
 46 | 
 47 | def load_data():
 48 |     """Load the CIFAR10 data.
 49 | 
 50 |     Obtains both the train and the test splits. According to
 51 |     https://www.cs.toronto.edu/~kriz/cifar.html, there should be 50000 training
 52 |     images and 10000 test ones. Each image is 32x32 RGB.
 53 | 
 54 |     Data is normalized to be in range [-1, 1].
 55 | 
 56 |     Returns iterators to train and test sets.
 57 |     """
 58 |     transform = transforms.Compose([
 59 |         transforms.ToTensor(),
 60 |         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
 61 |     ])
 62 | 
 63 |     batch_size = 4
 64 |     num_workers = 2
 65 | 
 66 |     trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
 67 |                                             download=True, transform=transform)
 68 |     trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
 69 |                                               shuffle=True,
 70 |                                               num_workers=num_workers)
 71 |     testset = torchvision.datasets.CIFAR10(root='./data', train=False,
 72 |                                            download=True, transform=transform)
 73 |     testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
 74 |                                              shuffle=True,
 75 |                                              num_workers=num_workers)
 76 | 
 77 |     return trainloader, testloader
 78 | 
 79 | 
 80 | def create_model():
 81 |     """Create a Torch NN model.
 82 | 
 83 |     The model is taken from the tutorial at
 84 |     https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html.
 85 | 
 86 |     Returns the model.
 87 |     """
 88 |     # Train a model based on tutorial from
 89 |     # https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html.
 90 |     # We inline the class to be able to use lazy loading of PyTorch modules.
 91 |     class MyModel(nn.Module):
 92 |         def __init__(self):
 93 |             super().__init__()
 94 |             self.conv1 = nn.Conv2d(3, 6, 5)
 95 |             self.pool = nn.MaxPool2d(2, 2)
 96 |             self.conv2 = nn.Conv2d(6, 16, 5)
 97 |             self.fc1 = nn.Linear(16 * 5 * 5, 120)
 98 |             self.fc2 = nn.Linear(120, 84)
 99 |             self.fc3 = nn.Linear(84, 10)
100 | 
101 |         def forward(self, x):
102 |             x = self.pool(F.relu(self.conv1(x)))
103 |             x = self.pool(F.relu(self.conv2(x)))
104 |             x = torch.flatten(x, 1)
105 |             x = F.relu(self.fc1(x))
106 |             x = F.relu(self.fc2(x))
107 |             x = self.fc3(x)
108 |             return x
109 | 
110 |     return MyModel()
111 | 
112 | 
113 | def prepare_model(model):
114 |     """Prepare model for training with loss and optimizer."""
115 |     # We only need to return loss and optimizer
116 |     loss = nn.CrossEntropyLoss()
117 |     optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
118 |     return loss, optimizer
119 | 
120 | 
121 | def train_model(model, loss, optimizer, train):
122 |     """Train a model on the training set."""
123 |     num_epochs = 2
124 |     batch_size = 2000
125 |     for epoch in range(num_epochs):
126 |         running_loss = 0.0
127 |         for i, data in enumerate(train, 1):
128 |             x, y = data
129 |             optimizer.zero_grad()
130 |             outputs = model(x)
131 |             loss_score = loss(outputs, y)
132 |             loss_score.backward()
133 |             optimizer.step()
134 |             running_loss += loss_score.item()
135 |             if i % batch_size == 0:
136 |                 print(f'[{epoch}, {i:5d}], '
137 |                       f'loss: {running_loss / batch_size :.3f}')
138 |                 running_loss = 0.0
139 | 
140 | 
141 | def score_model(model, test):
142 |     """Score a trained model on the test set."""
143 |     correct = 0
144 |     total = 0
145 |     with torch.no_grad():
146 |         for data in test:
147 |             x, y = data
148 |             outputs = model(x)
149 |             _, predicted = torch.max(outputs.data, 1)
150 |             total += y.size(0)
151 |             correct += (predicted == y).sum().item()
152 |     print(f'Test accuracy: {correct / total}')
153 | 
154 | 
155 | def supported_models():
156 |     """Returns supported model types paired with method to save them."""
157 |     return {
158 |         'pytorch_model.pth': lambda m, p: torch.save(m.state_dict(), p),
159 |         'pytorch_full_model.pth': lambda m, p: torch.save(m, p),
160 |         'pytorch_jitted_model.pt': lambda m, p: torch.jit.script(m).save(p),
161 |     }
162 | 
163 | 
164 | def save_model(model, model_format):
165 |     """Save the model after training to be transferred to production.
166 | 
167 |     Saves in the requested format, if supported by PyTorch.
168 |     """
169 |     saver = supported_models().get(model_format, None)
170 |     if not saver:
171 |         raise ValueError('Requested a model format not supported by PyTorch')
172 |     saver(model, './' + model_format)
173 | 
174 | 
175 | def model_pipeline(model_format):
176 |     """Train a model and save it in the requested format."""
177 |     pretraining()
178 |     data = load_data()
179 |     model = create_model()
180 |     loss, optimizer = prepare_model(model)
181 |     train_model(model, loss, optimizer, data[0])
182 |     score_model(model, data[1])
183 |     save_model(model, model_format)
184 | 


--------------------------------------------------------------------------------
/model_signing/model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 The Sigstore Authors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | from sigstore.sign import SigningContext
 16 | 
 17 | from sigstore.oidc import (
 18 |     IdentityToken,
 19 |     ExpiredIdentity,
 20 |     Issuer,
 21 |     detect_credential,
 22 | )
 23 | from sigstore_protobuf_specs.dev.sigstore.bundle.v1 import Bundle
 24 | from sigstore.verify import (
 25 |     policy,
 26 |     Verifier,
 27 | )
 28 | from sigstore.verify.models import (
 29 |     VerificationMaterials,
 30 | )
 31 | 
 32 | from sigstore._internal.fulcio.client import (
 33 |     ExpiredCertificate,
 34 | )
 35 | 
 36 | import io
 37 | from pathlib import Path
 38 | from typing import Optional
 39 | from serialize import Serializer
 40 | import psutil
 41 | import sys
 42 | 
 43 | 
 44 | def chunk_size() -> int:
 45 |     return int(psutil.virtual_memory().available // 2)
 46 | 
 47 | 
 48 | # TODO: Update this class to have a status instead of success.
 49 | class BaseResult:
 50 |     def __init__(self, success: bool = True, reason: str = "success"):
 51 |         self.success = success
 52 |         self.reason = reason
 53 | 
 54 |     def __bool__(self) -> bool:
 55 |         return self.success
 56 | 
 57 |     def __str__(self) -> str:
 58 |         return f"success=\"{self.success}\" reason=\"{self.reason}\""
 59 | 
 60 | 
 61 | class SignatureResult(BaseResult):
 62 |     pass
 63 | 
 64 | 
 65 | class SigstoreSigner():
 66 |     def __init__(self,
 67 |                  disable_ambient: bool = False,
 68 |                  start_default_browser: bool = False,
 69 |                  oidc_issuer: str = None):
 70 |         self.signing_ctx = SigningContext.production()
 71 |         self.disable_ambient = disable_ambient
 72 |         self.start_default_browser = start_default_browser
 73 |         self.oidc_issuer = oidc_issuer
 74 |         # NOTE: The client ID to use during OAuth2 flow.
 75 |         self.client_id = "sigstore"
 76 | 
 77 |     def get_identity_token(self) -> Optional[IdentityToken]:
 78 |         token: IdentityToken
 79 |         client_id = self.client_id
 80 |         if not self.disable_ambient:
 81 |             token = detect_credential()
 82 |             # Happy path: we've detected an ambient credential,
 83 |             # so we can return early.
 84 |             if token:
 85 |                 return IdentityToken(token)
 86 | 
 87 |         # TODO(): Support staging for testing.
 88 |         if self.oidc_issuer is not None:
 89 |             issuer = Issuer(self.oidc_issuer)
 90 |         else:
 91 |             issuer = Issuer.production()
 92 | 
 93 |         token = issuer.identity_token(client_id=client_id,
 94 |                                       force_oob=not self.start_default_browser)
 95 |         return token
 96 | 
 97 |     # NOTE: Only path in the top-level folder are considered for ignorepaths.
 98 |     def sign(self, inputfn: Path, signaturefn: Path,
 99 |              ignorepaths: [Path]) -> SignatureResult:
100 |         try:
101 |             oidc_token = self.get_identity_token()
102 |             if not oidc_token:
103 |                 raise ValueError("No identity token supplied or detected!")
104 |             # Calling the private attribute IdentityToken._federated issuer
105 |             # is a workaround for earlier versions of sigstore-python (<3.0.0)
106 |             # that do not support the federated_issuer property.
107 |             print(f"identity-provider: {oidc_token._federated_issuer}",
108 |                   file=sys.stderr)
109 |             print(f"identity: {oidc_token.identity}", file=sys.stderr)
110 | 
111 |             contentio = io.BytesIO(Serializer.serialize_v1(
112 |                 inputfn, chunk_size(), signaturefn, ignorepaths))
113 |             with self.signing_ctx.signer(oidc_token) as signer:
114 |                 result = signer.sign(input_=contentio)
115 |                 with signaturefn.open(mode="w") as b:
116 |                     print(result.to_bundle().to_json(), file=b)
117 |             return SignatureResult()
118 |         except ExpiredIdentity:
119 |             return SignatureResult(success=False,
120 |                                    reason="exception caught: Signature failed: identity token has expired")  # noqa: E501
121 |         except ExpiredCertificate:
122 |             return SignatureResult(success=False,
123 |                                    reason="exception caught: Signature failed: Fulcio signing certificate has expired")  # noqa: E501
124 |         except Exception as e:
125 |             return SignatureResult(success=False,
126 |                                    reason=f"exception caught: {str(e)}")
127 | 
128 | 
129 | # TODO: re-visit error handling and use a verbosity mode
130 | # to avoid leaking info
131 | class VerificationResult(BaseResult):
132 |     pass
133 | 
134 | 
135 | class SigstoreVerifier():
136 |     def __init__(self, oidc_provider: str, identity: str):
137 |         self.oidc_provider = oidc_provider
138 |         self.identity = identity
139 |         self.verifier = Verifier.production()
140 | 
141 |     # NOTE: Only path in the top-level folder are considered for ignorepaths.
142 |     def verify(self, inputfn: Path, signaturefn: Path,
143 |                ignorepaths: [Path], offline: bool) -> VerificationResult:
144 |         try:
145 |             bundle_bytes = signaturefn.read_bytes()
146 |             bundle = Bundle().from_json(bundle_bytes)
147 | 
148 |             material: tuple[Path, VerificationMaterials]
149 |             contentio = io.BytesIO(Serializer.serialize_v1(
150 |                 inputfn, chunk_size(), signaturefn, ignorepaths))
151 |             material = VerificationMaterials.from_bundle(input_=contentio,
152 |                                                          bundle=bundle,
153 |                                                          offline=offline)
154 |             policy_ = policy.Identity(
155 |                 identity=self.identity,
156 |                 issuer=self.oidc_provider,
157 |             )
158 |             result = self.verifier.verify(materials=material, policy=policy_)
159 |             if result:
160 |                 return VerificationResult()
161 |             return VerificationResult(success=False, reason=result.reason)
162 |         except Exception as e:
163 |             return VerificationResult(success=False,
164 |                                       reason=f"exception caught: {str(e)}")
165 |         raise ValueError("unreachable")
166 | 


--------------------------------------------------------------------------------
/slsa_for_models/gcp/README.md:
--------------------------------------------------------------------------------
  1 | # SLSA for Models on Google Cloud Platform
  2 | 
  3 | This project uses [Tekton][tekton] to generate SLSA provenance for ML models on
  4 | Google Cloud Platform (GCP). It uses [Google Kubernetes Engine][gke] (GKE),
  5 | [Artifact Registry][ar], [Tekton] and [Sigstore].
  6 | 
  7 | ## Guide
  8 | 
  9 | 1. To get started, you'll need to have a [GCP Project][gcp]. You will also need
 10 |    to have these CLI tools installed:
 11 |    - [`gcloud`][gcloud]
 12 |    - [`kubectl`][kubectl]
 13 |    - [`tkn`][tkn]
 14 |    - [`cosign`][cosign]
 15 | 
 16 | 2. Enable the needed services:
 17 | 
 18 |    ```bash
 19 |    gcloud services enable \
 20 |      container.googleapis.com \
 21 |      artifactregistry.googleapis.com
 22 |    ```
 23 | 
 24 | 3. Create a GKE cluster:
 25 | 
 26 |     1. Set the `PROJECT_ID` environment variable from your GCP project:
 27 | 
 28 |        ```bash
 29 |        export PROJECT_ID=<PROJECT_ID>
 30 |        ```
 31 | 
 32 |    2. Set the `CLUSTER_NAME` environment variable to a cluster name of your
 33 |       choice:
 34 | 
 35 |       ```bash
 36 |       export CLUSTER_NAME=<CLUSTER_NAME>
 37 |       ```
 38 | 
 39 |    3. Create a cluster:
 40 | 
 41 |        ```bash
 42 |        gcloud container clusters create $CLUSTER_NAME \
 43 |          --enable-autoscaling \
 44 |          --min-nodes=1 \
 45 |          --max-nodes=3 \
 46 |          --scopes=cloud-platform \
 47 |          --no-issue-client-certificate \
 48 |          --project=$PROJECT_ID \
 49 |          --region=us-central1 \
 50 |          --machine-type=e2-standard-4 \
 51 |          --num-nodes=1 \
 52 |          --cluster-version=latest
 53 |        ```
 54 | 
 55 | 4. Install Tekton:
 56 | 
 57 |    1. Install Tekton Pipelines:
 58 | 
 59 |        ```bash
 60 |        kubectl apply --filename https://storage.googleapis.com/tekton-releases/pipeline/latest/release.yaml
 61 |        ```
 62 | 
 63 |    2. Install Tekton Chains:
 64 | 
 65 |        ```bash
 66 |        kubectl apply --filename https://storage.googleapis.com/tekton-releases/chains/latest/release.yaml
 67 |        ```
 68 | 
 69 | 5. Verify your Tekton installation was successful:
 70 | 
 71 |    1. Check that Tekton Pipelines Pods are running in Kubernetes:
 72 | 
 73 |        ```bash
 74 |        kubectl get pods -n tekton-pipelines
 75 |        ```
 76 | 
 77 |    2. Check that Tekton Chains Pods are running in Kubernetes:
 78 | 
 79 |       ```bash
 80 |       kubectl get pods -n tekton-chains
 81 |       ```
 82 | 
 83 | 6. Configure Tekton:
 84 | 
 85 |    1. Configure Tekton Pipelines to enable enumerations and alpha features:
 86 | 
 87 |        ```bash
 88 |        kubectl patch cm feature-flags -n tekton-pipelines -p '{"data":{
 89 |          "enable-param-enum":"true",
 90 |          "enable-api-fields":"alpha"
 91 |          }}'
 92 |        ```
 93 | 
 94 |    2. Then restart the Tekton Pipelines controller to ensure it picks up the
 95 |       changes:
 96 | 
 97 |       ```bash
 98 |       kubectl delete pods -n tekton-pipelines -l app=tekton-pipelines-controller
 99 |       ```
100 | 
101 |    3. Configure Tekton Chains to enable transparency log, set SLSA format and
102 |       configure storage:
103 | 
104 |       ```bash
105 |       kubectl patch configmap chains-config -n tekton-chains -p='{"data":{
106 |         "transparency.enabled": "true",
107 |         "artifacts.taskrun.format":"slsa/v2alpha2",
108 |         "artifacts.taskrun.storage": "tekton",
109 |         "artifacts.pipelinerun.format":"slsa/v2alpha2",
110 |         "artifacts.pipelinerun.storage": "tekton"
111 |         }}'
112 |       ```
113 |    4. Then restart the Tekton Chains controller to ensure it picks up the
114 |       changes:
115 | 
116 |       ```bash
117 |       kubectl delete pods -n tekton-chains -l app=tekton-chains-controller
118 |       ```
119 | 
120 | 7. Generate an encrypted x509 keypair and save it as a Kubernetes secret:
121 | 
122 |    ```bash
123 |    cosign generate-key-pair k8s://tekton-chains/signing-secrets
124 |    ```
125 | 
126 | 8. (Optional) View the Tekton resources:
127 | 
128 |    1. View the git-clone `Task`:
129 | 
130 |       ```bash
131 |       cat slsa_for_models/gcp/tasks/git-clone.yml
132 |       ```
133 | 
134 |    2. View the build-model `Task`:
135 | 
136 |       ```bash
137 |       cat slsa_for_models/gcp/tasks/build-model.yml
138 |       ```
139 | 
140 |    3. View the upload-model `Task`:
141 | 
142 |       ```bash
143 |       cat slsa_for_models/gcp/tasks/upload-model.yml
144 |       ```
145 | 
146 |    4. View the `Pipeline`:
147 | 
148 |       ```bash
149 |       cat slsa_for_models/gcp/pipeline.yml
150 |       ```
151 | 
152 |    5. View the `PipelineRun`:
153 | 
154 |       ```bash
155 |       cat slsa_for_models/gcp/pipelinerun.yml
156 |       ```
157 | 
158 | 9.  Apply the `Pipeline`:
159 | 
160 |    ```bash
161 |    kubectl apply -f slsa_for_models/gcp/pipeline.yml
162 |    ```
163 | 
164 | 10. Create a generic repository in Artifact Registry:
165 | 
166 |     1. Set the `REPOSITORY_NAME` environment variable to a name of your choice:
167 | 
168 |        ```bash
169 |        export REPOSITORY_NAME=ml-artifacts
170 |        ```
171 | 
172 |     2. Set the `LOCATION` environment variable to a [location] of your choice:
173 | 
174 |        ```bash
175 |        export LOCATION=us
176 |        ```
177 | 
178 |     3. Create a generic repository:
179 |         ```bash
180 |         gcloud artifacts repositories create $REPOSITORY_NAME \
181 |           --location=$LOCATION \
182 |           --repository-format=generic
183 |         ```
184 | 
185 |     4. If you set a different repository name and location from the example
186 |        above, make sure to modify the `Parameter` named 'model-storage' in the
187 |        `PipelineRun` with your own values.
188 | 
189 | 11. Execute the `PipelineRun`:
190 | 
191 |     ```bash
192 |     kubectl create -f slsa_for_models/gcp/pipelinerun.yml
193 |     ```
194 | 
195 | 12. Observe the `PipelineRun` execution:
196 | 
197 |     ```bash
198 |     export PIPELINERUN_NAME=$(tkn pr describe --last --output jsonpath='{.metadata.name}')
199 |     tkn pipelinerun logs $PIPELINERUN_NAME --follow
200 |     ```
201 | 
202 | 13. When the `PipelineRun` succeeds, view its status:
203 | 
204 |     ```bash
205 |     kubectl get pipelinerun $PIPELINERUN_NAME --output yaml
206 |     ```
207 | 
208 | 14. View the transparency log entry in the public [Rekor][rekor] instance:
209 | 
210 |    ```bash
211 |    export TLOG_ENTRY=$(tkn pr describe $PIPELINERUN_NAME --output jsonpath="{.metadata.annotations.chains\.tekton\.dev/transparency}")
212 |    open $TLOG_ENTRY
213 |    ```
214 | 
215 | 15. Retrieve the attestation from the `PipelineRun` which is stored as a base64-encoded annotation:
216 | 
217 |    ```bash
218 |    export PIPELINERUN_UID=$(tkn pr describe $PIPELINERUN_NAME --output  jsonpath='{.metadata.uid}')
219 |    tkn pr describe $PIPELINERUN_NAME --output jsonpath="{.metadata.annotations.chains\.tekton\.dev/signature-pipelinerun-$PIPELINERUN_UID}" | base64 -d > pytorch_model.pth.build-slsa
220 |    ```
221 | 
222 | 16. View the attestation:
223 | 
224 |    ```bash
225 |    cat pytorch_model.pth.build-slsa | tr -d '\n' | pbcopy
226 |    pbpaste | jq '.payload | @base64d | fromjson'
227 |    ```
228 | 
229 | 17. Download the model:
230 | 
231 |    ```bash
232 |    export MODEL_VERSION=$(tkn pr describe $PIPELINERUN_NAME --output jsonpath='{.status.results[1].value.digest}' | cut -d ':' -f 2)
233 |    gcloud artifacts generic download \
234 |      --package=pytorch-model \
235 |      --repository=$REPOSITORY_NAME \
236 |      --destination=. \
237 |      --version=$MODEL_VERSION
238 |    ```
239 | 
240 | 18. Verify the attestation:
241 | 
242 |    ```bash
243 |    cosign verify-blob-attestation \
244 |      --key k8s://tekton-chains/signing-secrets \
245 |      --signature pytorch_model.pth.build-slsa \
246 |      --type slsaprovenance1 \
247 |      pytorch_model.pth
248 |    ```
249 | 
250 | ### Kubeflow on Tekton
251 | 
252 | Provide a [Kubeflow Pipeline](#../kubeflow/README.md) that can be compiled into the above Tekton Pipeline
253 | using [Kubeflow on Tekton][tekton-kubeflow].
254 | 
255 | ## Future Work
256 | 
257 | ### Automate Provenance Verification
258 | 
259 | Demonstrate how to verify the provenance of the model before deploying and
260 | serving the model.
261 | 
262 | ### Automated Testing
263 | 
264 | Trigger execution of the `PipelineRun` whenever changes are made in the
265 | codebase.
266 | 
267 | 
268 | ### Accelerators
269 | 
270 | Demonstrate training ML models that require multiple hours for training and
271 | require access to accelerators (i.e., GPUs, TPUs).
272 | 
273 | [gcp]: https://cloud.google.com/docs/get-started
274 | [gcloud]: https://cloud.google.com/sdk/docs/install
275 | [kubectl]: https://kubernetes.io/docs/tasks/tools/
276 | [tkn]: https://tekton.dev/docs/cli/
277 | [cosign]: https://docs.sigstore.dev/system_config/installation/
278 | [tekton-kubeflow]: https://www.kubeflow.org/docs/components/pipelines/v1/sdk/pipelines-with-tekton/
279 | [tekton-chains]: https://tekton.dev/docs/chains/
280 | [tekton]: https://tekton.dev/docs/
281 | [rekor]: https://rekor.sigstore.dev
282 | [location]: https://cloud.google.com/artifact-registry/docs/repositories/repo-locations
283 | [gke]: https://cloud.google.com/kubernetes-engine?hl=en
284 | [ar]: https://cloud.google.com/artifact-registry
285 | [sigstore]: https://docs.sigstore.dev
286 | 


--------------------------------------------------------------------------------
/model_signing/hashing/file.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 The Sigstore Authors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """Machinery for computing digests for a single file.
 16 | 
 17 | Example usage for `FileHasher`:
 18 | ```python
 19 | >>> with open("/tmp/file", "w") as f:
 20 | ...     f.write("abcd")
 21 | >>> hasher = FileHasher("/tmp/file", SHA256())
 22 | >>> digest = hasher.compute()
 23 | >>> digest.digest_hex
 24 | '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
 25 | ```
 26 | 
 27 | Example usage for `ShardedFileHasher`, reading only the second part of a file:
 28 | ```python
 29 | >>> with open("/tmp/file", "w") as f:
 30 | ...     f.write("0123abcd")
 31 | >>> hasher = ShardedFileHasher("/tmo/file", SHA256(), start=4, end=8)
 32 | >>> digest = hasher.compute()
 33 | >>> digest.digest_hex
 34 | '88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589'
 35 | ```
 36 | """
 37 | 
 38 | import pathlib
 39 | from typing_extensions import override
 40 | 
 41 | from model_signing.hashing import hashing
 42 | 
 43 | 
 44 | class FileHasher(hashing.HashEngine):
 45 |     """Generic file hash engine.
 46 | 
 47 |     To compute the hash of a file, we read the file exactly once, including for
 48 |     very large files that don't fit in memory. Files are read in chunks and each
 49 |     chunk is passed to the `update` method of an inner
 50 |     `hashing.StreamingHashEngine`, instance. This ensures that the file digest
 51 |     will not change even if the chunk size changes. As such, we can dynamically
 52 |     determine an optimal value for the chunk argument.
 53 | 
 54 |     The `digest_name()` method MUST record all parameters that influence the
 55 |     hash output. For example, if a file is split into shards which are hashed
 56 |     separately and the final digest value is computed by aggregating these
 57 |     hashes, then the shard size must be given in the output string. However, for
 58 |     simplicity, predefined names can be used to override the `digest_name()`
 59 |     output.
 60 |     """
 61 | 
 62 |     def __init__(
 63 |         self,
 64 |         file: pathlib.Path,
 65 |         content_hasher: hashing.StreamingHashEngine,
 66 |         *,
 67 |         chunk_size: int = 8192,
 68 |         digest_name_override: str | None = None,
 69 |     ):
 70 |         """Initializes an instance to hash a file with a specific `HashEngine`.
 71 | 
 72 |         Args:
 73 |             file: The file to hash. Use `set_file` to reset it.
 74 |             content_hasher: A `hashing.StreamingHashEngine` instance used to
 75 |               compute the digest of the file.
 76 |             chunk_size: The amount of file to read at once. Default is 8KB. A
 77 |               special value of 0 signals to attempt to read everything in a
 78 |               single call.
 79 |             digest_name_override: Optional string to allow overriding the
 80 |               `digest_name` property to support shorter, standardized names.
 81 |         """
 82 |         if chunk_size < 0:
 83 |             raise ValueError(
 84 |                 f"Chunk size must be non-negative, got {chunk_size}."
 85 |             )
 86 | 
 87 |         self._file = file
 88 |         self._content_hasher = content_hasher
 89 |         self._chunk_size = chunk_size
 90 |         self._digest_name_override = digest_name_override
 91 | 
 92 |     def set_file(self, file: pathlib.Path) -> None:
 93 |         """Redefines the file to be hashed in `compute`."""
 94 |         self._file = file
 95 | 
 96 |     @override
 97 |     @property
 98 |     def digest_name(self) -> str:
 99 |         if self._digest_name_override is not None:
100 |             return self._digest_name_override
101 |         return f"file-{self._content_hasher.digest_name}"
102 | 
103 |     @override
104 |     def compute(self) -> hashing.Digest:
105 |         self._content_hasher.reset()
106 | 
107 |         if self._chunk_size == 0:
108 |             with open(self._file, "rb") as f:
109 |                 self._content_hasher.update(f.read())
110 |         else:
111 |             with open(self._file, "rb") as f:
112 |                 while True:
113 |                     data = f.read(self._chunk_size)
114 |                     if not data:
115 |                         break
116 |                     self._content_hasher.update(data)
117 | 
118 |         digest = self._content_hasher.compute()
119 |         return hashing.Digest(self.digest_name, digest.digest_value)
120 | 
121 | 
122 | class ShardedFileHasher(FileHasher):
123 |     """File hash engine that can be invoked in parallel.
124 | 
125 |     To efficiently support hashing large files, this class provides an ability
126 |     to compute the digest over a shard of the file. It is the responsibility of
127 |     the user to compose the digests of each shard into a single digest for the
128 |     entire file.
129 |     """
130 | 
131 |     def __init__(
132 |         self,
133 |         file: pathlib.Path,
134 |         content_hasher: hashing.StreamingHashEngine,
135 |         *,
136 |         start: int,
137 |         end: int,
138 |         chunk_size: int = 8192,
139 |         shard_size: int = 1000000,
140 |         digest_name_override: str | None = None,
141 |     ):
142 |         """Initializes an instance to hash a file with a specific `HashEngine`.
143 | 
144 |         Args:
145 |             file: The file to hash. Use `set_file` to reset it.
146 |             content_hasher: A `hashing.HashEngine` instance used to compute the
147 |               digest of the file. This instance must not be used outside of this
148 |               instance. However, it may be pre-initialized with a header.
149 |             start: The file offset to start reading from. Must be valid. Reset
150 |               with `set_shard`.
151 |             end: The file offset to start reading from. Must be stricly greater
152 |               than start. If past the file size, or -1, it will be trimmed.
153 |               Reset with `set_shard`.
154 |             chunk_size: The amount of file to read at once. Default is 8KB. A
155 |               special value of 0 signals to attempt to read everything in a
156 |               single call.
157 |             shard_size: The amount of file to read at once. Default is 8KB.
158 |             digest_name_override: Optional string to allow overriding the
159 |               `digest_name` property to support shorter, standardized names.
160 |         """
161 |         super().__init__(
162 |             file=file,
163 |             content_hasher=content_hasher,
164 |             chunk_size=chunk_size,
165 |             digest_name_override=digest_name_override,
166 |         )
167 | 
168 |         if shard_size <= 0:
169 |             raise ValueError(
170 |                 f"Shard size must be strictly positive, got {shard_size}."
171 |             )
172 |         self._shard_size = shard_size
173 | 
174 |         self.set_shard(start=start, end=end)
175 | 
176 |     def set_shard(self, *, start: int, end: int) -> None:
177 |         """Redefines the file shard to be hashed in `compute`."""
178 |         if start < 0:
179 |             raise ValueError(
180 |                 f"File start offset must be non-negative, got {start}."
181 |             )
182 |         if end <= start:
183 |             raise ValueError(
184 |                 "File end offset must be stricly higher that file start offset,"
185 |                 f" got {start=}, {end=}."
186 |             )
187 |         read_length = end - start
188 |         if read_length > self._shard_size:
189 |             raise ValueError(
190 |                 f"Must not read more than shard_size={self._shard_size}, got"
191 |                 f" {read_length}."
192 |             )
193 | 
194 |         self._start = start
195 |         self._end = end
196 | 
197 |     @override
198 |     def compute(self) -> hashing.Digest:
199 |         self._content_hasher.reset()
200 | 
201 |         with open(self._file, "rb") as f:
202 |             f.seek(self._start)
203 |             to_read = self._end - self._start
204 |             if self._chunk_size == 0 or self._chunk_size >= to_read:
205 |                 data = f.read(to_read)
206 |                 self._content_hasher.update(data)
207 |             else:
208 |                 while to_read >= 0:
209 |                     data = f.read(min(self._chunk_size, to_read))
210 |                     if not data:
211 |                         break
212 |                     to_read -= len(data)
213 |                     self._content_hasher.update(data)
214 | 
215 |         digest = self._content_hasher.compute()
216 |         return hashing.Digest(self.digest_name, digest.digest_value)
217 | 
218 |     @override
219 |     @property
220 |     def digest_name(self) -> str:
221 |         if self._digest_name_override is not None:
222 |             return self._digest_name_override
223 |         return f"file-{self._content_hasher.digest_name}-{self._shard_size}"
224 | 


--------------------------------------------------------------------------------
/model_signing/README.md:
--------------------------------------------------------------------------------
  1 | # Model Signing
  2 | 
  3 | This project demonstrates how to protect the integrity of a model by signing it
  4 | with [Sigstore](https://www.sigstore.dev/), a tool for making code signatures
  5 | transparent without requiring management of cryptographic key material.
  6 | 
  7 | When users download a given version of a signed model they can check that the
  8 | signature comes from a known or trusted identity and thus that the model hasn't
  9 | been tampered with after training.
 10 | 
 11 | Signing events are recorded to Sigstore's append-only transparency log.
 12 | Transparency logs make signing events discoverable: Model verifiers can validate
 13 | that the models they are looking at exist in the transparency log by checking a
 14 | proof of inclusion (which is handled by the model signing library).
 15 | Furthermore, model signers that monitor the log can check for any unexpected
 16 | signing events.
 17 | 
 18 | Model signers should monitor for occurences of their signing identity in the
 19 | log. Sigstore is actively developing a [log
 20 | monitor](https://github.com/sigstore/rekor-monitor) that runs on GitHub Actions.
 21 | 
 22 | ![Signing models with Sigstore](images/sigstore-model-diagram.png)
 23 | 
 24 | ## Usage
 25 | 
 26 | You will need to install a few prerequisites to be able to run all of the
 27 | examples below:
 28 | 
 29 | ```bash
 30 | sudo apt install git git-lfs python3-venv python3-pip unzip
 31 | git lfs install
 32 | ```
 33 | 
 34 | After this, you can clone the repository, create a Python virtual environment
 35 | and install the dependencies needed by the project:
 36 | 
 37 | ```bash
 38 | git clone git@github.com:sigstore/model-transparency.git
 39 | cd model-transparency/model_signing
 40 | python3 -m venv test_env
 41 | source test_env/bin/activate
 42 | os=Linux # Supported: Linux, Windows, Darwin.
 43 | python3 -m pip install --require-hashes -r "install/requirements_${os}".txt
 44 | ```
 45 | 
 46 | After this point, you can use the project to sign and verify models and
 47 | checkpoints. A help message with all arguments can be obtained by passing `-h`
 48 | argument, either to the main driver or to the two subcommands:
 49 | 
 50 | ```bash
 51 | python3 main.py -h
 52 | python3 main.py sign -h
 53 | python3 main.py verify -h
 54 | ```
 55 | 
 56 | Signing a model requires passing an argument for the path to the model. This can
 57 | be a path to a file or a directory (for large models, or model formats such as
 58 | `SavedModel` which are stored as a directory of related files):
 59 | 
 60 | ```bash
 61 | path=path/to/model
 62 | python3 main.py sign --path "${path}"
 63 | ```
 64 | 
 65 | The sign process will start an OIDC workflow to generate a short lived
 66 | certificate based on an identity provider. This will be relevant when verifying
 67 | the signature, as shown below.
 68 | 
 69 | **Note**: The signature is stored as `<file>.sig` for a model serialized as a
 70 | single file, and `<dir>/model.sig` for a model in a folder-based format.
 71 | 
 72 | For verification, we need to pass both the path to the model and identity
 73 | related arguments:
 74 | 
 75 | ```bash
 76 | python3 main.py verify --path "${path}" \
 77 |     --identity-provider https://accounts.google.com \
 78 |     --identity myemail@gmail.com
 79 | ```
 80 | 
 81 | For developers signing models, there are three identity providers that can
 82 | be used at the moment:
 83 | 
 84 | * Google's provider is `https://accounts.google.com`.
 85 | * GitHub's provider is `https://github.com/login/oauth`.
 86 | * Microsoft's provider is `https://login.microsoftonline.com`.
 87 | 
 88 | For automated signing using a workload identity, the following platforms
 89 | are currently supported, shown with their expected identities:
 90 | 
 91 | * GitHub Actions
 92 |   (`https://github.com/octo-org/octo-automation/.github/workflows/oidc.yml@refs/heads/main`)
 93 | * GitLab CI
 94 |   (`https://gitlab.com/my-group/my-project//path/to/.gitlab-ci.yml@refs/heads/main`)
 95 | * Google Cloud Platform (`SERVICE_ACCOUNT_NAME@PROJECT_ID.iam.gserviceaccount.com`)
 96 | * Buildkite CI (`https://buildkite.com/ORGANIZATION_SLUG/PIPELINE_SLUG`)
 97 | 
 98 | ### Supported Models
 99 | 
100 | The library supports multiple models, from multiple training frameworks and
101 | model hubs.
102 | 
103 | For example, to sign and verify a Bertseq2seq model, trained with TensorFlow,
104 | stored in TFHub, run the following commands:
105 | 
106 | ```bash
107 | model_path=bertseq2seq
108 | wget "https://tfhub.dev/google/bertseq2seq/bert24_en_de/1?tf-hub-format=compressed" -O "${model_path}".tgz
109 | mkdir -p "${model_path}"
110 | cd "${model_path}" && tar xvzf ../"${model_path}".tgz && rm ../"${model_path}".tgz && cd -
111 | python3 main.py sign --path "${model_path}"
112 | python3 main.py verify --path "${model_path}" \
113 |     --identity-provider https://accounts.google.com \
114 |     --identity myemail@gmail.com
115 | ```
116 | 
117 | For models stored in Hugging Face we need the large file support from git, which
118 | can be obtained via
119 | 
120 | ```bash
121 | sudo apt install git-lfs
122 | git lfs install
123 | ```
124 | 
125 | After this, we can sign and verify a Bert base model:
126 | 
127 | ```bash
128 | model_name=bert-base-uncased
129 | model_path="${model_name}"
130 | git clone --depth=1 "https://huggingface.co/${model_name}" && rm -rf "${model_name}"/.git
131 | python3 main.py sign --path "${model_path}"
132 | python3 main.py verify --path "${model_path}" \
133 |     --identity-provider https://accounts.google.com \
134 |     --identity myemail@gmail.com
135 | ```
136 | 
137 | Similarly, we can sign and verify a Falcon model:
138 | 
139 | ```bash
140 | model_name=tiiuae/falcon-7b
141 | model_path=$(echo "${model_name}" | cut -d/ -f2)
142 | git clone --depth=1 "https://huggingface.co/${model_name}" && rm -rf "${model_name}"/.git
143 | python3 main.py sign --path "${model_path}"
144 | python3 main.py verify --path "${model_path}" \
145 |     --identity-provider https://accounts.google.com \
146 |     --identity myemail@gmail.com
147 | ```
148 | 
149 | We can also support models from  the PyTorch Hub:
150 | 
151 | ```bash
152 | model_name=hustvl/YOLOP
153 | model_path=$(echo "${model_name}" | cut -d/ -f2)
154 | wget "https://github.com/${model_name}/archive/main.zip" -O "${model_path}".zip
155 | mkdir -p "${model_path}"
156 | cd "${model_path}" && unzip ../"${model_path}".zip && rm ../"${model_path}".zip && shopt -s dotglob && mv YOLOP-main/* . && shopt -u dotglob && rmdir YOLOP-main/ && cd -
157 | python3 main.py sign --path "${model_path}"
158 | python3 main.py verify --path "${model_path}" \
159 |     --identity-provider https://accounts.google.com \
160 |     --identity myemail@gmail.com
161 | ```
162 | 
163 | We also support ONNX models, for example Roberta:
164 | 
165 | ```bash
166 | model_name=roberta-base-11
167 | model_path="${model_name}.onnx"
168 | wget "https://github.com/onnx/models/raw/main/text/machine_comprehension/roberta/model/${model_name}.onnx"
169 | python3 main.py sign --path "${model_path}"
170 | python3 main.py verify --path "${model_path}" \
171 |     --identity-provider https://accounts.google.com \
172 |     --identity myemail@gmail.com
173 | ```
174 | 
175 | ## Benchmarking
176 | 
177 | Install as per [Usage section](#usage).
178 | Ensure you have enough disk space:
179 | - if passing 3rd script argument as `true`: at least 50GB
180 | - otherwise: at least 100GB
181 | 
182 | To run the benchmarks:
183 | 
184 | ```bash
185 | git clone git@github.com:sigstore/model-transparency.git
186 | cd model-transparency/model_signing
187 | bash benchmarks/run.sh https://accounts.google.com myemail@gmail.com [true]
188 | ```
189 | 
190 | A single run was performed.
191 | 
192 | Hashes used:
193 | - H1: Hashing using a tree representation of the directory.
194 | - H2: Hashing using a list representation of the directory. (Implementation is parallized with shards of 1GB sizes across vCPUs).
195 | 
196 | Machine M1: Debian 6.3.11 x86_64 GNU/Linux, 200GB RAM, 48 vCPUs, 512KB cache, AMD EPYC 7B12:
197 | 
198 | | Hash | Model              | Size  |  Sign Time | Verify Time |
199 | |------|--------------------|-------|:------:|:-----:|
200 | | H1 | roberta-base-11      | 8K    | 0.8s  | 0.6s  |
201 | | H1 | hustvl/YOLOP         | 215M  | 1.2s  | 0.8s  |
202 | | H1 | bertseq2seq          | 2.8G  | 4.6s  | 4.4s  |
203 | | H1 | bert-base-uncased    | 3.3G  | 5s    | 4.7s  |
204 | | H1 | tiiuae/falcon-7b     | 14GB  | 12.2s | 11.8s |
205 | | H2 | roberta-base-11      | 8K    | 1s    | 0.6s  |
206 | | H2 | hustvl/YOLOP         | 215M  | 1s    | 1s    |
207 | | H2 | bertseq2seq          | 2.8G  | 1.9s  | 1.4s  |
208 | | H2 | bert-base-uncased    | 3.3G  | 1.6s  | 1.1s  |
209 | | H2 | tiiuae/falcon-7b     | 14GB  | 2.1s  | 1.8s  |
210 | 
211 | Machine M2: Debian 5.10.1 x86_64 GNU/Linux, 4GB RAM, 2 vCPUs, 56320 KB, Intel(R) Xeon(R) CPU @ 2.20GHz:
212 | 
213 | | Hash | Model              | Size  |  Sign Time | Verify Time |
214 | |------|--------------------|-------|:------:|:-----:|
215 | | H1 | roberta-base-11      | 8K    | 1.1s  | 0.7s  |
216 | | H1 | hustvl/YOLOP         | 215M  | 1.9s  | 1.7s  |
217 | | H1 | bertseq2seq          | 2.8G  | 18s   | 23.2s |
218 | | H1 | bert-base-uncased    | 3.3G  | 23.4s | 18.9s |
219 | | H1 | tiiuae/falcon-7b     | 14GB  | 2m4s | 2m2s   |
220 | | H2 | roberta-base-11      | 8K    | 1.1s  | 0.8s  |
221 | | H2 | hustvl/YOLOP         | 215M  | 1.9s  | 1.6s  |
222 | | H2 | bertseq2seq          | 2.8G  | 13.8s | 25.9s |
223 | | H2 | bert-base-uncased    | 3.3G  | 22.7s | 23.3s |
224 | | H2 | tiiuae/falcon-7b     | 14GB  | 2m.1s | 2m3s  |
225 | 


--------------------------------------------------------------------------------
/slsa_for_models/gcp/tasks/git-clone.yml:
--------------------------------------------------------------------------------
  1 | # copied from https://github.com/tektoncd/catalog/tree/main/task/git-clone/0.7
  2 | # and modified to contain type hinting for provenance generation -- remove when
  3 | # the catalog updates the task to support type hinting
  4 | apiVersion: tekton.dev/v1beta1
  5 | kind: Task
  6 | metadata:
  7 |   name: git-clone
  8 |   labels:
  9 |     app.kubernetes.io/version: "0.7"
 10 |   annotations:
 11 |     tekton.dev/pipelines.minVersion: "0.29.0"
 12 |     tekton.dev/categories: Git
 13 |     tekton.dev/tags: git
 14 |     tekton.dev/displayName: "git clone"
 15 |     tekton.dev/platforms: "linux/amd64,linux/s390x,linux/ppc64le,linux/arm64"
 16 | spec:
 17 |   description: >-
 18 |     These Tasks are Git tasks to work with repositories used by other tasks
 19 |     in your Pipeline.
 20 | 
 21 |     The git-clone Task will clone a repo from the provided url into the
 22 |     output Workspace. By default the repo will be cloned into the root of
 23 |     your Workspace. You can clone into a subdirectory by setting this Task's
 24 |     subdirectory param. This Task also supports sparse checkouts. To perform
 25 |     a sparse checkout, pass a list of comma separated directory patterns to
 26 |     this Task's sparseCheckoutDirectories param.
 27 |   workspaces:
 28 |     - name: output
 29 |       description: The git repo will be cloned onto the volume backing this Workspace.
 30 |     - name: ssh-directory
 31 |       optional: true
 32 |       description: |
 33 |         A .ssh directory with private key, known_hosts, config, etc. Copied to
 34 |         the user's home before git commands are executed. Used to authenticate
 35 |         with the git remote when performing the clone. Binding a Secret to this
 36 |         Workspace is strongly recommended over other volume types.
 37 |     - name: basic-auth
 38 |       optional: true
 39 |       description: |
 40 |         A Workspace containing a .gitconfig and .git-credentials file. These
 41 |         will be copied to the user's home before any git commands are run. Any
 42 |         other files in this Workspace are ignored. It is strongly recommended
 43 |         to use ssh-directory over basic-auth whenever possible and to bind a
 44 |         Secret to this Workspace over other volume types.
 45 |     - name: ssl-ca-directory
 46 |       optional: true
 47 |       description: |
 48 |         A workspace containing CA certificates, this will be used by Git to
 49 |         verify the peer with when fetching or pushing over HTTPS.
 50 |   params:
 51 |     - name: url
 52 |       description: Repository URL to clone from.
 53 |       type: string
 54 |     - name: revision
 55 |       description: Revision to checkout. (branch, tag, sha, ref, etc...)
 56 |       type: string
 57 |       default: ""
 58 |     - name: refspec
 59 |       description: Refspec to fetch before checking out revision.
 60 |       default: ""
 61 |     - name: submodules
 62 |       description: Initialize and fetch git submodules.
 63 |       type: string
 64 |       default: "true"
 65 |     - name: depth
 66 |       description: Perform a shallow clone, fetching only the most recent N commits.
 67 |       type: string
 68 |       default: "1"
 69 |     - name: sslVerify
 70 |       description: Set the `http.sslVerify` global git config. Setting this to `false` is not advised unless you are sure that you trust your git remote.
 71 |       type: string
 72 |       default: "true"
 73 |     - name: crtFileName
 74 |       description: file name of mounted crt using ssl-ca-directory workspace. default value is ca-bundle.crt.
 75 |       type: string
 76 |       default: "ca-bundle.crt"
 77 |     - name: subdirectory
 78 |       description: Subdirectory inside the `output` Workspace to clone the repo into.
 79 |       type: string
 80 |       default: ""
 81 |     - name: sparseCheckoutDirectories
 82 |       description: Define the directory patterns to match or exclude when performing a sparse checkout.
 83 |       type: string
 84 |       default: ""
 85 |     - name: deleteExisting
 86 |       description: Clean out the contents of the destination directory if it already exists before cloning.
 87 |       type: string
 88 |       default: "true"
 89 |     - name: httpProxy
 90 |       description: HTTP proxy server for non-SSL requests.
 91 |       type: string
 92 |       default: ""
 93 |     - name: httpsProxy
 94 |       description: HTTPS proxy server for SSL requests.
 95 |       type: string
 96 |       default: ""
 97 |     - name: noProxy
 98 |       description: Opt out of proxying HTTP/HTTPS requests.
 99 |       type: string
100 |       default: ""
101 |     - name: verbose
102 |       description: Log the commands that are executed during `git-clone`'s operation.
103 |       type: string
104 |       default: "true"
105 |     - name: gitInitImage
106 |       description: The image providing the git-init binary that this Task runs.
107 |       type: string
108 |       default: "gcr.io/tekton-releases/github.com/tektoncd/pipeline/cmd/git-init:v0.29.0"
109 |     - name: userHome
110 |       description: |
111 |         Absolute path to the user's home directory. Set this explicitly if you are running the image as a non-root user or have overridden
112 |         the gitInitImage param with an image containing custom user configuration.
113 |       type: string
114 |       default: "/tekton/home"
115 |   results:
116 |     - name: commit
117 |       description: The precise commit SHA that was fetched by this Task.
118 |     - name: url
119 |       description: The precise URL that was fetched by this Task.
120 |     - name: source_ARTIFACT_INPUTS
121 |       properties:
122 |         uri: { }
123 |         digest: { }
124 |   steps:
125 |     - name: clone
126 |       image: "$(params.gitInitImage)"
127 |       env:
128 |         - name: HOME
129 |           value: "$(params.userHome)"
130 |         - name: PARAM_URL
131 |           value: $(params.url)
132 |         - name: PARAM_REVISION
133 |           value: $(params.revision)
134 |         - name: PARAM_REFSPEC
135 |           value: $(params.refspec)
136 |         - name: PARAM_SUBMODULES
137 |           value: $(params.submodules)
138 |         - name: PARAM_DEPTH
139 |           value: $(params.depth)
140 |         - name: PARAM_SSL_VERIFY
141 |           value: $(params.sslVerify)
142 |         - name: PARAM_CRT_FILENAME
143 |           value: $(params.crtFileName)
144 |         - name: PARAM_SUBDIRECTORY
145 |           value: $(params.subdirectory)
146 |         - name: PARAM_DELETE_EXISTING
147 |           value: $(params.deleteExisting)
148 |         - name: PARAM_HTTP_PROXY
149 |           value: $(params.httpProxy)
150 |         - name: PARAM_HTTPS_PROXY
151 |           value: $(params.httpsProxy)
152 |         - name: PARAM_NO_PROXY
153 |           value: $(params.noProxy)
154 |         - name: PARAM_VERBOSE
155 |           value: $(params.verbose)
156 |         - name: PARAM_SPARSE_CHECKOUT_DIRECTORIES
157 |           value: $(params.sparseCheckoutDirectories)
158 |         - name: PARAM_USER_HOME
159 |           value: $(params.userHome)
160 |         - name: WORKSPACE_OUTPUT_PATH
161 |           value: $(workspaces.output.path)
162 |         - name: WORKSPACE_SSH_DIRECTORY_BOUND
163 |           value: $(workspaces.ssh-directory.bound)
164 |         - name: WORKSPACE_SSH_DIRECTORY_PATH
165 |           value: $(workspaces.ssh-directory.path)
166 |         - name: WORKSPACE_BASIC_AUTH_DIRECTORY_BOUND
167 |           value: $(workspaces.basic-auth.bound)
168 |         - name: WORKSPACE_BASIC_AUTH_DIRECTORY_PATH
169 |           value: $(workspaces.basic-auth.path)
170 |         - name: WORKSPACE_SSL_CA_DIRECTORY_BOUND
171 |           value: $(workspaces.ssl-ca-directory.bound)
172 |         - name: WORKSPACE_SSL_CA_DIRECTORY_PATH
173 |           value: $(workspaces.ssl-ca-directory.path)
174 |       script: |
175 |         #!/usr/bin/env sh
176 |         set -eu
177 | 
178 |         if [ "${PARAM_VERBOSE}" = "true" ] ; then
179 |           set -x
180 |         fi
181 | 
182 | 
183 |         if [ "${WORKSPACE_BASIC_AUTH_DIRECTORY_BOUND}" = "true" ] ; then
184 |           cp "${WORKSPACE_BASIC_AUTH_DIRECTORY_PATH}/.git-credentials" "${PARAM_USER_HOME}/.git-credentials"
185 |           cp "${WORKSPACE_BASIC_AUTH_DIRECTORY_PATH}/.gitconfig" "${PARAM_USER_HOME}/.gitconfig"
186 |           chmod 400 "${PARAM_USER_HOME}/.git-credentials"
187 |           chmod 400 "${PARAM_USER_HOME}/.gitconfig"
188 |         fi
189 | 
190 |         if [ "${WORKSPACE_SSH_DIRECTORY_BOUND}" = "true" ] ; then
191 |           cp -R "${WORKSPACE_SSH_DIRECTORY_PATH}" "${PARAM_USER_HOME}"/.ssh
192 |           chmod 700 "${PARAM_USER_HOME}"/.ssh
193 |           chmod -R 400 "${PARAM_USER_HOME}"/.ssh/*
194 |         fi
195 | 
196 |         if [ "${WORKSPACE_SSL_CA_DIRECTORY_BOUND}" = "true" ] ; then
197 |            export GIT_SSL_CAPATH="${WORKSPACE_SSL_CA_DIRECTORY_PATH}"
198 |            if [ "${PARAM_CRT_FILENAME}" != "" ] ; then
199 |               export GIT_SSL_CAINFO="${WORKSPACE_SSL_CA_DIRECTORY_PATH}/${PARAM_CRT_FILENAME}"
200 |            fi
201 |         fi
202 |         CHECKOUT_DIR="${WORKSPACE_OUTPUT_PATH}/${PARAM_SUBDIRECTORY}"
203 | 
204 |         cleandir() {
205 |           # Delete any existing contents of the repo directory if it exists.
206 |           #
207 |           # We don't just "rm -rf ${CHECKOUT_DIR}" because ${CHECKOUT_DIR} might be "/"
208 |           # or the root of a mounted volume.
209 |           if [ -d "${CHECKOUT_DIR}" ] ; then
210 |             # Delete non-hidden files and directories
211 |             rm -rf "${CHECKOUT_DIR:?}"/*
212 |             # Delete files and directories starting with . but excluding ..
213 |             rm -rf "${CHECKOUT_DIR}"/.[!.]*
214 |             # Delete files and directories starting with .. plus any other character
215 |             rm -rf "${CHECKOUT_DIR}"/..?*
216 |           fi
217 |         }
218 | 
219 |         if [ "${PARAM_DELETE_EXISTING}" = "true" ] ; then
220 |           cleandir
221 |         fi
222 | 
223 |         test -z "${PARAM_HTTP_PROXY}" || export HTTP_PROXY="${PARAM_HTTP_PROXY}"
224 |         test -z "${PARAM_HTTPS_PROXY}" || export HTTPS_PROXY="${PARAM_HTTPS_PROXY}"
225 |         test -z "${PARAM_NO_PROXY}" || export NO_PROXY="${PARAM_NO_PROXY}"
226 | 
227 |         /ko-app/git-init \
228 |           -url="${PARAM_URL}" \
229 |           -revision="${PARAM_REVISION}" \
230 |           -refspec="${PARAM_REFSPEC}" \
231 |           -path="${CHECKOUT_DIR}" \
232 |           -sslVerify="${PARAM_SSL_VERIFY}" \
233 |           -submodules="${PARAM_SUBMODULES}" \
234 |           -depth="${PARAM_DEPTH}" \
235 |           -sparseCheckoutDirectories="${PARAM_SPARSE_CHECKOUT_DIRECTORIES}"
236 |         cd "${CHECKOUT_DIR}"
237 |         RESULT_SHA="$(git rev-parse HEAD)"
238 |         EXIT_CODE="$?"
239 |         if [ "${EXIT_CODE}" != 0 ] ; then
240 |           exit "${EXIT_CODE}"
241 |         fi
242 |         printf "%s" "${RESULT_SHA}" > "$(results.commit.path)"
243 |         printf "%s" "${PARAM_URL}" > "$(results.url.path)"
244 | 
245 |         # type hinting for provenance generation
246 |         cat <<EOF | tee $(results.source_ARTIFACT_INPUTS.path)
247 |         {
248 |           "uri": "${PARAM_URL}",
249 |           "digest": "sha256:${RESULT_SHA}"
250 |         }
251 |         EOF
252 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/model_signing/hashing/file_test.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 The Sigstore Authors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import pytest
 16 | 
 17 | from model_signing.hashing import file
 18 | from model_signing.hashing import memory
 19 | 
 20 | 
 21 | # some constants used throughout testing
 22 | _HEADER: str = "Some "
 23 | _CONTENT: str = "text."  # note that these have the same length
 24 | _FULL_CONTENT = _HEADER + _CONTENT
 25 | _SHARD_SIZE = len(_HEADER)
 26 | 
 27 | 
 28 | @pytest.fixture(scope="session")
 29 | def sample_file(tmp_path_factory):
 30 |     file_path = tmp_path_factory.mktemp("dir") / "text.txt"
 31 |     file_path.write_text(_FULL_CONTENT)
 32 |     return file_path
 33 | 
 34 | 
 35 | @pytest.fixture(scope="session")
 36 | def sample_file_content_only(tmp_path_factory):
 37 |     file_path = tmp_path_factory.mktemp("dir") / "text.txt"
 38 |     file_path.write_text(_CONTENT)
 39 |     return file_path
 40 | 
 41 | 
 42 | @pytest.fixture(scope="session")
 43 | def expected_digest():
 44 |     # To ensure that the expected file digest is always up to date, use the
 45 |     # memory hashing and create a fixture for the expected value.
 46 |     hasher = memory.SHA256(_FULL_CONTENT.encode("utf-8"))
 47 |     digest = hasher.compute()
 48 |     return digest.digest_hex
 49 | 
 50 | 
 51 | @pytest.fixture(scope="session")
 52 | def expected_header_digest():
 53 |     hasher = memory.SHA256(_HEADER.encode("utf-8"))
 54 |     digest = hasher.compute()
 55 |     return digest.digest_hex
 56 | 
 57 | 
 58 | @pytest.fixture(scope="session")
 59 | def expected_content_digest():
 60 |     hasher = memory.SHA256(_CONTENT.encode("utf-8"))
 61 |     digest = hasher.compute()
 62 |     return digest.digest_hex
 63 | 
 64 | 
 65 | class TestFileHasher:
 66 | 
 67 |     def test_fails_with_negative_chunk_size(self):
 68 |         with pytest.raises(ValueError, match="Chunk size must be non-negative"):
 69 |             file.FileHasher("unused", memory.SHA256(), chunk_size=-2)
 70 | 
 71 |     def test_hash_of_known_file(self, sample_file, expected_digest):
 72 |         hasher = file.FileHasher(sample_file, memory.SHA256())
 73 |         digest = hasher.compute()
 74 |         assert digest.digest_hex == expected_digest
 75 | 
 76 |     def test_hash_of_known_file_no_chunk(self, sample_file, expected_digest):
 77 |         hasher = file.FileHasher(sample_file, memory.SHA256(), chunk_size=0)
 78 |         digest = hasher.compute()
 79 |         assert digest.digest_hex == expected_digest
 80 | 
 81 |     def test_hash_of_known_file_small_chunk(self, sample_file, expected_digest):
 82 |         hasher = file.FileHasher(sample_file, memory.SHA256(), chunk_size=2)
 83 |         digest = hasher.compute()
 84 |         assert digest.digest_hex == expected_digest
 85 | 
 86 |     def test_hash_file_twice(self, sample_file):
 87 |         hasher1 = file.FileHasher(sample_file, memory.SHA256())
 88 |         digest1 = hasher1.compute()
 89 |         hasher2 = file.FileHasher(sample_file, memory.SHA256())
 90 |         digest2 = hasher2.compute()
 91 |         assert digest1.digest_value == digest2.digest_value
 92 | 
 93 |     def test_hash_file_twice_same_hasher(self, sample_file):
 94 |         hasher = file.FileHasher(sample_file, memory.SHA256())
 95 |         digest1 = hasher.compute()
 96 |         digest2 = hasher.compute()
 97 |         assert digest1.digest_value == digest2.digest_value
 98 | 
 99 |     def test_hash_file_twice_same_hasher_reset_file(self, sample_file):
100 |         hasher = file.FileHasher(sample_file, memory.SHA256())
101 |         digest1 = hasher.compute()
102 |         hasher.set_file(sample_file)
103 |         digest2 = hasher.compute()
104 |         assert digest1.digest_value == digest2.digest_value
105 | 
106 |     def test_set_file(self, sample_file, sample_file_content_only):
107 |         hasher = file.FileHasher(sample_file, memory.SHA256())
108 |         digest1 = hasher.compute()
109 |         hasher.set_file(sample_file_content_only)
110 |         _ = hasher.compute()
111 |         hasher.set_file(sample_file)
112 |         digest2 = hasher.compute()
113 |         assert digest1.digest_value == digest2.digest_value
114 | 
115 |     def test_default_digest_name(self):
116 |         hasher = file.FileHasher("unused", memory.SHA256(), chunk_size=10)
117 |         assert hasher.digest_name == "file-sha256"
118 | 
119 |     def test_override_digest_name(self):
120 |         hasher = file.FileHasher(
121 |             "unused",
122 |             memory.SHA256(),
123 |             chunk_size=10,
124 |             digest_name_override="test-hash",
125 |         )
126 |         assert hasher.digest_name == "test-hash"
127 | 
128 |     def test_digest_algorithm_is_digest_name(self, sample_file):
129 |         hasher = file.FileHasher(sample_file, memory.SHA256())
130 |         digest = hasher.compute()
131 |         assert digest.algorithm == hasher.digest_name
132 | 
133 | 
134 | class TestShardedFileHasher:
135 | 
136 |     def test_fails_with_negative_shard_size(self):
137 |         with pytest.raises(
138 |             ValueError, match="Shard size must be strictly positive"
139 |         ):
140 |             file.ShardedFileHasher(
141 |                 "unused", memory.SHA256(), shard_size=-2, start=0, end=42
142 |             )
143 | 
144 |     def test_fails_with_negative_start(self):
145 |         with pytest.raises(
146 |             ValueError, match="File start offset must be non-negative"
147 |         ):
148 |             file.ShardedFileHasher("unused", memory.SHA256(), start=-2, end=42)
149 | 
150 |     def test_set_fails_with_negative_start(self):
151 |         hasher = file.ShardedFileHasher(
152 |             "unused", memory.SHA256(), start=0, end=42
153 |         )
154 |         with pytest.raises(
155 |             ValueError, match="File start offset must be non-negative"
156 |         ):
157 |             hasher.set_shard(start=-2, end=42)
158 | 
159 |     def test_fails_with_end_lower_than_start(self):
160 |         with pytest.raises(
161 |             ValueError,
162 |             match=(
163 |                 "File end offset must be stricly higher that file start offset"
164 |             ),
165 |         ):
166 |             file.ShardedFileHasher("unused", memory.SHA256(), start=42, end=2)
167 | 
168 |     def test_set_fails_with_end_lower_than_start(self):
169 |         hasher = file.ShardedFileHasher(
170 |             "unused", memory.SHA256(), start=0, end=42
171 |         )
172 |         with pytest.raises(
173 |             ValueError,
174 |             match=(
175 |                 "File end offset must be stricly higher that file start offset"
176 |             ),
177 |         ):
178 |             hasher.set_shard(start=42, end=2)
179 | 
180 |     def test_fails_with_zero_read_span(self):
181 |         with pytest.raises(
182 |             ValueError,
183 |             match=(
184 |                 "File end offset must be stricly higher that file start offset"
185 |             ),
186 |         ):
187 |             file.ShardedFileHasher("unused", memory.SHA256(), start=42, end=42)
188 | 
189 |     def test_set_fails_with_zero_read_span(self):
190 |         hasher = file.ShardedFileHasher(
191 |             "unused", memory.SHA256(), start=0, end=42
192 |         )
193 |         with pytest.raises(
194 |             ValueError,
195 |             match=(
196 |                 "File end offset must be stricly higher that file start offset"
197 |             ),
198 |         ):
199 |             hasher.set_shard(start=42, end=42)
200 | 
201 |     def test_fails_with_read_span_too_large(self):
202 |         with pytest.raises(
203 |             ValueError, match="Must not read more than shard_size=2"
204 |         ):
205 |             file.ShardedFileHasher(
206 |                 "unused", memory.SHA256(), start=0, end=42, shard_size=2
207 |             )
208 | 
209 |     def test_set_fails_with_read_span_too_large(self):
210 |         hasher = file.ShardedFileHasher(
211 |             "unused", memory.SHA256(), start=0, end=2, shard_size=2
212 |         )
213 |         with pytest.raises(
214 |             ValueError, match="Must not read more than shard_size=2"
215 |         ):
216 |             hasher.set_shard(start=0, end=42)
217 | 
218 |     def test_hash_of_known_file(
219 |         self, sample_file, expected_header_digest, expected_content_digest
220 |     ):
221 |         hasher1 = file.ShardedFileHasher(
222 |             sample_file, memory.SHA256(), start=0, end=_SHARD_SIZE
223 |         )
224 |         hasher2 = file.ShardedFileHasher(
225 |             sample_file, memory.SHA256(), start=_SHARD_SIZE, end=2 * _SHARD_SIZE
226 |         )
227 | 
228 |         digest1 = hasher1.compute()
229 |         assert digest1.digest_hex == expected_header_digest
230 | 
231 |         digest2 = hasher2.compute()
232 |         assert digest2.digest_hex == expected_content_digest
233 | 
234 |     def test_hash_of_known_file_using_set_shard(
235 |         self, sample_file, expected_header_digest, expected_content_digest
236 |     ):
237 |         hasher = file.ShardedFileHasher(
238 |             sample_file, memory.SHA256(), start=0, end=_SHARD_SIZE
239 |         )
240 | 
241 |         digest1 = hasher.compute()
242 |         assert digest1.digest_hex == expected_header_digest
243 | 
244 |         hasher.set_shard(start=_SHARD_SIZE, end=2 * _SHARD_SIZE)
245 |         digest2 = hasher.compute()
246 |         assert digest2.digest_hex == expected_content_digest
247 | 
248 |     def test_hash_of_known_file_end_overflow(
249 |         self, sample_file, expected_digest
250 |     ):
251 |         hasher = file.ShardedFileHasher(
252 |             sample_file, memory.SHA256(), start=0, end=3 * _SHARD_SIZE
253 |         )
254 |         digest = hasher.compute()
255 |         assert digest.digest_hex == expected_digest
256 | 
257 |     def test_hash_of_known_file_set_end_overflow(
258 |         self, sample_file, expected_digest
259 |     ):
260 |         hasher = file.ShardedFileHasher(
261 |             sample_file, memory.SHA256(), start=0, end=_SHARD_SIZE
262 |         )
263 |         hasher.set_shard(start=0, end=5 * _SHARD_SIZE)
264 |         digest = hasher.compute()
265 |         assert digest.digest_hex == expected_digest
266 | 
267 |     def test_hash_of_known_file_no_chunk(
268 |         self, sample_file, expected_header_digest, expected_content_digest
269 |     ):
270 |         hasher1 = file.ShardedFileHasher(
271 |             sample_file, memory.SHA256(), start=0, end=_SHARD_SIZE, chunk_size=0
272 |         )
273 |         hasher2 = file.ShardedFileHasher(
274 |             sample_file,
275 |             memory.SHA256(),
276 |             start=_SHARD_SIZE,
277 |             end=2 * _SHARD_SIZE,
278 |             chunk_size=0,
279 |         )
280 | 
281 |         digest1 = hasher1.compute()
282 |         assert digest1.digest_hex == expected_header_digest
283 | 
284 |         digest2 = hasher2.compute()
285 |         assert digest2.digest_hex == expected_content_digest
286 | 
287 |     def test_hash_of_known_file_small_chunk(
288 |         self, sample_file, expected_header_digest, expected_content_digest
289 |     ):
290 |         hasher1 = file.ShardedFileHasher(
291 |             sample_file, memory.SHA256(), start=0, end=_SHARD_SIZE, chunk_size=1
292 |         )
293 |         hasher2 = file.ShardedFileHasher(
294 |             sample_file,
295 |             memory.SHA256(),
296 |             start=_SHARD_SIZE,
297 |             end=2 * _SHARD_SIZE,
298 |             chunk_size=1,
299 |         )
300 | 
301 |         digest1 = hasher1.compute()
302 |         assert digest1.digest_hex == expected_header_digest
303 | 
304 |         digest2 = hasher2.compute()
305 |         assert digest2.digest_hex == expected_content_digest
306 | 
307 |     def test_default_digest_name(self):
308 |         hasher = file.ShardedFileHasher(
309 |             "unused", memory.SHA256(), start=0, end=2, shard_size=10
310 |         )
311 |         assert hasher.digest_name == "file-sha256-10"
312 | 
313 |     def test_override_digest_name(self):
314 |         hasher = file.ShardedFileHasher(
315 |             "unused",
316 |             memory.SHA256(),
317 |             start=0,
318 |             end=2,
319 |             shard_size=10,
320 |             digest_name_override="test-hash",
321 |         )
322 |         assert hasher.digest_name == "test-hash"
323 | 
324 |     def test_digest_algorithm_is_digest_name(self, sample_file):
325 |         hasher = file.ShardedFileHasher(
326 |             sample_file,
327 |             memory.SHA256(),
328 |             start=0,
329 |             end=2,
330 |             shard_size=10,
331 |             digest_name_override="test-hash",
332 |         )
333 |         digest = hasher.compute()
334 |         assert digest.algorithm == hasher.digest_name
335 | 


--------------------------------------------------------------------------------
/model_signing/serialize.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2024 The Sigstore Authors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import hashlib
 16 | import base64
 17 | import os
 18 | from concurrent.futures import ProcessPoolExecutor
 19 | from multiprocessing import get_start_method, set_start_method
 20 | from pathlib import Path
 21 | import platform
 22 | 
 23 | # Use for testing while keeping disk size low.
 24 | allow_symlinks = False
 25 | 
 26 | 
 27 | class Hasher:
 28 |     @staticmethod
 29 |     def node_header(name: str, ty: str) -> bytes:
 30 |         header = ty.encode('utf-8') + b'.' + \
 31 |             base64.b64encode(name.encode('utf-8')) + b'.'
 32 |         return header
 33 | 
 34 |     @staticmethod
 35 |     def root_folder(path: Path, content: bytes) -> str:
 36 |         return Hasher._node_folder_compute(name="root", content=content)
 37 | 
 38 |     @staticmethod
 39 |     def node_folder(path: Path, content: bytes) -> str:
 40 |         return Hasher._node_folder_compute(name=path.name, content=content)
 41 | 
 42 |     @staticmethod
 43 |     def _node_folder_compute(name: str, content: bytes) -> bytes:
 44 |         value = Hasher.node_header(name, "dir") + content
 45 |         return hashlib.sha256(value).digest()
 46 | 
 47 |     @staticmethod
 48 |     def root_file(path: Path, chunk: int) -> bytes:
 49 |         return Hasher._node_file_compute(path, b'', chunk)
 50 | 
 51 |     @staticmethod
 52 |     def node_file(path: Path, chunk: int = 0) -> bytes:
 53 |         if not path.is_file():
 54 |             raise ValueError(f"path {path} is not a file")
 55 |         header = Hasher.node_header(path.name, "file")
 56 |         return Hasher._node_file_compute(path, header, chunk)
 57 | 
 58 |     @staticmethod
 59 |     def _node_file_compute(path: Path, header: bytes, chunk: int) -> bytes:
 60 |         h = hashlib.sha256(header)
 61 |         with open(path, "rb") as f:
 62 |             if chunk == 0:
 63 |                 all_data = f.read()
 64 |                 h.update(all_data)
 65 |             else:
 66 |                 # Compute the hash by reading chunk bytes at a time.
 67 |                 while True:
 68 |                     chunk_data = f.read(chunk)
 69 |                     if not chunk_data:
 70 |                         break
 71 |                     h.update(chunk_data)
 72 |         return h.digest()
 73 | 
 74 |     @staticmethod
 75 |     def _node_file_compute_v1(path: Path, header: bytes,
 76 |                               start: int, end: int, chunk: int) -> bytes:
 77 |         h = hashlib.sha256(header)
 78 |         with open(path, "rb") as f:
 79 |             # WARNING: We must start reading the file at the starting offset.
 80 |             f.seek(start)
 81 |             # Read all at once.
 82 |             if chunk == 0 or chunk >= (end - start):
 83 |                 content = f.read(end - start)
 84 |                 # print(f"all: {f.name}: {start}-{end}")
 85 |                 h.update(content)
 86 |             else:
 87 |                 # Compute the hash by reading chunk bytes at a time.
 88 |                 remains = end - start
 89 |                 while remains != 0:
 90 |                     # read = (end - start) - remains
 91 |                     # print(f"loop {i}: {f.name}:
 92 |                     # {read}-{read + min(chunk, remains)}")
 93 |                     processed = min(chunk, remains)
 94 |                     chunk_data = f.read(processed)
 95 |                     if processed != len(chunk_data):
 96 |                         raise ValueError("internal: unread bytes: " +
 97 |                                          f"{processed} != {len(chunk_data)}")
 98 |                     if not chunk_data:
 99 |                         raise ValueError("internal: no data: " +
100 |                                          f"filename={str(path)}, " +
101 |                                          f"remains={remains}, " +
102 |                                          f"{processed} != {len(chunk_data)}")
103 |                     h.update(chunk_data)
104 |                     remains -= processed
105 |         return h.digest()
106 | 
107 | 
108 | def remove_prefix(text, prefix):
109 |     if text.startswith(prefix):
110 |         return text[len(prefix):]
111 |     return text
112 | 
113 | 
114 | def validate_signature_path(model_path: Path, sig_path: Path):
115 |     if model_path.is_file():
116 |         return
117 |     # Note: Only allow top-level folder to have the signature for simplicity.
118 |     if sig_path is not None and sig_path.is_relative_to(model_path) and \
119 |        sig_path.parent != model_path:
120 |         raise ValueError(f"{sig_path} must be in the folder root")
121 | 
122 | 
123 | def is_relative_to(p: Path, path_list: [Path]) -> bool:
124 |     for e in path_list:
125 |         if p.is_relative_to(e):
126 |             return True
127 |     return False
128 | 
129 | 
130 | # TODO(): add a context "AI model"?
131 | class Serializer:
132 |     @staticmethod
133 |     # TODO: type of returned value.
134 |     def _ordered_files(path: Path, ignorepaths: [Path]) -> []:
135 |         children: [Path]
136 |         if path.is_file():
137 |             children = [path]
138 |         else:
139 |             # NOTE: the parent (..) and current directory (.) are not present.
140 |             # NOTE: this returns hidden files as well.
141 |             # TODO: tests that this pattern reports all files,
142 |             # regardless of their depth.
143 |             children = sorted(path.glob("**/*"))
144 | 
145 |         filtered = []
146 |         total_size = 0
147 |         for child in children:
148 |             if is_relative_to(child, ignorepaths):
149 |                 continue
150 | 
151 |             # To avoid bugs where we read the link rather than its target,
152 |             # we don't allow symlinks for now.
153 |             # NOTE: It seems that Python's read() *always* follows symlinks,
154 |             # so it may be safe to allow them. (readlink() is the function
155 |             # to read the link metadata).
156 |             if not allow_symlinks and child.is_symlink():
157 |                 raise ValueError(f"{str(child)} is symlink")
158 | 
159 |             if not child.is_file() and not child.is_dir():
160 |                 raise ValueError(f"{str(child)} is not a dir or file")
161 | 
162 |             # The recorded path must *not* contains the folder name,
163 |             # since users may rename it.
164 |             record_path = remove_prefix(
165 |                 str(child.as_posix()), str(path.as_posix() + '/'))
166 |             record_type = "file" if child.is_file() else "dir"
167 |             record_size = \
168 |                 os.path.getsize(str(child)) if record_type == "file" else 0
169 |             filtered += [(record_path, record_type, record_size)]
170 |             total_size += record_size
171 |         return filtered
172 | 
173 |     @staticmethod
174 |     # TODO: type of returned value.
175 |     def _create_tasks(children: [], shard_size: int) -> [[]]:
176 |         tasks = [[]] * 0
177 |         curr_file = 0
178 |         curr_pos = 0
179 | 
180 |         while True:
181 |             # All files have been processed.
182 |             if curr_file >= len(children):
183 |                 break
184 | 
185 |             name, typ, size = children[curr_file]
186 | 
187 |             # It's a directory.
188 |             # NOTE: It is fast to compute the hash because there's no data
189 |             # besides the name and the type.
190 |             # TODO(#12): do we need this at all? This only matters
191 |             # if we care about empty directories, since non-empty ones have
192 |             # their file + path recorded.
193 |             if typ == "dir":
194 |                 # Record the task.
195 |                 tasks += [(name, typ, 0, size)]
196 |                 curr_file += 1
197 |                 curr_pos = 0
198 |                 continue
199 | 
200 |             # It's a file.
201 | 
202 |             # Sanity checks.
203 |             if size <= curr_pos and size > 0:
204 |                 raise ValueError(f"internal: size={size}, " +
205 |                                  f"curr_pos={curr_pos} " +
206 |                                  f"for {children[curr_file]}")
207 | 
208 |             # Compute the number of bytes to process.
209 |             remains = size - curr_pos
210 |             if remains < 0:
211 |                 raise ValueError(f"internal: remains is {remains}")
212 |             processed = min(remains, shard_size)
213 |             end_pos = curr_pos + processed
214 | 
215 |             # Record the task.
216 |             tasks += [(name, typ, curr_pos, end_pos)]
217 | 
218 |             # Update position.
219 |             curr_pos += processed
220 | 
221 |             # If we have processed all bytes, we move on to the next file.
222 |             if remains == processed:
223 |                 curr_file += 1
224 |                 curr_pos = 0
225 |         return tasks
226 | 
227 |     @staticmethod
228 |     # TODO: type of tasks
229 |     def _run_tasks(path: Path, chunk: int, tasks: []) -> bytes:
230 |         # See https://superfastpython.com/processpoolexecutor-in-python/
231 |         # NOTE: 32 = length of sha256 digest.
232 |         digest_len = 32
233 |         all_hashes = [None] * (digest_len*len(tasks))
234 |         org_len = len(all_hashes)
235 | 
236 |         # Use fork on Linux as it's supposed to be faster.
237 |         if platform.system() == "Linux" and get_start_method() != "fork":
238 |             set_start_method('fork')
239 |         with ProcessPoolExecutor() as ppe:
240 |             futures = [ppe.submit(Serializer.task, (path, chunk, task))
241 |                        for task in tasks]
242 |             results = [f.result() for f in futures]
243 |             for i, result in enumerate(results):
244 |                 all_hashes[i*digest_len:(i+1)*digest_len] = result
245 |         # Sanity check.
246 |         if len(all_hashes) != org_len:
247 |             raise ValueError(f"internal: {len(all_hashes)} != {org_len}")
248 |         return bytes(all_hashes)
249 | 
250 |     @staticmethod
251 |     # TODO: type of task_info.
252 |     def task(task_info: []):
253 |         # NOTE: we can get process info using:
254 |         # from multiprocessing import current_process
255 |         # worker = current_process()
256 |         # print(f'Task {task_info},
257 |         # worker name={worker.name}, pid={worker.pid}', flush=True)
258 | 
259 |         model_path, chunk, (name, ty, start_pos, end_pos) = task_info
260 | 
261 |         # Header format is: "type.b64(filename).start-end."
262 |         header = ty.encode('utf-8') + b'.' + \
263 |             base64.b64encode(name.encode('utf-8')) + \
264 |             b'.' + f"{start_pos}-{end_pos}".encode('utf-8') + b'.'
265 | 
266 |         # To hash a directory, we use "none" content.
267 |         # TODO(#12): do we need this at all? This only matters
268 |         # if we care about empty directories, since non-empty ones have
269 |         # their file + path recorded.
270 |         if ty == "dir":
271 |             value = header + b'none'
272 |             return hashlib.sha256(value).digest()
273 | 
274 |         # We need to hash a file.
275 | 
276 |         # The model is a directory.
277 |         if model_path.is_dir():
278 |             return Hasher._node_file_compute_v1(model_path.joinpath(name),
279 |                                                 header, start_pos,
280 |                                                 end_pos, chunk)
281 | 
282 |         # The model is a single file.
283 |         # We update the file name to a generic "root".
284 |         header = ty.encode('utf-8') + b'.' + \
285 |             base64.b64encode("root".encode('utf-8')) + \
286 |             b'.' + f"{start_pos}-{end_pos}".encode('utf-8') + b'.'
287 |         return Hasher._node_file_compute_v1(name,
288 |                                             header, start_pos, end_pos, chunk)
289 | 
290 |     @staticmethod
291 |     def _serialize_v1(path: Path, chunk: int, shard: int, signature_path: Path,
292 |                       ignorepaths: [Path] = []) -> bytes:
293 |         if not path.exists():
294 |             raise ValueError(f"{str(path)} does not exist")
295 | 
296 |         if not allow_symlinks and path.is_symlink():
297 |             raise ValueError(f"{str(path)} is a symlink")
298 | 
299 |         if chunk < 0:
300 |             raise ValueError(f"{str(chunk)} is invalid")
301 | 
302 |         if not path.is_file() and not path.is_dir():
303 |             raise ValueError(f"{str(path)} is not a dir or file")
304 | 
305 |         # Validate the signature path.
306 |         validate_signature_path(path, signature_path)
307 | 
308 |         # Children to hash.
309 |         children = Serializer._ordered_files(path,
310 |                                              [signature_path] + ignorepaths)
311 | 
312 |         # We shard the computation by creating independent "tasks".
313 |         if shard < 0:
314 |             raise ValueError(f"{str(shard)} is invalid")
315 |         tasks = Serializer._create_tasks(children, shard)
316 | 
317 |         # Share the computation of hashes.
318 |         # For simplicity, we pre-allocate the entire array that will hold
319 |         # the concatenation of all hashes.
320 |         all_hashes = Serializer._run_tasks(path, chunk, tasks)
321 | 
322 |         # Finally, we hash everything.
323 |         return hashlib.sha256(bytes(all_hashes)).digest()
324 | 
325 |     def serialize_v1(path: Path, chunk: int, signature_path: Path,
326 |                      ignorepaths: [Path] = []) -> bytes:
327 |         # NOTE: The shard size must be the same for all clients for
328 |         # compatibility. We could make it configurable; but in this
329 |         # case the signature file must contain the value used by the signer.
330 |         shard_size = 1000000000  # 1GB
331 |         return Serializer._serialize_v1(path, chunk, shard_size,
332 |                                         signature_path, ignorepaths)
333 | 
334 |     @staticmethod
335 |     def serialize_v0(path: Path, chunk: int, signature_path: Path,
336 |                      ignorepaths: [Path] = []) -> bytes:
337 |         if not path.exists():
338 |             raise ValueError(f"{str(path)} does not exist")
339 | 
340 |         if not allow_symlinks and path.is_symlink():
341 |             raise ValueError(f"{str(path)} is a symlink")
342 | 
343 |         if chunk < 0:
344 |             raise ValueError(f"{str(chunk)} is invalid")
345 | 
346 |         if path.is_file():
347 |             return Hasher.root_file(path, chunk)
348 | 
349 |         if not path.is_dir():
350 |             raise ValueError(f"{str(path)} is not a dir")
351 | 
352 |         # Validate the signature path.
353 |         validate_signature_path(path, signature_path)
354 | 
355 |         children = sorted([x for x in path.iterdir()
356 |                            if x != signature_path and x not in ignorepaths])
357 |         # TODO: remove this special case?
358 |         if len(children) == 0:
359 |             return Hasher.root_folder(path, b"empty")
360 | 
361 |         hash = hashlib.sha256()
362 |         for child in children:
363 |             child_hash = Serializer._serialize_node(child, chunk, " ",
364 |                                                     ignorepaths)
365 |             hash.update(child_hash)
366 |         content = hash.digest()
367 |         return Hasher.root_folder(path, content)
368 | 
369 |     @staticmethod
370 |     def _serialize_node(path: Path, chunk: int, indent="",
371 |                         ignorepaths: [Path] = []) -> bytes:
372 |         if not allow_symlinks and path.is_symlink():
373 |             raise ValueError(f"{str(path)} is a symlink")
374 | 
375 |         if path.is_file():
376 |             return Hasher.node_file(path, chunk)
377 | 
378 |         if not path.is_dir():
379 |             raise ValueError(f"{str(path)} is not a dir")
380 | 
381 |         children = sorted([x for x in path.iterdir() if x not in ignorepaths])
382 |         # TODO: remove this special case?
383 |         if len(children) == 0:
384 |             return Hasher.node_folder(path, b"empty")
385 | 
386 |         hash = hashlib.sha256()
387 |         for child in children:
388 |             child_hash = Serializer._serialize_node(child, chunk, indent + " ",
389 |                                                     ignorepaths)
390 |             hash.update(child_hash)
391 |         content = hash.digest()
392 |         return Hasher.node_folder(path, content)
393 | 


--------------------------------------------------------------------------------