├── .env.template ├── .flake8 ├── .github └── workflows │ ├── docker.yml │ ├── docker_test.yml │ └── pr.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── data └── openimage_10.txt ├── flex └── metadata.json ├── my_project ├── __init__.py ├── config.py ├── pipeline.py └── run.py ├── pyproject.toml ├── pytorch_gpu.Dockerfile ├── requirements.dev.txt ├── requirements.prod.txt ├── scripts ├── check-beam.sh ├── check-pipeline.sh ├── check-tf-on-gpu.sh ├── check-torch-on-gpu.sh ├── create-gpu-vm.sh └── get_beam_version.py ├── setup.py ├── tensor_rt.Dockerfile ├── tensorflow_gpu.Dockerfile ├── tensorflow_gpu.flex.Dockerfile └── tests ├── sample.env.pytorch ├── sample.env.tf └── test_pipeline.py /.env.template: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### PYTHON/SDK/DOCKER SETTINGS 3 | ################################################################################ 4 | ##Pytorch + Py3.10 + Beam 2.47.0 5 | PYTHON_VERSION=3.10 6 | BEAM_VERSION=2.47.0 7 | DOCKERFILE_TEMPLATE=pytorch_gpu.Dockerfile 8 | DOCKER_CREDENTIAL_REGISTRIES="us-docker.pkg.dev" 9 | ##Pytorch + Tensor_RT + Py3.8 + Beam 2.46.0 10 | #PYTHON_VERSION=3.8 11 | #BEAM_VERSION=2.46.0 12 | #DOCKERFILE_TEMPLATE=tensor_rt.Dockerfile 13 | ################################################################################ 14 | ### GCP SETTINGS 15 | ################################################################################ 16 | PROJECT_ID=your-gcp-project-id 17 | REGION=your-region-to-run-dataflow-jobs 18 | ZONE=your-zone-to-run-vm 19 | DISK_SIZE_GB=50 20 | MACHINE_TYPE=n1-standard-2 21 | VM_NAME=beam-ml-starter-gpu 22 | ################################################################################ 23 | ### DATAFLOW JOB SETTINGS 24 | ################################################################################ 25 | STAGING_LOCATION=your-gcs-bucket-for-staging-files 26 | TEMP_LOCATION=your-gcs-bucket-for-temp-files 27 | CUSTOM_CONTAINER_IMAGE=your-gcr-image-uri-for-custom-container 28 | SERVICE_OPTIONS="worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver" 29 | ################################################################################ 30 | ### DATAFLOW JOB MODEL SETTINGS 31 | ################################################################################ 32 | ### PYTORCH MODEL EXAMPLES 33 | ## mobilenet_v2 34 | MODEL_STATE_DICT_PATH="gs://apache-beam-ml/models/torchvision.models.mobilenet_v2.pth" 35 | MODEL_NAME=mobilenet_v2 36 | ## resnet101 37 | #MODEL_STATE_DICT_PATH="gs://apache-beam-ml/models/torchvision.models.resnet101.pth" 38 | #MODEL_NAME=resnet101 39 | ### TF MODEL URI EXAMPLES 40 | #TF_MODEL_URI: only support TF2 models (https://tfhub.dev/s?subtype=module,placeholder&tf-version=tf2) 41 | #TF_MODEL_URI=https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification/4 42 | ################################################################################ 43 | ### DATAFLOW JOB INPUT&OUTPUT SETTINGS 44 | ################################################################################ 45 | INPUT_DATA="gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt" 46 | OUTPUT_DATA=your-gcs-bucket-for-saving-prediction-results -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 
2 | max-line-length = 120 3 | max-complexity = 40 4 | ignore = 5 | E203 6 | W503 7 | exclude = 8 | .eggs 9 | .git 10 | .tox 11 | __pycache__ 12 | build 13 | dist 14 | venv -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: Build and push Docker image to GCP Artifact Registry 16 | 17 | on: 18 | workflow_dispatch: 19 | push: 20 | branches: 21 | - main 22 | schedule: 23 | # Every Monday at 1PM UTC (9AM EST) 24 | - cron: "0 13 * * 1" 25 | 26 | jobs: 27 | build-and-push: 28 | runs-on: ubuntu-latest 29 | 30 | steps: 31 | - name: Free Disk Space (Ubuntu) 32 | uses: jlumbroso/free-disk-space@main 33 | with: 34 | # this might remove tools that are actually needed, 35 | # if set to "true" but frees about 6 GB 36 | tool-cache: false 37 | 38 | android: true 39 | dotnet: true 40 | haskell: true 41 | large-packages: false 42 | docker-images: true 43 | swap-storage: true 44 | - name: Checkout 45 | uses: actions/checkout@v3 46 | - id: "auth" 47 | name: Authenticate to Google Cloud 48 | uses: google-github-actions/auth@v1.1.1 49 | with: 50 | credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }} 51 | token_format: access_token 52 | - name: Docker login 53 | uses: "docker/login-action@v1" 54 | with: 55 | registry: "us-docker.pkg.dev" 56 | username: "oauth2accesstoken" 57 | password: "${{ steps.auth.outputs.access_token }}" 58 | - name: Set up Python 3.8 59 | uses: actions/setup-python@v4 60 | with: 61 | python-version: "3.8" 62 | - name: Init env 63 | run: | 64 | cp tests/sample.env.tf .env 65 | echo '${{ steps.auth.outputs.access_token }}' | docker login -u oauth2accesstoken --password-stdin https://us-docker.pkg.dev 66 | make init 67 | - name: Build and push Docker image 68 | run: | 69 | make docker 70 | - name: Test Docker image 71 | run: | 72 | make run-df-gpu 73 | -------------------------------------------------------------------------------- /.github/workflows/docker_test.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | name: Build and push Docker image to GCP Artifact Registry with the latest Beam 16 | 17 | on: 18 | workflow_dispatch: 19 | 20 | jobs: 21 | build-and-push: 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | - name: Free Disk Space (Ubuntu) 26 | uses: jlumbroso/free-disk-space@main 27 | with: 28 | # this might remove tools that are actually needed, 29 | # if set to "true" but frees about 6 GB 30 | tool-cache: false 31 | 32 | android: true 33 | dotnet: true 34 | haskell: true 35 | large-packages: false 36 | docker-images: true 37 | swap-storage: true 38 | - name: Checkout 39 | uses: actions/checkout@v3 40 | - id: "auth" 41 | name: Authenticate to Google Cloud 42 | uses: google-github-actions/auth@v1.1.1 43 | with: 44 | credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }} 45 | token_format: access_token 46 | - name: Docker login 47 | uses: "docker/login-action@v1" 48 | with: 49 | registry: "us-docker.pkg.dev" 50 | username: "oauth2accesstoken" 51 | password: "${{ steps.auth.outputs.access_token }}" 52 | - name: Set up Python 3.10 53 | uses: actions/setup-python@v4 54 | with: 55 | python-version: "3.10" 56 | - name: Init env with the test Beam and docker URI 57 | run: | 58 | cp tests/sample.env.pytorch .env 59 | make init-venv 60 | ./venv/bin/pip install requests packaging 61 | make test-latest-env 62 | sed -i '/CUSTOM_CONTAINER_IMAGE=/d' .env 63 | echo -e "\n" >> .env 64 | echo "CUSTOM_CONTAINER_IMAGE=us-docker.pkg.dev/apache-beam-testing/dataflow-ml-starter/pytorch_gpu:test-beam" >> .env 65 | echo '${{ steps.auth.outputs.access_token }}' | docker login -u oauth2accesstoken --password-stdin https://us-docker.pkg.dev 66 | make init 67 | - name: Build and push Docker image 68 | run: | 69 | make docker 70 | - name: Test Docker image 71 | run: | 72 | make run-df-gpu 73 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | name: Run basic tests with Python 3.8 16 | 17 | on: [push, pull_request, workflow_dispatch] 18 | 19 | jobs: 20 | tests: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 3.8 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: "3.8" 29 | - name: Init env 30 | run: | 31 | cp tests/sample.env.tf .env 32 | make init 33 | - name: Run local tests 34 | run: | 35 | make test 36 | - name: Run DirectRunner with TF 37 | run: | 38 | # tf model 39 | make run-direct 40 | test -f beam-output/beam_test_out.txt && echo "DirectRunner ran successfully!" 
|| $(error "Cannot find beam-output/beam_test_out.txt!") 41 | - name: Run DirectRunner with PyTorch 42 | run: | 43 | # torch model 44 | sed -i '/TF_MODEL_URI=/d' .env 45 | echo -e "\n" >> .env 46 | echo "MODEL_STATE_DICT_PATH=gs://apache-beam-ml/models/torchvision.models.mobilenet_v2.pth" >> .env 47 | echo -e "\n" >> .env 48 | echo "MODEL_NAME=mobilenet_v2" >> .env 49 | make run-direct 50 | test -f beam-output/beam_test_out.txt && echo "DirectRunner ran successfully!" || $(error "Cannot find beam-output/beam_test_out.txt!") 51 | # restore .env 52 | cp tests/sample.env.tf .env 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # beam temp 16 | beam-temp-* 17 | beam-output/ 18 | Dockerfile 19 | requirements.txt 20 | 21 | # sys 22 | .DS_Store 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # C extensions 30 | *.so 31 | 32 | # Distribution / packaging 33 | .Python 34 | build/ 35 | develop-eggs/ 36 | dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | wheels/ 46 | share/python-wheels/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | MANIFEST 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .nox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *.cover 72 | *.py,cover 73 | .hypothesis/ 74 | .pytest_cache/ 75 | cover/ 76 | 77 | # Translations 78 | *.mo 79 | *.pot 80 | 81 | # Django stuff: 82 | *.log 83 | local_settings.py 84 | db.sqlite3 85 | db.sqlite3-journal 86 | 87 | # Flask stuff: 88 | instance/ 89 | .webassets-cache 90 | 91 | # Scrapy stuff: 92 | .scrapy 93 | 94 | # Sphinx documentation 95 | docs/_build/ 96 | 97 | # PyBuilder 98 | .pybuilder/ 99 | target/ 100 | 101 | # Jupyter Notebook 102 | .ipynb_checkpoints 103 | 104 | # IPython 105 | profile_default/ 106 | ipython_config.py 107 | 108 | # pyenv 109 | # For a library or package, you might want to ignore these files since the code is 110 | # intended to run in multiple environments; otherwise, check them in: 111 | # .python-version 112 | 113 | # pipenv 114 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 115 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 116 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 117 | # install all needed dependencies. 
118 | #Pipfile.lock 119 | 120 | # poetry 121 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 122 | # This is especially recommended for binary packages to ensure reproducibility, and is more 123 | # commonly ignored for libraries. 124 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 125 | #poetry.lock 126 | 127 | # pdm 128 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 129 | #pdm.lock 130 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 131 | # in version control. 132 | # https://pdm.fming.dev/#use-with-ide 133 | .pdm.toml 134 | 135 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 136 | __pypackages__/ 137 | 138 | # Celery stuff 139 | celerybeat-schedule 140 | celerybeat.pid 141 | 142 | # SageMath parsed files 143 | *.sage.py 144 | 145 | # Environments 146 | .env* 147 | .venv 148 | env/ 149 | venv/ 150 | ENV/ 151 | env.bak/ 152 | venv.bak/ 153 | 154 | # vscode 155 | .vscode 156 | 157 | # Spyder project settings 158 | .spyderproject 159 | .spyproject 160 | 161 | # Rope project settings 162 | .ropeproject 163 | 164 | # mkdocs documentation 165 | /site 166 | 167 | # mypy 168 | .mypy_cache/ 169 | .dmypy.json 170 | dmypy.json 171 | 172 | # Pyre type checker 173 | .pyre/ 174 | 175 | # pytype static type analyzer 176 | .pytype/ 177 | 178 | # Cython debug symbols 179 | cython_debug/ 180 | 181 | # PyCharm 182 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 183 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 184 | # and can be added to the global gitignore or merged into this file. For a more nuclear 185 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 186 | #.idea/ 187 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | [settings] 16 | sections=FUTURE,STDLIB,THIRDPARTY,DFML,FIRSTPARTY,LOCALFOLDER 17 | import_heading_dfml=Dataflow ML libraries 18 | import_heading_stdlib=standard libraries 19 | import_heading_thirdparty=third party libraries 20 | include_trailing_comma=True 21 | indent=' ' 22 | known_dfml=my_project 23 | dedup_headings=True 24 | line_length=120 25 | multi_line_output=3 26 | skip=./venv/,./venv-docs/,./.git/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | exclude: ^docs/notebooks/ 16 | repos: 17 | - repo: https://github.com/ambv/black 18 | rev: 23.3.0 19 | hooks: 20 | - id: black 21 | args: ["--config=pyproject.toml", "--check", "--diff"] 22 | - repo: https://github.com/pycqa/flake8 23 | rev: "6.0.0" 24 | hooks: 25 | - id: flake8 26 | args: ["--config=.flake8"] 27 | - repo: https://github.com/timothycrosley/isort 28 | rev: 5.12.0 29 | hooks: 30 | - id: isort -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We would love to accept your patches and contributions to this project. 4 | 5 | ## Before you begin 6 | 7 | ### Sign our Contributor License Agreement 8 | 9 | Contributions to this project must be accompanied by a 10 | [Contributor License Agreement](https://cla.developers.google.com/about) (CLA). 11 | You (or your employer) retain the copyright to your contribution; this simply 12 | gives us permission to use and redistribute your contributions as part of the 13 | project. 14 | 15 | If you or your current employer have already signed the Google CLA (even if it 16 | was for a different project), you probably don't need to do it again. 17 | 18 | Visit <https://cla.developers.google.com/> to see your current agreements or to 19 | sign a new one. 20 | 21 | ### Review our Community Guidelines 22 | 23 | This project follows [Google's Open Source Community 24 | Guidelines](https://opensource.google/conduct/). 25 | 26 | ## Contribution process 27 | 28 | ### Code Reviews 29 | 30 | All submissions, including submissions by project members, require review. We 31 | use [GitHub pull requests](https://docs.github.com/articles/about-pull-requests) 32 | for this purpose. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License.
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | SILENT: 16 | .PHONY: 17 | .DEFAULT_GOAL := help 18 | 19 | # Load environment variables from .env file 20 | TF_MODEL_URI := 21 | include .env 22 | export 23 | 24 | define PRINT_HELP_PYSCRIPT 25 | import re, sys # isort:skip 26 | 27 | matches = [] 28 | for line in sys.stdin: 29 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 30 | if match: 31 | matches.append(match.groups()) 32 | 33 | for target, help in sorted(matches): 34 | print(" %-25s %s" % (target, help)) 35 | endef 36 | export PRINT_HELP_PYSCRIPT 37 | 38 | PYTHON = python$(PYTHON_VERSION) 39 | 40 | ifndef TF_MODEL_URI 41 | MODEL_ENV := "TORCH" 42 | else 43 | MODEL_ENV := "TF" 44 | endif 45 | 46 | help: ## Print this help 47 | @echo 48 | @echo " make targets:" 49 | @echo 50 | @$(PYTHON) -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 51 | 52 | test-latest-env: ## Replace the Beam version with the latest version (including release candidates) 53 | $(eval LATEST_VERSION=$(shell ./venv/bin/python3 scripts/get_beam_version.py)) 54 | @echo $(LATEST_VERSION) 55 | @sed 's/BEAM_VERSION=.*/BEAM_VERSION=$(LATEST_VERSION)/g' .env > .env.new && mv .env.new .env 56 | 57 | init-venv: ## Create virtual environment in venv folder 58 | @$(PYTHON) -m venv venv 59 | 60 | init: init-venv ## Init virtual environment 61 | @./venv/bin/python3 -m pip install -U pip 62 | @$(shell sed "s|\$${BEAM_VERSION}|$(BEAM_VERSION)|g" requirements.prod.txt > requirements.txt) 63 | @./venv/bin/python3 -m pip install -r requirements.txt 64 | @./venv/bin/python3 -m pip install -r requirements.dev.txt 65 | @./venv/bin/python3 -m pre_commit install --install-hooks --overwrite 66 | @mkdir -p beam-output 67 | @echo "use 'source venv/bin/activate' to activate venv " 68 | @./venv/bin/python3 -m pip install -e . 69 | 70 | format: ## Run formatter on source code 71 | @./venv/bin/python3 -m black --config=pyproject.toml . 72 | 73 | lint: ## Run linter on source code 74 | @./venv/bin/python3 -m black --config=pyproject.toml --check . 75 | @./venv/bin/python3 -m flake8 --config=.flake8 . 76 | 77 | clean-lite: ## Remove pycache files, pytest files, etc 78 | @rm -rf build dist .cache .coverage .coverage.* *.egg-info 79 | @find . -name .coverage | xargs rm -rf 80 | @find . -name .pytest_cache | xargs rm -rf 81 | @find . -name .tox | xargs rm -rf 82 | @find . -name __pycache__ | xargs rm -rf 83 | @find .
-name *.egg-info | xargs rm -rf 84 | 85 | clean: clean-lite ## Remove virtual environment, downloaded models, etc 86 | @rm -rf venv 87 | @echo "run 'make init'" 88 | 89 | test: lint ## Run tests 90 | ./venv/bin/pytest -s -vv --cov=my_project --cov-fail-under=50 tests/ 91 | 92 | run-direct: ## Run a local test with DirectRunner 93 | @rm -f beam-output/beam_test_out.txt 94 | ifeq ($(MODEL_ENV), "TORCH") 95 | time ./venv/bin/python3 -m my_project.run \ 96 | --input data/openimage_10.txt \ 97 | --output beam-output/beam_test_out.txt \ 98 | --model_state_dict_path $(MODEL_STATE_DICT_PATH) \ 99 | --model_name $(MODEL_NAME) 100 | else 101 | time ./venv/bin/python3 -m my_project.run \ 102 | --input data/openimage_10.txt \ 103 | --output beam-output/beam_test_out.txt \ 104 | --tf_model_uri $(TF_MODEL_URI) 105 | endif 106 | 107 | docker: ## Build a custom docker image and push it to Artifact Registry 108 | @$(shell sed "s|\$${BEAM_VERSION}|$(BEAM_VERSION)|g; s|\$${PYTHON_VERSION}|$(PYTHON_VERSION)|g" ${DOCKERFILE_TEMPLATE} > Dockerfile) 109 | docker build --platform linux/amd64 -t $(CUSTOM_CONTAINER_IMAGE) -f Dockerfile . 110 | docker push $(CUSTOM_CONTAINER_IMAGE) 111 | 112 | run-df-gpu: ## Run a Dataflow job using the custom container with GPUs 113 | $(eval JOB_NAME := beam-ml-starter-gpu-$(shell date +%s)-$(shell echo $$$$)) 114 | ifeq ($(MODEL_ENV), "TORCH") 115 | time ./venv/bin/python3 -m my_project.run \ 116 | --runner DataflowRunner \ 117 | --job_name $(JOB_NAME) \ 118 | --project $(PROJECT_ID) \ 119 | --region $(REGION) \ 120 | --machine_type $(MACHINE_TYPE) \ 121 | --disk_size_gb $(DISK_SIZE_GB) \ 122 | --staging_location $(STAGING_LOCATION) \ 123 | --temp_location $(TEMP_LOCATION) \ 124 | --setup_file ./setup.py \ 125 | --device GPU \ 126 | --dataflow_service_option $(SERVICE_OPTIONS) \ 127 | --number_of_worker_harness_threads 1 \ 128 | --experiments=disable_worker_container_image_prepull \ 129 | --experiments=use_pubsub_streaming \ 130 | --sdk_container_image $(CUSTOM_CONTAINER_IMAGE) \ 131 | --sdk_location container \ 132 | --input $(INPUT_DATA) \ 133 | --output $(OUTPUT_DATA) \ 134 | --model_state_dict_path $(MODEL_STATE_DICT_PATH) \ 135 | --model_name $(MODEL_NAME) 136 | else 137 | time ./venv/bin/python3 -m my_project.run \ 138 | --runner DataflowRunner \ 139 | --job_name $(JOB_NAME) \ 140 | --project $(PROJECT_ID) \ 141 | --region $(REGION) \ 142 | --machine_type $(MACHINE_TYPE) \ 143 | --disk_size_gb $(DISK_SIZE_GB) \ 144 | --staging_location $(STAGING_LOCATION) \ 145 | --temp_location $(TEMP_LOCATION) \ 146 | --setup_file ./setup.py \ 147 | --device GPU \ 148 | --dataflow_service_option $(SERVICE_OPTIONS) \ 149 | --number_of_worker_harness_threads 1 \ 150 | --experiments=disable_worker_container_image_prepull \ 151 | --experiments=use_pubsub_streaming \ 152 | --sdk_container_image $(CUSTOM_CONTAINER_IMAGE) \ 153 | --sdk_location container \ 154 | --input $(INPUT_DATA) \ 155 | --output $(OUTPUT_DATA) \ 156 | --tf_model_uri $(TF_MODEL_URI) 157 | endif 158 | 159 | run-df-cpu: ## Run a Dataflow job with CPUs and without Custom Container 160 | @$(shell sed "s|\$${BEAM_VERSION}|$(BEAM_VERSION)|g" requirements.txt > beam-output/requirements.txt) 161 | @$(eval JOB_NAME := beam-ml-starter-cpu-$(shell date +%s)-$(shell echo $$$$)) 162 | ifeq ($(MODEL_ENV), "TORCH") 163 | time ./venv/bin/python3 -m my_project.run \ 164 | --runner DataflowRunner \ 165 | --job_name $(JOB_NAME) \ 166 | --project $(PROJECT_ID) \ 167 | --region $(REGION) \ 168 | --machine_type $(MACHINE_TYPE) \ 169 | 
--disk_size_gb $(DISK_SIZE_GB) \ 170 | --staging_location $(STAGING_LOCATION) \ 171 | --temp_location $(TEMP_LOCATION) \ 172 | --requirements_file requirements.txt \ 173 | --setup_file ./setup.py \ 174 | --input $(INPUT_DATA) \ 175 | --output $(OUTPUT_DATA) \ 176 | --model_state_dict_path $(MODEL_STATE_DICT_PATH) \ 177 | --model_name $(MODEL_NAME) 178 | else 179 | time ./venv/bin/python3 -m my_project.run \ 180 | --runner DataflowRunner \ 181 | --job_name $(JOB_NAME) \ 182 | --project $(PROJECT_ID) \ 183 | --region $(REGION) \ 184 | --machine_type $(MACHINE_TYPE) \ 185 | --disk_size_gb $(DISK_SIZE_GB) \ 186 | --staging_location $(STAGING_LOCATION) \ 187 | --temp_location $(TEMP_LOCATION) \ 188 | --requirements_file requirements.txt \ 189 | --setup_file ./setup.py \ 190 | --input $(INPUT_DATA) \ 191 | --output $(OUTPUT_DATA) \ 192 | --tf_model_uri $(TF_MODEL_URI) 193 | endif 194 | 195 | create-vm: ## Create a VM with GPU to test the docker image 196 | @./scripts/create-gpu-vm.sh 197 | 198 | delete-vm: ## Delete a VM 199 | gcloud compute instances delete $(VM_NAME) --project $(PROJECT_ID) --zone $(ZONE) --quiet 200 | 201 | check-beam: ## Check whether Beam is installed on GPU using VM with Custom Container 202 | @./scripts/check-beam.sh 203 | 204 | check-tf-gpu: ## Check whether Tensorflow works on GPU using VM with Custom Container 205 | @./scripts/check-tf-on-gpu.sh 206 | 207 | check-torch-gpu: ## Check whether PyTorch works on GPU using VM with Custom Container 208 | @./scripts/check-torch-on-gpu.sh 209 | 210 | check-pipeline: ## Check whether the Beam pipeline can run on GPU using VM with Custom Container and DirectRunner 211 | @./scripts/check-pipeline.sh 212 | 213 | create-flex-template: ## Create a Flex Template file using a Flex Template custom container 214 | gcloud dataflow flex-template build $(TEMPLATE_FILE_GCS_PATH) \ 215 | --image $(CUSTOM_CONTAINER_IMAGE) \ 216 | --metadata-file ./flex/metadata.json \ 217 | --sdk-language "PYTHON" \ 218 | --staging-location $(STAGING_LOCATION) \ 219 | --temp-location $(TEMP_LOCATION) \ 220 | --project $(PROJECT_ID) \ 221 | --worker-region $(REGION) \ 222 | --worker-machine-type $(MACHINE_TYPE) 223 | 224 | run-df-gpu-flex: ## Run a Dataflow job using the Flex Template 225 | $(eval JOB_NAME := beam-ml-starter-gpu-flex-$(shell date +%s)-$(shell echo $$$$)) 226 | ifeq ($(MODEL_ENV), "TORCH") 227 | gcloud dataflow flex-template run $(JOB_NAME) \ 228 | --template-file-gcs-location $(TEMPLATE_FILE_GCS_PATH) \ 229 | --project $(PROJECT_ID) \ 230 | --region $(REGION) \ 231 | --worker-machine-type $(MACHINE_TYPE) \ 232 | --additional-experiments disable_worker_container_image_prepull \ 233 | --parameters number_of_worker_harness_threads=1 \ 234 | --parameters sdk_location=container \ 235 | --parameters sdk_container_image=$(CUSTOM_CONTAINER_IMAGE) \ 236 | --parameters dataflow_service_option=$(SERVICE_OPTIONS) \ 237 | --parameters input=$(INPUT_DATA) \ 238 | --parameters output=$(OUTPUT_DATA) \ 239 | --parameters device=GPU \ 240 | --parameters model_state_dict_path=$(MODEL_STATE_DICT_PATH) \ 241 | --parameters model_name=$(MODEL_NAME) 242 | else 243 | gcloud dataflow flex-template run $(JOB_NAME) \ 244 | --template-file-gcs-location $(TEMPLATE_FILE_GCS_PATH) \ 245 | --project $(PROJECT_ID) \ 246 | --region $(REGION) \ 247 | --worker-machine-type $(MACHINE_TYPE) \ 248 | --additional-experiments disable_worker_container_image_prepull \ 249 | --parameters number_of_worker_harness_threads=1 \ 250 | --parameters sdk_location=container \ 251 | --parameters 
sdk_container_image=$(CUSTOM_CONTAINER_IMAGE) \ 252 | --parameters dataflow_service_option=$(SERVICE_OPTIONS) \ 253 | --parameters input=$(INPUT_DATA) \ 254 | --parameters output=$(OUTPUT_DATA) \ 255 | --parameters device=GPU \ 256 | --parameters tf_model_uri=$(TF_MODEL_URI) 257 | endif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataflow ML Starter Project 2 | 3 | ## Summary 4 | This repo contains a simple Beam RunInference project, which demonstrates how to develop and test this Beam pipeline using DirectRunner 5 | and launch the production job using DataflowRunner on either CPUs or GPUs. It can serve as a boilerplate for creating a new Dataflow ML project. 6 | 7 | **This is not an officially supported Google product**. 8 | 9 | ## Prerequisites 10 | 11 | * conda 12 | * git 13 | * make 14 | * docker 15 | * gcloud 16 | * python3-venv 17 | 18 | ```bash 19 | sudo apt-get update 20 | sudo apt-get install -y python3-venv git make time wget 21 | ``` 22 | Install Docker on Debian: https://docs.docker.com/engine/install/debian/ 23 | To run Docker without sudo, 24 | ```bash 25 | sudo groupadd docker 26 | sudo usermod -aG docker $USER 27 | newgrp docker 28 | ``` 29 | 30 | ## Directory structure 31 | ``` 32 | . 33 | ├── LICENSE 34 | ├── .env.template <- A configuration template file to define environment-specific variables 35 | ├── Makefile <- Makefile with commands; type `make` to get the command list 36 | ├── README.md <- The top-level README for developers using this project 37 | ├── data <- Any data for local development and testing 38 | │   └── openimage_10.txt <- Sample test data that contains the GCS file path for each image 39 | ├── pyproject.toml <- The TOML format Python project configuration file 40 | ├── requirements.dev.txt <- Packages for development, such as `pytest` 41 | ├── requirements.prod.txt <- Packages for the production environment; produces `requirements.txt` 42 | ├── scripts <- Utility bash scripts 43 | ├── setup.py <- Used in `python setup.py sdist` to create the multi-file python package 44 | ├── my_project <- Source code for use in this project, also your python package module name 45 | │   ├── __init__.py <- Makes my_project a Python package 46 | │   ├── config.py <- `pydantic` model classes to define sources, sinks, and models 47 | │   ├── pipeline.py <- Builds the Beam RunInference pipeline 48 | │   └── run.py <- A run module to parse the command options and run the Beam pipeline 49 | ├── tensor_rt.Dockerfile <- A Dockerfile to create a custom container with TensorRT 50 | └── tests <- Tests to cover local developments 51 | └── test_pipeline.py 52 | ``` 53 | 54 | ## User Guide 55 | 56 | **This process is only tested on GCE VMs with Debian.** 57 | 58 | ### Step 1: Clone this repo and edit .env 59 | 60 | ```bash 61 | git clone https://github.com/google/dataflow-ml-starter.git 62 | cd dataflow-ml-starter 63 | cp .env.template .env 64 | ``` 65 | Use your editor to fill in the information in the `.env` file. 66 | 67 | If you want to try other pytorch models under `gs://apache-beam-ml/models/`, 68 | ```bash 69 | gsutil ls gs://apache-beam-ml/models/ 70 | ``` 71 | you need to edit `config.py` to add more model names (see the sketch below). 72 | 73 | It is highly recommended to run through this guide once using `mobilenet_v2` for image classification.
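For illustration, here is a minimal sketch of what an allowlist-style check in `config.py` could look like. This is an assumption about the shape of that code, not the repo's actual `pydantic` classes; the `KNOWN_MODELS` set and the field names shown here are hypothetical:

```python
# A minimal sketch (assumed, not copied from my_project/config.py) of how a
# pydantic model can validate the model name supplied on the command line.
from typing import Optional

from pydantic import BaseModel, validator

# Hypothetical allowlist; extend it to try other torchvision models
# stored under gs://apache-beam-ml/models/.
KNOWN_MODELS = {"mobilenet_v2", "resnet101"}


class ModelConfig(BaseModel):
    model_state_dict_path: Optional[str] = None  # gs:// path to the .pth state dict
    model_name: Optional[str] = None  # torchvision model name, e.g. mobilenet_v2
    tf_model_uri: Optional[str] = None  # set this instead for a TF2 hub model

    @validator("model_name")
    def check_known_model(cls, v):
        if v is not None and v not in KNOWN_MODELS:
            raise ValueError(f"unknown model name: {v}; add it to KNOWN_MODELS first")
        return v
```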
74 | 75 | All the useful actions can be triggered using `make`: 76 | ```console 77 | $ make 78 | 79 | make targets: 80 | 81 | check-beam                Check whether Beam is installed on GPU using VM with Custom Container 82 | check-pipeline            Check whether the Beam pipeline can run on GPU using VM with Custom Container and DirectRunner 83 | check-tf-gpu              Check whether Tensorflow works on GPU using VM with Custom Container 84 | check-torch-gpu           Check whether PyTorch works on GPU using VM with Custom Container 85 | clean                     Remove virtual environment, downloaded models, etc 86 | clean-lite                Remove pycache files, pytest files, etc 87 | create-flex-template      Create a Flex Template file using a Flex Template custom container 88 | create-vm                 Create a VM with GPU to test the docker image 89 | delete-vm                 Delete a VM 90 | docker                    Build a custom docker image and push it to Artifact Registry 91 | format                    Run formatter on source code 92 | help                      Print this help 93 | init                      Init virtual environment 94 | init-venv                 Create virtual environment in venv folder 95 | lint                      Run linter on source code 96 | run-df-cpu                Run a Dataflow job with CPUs and without Custom Container 97 | run-df-gpu                Run a Dataflow job using the custom container with GPUs 98 | run-df-gpu-flex           Run a Dataflow job using the Flex Template 99 | run-direct                Run a local test with DirectRunner 100 | test                      Run tests 101 | test-latest-env           Replace the Beam version with the latest version (including release candidates) 102 | ``` 103 | 104 | ### Pipeline Details 105 | 106 | This project contains a simple RunInference Beam pipeline: 107 | ``` 108 | Read the GCS file that contains image GCS paths (beam.io.ReadFromText) -> 109 | Pre-process the input image, run a Pytorch or Tensorflow image classification model, post-process the results -> 110 | Write all predictions back to the GCS output file 111 | ``` 112 | The input image data is created from the ImageNet images. 113 | 114 | The overall code flow is as follows: 115 | 116 | * `.env` defines the environment variables, such as the Torch or TF model, the model name, the Dockerfile template, etc. 117 | * `Makefile` reads these environment variables from `.env` and, based on the make target, runs tests, builds docker images, or launches Dataflow jobs with CPUs or GPUs. 118 | * `run.py` is called by the `Makefile` targets to parse the input arguments and set `ModelConfig`, `SourceConfig`, and `SinkConfig` defined in `config.py`, then calls `build_pipeline` from `pipeline.py` to build the final Beam pipeline. 119 | 120 | 121 | To customize the pipeline, modify `build_pipeline` in [pipeline.py](https://github.com/google/dataflow-ml-starter/blob/main/my_project/pipeline.py). It defines how to read the image data from TextIO, pre-process the images, score them, post-process the predictions, 122 | and finally save the results using TextIO (a simplified sketch is shown below). 123 | 124 | [config.py](https://github.com/google/dataflow-ml-starter/blob/main/my_project/config.py) contains a set of `pydantic` models to specify the configurations for sources, sinks, and models and validate them. Users can easily add more Pytorch classification models. More examples can be found [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/inference). 125 |
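For orientation, here is a simplified, self-contained sketch of a pipeline with this shape. It is an illustration only, not the repo's actual `build_pipeline`: the `read_and_preprocess` and `format_prediction` helpers are hypothetical stand-ins for the real pre- and post-processing code in `my_project/pipeline.py`, which also supports TensorFlow models:

```python
# A simplified sketch of the RunInference flow described above
# (not the actual my_project/pipeline.py implementation).
import io

import apache_beam as beam
import torch
from apache_beam.io.filesystems import FileSystems
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerTensor
from PIL import Image
from torchvision import models, transforms


def read_and_preprocess(gcs_path: str) -> torch.Tensor:
    # Read the image bytes from GCS and convert them to a float tensor.
    with FileSystems.open(gcs_path) as f:
        image = Image.open(io.BytesIO(f.read())).convert("RGB")
    return transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])(image)


def format_prediction(result) -> str:
    # result is a PredictionResult; keep only the top-1 class index.
    return str(int(torch.argmax(result.inference)))


model_handler = PytorchModelHandlerTensor(
    state_dict_path="gs://apache-beam-ml/models/torchvision.models.mobilenet_v2.pth",
    model_class=models.mobilenet_v2,
    model_params={"num_classes": 1000},
)

with beam.Pipeline() as p:
    (
        p
        | "ReadImagePaths" >> beam.io.ReadFromText("data/openimage_10.txt")
        | "Preprocess" >> beam.Map(read_and_preprocess)
        | "RunInference" >> RunInference(model_handler)
        | "Postprocess" >> beam.Map(format_prediction)
        | "WritePredictions" >> beam.io.WriteToText("beam-output/beam_test_out.txt")
    )
```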
126 | ### `.env` Details 127 | 128 | Most of the options are configured through the `.env` file. 129 | Below is one example that uses the Pytorch `mobilenet_v2` model for image classification: 130 | ``` 131 | ################################################################################ 132 | ### PYTHON SDK SETTINGS 133 | ################################################################################ 134 | PYTHON_VERSION=3.10 135 | BEAM_VERSION=2.48.0 136 | DOCKERFILE_TEMPLATE=pytorch_gpu.Dockerfile 137 | DOCKER_CREDENTIAL_REGISTRIES="us-docker.pkg.dev" 138 | ################################################################################ 139 | ### GCP SETTINGS 140 | ################################################################################ 141 | PROJECT_ID=apache-beam-testing 142 | REGION=us-central1 143 | DISK_SIZE_GB=50 144 | MACHINE_TYPE=n1-standard-2 145 | VM_NAME=beam-ml-starter-gpu-1 146 | ################################################################################ 147 | ### DATAFLOW JOB SETTINGS 148 | ################################################################################ 149 | STAGING_LOCATION=gs://temp-storage-for-perf-tests/loadtests 150 | TEMP_LOCATION=gs://temp-storage-for-perf-tests/loadtests 151 | CUSTOM_CONTAINER_IMAGE=us-docker.pkg.dev/apache-beam-testing/xqhu/pytorch_gpu:latest 152 | SERVICE_OPTIONS="worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver" 153 | ################################################################################ 154 | ### DATAFLOW JOB MODEL SETTINGS 155 | ################################################################################ 156 | MODEL_STATE_DICT_PATH="gs://apache-beam-ml/models/torchvision.models.mobilenet_v2.pth" 157 | MODEL_NAME=mobilenet_v2 158 | ################################################################################ 159 | ### DATAFLOW JOB INPUT&OUTPUT SETTINGS 160 | ################################################################################ 161 | INPUT_DATA="gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt" 162 | OUTPUT_DATA="gs://temp-storage-for-end-to-end-tests/torch/result_gpu_xqhu.txt" 163 | ``` 164 | Most of the options are intuitive. `DOCKERFILE_TEMPLATE` provides the Dockerfile template that will be used to build the custom container. `CUSTOM_CONTAINER_IMAGE` is the Docker image storage location. 165 | By default, this Dataflow job uses GPUs (i.e., T4) with the custom container, as defined by `SERVICE_OPTIONS`. `MODEL_STATE_DICT_PATH` and `MODEL_NAME` define the Pytorch model information. For this Beam pipeline, we use GCS buckets for the input and output data. 166 | 167 | ### Custom container 168 | We provide three Dockerfile templates as examples to show how to build a custom container: 169 | |Name|Description| 170 | |---|---| 171 | |tensor_rt.Dockerfile| TensorRT + Python 3.8| 172 | |pytorch_gpu.Dockerfile| Pytorch with GPUs + Python 3.10| 173 | |tensorflow_gpu.Dockerfile | Tensorflow with GPUs + Python 3.8| 174 | 175 | Note: you should keep your local Python environment the same as the one defined in the Dockerfile. 176 | These Dockerfile examples should be customized based on your project requirements. 177 | 178 | ### Step 2: Initialize a venv for your project 179 | ```bash 180 | make init 181 | source venv/bin/activate 182 | ``` 183 | Note that you must make sure the base Python version matches the version defined in `.env`.
184 | The base Python can be configured using [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html), e.g., 185 | ```bash 186 | conda create --name py38 python=3.8 187 | conda activate py38 188 | ``` 189 | If anything goes wrong, you can rebuild the `venv`: 190 | ```bash 191 | make clean 192 | make init 193 | ``` 194 | To check that the `venv` is created correctly, run 195 | ```bash 196 | make test 197 | ``` 198 | 199 | ### Step 3: Test the Beam pipeline using DirectRunner 200 | `DirectRunner` provides a local way to validate whether your Beam pipeline works correctly: 201 | ```bash 202 | make run-direct 203 | ``` 204 | 205 | ### Step 4: Run the Beam pipeline using DataflowRunner 206 | To run a Dataflow job using CPUs without a custom container, try this: 207 | ```bash 208 | make run-df-cpu 209 | ``` 210 | When using `resnet101` to score 50k images, the job took ~30m and cost ~$1.4. 211 | For `mobilenet_v2`, it cost ~$0.5 and took ~22m. 212 | Note that the cost and time depend on your job settings and regions. 213 | 214 | #### Build Custom Container with GPU support 215 | Running Dataflow GPU jobs requires building a custom container: 216 | ```bash 217 | make docker 218 | ``` 219 | The final docker image will be pushed to Artifact Registry. For this guide, 220 | we use `tensor_rt.Dockerfile` to demonstrate how to build the container to run the inference on GPUs with TensorRT. 221 | **Note: given the base image issue for TensorRT, only Python 3.8 should be used when running on GPUs.** 222 | You can follow [this doc](https://cloud.google.com/dataflow/docs/gpu/use-gpus#custom-container) to create other GPU containers. 223 | 224 | #### Test Custom Container using GCE VM 225 | It is highly recommended to test your custom container locally before running it with Dataflow: 226 | ```bash 227 | make create-vm 228 | ``` 229 | This creates a GCE VM with a T4 GPU and installs the NVIDIA driver. It will take a few minutes. 230 | This VM then lets you test whether the docker container is built correctly: 231 | ```bash 232 | # check whether Beam is installed in Custom Container 233 | make check-beam 234 | # check whether Tensorflow can use GPUs in Custom Container 235 | make check-tf-gpu 236 | # check whether PyTorch can use GPUs in Custom Container 237 | make check-torch-gpu 238 | # check whether DirectRunner can run on GPUs in Custom Container 239 | make check-pipeline 240 | ``` 241 | Note that these commands will take some time to download your container. 242 | You should see outputs similar to these: 243 | ```bash 244 | Checking Python version on VM... 245 | Python 3.8.10 246 | Checking venv exists on VM... 247 | python3-venv/now 3.8.2-0ubuntu2 amd64 [installed,local] 248 | Checking Beam Version on VM... 249 | 2.48.0 250 | Checking Tensorflow on GPU... 251 | [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')] 252 | Checking PyTorch on GPU... 253 | True 254 | Tesla T4 255 | ... 256 | The DirectRunner run succeeded on GPU! 257 | ``` 258 | The last line shows whether the pipeline can run successfully on the VM's GPU in the Custom Container (the snippet below shows roughly what the GPU checks amount to). 259 |
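For reference, the TF and PyTorch checks above boil down to a few lines of Python executed inside the container. This is a sketch of the idea, not the exact contents of the `scripts/check-*.sh` scripts:

```python
# Rough equivalent of what the GPU checks verify inside the custom container.
import tensorflow as tf
import torch

# Tensorflow should list a GPU device, matching the sample output above.
print(tf.config.list_physical_devices())

# PyTorch should print True and the GPU name (e.g., "Tesla T4").
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```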
260 | After finishing the tests, you can remove this VM: 261 | ```bash 262 | make delete-vm 263 | ``` 264 | 265 | #### Run the Beam pipeline using DataflowRunner on GPUs 266 | This runs a Dataflow job with GPUs: 267 | ```bash 268 | make run-df-gpu 269 | ``` 270 | When using `resnet101` to score 50k images, the job took ~1h and cost ~$0.5. 271 | For `mobilenet_v2`, it cost ~$0.05 and took ~1h. 272 | Note that the cost and time depend on your job settings and regions. 273 | 274 | ### Run the Beam pipeline with the Pub/Sub source 275 | When `INPUT_DATA` from the `.env` file defines a valid Pub/Sub topic (e.g., `projects/apache-beam-testing/topics/Imagenet_openimage_50k_benchmark`), 276 | the Beam pipeline is created using the Pub/Sub source with `FixedWindows` and switches to `beam.io.fileio.WriteToFiles`, which supports streaming pipelines (a sketch follows below). 277 | Note that for this toy example, writing the predictions to a GCS bucket is not efficient, since each output file is quite small (a few bytes). 278 | In practice, you might tune [the autoscaling options](https://cloud.google.com/dataflow/docs/guides/troubleshoot-autoscaling) to optimize the streaming pipeline performance. 279 | Note that the streaming job will run forever until it is canceled or drained. 280 |
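As a rough illustration of that streaming branch (assumed, not copied from `pipeline.py`; the topic name is a placeholder and the 10-second window size is an arbitrary choice):

```python
# A minimal sketch of a streaming read with fixed windows and windowed file writes.
import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import window

options = PipelineOptions(streaming=True)
with beam.Pipeline(options=options) as p:
    (
        p
        | "ReadFromPubSub" >> beam.io.ReadFromPubSub(topic="projects/your-project/topics/your-topic")
        | "Decode" >> beam.Map(lambda msg: msg.decode("utf-8"))
        | "Window" >> beam.WindowInto(window.FixedWindows(10))
        # ... the same preprocessing + RunInference steps as in the batch pipeline ...
        | "Write" >> fileio.WriteToFiles(path="gs://your-bucket/streaming-output/")
    )
```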
294 | 
295 | 
296 | ## FAQ
297 | 
298 | ### Permission error when using any GCP command
299 | ```bash
300 | gcloud auth login
301 | gcloud auth application-default login
302 | # replace it with the appropriate region
303 | gcloud auth configure-docker us-docker.pkg.dev
304 | # or if you use docker-credential-gcr
305 | docker-credential-gcr configure-docker --registries=us-docker.pkg.dev
306 | ```
307 | Make sure you specify the appropriate region for Artifact Registry.
308 | 
309 | ### AttributeError: Can't get attribute 'default_tensor_inference_fn'
310 | ```
311 | AttributeError: Can't get attribute 'default_tensor_inference_fn' on
312 | ```
313 | This error indicates that your Dataflow job is running an outdated Beam SDK. If you use `--sdk_location container`, it means your Docker container contains an outdated Beam SDK.
314 | 
315 | ### QUOTA_EXCEEDED
316 | ```
317 | Startup of the worker pool in zone us-central1-a failed to bring up any of the desired 1 workers. Please refer to https://cloud.google.com/dataflow/docs/guides/common-errors#worker-pool-failure for help troubleshooting. QUOTA_EXCEEDED: Instance 'benchmark-tests-pytorch-i-05041052-ufe3-harness-ww4n' creation failed: Quota 'NVIDIA_T4_GPUS' exceeded. Limit: 32.0 in region us-central1.
318 | ```
319 | Check https://cloud.google.com/compute/docs/regions-zones, select another zone that offers your desired machine type, and relaunch the Dataflow job.
320 | 
321 | ### ERROR: failed to solve: failed to fetch anonymous token: unexpected status: 401 Unauthorized
322 | ```
323 | failed to solve with frontend dockerfile.v0: failed to create LLB definition: failed to authorize: rpc error: code = Unknown desc = failed to fetch anonymous token: unexpected status: 401 Unauthorized
324 | ```
325 | Restarting Docker can resolve this issue.
326 | 
327 | ### Check the built container
328 | ```bash
329 | docker run --rm -it --entrypoint=/bin/bash $CUSTOM_CONTAINER_IMAGE
330 | ```
331 | 
332 | ### Errors that can occur when the custom container is not built correctly
333 | 
334 | Check Cloud Logging and pay attention to the INFO-level worker logs:
335 | ```
336 | INFO 2023-05-06T15:13:01.237562007Z The virtual environment was not created successfully because ensurepip is not
337 | INFO 2023-05-06T15:13:01.237601258Z available. On Debian/Ubuntu systems, you need to install the python3-venv
338 | INFO 2023-05-06T15:13:01.237607714Z package using the following command.
339 | ```
340 | or (might be caused by building the container on macOS)
341 | ```
342 | exec /opt/apache/beam/boot: no such file or directory
343 | ```
344 | 
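If you hit the `exec /opt/apache/beam/boot` error after building the image on an Apple Silicon Mac, the container architecture likely does not match Dataflow's x86-64 workers. One assumed fix is to build explicitly for `linux/amd64`; note that the repo's Dockerfiles are templates (see the `${BEAM_VERSION}` placeholders), so apply the flag to however `make docker` invokes the build, e.g.,
```bash
# A sketch, assuming a rendered Dockerfile; `rendered.Dockerfile` is a
# hypothetical name for the template after substituting BEAM_VERSION etc.
docker buildx build --platform linux/amd64 \
  -f rendered.Dockerfile \
  -t "$CUSTOM_CONTAINER_IMAGE" \
  --push .
```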
345 | ## Useful Links
346 | * https://cloud.google.com/dataflow/docs/guides/using-custom-containers#docker
347 | * https://cloud.google.com/dataflow/docs/gpu/use-gpus#custom-container
348 | * https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/
349 | * https://github.com/apache/beam/blob/master/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy
350 | * https://cloud.google.com/dataflow/docs/gpu/troubleshoot-gpus#debug-vm
351 | * https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/dataflow/flex-templates/streaming_beam
352 | * https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates
353 | * https://cloud.google.com/dataflow/docs/guides/templates/configuring-flex-templates#use_custom_container_images
--------------------------------------------------------------------------------
/data/openimage_10.txt:
--------------------------------------------------------------------------------
1 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec63d33df5e91fd.jpg
2 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec64bfcb2d515c9.jpg
3 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec67f239007cb18.jpg
4 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec6988f60e8e881.jpg
5 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec6bf3c1551224a.jpg
6 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec6c1055bae51f5.jpg
7 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec6c15d60358c85.jpg
8 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec6ca007effdd80.jpg
9 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec7096df9477315.jpg
10 | gs://apache-beam-ml/datasets/openimage_50k_benchmark/1ec70b2abe194c75.jpg
11 | 
--------------------------------------------------------------------------------
/flex/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "Beam RunInference Python flex template",
3 |   "description": "Beam RunInference example for a Python Flex Template.",
4 |   "parameters": [
5 |     {
6 |       "name": "input",
7 |       "label": "Input data",
8 |       "helpText": "Input image URI data that could be a GCS bucket or Pub/Sub topic"
9 |     },
10 |     {
11 |       "name": "output",
12 |       "label": "Output GCS bucket path",
13 |       "helpText": "A GCS bucket that stores the model predictions"
14 |     },
15 |     {
16 |       "name": "tf_model_uri",
17 |       "label": "TensorFlow model URI",
18 |       "helpText": "A valid TensorFlow model URI",
19 |       "isOptional": true
20 |     },
21 |     {
22 |       "name": "model_name",
23 |       "label": "a PyTorch model name",
24 |       "helpText": "A model name, e.g. resnet101",
25 |       "isOptional": true
26 |     },
27 |     {
28 |       "name": "model_state_dict_path",
29 |       "label": "a PyTorch model state path",
30 |       "helpText": "Path to the model's state_dict",
31 |       "isOptional": true
32 |     },
33 |     {
34 |       "name": "device",
35 |       "label": "device to run models",
36 |       "helpText": "device could be either CPU or GPU",
37 |       "isOptional": true
38 |     },
39 |     {
40 |       "name": "disk_size_gb",
41 |       "label": "disk_size_gb",
42 |       "helpText": "disk_size_gb for worker",
43 |       "isOptional": true
44 |     },
45 |     {
46 |       "name": "dataflow_service_option",
47 |       "label": "dataflow_service_option",
48 |       "helpText": "dataflow_service_option for worker",
49 |       "isOptional": true
50 |     }
51 |   ]
52 | }
--------------------------------------------------------------------------------
/my_project/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | 
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | 
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
--------------------------------------------------------------------------------
/my_project/config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | 
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | 
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # standard libraries
16 | import re
17 | from enum import Enum
18 | 
19 | # third party libraries
20 | from pydantic import BaseModel, Field, root_validator, validator
21 | 
22 | 
23 | class ModelName(str, Enum):
24 |     RESNET101 = "resnet101"
25 |     MOBILENET_V2 = "mobilenet_v2"
26 | 
27 | 
28 | class ModelConfig(BaseModel):
29 |     model_state_dict_path: str = Field(None, description="path to the torch model's state dictionary (state_dict)")
30 |     model_class_name: ModelName = Field(None, description="Reference to the class definition of the model.")
31 |     model_params: dict = Field(
32 |         None,
33 |         description="Parameters passed to the constructor of the model_class. "
34 |         "These will be used to instantiate the model object in the RunInference API.",
35 |     )
36 |     tf_model_uri: str = Field(None, description="TF model URI from https://tfhub.dev/")
37 |     device: str = Field("CPU", description="Device to be used on the Runner. Choices are (CPU, GPU)")
38 |     min_batch_size: int = 10
39 |     max_batch_size: int = 100
40 | 
41 |     @root_validator
42 |     def validate_fields(cls, values):
43 |         v = values.get("model_state_dict_path")
44 |         if v and values.get("tf_model_uri"):
45 |             raise ValueError("Cannot specify both model_state_dict_path and tf_model_uri")
46 |         if v is None and values.get("tf_model_uri") is None:
47 |             raise ValueError("At least one of model_state_dict_path or tf_model_uri must be specified")
48 |         if v and values.get("model_class_name") is None:
49 |             raise ValueError("model_class_name must be specified when using model_state_dict_path")
50 |         if v and values.get("model_params") is None:
51 |             raise ValueError("model_params must be specified when using model_state_dict_path")
52 |         return values
53 | 
54 | 
55 | def _validate_topic_path(topic_path):
56 |     pattern = r"projects/.+/topics/.+"
57 |     return bool(re.match(pattern, topic_path))
58 | 
59 | 
60 | class SourceConfig(BaseModel):
61 |     input: str = Field(..., description="the input path to a text file or a Pub/Sub topic")
62 |     images_dir: str = Field(
63 |         None,
64 |         description="Path to the directory where images are stored. "
65 |         "Not required if image names in the input file have absolute paths.",
66 |     )
67 |     streaming: bool = False
68 | 
69 |     @validator("streaming", pre=True, always=True)
70 |     def set_streaming(cls, v, values):
71 |         return _validate_topic_path(values["input"])
72 | 
73 | 
74 | class SinkConfig(BaseModel):
75 |     output: str = Field(..., description="the output path to save results as a text file")
76 | 
--------------------------------------------------------------------------------
/my_project/pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | 
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | 
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | """A pipeline that uses RunInference API to perform image classification.""" 16 | 17 | # standard libraries 18 | import io 19 | import os 20 | from typing import Iterable, Iterator, Optional, Tuple, Union 21 | 22 | # third party libraries 23 | import apache_beam as beam 24 | import numpy as np 25 | import torch 26 | import torch.nn as nn 27 | from apache_beam.io.filesystems import FileSystems 28 | from apache_beam.ml.inference.base import KeyedModelHandler, PredictionResult, RunInference 29 | from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerTensor 30 | from apache_beam.ml.inference.tensorflow_inference import TFModelHandlerTensor 31 | from PIL import Image 32 | from torchvision import models, transforms 33 | 34 | # Dataflow ML libraries 35 | from my_project.config import ModelConfig, ModelName, SinkConfig, SourceConfig 36 | 37 | import tensorflow as tf # isort:skip 38 | 39 | 40 | def get_model_class(model_name: ModelName) -> nn.Module: 41 | model_dict = {ModelName.RESNET101: models.resnet101, ModelName.MOBILENET_V2: models.mobilenet_v2} 42 | 43 | model_class = model_dict.get(model_name) 44 | if not model_class: 45 | raise ValueError(f"cannot recognize the model {model_name}") 46 | return model_class 47 | 48 | 49 | def read_image(image_file_name: Union[str, bytes], path_to_dir: Optional[str] = None) -> Tuple[str, Image.Image]: 50 | if isinstance(image_file_name, bytes): 51 | image_file_name = image_file_name.decode() 52 | if path_to_dir is not None: 53 | image_file_name = os.path.join(path_to_dir, image_file_name) 54 | with FileSystems().open(image_file_name, "r") as file: 55 | data = Image.open(io.BytesIO(file.read())).convert("RGB") 56 | return image_file_name, data 57 | 58 | 59 | def preprocess_image(data: Image.Image) -> torch.Tensor: 60 | image_size = (224, 224) 61 | # Pre-trained PyTorch models expect input images normalized with the 62 | # below values (see: https://pytorch.org/vision/stable/models.html) 63 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 64 | transform = transforms.Compose( 65 | [ 66 | transforms.Resize(image_size), 67 | transforms.ToTensor(), 68 | normalize, 69 | ] 70 | ) 71 | return transform(data) 72 | 73 | 74 | def preprocess_image_for_tf(data: Image.Image) -> tf.Tensor: 75 | # Convert the input image to the type and dimensions required by the model. 76 | 77 | img = data.resize((224, 224)) 78 | img = np.array(img) / 255.0 79 | 80 | return tf.cast(tf.convert_to_tensor(img[...]), dtype=tf.float32) 81 | 82 | 83 | def filter_empty_lines(text: str) -> Iterator[str]: 84 | if len(text.strip()) > 0: 85 | yield text 86 | 87 | 88 | class PostProcessor(beam.DoFn): 89 | def process(self, element: Tuple[str, PredictionResult]) -> Iterable[str]: 90 | filename, prediction_result = element 91 | if isinstance(prediction_result.inference, torch.Tensor): 92 | prediction = torch.argmax(prediction_result.inference, dim=0) 93 | else: 94 | prediction = np.argmax(prediction_result.inference) 95 | yield filename + "," + str(prediction.item()) 96 | 97 | 98 | def build_pipeline(pipeline, source_config: SourceConfig, sink_config: SinkConfig, model_config: ModelConfig) -> None: 99 | """ 100 | Args: 101 | pipeline: a given input pipeline 102 | source_config: a source config 103 | sink_config: a sink config 104 | model_config: a model config to instantiate PytorchModelHandlerTensor 105 | """ 106 | 107 | # In this example we pass keyed inputs to RunInference transform. 
108 | # Therefore, we use KeyedModelHandler wrapper over PytorchModelHandler or TFModelHandlerTensor. 109 | if model_config.model_state_dict_path: 110 | model_handler = KeyedModelHandler( 111 | PytorchModelHandlerTensor( 112 | state_dict_path=model_config.model_state_dict_path, 113 | model_class=get_model_class(model_config.model_class_name), 114 | model_params=model_config.model_params, 115 | device=model_config.device, 116 | min_batch_size=model_config.min_batch_size, 117 | max_batch_size=model_config.max_batch_size, 118 | ) 119 | ) 120 | elif model_config.tf_model_uri: 121 | model_handler = KeyedModelHandler( 122 | TFModelHandlerTensor( 123 | model_uri=model_config.tf_model_uri, 124 | device=model_config.device, 125 | min_batch_size=model_config.min_batch_size, 126 | max_batch_size=model_config.max_batch_size, 127 | ) 128 | ) 129 | else: 130 | raise ValueError("Only support PytorchModelHandler and TFModelHandlerTensor!") 131 | 132 | if source_config.streaming: 133 | # read the text file path from Pub/Sub and use FixedWindows to group these images 134 | # and then run the model inference and store the results into GCS 135 | filename_value_pair = ( 136 | pipeline 137 | | "ReadImageNamesFromPubSub" >> beam.io.ReadFromPubSub(topic=source_config.input) 138 | | "Window into fixed intervals" >> beam.WindowInto(beam.window.FixedWindows(60 * 5)) 139 | | "ReadImageData" >> beam.Map(lambda image_name: read_image(image_file_name=image_name)) 140 | ) 141 | else: 142 | # read the text file and create the pair of input data with the file name and its image 143 | filename_value_pair = ( 144 | pipeline 145 | | "ReadImageNames" >> beam.io.ReadFromText(source_config.input) 146 | | "FilterEmptyLines" >> beam.ParDo(filter_empty_lines) 147 | | "ReadImageData" 148 | >> beam.Map(lambda image_name: read_image(image_file_name=image_name, path_to_dir=source_config.images_dir)) 149 | ) 150 | 151 | if model_config.model_state_dict_path: 152 | filename_value_pair = filename_value_pair | "PreprocessImages" >> beam.MapTuple( 153 | lambda file_name, data: (file_name, preprocess_image(data)) 154 | ) 155 | else: 156 | filename_value_pair = filename_value_pair | "PreprocessImages_TF" >> beam.MapTuple( 157 | lambda file_name, data: (file_name, preprocess_image_for_tf(data)) 158 | ) 159 | 160 | # do the model inference and postprocessing 161 | predictions = ( 162 | filename_value_pair 163 | | "RunInference" >> RunInference(model_handler) 164 | | "ProcessOutput" >> beam.ParDo(PostProcessor()) 165 | ) 166 | 167 | # combine all the window results into one text for GCS 168 | if source_config.streaming: 169 | ( 170 | predictions 171 | | "WriteOutputToGCS" 172 | >> beam.io.fileio.WriteToFiles(sink_config.output, shards=0) # pylint: disable=expression-not-assigned 173 | ) 174 | else: 175 | # save the predictions to a text file 176 | predictions | "WriteOutputToGCS" >> beam.io.WriteToText( # pylint: disable=expression-not-assigned 177 | sink_config.output, shard_name_template="", append_trailing_newlines=True 178 | ) 179 | -------------------------------------------------------------------------------- /my_project/run.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | 
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """A run module that runs a Beam pipeline to perform image classification."""
16 | 
17 | # standard libraries
18 | import argparse
19 | import logging
20 | 
21 | # third party libraries
22 | import apache_beam as beam
23 | from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
24 | from apache_beam.runners.runner import PipelineResult
25 | 
26 | # Dataflow ML libraries
27 | from my_project.config import ModelConfig, SinkConfig, SourceConfig
28 | from my_project.pipeline import build_pipeline
29 | 
30 | 
31 | def parse_known_args(argv):
32 |     """Parses args for the workflow."""
33 |     parser = argparse.ArgumentParser()
34 |     parser.add_argument("--input", dest="input", required=True, help="Path to the text file containing image names.")
35 |     parser.add_argument(
36 |         "--output", dest="output", required=True, help="Path where to save the output predictions text file."
37 |     )
38 |     parser.add_argument(
39 |         "--model_state_dict_path", dest="model_state_dict_path", required=False, help="Path to the model's state_dict."
40 |     )
41 |     parser.add_argument("--model_name", dest="model_name", required=False, help="model name, e.g. resnet101")
42 |     parser.add_argument(
43 |         "--tf_model_uri", dest="tf_model_uri", required=False, help="tfhub model URI from https://tfhub.dev/"
44 |     )
45 |     parser.add_argument(
46 |         "--images_dir",
47 |         default=None,
48 |         help="Path to the directory where images are stored. "
49 |         "Not required if image names in the input file have absolute paths.",
50 |     )
51 |     parser.add_argument(
52 |         "--device",
53 |         default="CPU",
54 |         help="Device to be used on the Runner. Choices are (CPU, GPU).",
55 |     )
56 |     return parser.parse_known_args(argv)
57 | 
58 | 
59 | def run(argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult:
60 |     """
61 |     Args:
62 |       argv: Command line arguments defined for this example.
63 |       save_main_session: Used for internal testing.
64 |       test_pipeline: Used for internal testing.
65 | """ 66 | known_args, pipeline_args = parse_known_args(argv) 67 | 68 | # setup configs 69 | model_config = ModelConfig( 70 | model_state_dict_path=known_args.model_state_dict_path, 71 | model_class_name=known_args.model_name, 72 | model_params={"num_classes": 1000}, 73 | tf_model_uri=known_args.tf_model_uri, 74 | device=known_args.device, 75 | ) 76 | 77 | source_config = SourceConfig(input=known_args.input) 78 | sink_config = SinkConfig(output=known_args.output) 79 | 80 | # setup pipeline 81 | pipeline_options = PipelineOptions(pipeline_args, streaming=source_config.streaming) 82 | pipeline_options.view_as(SetupOptions).save_main_session = save_main_session 83 | 84 | pipeline = test_pipeline 85 | if not test_pipeline: 86 | pipeline = beam.Pipeline(options=pipeline_options) 87 | 88 | # build the pipeline using configs 89 | build_pipeline(pipeline, source_config=source_config, sink_config=sink_config, model_config=model_config) 90 | 91 | # run it 92 | result = pipeline.run() 93 | result.wait_until_finish() 94 | return result 95 | 96 | 97 | if __name__ == "__main__": 98 | logging.getLogger().setLevel(logging.INFO) 99 | run() 100 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | [tool.black] 16 | line-length = 120 17 | include = '\.pyi?$' 18 | exclude = ''' 19 | 20 | ( 21 | /( 22 | \.eggs # exclude a few common directories in the 23 | | \.git # root of the project 24 | | \.hg 25 | | \.mypy_cache 26 | | \.tox 27 | | \.vscode 28 | | \.idea 29 | | \.ipynb_checkpoints 30 | | \.dvc 31 | | _build 32 | | buck-out 33 | | build 34 | | dist 35 | | venv 36 | | node_modules 37 | )/ 38 | | version.py # also separately exclude a file named foo.py in 39 | # the root of the project 40 | ) 41 | ''' -------------------------------------------------------------------------------- /pytorch_gpu.Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # This uses Ubuntu with Python 3.10 16 | ARG PYTORCH_SERVING_BUILD_IMAGE=pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime 17 | 18 | FROM ${PYTORCH_SERVING_BUILD_IMAGE} 19 | 20 | WORKDIR /workspace 21 | 22 | COPY requirements.txt requirements.txt 23 | 24 | RUN pip install --upgrade pip \ 25 | && pip install --no-cache-dir -r requirements.txt \ 26 | && rm -f requirements.txt 27 | 28 | # Copy files from official SDK image, including script/dependencies. 29 | COPY --from=apache/beam_python3.10_sdk:${BEAM_VERSION} /opt/apache/beam /opt/apache/beam 30 | 31 | # Set the entrypoint to Apache Beam SDK launcher. 32 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | #building 17 | setuptools>=67.7.2 18 | 19 | # format/lint 20 | flake8>=5.0.4 21 | isort>=5.6.4 22 | pre-commit>=2.9.3 23 | black>=22.3.0 24 | 25 | # test 26 | pytest>=6.2.1 27 | pytest-cov>=2.10.1 28 | pytest-ordering 29 | pytest-env 30 | -------------------------------------------------------------------------------- /requirements.prod.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apache-beam[gcp]==${BEAM_VERSION} 16 | pydantic<2.0.0 17 | torch>=1.7.1 18 | torchvision>=0.8.2 19 | pillow>=8.0.0 20 | tensorflow 21 | tensorflow_hub 22 | numpy<2.0.0 23 | pyOpenSSL 24 | -------------------------------------------------------------------------------- /scripts/check-beam.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2023 Google LLC 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # Import environment variables from .env file. 18 | source .env 19 | 20 | # Check if the project ID and zone environment variables are set. 21 | if [ -z "${PROJECT_ID}" ]; then 22 | echo "The PROJECT_ID environment variable is not set." 23 | exit 1 24 | fi 25 | 26 | if [ -z "${ZONE}" ]; then 27 | echo "The ZONE environment variable is not set." 28 | exit 1 29 | fi 30 | 31 | if [ -z "${VM_NAME}" ]; then 32 | echo "The VM_NAME environment variable is not set." 33 | exit 1 34 | fi 35 | 36 | if [ -z "${CUSTOM_CONTAINER_IMAGE}" ]; then 37 | echo "The CUSTOM_CONTAINER_IMAGE environment variable is not set." 38 | exit 1 39 | fi 40 | 41 | echo "Checking Python version on VM..." 42 | gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --quiet --command \ 43 | "docker run --entrypoint /bin/bash --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \ 44 | --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \ 45 | --privileged $CUSTOM_CONTAINER_IMAGE -c \ 46 | \"python --version\"" 47 | 48 | echo "Checking venv exists on VM..." 49 | gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --quiet --command \ 50 | "docker run --entrypoint /bin/bash --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \ 51 | --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \ 52 | --privileged $CUSTOM_CONTAINER_IMAGE -c \ 53 | 'apt list --installed | grep python3-venv'" 54 | 55 | echo "Checking Beam Version on VM..." 56 | gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --quiet --command \ 57 | "docker run --entrypoint /bin/bash --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \ 58 | --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \ 59 | --privileged $CUSTOM_CONTAINER_IMAGE -c \ 60 | \"python -c 'import apache_beam as beam; print(beam.__version__)'\"" -------------------------------------------------------------------------------- /scripts/check-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2023 Google LLC 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Import environment variables from .env file. 18 | source .env 19 | 20 | # Check if the project ID and zone environment variables are set. 21 | if [ -z "${PROJECT_ID}" ]; then 22 | echo "The PROJECT_ID environment variable is not set." 23 | exit 1 24 | fi 25 | 26 | if [ -z "${ZONE}" ]; then 27 | echo "The ZONE environment variable is not set." 28 | exit 1 29 | fi 30 | 31 | if [ -z "${VM_NAME}" ]; then 32 | echo "The VM_NAME environment variable is not set." 33 | exit 1 34 | fi 35 | 36 | if [ -z "${CUSTOM_CONTAINER_IMAGE}" ]; then 37 | echo "The CUSTOM_CONTAINER_IMAGE environment variable is not set." 
38 | exit 1 39 | fi 40 | 41 | vm_ssh="gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --quiet --command" 42 | vm_scp="gcloud compute scp --strict-host-key-checking=no --project $PROJECT_ID --zone=$ZONE --quiet" 43 | 44 | # Package the local code and copy it to VM 45 | PACKAGE_NAME="my_project-0.0.1" 46 | python3 setup.py sdist 47 | $vm_ssh "sudo rm -fr ~/*" 48 | $vm_scp dist/$PACKAGE_NAME.tar.gz data/openimage_10.txt $VM_NAME:~/ 49 | $vm_ssh "tar zxvf $PACKAGE_NAME.tar.gz; mv openimage_10.txt $PACKAGE_NAME" 50 | 51 | # Test the model on GPUs 52 | if [ -z "${TF_MODEL_URI}" ]; then 53 | echo "Running the PyTorch model on GPU..." 54 | $vm_ssh "docker run --entrypoint /bin/bash \ 55 | --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \ 56 | --volume /home/\$USER/:/workspace/\$USER --privileged $CUSTOM_CONTAINER_IMAGE -c \ 57 | \"cd \$USER/$PACKAGE_NAME; python -m my_project.run --input openimage_10.txt --output beam-output/beam_test_out.txt --model_state_dict_path $MODEL_STATE_DICT_PATH --model_name $MODEL_NAME --device GPU\"" 58 | else 59 | echo "Running the Tensorflow model on GPU..." 60 | $vm_ssh "docker run --entrypoint /bin/bash \ 61 | --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \ 62 | --volume /home/\$USER/:/workspace/\$USER --privileged $CUSTOM_CONTAINER_IMAGE -c \ 63 | \"cd \$USER/$PACKAGE_NAME; python -m my_project.run --input openimage_10.txt --output beam-output/beam_test_out.txt --tf_model_uri $TF_MODEL_URI --device GPU\"" 64 | fi 65 | 66 | $vm_ssh "[ -f './$PACKAGE_NAME/beam-output/beam_test_out.txt' ] && echo 'The DirectRunner run succeeded on GPU!' || echo 'The DirectRunner run failed on GPU!'" -------------------------------------------------------------------------------- /scripts/check-tf-on-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2023 Google LLC 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Import environment variables from .env file. 18 | source .env 19 | 20 | # Check if the project ID and zone environment variables are set. 21 | if [ -z "${PROJECT_ID}" ]; then 22 | echo "The PROJECT_ID environment variable is not set." 23 | exit 1 24 | fi 25 | 26 | if [ -z "${ZONE}" ]; then 27 | echo "The ZONE environment variable is not set." 28 | exit 1 29 | fi 30 | 31 | if [ -z "${VM_NAME}" ]; then 32 | echo "The VM_NAME environment variable is not set." 33 | exit 1 34 | fi 35 | 36 | if [ -z "${CUSTOM_CONTAINER_IMAGE}" ]; then 37 | echo "The CUSTOM_CONTAINER_IMAGE environment variable is not set." 38 | exit 1 39 | fi 40 | 41 | echo "Checking Tensorflow on GPU..." 
42 | gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --quiet --command \ 43 | "docker run --entrypoint /bin/bash --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \ 44 | --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \ 45 | --privileged $CUSTOM_CONTAINER_IMAGE -c \ 46 | \"python -c 'import tensorflow as tf; print(tf.config.list_physical_devices())'\"" -------------------------------------------------------------------------------- /scripts/check-torch-on-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2023 Google LLC 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Import environment variables from .env file. 18 | source .env 19 | 20 | # Check if the project ID and zone environment variables are set. 21 | if [ -z "${PROJECT_ID}" ]; then 22 | echo "The PROJECT_ID environment variable is not set." 23 | exit 1 24 | fi 25 | 26 | if [ -z "${ZONE}" ]; then 27 | echo "The ZONE environment variable is not set." 28 | exit 1 29 | fi 30 | 31 | if [ -z "${VM_NAME}" ]; then 32 | echo "The VM_NAME environment variable is not set." 33 | exit 1 34 | fi 35 | 36 | if [ -z "${CUSTOM_CONTAINER_IMAGE}" ]; then 37 | echo "The CUSTOM_CONTAINER_IMAGE environment variable is not set." 38 | exit 1 39 | fi 40 | 41 | echo "Checking PyTorch on GPU..." 42 | gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --quiet --command \ 43 | "docker run --entrypoint /bin/bash --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \ 44 | --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin \ 45 | --privileged $CUSTOM_CONTAINER_IMAGE -c \ 46 | \"python -c 'import torch; print(torch.cuda.is_available()); print(torch.cuda.get_device_name())'\"" -------------------------------------------------------------------------------- /scripts/create-gpu-vm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2023 Google LLC 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Import environment variables from .env file. 18 | source .env 19 | 20 | # Check if the project ID and zone environment variables are set. 21 | if [ -z "${PROJECT_ID}" ]; then 22 | echo "The PROJECT_ID environment variable is not set." 23 | exit 1 24 | fi 25 | 26 | if [ -z "${ZONE}" ]; then 27 | echo "The ZONE environment variable is not set." 
28 |     exit 1
29 | fi
30 | 
31 | if [ -z "${VM_NAME}" ]; then
32 |     echo "The VM_NAME environment variable is not set."
33 |     exit 1
34 | fi
35 | 
36 | if [ -z "${MACHINE_TYPE}" ]; then
37 |     echo "The MACHINE_TYPE environment variable is not set."
38 |     exit 1
39 | fi
40 | 
41 | # Set the number of GPUs to attach to the VM.
42 | GPU_COUNT=1
43 | GPU_TYPE="nvidia-tesla-t4"
44 | 
45 | # Create the VM.
46 | echo "Waiting for VM to be created (this will take a few minutes)..."
47 | 
48 | gcloud compute instances create $VM_NAME \
49 |     --project $PROJECT_ID \
50 |     --zone $ZONE \
51 |     --machine-type $MACHINE_TYPE \
52 |     --accelerator count=$GPU_COUNT,type=$GPU_TYPE \
53 |     --image-family cos-stable \
54 |     --image-project=cos-cloud \
55 |     --maintenance-policy TERMINATE \
56 |     --restart-on-failure \
57 |     --boot-disk-size=200G \
58 |     --scopes=cloud-platform
59 | 
60 | # Wait for the VM to be created.
61 | STATUS=""
62 | while [ "$STATUS" != "RUNNING" ]; do
63 |     sleep 5
64 |     STATUS=$(gcloud compute instances describe $VM_NAME --project $PROJECT_ID --zone=$ZONE --format="value(status)")
65 | done
66 | 
67 | echo "VM $VM_NAME is now running."
68 | 
69 | # Print the VM's IP address.
70 | echo "VM IP address: $(gcloud compute instances describe $VM_NAME --project $PROJECT_ID --zone=$ZONE --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
71 | 
72 | # Install GPU driver
73 | echo "Installing Nvidia GPU driver..."
74 | gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --tunnel-through-iap --quiet \
75 |     --command "cos-extensions install gpu && sudo mount --bind /var/lib/nvidia /var/lib/nvidia && sudo mount -o remount,exec /var/lib/nvidia"
76 | 
77 | vm_ssh="gcloud compute ssh --strict-host-key-checking=no $VM_NAME --project $PROJECT_ID --zone=$ZONE --quiet --command"
78 | 
79 | echo "Getting the GPU driver information..."
80 | $vm_ssh "/var/lib/nvidia/bin/nvidia-smi"
81 | 
82 | # docker-credential-gcr
83 | if [[ -n "$DOCKER_CREDENTIAL_REGISTRIES" ]]; then
84 |     echo "DOCKER_CREDENTIAL_REGISTRIES is defined."
85 |     echo "Authenticating $DOCKER_CREDENTIAL_REGISTRIES..."
86 |     $vm_ssh "docker-credential-gcr configure-docker --registries=$DOCKER_CREDENTIAL_REGISTRIES"
87 | fi
--------------------------------------------------------------------------------
/scripts/get_beam_version.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Google LLC
2 | 
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | 
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 15 | # third party libraries 16 | import requests 17 | from packaging.version import Version 18 | 19 | 20 | def beam_versions(package_name, limit_releases=10): 21 | url = f"https://pypi.org/pypi/{package_name}/json" 22 | data = requests.get(url).json() 23 | versions = list(data["releases"].keys()) 24 | versions.sort(key=Version, reverse=True) 25 | return versions[:limit_releases] 26 | 27 | 28 | print("\n".join(beam_versions("apache-beam", 1))) 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # standard libraries 16 | import os 17 | 18 | # third party libraries 19 | import setuptools 20 | 21 | required = [] 22 | if os.path.exists("requirements.txt"): 23 | with open("requirements.txt") as f: 24 | required = f.read().splitlines() 25 | 26 | setuptools.setup( 27 | name="my_project", 28 | version="0.0.1", 29 | install_requires=required, 30 | packages=["my_project"], 31 | ) 32 | -------------------------------------------------------------------------------- /tensor_rt.Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This needs Python 3.8 for your local runtime environment 16 | ARG PYTORCH_SERVING_BUILD_IMAGE=nvcr.io/nvidia/pytorch:22.11-py3 17 | 18 | FROM ${PYTORCH_SERVING_BUILD_IMAGE} 19 | 20 | ENV PATH="/usr/src/tensorrt/bin:${PATH}" 21 | 22 | WORKDIR /workspace 23 | 24 | COPY requirements.txt requirements.txt 25 | 26 | ENV DEBIAN_FRONTEND=noninteractive 27 | 28 | RUN apt-get update \ 29 | && apt install python3.8 python3.8-venv python3-venv -y \ 30 | && pip install --upgrade pip \ 31 | && apt-get install ffmpeg libsm6 libxext6 -y --no-install-recommends \ 32 | && pip install cuda-python onnx numpy onnxruntime common \ 33 | && pip install git+https://github.com/facebookresearch/detectron2.git@5aeb252b194b93dc2879b4ac34bc51a31b5aee13 \ 34 | && pip install git+https://github.com/NVIDIA/TensorRT#subdirectory=tools/onnx-graphsurgeon 35 | 36 | RUN pip install --no-cache-dir -r requirements.txt && rm -f requirements.txt 37 | 38 | # Copy files from official SDK image, including script/dependencies. 
39 | COPY --from=apache/beam_python3.8_sdk:${BEAM_VERSION} /opt/apache/beam /opt/apache/beam 40 | 41 | # Set the entrypoint to Apache Beam SDK launcher. 42 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /tensorflow_gpu.Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This needs Python 3.8 for your local runtime environment 16 | 17 | # Select an NVIDIA base image with desired GPU stack from https://ngc.nvidia.com/catalog/containers/nvidia:cuda 18 | FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 19 | 20 | WORKDIR /workspace 21 | 22 | COPY requirements.txt requirements.txt 23 | 24 | RUN \ 25 | # Add Deadsnakes repository that has a variety of Python packages for Ubuntu. 26 | # See: https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa 27 | apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 \ 28 | && echo "deb http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" >> /etc/apt/sources.list.d/custom.list \ 29 | && echo "deb-src http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" >> /etc/apt/sources.list.d/custom.list \ 30 | && apt-get update \ 31 | && apt-get install -y curl \ 32 | python3.8 \ 33 | python3.8-venv \ 34 | python3-venv \ 35 | # With python3.8 package, distutils need to be installed separately. 36 | python3-distutils \ 37 | && rm -rf /var/lib/apt/lists/* \ 38 | && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ 39 | && curl https://bootstrap.pypa.io/pip/3.8/get-pip.py | python \ 40 | && pip install --upgrade pip \ 41 | && pip install --no-cache-dir -r requirements.txt \ 42 | && pip install --no-cache-dir tensorflow==2.12.1 \ 43 | && pip install --no-cache-dir torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 44 | 45 | # Copy files from official SDK image, including script/dependencies. 46 | COPY --from=apache/beam_python3.8_sdk:${BEAM_VERSION} /opt/apache/beam /opt/apache/beam 47 | 48 | # Set the entrypoint to Apache Beam SDK launcher. 49 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /tensorflow_gpu.flex.Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This needs Python 3.8 for your local runtime environment 16 | 17 | FROM gcr.io/dataflow-templates-base/flex-template-launcher-image:latest as template_launcher 18 | 19 | # Select an NVIDIA base image with desired GPU stack from https://ngc.nvidia.com/catalog/containers/nvidia:cuda 20 | FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 21 | 22 | WORKDIR /workspace 23 | 24 | COPY requirements.txt requirements.txt 25 | 26 | RUN \ 27 | # Add Deadsnakes repository that has a variety of Python packages for Ubuntu. 28 | # See: https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa 29 | apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 \ 30 | && echo "deb http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" >> /etc/apt/sources.list.d/custom.list \ 31 | && echo "deb-src http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" >> /etc/apt/sources.list.d/custom.list \ 32 | && apt-get update \ 33 | && apt-get install -y curl \ 34 | python3.8 \ 35 | python3.8-venv \ 36 | python3-venv \ 37 | # With python3.8 package, distutils need to be installed separately. 38 | python3-distutils \ 39 | && rm -rf /var/lib/apt/lists/* \ 40 | && update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 \ 41 | && curl https://bootstrap.pypa.io/pip/3.8/get-pip.py | python \ 42 | && pip install --upgrade pip \ 43 | && pip install --no-cache-dir -r requirements.txt \ 44 | && pip install --no-cache-dir tensorflow==2.12.1 \ 45 | && pip install --no-cache-dir torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 46 | 47 | # Copy the run module 48 | COPY my_project/ /workspace/my_project 49 | RUN rm -fr /workspace/my_project/__pycache__ 50 | 51 | #Specifies which Python file to run to launch the Flex Template. 52 | ENV FLEX_TEMPLATE_PYTHON_PY_FILE="my_project/run.py" 53 | 54 | # Since we already downloaded all the dependencies, there's no need to rebuild everything. 55 | ENV PIP_NO_DEPS=True 56 | 57 | ENV PYTHONPATH "${PYTHONPATH}:/workspace/my_project/" 58 | 59 | # Copy the Dataflow Template launcher 60 | COPY --from=template_launcher /opt/google/dataflow/python_template_launcher /opt/google/dataflow/python_template_launcher 61 | 62 | # Copy files from official SDK image, including script/dependencies. 63 | # Note Python 3.8 is used since the above setup uses Python 3.8. 64 | COPY --from=apache/beam_python3.8_sdk:${BEAM_VERSION} /opt/apache/beam /opt/apache/beam 65 | 66 | # Set the entrypoint to the Dataflow Template launcher 67 | # Use this if the launcher image is different with the custom container image 68 | # ENTRYPOINT ["/opt/google/dataflow/python_template_launcher"] 69 | 70 | # Set the entrypoint to Apache Beam SDK launcher. 
71 | ENTRYPOINT ["/opt/apache/beam/boot"] -------------------------------------------------------------------------------- /tests/sample.env.pytorch: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### PYTHON SDK SETTINGS 3 | ################################################################################ 4 | PYTHON_VERSION=3.10 5 | BEAM_VERSION=2.60.0 6 | DOCKERFILE_TEMPLATE=pytorch_gpu.Dockerfile 7 | DOCKER_CREDENTIAL_REGISTRIES="us-docker.pkg.dev" 8 | ################################################################################ 9 | ### GCP SETTINGS 10 | ################################################################################ 11 | PROJECT_ID=apache-beam-testing 12 | REGION=us-central1 13 | ZONE=us-central1-f 14 | DISK_SIZE_GB=50 15 | MACHINE_TYPE=n1-standard-2 16 | VM_NAME=beam-ml-starter-gpu 17 | ################################################################################ 18 | ### DATAFLOW JOB SETTINGS 19 | ################################################################################ 20 | STAGING_LOCATION=gs://temp-storage-for-perf-tests/loadtests 21 | TEMP_LOCATION=gs://temp-storage-for-perf-tests/loadtests 22 | CUSTOM_CONTAINER_IMAGE=us-docker.pkg.dev/apache-beam-testing/dataflow-ml-starter/pytorch_gpu:test 23 | SERVICE_OPTIONS="worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver" 24 | ################################################################################ 25 | ### DATAFLOW JOB MODEL SETTINGS 26 | ################################################################################ 27 | MODEL_STATE_DICT_PATH="gs://apache-beam-ml/models/torchvision.models.mobilenet_v2.pth" 28 | MODEL_NAME=mobilenet_v2 29 | ################################################################################ 30 | ### DATAFLOW JOB INPUT&OUTPUT SETTINGS 31 | ################################################################################ 32 | INPUT_DATA="gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt" 33 | OUTPUT_DATA="gs://temp-storage-for-end-to-end-tests/temp-storage-for-end-to-end-tests/dataflow-ml-starter/result_gpu.txt" -------------------------------------------------------------------------------- /tests/sample.env.tf: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | ### PYTHON SDK SETTINGS 3 | ################################################################################ 4 | PYTHON_VERSION=3.8 5 | BEAM_VERSION=2.48.0 6 | DOCKERFILE_TEMPLATE=tensorflow_gpu.Dockerfile 7 | DOCKER_CREDENTIAL_REGISTRIES="us-docker.pkg.dev" 8 | ################################################################################ 9 | ### GCP SETTINGS 10 | ################################################################################ 11 | PROJECT_ID=apache-beam-testing 12 | REGION=us-central1 13 | ZONE=us-central1-f 14 | DISK_SIZE_GB=50 15 | MACHINE_TYPE=n1-standard-2 16 | VM_NAME=beam-ml-starter-gpu 17 | ################################################################################ 18 | ### DATAFLOW JOB SETTINGS 19 | ################################################################################ 20 | STAGING_LOCATION=gs://temp-storage-for-perf-tests/loadtests 21 | TEMP_LOCATION=gs://temp-storage-for-perf-tests/loadtests 22 | CUSTOM_CONTAINER_IMAGE=us-docker.pkg.dev/apache-beam-testing/dataflow-ml-starter/tf_gpu:test 23 | 
SERVICE_OPTIONS="worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver" 24 | ################################################################################ 25 | ### DATAFLOW JOB MODEL SETTINGS 26 | ################################################################################ 27 | #TF_MODEL_URI: only support TF2 models (https://tfhub.dev/s?subtype=module,placeholder&tf-version=tf2) 28 | TF_MODEL_URI=https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification/4 29 | ################################################################################ 30 | ### DATAFLOW JOB INPUT&OUTPUT SETTINGS 31 | ################################################################################ 32 | INPUT_DATA="gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt" 33 | OUTPUT_DATA="gs://temp-storage-for-end-to-end-tests/temp-storage-for-end-to-end-tests/dataflow-ml-starter/result_gpu.txt" -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # standard libraries 16 | from pathlib import Path 17 | 18 | # third party libraries 19 | import apache_beam as beam 20 | 21 | # Dataflow ML libraries 22 | # dfml libraries 23 | from my_project.config import ModelConfig, SinkConfig, SourceConfig 24 | from my_project.pipeline import build_pipeline 25 | 26 | DATA_FILE_PATH = Path(__file__).parent.parent / "data" 27 | 28 | 29 | def test_build_pipeline(): 30 | model_config = ModelConfig( 31 | model_state_dict_path="gs://apache-beam-ml/models/torchvision.models.resnet101.pth", 32 | model_class_name="resnet101", 33 | model_params={"num_classes": 1000}, 34 | ) 35 | source_config = SourceConfig(input=str(DATA_FILE_PATH / "openimage_10.txt")) 36 | sink_config = SinkConfig(output="beam-output/my_output.txt") 37 | 38 | p = beam.Pipeline() 39 | build_pipeline(p, source_config=source_config, sink_config=sink_config, model_config=model_config) 40 | 41 | 42 | def test_build_pipeline_with_tf(): 43 | model_config = ModelConfig( 44 | tf_model_uri="https://tfhub.dev/google/imagenet/mobilenet_v1_075_192/quantops/classification/3", 45 | ) 46 | source_config = SourceConfig(input=str(DATA_FILE_PATH / "openimage_10.txt")) 47 | sink_config = SinkConfig(output="beam-output/my_output.txt") 48 | 49 | p = beam.Pipeline() 50 | build_pipeline(p, source_config=source_config, sink_config=sink_config, model_config=model_config) 51 | 52 | 53 | def test_source_config_streaming(): 54 | source_config = SourceConfig(input=str(DATA_FILE_PATH / "openimage_10.txt")) 55 | assert source_config.streaming is False 56 | source_config = SourceConfig(input="projects/apache-beam-testing/topics/Imagenet_openimage_50k_benchmark") 57 | assert source_config.streaming is True 58 | --------------------------------------------------------------------------------