├── .devcontainer ├── devcontainer.env └── devcontainer.json ├── .editorconfig ├── .gcloudignore ├── .github ├── dependabot.yml ├── linters │ ├── .checkov.baseline │ ├── .checkov.yaml │ ├── .gitleaks.toml │ ├── .hadolint.yaml │ ├── .jscpd.json │ ├── .markdown-lint.yaml │ ├── .yaml-lint.yml │ ├── super-linter-fix-mode.env │ └── super-linter.env ├── renovate.json └── workflows │ ├── addlicense.yaml │ ├── conventional-commits.yaml │ └── lint.yaml ├── .gitignore ├── CONTRIBUTING.md ├── DATAFLOW.md ├── LICENSE ├── README.md ├── assets └── deployment-architecture.png ├── build ├── int.cloudbuild.yaml ├── terratest-builder-image │ ├── Dockerfile │ ├── README.md │ └── cloudbuild.yaml └── test │ ├── Makefile │ ├── composer_test.go │ ├── go.mod │ ├── go.sum │ ├── helpers │ ├── enable_docai_sa.sh │ └── ingest_test_docs.sh │ └── infra_test.go ├── components ├── __init__.py ├── common-infra │ ├── README.md │ └── terraform │ │ ├── alloydb.tf │ │ ├── bigquery.tf │ │ ├── gcs.tf │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ ├── versions.tf │ │ └── vpc.tf ├── doc-classifier │ ├── README.md │ ├── src │ │ ├── Procfile │ │ ├── doc_classifier_main.py │ │ ├── requirements.in │ │ └── requirements.txt │ └── terraform │ │ ├── build.tf │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf ├── doc-deletion │ ├── src │ │ ├── Procfile │ │ ├── doc_deletion_main.py │ │ ├── requirements.in │ │ └── requirements.txt │ └── terraform │ │ ├── build.tf │ │ ├── doc-deletion.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf ├── doc-registry │ ├── src │ │ ├── Procfile │ │ ├── document_registry_service.py │ │ ├── requirements.in │ │ └── requirements.txt │ └── terraform │ │ ├── bigquery.tf │ │ ├── build.tf │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf ├── dpu-workflow │ ├── README.md │ ├── requirements.in │ ├── src │ │ ├── __init__.py │ │ ├── docs_processing_orchestrator.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── 
cloud_run_utils.py │ │ │ ├── datastore_utils.py │ │ │ ├── docai_utils.py │ │ │ ├── file_utils.py │ │ │ └── gcs_utils.py │ └── terraform │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf ├── post-setup-config │ ├── src │ │ ├── Procfile │ │ ├── dbconfig_main.py │ │ ├── requirements.in │ │ └── requirements.txt │ └── terraform │ │ ├── alloydb-config.tf │ │ ├── build.tf │ │ ├── variables.tf │ │ └── versions.tf ├── processing │ ├── .gcloudignore │ ├── .gitignore │ ├── README.md │ ├── libs │ │ ├── processor-base │ │ │ ├── README.md │ │ │ ├── pyproject.toml │ │ │ └── src │ │ │ │ └── processors │ │ │ │ ├── base │ │ │ │ ├── .coverage │ │ │ │ ├── __init__.py │ │ │ │ ├── gcsio.py │ │ │ │ ├── model.py │ │ │ │ ├── result_writer.py │ │ │ │ └── test_gcsio.py │ │ │ │ └── zip │ │ │ │ └── unzip_processor.py │ │ ├── processor-msg │ │ │ ├── pyproject.toml │ │ │ └── src │ │ │ │ └── processors │ │ │ │ └── msg │ │ │ │ ├── __init__.py │ │ │ │ ├── main_processor.py │ │ │ │ ├── msg_generator.py │ │ │ │ ├── msg_processor.py │ │ │ │ └── run.py │ │ └── processor-xlsx │ │ │ ├── pyproject.toml │ │ │ └── src │ │ │ └── processors │ │ │ └── xlsx │ │ │ ├── __init__.py │ │ │ ├── xlsx_generator.py │ │ │ └── xlsx_processor.py │ ├── tasks.py │ └── terraform │ │ ├── README.md │ │ ├── build.sh │ │ ├── build.tf │ │ ├── build │ │ ├── Dockerfile │ │ ├── cloudbuild.yaml.template │ │ ├── pyproject.toml │ │ └── requirements.txt │ │ ├── env.template │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── variables.tf │ │ └── versions.tf ├── specialized-parser │ ├── src │ │ ├── Procfile │ │ ├── configs.py │ │ ├── parser_main.py │ │ ├── requirements.in │ │ ├── requirements.txt │ │ └── runner.py │ └── terraform │ │ ├── bigquery.tf │ │ ├── build.tf │ │ ├── docai.tf │ │ ├── main.tf │ │ ├── outputs.tf │ │ ├── processed_documents.json │ │ ├── variables.tf │ │ └── versions.tf ├── utils │ ├── converter.py │ ├── main.py │ └── requirements.txt └── webui │ ├── .gitignore │ ├── README.md │ ├── app │ └── images │ 
│ └── logo.png │ ├── build.sh │ ├── deploy.sh │ ├── pyproject.toml │ ├── requirements.txt │ ├── run_proxy.sh │ ├── src │ ├── Home.py │ ├── dpu │ │ ├── api.py │ │ └── components.py │ └── pages │ │ ├── 1_Search_Documents.py │ │ └── 2_Browse_Documents.py │ ├── tasks.py │ └── terraform │ ├── build.tf │ ├── build │ ├── Dockerfile │ └── cloudbuild.yaml.template │ ├── cloudrun.tf │ ├── main.tf │ ├── outputs.tf │ ├── variables.tf │ └── versions.tf ├── docs ├── LICENSE_HEADER.txt ├── access_control.md └── foundation.md ├── invoke.sh ├── pyproject.toml ├── reqs ├── README.md ├── constraints.txt ├── requirements_all.in ├── requirements_all.txt ├── requirements_bootstrap.in ├── requirements_bootstrap.txt └── requirements_dev.in ├── run.sh ├── sample-deployments └── composer-orchestrated-process │ ├── DEPLOYMENT.md │ ├── README.md │ ├── USE.md │ ├── custom_iap_brand_admin.yaml │ ├── documents-for-testing │ ├── financial-documents │ │ ├── CHRO.pdf │ │ ├── DPU V1 Demo.pdf │ │ ├── HLXB.pdf │ │ ├── INTJ.pdf │ │ ├── RYDE.pdf │ │ └── questions.txt │ ├── forms-to-train-docai │ │ ├── test │ │ │ ├── CMS1500_06.pdf │ │ │ ├── CMS1500_07.pdf │ │ │ ├── CMS1500_08.pdf │ │ │ ├── CMS1500_09.pdf │ │ │ ├── CMS1500_10.pdf │ │ │ ├── CMS1500_11.pdf │ │ │ ├── CMS1500_12.pdf │ │ │ ├── CMS1500_13.pdf │ │ │ ├── CMS1500_14.pdf │ │ │ ├── CMS1500_15.pdf │ │ │ ├── pa-form-test20.pdf │ │ │ ├── pa-form-test21.pdf │ │ │ ├── package_3.pdf │ │ │ ├── package_4.pdf │ │ │ └── package_5.pdf │ │ └── train │ │ │ ├── CMS1500_01.pdf │ │ │ ├── CMS1500_02.pdf │ │ │ ├── CMS1500_03.pdf │ │ │ ├── CMS1500_04.pdf │ │ │ ├── CMS1500_05.pdf │ │ │ ├── CMS1500_16.pdf │ │ │ ├── CMS1500_17.pdf │ │ │ ├── CMS1500_18.pdf │ │ │ ├── CMS1500_19.pdf │ │ │ ├── CMS1500_20.pdf │ │ │ ├── pa-form-test1.pdf │ │ │ ├── pa-form-test22.pdf │ │ │ ├── pa-form-test23.pdf │ │ │ ├── pa-form-test24.pdf │ │ │ ├── pa-form-test25.pdf │ │ │ ├── package_1.pdf │ │ │ ├── package_2.pdf │ │ │ └── package_3.pdf │ ├── healthcare-billing-codes │ │ ├── 
2024_DHS_Code_List_Addendum_11_29_2023.xlsx │ │ └── questions.txt │ └── network-ops-rca-documents │ │ ├── 937701.pdf │ │ ├── 937702.pdf │ │ ├── 937703.pdf │ │ ├── 937704.pdf │ │ ├── 937705.pdf │ │ ├── 937706.pdf │ │ ├── 937707.pdf │ │ ├── 937708.pdf │ │ ├── 937709.pdf │ │ ├── 937710.pdf │ │ └── questions.txt │ ├── env.template │ ├── main.tf │ ├── outputs.tf │ ├── persona_roles_DEPLOYER.txt │ ├── persona_roles_OPERATOR.txt │ ├── persona_roles_READER.txt │ ├── persona_roles_UPLOADER.txt │ ├── project_apis.txt │ ├── remote-backend.tf │ ├── scripts │ ├── apply_persona_roles.sh │ ├── common.sh │ ├── delete_doc.sh │ ├── find_document.sh │ ├── pre_tf_setup.sh │ ├── reset_datastore.sh │ └── trigger_workflow.sh │ ├── variables.tf │ └── versions.tf ├── scripts └── lint.sh └── tasks.py /.devcontainer/devcontainer.env: -------------------------------------------------------------------------------- 1 | DEFAULT_WORKSPACE=/workspaces/enterprise-knowledge-solution 2 | RUN_LOCAL=true 3 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json", 3 | "name": "Cloud Solutions devcontainer", 4 | "image": "ghcr.io/super-linter/super-linter:v7.1.0", 5 | "customizations": { 6 | "vscode": { 7 | "settings": { 8 | "css.validate": false, 9 | "editor.formatOnSave": true, 10 | "editor.formatOnSaveMode": "file", 11 | "editor.rulers": [80], 12 | "editor.wordWrap": "off", 13 | "files.insertFinalNewline": true, 14 | "files.trimFinalNewlines": true, 15 | "hadolint.cliOptions": [ 16 | "--config", 17 | "/workspaces/.github/linters/.hadolint.yaml" 18 | ], 19 | "less.validate": false, 20 | "markdownlint.config": { 21 | "extends": "${workspaceFolder}/.github/linters/.markdown-lint.yaml" 22 | }, 23 | "prettier.resolveGlobalModules": true, 24 | 
"redhat.telemetry.enabled": false, 25 | "scss.validate": false, 26 | "telemetry.telemetryLevel": "off", 27 | "[css]": { 28 | "editor.defaultFormatter": "esbenp.prettier-vscode" 29 | }, 30 | "[html]": { 31 | "editor.defaultFormatter": "esbenp.prettier-vscode" 32 | }, 33 | "[javascript]": { 34 | "editor.defaultFormatter": "esbenp.prettier-vscode" 35 | }, 36 | "[javascriptreact]": { 37 | "editor.defaultFormatter": "esbenp.prettier-vscode" 38 | }, 39 | "[json]": { 40 | "editor.defaultFormatter": "esbenp.prettier-vscode" 41 | }, 42 | "[jsonc]": { 43 | "editor.defaultFormatter": "esbenp.prettier-vscode" 44 | }, 45 | "[markdown]": { 46 | "editor.defaultFormatter": "esbenp.prettier-vscode", 47 | "editor.wordWrap": "off" 48 | }, 49 | "[python]": { 50 | "editor.defaultFormatter": "ms-python.black-formatter" 51 | }, 52 | "[scss]": { 53 | "editor.defaultFormatter": "esbenp.prettier-vscode" 54 | }, 55 | "[shellscript]": { 56 | "editor.defaultFormatter": "mkhl.shfmt" 57 | }, 58 | "[terraform]": { 59 | "editor.defaultFormatter": "hashicorp.terraform" 60 | }, 61 | "[terraform-vars]": { 62 | "editor.defaultFormatter": "hashicorp.terraform" 63 | }, 64 | "[typescript]": { 65 | "editor.defaultFormatter": "esbenp.prettier-vscode" 66 | }, 67 | "[typescriptreact]": { 68 | "editor.defaultFormatter": "esbenp.prettier-vscode" 69 | }, 70 | "[yaml]": { 71 | "editor.defaultFormatter": "esbenp.prettier-vscode" 72 | } 73 | }, 74 | "extensions": [ 75 | "DavidAnson.vscode-markdownlint", 76 | "EditorConfig.EditorConfig", 77 | "esbenp.prettier-vscode", 78 | "exiasr.hadolint", 79 | "HashiCorp.terraform", 80 | "mads-hartmann.bash-ide-vscode", 81 | "mkhl.shfmt", 82 | "ms-azuretools.vscode-docker", 83 | "ms-python.black-formatter", 84 | "ms-python.pylint", 85 | "stylelint.vscode-stylelint", 86 | "timonwong.shellcheck" 87 | ] 88 | } 89 | }, 90 | "runArgs": ["--rm", "--env-file", ".github/linters/super-linter.env"] 91 | } 92 | 
-------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig: http://EditorConfig.org 2 | root = true 3 | 4 | # Unix-style newlines at the bottom of every file 5 | [*] 6 | end_of_line = lf 7 | charset = utf-8 8 | 9 | # Sets the natural language that should be used for spell checking. 10 | # Only one language can be specified 11 | # Ref: https://spec.editorconfig.org/#supported-pairs 12 | spelling_language = en-US 13 | 14 | # Space indentation 15 | indent_style = space 16 | indent_size = 4 17 | 18 | # Make sure every file has a blank line at the end 19 | insert_final_newline = true 20 | 21 | # Remove any whitespace characters preceding newline characters 22 | trim_trailing_whitespace = true 23 | 24 | # Give operators breathing room, but not brackets 25 | spaces_around_operators = true 26 | spaces_around_brackets = false 27 | 28 | [{Corefile,Corefile.jinja,Dockerfile}] 29 | indent_size = 2 30 | 31 | [*.{hcl.tpl,ino,json,lock.hcl,js,sh,tf,tfvars,yml,yaml,yaml.jinja}] 32 | indent_size = 2 33 | 34 | [{Makefile,**.mk,go.mod,go.sum,*.go,.gitmodules}] 35 | indent_size = 4 36 | indent_style = tab 37 | 38 | # Defined by markdownlint 39 | [*.md] 40 | indent_size = unset 41 | indent_style = unset 42 | 43 | # Match the Google Shell Style Guide: https://google.github.io/styleguide/shellguide.html 44 | [*.sh] 45 | indent_size = 2 46 | indent_style = space 47 | # Files here are shell scripts but don't have a sh extension 48 | # This is a non-standard editorconfig feature that shfmt supports 49 | # Ref: https://github.com/mvdan/sh/blob/master/cmd/shfmt/shfmt.1.scd#examples 50 | 51 | [[shell]] 52 | indent_size = 2 53 | indent_style = space 54 | 55 | # Don't try to format binary files 56 | [*.tfvars.enc] 57 | charset = unset 58 | end_of_line = unset 59 | insert_final_newline = unset 60 | trim_trailing_whitespace = unset 61 | indent_style = unset 62 | 
indent_size = unset 63 | 64 | [LICENSE] 65 | # Don't try forcing a style to the LICENSE file because it's a rendered template 66 | indent_size = unset 67 | indent_style = unset 68 | 69 | # Don't try forcing a style to the YAML template because it's rendered 70 | [**/**.yaml.template] 71 | indent_size = unset 72 | indent_style = unset 73 | 74 | # Don't try forcing a style to the requirements_*.in because it's rendered 75 | [**/**.in] 76 | indent_size = unset 77 | indent_style = unset 78 | 79 | # Don't try forcing a style to the requirements_*.txt because it's rendered 80 | [**/requirements*.txt] 81 | indent_size = unset 82 | indent_style = unset 83 | -------------------------------------------------------------------------------- /.gcloudignore: -------------------------------------------------------------------------------- 1 | .git/** 2 | .venv/** 3 | .devcontainer/** 4 | .github/** 5 | assets/** 6 | docs/** 7 | sample-deployments/** 8 | scripts/** 9 | *.md 10 | *.sh 11 | *.tf 12 | *.template 13 | **/__pycache__/** 14 | **/.terraform/** 15 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | --- 16 | updates: 17 | - package-ecosystem: "github-actions" 18 | directory: "/" 19 | schedule: 20 | interval: "weekly" 21 | commit-message: 22 | prefix: "chore(deps):" 23 | 24 | - package-ecosystem: "terraform" 25 | directory: "/components" 26 | schedule: 27 | interval: "weekly" 28 | commit-message: 29 | prefix: "chore(deps):" 30 | 31 | version: 2 32 | -------------------------------------------------------------------------------- /.github/linters/.checkov.baseline: -------------------------------------------------------------------------------- 1 | { 2 | "failed_checks": [] 3 | } 4 | -------------------------------------------------------------------------------- /.github/linters/.checkov.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | # Options reference: https://www.checkov.io/2.Basics/CLI%20Command%20Reference.html 17 | 18 | baseline: .github/linters/.checkov.baseline 19 | 20 | # scan for terraform rules only. 
Dockerfile checks introduce too many fixes not relevant for our scenario 21 | framework: terraform 22 | 23 | # Report skipped baseline checks in the output 24 | output-baseline-as-skipped: true 25 | 26 | # don't download external modules, causes false positives with GITLEAKS which detects an example api key in the TF module for service accounts 27 | download-external-modules: false 28 | 29 | # CKV2_GCP_22,CKV_GCP_84,CKV_GCP_80,CKV_GCP_82 disable checks for CMEK/CSEK on resources, not relevant to the sample repo, but relevant for production 30 | # CKV_GCP_121 disable check for BQ deletion protection. This is too disruptive for development, but relevant for production 31 | # CKV_GCP_62 Access and Storage Logging is the legacy feature, no longer recommended 32 | skip-check: CKV2_GCP_22,CKV_GCP_84,CKV_GCP_80,CKV_GCP_121,CKV_GCP_82,CKV_GCP_62 33 | 34 | skip-path: /tmp/* 35 | -------------------------------------------------------------------------------- /.github/linters/.gitleaks.toml: -------------------------------------------------------------------------------- 1 | [extend] 2 | useDefault = true 3 | 4 | [allowlist] 5 | paths = ['''\.venv''', '''.*\/\.venv.*\/.*'''] 6 | -------------------------------------------------------------------------------- /.github/linters/.hadolint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | ignored: 17 | - DL3059 # "multiple consecutive RUN commands, consider consolidation" https://github.com/hadolint/hadolint/wiki/DL3059 18 | -------------------------------------------------------------------------------- /.github/linters/.jscpd.json: -------------------------------------------------------------------------------- 1 | { 2 | "threshold": 0, 3 | "reporters": ["consoleFull"], 4 | "ignore": ["**/.venv*/**"], 5 | "absolute": true, 6 | "noSymlinks": true 7 | } 8 | -------------------------------------------------------------------------------- /.github/linters/.markdown-lint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | extends: "markdownlint/style/prettier" 17 | -------------------------------------------------------------------------------- /.github/linters/.yaml-lint.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | extends: default 17 | 18 | locale: "en_US.UTF-8" 19 | 20 | rules: 21 | braces: 22 | max-spaces-inside: 1 23 | max-spaces-inside-empty: 0 24 | min-spaces-inside: 1 25 | min-spaces-inside-empty: 0 26 | colons: 27 | max-spaces-after: 1 28 | comments: 29 | # As required by prettier 30 | min-spaces-from-content: 1 31 | comments-indentation: 32 | level: error 33 | document-start: 34 | level: error 35 | empty-values: enable 36 | line-length: 37 | allow-non-breakable-inline-mappings: true 38 | max: 250 39 | octal-values: enable 40 | truthy: 41 | level: error 42 | -------------------------------------------------------------------------------- /.github/linters/super-linter-fix-mode.env: -------------------------------------------------------------------------------- 1 | FIX_ENV=true 2 | FIX_JSON=true 3 | FIX_JSON_PRETTIER=true 4 | FIX_MARKDOWN=true 5 | FIX_MARKDOWN_PRETTIER=true 6 | FIX_PYTHON_BLACK=true 7 | FIX_PYTHON_ISORT=true 8 | FIX_PYTHON_RUFF=true 9 | FIX_SHELL_SHFMT=true 10 | FIX_TERRAFORM_FMT=true 11 | FIX_YAML_PRETTIER=true 12 | -------------------------------------------------------------------------------- /.github/linters/super-linter.env: -------------------------------------------------------------------------------- 1 | CREATE_LOG_FILE=false 2 | DEFAULT_BRANCH=main 3 | GITLEAKS_LOG_LEVEL=warn 4 | IGNORE_GITIGNORED_FILES=true 5 | MARKDOWN_CONFIG_FILE=.markdown-lint.yaml 6 | REMOVE_ANSI_COLOR_CODES_FROM_OUTPUT=true 7 | SAVE_SUPER_LINTER_OUTPUT=true 8 | VALIDATE_BASH=true 9 | VALIDATE_BASH_EXEC=true 10 | 
VALIDATE_CHECKOV=true 11 | VALIDATE_DOCKERFILE_HADOLINT=true 12 | VALIDATE_EDITORCONFIG=true 13 | VALIDATE_ENV=true 14 | VALIDATE_GITHUB_ACTIONS=true 15 | VALIDATE_GITLEAKS=true 16 | VALIDATE_JSON=true 17 | VALIDATE_JSON_PRETTIER=true 18 | VALIDATE_MARKDOWN=true 19 | VALIDATE_MARKDOWN_PRETTIER=true 20 | VALIDATE_NATURAL_LANGUAGE=true 21 | VALIDATE_PYTHON_BLACK=true 22 | VALIDATE_PYTHON_FLAKE8=true 23 | VALIDATE_PYTHON_ISORT=true 24 | VALIDATE_PYTHON_MYPY=true 25 | VALIDATE_PYTHON_RUFF=true 26 | VALIDATE_RENOVATE=true 27 | VALIDATE_SHELL_SHFMT=true 28 | VALIDATE_TERRAFORM_FMT=true 29 | VALIDATE_TERRAFORM_TFLINT=true 30 | VALIDATE_YAML=true 31 | VALIDATE_YAML_PRETTIER=true 32 | -------------------------------------------------------------------------------- /.github/renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": ["config:recommended", ":semanticCommits"], 4 | "dependencyDashboardApproval": true, 5 | "rangeStrategy": "bump", 6 | "customManagers": [ 7 | { 8 | "customType": "regex", 9 | "description": "Update _VERSION variables in Dockerfiles, shell scripts, and files in the ci-tasks directory", 10 | "fileMatch": [ 11 | "(^|/|\\.)([Dd]ocker|[Cc]ontainer)file$", 12 | "(^|/)([Dd]ocker|[Cc]ontainer)file[^/]*$", 13 | "(^|/)*.sh", 14 | "ci-tasks/.*$" 15 | ], 16 | "matchStrings": [ 17 | "# renovate: datasource=(?<datasource>[a-z-]+?)(?: depName=(?<depName>.+?))? packageName=(?<packageName>.+?)(?: versioning=(?<versioning>[a-z-]+?))?\\s(?:ENV|ARG)?\\s*.+?_VERSION=\"?(?<currentValue>.+?)\"?\\s" 18 | ] 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /.github/workflows/addlicense.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | name: addlicense 17 | 18 | on: # yamllint disable-line rule:truthy 19 | pull_request: null 20 | 21 | permissions: 22 | contents: read 23 | 24 | jobs: 25 | addlicense: 26 | permissions: 27 | contents: write 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@v4 31 | with: 32 | fetch-depth: 0 33 | 34 | - name: Pull container for addlicense 35 | run: docker pull ghcr.io/google/addlicense:latest 36 | 37 | - name: invoke addlicense 38 | run: docker run -v "${PWD}":/src ghcr.io/google/addlicense -l "apache" -c "Google LLC" . 39 | 40 | - name: Commit and push linting fixes 41 | # Run only on: 42 | # - Pull requests 43 | # - Not on the default branch 44 | if: > 45 | github.event_name == 'pull_request' && 46 | github.ref_name != github.event.repository.default_branch 47 | uses: stefanzweifel/git-auto-commit-action@v5 48 | env: 49 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 50 | with: 51 | branch: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref }} 52 | commit_message: "chore: add license headers" 53 | commit_user_name: addlicense 54 | commit_user_email: no-reply@addlicense.dev 55 | -------------------------------------------------------------------------------- /.github/workflows/conventional-commits.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | name: Conventional Commits 17 | 18 | on: # yamllint disable-line rule:truthy 19 | pull_request: 20 | types: [opened, synchronize, reopened, edited] 21 | 22 | permissions: 23 | pull-requests: read 24 | 25 | jobs: 26 | validate-pr-title: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - name: PR Conventional Commit Validation 30 | uses: ytanikin/PRConventionalCommits@1.3.0 31 | with: 32 | task_types: '["feat","fix","docs","test","ci","refactor","perf","chore","revert"]' 33 | add_label: "false" 34 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | --- 16 | name: Lint 17 | 18 | on: # yamllint disable-line rule:truthy 19 | push: null 20 | pull_request: null 21 | workflow_call: null 22 | 23 | permissions: 24 | contents: read 25 | 26 | jobs: 27 | lint: 28 | concurrency: 29 | # Ref: https://docs.github.com/en/actions/learn-github-actions/contexts#github-context 30 | # github.head_ref: head_ref or source branch of the pull request 31 | # github.ref: ref of the branch that triggered the workflow 32 | group: ${{ github.workflow }}-lint-${{ github.head_ref || github.ref }}-${{ github.event_name }} 33 | cancel-in-progress: true 34 | permissions: 35 | contents: read 36 | packages: read 37 | statuses: write 38 | runs-on: ubuntu-latest 39 | steps: 40 | - uses: actions/checkout@v4 41 | with: 42 | fetch-depth: 0 43 | - name: Load super-linter configuration 44 | # Use grep inverse matching to exclude eventual comments in the .env file 45 | # because the GitHub Actions command to set environment variables doesn't 46 | # support comments. 
47 | # Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-an-environment-variable 48 | run: grep -v '^#' .github/linters/super-linter.env >> "$GITHUB_ENV" 49 | - name: Super-Linter 50 | uses: super-linter/super-linter@v7.2.1 51 | env: 52 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | .DS_Store 3 | local_dev/** 4 | 5 | # Python 6 | **/__pycache__/** 7 | **/.venv/** 8 | 9 | # Terraform 10 | **/terraform.tfstate* 11 | **/terraform.tfvars 12 | **/.terraform.tfstate* 13 | **/backend.tf 14 | **/.terraform/** 15 | **/.terraform.lock.hcl 16 | **/samples/ 17 | 18 | # Generated files 19 | .env 20 | 21 | # Egg info directories 22 | **/*.egg-info/** 23 | sample-deployments/composer-orchestrated-process/dp.notes 24 | sample-deployments/composer-orchestrated-process/env.sh 25 | 26 | # Cloud SDK credentials 27 | **/.credentials/** 28 | sample-deployments/composer-orchestrated-process/terraform.tfvars.dp 29 | sample-deployments/composer-orchestrated-process/terraform.tfvars.go_demos 30 | .gitignore 31 | components/doc-classifier/terraform/build/cloudbuild.yaml 32 | 33 | # Linting and formatting 34 | super-linter-output 35 | super-linter.log 36 | .external_modules/* 37 | .mypy_cache/* 38 | 39 | # integration testing 40 | **/.test-data/* 41 | -------------------------------------------------------------------------------- /assets/deployment-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/assets/deployment-architecture.png -------------------------------------------------------------------------------- 
/build/terratest-builder-image/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | FROM golang:1.23.2 16 | 17 | ARG tf_version=1.8.1-1 18 | 19 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 20 | 21 | # Install Terraform and jq 22 | RUN apt-get update && apt-get install -y gnupg=2.2.40-1.1 software-properties-common=0.99.30-4.1~deb12u1 curl=7.88.1-10+deb12u7 jq=1.6-2.1 --no-install-recommends \ 23 | && wget -nv -O- https://apt.releases.hashicorp.com/gpg | gpg --dearmor | tee /usr/share/keyrings/hashicorp-archive-keyring.gpg \ 24 | && echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/hashicorp.list \ 25 | && apt-get update \ 26 | && apt-get -y install terraform=${tf_version} --no-install-recommends \ 27 | && apt-get clean \ 28 | && rm -rf /var/lib/apt/lists/* 29 | 30 | # Install gcloud SDK 31 | RUN wget -nv https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-497.0.0-linux-x86_64.tar.gz && \ 32 | tar -xvzf google-cloud-cli-497.0.0-linux-x86_64.tar.gz && \ 33 | ./google-cloud-sdk/install.sh -q && \ 34 | rm google-cloud-cli-497.0.0-linux-x86_64.tar.gz 35 | 36 | # make gcloud available on PATH 37 | ENV PATH=$PATH:/go/google-cloud-sdk/bin 38 | 
-------------------------------------------------------------------------------- /build/terratest-builder-image/README.md: -------------------------------------------------------------------------------- 1 | # How to use this image 2 | 3 | The builder image used to run integration tests in this directory is created in an internal project. This page documents how to recreate and update the image. 4 | 5 | 1. Navigate to the `build/terratest-builder-image/` directory and run the following command: 6 | 7 | ```bash 8 | gcloud builds submit . --project=$PROJECT_ID 9 | ``` 10 | 11 | 1. Define a tag value for the new build, and ensure that `BUILDER_IMAGE_TAG` in [/build/int.cloudbuild.yaml](/build/int.cloudbuild.yaml) uses the correct tag value. 12 | -------------------------------------------------------------------------------- /build/terratest-builder-image/cloudbuild.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | --- 16 | steps: 17 | - name: "gcr.io/google.com/cloudsdktool/cloud-sdk:slim" 18 | script: | 19 | docker build -t us-central1-docker.pkg.dev/$PROJECT_ID/ci/test-builder . 
20 | automapSubstitutions: true 21 | images: 22 | - "us-central1-docker.pkg.dev/$PROJECT_ID/ci/test-builder" 23 | -------------------------------------------------------------------------------- /build/test/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | infra-test: ## Deploy infrastructure 16 | @echo "Running infrastructure deployment test..." 17 | @go test -v -run ./ -timeout 90m --tags=infra 18 | 19 | dag-test: ## upload documents, trigger workflows, and check the results 20 | @echo "running functional test against Composer workflows..." 21 | @go test -v -run ./ -timeout 90m --tags=dag 22 | 23 | .PHONY: help 24 | .DEFAULT_GOAL := help 25 | 26 | help: 27 | @fgrep -h "##" $(MAKEFILE_LIST) | sed -e 's/\(\:.*\#\#\)/\:\ /' | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//' 28 | -------------------------------------------------------------------------------- /build/test/helpers/enable_docai_sa.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Grants roles/documentai.apiUser on $CICD_PROJECT_ID to the classifier service account read from the terraform outputs. 18 | # Fail fast on command errors, unset variables (e.g. CICD_PROJECT_ID) and failed pipeline stages. 19 | set -euo pipefail 20 | 21 | PARENT_DIR="$(dirname "$0")" 22 | 23 | outputs=$(terraform -chdir="$PARENT_DIR/../../../sample-deployments/composer-orchestrated-process/" output -json) 24 | CLASSIFIER_SA=$(echo "$outputs" | jq -r ".classifier_service_account.value") 25 | 26 | # jq -r prints the literal string "null" when the output is absent; never bind a role to "serviceAccount:null". 27 | if [[ -z "$CLASSIFIER_SA" || "$CLASSIFIER_SA" == "null" ]]; then 28 | echo "classifier_service_account is missing from the terraform outputs" >&2 29 | exit 1 30 | fi 31 | 32 | # stdout (the resulting IAM policy) is discarded; errors still reach stderr. 33 | gcloud projects add-iam-policy-binding "$CICD_PROJECT_ID" \ 34 | --member="serviceAccount:${CLASSIFIER_SA}" \ 35 | --role="roles/documentai.apiUser" \ 36 | --condition=None \ 37 | 1>/dev/null 38 | -------------------------------------------------------------------------------- /build/test/helpers/ingest_test_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | gcloud storage cp gs://"$CICD_PROJECT_ID"-testdocs/* gs://docs-input-"$TEST_PROJECT_ID"/ 18 | -------------------------------------------------------------------------------- /build/test/infra_test.go: -------------------------------------------------------------------------------- 1 | //go:build infra 2 | 3 | /** 4 | * Copyright 2023 Google LLC 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * https://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package test 20 | 21 | import ( 22 | "context" 23 | "testing" 24 | 25 | logger "github.com/gruntwork-io/terratest/modules/logger" 26 | terraform "github.com/gruntwork-io/terratest/modules/terraform" 27 | test_structure "github.com/gruntwork-io/terratest/modules/test-structure" 28 | envconfig "github.com/sethvargo/go-envconfig" 29 | ) 30 | 31 | type TestConfig struct { 32 | ProjectId string `env:"PROJECT_ID,required"` 33 | Region string `env:"REGION,required"` 34 | DocAiLocation string `env:"DOC_AI_LOCATION,required"` 35 | VertexAiDataStoreRegion string `env:"VERTEX_AI_DATA_STORE_REGION,required"` 36 | IapAccessDomains string `env:"IAP_ACCESS_DOMAINS,required"` 37 | WebUiDomains string `env:"WEB_UI_DOMAINS,required"` 38 | CustomClassifierId string `env:"CUSTOM_CLASSIFIER_ID, required"` 39 | } 40 | 41 | func TestE2e(t *testing.T) { 42 | 43 | var config TestConfig 44 | 45 | ctx := context.Background() 46 | err := envconfig.Process(ctx, &config) 47 | if err != nil { 48 | logger.Log(t, "There was an error 
processing the supplied environment variables:") 49 | logger.Log(t, err) 50 | t.Fatal() 51 | } 52 | 53 | terraformDir := "../../sample-deployments/composer-orchestrated-process" 54 | 55 | test_structure.RunTestStage(t, "setup", func() { 56 | terraformOptions := &terraform.Options{ 57 | TerraformDir: terraformDir, 58 | 59 | Vars: map[string]interface{}{ 60 | "project_id": config.ProjectId, 61 | "region": config.Region, 62 | "iap_access_domains": config.IapAccessDomains, 63 | "webui_domains": config.WebUiDomains, 64 | "docai_location": config.DocAiLocation, 65 | "vertex_ai_data_store_region": config.VertexAiDataStoreRegion, 66 | "custom_classifier_id": config.CustomClassifierId, 67 | }, 68 | NoColor: true, 69 | } 70 | 71 | test_structure.SaveTerraformOptions(t, terraformDir, terraformOptions) 72 | terraform.Init(t, terraformOptions) 73 | }) 74 | 75 | test_structure.RunTestStage(t, "apply", func() { 76 | terraformOptions := test_structure.LoadTerraformOptions(t, terraformDir) 77 | terraform.InitAndApply(t, terraformOptions) 78 | }) 79 | 80 | test_structure.RunTestStage(t, "migrate-tfstate", func() { 81 | terraformOptions := test_structure.LoadTerraformOptions(t, terraformDir) 82 | terraformOptions.MigrateState = true 83 | terraform.InitE(t, terraformOptions) 84 | }) 85 | } 86 | -------------------------------------------------------------------------------- /components/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /components/common-infra/README.md: -------------------------------------------------------------------------------- 1 | # Common Infra module 2 | 3 | The module provisions the common resources required by the Enterprise Knowledge Solution (EKS), which include the following: 4 | 5 | | Name | Description | 6 | | ----------------------------------------- | ---------------------------------------------------------------------------------------------------------- | 7 | | BigQuery `docs_store` Dataset | A dataset where the parsed document metadata are stored and used as input to the Agent Builder Data Store | 8 | | Google Cloud Storage `docs-input` Bucket | Storage bucket for users of EKS to drop documents for ingestion into the solution | 9 | | Google Cloud Storage `dpu-process` Bucket | Storage bucket used by document processing workflow to store the ingested documents | 10 | | Google Cloud Storage `dpu-reject` Bucket | Storage bucket used by document processing workflow to store the documents that failed to parse | 11 | | Artifact Registry `dpu-docker-repo` | Container repository for hosting of custom containers used by EKS | 12 | | VPC Network `dpu-network` | Common VPC network for the EKS solution | 13 | 14 | ## Inputs 15 | 16 | | Name | Description | 17 | | ---------------- | --------------------------------------------------------------------------------------------- | 18 | | project_id | Google Cloud project where the common resources for EKS are provisioned | 19 | | region | Google Cloud region where the resources are provisioned, e.g. 
`us-central1`, `europe-west1` etc | 20 | | bq_store_dataset | (Optional) Dataset name for the document store, default: `docs_store` | 21 | 22 | ## Output 23 | 24 | | Name | Description | 25 | | ----------------------- | ---------------------------------------------------------------------- | 26 | | artifact_repo | The Artifact Registry object representing the repository being created | 27 | | bq_store_dataset_id | The ID of the docs_store dataset being created | 28 | | gcs_input_bucket_name | GCS bucket name for the input bucket | 29 | | gcs_process_bucket_name | GCS bucket name for the process bucket | 30 | | gcs_reject_bucket_name | GCS bucket name for the reject bucket | 31 | | project_id | The project under which all the resources are being created | 32 | | vpc_network_id | The ID of the common VPC network being created | 33 | | vpc_network_name | The name of the common VPC network being created | 34 | -------------------------------------------------------------------------------- /components/common-infra/terraform/bigquery.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | module "docs_store_dataset" { 16 | source = "github.com/terraform-google-modules/terraform-google-bigquery?ref=0fe8ab60d7291a2260cd460d55cdcca7fc815a0d" # commit hash of version 8.1.0 17 | dataset_id = var.bq_store_dataset 18 | dataset_name = var.bq_store_dataset 19 | project_id = module.project_services.project_id 20 | location = var.region 21 | dataset_labels = local.dpu_label 22 | } 23 | -------------------------------------------------------------------------------- /components/common-infra/terraform/gcs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | module "input_bucket" { 16 | source = "github.com/terraform-google-modules/terraform-google-cloud-storage.git//modules/simple_bucket?ref=e8bb6eb49fdaf5f6f300d1b6dc46f097173dc488" # version 6.1.0, this commit is needed where kms is upgraded to 3.0, otherwise will get version conflict for google provider 17 | project_id = module.project_services.project_id 18 | name = "docs-input-${var.project_id}" 19 | location = var.region 20 | force_destroy = false 21 | labels = local.dpu_label 22 | public_access_prevention = "enforced" 23 | } 24 | 25 | module "process_bucket" { 26 | source = "github.com/terraform-google-modules/terraform-google-cloud-storage.git//modules/simple_bucket?ref=e8bb6eb49fdaf5f6f300d1b6dc46f097173dc488" # version 6.1.0, this commit is needed where kms is upgraded to 3.0, otherwise will get version conflict for google provider 27 | project_id = module.project_services.project_id 28 | name = "dpu-process-${var.project_id}" 29 | location = var.region 30 | force_destroy = false 31 | labels = local.dpu_label 32 | public_access_prevention = "enforced" 33 | } 34 | 35 | module "reject_bucket" { 36 | source = "github.com/terraform-google-modules/terraform-google-cloud-storage.git//modules/simple_bucket?ref=e8bb6eb49fdaf5f6f300d1b6dc46f097173dc488" # version 6.1.0, this commit is needed where kms is upgraded to 3.0, otherwise will get version conflict for google provider 37 | project_id = module.project_services.project_id 38 | name = "dpu-reject-${var.project_id}" 39 | location = var.region 40 | force_destroy = false 41 | labels = local.dpu_label 42 | public_access_prevention = "enforced" 43 | } 44 | -------------------------------------------------------------------------------- /components/common-infra/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in 
compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | dpu_label = { 17 | goog-packaged-solution : "eks-solution" 18 | } 19 | } 20 | module "project_services" { 21 | source = "github.com/terraform-google-modules/terraform-google-project-factory.git//modules/project_services?ref=ff00ab5032e7f520eb3961f133966c6ced4fd5ee" # commit hash of version 17.0.0 22 | project_id = var.project_id 23 | disable_services_on_destroy = false 24 | disable_dependent_services = false 25 | activate_apis = [ 26 | "artifactregistry.googleapis.com", 27 | "cloudbuild.googleapis.com", 28 | "bigquery.googleapis.com", 29 | "alloydb.googleapis.com", 30 | "servicenetworking.googleapis.com", 31 | "dns.googleapis.com", 32 | "vpcaccess.googleapis.com" 33 | ] 34 | } 35 | 36 | resource "google_artifact_registry_repository" "docker-repo" { 37 | project = module.project_services.project_id 38 | format = "DOCKER" 39 | location = var.region 40 | repository_id = "dpu-docker-repo" 41 | description = "Docker containers" 42 | labels = local.dpu_label 43 | } 44 | 45 | module "cloud_build_account" { 46 | source = "github.com/terraform-google-modules/terraform-google-service-accounts?ref=a11d4127eab9b51ec9c9afdaf51b902cd2c240d9" #commit hash of version 4.0.0 47 | project_id = var.project_id 48 | names = ["cloud-build"] 49 | project_roles = [ 50 | "${var.project_id}=>roles/logging.logWriter", 51 | "${var.project_id}=>roles/storage.admin", 52 | "${var.project_id}=>roles/artifactregistry.writer", 53 | "${var.project_id}=>roles/run.developer", 54 | 
"${var.project_id}=>roles/iam.serviceAccountUser", 55 | ] 56 | display_name = "Cloud Build Service Account" 57 | description = "specific custom service account for Cloud Build" 58 | } 59 | 60 | 61 | # Propagation time for change of access policy typically takes 2 minutes 62 | # according to https://cloud.google.com/iam/docs/access-change-propagation 63 | # this wait make sure the policy changes are propagated before proceeding 64 | # with the build 65 | resource "time_sleep" "wait_for_policy_propagation" { 66 | create_duration = "120s" 67 | depends_on = [ 68 | module.cloud_build_account 69 | ] 70 | } 71 | -------------------------------------------------------------------------------- /components/common-infra/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | output "artifact_repo" { 15 | description = "Cloud arctifact repository object" 16 | value = google_artifact_registry_repository.docker-repo 17 | } 18 | 19 | output "cloud_build_service_account" { 20 | description = "IAM service account to run builds on top of Cloud Build" 21 | value = module.cloud_build_account 22 | } 23 | 24 | output "project_id" { 25 | description = "Google Cloud project user by the module." 
26 | value = module.project_services.project_id 27 | } 28 | 29 | output "gcs_input_bucket_name" { 30 | description = "Cloud Storage bucket for input files" 31 | value = module.input_bucket.name 32 | } 33 | 34 | output "gcs_process_bucket_name" { 35 | description = "Cloud Storage bucket for processing of files" 36 | value = module.process_bucket.name 37 | } 38 | 39 | output "gcs_reject_bucket_name" { 40 | description = "Cloud Storage bucket for storing documents that were not able to be processed" 41 | value = module.reject_bucket.name 42 | } 43 | 44 | output "bq_store_dataset_id" { 45 | description = "BigQuery data store dataset" 46 | value = module.docs_store_dataset.bigquery_dataset.dataset_id 47 | } 48 | 49 | output "vpc_network_id" { 50 | value = local.vpc_network_id 51 | } 52 | 53 | output "vpc_network_name" { 54 | value = local.vpc_network_name 55 | } 56 | 57 | output "alloydb_cluster_name" { 58 | value = module.docs_results.cluster_name 59 | } 60 | 61 | output "alloydb_primary_instance" { 62 | value = module.docs_results.primary_instance_id 63 | } 64 | 65 | output "alloydb_location" { 66 | value = var.region 67 | } 68 | 69 | output "alloydb_cluster_ready" { 70 | description = "creating the alloydb resource in terraform does not guarantee it's in the ready state, so subsequent steps fail. 
This resource exists to force a sleep_timer that is referencable from other modules " 71 | value = true 72 | depends_on = [time_sleep.wait_for_alloydb_ready_state] 73 | } 74 | 75 | output "serverless_connector_subnet" { 76 | description = "the subnet used by Cloud Run for private access to alloydb" 77 | value = google_compute_subnetwork.serverless_connector_subnet.name 78 | } 79 | -------------------------------------------------------------------------------- /components/common-infra/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | variable "project_id" { 15 | type = string 16 | description = "project id required" 17 | } 18 | 19 | variable "region" { 20 | type = string 21 | description = "Cloud region where the resources are created" 22 | } 23 | 24 | variable "bq_store_dataset" { 25 | description = "BigQuery dataset" 26 | type = string 27 | default = "docs_store" 28 | } 29 | 30 | variable "create_vpc_network" { 31 | type = bool 32 | description = "configuration to manage vpc creation" 33 | default = true 34 | } 35 | 36 | variable "vpc_name" { 37 | type = string 38 | description = "name of vpc network" 39 | default = "dpu-network" 40 | } 41 | 42 | variable "alloy_db_cluster_id" { 43 | description = "AlloyDB Cluster ID" 44 | type = string 45 | default = "eks-docs-results" 46 | } 47 | 48 | variable "serverless_connector_subnet" { 49 | description = "Name of the VPC subnet to create" 50 | type = string 51 | } 52 | 53 | variable "serverless_connector_subnet_range" { 54 | description = "Range of the VPC subnet to create" 55 | type = string 56 | } 57 | 58 | variable "alloydb_psc_endpoint" { 59 | description = "Name of Private Service Connect endpoint used for access to AlloyDB from your VPC" 60 | type = string 61 | default = "alloydb-psc-endpoint" 62 | } 63 | 64 | variable "alloydb_psc_fwd_rule" { 65 | description = "Name of the forwarding rule associated with the alloydb PSC endpoint" 66 | type = string 67 | default = "alloydb-psc-fwd-rule" 68 | } 69 | 70 | variable "alloydb_psc_dns" { 71 | description = "Name of the DNS zone associated with AlloyDB PSC access" 72 | type = string 73 | default = "alloydb-psc-dns" 74 | } 75 | 76 | -------------------------------------------------------------------------------- /components/common-infra/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance 
with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_version = ">=1.5.7" 17 | 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = ">= 5.23.0" 22 | } 23 | time = { 24 | source = "hashicorp/time" 25 | version = "0.12.1" 26 | } 27 | } 28 | 29 | provider_meta "google" { 30 | module_name = "cloud-solutions/dpu-solution-v1.0.0" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /components/doc-classifier/src/Procfile: -------------------------------------------------------------------------------- 1 | web: python3 doc_classifier_main.py 2 | -------------------------------------------------------------------------------- /components/doc-classifier/src/requirements.in: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | google-cloud-documentai 16 | -------------------------------------------------------------------------------- /components/doc-classifier/terraform/build.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | locals { 17 | cloud_build_content_hash = sha512( 18 | join("", [ 19 | for f in fileset(path.module, "../src/**") : 20 | filesha512("${path.module}/${f}") 21 | ] 22 | ) 23 | ) 24 | service_account_name = var.classifier_cloud_run_job_name 25 | } 26 | 27 | # See github.com/terraform-google-modules/terraform-google-gcloud 28 | module "gcloud_build_doc_classifier" { 29 | source = "github.com/terraform-google-modules/terraform-google-gcloud?ref=db25ab9c0e9f2034e45b0034f8edb473dde3e4ff" # commit hash of version 3.5.0 30 | create_cmd_entrypoint = "gcloud" 31 | create_cmd_body = <<-EOT 32 | auth configure-docker ${var.region}-docker.pkg.dev && \ 33 | gcloud builds submit ${path.module}/../src \ 34 | --pack image=${local.image_name_and_tag} \ 35 | --project ${var.project_id} \ 36 | --region ${var.region} \ 37 | --default-buckets-behavior regional-user-owned-bucket \ 38 | --service-account "projects/${var.project_id}/serviceAccounts/${var.cloud_build_service_account_email}" 39 | 40 | EOT 41 | enabled = true 42 | 43 | create_cmd_triggers = { 44 | source_contents_hash = local.cloud_build_content_hash 45 | 
} 46 | } 47 | -------------------------------------------------------------------------------- /components/doc-classifier/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "classifier_cloud_run_job_name" { 16 | description = "Cloud Run doc classifier job name" 17 | value = var.classifier_cloud_run_job_name 18 | } 19 | 20 | output "classifier_service_account" { 21 | description = "Document classifier service account" 22 | value = module.doc_classifier_account.email 23 | } 24 | -------------------------------------------------------------------------------- /components/doc-classifier/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_id" { 16 | description = "Google Cloud project where infrastructure resource are deployed" 17 | type = string 18 | } 19 | 20 | variable "region" { 21 | description = "Google Cloud region where compute services are located." 22 | type = string 23 | } 24 | 25 | variable "artifact_repo" { 26 | description = "Docker registry" 27 | type = string 28 | default = "" 29 | } 30 | 31 | variable "classifier_cloud_run_job_name" { 32 | description = "Doc classifier job name" 33 | type = string 34 | default = "doc-classifier" 35 | } 36 | 37 | variable "cloud_build_service_account_email" { 38 | description = "the user-managed service account configured for Cloud Build" 39 | type = string 40 | } 41 | 42 | variable "vpc_network_name" { 43 | type = string 44 | description = "The name of the network where subnets will be created" 45 | } 46 | 47 | variable "serverless_connector_subnet" { 48 | description = "Name of the VPC subnet to create" 49 | type = string 50 | } 51 | -------------------------------------------------------------------------------- /components/doc-classifier/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_version = ">=1.5.7" 17 | 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = ">= 5.23.0" 22 | } 23 | } 24 | 25 | provider_meta "google" { 26 | module_name = "cloud-solutions/dpu-solution-v1.0.0" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /components/doc-deletion/src/Procfile: -------------------------------------------------------------------------------- 1 | web: python3 doc_deletion_main.py 2 | -------------------------------------------------------------------------------- /components/doc-deletion/src/requirements.in: -------------------------------------------------------------------------------- 1 | google-cloud-storage 2 | google-cloud-bigquery 3 | google-cloud-discoveryengine 4 | google-api-python-client 5 | sqlalchemy 6 | google-cloud-alloydb-connector[pg8000] 7 | -------------------------------------------------------------------------------- /components/doc-deletion/terraform/build.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
locals {
  # Fully qualified image reference built from region, project, repository and job name.
  image_name_and_tag = "${var.region}-docker.pkg.dev/${var.project_id}/${var.artifact_repo}/${var.doc_deletion_cloud_run_job_name}:latest"

  # Every file below ../src, relative to this module.
  cloud_build_fileset = fileset(path.module, "../src/**/*")

  # One digest over the per-file digests; used below as the rebuild trigger.
  cloud_build_content_hash = sha512(join("", [
    for f in local.cloud_build_fileset :
    filesha512("${path.module}/${f}")
  ]))
}

# See github.com/terraform-google-modules/terraform-google-gcloud
# Runs `gcloud builds submit` against ../src to build the doc-deletion image.
module "gcloud_build_job_to_doc_deletion" {
  source = "github.com/terraform-google-modules/terraform-google-gcloud?ref=db25ab9c0e9f2034e45b0034f8edb473dde3e4ff" # commit hash of version 3.5.0

  enabled = true

  # The create command is keyed on the source content hash.
  create_cmd_triggers = {
    source_contents_hash = local.cloud_build_content_hash
  }

  create_cmd_entrypoint = "gcloud"
  create_cmd_body       = <<-EOT
    builds submit ${path.module}/../src \
      --pack image=${local.image_name_and_tag} \
      --project ${var.project_id} \
      --region ${var.region} \
      --default-buckets-behavior regional-user-owned-bucket \
      --service-account "projects/${var.project_id}/serviceAccounts/${var.cloud_build_service_account_email}"
  EOT
}
# Outputs of the doc-deletion Terraform module.

# Name of the Cloud Run job resource declared in this module.
output "doc_deletion_cloud_run_job_name" {
  value       = google_cloud_run_v2_job.doc_deletion_job.name
  description = "Cloud Run doc deletion job name"
}

# Email exposed by the doc_deletion_account module.
output "doc_deletion_service_account" {
  value       = module.doc_deletion_account.email
  description = "Doc deletion service account"
}

output "doc_deletion_db_user" {
  value       = google_alloydb_user.doc_deletion_db_user.user_id
  description = "The AlloyDB db role associated with the service account identity of the doc deletion Cloud Run job"
}

# SHA-512 of the id of terraform_data.dbrole_deployment_trigger.
# NOTE(review): the description mentions specialized_parser_role although this
# is the doc-deletion module -- looks copy-pasted; confirm the intended wording.
output "db_role_content_hash" {
  value       = sha512(terraform_data.dbrole_deployment_trigger.id)
  description = "Additional deployment trigger to force rerun module.gcloud_build_job_to_configure_alloydb_schema if terraform reverts the db roles on specialized_parser_role (flaky)"
}
# Terraform core and provider version constraints for the doc-deletion module.
terraform {
  required_version = ">=1.5.7"

  # Module metadata handed to the google provider.
  provider_meta "google" {
    module_name = "cloud-solutions/dpu-solution-v1.0.0"
  }

  required_providers {
    # Pinned to one exact release, unlike the google provider's lower bound.
    time = {
      version = "0.12.1"
      source  = "hashicorp/time"
    }
    google = {
      version = ">= 5.23.0"
      source  = "hashicorp/google"
    }
  }
}
# BigQuery dataset holding the document registry table.
module "docs_registry_dataset" {
  source = "github.com/terraform-google-modules/terraform-google-bigquery?ref=0fe8ab60d7291a2260cd460d55cdcca7fc815a0d" # commit hash of version 8.1.0

  project_id = module.project_services.project_id
  location   = var.region

  # The same variable supplies both the dataset id and its display name.
  dataset_id     = var.bq_registry_dataset
  dataset_name   = var.bq_registry_dataset
  dataset_labels = local.eks_label

  tables = [
    {
      table_id = var.bq_registry_table
      labels   = local.eks_label

      # No partitioning or expiration; the table is clustered on two columns.
      clustering         = ["crc32", "fileName"]
      time_partitioning  = null
      range_partitioning = null
      expiration_time    = null

      # Four required STRING columns.
      schema = <<-EOT
        [
          {
            "mode": "REQUIRED",
            "name": "id",
            "type": "STRING"
          },
          {
            "mode": "REQUIRED",
            "name": "fileName",
            "type": "STRING"
          },
          {
            "mode": "REQUIRED",
            "name": "gcsUri",
            "type": "STRING"
          },
          {
            "mode": "REQUIRED",
            "name": "crc32",
            "type": "STRING"
          }
        ]
      EOT
    }
  ]
}
locals {
  # One digest over the per-file digests of everything matched by ../src/**;
  # used below as the rebuild trigger.
  cloud_build_content_hash = sha512(join("", [
    for f in fileset(path.module, "../src/**") : filesha512("${path.module}/${f}")
  ]))

  # Fully qualified image reference built from region, project, repository and job name.
  image_name_and_tag = "${var.region}-docker.pkg.dev/${var.project_id}/${var.artifact_repo}/${var.doc_registry_service_cloud_run_job_name}:latest"
}

# See github.com/terraform-google-modules/terraform-google-gcloud
# Runs `gcloud builds submit` against ../src to build the doc-registry image.
module "gcloud_build_doc_registry" {
  source = "github.com/terraform-google-modules/terraform-google-gcloud?ref=db25ab9c0e9f2034e45b0034f8edb473dde3e4ff" # commit hash of version 3.5.0

  enabled = true

  # The create command is keyed on the source content hash.
  create_cmd_triggers = {
    source_contents_hash = local.cloud_build_content_hash
  }

  create_cmd_entrypoint = "gcloud"
  create_cmd_body       = <<-EOT
    builds submit ${path.module}/../src \
      --pack image=${local.image_name_and_tag} \
      --project ${var.project_id} \
      --region ${var.region} \
      --default-buckets-behavior regional-user-owned-bucket \
      --service-account "projects/${var.project_id}/serviceAccounts/${var.cloud_build_service_account_email}"
  EOT
}
# Outputs of the doc-registry Terraform module.

# Project id as reported by the project_services module.
output "project_id" {
  description = "Google Cloud project used by the module."
  value       = module.project_services.project_id
}

output "bq_registry_dataset_id" {
  description = "BigQuery document registry dataset"
  value       = module.docs_registry_dataset.bigquery_dataset.dataset_id
}

# First entry of the dataset module's table id list.
output "bq_registry_table_id" {
  description = "BigQuery document registry table"
  value       = module.docs_registry_dataset.table_ids[0]
}

output "doc_registry_service_cloud_run_job_name" {
  description = "Doc Registry service job name"
  value       = google_cloud_run_v2_job.doc-registry-service-job.name
}
# Input variables for the doc-registry Terraform module.

variable "project_id" {
  description = "project id required"
  type        = string
}

variable "region" {
  description = "Cloud region where the resources are created"
  type        = string
}

# The dataset and table defaults intentionally share the same name.
variable "bq_registry_dataset" {
  description = "BigQuery dataset"
  type        = string
  default     = "docs_registry"
}

variable "bq_registry_table" {
  description = "BigQuery table for aggregated document registry"
  type        = string
  default     = "docs_registry"
}

variable "doc_registry_service_cloud_run_job_name" {
  description = "Doc registry service job name"
  type        = string
  default     = "doc-registry-service"
}

# Optional: falls back to an empty string when the caller does not set it.
variable "artifact_repo" {
  description = "Docker registry"
  type        = string
  default     = ""
}

variable "cloud_build_service_account_email" {
  description = "the user-managed service account configured for Cloud Build"
  type        = string
}

variable "vpc_network_name" {
  description = "The name of the network where subnets will be created"
  type        = string
}

variable "serverless_connector_subnet" {
  description = "Name of the VPC subnet to create"
  type        = string
}
# Terraform core and provider version constraints for the doc-registry module.
terraform {
  required_version = ">=1.5.7"

  # Module metadata handed to the google provider.
  # NOTE(review): other component modules use a "dpu-solution" module_name here,
  # this one uses "eks-solution" -- confirm whether the difference is intended.
  provider_meta "google" {
    module_name = "cloud-solutions/eks-solution-v1.0.0"
  }

  required_providers {
    google = {
      version = ">= 5.23.0"
      source  = "hashicorp/google"
    }
  }
}
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /components/dpu-workflow/src/utils/datastore_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
from google.api_core.client_options import ClientOptions  # type: ignore
from google.api_core.gapic_v1.client_info import ClientInfo  # type: ignore
from google.cloud import discoveryengine

USER_AGENT = "cloud-solutions/eks-agent-builder-v1"


def import_docs_to_datastore(bq_table, data_store_region, datastore_id):
    """Import documents from a BigQuery table into a Discovery Engine data store.

    Blocks until the import operation finishes, prints the operation's
    response and metadata, and returns the operation name.

    Args:
        bq_table: Mapping with the keys "project_id", "dataset_id" and
            "table_id" identifying the source table.
        data_store_region: Data store location. Any value other than "global"
            selects the "<region>-discoveryengine.googleapis.com" endpoint.
        datastore_id: Id of the target data store.

    Returns:
        The name of the import operation.
    """
    # Only non-global locations need an explicit regional endpoint.
    if data_store_region == "global":
        endpoint_options = None
    else:
        endpoint_options = ClientOptions(
            api_endpoint=f"{data_store_region}-discoveryengine.googleapis.com"
        )

    doc_client = discoveryengine.DocumentServiceClient(
        client_options=endpoint_options,
        client_info=ClientInfo(user_agent=USER_AGENT),
    )

    # Documents are always imported into the data store's default branch.
    branch = doc_client.branch_path(
        project=bq_table["project_id"],
        location=data_store_region,  # pyright: ignore[reportArgumentType]
        data_store=datastore_id,
        branch="default_branch",
    )

    source_table = discoveryengine.BigQuerySource(
        project_id=bq_table["project_id"],
        dataset_id=bq_table["dataset_id"],
        table_id=bq_table["table_id"],
        data_schema="document",
    )
    import_request = discoveryengine.ImportDocumentsRequest(
        parent=branch,
        bigquery_source=source_table,
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )

    operation = doc_client.import_documents(request=import_request, timeout=3600.0)
    result = operation.result()  # waits for the operation to complete
    op_metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)
    print(result)
    print(op_metadata)
    return operation.operation.name
import re
from typing import Optional, Tuple


def is_valid_processor_id(processor_id: str) -> Optional[Tuple[str, str, str]]:
    """
    Validates a GCP DocumentAI processor ID.

    The expected form is
    ``projects/<project>/locations/<us|eu>/processors/<processor>``.

    Args:
        processor_id: The processor ID string to validate.

    Returns:
        Tuple (project_id, location, processor_id) if the processor ID is
        valid, None otherwise.
    """
    pattern = (
        r"projects\/([a-z][a-z0-9\-]{4,28}[a-z0-9])"
        r"\/locations\/(us|eu)"
        r"\/processors\/([a-zA-Z0-9_-]+)"
    )
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, which would let "<valid id>\n" pass validation.
    match = re.fullmatch(pattern, processor_id)
    if match is None:
        return None
    project, location, processor = match.groups()
    return project, location, processor
import random
import string
from collections import defaultdict
from datetime import datetime
from typing import Dict, List, Tuple


def supported_files_by_type(
    file_list, file_type_to_processor
) -> Tuple[Dict[str, List[str]], List[str]]:
    """Split file names into supported (grouped by extension) and unsupported.

    Args:
        file_list: Iterable of file names.
        file_type_to_processor: Iterable of mappings, each with a
            "file-suffix" key naming a supported extension (case-insensitive).

    Returns:
        A pair ``(files_by_type, unsupported_files)``. ``files_by_type`` maps
        the lower-cased extension to the matching file names, and
        ``unsupported_files`` lists every other name in input order.
    """
    supported_file_types = {
        item["file-suffix"].lower() for item in file_type_to_processor
    }
    unsupported_files: List[str] = []
    files_by_type: Dict[str, List[str]] = defaultdict(list)
    for input_file in file_list:
        # rpartition reports whether a "." exists at all. A bare name such as
        # "pdf" has no extension and must not be treated as a pdf file.
        _, dot, suffix = input_file.rpartition(".")
        file_type = suffix.lower()
        if dot and file_type in supported_file_types:
            files_by_type[file_type].append(input_file)
        else:
            unsupported_files.append(input_file)
    return files_by_type, unsupported_files


def get_random_process_folder_name():
    """Return "docs-processing-<dd-mm-YYYY>-<suffix>".

    The date comes from datetime.now() and the suffix is 8 random characters
    drawn from lowercase ASCII letters and digits.
    """
    alphabet = string.ascii_lowercase + string.digits
    suffix = "".join(random.choices(alphabet, k=8))
    date_stamp = datetime.now().strftime("%d-%m-%Y")
    return f"docs-processing-{date_stamp}-{suffix}"


def get_mv_params(files_to_process, input_folder, process_bucket, process_folder):
    """Build one move-parameter dict per file type in files_to_process.

    Each dict has the keys "source_object" (a "*.<type>" wildcard, prefixed by
    "<input_folder>/" when input_folder is non-empty), "destination_bucket"
    and "destination_object" ("<process_folder>/<type>/").
    """
    source_prefix = f"{input_folder}/" if input_folder else ""
    return [
        {
            "source_object": f"{source_prefix}*.{typ}",
            "destination_bucket": process_bucket,
            "destination_object": f"{process_folder}/{typ}/",
        }
        for typ in files_to_process.keys()
    ]
# Outputs of the dpu-workflow Terraform module; all three are read from the
# Cloud Composer environment resource.

# Bucket from the first storage_config entry of the environment.
output "composer_dag_gcs_bucket" {
  value       = google_composer_environment.composer_env.storage_config[0].bucket
  description = "Stores the DAGs for the Cloud Composer environment."
}

# Airflow URI from the first config entry of the environment.
output "composer_uri" {
  value       = google_composer_environment.composer_env.config[0].airflow_uri
  description = "Cloud Composer Airflow URI"
}

output "composer_location" {
  value       = google_composer_environment.composer_env.region
  description = "Cloud Composer Location"
}
# Terraform core and provider version constraints for the dpu-workflow module.
terraform {
  required_version = ">=1.5.7"

  # Module metadata handed to the google provider.
  provider_meta "google" {
    module_name = "cloud-solutions/dpu-orchestrator-v1.0.0"
  }

  required_providers {
    google = {
      version = ">= 5.23.0"
      source  = "hashicorp/google"
    }
  }
}
locals {
  # Fully qualified image reference built from region, project, repository and job name.
  image_name_and_tag = "${var.region}-docker.pkg.dev/${var.project_id}/${var.artifact_repo}/${var.configure_schema_cloud_run_job_name}:latest"

  # Every file below ../src, relative to this module.
  cloud_build_fileset = fileset(path.module, "../src/**/*")

  # One digest over the per-file digests; used below as the rebuild trigger.
  cloud_build_content_hash = sha512(join("", [
    for f in local.cloud_build_fileset :
    filesha512("${path.module}/${f}")
  ]))
}

# See github.com/terraform-google-modules/terraform-google-gcloud
# Runs `gcloud builds submit` against ../src to build the schema-configuration image.
module "gcloud_build_job_to_configure_alloydb_schema" {
  source = "github.com/terraform-google-modules/terraform-google-gcloud?ref=db25ab9c0e9f2034e45b0034f8edb473dde3e4ff" # commit hash of version 3.5.0

  enabled = true

  # The create command is keyed on the source content hash.
  create_cmd_triggers = {
    source_contents_hash = local.cloud_build_content_hash
  }

  create_cmd_entrypoint = "gcloud"
  create_cmd_body       = <<-EOT
    builds submit ${path.module}/../src \
      --pack image=${local.image_name_and_tag} \
      --project ${var.project_id} \
      --region ${var.region} \
      --default-buckets-behavior regional-user-owned-bucket \
      --service-account "projects/${var.project_id}/serviceAccounts/${var.cloud_build_service_account_email}"
  EOT
}
# Input variables for the post-setup-config Terraform module.

variable "project_id" {
  description = "project id required"
  type        = string
}

variable "region" {
  description = "Cloud region where the resources are created"
  type        = string
}

variable "alloy_db_cluster_id" {
  description = "AlloyDB Cluster ID"
  type        = string
  default     = "eks-docs-results"
}

# Optional: falls back to an empty string when the caller does not set it.
variable "artifact_repo" {
  description = "Docker registry"
  type        = string
  default     = ""
}

variable "configure_schema_cloud_run_job_name" {
  description = "Configure db schemas and permissions in AlloyDB"
  type        = string
  default     = "configure-db-schema"
}

variable "serverless_connector_subnet" {
  description = "Name of the VPC subnet to create"
  type        = string
}

variable "alloydb_database" {
  description = "AlloyDB Database"
  type        = string
  default     = "postgres"
}

variable "vpc_network_name" {
  description = "The name of the network where subnets will be created"
  type        = string
}

variable "alloydb_primary_instance" {
  description = "alloydb primary instance id"
  type        = string
}

# Passed as a plain variable rather than via depends_on; the description
# records the reason.
variable "alloydb_cluster_ready" {
  description = "creating the alloydb resource in terraform does not guarantee it's in the ready state, so subsequent steps fail. This resource exists to force a sleep_timer that is referenceable from other modules, and must be passed as a variable into this module (instead of depends_on) because the gcloud submodule has errors related to `depends_on` block. See: https://github.com/kingman/tf-dont-do-depends-on-module-demo/blob/main/demo-flow/README.md"
  type        = bool
}

variable "cloud_build_service_account_email" {
  description = "the user-managed service account configured for Cloud Build"
  type        = string
}

variable "additional_db_users" {
  description = "The AlloyDB db roles associated with the service account identities that require access to eks data."
  type        = list(string)
}

variable "db_role_content_hash" {
  description = "Additional deployment trigger to force rerun module.gcloud_build_job_to_configure_alloydb_schema if terraform reverts the db roles on specialized_parser_role (flaky)"
  type        = string
}
# Terraform core and provider version constraints for the post-setup-config module.
terraform {
  required_version = ">=1.5.7"

  # Module metadata handed to the google provider.
  provider_meta "google" {
    module_name = "cloud-solutions/dpu-solution-v1.0.0"
  }

  required_providers {
    # Pinned to one exact release, unlike the google provider's lower bound.
    time = {
      version = "0.12.1"
      source  = "hashicorp/time"
    }
    google = {
      version = ">= 5.23.0"
      source  = "hashicorp/google"
    }
  }
}
4 | -------------------------------------------------------------------------------- /components/processing/libs/processor-base/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "processor-base" 3 | version = "0.0.1.dev" 4 | dependencies = [ 5 | "proto-plus", 6 | "google-cloud-storage", 7 | "google-cloud-bigquery-storage", 8 | "google-cloud-bigquery", 9 | "google-crc32c", 10 | "pydantic", 11 | "pydantic-settings", 12 | ] 13 | -------------------------------------------------------------------------------- /components/processing/libs/processor-base/src/processors/base/.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/components/processing/libs/processor-base/src/processors/base/.coverage -------------------------------------------------------------------------------- /components/processing/libs/processor-base/src/processors/base/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class ProtoReturnResult(proto.Message):
    """ProtoReturnResult required for Agent Builder.

    A proto-plus message with three fields: a string ``id``, a string
    ``jsonData`` and a nested ``content`` message.
    NOTE(review): ``jsonData`` presumably carries a JSON-encoded string;
    confirm against the code that populates it.
    """

    # Field 1: string identifier.
    id = proto.Field(proto.STRING, number=1)
    # Field 2: string payload (camelCase name is part of the message schema).
    jsonData = proto.Field(proto.STRING, number=2)

    class Content(proto.Message):
        """Content of an individual document.

        Holds two string fields: ``mimeType`` and ``uri``.
        """

        mimeType = proto.Field(proto.STRING, number=1)
        uri = proto.Field(proto.STRING, number=2)

    # Field 3: the nested Content message declared above.
    content = proto.Field(Content, number=3)
def unzip_processor(source: GCSPath, output_dir: GCSPath) -> Dict:
    """Extract every member of the zip archive at ``source`` into ``output_dir``.

    Args:
        source: Location of the zip archive; opened through
            ``source.read_as_file()``.
        output_dir: Destination; ``output_dir.write_folder()`` supplies the
            directory the archive is extracted into.
            NOTE(review): presumably the folder is persisted when the context
            exits; confirm in ``processors.base.gcsio``.

    Returns:
        An empty dict; no metadata is captured for zip archives.
    """
    logger.info(f"Unzipping {str(source)}")

    # Contexts are entered source -> archive -> target folder, and are
    # released in the reverse order once extraction finishes.
    with source.read_as_file() as archive_file:
        with zipfile.ZipFile(archive_file) as archive:
            with output_dir.write_folder() as target_dir:
                archive.extractall(target_dir)

    # Add it in as a rendered type
    return {}
-------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .msg_processor import msg_processor 17 | 18 | __all__ = [ 19 | "msg_processor", 20 | ] 21 | -------------------------------------------------------------------------------- /components/processing/libs/processor-msg/src/processors/msg/msg_processor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def msg_to_dict(msg: MessageBase) -> Dict:
    """Collect the headline metadata of a message into a plain dict.

    The body is cut to at most ``MAX_BODY_SIZE`` characters; a missing or
    empty body is reported as ``""``.

    Args:
        msg: The opened message to describe.

    Returns:
        Dict with keys ``addr_from``, ``addr_to``, ``addr_cc``, ``addr_bcc``,
        ``subject``, ``date`` and ``body``.
    """
    text = msg.body
    return {
        "addr_from": msg.sender,
        "addr_to": msg.to,
        "addr_cc": msg.cc,
        "addr_bcc": msg.bcc,
        "subject": msg.subject,
        "date": msg.date,
        "body": text[:MAX_BODY_SIZE] if text else "",
    }


def msg_processor(
    source: GCSPath,
    output_dir: GCSPath,
) -> Dict:
    """Unpack a ``.msg`` file: save its attachments and its body text.

    Attachments are written into the folder provided by
    ``output_dir.write_folder()``; the message body is written next to them as
    ``<defaultFolderName>.txt``.

    Args:
        source: Location of the message file to open.
        output_dir: Destination for attachments and the body text file.

    Returns:
        The metadata dict produced by ``msg_to_dict``.
    """
    logger.info(f"Extracting message {source}")

    # Generate generic output
    with source.read_as_file() as raw_msg:
        with openMsg(raw_msg, errorBehavior=error_behavior) as opened:
            with output_dir.write_folder() as out_folder:
                # It is a MessageBase (more exposed functionality)
                message: MessageBase = opened  # pyright: ignore[reportAssignmentType]

                # Extract attachments
                save_options = {
                    "allowFallback": True,
                    "customPath": out_folder,
                    "customFilename": "att",
                    "skipBodyNotFound": True,
                    "extractEmbedded": True,
                    "skipNotImplemented": True,
                    "overwriteExisting": True,
                    "attachmentsOnly": True,
                }
                message.save(**save_options)

                # Extract message content
                body_name = f"{message.defaultFolderName}.txt"  # pylint: disable=no-member
                body_path = pathlib.Path(out_folder) / body_name
                with open(body_path, "wb") as body_file:
                    body_file.write(message.getSaveBody())  # pylint: disable=no-member

                # Capture meta data while the message is still open
                return msg_to_dict(message)
name = "processor-xlsx" 3 | version = "0.0.1.dev" 4 | dependencies = [ 5 | "pyexcel==0.6.7", 6 | "pyexcel-xlsx==0.6.0", 7 | "openpyxl==3.0.10", 8 | "pyexcel-text==0.2.7.1", 9 | "faker", 10 | "python-markdown-generator", 11 | "processor-base @ ${PROJECT_ROOT}/components/processing/libs/processor-base", 12 | ] 13 | -------------------------------------------------------------------------------- /components/processing/libs/processor-xlsx/src/processors/xlsx/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .xlsx_generator import XLSXGenerator 17 | from .xlsx_processor import xlsx_processor 18 | 19 | __all__ = [ 20 | "xlsx_processor", 21 | "XLSXGenerator", 22 | ] 23 | -------------------------------------------------------------------------------- /components/processing/libs/processor-xlsx/src/processors/xlsx/xlsx_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
class XLSXGenerator:
    """Produce spreadsheets filled with random data generated by Faker."""

    def __init__(self):
        self.fake = Faker()
        # Maps a header pattern to a zero-argument producer of one cell value.
        # Header patterns are passed through ``bothify`` when a sheet is built.
        self.COLUMNS = {
            "Address": lambda: self.fake.address(),
            "City": lambda: self.fake.city(),
            "Country": lambda: self.fake.country(),
            "CC": lambda: self.fake.country_code(),
            "ZIP": lambda: self.fake.postalcode(),
            "zipCode": lambda: self.fake.postalcode(),
            "postcode": lambda: self.fake.postalcode(),
            "postal Code": lambda: self.fake.postalcode(),
            "street Address": lambda: self.fake.street_address(),
            "company": lambda: self.fake.company(),
            "price": lambda: self.fake.random_int(min=10, max=50000) / 100,
            "prc?": lambda: self.fake.random_int(min=10, max=50000) / 100,
            "item": lambda: self.fake.bothify("id-???-###"),
            "thing": lambda: self.fake.bothify("id-???-###"),
            "?id": lambda: self.fake.bothify("id-???-###"),
            "????": lambda: self.fake.word(),
        }

    def get_sheet(self, min_cols=2, max_cols=10, min_rows=100, max_rows=1000):
        """Build one sheet as a list of rows; the first row is the header.

        A random number of distinct columns (between ``min_cols`` and
        ``max_cols``) is picked from ``COLUMNS``, followed by a random number
        of data rows (between ``min_rows`` and ``max_rows``).
        """
        available = list(self.COLUMNS.keys())
        col_count = self.fake.random_int(min=min_cols, max=max_cols)
        chosen = self.fake.random_elements(
            available,
            unique=True,
            length=col_count,
        )

        header = [self.fake.bothify(pattern) for pattern in chosen]
        row_count = self.fake.random_int(min=min_rows, max=max_rows)
        body = [
            [self.COLUMNS[pattern]() for pattern in chosen]
            for _ in range(row_count)
        ]
        return [header, *body]

    def save(self, fname: GCSPath, min_sheets=1, max_sheets=4):
        """Generate a workbook of randomly named sheets and write it to ``fname``."""
        # Get the book
        sheet_total = self.fake.random_int(min=min_sheets, max=max_sheets)
        workbook = {
            title: self.get_sheet() for title in self.fake.words(sheet_total)
        }
        book = pyexcel.get_book(bookdict=workbook)

        # Save
        with fname.write_as_file() as target:
            book.save_as(target)

    def to_bytes(self, suffix=".xlsx") -> bytes:
        """Generate a workbook and return the saved file's raw bytes."""
        # Save into a named temporary file and read it back before the
        # temporary file is closed.
        with tempfile.NamedTemporaryFile(suffix=suffix) as scratch:
            self.save(GCSPath(scratch.name))
            with open(scratch.name, "rb") as reader:
                return reader.read()
def cleanse_string(c):
    """Make a cell value safe to place inside a Markdown table.

    The value is converted to ``str``, every ``|`` is escaped as ``\\|`` and
    surrounding whitespace is stripped.

    Args:
        c: Any cell value.

    Returns:
        The cleaned string, or a list of its lines (split on ``"\\n"``) when
        the cleaned string still contains a newline.
    """
    c = str(c)
    c = c.replace("|", "\\|")
    c = c.strip()
    if "\n" in c:
        return c.split("\n")
    return c


def xlsx_processor(source: GCSPath, output_dir: GCSPath) -> Dict:
    """Render every sheet of a spreadsheet as a Markdown table.

    For each sheet a ``<sheet name>.txt`` object is written under
    ``output_dir`` containing a level-1 header with the sheet name followed by
    a table of the sheet's data. The first row of each sheet is treated as the
    column header.

    Args:
        source: Location of the spreadsheet; its suffix (without the leading
            dot) selects the pyexcel file type.
        output_dir: Destination under which the per-sheet files are written.

    Returns:
        An empty dict; no metadata is captured for spreadsheets.
    """

    # Load the book
    # Use the module logger (not the root ``logging`` functions) so records
    # carry this module's name and honour its logger configuration.
    logger.info(f"Extracting spreadsheet {str(source)}")
    with source.read_as_file() as r:
        book = pyexcel.get_book(
            file_name=r,
            force_file_type=source.suffix[1:],
        )

        for name in book.sheet_names():
            sheet = book.sheet_by_name(name)

            # Assume the first row is the header for the data
            sheet.name_columns_by_row(0)

            # Markdown output
            with (
                GCSPath(output_dir, name + ".txt").write_as_file() as f,
                MarkdownGenerator(filename=f) as m,
            ):
                m.addHeader(1, name)

                # Prepare data: the first row returned is skipped because the
                # column names are emitted separately as the table header.
                rows = iter(sheet.to_array())
                next(rows, None)
                data = [[cleanse_string(v) for v in row] for row in rows]

                # Generate the table
                m.addTable(
                    header_names=sheet.colnames, alignment="left", row_elements=data
                )

    return dict()
10 | 11 | ## Deploying Cloud Run 12 | 13 | ### Create a project 14 | 15 | Create a project that will run with Cloud Run enabled. Terraform is provided to do 16 | this, along with create a artifact repository. 17 | 18 | ### Run invoke.sh 19 | 20 | Run `../../invoke.sh cloud-run.deploy` or `../../invoke.sh cloud-run.cloud-build` depending 21 | if cloud build is intended to be used or a local docker build. 22 | -------------------------------------------------------------------------------- /components/processing/terraform/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | ( 18 | cd "$(dirname "$0")/build" || exit 19 | docker buildx build --build-context libs=../../libs --build-context reqs=../../../../reqs . 20 | ) 21 | -------------------------------------------------------------------------------- /components/processing/terraform/build.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
locals {
  # Artifact Registry path that the processor image is pushed to.
  registry_url = "${var.repository_region}-docker.pkg.dev/${var.project_id}/${var.artifact_repo}"

  # Build inputs whose content should trigger a rebuild when changed.
  cloud_build_fileset = [
    "${path.module}/build/cloudbuild.yaml.template",
    "${path.module}/build/cloudbuild.yaml",
    "${path.module}/build/Dockerfile",
    "${path.module}/build/requirements.txt",
  ]
  lib_source_directory_path = "${path.module}/../libs"
  # Only Python sources under libs/ are tracked.
  # NOTE(review): pyproject.toml changes in libs/ do not alter the hash; confirm this is intended.
  lib_source_fileset    = [for f in fileset(local.lib_source_directory_path, "**/*.py") : "${local.lib_source_directory_path}/${f}"]
  all_dependent_fileset = setunion(local.cloud_build_fileset, local.lib_source_fileset)
  # Files that do not exist yet (e.g. the generated cloudbuild.yaml on first run)
  # contribute a fixed placeholder hash instead of failing the plan.
  cloud_build_content_hash = sha512(join("", [for f in local.all_dependent_fileset : fileexists(f) ? filesha512(f) : sha512("file-not-found")]))
}

# Depends on: input bucket, Artifact Registry (registry_url), and docprocessor service account
resource "local_file" "cloudbuild_cloud_run" {
  filename = "${path.module}/build/cloudbuild.yaml"
  content = templatefile("${path.module}/build/cloudbuild.yaml.template", {
    project_id                    = var.project_id,
    registry_url                  = local.registry_url,
    region                        = var.region,
    service_account               = module.doc_processor_account.email
    processing_cloud_run_job_name = var.processing_cloud_run_job_name
    build_service_account         = var.cloud_build_service_account_email
  })
}

# See github.com/terraform-google-modules/terraform-google-gcloud
module "gcloud_build_processing" {
  source = "github.com/terraform-google-modules/terraform-google-gcloud?ref=db25ab9c0e9f2034e45b0034f8edb473dde3e4ff" # commit hash of version 3.5.0

  create_cmd_entrypoint = "gcloud"
  # Docker auth is configured for the host the image is actually pushed to,
  # i.e. the repository region used in local.registry_url, which may differ
  # from the compute region the build and the Cloud Run job run in.
  create_cmd_body = <<-EOT
    auth configure-docker ${var.repository_region}-docker.pkg.dev && \
    gcloud builds submit "${path.module}/../../.." \
      --project ${var.project_id} \
      --region ${var.region} \
      --config ${local_file.cloudbuild_cloud_run.filename} \
      --default-buckets-behavior regional-user-owned-bucket \
      --service-account "projects/${var.project_id}/serviceAccounts/${var.cloud_build_service_account_email}"
  EOT
  enabled         = true

  # Re-run the build whenever any tracked build input changes.
  create_cmd_triggers = {
    source_contents_hash = local.cloud_build_content_hash
  }
}
FROM python:3.11-slim

# Install required system packages and cleanup to reduce image size.
# Package versions are pinned; wkhtmltopdf is listed as a dependency of the
# processor-msg library.
RUN apt-get update -y && \
    apt-get install --no-install-recommends -y -q \
    wkhtmltopdf=0.12.6-2+b1 mime-support=3.66 media-types=10.0.0 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Create user for the application (system account "app", uid 1000, home $HOME)
ENV HOME=/app/
RUN useradd -rm -d $HOME -s /bin/bash -u 1000 app

# Bootstrap uv
# The bootstrap requirements come from the "reqs" build context, which is
# mounted only for this step; hashes are enforced.
RUN python3 -m ensurepip --upgrade
RUN --mount=from=reqs,target=/reqs pip install --no-cache-dir --require-hashes -r /reqs/requirements_bootstrap.txt
# NOTE(review): presumably makes uv install into the system interpreter
# under /usr/local rather than a separate virtualenv; confirm.
ENV VIRTUAL_ENV=/usr/local

# Copy requirements into the image and install the dependencies
COPY --chown=app requirements.txt $HOME
RUN python3 -m uv pip install --no-cache-dir --require-hashes -r $HOME/requirements.txt

# Copy and install the libraries to the image
# The sources come from the "libs" build context. The install paths below are
# relative; no WORKDIR has been set at this point, so they presumably resolve
# against "/" (TODO confirm the base image's default working directory).
# hadolint ignore=DL3022
COPY --from=libs --chown=app / /components/processing/libs/
RUN python3 -m uv pip install --no-cache-dir \
    -e components/processing/libs/processor-base \
    -e components/processing/libs/processor-msg \
    -e components/processing/libs/processor-xlsx

# Switch to default user
WORKDIR $HOME
USER app

# "msg_processor" is the console script declared by the processor-msg library.
ENTRYPOINT [ "msg_processor" ]
-------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Copyright 2024 Google LLC 16 | 17 | # Licensed under the Apache License, Version 2.0 (the "License"); 18 | # you may not use this file except in compliance with the License. 19 | # You may obtain a copy of the License at 20 | 21 | # https://www.apache.org/licenses/LICENSE-1.0 22 | 23 | # Unless required by applicable law or agreed to in writing, software 24 | # distributed under the License is distributed on an "AS IS" BASIS, 25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | # See the License for the specific language governing permissions and 27 | # limitations under the License. 
28 | 29 | steps: 30 | 31 | # Pull previous version 32 | - name: 'gcr.io/cloud-builders/docker' 33 | entrypoint: 'bash' 34 | args: [ '-c', 'docker pull $_IMAGE_TAG || exit 0' ] 35 | 36 | # Build RAG container (multipurpose) 37 | - name: 'gcr.io/cloud-builders/docker' 38 | args: [ 'buildx', 'build', 39 | '-f', 'components/processing/terraform/build/Dockerfile', 40 | '--build-context', 'libs=components/processing/libs', 41 | '--build-context', 'reqs=reqs', 42 | '--cache-from=$_IMAGE_TAG', 43 | '-t', '$_IMAGE_TAG', 44 | 'components/processing/terraform/build' ] 45 | 46 | # Push container 47 | - name: 'gcr.io/cloud-builders/docker' 48 | args: [ 'push', '$_IMAGE_TAG' ] 49 | 50 | # Push Job container 51 | - name: 'gcr.io/cloud-builders/gcloud' 52 | args: [ '--project', '${project_id}', 53 | 'run', 54 | 'jobs', 55 | '--region', '$_REGION', 56 | 'deploy', '${processing_cloud_run_job_name}', 57 | '--image=$_IMAGE_TAG', 58 | '--task-timeout=60m', 59 | '--memory=3Gi', 60 | '--service-account', '$_SERVICE_ACCOUNT' ] 61 | 62 | serviceAccount: 'projects/${project_id}/serviceAccounts/${build_service_account}' 63 | substitutions: 64 | _IMAGE_TAG: '${registry_url}/docprocessor:latest' 65 | _REGION: '${region}' 66 | _SERVICE_ACCOUNT: '${service_account}' 67 | 68 | options: 69 | dynamicSubstitutions: true 70 | logging: CLOUD_LOGGING_ONLY 71 | -------------------------------------------------------------------------------- /components/processing/terraform/build/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "cloud-run-processor" 3 | version = "0.0.1.dev" 4 | dependencies = [ 5 | "processor-msg @ ${PROJECT_ROOT}/components/processing/libs/processor-msg", 6 | ] 7 | -------------------------------------------------------------------------------- /components/processing/terraform/env.template: -------------------------------------------------------------------------------- 1 | ###### 2 | # 3 | # Top level settings. 
4 | # 5 | # These will be fed into terraform variables and 6 | # used to provide defaults to Python code. 7 | # 8 | 9 | # General infrastructure 10 | PROJECT_ID=${project_id} 11 | REGION=${region} 12 | BQ_REGION=${bq_region} 13 | GCS_REGION=${gcs_region} 14 | REPOSITORY_REGION=${repository_region} 15 | ARTIFACT_REPO_NAME=${artifact_repo} 16 | 17 | # Cloud run job 18 | PROCESSING_CLOUD_RUN_JOB_NAME=${processing_cloud_run_job_name} 19 | PROCESSING_SERVICE_ACCOUNT=${processing_service_account} 20 | 21 | FORM_PARSER_CLOUD_RUN_JOB_NAME=${form_parser_cloud_run_job_name} 22 | FORM_PARSER_SERVICE_ACCOUNT=${form_parser_service_account} 23 | 24 | CLASSIFIER_CLOUD_RUN_JOB_NAME=${classifier_cloud_run_job_name} 25 | CLASSIFIER_SERVICE_ACCOUNT=${classifier_service_account} 26 | 27 | # Agent builder configuration 28 | AGENT_BUILDER_LOCATION=${agent_builder_location} 29 | AGENT_BUILDER_DATA_STORE_ID=${agent_builder_data_store_id} 30 | AGENT_BUILDER_SEARCH_ID=${agent_builder_search_id} 31 | -------------------------------------------------------------------------------- /components/processing/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
locals {
  # The service account is named after the Cloud Run job it runs.
  service_account_name = var.processing_cloud_run_job_name
}

# Enable APIs
# See github.com/terraform-google-modules/terraform-google-project-factory
# The modules/project_services
module "project_services" {
  source                      = "github.com/terraform-google-modules/terraform-google-project-factory.git//modules/project_services?ref=ff00ab5032e7f520eb3961f133966c6ced4fd5ee" # commit hash of version 17.0.0
  project_id                  = var.project_id
  # Leave the APIs enabled when this module is destroyed.
  disable_services_on_destroy = false
  disable_dependent_services  = false
  activate_apis = [
    # General container build and registry
    "artifactregistry.googleapis.com",
    "cloudbuild.googleapis.com",
    "documentai.googleapis.com",
    "run.googleapis.com",
    "compute.googleapis.com",
    "containerscanning.googleapis.com"
  ]

  # Provide more access to the cloudbuild service account
  # (each entry grants the listed roles to that API's service identity)
  activate_api_identities = [{
    "api" : "cloudbuild.googleapis.com",
    "roles" : [
      "roles/run.admin",
      # Required for Cloud Run to launch as the normal compute service account
      "roles/iam.serviceAccountUser",
    ]
    },
    {
      "api" : "pubsub.googleapis.com",
      # PubSub publish to Cloud Run
      "roles" : [
        "roles/iam.serviceAccountUser",
      ],
    }
  ]
}

# Service account the document processing job runs as. It is granted
# object-level Cloud Storage access and BigQuery data-editor access on the project.
module "doc_processor_account" {
  source     = "github.com/terraform-google-modules/terraform-google-service-accounts?ref=a11d4127eab9b51ec9c9afdaf51b902cd2c240d9" #commit hash of version 4.0.0
  project_id = var.project_id
  prefix     = "dpu"
  names      = [local.service_account_name]
  project_roles = [
    "${var.project_id}=>roles/storage.objectUser",
    "${var.project_id}=>roles/bigquery.dataEditor",
  ]
  display_name = "Doc Processor Account"
  description  = "Account used to run the document processing jobs"
}
/components/processing/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "processing_cloud_run_job_name" { 16 | description = "Cloud Run doc processor job name" 17 | value = var.processing_cloud_run_job_name 18 | } 19 | 20 | output "doc_processor_service_account" { 21 | description = "Document processor service account" 22 | value = module.doc_processor_account.email 23 | } 24 | -------------------------------------------------------------------------------- /components/processing/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | variable "project_id" { 16 | description = "Google Cloud project where infrastructure resources such as BigQuery dataset and Artifact repository are deployed" 17 | type = string 18 | } 19 | 20 | variable "region" { 21 | description = "Google Cloud region where compute services are located." 22 | type = string 23 | } 24 | 25 | variable "repository_region" { 26 | description = "Google Cloud region where container images are stored." 27 | type = string 28 | } 29 | 30 | variable "artifact_repo" { 31 | description = "Docker registry" 32 | type = string 33 | default = "" 34 | } 35 | 36 | variable "cloud_build_service_account_email" { 37 | description = "IAM service account email used for cloud build." 38 | type = string 39 | default = "" 40 | } 41 | 42 | variable "processing_cloud_run_job_name" { 43 | description = "Doc processor job name" 44 | type = string 45 | default = "ms-office-doc-processor" 46 | } 47 | -------------------------------------------------------------------------------- /components/processing/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | terraform { 16 | required_version = ">=1.5.7" 17 | 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = ">= 5.23.0" 22 | } 23 | local = { 24 | source = "hashicorp/local" 25 | version = "2.5.2" 26 | } 27 | } 28 | 29 | provider_meta "google" { 30 | module_name = "cloud-solutions/dpu-solution-v1.0.0" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /components/specialized-parser/src/Procfile: -------------------------------------------------------------------------------- 1 | web: python3 parser_main.py 2 | -------------------------------------------------------------------------------- /components/specialized-parser/src/configs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from dataclasses import dataclass 16 | 17 | 18 | @dataclass 19 | class JobConfig: 20 | gcs_input_prefix: str 21 | gcs_output_uri: str 22 | run_id: str 23 | 24 | 25 | @dataclass 26 | class ProcessorConfig: 27 | project: str 28 | location: str 29 | processor_id: str 30 | timeout: int 31 | 32 | 33 | @dataclass 34 | class AlloyDBConfig: 35 | primary_instance: str 36 | database: str 37 | user: str 38 | 39 | 40 | @dataclass 41 | class BigQueryConfig: 42 | general_output_table_id: str 43 | -------------------------------------------------------------------------------- /components/specialized-parser/src/requirements.in: -------------------------------------------------------------------------------- 1 | google-api-python-client 2 | google-cloud-storage 3 | google-cloud-documentai 4 | google-cloud-bigquery 5 | sqlalchemy 6 | google-cloud-alloydb-connector[pg8000] 7 | -------------------------------------------------------------------------------- /components/specialized-parser/terraform/bigquery.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | resource "google_bigquery_table" "processed_documents" { 17 | dataset_id = var.bigquery_dataset_id 18 | table_id = "prcessed_documents" # NOTE(review): "prcessed_documents" looks like a typo of "processed_documents" (cf. the resource name and the schema file name); renaming would presumably replace the table, so confirm with consumers before changing 19 | schema = file("${path.module}/processed_documents.json") 20 | 21 | # NOTE: For production use-cases, change this!
22 | deletion_protection = false 23 | } 24 | 25 | resource "google_bigquery_table_iam_member" "member" { 26 | project = google_bigquery_table.processed_documents.project 27 | dataset_id = google_bigquery_table.processed_documents.dataset_id 28 | table_id = google_bigquery_table.processed_documents.table_id 29 | role = "roles/bigquery.dataOwner" 30 | member = module.specialized_parser_account.iam_email 31 | } 32 | -------------------------------------------------------------------------------- /components/specialized-parser/terraform/build.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | locals { 17 | cloud_build_fileset = fileset(path.module, "../src/**/*") 18 | cloud_build_content_hash = sha512(join("", [for f in local.cloud_build_fileset : filesha512("${path.module}/${f}")])) 19 | image_name_and_tag = "${var.region}-docker.pkg.dev/${var.project_id}/${var.artifact_repo}/${var.specialized_parser_cloud_run_job_name}:latest" 20 | } 21 | 22 | # See github.com/terraform-google-modules/terraform-google-gcloud 23 | module "gcloud_build_specialized_parser" { 24 | source = "github.com/terraform-google-modules/terraform-google-gcloud?ref=db25ab9c0e9f2034e45b0034f8edb473dde3e4ff" # commit hash of version 3.5.0 25 | create_cmd_entrypoint = "gcloud" 26 | create_cmd_body = <<-EOT 27 | builds submit ${path.module}/../src \ 28 | --pack image=${local.image_name_and_tag} \ 29 | --project ${var.project_id} \ 30 | --region ${var.region} \ 31 | --default-buckets-behavior regional-user-owned-bucket \ 32 | --service-account "projects/${var.project_id}/serviceAccounts/${var.cloud_build_service_account_email}" 33 | EOT 34 | enabled = true 35 | 36 | create_cmd_triggers = { 37 | source_contents_hash = local.cloud_build_content_hash 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /components/specialized-parser/terraform/docai.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | resource "google_document_ai_processor" "eks-invoice-processor" { 16 | location = var.processors_location 17 | display_name = "eks-invoice-processor" 18 | type = "INVOICE_PROCESSOR" 19 | } 20 | 21 | resource "google_document_ai_processor" "eks-form-processor" { 22 | location = var.processors_location 23 | display_name = "eks-form-processor" 24 | type = "FORM_PARSER_PROCESSOR" 25 | } 26 | -------------------------------------------------------------------------------- /components/specialized-parser/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "specialized_parser_cloud_run_job_name" { 16 | description = "Cloud Run specialized parser job name" 17 | value = google_cloud_run_v2_job.specialized_parser_processor_job.name 18 | } 19 | 20 | output "specialized_parser_service_account" { 21 | description = "Specialized Parser service account" 22 | value = module.specialized_parser_account.email 23 | } 24 | 25 | output "specialized_processors_ids_json" { 26 | description = "JSON encoded string of all supported labels as keys and the corresponding processor id for each as values." 
27 | value = jsonencode({ 28 | "invoice" = google_document_ai_processor.eks-invoice-processor.id 29 | "form" = google_document_ai_processor.eks-form-processor.id 30 | }) 31 | } 32 | 33 | output "specialized_parser_db_user" { 34 | description = "The AlloyDB db role associated with the service account identity of the specializer parser Cloud Run job" 35 | value = google_alloydb_user.specialized_parser_user.user_id 36 | } 37 | 38 | output "db_role_content_hash" { 39 | description = "Additional deployment trigger to force rerun module.gcloud_build_job_to_configure_alloydb_schema if terraform reverts the db roles on specialized_parser_role (flaky)" 40 | value = sha512(terraform_data.dbrole_deployment_trigger.id) 41 | } 42 | -------------------------------------------------------------------------------- /components/specialized-parser/terraform/processed_documents.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "id", 4 | "type": "STRING", 5 | "mode": "NULLABLE" 6 | }, 7 | { 8 | "name": "original_filename", 9 | "type": "STRING", 10 | "mode": "NULLABLE" 11 | }, 12 | { 13 | "name": "results_file", 14 | "type": "STRING", 15 | "mode": "NULLABLE" 16 | }, 17 | { 18 | "name": "run_id", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "entities", 24 | "type": "JSON", 25 | "mode": "NULLABLE" 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /components/specialized-parser/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_id" { 16 | description = "Google Cloud project where infrastructure resource are deployed" 17 | type = string 18 | } 19 | 20 | variable "region" { 21 | description = "Google Cloud region where compute services are located." 22 | type = string 23 | } 24 | 25 | variable "artifact_repo" { 26 | description = "Docker registry" 27 | type = string 28 | } 29 | 30 | variable "specialized_parser_cloud_run_job_name" { 31 | description = "Specialized Parser job name" 32 | type = string 33 | default = "specialized-parser-job" 34 | } 35 | 36 | variable "bigquery_dataset_id" { 37 | description = "BigQuery Dataset id" 38 | type = string 39 | } 40 | 41 | variable "alloydb_cluster" { 42 | description = "AlloyDB Cluster" 43 | type = string 44 | } 45 | 46 | variable "alloydb_instance" { 47 | description = "AlloyDB Instance" 48 | type = string 49 | } 50 | 51 | variable "alloydb_database" { 52 | description = "AlloyDB Database" 53 | type = string 54 | default = "postgres" 55 | } 56 | 57 | variable "processors_location" { 58 | description = "Location to setup Document AI processors" 59 | type = string 60 | default = "us" 61 | } 62 | 63 | variable "serverless_connector_subnet" { 64 | description = "Name of the VPC subnet to create" 65 | type = string 66 | } 67 | 68 | variable "vpc_network_name" { 69 | type = string 70 | description = "The name of the network where subnets will be created" 71 | } 72 | 73 | variable "cloud_build_service_account_email" { 74 | description = "the user-managed service account configured for 
Cloud Build" 75 | type = string 76 | } 77 | 78 | variable "alloydb_cluster_ready" { 79 | description = "creating the alloydb resource in terraform does not guarantee it's in the ready state, so subsequent steps fail. This resource exists to force a sleep_timer that is referencable from other modules, and must be passed as a variable into this module (instead of depends_on) because the gcloud submodule has errors related to `depends_on` block. See: https://github.com/kingman/tf-dont-do-depends-on-module-demo/blob/main/demo-flow/README.md" 80 | type = bool 81 | } 82 | -------------------------------------------------------------------------------- /components/specialized-parser/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_version = ">=1.5.7" 17 | 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = ">= 5.23.0" 22 | } 23 | } 24 | 25 | provider_meta "google" { 26 | module_name = "cloud-solutions/eks-docai-v1" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /components/utils/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from converter import write_jsonl, xlsx_to_pdf 16 | 17 | 18 | def jsonl() -> None: 19 | write_jsonl( 20 | in_bucket_name="sample_in_bkt", 21 | in_path="output", 22 | out_bucket_name="sample_out_bkt", 23 | out_path="output", 24 | ) 25 | 26 | 27 | def main() -> None: 28 | # jsonl() 29 | # to_csv("./samples/test_file.xlsm", "./samples/out/test_file.csv") 30 | # to_csv_pd("./samples/test_file.xlsm", "./samples/out/test_file_pd.csv") 31 | # md_to_html("./samples/test_file.md", "./samples/out/test_file.html") 32 | xlsx_to_pdf( 33 | "./samples/test_file.xlsm", 34 | "./samples/out/test_file.html", 35 | "./samples/out/test_file.pdf", 36 | ) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /components/utils/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | pandas==2.2.2 3 | openpyxl==3.1.0 4 | jsonlines==4.0.0 5 | markdown==3.6 6 | google-cloud-aiplatform==1.51.0 7 | pdfkit==1.0.0 8 | -------------------------------------------------------------------------------- /components/webui/.gitignore: -------------------------------------------------------------------------------- 1 | # Generated files 2 | terraform/build/cloudbuild.yaml 3 | -------------------------------------------------------------------------------- /components/webui/README.md: -------------------------------------------------------------------------------- 1 | # Reference Implementation of Web UI to access EKS 2 | 3 | The Web-UI is a Web App to interface with Vertex AI Agent Builder using REST APIs 4 | 5 | ## Deploy Locally 6 | 7 | Set environment variables: 8 | 9 | ```commandline 10 | export PROJECT_ID=[your-project-id] 11 | export AGENT_BUILDER_DATA_STORE_ID=[your-search-datastore-id] 12 | export AGENT_BUILDER_LOCATION=[your-search-datastore-region] 13 | export AGENT_BUILDER_SEARCH_ID=[your-search-app-id] 14 | ``` 15 | 16 | Command to create a virtual 
environment if building this App for the first time. 17 | 18 | ```commandline 19 | python -m venv .venv 20 | ``` 21 | 22 | Activate the virtual environment 23 | 24 | ```commandline 25 | source .venv/bin/activate 26 | ``` 27 | 28 | Install dependencies 29 | 30 | ```commandline 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | Initialize gcloud and set project 35 | 36 | ```commandline 37 | gcloud init 38 | ``` 39 | 40 | Authenticate to set Google Application Default Credentials 41 | 42 | ```commandline 43 | gcloud auth application-default login 44 | ``` 45 | 46 | Launch 47 | 48 | ```commandline 49 | streamlit run src/Home.py 50 | ``` 51 | 52 | ## Deploy to Cloud Run 53 | 54 | Set environment variables (`GOOGLE_CLOUD_PROJECT` must also be set; the commands and scripts below read it) 55 | 56 | ```commandline 57 | export AR_REPO=[your-ar-repo-name] 58 | export AR_REPO_LOCATION=[your-ar-repo-region] 59 | export SERVICE_NAME=[your-app-name] 60 | ``` 61 | 62 | If this is the first time you are trying to deploy the App in your GCP Project, 63 | you must enable APIs and create an Artifact repository in your new GCP Project.
64 | **You can skip this if a repository already exists!** 65 | 66 | ```commandline 67 | gcloud config set project $GOOGLE_CLOUD_PROJECT 68 | 69 | gcloud artifacts repositories create "$AR_REPO" --location="$AR_REPO_LOCATION" --repository-format=Docker 70 | 71 | gcloud services enable cloudbuild.googleapis.com 72 | gcloud services enable run.googleapis.com 73 | 74 | ``` 75 | 76 | Build the app and save it in the Artifact repository 77 | 78 | ```commandline 79 | ./build.sh 80 | ``` 81 | 82 | Deploy the app from the Artifact repository to Cloud Run 83 | 84 | ```commandline 85 | ./deploy.sh 86 | ``` 87 | 88 | Test locally using Cloud Run proxy 89 | 90 | ```commandline 91 | ./run_proxy.sh 92 | ``` 93 | -------------------------------------------------------------------------------- /components/webui/app/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/components/webui/app/images/logo.png -------------------------------------------------------------------------------- /components/webui/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # Builds the web UI container image with Cloud Build and tags it in Artifact Registry. 18 | # Required environment: AR_REPO_LOCATION, GOOGLE_CLOUD_PROJECT, AR_REPO, SERVICE_NAME. 19 | 20 | set -euo pipefail 21 | 22 | # Fail early with a clear message instead of submitting a malformed image tag. 23 | : "${AR_REPO_LOCATION:?must be set}" 24 | : "${GOOGLE_CLOUD_PROJECT:?must be set}" 25 | : "${AR_REPO:?must be set}" 26 | : "${SERVICE_NAME:?must be set}" 27 | 28 | # Remove the copied credentials on every exit path, including a failed build. 29 | trap 'rm -f -- ./adc.json' EXIT 30 | 31 | # NOTE(review): this places user credentials in the directory submitted to Cloud Build; confirm that is intended. 32 | cp -- "$HOME/.config/gcloud/application_default_credentials.json" ./adc.json 33 | 34 | gcloud builds submit --tag "${AR_REPO_LOCATION}-docker.pkg.dev/${GOOGLE_CLOUD_PROJECT}/${AR_REPO}/${SERVICE_NAME}" 35 | -------------------------------------------------------------------------------- /components/webui/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | 17 | # Deploys the image tagged by build.sh to Cloud Run as the service named by $SERVICE_NAME. 18 | # NOTE(review): --allow-unauthenticated makes the service publicly invokable; confirm that is intended. 19 | gcloud run deploy "$SERVICE_NAME" \ 20 | --port=8080 \ 21 | --image="$AR_REPO_LOCATION-docker.pkg.dev/$GOOGLE_CLOUD_PROJECT/$AR_REPO/$SERVICE_NAME" \ 22 | --allow-unauthenticated \ 23 | --region="$AR_REPO_LOCATION" \ 24 | --platform=managed \ 25 | --project="$GOOGLE_CLOUD_PROJECT" \ 26 | --set-env-vars=GOOGLE_CLOUD_PROJECT="$GOOGLE_CLOUD_PROJECT",AR_REPO_LOCATION="$AR_REPO_LOCATION",LOCATION="$LOCATION",SEARCH_DATASTORE_ID="$SEARCH_DATASTORE_ID",SEARCH_APP_ID="$SEARCH_APP_ID",LLM_LOCATION="$LLM_LOCATION",LLM="$LLM" 27 | -------------------------------------------------------------------------------- /components/webui/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "processor-web-ui" 3 | version = "0.0.1.dev" 4 | dependencies = [ 5 | "protobuf", 6 | "streamlit", 7 | "streamlit-aggrid", 8 | "streamlit-modal", 9 | "streamlit-antd-components", 10 | "pandas", 11 | "firebase-admin", 12 | "google-api-core", 13 | "google-cloud-firestore", 14 | "google-cloud-discoveryengine", 15 | "google-cloud-aiplatform", 16 | "google-cloud-storage", 17 | ] 18 | -------------------------------------------------------------------------------- /components/webui/run_proxy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | gcloud run services proxy "$SERVICE_NAME" --project "$GOOGLE_CLOUD_PROJECT" --region "$AR_REPO_LOCATION" 18 | -------------------------------------------------------------------------------- /components/webui/src/Home.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import streamlit as st # type: ignore 16 | from dpu.components import LOGO 17 | 18 | logger = st.logger.get_logger(__name__) # pyright: ignore[reportAttributeAccessIssue] 19 | 20 | 21 | st.set_page_config( 22 | page_title="EKS Web UI", 23 | page_icon=LOGO, 24 | layout="wide", 25 | ) 26 | 27 | cols = st.columns([10, 90]) 28 | with cols[0]: 29 | st.write("") 30 | st.image(LOGO, "", 64) 31 | with cols[1]: 32 | st.title(":green[Enterprise Knowledge Solution (EKS) Web UI]") 33 | 34 | st.markdown(""" """) 35 | st.markdown( 36 | """ 37 | ### About 38 | This app demonstrates the search and summarization capabilities of the 39 | Enterprise Knowledge Solution (EKS). 40 | 41 | The app integrates with the Vertex AI Agent Builder using APIs. 
42 | """ 43 | ) 44 | 45 | if st.button("Start Search"): 46 | st.switch_page("pages/1_Search_Documents.py") 47 | 48 | st.divider() 49 | -------------------------------------------------------------------------------- /components/webui/src/pages/2_Browse_Documents.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pathlib 17 | 18 | import pandas as pd # type: ignore 19 | import streamlit as st # type: ignore 20 | from dpu.api import fetch_all_agent_docs 21 | from dpu.components import LOGO, show_agent_document 22 | from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder # type: ignore 23 | 24 | logger = st.logger.get_logger(__name__) # pyright: ignore[reportAttributeAccessIssue] 25 | 26 | st.set_page_config( 27 | page_title="Browse Documents", 28 | page_icon=LOGO, 29 | layout="wide", 30 | ) 31 | 32 | cols = st.columns([10, 90]) 33 | with cols[0]: 34 | st.write("") 35 | st.image(LOGO, "", 64) 36 | with cols[1]: 37 | st.title(":green[Document Corpus]") 38 | st.divider() 39 | st.markdown("""Full Document corpus accessible to the Search App.""") 40 | 41 | df = pd.DataFrame(fetch_all_agent_docs()) 42 | 43 | if len(df) > 0: 44 | 45 | # Extract bucket and path 46 | df["bucket"] = df["uri"].str.extract(r"gs://([^/]*)/") 47 | df["path"] = df["uri"].str.extract(r"gs://[^/]*/(.*)$") 
48 | 49 | # Extract parent and name from the path 50 | df["name"] = df["path"].apply(lambda p: pathlib.Path(p).name) 51 | common_prefix = os.path.commonprefix( 52 | df["path"].apply(lambda p: pathlib.Path(p).parent).to_list() 53 | ) 54 | df["full_name"] = df["path"].apply(lambda p: p[len(common_prefix) :]) 55 | 56 | gb = GridOptionsBuilder() 57 | gb.configure_column("name", header_name="Name", flex=0) 58 | gb.configure_column("full_name", header_name="Full Name", flex=1) 59 | gb.configure_selection() 60 | gb.configure_pagination() 61 | gridOptions = gb.build() 62 | 63 | data = AgGrid( 64 | df, 65 | gridOptions=gridOptions, 66 | columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW, 67 | allow_unsafe_jscode=True, 68 | ) 69 | 70 | if data["selected_rows"] is not None and len(data["selected_rows"]) > 0: 71 | show_agent_document(data["selected_rows"].iloc[0]["id"]) 72 | -------------------------------------------------------------------------------- /components/webui/tasks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import os 17 | 18 | from dotenv import load_dotenv 19 | from invoke import task 20 | 21 | # Find the base directory for invoke 22 | BASE_DIR = os.path.dirname(__file__) 23 | ROOT_DIR = os.path.join(BASE_DIR, "../../") 24 | 25 | 26 | load_dotenv() 27 | 28 | 29 | @task 30 | def local_dev(c, debug=False): 31 | """Start local streamlit webui""" 32 | OPTS = " ".join( 33 | [ 34 | "--browser.gatherUsageStats=false", # Disable usage stats 35 | "--server.headless=true", # Disable collection of email address 36 | ] 37 | ) 38 | if debug: 39 | OPTS += " --logger.level=DEBUG" 40 | with c.cd(ROOT_DIR): 41 | c.run(f"./run.sh streamlit run {OPTS} components/webui/src/Home.py", pty=True) 42 | -------------------------------------------------------------------------------- /components/webui/terraform/build.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | cloud_build_fileset = setunion(fileset(path.module, "../src/**"), fileset(path.module, "build/Dockerfile"), fileset(path.module, "../requirements.txt"), fileset(path.module, "build/cloudbuild.yaml")) 17 | cloud_build_content_hash = sha512(join(",", [ 18 | for f in local.cloud_build_fileset : fileexists("${path.module}/${f}") ? 
filesha512("${path.module}/${f}") : sha512("file-not-found")])) 19 | } 20 | 21 | resource "local_file" "cloudbuild_config" { 22 | filename = "${path.module}/build/cloudbuild.yaml" 23 | content = templatefile("${path.module}/build/cloudbuild.yaml.template", { 24 | project_id = var.project_id, 25 | build_service_account = var.cloud_build_service_account_email, 26 | image_tag = "${var.region}-docker.pkg.dev/${module.project_services.project_id}/${var.artifact_repo}/${var.webui_service_name}" 27 | }) 28 | } 29 | 30 | # Build and upload the app container 31 | module "gcloud_build_app" { 32 | source = "github.com/terraform-google-modules/terraform-google-gcloud?ref=db25ab9c0e9f2034e45b0034f8edb473dde3e4ff" # commit hash of version 3.5.0 33 | 34 | create_cmd_entrypoint = "gcloud" 35 | create_cmd_body = <<-EOT 36 | auth configure-docker ${var.region}-docker.pkg.dev && \ 37 | gcloud builds submit "${path.module}/../../.." \ 38 | --project ${var.project_id} \ 39 | --region ${var.region} \ 40 | --config ${local_file.cloudbuild_config.filename} \ 41 | --default-buckets-behavior regional-user-owned-bucket \ 42 | --service-account "projects/${var.project_id}/serviceAccounts/${var.cloud_build_service_account_email}" 43 | 44 | EOT 45 | enabled = true 46 | 47 | create_cmd_triggers = { 48 | source_contents_hash = local.cloud_build_content_hash 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /components/webui/terraform/build/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | # Copyright 2024 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | FROM python:3.12-slim AS builder 18 | 19 | ENV HOME=/app 20 | WORKDIR $HOME 21 | 22 | ENV PYTHONDONTWRITEBYTECODE=1 23 | ENV PYTHONUNBUFFERED=1 24 | 25 | # Bootstrap venv 26 | RUN python3 -m venv /opt/venv 27 | ENV PATH="/opt/venv/bin:$PATH" 28 | 29 | # Install the dependencies into venv 30 | RUN --mount=from=libs,target=/libs pip install --no-cache-dir --require-hashes -r /libs/requirements.txt 31 | 32 | # Final stage 33 | FROM python:3.12-slim 34 | 35 | # Create user for the application 36 | ENV HOME=/app 37 | # Explicitly set a user to avoid running as root 38 | RUN useradd -rm -d $HOME -s /bin/bash -u 1000 app 39 | USER app 40 | 41 | # Copy venv from builder 42 | COPY --from=builder --chown=app /opt/venv /opt/venv 43 | # Copy app source files from named context: libs 44 | # hadolint ignore=DL3022 45 | COPY --from=libs --chown=app /app $HOME/app/ 46 | # hadolint ignore=DL3022 47 | COPY --from=libs --chown=app /src $HOME/src/ 48 | 49 | WORKDIR $HOME 50 | ENV PATH="/opt/venv/bin:$PATH" 51 | # ENV values set in the builder stage are not inherited by this stage; set the Python flags for the runtime image too 52 | ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 53 | 54 | # Expose port you want your app on 55 | EXPOSE 8080 56 | 57 | ENTRYPOINT ["streamlit"] 58 | CMD ["run", "src/Home.py", "--server.port=8080", "--server.address=0.0.0.0"] 59 | -------------------------------------------------------------------------------- /components/webui/terraform/build/cloudbuild.yaml.template: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not
use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | steps: 16 | 17 | # Pull previous version 18 | - name: 'gcr.io/cloud-builders/docker' 19 | entrypoint: 'bash' 20 | args: [ '-c', 'docker pull $_IMAGE_TAG || exit 0' ] 21 | 22 | # Build webui container 23 | - name: 'gcr.io/cloud-builders/docker' 24 | args: [ 'buildx', 'build', 25 | '-f', 'components/webui/terraform/build/Dockerfile', 26 | '--build-context', 'libs=components/webui', 27 | '--cache-from=$_IMAGE_TAG', 28 | '-t', '$_IMAGE_TAG', 29 | 'components/webui/terraform/build' ] 30 | 31 | # Push container 32 | - name: 'gcr.io/cloud-builders/docker' 33 | args: [ 'push', '$_IMAGE_TAG' ] 34 | 35 | serviceAccount: 'projects/${project_id}/serviceAccounts/${build_service_account}' 36 | substitutions: 37 | _IMAGE_TAG: '${image_tag}:latest' 38 | 39 | options: 40 | dynamicSubstitutions: true 41 | logging: CLOUD_LOGGING_ONLY 42 | -------------------------------------------------------------------------------- /components/webui/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | /* 16 | * Activate required service APIs 17 | */ 18 | module "project_services" { 19 | source = "github.com/terraform-google-modules/terraform-google-project-factory.git//modules/project_services?ref=ff00ab5032e7f520eb3961f133966c6ced4fd5ee" # commit hash of version 17.0.0 20 | project_id = var.project_id 21 | disable_services_on_destroy = false 22 | disable_dependent_services = false 23 | activate_apis = [ 24 | "compute.googleapis.com", 25 | "iap.googleapis.com", 26 | "aiplatform.googleapis.com" 27 | ] 28 | } 29 | 30 | data "google_project" "project" { 31 | project_id = module.project_services.project_id 32 | } 33 | 34 | /* 35 | * IAP Configuration 36 | */ 37 | 38 | # OAuth Client 39 | resource "google_iap_client" "project_client" { 40 | display_name = "Enterprise Knowledge Search client" 41 | brand = "projects/${data.google_project.project.number}/brands/${data.google_project.project.number}" 42 | } 43 | 44 | resource "google_project_iam_member" "iap_users" { 45 | for_each = toset(var.iap_access_domains) 46 | project = module.project_services.project_id 47 | role = "roles/iap.httpsResourceAccessor" 48 | member = each.key 49 | } 50 | -------------------------------------------------------------------------------- /components/webui/terraform/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | output "dns_configuration" { 16 | value = "${join(",", var.lb_ssl_certificate_domains)} => ${module.eks_webui_lb.external_ip}" 17 | } 18 | -------------------------------------------------------------------------------- /components/webui/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | variable "project_id" { 16 | type = string 17 | description = "project id required" 18 | } 19 | 20 | variable "region" { 21 | type = string 22 | description = "Google Cloud region where resources are located " 23 | } 24 | 25 | variable "artifact_repo" { 26 | description = "Docker registry" 27 | type = string 28 | } 29 | 30 | variable "cloud_build_service_account_email" { 31 | description = "IAM service account email used for cloud build." 
32 | type = string 33 | } 34 | 35 | variable "iap_access_domains" { 36 | description = "List of domains granted for IAP access to the APP" 37 | type = list(string) 38 | } 39 | 40 | variable "vertex_ai_data_store_region" { 41 | description = "The geographic location where the data store should reside. The value can only be one of 'global', 'us' and 'eu'" 42 | type = string 43 | } 44 | 45 | variable "agent_builder_data_store_id" { 46 | description = "Data store used" 47 | type = string 48 | } 49 | 50 | variable "agent_builder_search_id" { 51 | description = "Agent builder search engine id" 52 | type = string 53 | } 54 | 55 | variable "webui_service_name" { 56 | type = string 57 | description = "The service name for the webui" 58 | default = "eks-ui" 59 | } 60 | 61 | variable "lb_ssl_certificate_domains" { 62 | description = "Custom domain pointing to the WebUI app, DNS configured" 63 | type = list(string) 64 | } 65 | 66 | variable "vpc_network_name" { 67 | type = string 68 | description = "The name of the network where subnets will be created" 69 | } 70 | 71 | variable "serverless_connector_subnet" { 72 | description = "Name of the VPC subnet to create" 73 | type = string 74 | } 75 | -------------------------------------------------------------------------------- /components/webui/terraform/versions.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_version = ">=1.5.7" 17 | 18 | required_providers { 19 | google = { 20 | source = "hashicorp/google" 21 | version = ">= 5.23.0" 22 | } 23 | google-beta = { 24 | source = "hashicorp/google-beta" 25 | version = ">= 5.23.0" 26 | } 27 | time = { 28 | source = "hashicorp/time" 29 | version = "0.12.1" 30 | } 31 | null = { 32 | source = "hashicorp/null" 33 | version = "3.2.3" 34 | } 35 | local = { 36 | source = "hashicorp/local" 37 | version = "2.5.2" 38 | } 39 | } 40 | 41 | provider_meta "google" { 42 | module_name = "cloud-solutions/dpu-solution-v1.0.0" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /docs/LICENSE_HEADER.txt: -------------------------------------------------------------------------------- 1 | Copyright 2024 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /invoke.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | ROOT="$(realpath "$(dirname "$0")")" 18 | 19 | exec "${ROOT}/run.sh" invoke "$@" 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # This file is used for configuring Python tools, rather than 4 | # the workspace as a whole. As uv supports workspace it will be used. 5 | # 6 | 7 | 8 | # Pylint configuration 9 | [tool.pylint] 10 | 11 | # Source roots - important for finding imports 12 | source-roots = [ 13 | "components/processing/libs/processor-base/src", 14 | "components/processing/libs/processor-msg/src", 15 | "components/processing/libs/processor-xlsx/src", 16 | "components/doc-classifier/src", 17 | "components/specialized-parser/src", 18 | "components/post-setup-config/src", 19 | "components/doc-deletion/src", 20 | "components/webui/src", 21 | ] 22 | 23 | [tool.pylint."MESSAGE_CONTROL"] 24 | 25 | # Disable everything 26 | disable="all" 27 | 28 | # But enable errors (category E) and unused-import 29 | enable="E,unused-import" 30 | 31 | 32 | # Pyright configuration 33 | [tool.pyright] 34 | 35 | # Paths of files to validate 36 | include=[ 37 | "components/processing/libs/processor-base/src", 38 | "components/processing/libs/processor-msg/src", 39 | "components/processing/libs/processor-xlsx/src", 40 | "components/doc-classifier/src", 41 | "components/specialized-parser/src", 42 | "components/post-setup-config/src", 43 | "components/doc-deletion/src", 44 | "components/webui/src",
45 | ] 46 | 47 | # Paths of modules for dependencies 48 | executionEnvironments = [ 49 | { root = ".", extraPaths = [ 50 | "components/processing/libs/processor-xlsx/src", 51 | "components/processing/libs/processor-base/src", 52 | "components/processing/libs/processor-msg/src", 53 | "components/doc-classifier/src", 54 | "components/specialized-parser/src", 55 | "components/post-setup-config/src", 56 | "components/doc-deletion/src", 57 | "components/webui/src", 58 | ] } 59 | ] 60 | 61 | reportMissingImports=true 62 | reportMissingTypeStubs=false 63 | -------------------------------------------------------------------------------- /reqs/README.md: -------------------------------------------------------------------------------- 1 | # Python components 2 | 3 | Note that the .in files are the maintained sources, while the .txt files are locked with hashes to 4 | specific versions. Invoke.sh will automatically create new lockfiles (and sync) as 5 | required. 6 | 7 | ## requirements_all.txt 8 | 9 | This is the global lockfile for the virtual environment that is based on all of the `*.in` requirement files. 10 | 11 | This also acts as a constraint file for all of the specific .txt output files, i.e., all requirements are 12 | synchronized to the global requirements_all.txt. 13 | 14 | ## requirements_bootstrap.in and requirements_bootstrap.txt 15 | 16 | These are the requirements for invoke.sh and are bootstrapped using pip in a first virtual environment. 17 | 18 | ## requirements_dev.in and requirements_dev.txt 19 | 20 | These are the development requirements for the Python environment including all tools for 21 | testing, linting, syntax checking, etc.
22 | -------------------------------------------------------------------------------- /reqs/requirements_all.in: -------------------------------------------------------------------------------- 1 | # Import the libraries as editable 2 | -e components/processing/libs/processor-base 3 | -e components/processing/libs/processor-msg 4 | -e components/processing/libs/processor-xlsx 5 | -e components/webui 6 | 7 | # Additional development tools (relative to current file) 8 | -r ../components/dpu-workflow/requirements.in 9 | -r ../components/doc-registry/src/requirements.in 10 | -r ../components/doc-classifier/src/requirements.in 11 | -r ../components/specialized-parser/src/requirements.in 12 | -r ../components/post-setup-config/src/requirements.in 13 | -r ../components/doc-deletion/src/requirements.in 14 | 15 | # Additional development and bootstrap tools 16 | -r requirements_dev.in 17 | -r requirements_bootstrap.in 18 | -------------------------------------------------------------------------------- /reqs/requirements_bootstrap.in: -------------------------------------------------------------------------------- 1 | uv 2 | invoke 3 | python-dotenv 4 | -------------------------------------------------------------------------------- /reqs/requirements_bootstrap.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile --generate-hashes requirements_bootstrap.in -c ./constraints.txt -o requirements_bootstrap.txt 3 | invoke==2.2.0 \ 4 | --hash=sha256:6ea924cc53d4f78e3d98bc436b08069a03077e6f85ad1ddaa8a116d7dad15820 \ 5 | --hash=sha256:ee6cbb101af1a859c7fe84f2a264c059020b0cb7fe3535f9424300ab568f6bd5 6 | # via 7 | # -c ./constraints.txt 8 | # -r requirements_bootstrap.in 9 | python-dotenv==1.0.1 \ 10 | --hash=sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca \ 11 | --hash=sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a 
12 | # via 13 | # -c ./constraints.txt 14 | # -r requirements_bootstrap.in 15 | uv==0.2.6 \ 16 | --hash=sha256:02b7d9fc2801059a36d73a927a63927ab52cc0fff4558b14374778edc62e9840 \ 17 | --hash=sha256:14dddd13b6641bc0064848b906235ea32e188d6be6f1b496df6ff6d8e27031fa \ 18 | --hash=sha256:1ac3b96c6284ef2e5367e62f31472cc3f27ba4c7e14f579e7c6dbd081943e38b \ 19 | --hash=sha256:1fb1700cef0f1dcc79366d54743433887286ba579c055107c0f5bc212a8f2cc2 \ 20 | --hash=sha256:35e0590eed0db7ce024679110e668af0cc393eb63d3d0ece34f315e3e065a0c2 \ 21 | --hash=sha256:42178f251aab2dd3d98aefe002b245a5767336fd96c4106c39a3cf06acab49d6 \ 22 | --hash=sha256:438bbfd98ced9c7208137068fa2a0953a8a4cc3b835d08a984c6381566b2a032 \ 23 | --hash=sha256:526d6a3206b5e50b08629b579cdbdb8b093d1766035ec2149955c0d96eddc884 \ 24 | --hash=sha256:7c43452c62833e583b9bf0fcea63aaa9a0319f06196b790ecbc67967204f059f \ 25 | --hash=sha256:88a35ee5bf821c3ccca02984b08a5654f1e77fce2b75fbca3a210b769bf0ee7a \ 26 | --hash=sha256:9184b9b6dd2c8f15d1d821519692c2277d92f10303ef77be090c5ede1c48dfdf \ 27 | --hash=sha256:93128e72d161bc9e594fc88ba1925853ccad28596cbe0e1277f664d33e3994df \ 28 | --hash=sha256:955ea5a0845731c8394d2a425e2d353ead419e80bd47792f904aac59bbe6378d \ 29 | --hash=sha256:9d77160f7e0705d1e1703774555cb78e1b8ef81c7907cc72b9d86c536e79b4c7 \ 30 | --hash=sha256:aaecaf6d45ee5623eee5e1a1c34d57bffab75bdd779ed888d0026bf7d0192dc0 \ 31 | --hash=sha256:abb095721824509f476760d6b208327f75ad128f32dba8164c411a31c50b1e06 \ 32 | --hash=sha256:e888cbc3b743a760e657784b7ed43054db12808690a943dfef4818510a3e32ac 33 | # via 34 | # -c ./constraints.txt 35 | # -r requirements_bootstrap.in 36 | -------------------------------------------------------------------------------- /reqs/requirements_dev.in: -------------------------------------------------------------------------------- 1 | types-protobuf 2 | pyright 3 | pylint 4 | invoke 5 | pre-commit 6 | -------------------------------------------------------------------------------- /run.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | ROOT="$(realpath "$(dirname "$0")")" 18 | VENV="${ROOT}/.venv" 19 | [ -d "${VENV}" ] || ( 20 | cd "${ROOT}" || exit 21 | 22 | # Create and bootstrap the virtual environment 23 | python3 -m venv "${VENV}" 24 | "${VENV}/bin/python3" -m pip install -q --require-hashes -r "${ROOT}/reqs/requirements_bootstrap.txt" 25 | 26 | # Synchronize the environment 27 | "${VENV}/bin/python3" -m invoke sync 28 | ) 29 | 30 | DOTENV="${ROOT}/.env" 31 | if [ -f "${DOTENV}" ]; then 32 | set -o allexport 33 | # shellcheck source=/dev/null 34 | source "${DOTENV}" 35 | set +o allexport 36 | fi 37 | 38 | CMD="$1" 39 | shift 40 | 41 | if [ ! -f "${VENV}/bin/${CMD}" ]; then 42 | echo "Usage: $(basename "$0") command [... command args]" 43 | echo "" 44 | echo "Command is a bin available in the virtual environment," 45 | echo "including python3 interpreter itself." 
46 | exit 1 47 | fi 48 | 49 | exec "${VENV}/bin/${CMD}" "$@" 50 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/DEPLOYMENT.md: -------------------------------------------------------------------------------- 1 | # [Deployment Guide](../../README.md#deployment-guide) 2 | 3 | This guide provides step-by-step instructions on how to deploy the `Document Process and Understanding with Composer` sample on Google Cloud using Terraform. 4 | 5 | [Deployment Guide](../../README.md#deployment-guide) has been moved to the [Readme](../../README.md#deployment-guide). 6 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/README.md: -------------------------------------------------------------------------------- 1 | # EKS Cloud Composer orchestrated document processing 2 | 3 | In this directory, you can find an end-to-end example deployment of the Enterprise Knowledge Solution (EKS) on Google Cloud. 4 | 5 | The components of EKS can be deployed individually or as part of a custom setup. The [`main.tf`](./main.tf) file demonstrates how to configure and deploy each component using Terraform. 6 | 7 | ## Additional Resources 8 | 9 | In addition to the EKS components, this example also deploys the following resources: 10 | 11 | | Name | Description | 12 | | ----------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | 13 | | [Agent Builder Data Store](https://cloud.google.com/dialogflow/vertex/docs/concept/data-store) | The data store where processed documents are collected and indexed. This is the backbone that powers the search and summarization capabilities.
| 14 | | [Agent Builder Search App](https://cloud.google.com/generative-ai-app-builder/docs/create-datastore-ingest) | A generic Agent Builder search app that provides the API interface for searching documents in the Data Store. | 15 | 16 | ## Get Started with the Deployment Guide 17 | 18 | Follow the [Deployment Guide](../../README.md#deployment-guide) 19 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/USE.md: -------------------------------------------------------------------------------- 1 | # [Usage Guide](../../README.md#usage-guide) 2 | 3 | This guide provides step-by-step instructions on how to use the `Enterprise Knowledge Solution with Composer` deployed on Google Cloud. 4 | After successful [deployment](../../README.md#deployment-guide), you can test the entire EKS workflow. 5 | 6 | [Usage Guide](../../README.md#usage-guide) has been moved to the [Readme](../../README.md#usage-guide). 7 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/custom_iap_brand_admin.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | --- 16 | title: "Custom IAP brand Admin" 17 | description: "Custom role for configuring the OAuth consent screen (brand) required by IAP" 18 | stage: "BETA" 19 | includedPermissions: 20 | - clientauthconfig.brands.list 21 | - clientauthconfig.brands.create 22 | - clientauthconfig.brands.get 23 | - clientauthconfig.clients.create 24 | - clientauthconfig.clients.listWithSecrets 25 | - clientauthconfig.clients.getWithSecret 26 | - clientauthconfig.clients.delete 27 | - clientauthconfig.clients.update 28 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/CHRO.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/CHRO.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/DPU V1 Demo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/DPU V1 Demo.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/HLXB.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/HLXB.pdf 
-------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/INTJ.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/INTJ.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/RYDE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/RYDE.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/financial-documents/questions.txt: -------------------------------------------------------------------------------- 1 | When was Form S-1 submitted by CHROMOCELL THERAPEUTICS CORPORATION? 2 | How does Maxim Group LLC work with CHROMOCELL THERAPEUTICS CORPORATION? 3 | How can we automate our document processing workflow to save time and reduce errors? 4 | How can we automate our document processing workflow to save time and reduce errors? Answer in hindi. 5 | How can we automate our document processing workflow to save time and reduce errors? Generate answer in spanish. 6 | समय बचाने और त्रुटियों को कम करने के लिए हम अपने दस्तावेज़ प्रसंस्करण वर्कफ़्लो को कैसे स्वचालित कर सकते हैं? अंग्रेज़ी में उत्तर दें. 7 | How many shares are offered by Ryde Group Ltd? 8 | Create a table to list 2021 and 2022 Revenue of Ryde Group Ltd. and Intelligent Group Limited. 
9 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_06.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_06.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_07.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_07.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_08.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_08.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_09.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_09.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_10.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_10.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_11.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_12.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_13.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_13.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_14.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_14.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_15.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/CMS1500_15.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/pa-form-test20.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/pa-form-test20.pdf -------------------------------------------------------------------------------- 
/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/pa-form-test21.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/pa-form-test21.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/package_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/package_3.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/package_4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/package_4.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/package_5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/test/package_5.pdf 
-------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_01.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_01.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_02.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_02.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_03.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_03.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_04.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_04.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_05.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_05.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_16.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_16.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_17.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_18.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_18.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_19.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_19.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_20.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/CMS1500_20.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test1.pdf -------------------------------------------------------------------------------- 
/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test22.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test22.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test23.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test23.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test24.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test24.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test25.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/pa-form-test25.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/package_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/package_1.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/package_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/package_2.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/package_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/forms-to-train-docai/train/package_3.pdf -------------------------------------------------------------------------------- 
/sample-deployments/composer-orchestrated-process/documents-for-testing/healthcare-billing-codes/2024_DHS_Code_List_Addendum_11_29_2023.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/healthcare-billing-codes/2024_DHS_Code_List_Addendum_11_29_2023.xlsx -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/healthcare-billing-codes/questions.txt: -------------------------------------------------------------------------------- 1 | What are the CPT codes for Therapeutic exercises and Massage therapy? 2 | What are the CPT codes for Mri orbit/face/neck with or without dye? 3 | What does the CPT code 73030 represent? 4 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937701.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937701.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937702.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937702.pdf 
-------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937703.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937703.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937704.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937704.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937705.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937705.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937706.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937706.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937707.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937707.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937708.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937708.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937709.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937709.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937710.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/enterprise-knowledge-solution/d0540ca6b7350c029119cc3d5e6d7e89143e4f9b/sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/937710.pdf -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/documents-for-testing/network-ops-rca-documents/questions.txt: -------------------------------------------------------------------------------- 1 | Search Context: 2 | You are a Network Operations Question & Answering Agent. 3 | Answer the following questions from the RCA document only. 4 | Summarize answers from the most relevant RCA documents available in the corpus. 5 | Be factual, and return only the top two search results. 6 | 7 | Questions: 8 | Summarize the RCA for the outage in Fort Worth. 9 | How long was the outage duration in Denver? 10 | What was the customer impact in Boston? 11 | In which cities was routing affected? 12 | What is the status of RCA id 937708? 13 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/env.template: -------------------------------------------------------------------------------- 1 | ###### 2 | # 3 | # Top level settings. 4 | # 5 | # These will be fed into terraform variables and 6 | # used to provide defaults to Python code. 
7 | # 8 | 9 | # General infrastructure 10 | PROJECT_ID=${project_id} 11 | REGION=${region} 12 | BQ_REGION=${bq_region} 13 | GCS_REGION=${gcs_region} 14 | REPOSITORY_REGION=${repository_region} 15 | ARTIFACT_REPO_NAME=${artifact_repo} 16 | GCS_INPUT_BUCKET=${gcs_input_bucket} 17 | GCS_PROCESS_BUCKET=${gcs_process_bucket} 18 | GCS_REJECT_BUCKET=${gcs_reject_bucket} 19 | 20 | # Cloud run job 21 | PROCESSING_CLOUD_RUN_JOB_NAME=${processing_cloud_run_job_name} 22 | PROCESSING_SERVICE_ACCOUNT=${processing_service_account} 23 | 24 | SPECIALIZED_PARSER_CLOUD_RUN_JOB_NAME=${specialized_parser_cloud_run_job_name} 25 | SPECIALIZED_PARSER_SERVICE_ACCOUNT=${specialized_parser_service_account} 26 | 27 | CLASSIFIER_CLOUD_RUN_JOB_NAME=${classifier_cloud_run_job_name} 28 | CLASSIFIER_SERVICE_ACCOUNT=${classifier_service_account} 29 | 30 | # Agent builder configuration 31 | AGENT_BUILDER_LOCATION=${agent_builder_location} 32 | AGENT_BUILDER_DATA_STORE_ID=${agent_builder_data_store_id} 33 | AGENT_BUILDER_SEARCH_ID=${agent_builder_search_id} 34 | 35 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/outputs.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | output "gcs_input_bucket_name" { 16 | description = "GCS input bucket name" 17 | value = module.common_infra.gcs_input_bucket_name 18 | } 19 | 20 | output "gcs_process_bucket_name" { 21 | description = "GCS processing bucket name" 22 | value = module.common_infra.gcs_process_bucket_name 23 | } 24 | 25 | output "gcs_reject_bucket_name" { 26 | description = "GCS reject bucket name" 27 | value = module.common_infra.gcs_reject_bucket_name 28 | } 29 | 30 | output "composer_uri" { 31 | description = "Cloud Composer Airflow URI" 32 | value = module.dpu_workflow.composer_uri 33 | } 34 | 35 | output "agent_app_uri" { 36 | description = "Agent Builder Search App URI" 37 | value = "https://console.cloud.google.com/gen-app-builder/locations/${var.vertex_ai_data_store_region}/engines/${google_discovery_engine_search_engine.basic.engine_id}/preview/search?project=${var.project_id}" 38 | } 39 | 40 | output "webui_dns_config" { 41 | description = "DNS Record for WebUI" 42 | value = module.dpu_ui.dns_configuration 43 | } 44 | 45 | output "classifier_processor_id" { 46 | description = "The DocAI Custom Classifier processor id" 47 | value = var.custom_classifier_id 48 | } 49 | 50 | output "classifier_service_account" { 51 | description = "The service account used by the doc-classifier Cloud Run job" 52 | value = module.doc_classifier_job.classifier_service_account 53 | } 54 | 55 | output "specialized_processors_ids_json" { 56 | description = "Map of all supported labels (keys) to the corresponding processor id (values), decoded from the JSON string exported by the specialized parser module."
57 | value = jsondecode(module.specialized_parser_job.specialized_processors_ids_json) 58 | } 59 | 60 | output "composer_location" { 61 | description = "Location of Cloud Composer" 62 | value = module.dpu_workflow.composer_location 63 | } 64 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/persona_roles_DEPLOYER.txt: -------------------------------------------------------------------------------- 1 | projects/${PROJECT_ID}/roles/customIAPAdmin 2 | roles/alloydb.admin 3 | roles/artifactregistry.admin 4 | roles/bigquery.dataOwner 5 | roles/cloudbuild.builds.builder 6 | roles/composer.admin 7 | roles/compute.loadBalancerAdmin 8 | roles/compute.networkAdmin 9 | roles/compute.securityAdmin 10 | roles/container.clusterAdmin 11 | roles/discoveryengine.admin 12 | roles/dns.admin 13 | roles/documentai.admin 14 | roles/iam.serviceAccountAdmin 15 | roles/iam.serviceAccountUser 16 | roles/iap.admin 17 | roles/logging.configWriter 18 | roles/resourcemanager.projectIamAdmin 19 | roles/run.admin 20 | roles/logging.logWriter 21 | roles/servicedirectory.admin 22 | roles/serviceusage.serviceUsageAdmin 23 | roles/storage.admin 24 | roles/vpcaccess.admin 25 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/persona_roles_OPERATOR.txt: -------------------------------------------------------------------------------- 1 | roles/alloydb.viewer 2 | roles/bigquery.dataViewer 3 | roles/composer.user 4 | roles/compute.viewer 5 | roles/container.viewer 6 | roles/discoveryengine.viewer 7 | roles/documentai.viewer 8 | roles/iap.httpsResourceAccessor 9 | roles/logging.viewer 10 | roles/monitoring.viewer 11 | roles/run.viewer 12 | roles/storage.objectViewer 13 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/persona_roles_READER.txt:
-------------------------------------------------------------------------------- 1 | roles/discoveryengine.user 2 | roles/iap.httpsResourceAccessor 3 | roles/storage.objectViewer 4 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/persona_roles_UPLOADER.txt: -------------------------------------------------------------------------------- 1 | roles/storage.objectCreator 2 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/project_apis.txt: -------------------------------------------------------------------------------- 1 | aiplatform.googleapis.com 2 | artifactregistry.googleapis.com 3 | cloudasset.googleapis.com 4 | cloudbuild.googleapis.com 5 | cloudresourcemanager.googleapis.com 6 | compute.googleapis.com 7 | discoveryengine.googleapis.com 8 | documentai.googleapis.com 9 | iam.googleapis.com 10 | iap.googleapis.com 11 | orgpolicy.googleapis.com 12 | serviceusage.googleapis.com 13 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/remote-backend.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | resource "google_storage_bucket" "tf_backend" { 16 | name = "${var.project_id}-eks-tf-backend" 17 | location = var.region 18 | 19 | force_destroy = false 20 | public_access_prevention = "enforced" 21 | uniform_bucket_level_access = true 22 | 23 | versioning { 24 | enabled = true 25 | } 26 | } 27 | 28 | resource "local_file" "remote_backend" { 29 | file_permission = "0644" 30 | filename = "${path.module}/backend.tf" 31 | 32 | content = <<-EOT 33 | terraform { 34 | backend "gcs" { 35 | bucket = "${google_storage_bucket.tf_backend.name}" 36 | } 37 | } 38 | EOT 39 | } 40 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/scripts/apply_persona_roles.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o nounset 19 | 20 | PARENT_DIR="$(dirname "$0")" 21 | 22 | # shellcheck source=/dev/null 23 | .
"$PARENT_DIR/common.sh" 24 | 25 | section_open "Check that necessary environment variables are set" 26 | check_mandatory_variable "PROJECT_ID" "set the PROJECT_ID where IAM roles will be applied" 27 | section_close 28 | 29 | section_open "Enable required IAM roles for the UPLOADER persona" 30 | check_and_set_persona "UPLOADER" 31 | section_close 32 | 33 | section_open "Enable required IAM roles for the DEPLOYER persona" 34 | check_and_set_persona "DEPLOYER" 35 | section_close 36 | 37 | section_open "Enable required IAM roles for the OPERATOR (read-only) persona" 38 | check_and_set_persona "OPERATOR" 39 | section_close 40 | 41 | section_open "Enable required IAM roles for the READER persona" 42 | check_and_set_persona "READER" 43 | section_close 44 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/scripts/find_document.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Usage: find_document.sh SEARCH_STRING 18 | # 19 | # Lists the documents of an Agent Builder data store through the Discovery 20 | # Engine API and prints the response lines that contain SEARCH_STRING. 21 | # PROJECT_ID, AGENT_BUILDER_LOCATION and AGENT_BUILDER_DATA_STORE_ID are taken 22 | # from the environment; any that is empty is prompted for interactively. 23 | 24 | # PROJECT_NUM="536170242658" 25 | # AGENT_BUILDER_LOCATION="global" 26 | # AGENT_BUILDER_DATA_STORE_ID="dpu-doc-store" 27 | 28 | # Check if a search string is provided as an argument 29 | if [ -z "$1" ]; then 30 | echo "Error: Search string is required as an argument."
>&2 31 | exit 1 32 | fi 33 | 34 | # Check if PROJECT_ID is set, otherwise prompt for input 35 | if [[ -z "$PROJECT_ID" ]]; then 36 | read -r -p "Enter PROJECT_ID: " PROJECT_ID 37 | else 38 | echo "PROJECT_ID is set to: $PROJECT_ID" 39 | fi 40 | [[ -z "$PROJECT_ID" ]] && echo "PROJECT_ID is required." >&2 && exit 1 41 | 42 | # Look the project number up by exact project ID. Filtering the output of 43 | # `gcloud projects list` with grep also matches every other project whose ID 44 | # or name merely contains PROJECT_ID. 45 | PROJECT_NUM=$(gcloud projects describe "${PROJECT_ID}" --format="value(projectNumber)") 46 | [[ -z "$PROJECT_NUM" ]] && echo "Could not determine the project number of $PROJECT_ID." >&2 && exit 1 47 | 48 | # Check if Agent Builder Datastore location/region is set, otherwise prompt for input 49 | if [[ -z "$AGENT_BUILDER_LOCATION" ]]; then 50 | read -r -p "Enter Agent Builder Datastore location/region: " AGENT_BUILDER_LOCATION 51 | else 52 | echo "Agent Builder Datastore location/region is set to: $AGENT_BUILDER_LOCATION" 53 | fi 54 | [[ -z "$AGENT_BUILDER_LOCATION" ]] && echo "Agent Builder Datastore location/region is required." >&2 && exit 1 55 | 56 | # Check if Agent Builder Datastore ID is set, otherwise prompt for input 57 | if [[ -z "$AGENT_BUILDER_DATA_STORE_ID" ]]; then 58 | read -r -p "Enter Agent Builder Datastore ID: " AGENT_BUILDER_DATA_STORE_ID 59 | else 60 | echo "Agent Builder Datastore ID is set to: $AGENT_BUILDER_DATA_STORE_ID" 61 | fi 62 | [[ -z "$AGENT_BUILDER_DATA_STORE_ID" ]] && echo "Agent Builder Datastore ID is required." >&2 && exit 1 63 | 64 | # Set the string to search for from the first argument 65 | search_string="$1" 66 | 67 | # Run the curl command and pipe the output to grep; `--` keeps a search 68 | # string that starts with '-' from being parsed as a grep option.
69 | curl -X GET \ 70 | -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \ 71 | -H "x-goog-user-project: $PROJECT_NUM" \ 72 | "https://discoveryengine.googleapis.com/v1alpha/projects/$PROJECT_NUM/locations/$AGENT_BUILDER_LOCATION/collections/default_collection/dataStores/$AGENT_BUILDER_DATA_STORE_ID/branches/default_branch/documents" | grep -- "$search_string" 73 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/scripts/pre_tf_setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -o errexit 18 | set -o nounset 19 | 20 | # shellcheck source=/dev/null 21 | .
"$(dirname "$0")/common.sh" 22 | 23 | section_open "Check if the necessary dependencies are available: gcloud, terraform" 24 | check_exec_dependency "gcloud" 25 | check_exec_version "gcloud" 26 | check_exec_dependency "terraform" 27 | check_exec_version "terraform" 28 | section_close 29 | 30 | section_open "Check and set mandatory environment variables" 31 | check_mandatory_variable "PROJECT_ID" "the Google Cloud project where resources are created" 32 | check_mandatory_variable "REGION" "the Google Cloud region where resources are created" 33 | check_mandatory_variable "IAP_ADMIN_ACCOUNT" "the user or group configured as the contact for IAP consent screen" 34 | set_active_principal 35 | gcloud config unset billing/quota_project 36 | gcloud config set project "${PROJECT_ID}" 37 | section_close 38 | 39 | section_open "Enable the required APIs for bootstrap scripts" 40 | enable_bootstrap_apis 41 | section_close 42 | 43 | section_open "Setup OAuth consent screen (brand) required for IAP" 44 | create_oauth_consent_config 45 | section_close 46 | 47 | section_open "Create deployer service account and enable $ACTIVE_PRINCIPAL to use service account impersonation " 48 | create_service_account_and_enable_impersonation 49 | section_close 50 | 51 | section_open "Enable all the required IAM roles for deployer service account, serviceAccount:""${SERVICE_ACCOUNT_ID}""" 52 | enable_persona_roles "serviceAccount:${SERVICE_ACCOUNT_ID}" "persona_roles_DEPLOYER.txt" "DEPLOYER" 53 | section_close 54 | 55 | section_open "Set Application Default Credentials to be used by Terraform" 56 | set_adc 57 | section_close 58 | -------------------------------------------------------------------------------- /sample-deployments/composer-orchestrated-process/scripts/trigger_workflow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2024 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Just a small helper to your developers - a small bash function to trigger the DAG from the command line: 18 | 19 | PARENT_DIR="$(dirname "$0")" 20 | 21 | function trigger_dag() { 22 | # read terraform state 23 | outputs=$(terraform -chdir="$PARENT_DIR/../" output -json) 24 | 25 | json_config=$( 26 | cat <