├── .bumpversion.toml
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   ├── feature_request.md
│   │   └── i_need_help.md
│   └── PULL_REQUEST_TEMPLATE
│       └── pull_request_template.md
├── .gitignore
├── .gitlab-ci.yml
├── .gitlab
│   ├── issue_templates
│   │   ├── bug_report.md
│   │   ├── feature_request.md
│   │   └── i_need_help.md
│   └── merge_request_templates
│       ├── Default.md
│       └── Release.md
├── .trunk
│   ├── .gitignore
│   ├── configs
│   │   ├── .hadolint.yaml
│   │   ├── .isort.cfg
│   │   ├── .markdownlint.yaml
│   │   ├── .shellcheckrc
│   │   ├── .yamllint.yaml
│   │   ├── ruff.toml
│   │   └── svgo.config.js
│   └── trunk.yaml
├── 00_llm_endpoint_setup
│   ├── FalconLite
│   │   ├── build-container
│   │   │   ├── Dockerfile
│   │   │   ├── build.sh
│   │   │   ├── layers.py
│   │   │   └── sagemaker-entrypoint.sh
│   │   └── deploy_FalconLite.ipynb
│   ├── codebuild
│   │   ├── embeddings
│   │   │   ├── buildspec.yml
│   │   │   ├── code
│   │   │   │   └── inference.py
│   │   │   ├── endpoint-config-template.yml
│   │   │   └── scripts
│   │   │       └── build.py
│   │   └── llm
│   │       ├── buildspec.yml
│   │       ├── endpoint-config-template.yml
│   │       └── scripts
│   │           └── build.py
│   ├── deploy-falcon-40b-instruct.ipynb
│   ├── deploy_embeddings_model_sagemaker_endpoint.ipynb
│   └── deploy_llama2_70B.ipynb
├── 01_crawler
│   ├── README.md
│   ├── buildspec.yml
│   ├── crawly
│   │   ├── configs
│   │   │   ├── admin-ch-press-releases-de.json
│   │   │   └── admin-ch-press-releases-en.json
│   │   ├── custom_middlewares.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── scrapy.cfg
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── webpage_spider.py
│   ├── poetry.lock
│   └── pyproject.toml
├── 02_ingestion
│   ├── 01_opensearch_cluster_management.ipynb
│   ├── 02_generate_qa_pairs.ipynb
│   ├── 03_run_qa_evaluation.ipynb
│   ├── 04_ingest_html_embeddings_to_opensearch.ipynb
│   ├── 50_ingest_stock_embeddings_to_opensearch.ipynb
│   ├── buildspec_admin_ch.yml
│   ├── buildspec_fin_analyzer.yml
│   ├── poetry.lock
│   ├── pyproject.toml
│   └── scripts
│       ├── admin_ch_embedding.py
│       ├── fin_analyzer_data.py
│       ├── fin_analyzer_embedding.py
│       └── modules
│           ├── aws_helpers.py
│           ├── embedding.py
│           └── opensearch_helpers.py
├── 03_chatbot
│   ├── .dockerignore
│   ├── .streamlit
│   │   └── config.toml
│   ├── Dockerfile
│   ├── README.md
│   ├── bin
│   │   └── icons
│   │       ├── bug.png
│   │       └── bug.svg
│   ├── entrypoint.sh
│   ├── example_app_configs
│   │   ├── bedrock_endpoint.appconfig.json
│   │   ├── bedrock_iam.appconfig.json
│   │   ├── bedrock_multi_region.appconfig.json
│   │   └── bedrock_prompts.appconfig.json
│   ├── generate_internationalization.sh
│   ├── generate_secrets.py
│   ├── images
│   │   ├── Amazon SageMaker endpoint tags dynamic discovery.png
│   │   └── Genie_LLM_App_chatbot_code_flow.png
│   ├── poetry.lock
│   ├── pyproject.toml
│   ├── scripts
│   │   └── setup.sh
│   ├── src
│   │   ├── chatbot
│   │   │   ├── __init__.py
│   │   │   ├── __main__.py
│   │   │   ├── appconfig.json
│   │   │   ├── catalog
│   │   │   │   ├── __init__.py
│   │   │   │   ├── agent_chain_catalog.py
│   │   │   │   ├── agent_chain_catalog_item.py
│   │   │   │   ├── agent_chain_catalog_item_financial_analysis.py
│   │   │   │   ├── agent_chain_catalog_item_sql_generator.py
│   │   │   │   ├── agent_tools_catalog_item.py
│   │   │   │   ├── catalog.py
│   │   │   │   ├── catalog_item.py
│   │   │   │   ├── flow_catalog.py
│   │   │   │   ├── flow_catalog_item.py
│   │   │   │   ├── flow_catalog_item_agent.py
│   │   │   │   ├── flow_catalog_item_rag.py
│   │   │   │   ├── flow_catalog_item_simple_chat.py
│   │   │   │   ├── flow_catalog_item_upload_file.py
│   │   │   │   ├── memory_catalog.py
│   │   │   │   ├── memory_catalog_item.py
│   │   │   │   ├── memory_catalog_item_dynamodb_table.py
│   │   │   │   ├── model_catalog.py
│   │   │   │   ├── model_catalog_item.py
│   │   │   │   ├── model_catalog_item_bedrock.py
│   │   │   │   ├── model_catalog_item_sagemaker.py
│   │   │   │   ├── prompt_catalog.py
│   │   │   │   ├── prompt_catalog_item.py
│   │   │   │   ├── retriever_catalog.py
│   │   │   │   ├── retriever_catalog_item.py
│   │   │   │   ├── retriever_catalog_item_kendra.py
│   │   │   │   └── retriever_catalog_item_open_search.py
│   │   │   ├── config
│   │   │   │   ├── __init__.py
│   │   │   │   ├── amazon_bedrock.py
│   │   │   │   ├── app_config.py
│   │   │   │   ├── appearance.py
│   │   │   │   ├── aws_config.py
│   │   │   │   ├── aws_region.py
│   │   │   │   ├── fin_analyzer.py
│   │   │   │   ├── flow_config.py
│   │   │   │   ├── iam.py
│   │   │   │   ├── llm_config.py
│   │   │   │   └── parser_helpers.py
│   │   │   ├── embeddings
│   │   │   │   ├── __init__.py
│   │   │   │   └── sagemaker_endpoint_embeddings.py
│   │   │   ├── fin_analyzer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── fin_analyzer_index_retriever.py
│   │   │   │   ├── prompts
│   │   │   │   │   ├── anthropic_claude_chat.yaml
│   │   │   │   │   └── anthropic_claude_rag.yaml
│   │   │   │   ├── readme.md
│   │   │   │   └── retriever_catalog_item_fin_analyzer.py
│   │   │   ├── helpers
│   │   │   │   ├── __init__.py
│   │   │   │   ├── aws_helpers.py
│   │   │   │   ├── environment_variables.py
│   │   │   │   ├── langchain_bedrock_overwrite.py
│   │   │   │   ├── logger
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── app_logging.py
│   │   │   │   │   ├── llm_logging_handler.py
│   │   │   │   │   └── log_to_ui_handler.py
│   │   │   │   ├── sagemaker_async_endpoint.py
│   │   │   │   └── urls.py
│   │   │   ├── i18n
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chatbot.pot
│   │   │   │   ├── de_DE
│   │   │   │   │   └── LC_MESSAGES
│   │   │   │   │       └── chatbot.po
│   │   │   │   ├── en_US
│   │   │   │   │   └── LC_MESSAGES
│   │   │   │   │       └── chatbot.po
│   │   │   │   └── internationalization.py
│   │   │   ├── json_schema
│   │   │   │   ├── Readme
│   │   │   │   └── aws_awsomechat_app_config.schema.json
│   │   │   ├── llm_app.py
│   │   │   ├── open_search
│   │   │   │   ├── __init__.py
│   │   │   │   └── open_search_index_retriever.py
│   │   │   ├── prompts
│   │   │   │   ├── ai21_jurassic_chat.yaml
│   │   │   │   ├── ai21_jurassic_rag.yaml
│   │   │   │   ├── anthropic_claude_agent_financial_analyzer.yaml
│   │   │   │   ├── anthropic_claude_agent_sql.yaml
│   │   │   │   ├── anthropic_claude_chat.yaml
│   │   │   │   ├── anthropic_claude_rag.yaml
│   │   │   │   ├── condense_question.yaml
│   │   │   │   ├── default_chat.yaml
│   │   │   │   ├── default_rag.yaml
│   │   │   │   ├── falcon_chat.yaml
│   │   │   │   ├── falcon_fine_tuned_greetings_rag.yaml
│   │   │   │   ├── falcon_instruct_rag.yaml
│   │   │   │   ├── meta_llama2_chat.yaml
│   │   │   │   └── meta_llama2_rag.yaml
│   │   │   └── ui
│   │   │       ├── __init__.py
│   │   │       ├── about_page.py
│   │   │       ├── auth.py
│   │   │       ├── chat_messages.py
│   │   │       ├── chatbot_app.py
│   │   │       ├── sidebar.py
│   │   │       ├── stream_handler.py
│   │   │       └── topbar.py
│   │   ├── icons
│   │   │   └── X-Ray.png
│   │   └── run_module.py
│   └── tests
│       ├── conftest.py
│       └── test_chat_end_to_end.py
├── 04_finetuning
│   ├── deploy_llms_with_qlora
│   │   └── deploy_fine_tuned_falcon.ipynb
│   └── train_llms_with_qlora
│       ├── fine-tune-falcon.ipynb
│       └── scripts
│           ├── requirements.txt
│           └── run_clm.py
├── 05_doc
│   ├── app-screenshot.png
│   ├── architecture.drawio.svg
│   ├── architecture.png
│   ├── companion_architecture_simple.drawio
│   ├── companion_architecture_simple.drawio.png
│   ├── companion_architecture_simple.drawio.svg
│   ├── deployment-overview.drawio.svg
│   └── deployment-overview.png
├── 06_automation
│   ├── README.md
│   ├── app.py
│   ├── cdk.json
│   ├── configs
│   │   └── dev.json
│   ├── modules
│   │   ├── config.py
│   │   ├── kendra
│   │   │   ├── __init__.py
│   │   │   ├── data_source_is_complete_lambda
│   │   │   │   └── function.py
│   │   │   ├── data_source_lambda
│   │   │   │   └── function.py
│   │   │   └── kendra_data_source.py
│   │   ├── ssm_parameter_reader.py
│   │   └── stack.py
│   ├── package.json
│   ├── poetry.lock
│   ├── pyproject.toml
│   ├── stacks
│   │   ├── README.md
│   │   ├── chatbot
│   │   │   ├── cert_lambda
│   │   │   │   ├── function.py
│   │   │   │   └── requirements.txt
│   │   │   ├── chatbot_stack.py
│   │   │   └── chatbot_vpc_stack.py
│   │   ├── core
│   │   │   └── core_stack.py
│   │   ├── deployment_pipeline
│   │   │   ├── buildspec-develop.yml
│   │   │   ├── buildspec-main.yml
│   │   │   └── deployment_pipeline_stack.py
│   │   ├── kendra_datasources
│   │   │   └── kendra_datasources_stack.py
│   │   ├── kendra_index
│   │   │   └── kendra_index_stack.py
│   │   ├── llm_pipeline
│   │   │   └── llm_pipeline_stack.py
│   │   ├── opensearch_domain
│   │   │   ├── opensearch_domain_stack.py
│   │   │   ├── opensearch_private_vpc_stack.py
│   │   │   ├── opensearch_vpc_endpoint_stack.py
│   │   │   └── vpc_endpoint_lambda
│   │   │       └── function.py
│   │   ├── opensearch_ingestion_pipeline
│   │   │   └── opensearch_ingestion_pipeline_stack.py
│   │   ├── sagemaker_studio_domain
│   │   │   └── sagemaker_studio_domain_stack.py
│   │   └── shared
│   │       ├── s3_access_logs_stack.py
│   │       └── vpc_peering_stack.py
│   └── tests
│       ├── __init__.py
│       ├── cdk_nag_test.py
│       └── unit
│           └── __init__.py
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── codebuild_build.sh
└── resize-disk.sh
--------------------------------------------------------------------------------
/.bumpversion.toml:
--------------------------------------------------------------------------------
[tool.bumpversion]
current_version = "1.3.6"
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
# Bug

**Describe the bug**
A clear and concise description of what the bug is.

**Error message**
The error that was thrown (if available)

**Expected behavior**
A clear and concise description of what you expected to happen.

**Additional context**
Add any other context about the problem here, such as document types, preprocessing steps, or reader settings.

**To Reproduce**
Steps to reproduce the behavior, for example the query you entered into the Genie chatbot.

**System:**

- OS:
- LLM App Genie release version (commit or version number):
- DocumentStore (Amazon Kendra or Amazon OpenSearch):
- Large Language Model (Falcon, Amazon Titan, Anthropic Claude 2, ...):
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
# Feature Request

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/i_need_help.md:
--------------------------------------------------------------------------------
# I need help

If you have a general question or need help with something specific that is not a bug or feature request, you can create an issue with this template.

If you want to report a bug or request a feature, please consider opening the issue with the Bug Report template or the Feature Request template instead.

**Question**
Please describe your question or the topic that you need help with.

We will contact you in the comments of this issue. Please monitor it in case we have any follow-up questions for you.
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md:
--------------------------------------------------------------------------------
# \[Replace with Title\]

### Description

Describe the problem or task that this pull request addresses.

### Related Issues

If applicable, reference the issues this pull request fixes.

- fixes #issue-number

### Additional Notes

Include any extra information or considerations for reviewers, such as impacted areas of the codebase.

### Pull Request Checklists

- [ ] I am using [conventional commit types](https://www.conventionalcommits.org/en/v1.0.0/) for my merge request title. Your title should follow the structure `type([optional scope]): description`.

Common types are:

- feat (for enhancements)
- bug (for bug fixes)
- docs (for changes to the documentation)
- test (for changes to the tests)
- perf (for performance improvements)
- refactor (for code refactorings)

If your change breaks backwards compatibility, use a `!` after the type to indicate that your merge request contains breaking changes.

Examples:

- feat(chatbot): add local document upload
- bug: fix CodePipeline
- refactor!: change environment variable names

- [ ] I fixed any [trunk](../../README.md#pre-requisites-for-development) issues.
      Code follows project coding guidelines.
- [ ] I updated the documentation to reflect the changes

**By submitting this pull request, I confirm that my contribution is made under the terms of the [MIT-0](../../LICENSE).**
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
**/.crawls/
**/.idea
**/*.pyc
**/node_modules
*.ipynb_checkpoints
**/.DS_Store
*.swp
**/package-lock.json
**/__pycache__
**/.pytest_cache
.venv
**/*.egg-info
# CDK asset staging directory
**/.cdk.staging
**/cdk.out
!**/*.schema.json
!appconfig.json
infrastructure/genaiax.code-workspace
*/~*.pptx
.vscode/launch.json
**/launch.json
**/cdk.context.json
**/web-content/
env.list
.vscode/
secrets.toml

**/bedrock-python-sdk/
**/logs/


# binary internationalization files
*.mo

**/test-results/*
report.html
**/screenshots/*
**/demos/
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
workflow:
  rules:
    - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "main"
code commit mirror:
  image: python:3.11
  stage: deploy
  before_script:
    - pip install git-remote-codecommit
    - echo $CODE_COMMIT_REPO
    - git checkout main # checkout main to avoid deleting default branch
    - git checkout develop # checkout develop to avoid deleting develop branch
    - git checkout "$CI_COMMIT_REF_NAME"
    - git status
  script:
    - git push --mirror $CODE_COMMIT_REPO
--------------------------------------------------------------------------------
/.gitlab/issue_templates/bug_report.md:
--------------------------------------------------------------------------------
# Bug

**Describe the bug**
A clear and concise description of what the bug is.

**Error message**
The error that was thrown (if available)

**Expected behavior**
A clear and concise description of what you expected to happen.

**Additional context**
Add any other context about the problem here, such as document types, preprocessing steps, or reader settings.

**To Reproduce**
Steps to reproduce the behavior, for example the query you entered into the Genie chatbot.

**System:**

- OS:
- LLM App Genie release version (commit or version number):
- DocumentStore (Amazon Kendra or Amazon OpenSearch):
- Large Language Model (Falcon, Amazon Titan, Anthropic Claude 2, ...):
--------------------------------------------------------------------------------
/.gitlab/issue_templates/feature_request.md:
--------------------------------------------------------------------------------
# Feature Request

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/.gitlab/issue_templates/i_need_help.md:
--------------------------------------------------------------------------------
# I need help

If you have a general question or need help with something specific that is not a bug or feature request, you can create an issue with this template.

If you want to report a bug or request a feature, please consider opening the issue with the Bug Report template or the Feature Request template instead.

**Question**
Please describe your question or the topic that you need help with.

We will contact you in the comments of this issue. Please monitor it in case we have any follow-up questions for you.
--------------------------------------------------------------------------------
/.gitlab/merge_request_templates/Default.md:
--------------------------------------------------------------------------------
# \[Replace with Title\]

### Description

Describe the problem or task that this merge request addresses.

### Related Issues

If applicable, reference the issues this merge request fixes.

- fixes #issue-number

### Additional Notes

Include any extra information or considerations for reviewers, such as impacted areas of the codebase.

### Merge Request Checklists

- [ ] I am using [conventional commit types](https://www.conventionalcommits.org/en/v1.0.0/) for my merge request title. Your title should follow the structure `type([optional scope]): description`.

Common types are:

- feat (for enhancements)
- bug (for bug fixes)
- docs (for changes to the documentation)
- test (for changes to the tests)
- perf (for performance improvements)
- refactor (for code refactorings)

If your change breaks backwards compatibility, use a `!` after the type to indicate that your merge request contains breaking changes.

Examples:

- feat(chatbot): add local document upload
- bug: fix CodePipeline
- refactor!: change environment variable names

- [ ] I fixed any [trunk](../../README.md#pre-requisites-for-development) issues.
      Code follows project coding guidelines.
- [ ] I updated the documentation to reflect the changes
- [ ] I tagged this merge request with the next MINOR [semver](https://semver.org/) version number, or with the next PATCH version number if it is a bug fix. You can find the prior version in GitLab in Issues > Milestones. If the highest version number in the Milestones is `v1.0.1` and this merge request is for a bug fix, then you should create a new milestone `v1.0.2` (PATCH version increased by one) and tag this merge request with milestone `v1.0.2`. If your merge request is a change other than fixing a bug, then you should create a new milestone `v1.1.0` (MINOR version increased by one) and use it to tag this merge request.

**By submitting this pull request, I confirm that my contribution is made under the terms of the [MIT-0](../../LICENSE).**
--------------------------------------------------------------------------------
/.gitlab/merge_request_templates/Release.md:
--------------------------------------------------------------------------------
# \[Replace with Title\]

### Release Notes

Describe what changed compared to the last release.

### Upgrade Notes

Document the steps to migrate from the last release to this release.

### Merge Request Checklists

- [ ] Documentation reflects the changes made.
- [ ] I tagged this merge request with the next MAJOR [semver](https://semver.org/) version number.
- [ ] The version numbers in all the subfolders match this release version number.
- [ ] Verify that the release changes pass QA.

**By submitting this pull request, I confirm that my contribution is made under the terms of the [MIT-0](../../LICENSE).**
--------------------------------------------------------------------------------
/.trunk/.gitignore:
--------------------------------------------------------------------------------
*out
*logs
*actions
*notifications
*tools
plugins
user_trunk.yaml
user.yaml
--------------------------------------------------------------------------------
/.trunk/configs/.hadolint.yaml:
--------------------------------------------------------------------------------
# Following source doesn't work in most setups
ignored:
  - SC1090
  - SC1091
--------------------------------------------------------------------------------
/.trunk/configs/.isort.cfg:
--------------------------------------------------------------------------------
[settings]
profile=black
--------------------------------------------------------------------------------
/.trunk/configs/.markdownlint.yaml:
--------------------------------------------------------------------------------
# Autoformatter friendly markdownlint config (all formatting rules disabled)
default: true
blank_lines: false
bullet: false
html: false
indentation: false
line_length: false
spaces: false
url: false
whitespace: false
--------------------------------------------------------------------------------
/.trunk/configs/.shellcheckrc:
--------------------------------------------------------------------------------
enable=all
source-path=SCRIPTDIR
disable=SC2154

# If you're having issues with shellcheck following source, disable the errors via:
# disable=SC1090
# disable=SC1091
--------------------------------------------------------------------------------
/.trunk/configs/.yamllint.yaml:
--------------------------------------------------------------------------------
rules:
  quoted-strings:
    required: only-when-needed
    extra-allowed: ["{|}"]
  empty-values:
    forbid-in-block-mappings: true
    forbid-in-flow-mappings: true
  key-duplicates: {}
  octal-values:
    forbid-implicit-octal: true
--------------------------------------------------------------------------------
/.trunk/configs/ruff.toml:
--------------------------------------------------------------------------------
# Generic, formatter-friendly config.
select = ["B", "D3", "D4", "E", "F"]

# Never enforce `E501` (line length violations). This should be handled by formatters.
ignore = ["E501"]

[pydocstyle]
convention = "google" # Accepts: "google", "numpy", or "pep257".
--------------------------------------------------------------------------------
/.trunk/configs/svgo.config.js:
--------------------------------------------------------------------------------
module.exports = {
  plugins: [
    {
      name: "preset-default",
      params: {
        overrides: {
          removeViewBox: false, // https://github.com/svg/svgo/issues/1128
          sortAttrs: true,
          removeOffCanvasPaths: true,
        },
      },
    },
  ],
};
--------------------------------------------------------------------------------
/.trunk/trunk.yaml:
--------------------------------------------------------------------------------
version: 0.1
cli:
  version: 1.14.2
plugins:
  sources:
    - id: trunk
      ref: v1.2.2
      uri: https://github.com/trunk-io/plugins
runtimes:
  enabled:
    - go@1.21.0
    - node@18.12.1
    - python@3.10.8
lint:
  enabled:
    - bandit@1.7.5
    - black@23.7.0
    - checkov@2.4.9
    - git-diff-check
    - hadolint@2.12.0
    - isort@5.12.0
    - markdownlint@0.35.0
    - osv-scanner@1.3.6
    - oxipng@8.0.0
    - prettier@3.0.2
    - ruff@0.0.286
    - shellcheck@0.9.0
    - shfmt@3.6.0
    - svgo@3.0.2
    - taplo@0.8.1
    - terrascan@1.18.3
    - trivy@0.44.1
    - trufflehog@3.54.0
    - yamllint@1.32.0
    - pylint@2.17.5
    - pyright@1.1.323

actions:
  enabled:
    - trunk-announce
    - trunk-check-pre-push
    - trunk-fmt-pre-commit
    - trunk-upgrade-available
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/FalconLite/build-container/Dockerfile:
--------------------------------------------------------------------------------
FROM ghcr.io/huggingface/text-generation-inference:0.9.2

COPY layers.py /opt/conda/lib/python3.9/site-packages/text_generation_server/utils/layers.py

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

HEALTHCHECK --timeout=300 CMD curl -f http://localhost:80/health

ENTRYPOINT ["./entrypoint.sh"]
CMD [ "" ]
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/FalconLite/build-container/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash

REPO_NAME=${1:-custom-tgi-ecr}
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
REGION=$(aws configure get region)

echo "REPO_NAME: ${REPO_NAME}"
echo "REGION: ${REGION}"
echo "ACCOUNT_ID: ${ACCOUNT_ID}"

# Pre-pull the base image used by the Dockerfile (text-generation-inference:0.9.2)
docker pull ghcr.io/huggingface/text-generation-inference:0.9.2

aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com

aws ecr describe-repositories --repository-names ${REPO_NAME} --region $REGION
if [ $? -ne 0 ]; then
  echo "Creating ECR repository: ${REPO_NAME}"
  aws ecr create-repository --repository-name $REPO_NAME --region $REGION
fi

docker build -t $REPO_NAME .

docker tag $REPO_NAME:latest $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:latest

docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:latest

echo "Container URI:"
echo "$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:latest"
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/FalconLite/build-container/sagemaker-entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [[ -z ${HF_MODEL_ID} ]]; then
  echo "HF_MODEL_ID must be set"
  exit 1
fi
export MODEL_ID="${HF_MODEL_ID}"

if [[ -n ${HF_MODEL_REVISION} ]]; then
  export REVISION="${HF_MODEL_REVISION}"
fi

if [[ -n ${SM_NUM_GPUS} ]]; then
  export NUM_SHARD="${SM_NUM_GPUS}"
fi

if [[ -n ${HF_MODEL_QUANTIZE} ]]; then
  export QUANTIZE="${HF_MODEL_QUANTIZE}"
fi

if [[ -n ${HF_MODEL_TRUST_REMOTE_CODE} ]]; then
  export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
fi

if [[ -n ${GPTQ_BITS} ]]; then
  export GPTQ_BITS="${GPTQ_BITS}"
fi

if [[ -n ${GPTQ_GROUPSIZE} ]]; then
  export GPTQ_GROUPSIZE="${GPTQ_GROUPSIZE}"
fi

if [[ -n ${DNTK_ALPHA_SCALER} ]]; then
  export DNTK_ALPHA_SCALER="${DNTK_ALPHA_SCALER}"
fi

if [[ -n ${MAX_BATCH_PREFILL_TOKENS} ]]; then
  export MAX_BATCH_PREFILL_TOKENS="${MAX_BATCH_PREFILL_TOKENS}"
fi

if [[ -n ${MAX_BATCH_TOTAL_TOKENS} ]]; then
  export MAX_BATCH_TOTAL_TOKENS="${MAX_BATCH_TOTAL_TOKENS}"
fi
text-generation-launcher --port 8080
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/codebuild/embeddings/buildspec.yml:
--------------------------------------------------------------------------------
version: 0.2

phases:
  install:
    on-failure: ABORT
    runtime-versions:
      python: 3.11
    commands:
      - python -m pip install --upgrade pip
      - apt update -y
      - curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
      - apt install git-lfs git -y
      - git lfs install
      - export MODEL_ID=e5-large-v2
      - git clone https://huggingface.co/intfloat/${MODEL_ID}
      - export S3_PREFIX="custom_inference/${MODEL_ID}/model.tar.gz"
      - cp -r code/ ${MODEL_ID}/code/
      - pip install sagemaker
  build:
    on-failure: ABORT
    commands:
      - cd $MODEL_ID
      - git lfs pull
      - tar zcvf model.tar.gz *
      - aws s3 cp model.tar.gz "${S3_BUCKET}/${S3_PREFIX}"
      - export S3_LOCATION="${S3_BUCKET}/${S3_PREFIX}"
      - cd ../
      - python scripts/build.py --model-execution-role "${MODEL_EXECUTION_ROLE_ARN}" --s3-bucket "${S3_BUCKET}" --instance-type "${INSTANCE_TYPE}" --export-config "${EXPORT_CONFIG}" --region "${REGION}" --s3-model-data-url "${S3_LOCATION}" --endpoint-name "${ENDPOINT_NAME}"
      - aws cloudformation package --template endpoint-config-template.yml --s3-bucket "$ARTIFACT_BUCKET" --output-template "$EXPORT_TEMPLATE_NAME"
      - cat "$EXPORT_TEMPLATE_NAME"
      - cat "$EXPORT_CONFIG"
artifacts:
  files:
    - "${EXPORT_TEMPLATE_NAME}"
    - "${EXPORT_CONFIG}"
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/codebuild/embeddings/code/inference.py:
--------------------------------------------------------------------------------
import torch
from transformers import AutoModel, AutoTokenizer


def average_pool(
    last_hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def model_fn(model_dir):
    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModel.from_pretrained(model_dir)
    return model, tokenizer


def predict_fn(data, model_and_tokenizer):
    # destruct model and tokenizer
    model, tokenizer = model_and_tokenizer
    print(data)
    # Extract the documents and the query flag; e5 models expect a
    # "query: " prefix for queries and "passage: " for documents.
    texts = data.pop("texts", data)
    isQuery = data.pop("isQuery", False)
    prefix = "passage: "
    if isQuery:
        prefix = "query: "

    texts = [prefix + t for t in texts]
    # Tokenize the input texts
    encoded_input = tokenizer(
        texts, max_length=512, padding=True, truncation=True, return_tensors="pt"
    )

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    embeddings = average_pool(
        model_output.last_hidden_state, encoded_input["attention_mask"]
    )

    # return dictionary, which will be json serializable
    return {"vectors": embeddings.detach().numpy().tolist()}
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/codebuild/embeddings/endpoint-config-template.yml:
--------------------------------------------------------------------------------
Description: Template to deploy an embedding endpoint to Amazon SageMaker
Parameters:
  ModelExecutionRoleArn:
    Type: String
    Description: Execution role used for deploying the model.
  EndpointInstanceType:
    Type: String
    Description: The ML compute instance type for the endpoint.
  EndpointName:
    Type: String
    Description: Name of the SageMaker endpoint hosting the embeddings model
  Image:
    Type: String
    Description: ECR image for the serving container
  Region:
    Type: String
    Description: AWS Region where to deploy
  ModelDataUrl:
    Type: String
    Description: S3 location where the model.tar.gz is stored

Resources:
  Model:
    Type: AWS::SageMaker::Model
    Properties:
      PrimaryContainer:
        Image: !Ref Image
        Mode: SingleModel
        ModelDataUrl: !Ref ModelDataUrl
        Environment:
          SAGEMAKER_CONTAINER_LOG_LEVEL: 20
          SAGEMAKER_REGION: !Ref Region
      EnableNetworkIsolation: false
      ExecutionRoleArn: !Ref ModelExecutionRoleArn

  EndpointConfig:
    Type: AWS::SageMaker::EndpointConfig
    Properties:
      ProductionVariants:
        - InitialInstanceCount: 1
          InitialVariantWeight: 1.0
          InstanceType: !Ref EndpointInstanceType
          ModelName: !GetAtt Model.ModelName
          VariantName: AllTraffic
          ContainerStartupHealthCheckTimeoutInSeconds: 600

  Endpoint:
    Type: AWS::SageMaker::Endpoint
    Properties:
      EndpointName: !Ref EndpointName
      EndpointConfigName: !GetAtt EndpointConfig.EndpointConfigName
      Tags:
        - Key: genie:deployment
          Value: 'True'
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/codebuild/embeddings/scripts/build.py:
--------------------------------------------------------------------------------
import argparse
import json
import logging
import os

import boto3
from sagemaker.huggingface.model import HuggingFaceModel

logger = logging.getLogger(__name__)
sm_client = boto3.client("sagemaker")


def extend_config(args, stage_config):
    """
    Extend the stage configuration with additional parameters and tags.
    """
    # Create new params and tags

    model = HuggingFaceModel(
        transformers_version="4.26", pytorch_version="1.13", py_version="py39"
    )

    new_params = {
        "ModelExecutionRoleArn": args.model_execution_role,
        "EndpointInstanceType": args.instance_type,
        "Image": model.serving_image_uri(
            region_name=args.region, instance_type=args.instance_type
        ),
        "Region": args.region,
        "ModelDataUrl": args.s3_model_data_url,
        "EndpointName": args.endpoint_name,
    }

    return {
        "Parameters": {**new_params},
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log-level", type=str, default=os.environ.get("LOGLEVEL", "INFO").upper()
    )
    parser.add_argument("--model-execution-role", type=str, required=True)
    parser.add_argument("--instance-type", type=str, required=True)
    parser.add_argument("--export-config", type=str, required=True)
    parser.add_argument("--s3-model-data-url", type=str, required=True)
    parser.add_argument("--region", type=str, required=True)
    parser.add_argument("--endpoint-name", type=str, required=True)

    args, _ = parser.parse_known_args()

    # Configure logging to output the line number and message
    log_format = "%(levelname)s: [%(filename)s:%(lineno)s] %(message)s"
    logging.basicConfig(format=log_format, level=args.log_level)

    # Write the staging config
    config = extend_config(args, {})
    logger.debug("config: {}".format(json.dumps(config, indent=4)))
    with open(args.export_config, "w", encoding="utf8") as f:
        json.dump(config, f, indent=4)
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/codebuild/llm/buildspec.yml:
--------------------------------------------------------------------------------
version: 0.2

phases:
  install:
    on-failure: ABORT
    runtime-versions:
      python: 3.11
    commands:
      - python -m pip install --upgrade pip
      - pip install "sagemaker==2.163.0" --upgrade --quiet
  build:
    on-failure: ABORT
    commands:
      - ls -lah
      - env
      # Export the staging and production configuration files
      - python scripts/build.py --model-execution-role "${MODEL_EXECUTION_ROLE_ARN}" --s3-bucket "${S3_BUCKET}" --instance-type "${INSTANCE_TYPE}" --export-config "${EXPORT_CONFIG}" --region "${REGION}" --endpoint-name "${ENDPOINT_NAME}"
      - aws cloudformation package --template endpoint-config-template.yml --s3-bucket "$ARTIFACT_BUCKET" --output-template "$EXPORT_TEMPLATE_NAME"
      - cat "$EXPORT_TEMPLATE_NAME"
      - cat "$EXPORT_CONFIG"
artifacts:
  files:
    - $EXPORT_TEMPLATE_NAME
    - $EXPORT_CONFIG
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/codebuild/llm/endpoint-config-template.yml:
--------------------------------------------------------------------------------
Description: Template to deploy a Falcon 40B model to Amazon SageMaker
Parameters:
  ModelExecutionRoleArn:
    Type: String
    Description: Execution role used for deploying the model.
  EndpointInstanceType:
    Type: String
    Description: The ML compute instance type for the endpoint.
  EndpointName:
    Type: String
    Description: Name of the SageMaker endpoint hosting the LLM
  Image:
    Type: String
    Description: ECR image for the serving container
  Region:
    Type: String
    Description: AWS Region where to deploy

Resources:
  Model:
    Type: AWS::SageMaker::Model
    Properties:
      PrimaryContainer:
        Image: !Ref Image
        Mode: SingleModel
        Environment:
          HF_MODEL_ID: tiiuae/falcon-40b-instruct
          HF_MODEL_REVISION: 1e7fdcc9f45d13704f3826e99937917e007cd975
          MAX_INPUT_LENGTH: 1900
          MAX_TOTAL_TOKENS: 2048
          SAGEMAKER_CONTAINER_LOG_LEVEL: 20
          SAGEMAKER_REGION: !Ref Region
          SM_NUM_GPUS: 4
      EnableNetworkIsolation: false
      ExecutionRoleArn: !Ref ModelExecutionRoleArn

  EndpointConfig:
    Type: AWS::SageMaker::EndpointConfig
    Properties:
      ProductionVariants:
        - InitialInstanceCount: 1
          InitialVariantWeight: 1.0
          InstanceType: !Ref EndpointInstanceType
          ModelName: !GetAtt Model.ModelName
          VariantName: AllTraffic
          ContainerStartupHealthCheckTimeoutInSeconds: 600

  Endpoint:
    Type: AWS::SageMaker::Endpoint
    Properties:
      EndpointName: !Ref EndpointName
      EndpointConfigName: !GetAtt EndpointConfig.EndpointConfigName
      Tags:
        - Key: genie:friendly-name
          Value: Falcon 40B Instruct
        - Key: genie:prompt-rag
          Value: prompts/falcon_fine_tuned_greetings_rag.yaml
        - Key: genie:prompt-chat
          Value: prompts/falcon_chat.yaml
        - Key: genie:deployment
          Value: 'True'
--------------------------------------------------------------------------------
/00_llm_endpoint_setup/codebuild/llm/scripts/build.py:
--------------------------------------------------------------------------------
import argparse
import json
import logging
import os

import boto3
from sagemaker.huggingface import get_huggingface_llm_image_uri

logger = logging.getLogger(__name__)
sm_client = boto3.client("sagemaker")


def extend_config(args, stage_config):
    """
    Extend the stage configuration with additional parameters and tags.
    """
    # Create new params and tags
    new_params = {
        "ModelExecutionRoleArn": args.model_execution_role,
        "EndpointInstanceType": args.instance_type,
        "Image": get_huggingface_llm_image_uri("huggingface", version="0.8.2"),
        "Region": args.region,
        "EndpointName": args.endpoint_name,
    }

    return {
        "Parameters": {**new_params},
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log-level", type=str, default=os.environ.get("LOGLEVEL", "INFO").upper()
    )
    parser.add_argument("--model-execution-role", type=str, required=True)
    parser.add_argument("--instance-type", type=str, required=True)
    parser.add_argument("--export-config", type=str, required=True)
    parser.add_argument("--s3-bucket", type=str, required=True)
    parser.add_argument("--region", type=str, required=True)
    parser.add_argument("--endpoint-name", type=str, required=True)
    args, _ = parser.parse_known_args()

    # Configure logging to output the line number and message
    log_format = "%(levelname)s: [%(filename)s:%(lineno)s] %(message)s"
    logging.basicConfig(format=log_format, level=args.log_level)

    # Write the staging config
    config = extend_config(args, {})
    logger.debug("config: {}".format(json.dumps(config, indent=4)))
    with open(args.export_config, "w", encoding="utf8") as f:
        json.dump(config, f, indent=4)
--------------------------------------------------------------------------------
/01_crawler/README.md:
--------------------------------------------------------------------------------
# Crawler

This is a browser-based web crawler built on Scrapy and Playwright.

The main features are:

- Based on Scrapy and Playwright
- Supports dynamic page content
- Cleans up webpages using [mozilla/readability](https://github.com/mozilla/readability), the same library that powers the Firefox Reader View
- Can be easily extended or customized using Python together with Playwright or Scrapy
- Provides the full link structure of the crawl as well as the HTML version of each web page for later processing or analysis. This allows, for example, paragraph splitting based on the HTML structure (e.g., headers) and not only on plain text (see the sketch below).
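As an illustration, here is a minimal sketch of such header-based paragraph splitting. It assumes `beautifulsoup4` is installed; `html` and `text_content` stand in for the `content` and `textContent` fields the crawler emits for each page, and `split_by_headers` is a hypothetical helper name.

```python
import re

from bs4 import BeautifulSoup


def split_by_headers(html: str, text_content: str) -> list[str]:
    """Split the plain-text rendering of a page at the positions of its HTML headers."""
    soup = BeautifulSoup(html, features="html.parser")
    titles = [h.get_text(strip=True) for h in soup.find_all(re.compile(r"^h[1-6]$"))]

    # Locate each header title inside the plain text.
    cut_points, pos = [], 0
    for title in titles:
        found = text_content.find(title, pos)
        if found != -1:
            cut_points.append(found)
            pos = found + len(title)

    # The text between consecutive header positions forms one paragraph.
    bounds = sorted(set([0] + cut_points + [len(text_content)]))
    paragraphs = [text_content[a:b].strip() for a, b in zip(bounds, bounds[1:])]
    return [p for p in paragraphs if p]
```

The ingestion step applies the same idea to the crawler output (see `convert_paragraphs` in `02_ingestion/scripts/modules/embedding.py`).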

## Setting up the project

- Install Poetry ([docs](https://python-poetry.org/docs/)):

  ```bash
  curl -sSL https://install.python-poetry.org | python3 -
  ```

- Make sure to add Poetry to your environment [path](https://python-poetry.org/docs/#:~:text=Add%20Poetry%20to%20your%20PATH)

- Alternatively, you can use brew:

  ```bash
  brew install poetry
  ```

- Install the required packages:

  ```bash
  cd 01_crawler
  poetry install
  ```

- If you want to add a new package (optional):

  ```bash
  poetry add mypackage
  ```

- Enter the Poetry shell:

  ```bash
  poetry shell
  ```

- Install the browsers for Playwright (from the Poetry shell):

  ```bash
  playwright install-deps
  playwright install
  ```

## Running the crawler

- Give the crawler the name, the configuration, and the path for the output file. The output file should then be used in the **04_ingest_html_embeddings_to_opensearch.ipynb** notebook:

  ```bash
  # user has to be in poetry shell
  cd crawly
  scrapy crawl webpage -O ../web-content/admin_ch_press_releases_en.json -a filename=configs/admin-ch-press-releases-en.json
  aws s3 cp admin_ch_press_releases_en.json s3://gen-ai-foundation/crawlers/admin-ch/
  scrapy crawl webpage -O ../web-content/admin_ch_press_releases_de.json -a filename=configs/admin-ch-press-releases-de.json
  aws s3 cp admin_ch_press_releases_de.json s3://gen-ai-foundation/crawlers/admin-ch/
  ```

## Debugging the crawler

There might be cases where you need to add some custom actions to the crawler so that it can support specific webpages.
You can investigate the Playwright actions by starting the automation recorder:

```bash
npx playwright codegen admin.ch
```

- Set up your IDE (VS Code):

  ```json
  {
    "version": "0.2.0",
    "configurations": [
      {
        "name": "Python: Scrapy",
        "type": "python",
        "request": "launch",
        "python": "",
        "module": "scrapy",
        "args": [
          "crawl",
          "webpage",
          "-O",
          "../web-content/admin_ch_press_releases_en.json",
          "-a",
          "filename=configs/admin-ch-press-releases-en.json"
        ],
        "console": "integratedTerminal",
        "cwd": "${workspaceFolder}/01_crawler/crawly"
      }
    ]
  }
  ```

## Todo

- Download the files and update the ingestion notebook to consider them
--------------------------------------------------------------------------------
/01_crawler/buildspec.yml:
--------------------------------------------------------------------------------
version: 0.2

phases:
  install:
    on-failure: ABORT
    runtime-versions:
      python: 3.11
    commands:
      # install poetry
      - curl -sSL https://install.python-poetry.org | python3 -
      - export PATH="/root/.local/bin:$PATH"
      - poetry install
      - poetry run playwright install-deps
      - poetry run playwright install
  build:
    on-failure: ABORT
    commands:
      - cd crawly
      - poetry run scrapy crawl webpage -o admin_ch_press_releases-en.json -a filename=configs/admin-ch-press-releases-en.json
      - export S3_PREFIX=crawler_results/file.json
      - aws s3 cp admin_ch_press_releases-en.json "${S3_BUCKET}/${S3_PREFIX}"
  post_build:
    on-failure: ABORT
    commands:
      # upload customer .json file location to SSM
      - aws ssm put-parameter --name "${APP_PREFIX}CrawledFileLocation" --value "${S3_BUCKET}/${S3_PREFIX}" --type String --overwrite
--------------------------------------------------------------------------------
/01_crawler/crawly/configs/admin-ch-press-releases-de.json:
--------------------------------------------------------------------------------
{
  "start_urls": [
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=0",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=1",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=2",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=3",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=4",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=5",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=6",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=7",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=8",
    "https://www.admin.ch/gov/de/start/dokumentation/medienmitteilungen.html?dyn_pageIndex=9"
  ],
  "CRAWLER_DEPTH": 2,
  "ROBOTSTXT_OBEY": false,
  "custom_settings": {
    "LOG_LEVEL": "INFO",
    "PLAYWRIGHT_LAUNCH_OPTIONS": {
      "headless": true
    },
    "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    "CONCURRENT_REQUESTS": 10,
    "DOWNLOAD_DELAY": 10,
    "COOKIES_ENABLED": true,
    "PLAYWRIGHT_BROWSER_TYPE": ""
  },
  "whitelist_patterns": ["dokumentation/medienmitteilungen.msg"],
  "blacklist_patterns": ["#", "medienmitteilungen.html"],
  "file_extensions": [".pdf", ".docx", ".xlsx", ".csv"]
}
--------------------------------------------------------------------------------
/01_crawler/crawly/configs/admin-ch-press-releases-en.json:
--------------------------------------------------------------------------------
{
  "start_urls": [
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=0",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=1",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=2",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=3",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=4",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=5",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=6",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=7",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=8",
    "https://www.admin.ch/gov/en/start/documentation/media-releases.html?dyn_pageIndex=9"
  ],
  "CRAWLER_DEPTH": 2,
  "ROBOTSTXT_OBEY": false,
  "custom_settings": {
    "LOG_LEVEL": "INFO",
    "PLAYWRIGHT_LAUNCH_OPTIONS": {
      "headless": true
    },
    "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    "CONCURRENT_REQUESTS": 10,
    "DOWNLOAD_DELAY": 10,
    "COOKIES_ENABLED": true,
    "PLAYWRIGHT_BROWSER_TYPE": ""
  },
  "whitelist_patterns": ["documentation/media-releases.msg"],
  "blacklist_patterns": ["#", "media-releases.html"],
  "file_extensions": [".pdf", ".docx", ".xlsx", ".csv"]
}
--------------------------------------------------------------------------------
/01_crawler/crawly/custom_middlewares.py:
--------------------------------------------------------------------------------
from time import sleep

import crawly.settings as settings
from scrapy.downloadermiddlewares.retry import RetryMiddleware


class CustomRetryMiddleware(RetryMiddleware):
    def process_response(self, request, response, spider):
        if response.status in settings.RETRY_HTTP_CODES:
            # Check if the request has met the maximum retry times.
            if self._retry_times_exceeded(request):
                return response  # don't retry anymore

            # Add a custom delay (RETRY_DELAY seconds) between retries.
            sleep(settings.RETRY_DELAY)

            # Use the _retry method to get a new request for retry
            new_request = self._retry(
                request=request,
                reason=f"Retry on status {response.status}",
                spider=spider,
            )

            if new_request:
                new_request.priority = request.priority + self.priority_adjust
                return new_request

        return response  # return the original response if no retry needed

    def _retry_times_exceeded(self, request):
        # Check if the request has met or exceeded the maximum retry times.
        retry_times = request.meta.get("retry_times", 0)

        return retry_times >= self.max_retry_times
--------------------------------------------------------------------------------
/01_crawler/crawly/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WebItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    stock = scrapy.Field()
    tags = scrapy.Field()
    last_updated = scrapy.Field(serializer=str)
--------------------------------------------------------------------------------
/01_crawler/crawly/middlewares.py:
--------------------------------------------------------------------------------
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# useful for handling different item types with a single interface
# from itemadapter import ItemAdapter, is_item
from scrapy import signals


class WebchatSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class WebchatDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
--------------------------------------------------------------------------------
/01_crawler/crawly/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class WebchatPipeline:
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/01_crawler/crawly/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = crawly.settings

[deploy]
#url = http://localhost:6800/
project = crawly
--------------------------------------------------------------------------------
/01_crawler/crawly/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for webchat project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "crawly"

SPIDER_MODULES = ["crawly.spiders"]
NEWSPIDER_MODULE = "crawly.spiders"

RETRY = True
RETRY_TIMES = 10
RETRY_DELAY = 30
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 403]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "webchat (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# # file download settings
# ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
# FILES_STORE = '.tmp_file_downloads'


# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
    "custom_middlewares.CustomRetryMiddleware": 300,
    "scrapy_user_agents.middlewares.RandomUserAgentMiddleware": 400,
}
--------------------------------------------------------------------------------
/01_crawler/crawly/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/01_crawler/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "crawly"
version = "1.2.1"
description = ""
authors = ["Arlind Nocaj "]
readme = "README.md"


[tool.poetry.dependencies]
python = "^3.11"
scrapy-playwright = "^0.0.34"
scrapy = "^2.11.1"
scrapy-user-agents = "^0.1.1"
pandas = "^2.2.1"
itables = "^1.7.1"
playwright = "^1.42.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/02_ingestion/buildspec_admin_ch.yml:
--------------------------------------------------------------------------------
version: 0.2

phases:
  install:
    on-failure: ABORT
    runtime-versions:
      python: 3.11
    commands:
      - pip install pandas s3fs sagemaker --quiet
      - pip install transformers --quiet
      - pip install langchain==0.0.218 --quiet
      - pip install opensearch-py==2.2.0 --quiet
      - pip install beautifulsoup4 --quiet
      - pip install awswrangler[opensearch] --quiet
      - pip install requests_aws4auth --quiet
      - pip install jsonpath_ng --quiet
      - export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
  build:
    on-failure: ABORT
    commands:
      - python scripts/admin_ch_embedding.py
--------------------------------------------------------------------------------
/02_ingestion/buildspec_fin_analyzer.yml:
--------------------------------------------------------------------------------
version: 0.2

phases:
  install:
    on-failure: ABORT
    runtime-versions:
      python: 3.11
    commands:
      - pip install tqdm --quiet
      - pip install pandas s3fs sagemaker --quiet
      - pip install transformers --quiet
      - pip install langchain==0.0.218 --quiet
      - pip install opensearch-py==2.2.0 --quiet
      - pip install beautifulsoup4 --quiet
      - pip install awswrangler[opensearch] --quiet
      - pip install requests_aws4auth --quiet
      - pip install jsonpath_ng --quiet
      - pip install markdownify finnhub-python alpaca_trade_api defusedxml --quiet
      - export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
  build:
    on-failure: ABORT
    commands:
      - python scripts/fin_analyzer_data.py
      - python scripts/fin_analyzer_embedding.py
--------------------------------------------------------------------------------
/02_ingestion/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "02-ingestion"
version = "1.2.1"
description = ""
authors = ["Arlind Nocaj "]
readme = "README.md"
packages = [{ include = "02_ingestion" }]

[tool.poetry.dependencies]
python = ">=3.10, <3.11"
jupyter = "^1.0.0"
pandas = "^2.0.3"
datasets = "^2.14.4"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------
/02_ingestion/scripts/admin_ch_embedding.py:
-------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from modules.embedding import convert_paragraphs, generate_embeddings 4 | from modules.opensearch_helpers import opensearch_auth, embeddings_to_index 5 | from modules.aws_helpers import get_parameter_value 6 | 7 | app_prefix = os.getenv("APP_PREFIX") 8 | secret_name = os.getenv("OPENSEARCH_SECRET_NAME") 9 | os_index_name = os.getenv("OPENSEARCH_INDEX_NAME") 10 | os_domain_ep = get_parameter_value(f"{app_prefix}OpenSearchEndpoint") 11 | 12 | print(f"opensearch domain endpoint: {os_domain_ep}") 13 | 14 | # Get data 15 | crawled_file_path = get_parameter_value(f"{app_prefix}CrawledFileLocation") 16 | df = pd.read_json(crawled_file_path) 17 | 18 | paragraphs = df.apply(convert_paragraphs, axis=1) 19 | df["paragraphs"] = paragraphs.tolist() 20 | 21 | df.to_json("pages_with_paragraphs_clean_by_section.json") 22 | 23 | docs = generate_embeddings(df) 24 | os_http_auth = opensearch_auth(os_domain_ep, secret_name) 25 | 26 | if os.getenv("EMBEDDING_TYPE") == "Sagemaker": 27 | huggingface_config = { 28 | "predictor_endpoint_name": os.getenv('ENDPOINT_NAME') 29 | } 30 | embeddings_to_index(os_domain_ep, os_index_name, docs, os_http_auth, huggingface_config=huggingface_config) 31 | else: 32 | bedrock_config = { 33 | "region": os.getenv("BEDROCK_REGION"), 34 | "model_id": os.getenv("BEDROCK_EMBEDDING_MODEL") 35 | } 36 | embeddings_to_index(os_domain_ep, os_index_name, docs, os_http_auth, bedrock_config=bedrock_config) 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /02_ingestion/scripts/fin_analyzer_embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | from modules.aws_helpers import get_parameter_value 3 | import pandas as pd 4 | from langchain.schema import Document 5 | from modules.opensearch_helpers import opensearch_auth, embeddings_to_index 6 | 7 | 8 | # Parameter group Finance Analyzer 9 | s3_bucket = os.getenv("S3_BUCKET") 10 | s3_prefix = os.getenv("S3_PREFIX") 11 | 12 | app_prefix = os.getenv("APP_PREFIX") 13 | secret_name = os.getenv("OPENSEARCH_SECRET_NAME") 14 | os_index_name = os.getenv("OPENSEARCH_INDEX_NAME") 15 | os_domain_ep = get_parameter_value(f"{app_prefix}OpenSearchEndpoint") 16 | 17 | if not s3_bucket or not os_index_name: 18 | print("Fin Analyzer setup is incomplete.
Please provide the S3_BUCKET and OPENSEARCH_INDEX_NAME environment variables.") 19 | exit(0) 20 | 21 | os_http_auth = opensearch_auth(os_domain_ep, secret_name) 22 | 23 | print(f"opensearch domain endpoint: {os_domain_ep}") 24 | 25 | df = pd.read_json(f"s3://{s3_bucket}/{s3_prefix}/embedding_docs.json") 26 | docs = [Document(page_content=row["page_content"], metadata=row["metadata"]) for _, row in df.iterrows()] 27 | 28 | if os.getenv("EMBEDDING_TYPE") == "Sagemaker": 29 | huggingface_config = { 30 | "predictor_endpoint_name": os.getenv('ENDPOINT_NAME') 31 | } 32 | embeddings_to_index(os_domain_ep, os_index_name, docs, os_http_auth, huggingface_config=huggingface_config) 33 | else: 34 | bedrock_config = { 35 | "region": os.getenv("BEDROCK_REGION"), 36 | "model_id": os.getenv("BEDROCK_EMBEDDING_MODEL") 37 | } 38 | embeddings_to_index(os_domain_ep, os_index_name, docs, os_http_auth, bedrock_config=bedrock_config) 39 | -------------------------------------------------------------------------------- /02_ingestion/scripts/modules/aws_helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | from botocore.config import Config 4 | import os 5 | from io import StringIO 6 | import pandas as pd 7 | 8 | # boto3 clients shared by the ingestion scripts 9 | 10 | region = os.environ.get('AWS_REGION') 11 | if not region: 12 | region = "eu-west-1" 13 | 14 | config = Config(region_name=region) 15 | 16 | ssm_client = boto3.client("ssm", config=config) 17 | secmgr_client = boto3.client("secretsmanager", config=config) 18 | s3_client = boto3.client("s3", config=config) 19 | 20 | # Get a parameter value from SSM Parameter Store 21 | def get_parameter_value(parameter_name, decrypt=True): 22 | response = ssm_client.get_parameter(Name=parameter_name, WithDecryption=decrypt) 23 | return response["Parameter"]["Value"] 24 | 25 | # Get a secret from AWS Secrets Manager 26 | def get_credentials(secret_id: str) -> dict: 27 | response = secmgr_client.get_secret_value(SecretId=secret_id) 28 | secrets_value = json.loads(response["SecretString"]) 29 | return secrets_value 30 | 31 | # Load data from S3 into a pandas DataFrame 32 | def read_from_s3(bucket, key, format): 33 | obj = s3_client.get_object(Bucket=bucket, Key=key) 34 | data = obj['Body'].read().decode('utf-8') 35 | 36 | if format == "csv": 37 | return pd.read_csv(StringIO(data)) 38 | elif format == "json": 39 | return pd.read_json(StringIO(data)) 40 | raise ValueError(f"unsupported format: {format}") -------------------------------------------------------------------------------- /02_ingestion/scripts/modules/embedding.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | from langchain.schema import Document 4 | 5 | class CustomEmbeddings: 6 | def __init__(self, embeddings_predictor): 7 | self.embeddings_predictor = embeddings_predictor 8 | 9 | def embed_documents(self, input_texts): 10 | return self._embed_docs(input_texts) 11 | 12 | def embed_query(self, query_text): 13 | return self._embed_docs([query_text])[0] 14 | 15 | def _embed_docs(self, texts): 16 | data = { 17 | "texts": texts, 18 | } 19 | 20 | res = self.embeddings_predictor.predict(data=data) 21 | return res["vectors"] 22 | 23 | def convert_paragraphs(row): 24 | html = row["content"] 25 | textContent = row["textContent"] 26 | soup = BeautifulSoup(html, features="html.parser") 27 | sections = [h.text for h in soup.find_all(re.compile("^h[1-6]$"))] 28 | paragraphs = [] 29 | pos = 0 30 | for section in sections: 31 | split_pos = textContent.find(section, pos, len(textContent)) 32 | 
# skip headings whose text is not found verbatim (find() returns -1) 33 | if split_pos == -1: 34 | continue 35 | paragraphs.append(textContent[pos:split_pos]) 36 | pos = split_pos 37 | paragraphs.append(textContent[pos : len(textContent)]) 38 | 39 | paragraphs_clean = [p.strip() for p in paragraphs if len(p.strip()) > 0] 40 | return paragraphs_clean 41 | 42 | def generate_embeddings(df): 43 | docs = [] 44 | for _, row in df.iterrows(): 45 | for paragraph in row["paragraphs"]: 46 | meta = {"source": row["source"], "title": row["title"]} 47 | doc = Document(page_content=paragraph, metadata=meta) 48 | docs.append(doc) 49 | return docs -------------------------------------------------------------------------------- /02_ingestion/scripts/modules/opensearch_helpers.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import awswrangler as wr 3 | from langchain.vectorstores import OpenSearchVectorSearch 4 | from sagemaker.huggingface.model import HuggingFacePredictor 5 | from langchain.embeddings import BedrockEmbeddings 6 | from opensearchpy import RequestsHttpConnection, AWSV4SignerAuth 7 | from modules.embedding import CustomEmbeddings 8 | from modules.aws_helpers import get_credentials 9 | from tqdm import tqdm 10 | from itertools import islice 11 | 12 | # opensearch authentication 13 | def opensearch_auth(os_domain_ep, secret_name=None): 14 | # checking for an opensearch serverless endpoint 15 | if ".aoss." in os_domain_ep: 16 | boto3_session = boto3.Session() 17 | 18 | credentials = boto3_session.get_credentials() 19 | return AWSV4SignerAuth(credentials, boto3_session.region_name, "aoss") 20 | else: 21 | credentials = get_credentials(secret_name) 22 | user = credentials["user"] 23 | secret = credentials["password"] 24 | return (user, secret) 25 | 26 | def chunked_iterable(iterable, size): 27 | iterator = iter(iterable) 28 | for first in iterator: 29 | yield [first] + list(islice(iterator, size - 1)) 30 | 31 | def embeddings_to_index(os_domain_ep, os_index_name, docs, os_http_auth, bedrock_config=None, huggingface_config=None): 32 | # Checking the authentication method 33 | if hasattr(os_http_auth, "service"): 34 | user = None 35 | secret = None 36 | else: 37 | user, secret = os_http_auth 38 | 39 | try: 40 | client = wr.opensearch.connect(host=os_domain_ep.replace(":443", ""), username=user, password=secret) 41 | wr.opensearch.delete_index(client=client, index=os_index_name) 42 | print(f"index {os_index_name} is deleted") 43 | except Exception as err: 44 | print(f"failed to delete index {os_index_name} with error: {err}") 45 | 46 | if huggingface_config: 47 | # HuggingFace custom predictor on a SageMaker endpoint 48 | predictor = HuggingFacePredictor(endpoint_name=huggingface_config["predictor_endpoint_name"]) 49 | embeddings = CustomEmbeddings(predictor) 50 | embedding_provider = "huggingface" 51 | elif bedrock_config: 52 | # Bedrock embeddings 53 | bedrock_client = boto3.client("bedrock-runtime", region_name=bedrock_config["region"]) 54 | embeddings = BedrockEmbeddings( 55 | client=bedrock_client, 56 | model_id=bedrock_config["model_id"]) 57 | embedding_provider = "bedrock" 58 | else: 59 | raise ValueError("Either bedrock_config or huggingface_config is required") 60 | 61 | docsearch = OpenSearchVectorSearch( 62 | index_name=os_index_name, 63 | embedding_function=embeddings, 64 | opensearch_url=os_domain_ep, 65 | 66 | http_auth=os_http_auth, 67 | timeout=300, 68 | connection_class=RequestsHttpConnection, 69 | use_ssl=True, 70 | verify_certs=True, 71 | ) 72 | print(f"embedding {len(docs)} documents using {embedding_provider}") 73 | 74 | # Calculate the total number of 
batches 75 | batch_size = 10 76 | num_batches = (len(docs) + batch_size - 1) // batch_size 77 | 78 | # Assuming docs is your list of documents 79 | for batch in tqdm(chunked_iterable(docs, batch_size), total=num_batches): 80 | docsearch.add_documents(documents=batch) 81 | 82 | -------------------------------------------------------------------------------- /03_chatbot/.dockerignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/03_chatbot/.dockerignore -------------------------------------------------------------------------------- /03_chatbot/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | ARG LISTEN_PORT=8001 3 | 4 | FROM public.ecr.aws/docker/library/python:3.10-slim AS poetry_base 5 | 6 | 7 | ENV PIP_NO_CACHE_DIR=off \ 8 | PIP_DISABLE_PIP_VERSION_CHECK=on \ 9 | PIP_DEFAULT_TIMEOUT=100 \ 10 | POETRY_VERSION=1.5.1 \ 11 | POETRY_HOME="/opt/poetry" 12 | 13 | ENV PATH="$POETRY_HOME/bin:$PATH" 14 | 15 | # System deps: 16 | RUN apt-get update \ 17 | && apt-get install --no-install-recommends -y \ 18 | curl \ 19 | build-essential \ 20 | wget \ 21 | unzip \ 22 | && apt-get clean 23 | 24 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 25 | # Install Poetry - respects $POETRY_VERSION & $POETRY_HOME 26 | RUN curl -sSL https://install.python-poetry.org | python3 - 27 | 28 | RUN apt-get purge --auto-remove -y \ 29 | build-essential 30 | 31 | FROM poetry_base 32 | ARG LISTEN_PORT 33 | ENV YOUR_ENV="production" 34 | 35 | ENV APP_DIR="/code" 36 | 37 | ENV YOUR_ENV=${YOUR_ENV} \ 38 | PYTHONFAULTHANDLER=1 \ 39 | PYTHONUNBUFFERED=1 \ 40 | PYTHONDONTWRITEBYTECODE=1 \ 41 | PYTHONHASHSEED=random \ 42 | APP_DIR=${APP_DIR} \ 43 | POETRY_VIRTUALENVS_IN_PROJECT=true \ 44 | POETRY_CACHE_DIR=".cache" \ 45 | VIRTUAL_ENVIRONMENT_PATH="${APP_DIR}/.venv" \ 46 | LISTEN_PORT=${LISTEN_PORT} \ 47 | STREAMLIT_CONFIG_DIR=".streamlit" 48 | 49 | 50 | 51 | # Adding the virtual environment to PATH in order to "activate" it. 52 | # https://docs.python.org/3/library/venv.html#how-venvs-work 53 | ENV PATH="$VIRTUAL_ENVIRONMENT_PATH/bin:$PATH" 54 | 55 | # Principle of least privilege: create a new user for running the application 56 | RUN groupadd docker 57 | RUN useradd -r -g docker python_application 58 | 59 | # Copy only requirements to cache them in docker layer 60 | WORKDIR $APP_DIR 61 | 62 | RUN chown python_application:docker $APP_DIR 63 | 64 | # Create cache directory and set permissions because user has no home 65 | # and poetry cache directory. 
66 | # https://python-poetry.org/docs/configuration/#cache-directory 67 | RUN mkdir ${POETRY_CACHE_DIR} && chown python_application:docker ${POETRY_CACHE_DIR} 68 | 69 | # Allow execution and read by user and group 70 | COPY --chown=python_application:docker --chmod=550 entrypoint.sh ./ 71 | COPY --chown=python_application:docker --chmod=550 generate_internationalization.sh ./ 72 | COPY --chown=python_application:docker --chmod=550 generate_secrets.py ./ 73 | 74 | COPY --chown=python_application:docker poetry.lock pyproject.toml ./ 75 | 76 | # activate after adding bedrock sdk 77 | RUN mkdir ${STREAMLIT_CONFIG_DIR} && chown python_application:docker ${STREAMLIT_CONFIG_DIR} 78 | 79 | # Project initialization: 80 | RUN poetry install --no-interaction --no-ansi --no-root 81 | 82 | # copy source code files 83 | COPY --chown=python_application:docker src ./src/ 84 | 85 | RUN ./generate_internationalization.sh 86 | 87 | HEALTHCHECK CMD curl --fail http://localhost:${LISTEN_PORT}/_stcore/health 88 | 89 | USER python_application 90 | EXPOSE ${LISTEN_PORT} 91 | 92 | ENTRYPOINT ["./entrypoint.sh"] 93 | 94 | -------------------------------------------------------------------------------- /03_chatbot/bin/icons/bug.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/03_chatbot/bin/icons/bug.png -------------------------------------------------------------------------------- /03_chatbot/bin/icons/bug.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /03_chatbot/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | # generate .streamlit/secrets.toml before the app starts so streamlit can read it 4 | poetry run python generate_secrets.py 5 | exec poetry run streamlit run src/run_module.py --server.enableCORS true --server.port $LISTEN_PORT --browser.serverPort $LISTEN_PORT --browser.gatherUsageStats false -------------------------------------------------------------------------------- /03_chatbot/example_app_configs/bedrock_endpoint.appconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "./json_schema/aws_awsomechat_app_config.schema.json", 3 | "appearance": { 4 | "type": "AWSomeChatAppearance", 5 | "parameters": { 6 | "name": "My Chatbot", 7 | "faviconUrl": "https://a0.awsstatic.com/libra-css/images/logos/aws_smile-header-desktop-en-white_59x35@2x.png" 8 | } 9 | }, 10 | "amazonBedrock": [ 11 | { 12 | "type": "AmazonBedrock", 13 | "parameters": { 14 | "region": "us-east-1", 15 | "endpointURL": "https://bedrock.us-east-1.amazonaws.com" 16 | } 17 | } 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- /03_chatbot/example_app_configs/bedrock_iam.appconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "./json_schema/aws_awsomechat_app_config.schema.json", 3 | "appearance": { 4 | "type": "AWSomeChatAppearance", 5 | "parameters": { 6 | "name": "My Chatbot", 7 | "faviconUrl": "https://a0.awsstatic.com/libra-css/images/logos/aws_smile-header-desktop-en-white_59x35@2x.png" 8 | } 9 | }, 10 | "amazonBedrock": [ 11 | { 12 | "type": "AmazonBedrock", 13 | "parameters": { 14 | "region": "us-east-1", 15 | "iam": { 16 | "type": "BotoIAM", 17 | "parameters": { 18 | "profile": "" 19 | } 20 | } 21 | } 22 | },
23 | { 24 | "type": "AmazonBedrock", 25 | "parameters": { 26 | "region": "us-east-1", 27 | "endpointURL": "https://bedrock.us-east-1.amazonaws.com", 28 | "iam": { 29 | "type": "BotoIAM", 30 | "parameters": { 31 | "roleARN": "arn:aws:iam:::role/" 32 | } 33 | } 34 | } 35 | } 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /03_chatbot/example_app_configs/bedrock_multi_region.appconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "./json_schema/aws_awsomechat_app_config.schema.json", 3 | "appearance": { 4 | "type": "AWSomeChatAppearance", 5 | "parameters": { 6 | "name": "My Chatbot", 7 | "faviconUrl": "https://a0.awsstatic.com/libra-css/images/logos/aws_smile-header-desktop-en-white_59x35@2x.png" 8 | } 9 | }, 10 | "amazonBedrock": [ 11 | { 12 | "type": "AmazonBedrock", 13 | "parameters": { 14 | "region": "us-east-1" 15 | } 16 | }, 17 | { 18 | "type": "AmazonBedrock", 19 | "parameters": { 20 | "region": "us-west-2" 21 | } 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /03_chatbot/example_app_configs/bedrock_prompts.appconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "json_schema/aws_awsomechat_app_config.schema.json", 3 | "appearance": { 4 | "type": "AWSomeChatAppearance", 5 | "parameters": { 6 | "name": "My Chatbot", 7 | "faviconUrl": "https://a0.awsstatic.com/libra-css/images/logos/aws_smile-header-desktop-en-white_59x35@2x.png" 8 | } 9 | }, 10 | "llmConfig": { 11 | "parameters": { 12 | "anthropic.claude*": { 13 | "type": "LLMConfig", 14 | "parameters": { 15 | "chatPrompt": "prompts/anthropic_claude_chat.yaml", 16 | "ragPrompt": "prompts/anthropic_claude_rag.yaml", 17 | "temperature": 0.5 18 | } 19 | } 20 | }, 21 | "type": "LLMConfigMap" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /03_chatbot/generate_internationalization.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | exec poetry run pybabel compile -d "./src/chatbot/i18n" --domain=chatbot 4 | -------------------------------------------------------------------------------- /03_chatbot/generate_secrets.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import toml 4 | 5 | with open("./.streamlit/secrets.toml", "w", encoding="utf8") as file: 6 | data = {"passwords": {os.environ["USERNAME"]: os.environ["PASSWORD"]}} 7 | toml.dump(data, file) 8 | -------------------------------------------------------------------------------- /03_chatbot/images/Amazon SageMaker endpoint tags dynamic discovery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/03_chatbot/images/Amazon SageMaker endpoint tags dynamic discovery.png -------------------------------------------------------------------------------- /03_chatbot/images/Genie_LLM_App_chatbot_code_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/03_chatbot/images/Genie_LLM_App_chatbot_code_flow.png -------------------------------------------------------------------------------- /03_chatbot/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "chatbot" 3 | version = "1.3.6" 4 | description = "Genie Chatbot" 5 | authors = [ 6 | "Arlind Nocaj ", 7 | "Luca Perrozzi ", 8 | "Malte Reimann ", 9 | "Paolo Di Francesco ", 10 | "Tasio Guevara ", 11 | "Mikael Mutafyan " 12 | ] 13 | readme = "README.md" 14 | 15 | [tool.poetry.dependencies] 16 | python = ">=3.10, <3.13" 17 | streamlit = "^1.32.0" 18 | streamlit-chat = "^0.1.1" 19 | streamlit-extras = "^0.4.0" 20 | langchain = "0.1.13" 21 | opensearch-py = "^2.4.2" 22 | sagemaker = "^2.212.0" 23 | babel = "^2.14.0" 24 | ansi2html = "^1.9.1" 25 | toml = "^0.10.2" 26 | sample-helper-aws-appconfig = "^2.1.0" 27 | boto3 = "^1.34.58" 28 | botocore = "^1.34.58" 29 | plotnine = "^0.13.1" 30 | tabulate = "^0.9.0" 31 | duckduckgo-search = "^6.1.0" 32 | yfinance = '^0.2.37' 33 | pandas_datareader = '^0.10.0' 34 | beautifulsoup4 = '^4.12.3' 35 | psycopg2-binary = '2.9.9' 36 | pymysql = '1.1.0' 37 | pypdf = '4.1.0' 38 | amazon-textract-caller = '0.2.4' 39 | amazon-textract-response-parser = '1.0.3' 40 | textractor = '^0.1.2' 41 | amazon-textract-textractor = '1.8.2' 42 | statsmodels = "0.14.1" 43 | watchdog = "^4.0.0" 44 | langchain-community = "^0.0.29" 45 | google-search-results = "^2.4.2" 46 | editdistance = '0.8.1' 47 | 48 | [tool.poetry.group.dev.dependencies] 49 | pytest-playwright = "^0.4.3" 50 | pytest-html = "^4.1.1" 51 | unicode-slugify = "^0.1.5" 52 | 53 | [build-system] 54 | requires = ["poetry-core"] 55 | build-backend = "poetry.core.masonry.api" 56 | 57 | [tool.pytest.ini_options] 58 | minversion = "6.2.5" 59 | addopts = "-rA --disable-warnings --browser chromium --tracing on --output test-results --headed" 60 | testpaths = ["tests"] 61 | python_files = ["test_*.py"] 62 | -------------------------------------------------------------------------------- /03_chatbot/scripts/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the AWS account ID 4 | aws_account_id=$(aws sts get-caller-identity --query Account --output text) 5 | aws_region=$(aws configure get region) 6 | 7 | echo "AccountId = ${aws_account_id}" 8 | echo "Region = ${aws_region}" 9 | 10 | # Create a new ECR repository 11 | echo "Creating ECR Repository..." 12 | aws ecr create-repository --repository-name rag-chatbot 13 | 14 | # Get the login command for the new repository 15 | echo "Logging into the repository..." 16 | #$(aws ecr get-login --no-include-email) 17 | # aws ecr get-login-password --region ${aws_region} | docker login --username AWS --password-stdin ${aws_account_id}.dkr.ecr.${aws_region}.amazonaws.com 18 | 19 | # Build and push the Docker image and tag it to the repository created above, in the configured region 20 | echo "Building and pushing Docker image..." 21 | sm-docker build -t "${aws_account_id}.dkr.ecr.${aws_region}.amazonaws.com/rag-chatbot:latest" --repository rag-chatbot:latest .
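# Optional local smoke test (a hedged, untested sketch -- adjust values to your environment). # LISTEN_PORT matches the Dockerfile default, and USERNAME/PASSWORD are the variables read by generate_secrets.py. # docker run -p 8001:8001 -e LISTEN_PORT=8001 -e USERNAME=admin -e PASSWORD=change-me "${aws_account_id}.dkr.ecr.${aws_region}.amazonaws.com/rag-chatbot:latest"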
22 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/03_chatbot/src/chatbot/__init__.py -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from chatbot.helpers import ChatbotEnvironment 4 | from chatbot.ui import write_chatbot 5 | 6 | dirname = os.path.dirname(__file__) 7 | 8 | environment = ChatbotEnvironment() 9 | write_chatbot(dirname, environment) 10 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/__init__.py: -------------------------------------------------------------------------------- 1 | from .catalog import Catalog, CatalogById, CatalogItem # noqa 2 | from .catalog_item import CatalogItem # noqa 3 | 4 | from .memory_catalog_item_dynamodb_table import ( # noqa 5 | DynamoDBTableMemoryItem, 6 | MemoryCatalogItem, 7 | ) 8 | from .memory_catalog import DynamoDBTableMemoryItem, MemoryCatalog 9 | from .memory_catalog_item import CatalogItem, MemoryCatalogItem 10 | 11 | from .flow_catalog_item_simple_chat import SimpleChatFlowItem, SIMPLE_CHATBOT 12 | from .flow_catalog_item_upload_file import DocUploadItem, UPLOAD_DOCUMENT_SEARCH 13 | from .flow_catalog_item_rag import RagItem, RETRIEVAL_AUGMENTED_GENERATION 14 | from .flow_catalog_item_agent import AgentsItem, AGENT_NAME 15 | 16 | from .flow_catalog import ( 17 | Catalog, 18 | FlowCatalog, 19 | SimpleChatFlowItem, 20 | DocUploadItem, 21 | RagItem, 22 | AgentsItem, 23 | ) 24 | from .flow_catalog_item import ( 25 | CatalogItem, 26 | FlowCatalogItem, 27 | ) 28 | 29 | from .agent_chain_catalog_item import AgentChainCatalogItem 30 | from .agent_chain_catalog_item_sql_generator import SqlGeneratorAgentChainItem 31 | from .agent_chain_catalog_item_financial_analysis import FinancialAnalysisAgentChainItem 32 | from .agent_chain_catalog import ( 33 | Catalog, 34 | AgentChainCatalog, 35 | FinancialAnalysisAgentChainItem, 36 | SqlGeneratorAgentChainItem, 37 | ) 38 | 39 | from .retriever_catalog_item_kendra import KendraRetrieverItem, RetrieverCatalogItem 40 | from .retriever_catalog_item_open_search import OpenSearchRetrieverItem, RetrieverCatalogItem 41 | from .retriever_catalog import ( 42 | Catalog, 43 | RetrieverCatalog, 44 | KendraRetrieverItem, 45 | OpenSearchRetrieverItem, 46 | ) 47 | from .retriever_catalog_item import ( 48 | CatalogItem, 49 | RetrieverCatalogItem, 50 | ) 51 | 52 | from .model_catalog import BedrockModelItem, Catalog, ModelCatalog, SageMakerModelItem 53 | from .model_catalog_item import CatalogItem, ModelCatalogItem 54 | from .model_catalog_item_bedrock import BedrockModelItem, ModelCatalogItem 55 | from .model_catalog_item_sagemaker import ModelCatalogItem, SageMakerModelItem 56 | 57 | from .prompt_catalog import CatalogById, PromptCatalog, PromptCatalogItem # noqa 58 | from .prompt_catalog_item import CatalogItem, PromptCatalogItem 59 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/agent_chain_catalog.py: -------------------------------------------------------------------------------- 1 | """ Module that contains an agent chains catalog. 
2 | """ 3 | import logging 4 | import time 5 | from dataclasses import dataclass 6 | from logging import Logger, getLogger 7 | from typing import List 8 | 9 | from .catalog import FRIENDLY_NAME_TAG, Catalog 10 | from .agent_chain_catalog_item_financial_analysis import FinancialAnalysisAgentChainItem 11 | from .agent_chain_catalog_item_sql_generator import SqlGeneratorAgentChainItem 12 | 13 | 14 | @dataclass 15 | class AgentChainCatalog(Catalog): 16 | """Catalog to get agent chains.""" 17 | 18 | regions: List[str] 19 | logger: Logger 20 | 21 | def __init__( 22 | self, 23 | regions: list, 24 | logger: Logger = getLogger("AgentChainCatalogLogger"), 25 | ) -> None: 26 | self.regions = regions 27 | self.logger = logger 28 | super().__init__() 29 | 30 | def _get_agent_chain_financial_analysis(self): 31 | """Get agent chain for financial analysis""" 32 | 33 | start_time = time.time() 34 | self.logger.info("Retrieving Agent Chain for financial analysis...") 35 | 36 | _var = FinancialAnalysisAgentChainItem() 37 | self += [_var] 38 | 39 | self.logger.info( 40 | "%s Financial Analysis Agent Chain retrieved in %s seconds", 41 | len(self), 42 | time.time() - start_time, 43 | ) 44 | 45 | def _get_agent_chain_sql_generator(self): 46 | """Get agent chain for SQL generation""" 47 | 48 | start_time = time.time() 49 | self.logger.info("Retrieving Agent Chain for SQL generation...") 50 | 51 | _var = SqlGeneratorAgentChainItem() 52 | self += [_var] 53 | 54 | self.logger.info( 55 | "%s SQL generation Agent Chain retrieved in %s seconds", 56 | len(self), 57 | time.time() - start_time, 58 | ) 59 | 60 | 61 | def bootstrap(self) -> None: 62 | """Bootstraps the catalog.""" 63 | self._get_agent_chain_financial_analysis() 64 | self._get_agent_chain_sql_generator() 65 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/agent_chain_catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Module that contains an abstract base class that represents an agent chain item. 2 | """ 3 | from dataclasses import dataclass 4 | from typing import Any, List, Tuple, Union 5 | 6 | from .catalog_item import CatalogItem 7 | from langchain.tools import BaseTool 8 | 9 | 10 | @dataclass 11 | class AgentChainCatalogItem(CatalogItem[BaseTool]): 12 | """Abstract base class that renpresents an agent chain catalog item.""" 13 | 14 | @property 15 | def available_filter_options(self) -> Union[List[Tuple[str, Any]], None]: 16 | return None 17 | 18 | @property 19 | def current_filter(self) -> List[Tuple[str, Any]]: 20 | return [] 21 | 22 | @current_filter.setter 23 | def current_filter(self, value: List[Tuple[str, Any]]): 24 | pass 25 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/agent_chain_catalog_item_financial_analysis.py: -------------------------------------------------------------------------------- 1 | """ Module that contains an abstract base class that represents an agent chain item. 
2 | """ 3 | from dataclasses import dataclass 4 | from langchain.tools import BaseTool 5 | from .agent_chain_catalog_item import AgentChainCatalogItem 6 | import os 7 | from chatbot.helpers import ChatbotEnvironment, ChatbotEnvironmentVariables 8 | 9 | AGENT_CHAIN_FINANCIAL_ANALYSIS_NAME = "Financial Analysis" 10 | SERPAPI_API_KEY = ChatbotEnvironment().get_env_variable(ChatbotEnvironmentVariables.SERPAPI_API_KEY) 11 | 12 | @dataclass 13 | class FinancialAnalysisAgentChainItem(AgentChainCatalogItem): 14 | """Abstract base class that represents an agent chain for financial analysis.""" 15 | 16 | def __init__(self): 17 | super().__init__(AGENT_CHAIN_FINANCIAL_ANALYSIS_NAME) 18 | 19 | def get_prompt_path(self) -> str: 20 | return 'prompts/anthropic_claude_agent_financial_analyzer.yaml' 21 | 22 | def get_instance(self) -> BaseTool: 23 | from .agent_tools_catalog_item import get_stock_price, get_recent_stock_news, get_financial_statements 24 | from langchain.utilities import DuckDuckGoSearchAPIWrapper 25 | from langchain.utilities import SerpAPIWrapper 26 | from langchain.agents import Tool 27 | 28 | if SERPAPI_API_KEY == '': 29 | search = DuckDuckGoSearchAPIWrapper() 30 | else: 31 | search = SerpAPIWrapper() 32 | 33 | agent_chain = [ 34 | Tool( 35 | name="get stock data", 36 | func=get_stock_price, 37 | description="Use when you are asked to evaluate or analyze a stock. This will output historic share price data for the last 60 days. You should input the stock ticker to it" 38 | ), 39 | Tool( 40 | name="Search", 41 | func=search.run, 42 | description="Use this to fetch the stock ticker, you can also get recent stock related news. Dont use it for any other analysis or task" 43 | ), 44 | Tool( 45 | name="get recent news", 46 | func=get_recent_stock_news, 47 | description="Use this to fetch recent news about stocks" 48 | ), 49 | Tool( 50 | name="get financial statements", 51 | func=get_financial_statements, 52 | description="Use this to get financial statement of the company. With the help of this data companys historic performance can be evaluated. You should input stock ticker to it" 53 | ) 54 | ] 55 | 56 | return agent_chain 57 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/agent_chain_catalog_item_sql_generator.py: -------------------------------------------------------------------------------- 1 | """ Module that contains an abstract base class that represents an agent chain item. 2 | """ 3 | from dataclasses import dataclass 4 | from langchain.tools import BaseTool 5 | from .agent_chain_catalog_item import AgentChainCatalogItem 6 | 7 | AGENT_CHAIN_SQL_GENERATOR_NAME = "SQL query generator" 8 | 9 | 10 | @dataclass 11 | class SqlGeneratorAgentChainItem(AgentChainCatalogItem): 12 | """Abstract base class that represents an agent chain for generating SQL.""" 13 | 14 | def __init__(self): 15 | super().__init__(AGENT_CHAIN_SQL_GENERATOR_NAME) 16 | 17 | def get_prompt_path(self) -> str: 18 | return 'prompts/anthropic_claude_agent_sql.yaml' 19 | 20 | def get_instance(self) -> BaseTool: 21 | return None 22 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/agent_tools_catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Module that contains tools for agents. 
""" 2 | 3 | from pandas_datareader import data as pdr 4 | import yfinance as yf 5 | from bs4 import BeautifulSoup 6 | import re 7 | import requests 8 | from datetime import date 9 | from datetime import timedelta 10 | def get_stock_price(ticker, history=60): 11 | yf.pdr_override() 12 | today = date.today() 13 | start_date = today - timedelta(days=history) 14 | data = pdr.get_data_yahoo(ticker, start=start_date, end=today) 15 | dataname = ticker + '_' + str(today) 16 | return data, dataname 17 | 18 | # Fetch top 5 google news for given company name 19 | def google_query(search_term): 20 | if "news" not in search_term: 21 | search_term = search_term + " stock news" 22 | url = f"https://www.google.com/search?q={search_term}" 23 | url = re.sub(r"\s", "+", url) 24 | return url 25 | 26 | def get_recent_stock_news(company_name): 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36' 29 | } 30 | 31 | g_query = google_query(company_name) 32 | res = requests.get(g_query, headers=headers, timeout=1.0) 33 | res.raise_for_status() 34 | 35 | soup = BeautifulSoup(res.text, "html.parser") 36 | news = [] 37 | for n in soup.find_all("div", "n0jPhd ynAwRc tNxQIb nDgy9d"): 38 | news.append(n.text) 39 | for n in soup.find_all("div", "IJl0Z"): 40 | news.append(n.text) 41 | 42 | if len(news) > 6: 43 | news = news[:4] 44 | else: 45 | news = news 46 | news_string = "" 47 | for i, n in enumerate(news): 48 | news_string += f"{i}. {n}\n" 49 | top5_news = "Recent News:\n\n" + news_string 50 | 51 | return top5_news 52 | 53 | # Get financial statements from Yahoo Finance 54 | def get_financial_statements(ticker): 55 | if "." in ticker: 56 | ticker = ticker.split(".")[0] 57 | else: 58 | ticker = ticker 59 | company = yf.Ticker(ticker) 60 | balance_sheet = company.balance_sheet 61 | if balance_sheet.shape[1] >= 3: 62 | balance_sheet = balance_sheet.iloc[:, :3] # Only captures last 3 years of data 63 | balance_sheet = balance_sheet.dropna(how="any") 64 | balance_sheet = balance_sheet.to_string() 65 | return balance_sheet 66 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/catalog.py: -------------------------------------------------------------------------------- 1 | """ Module for catalogs that contain base items. """ 2 | from abc import ABC, abstractmethod 3 | from dataclasses import dataclass 4 | from typing import Dict, List, TypeVar 5 | 6 | from .catalog_item import CatalogItem 7 | 8 | T = TypeVar("T", bound=CatalogItem) 9 | 10 | FRIENDLY_NAME_TAG = "genie:friendly-name" 11 | 12 | 13 | @dataclass 14 | class Catalog(ABC, List[T]): 15 | """Base class for catalogs.""" 16 | 17 | regions: List[str] 18 | """ List of regions where the catalog looks for resources. """ 19 | 20 | def __init__(self) -> None: 21 | super().__init__() 22 | 23 | @abstractmethod 24 | def bootstrap(self) -> None: 25 | """ 26 | Bootstraps the catalog. 27 | """ 28 | 29 | def get_friendly_names(self) -> List[str]: 30 | """A friendly name is a human readable name that can be used in to represent an item. 
31 | 32 | Returns: 33 | A list of friendly names for the catalog 34 | """ 35 | return [item.friendly_name for item in self] 36 | 37 | 38 | @dataclass 39 | class CatalogById(ABC, Dict[str, T]): 40 | """Base class for catalogs that store catalog items by id.""" 41 | 42 | def __getitem__(self, __key: str) -> T: 43 | item = super().get(__key, None) 44 | if item is None: 45 | item = self._retrieve(__key) 46 | self[__key] = item 47 | return item 48 | 49 | @abstractmethod 50 | def _retrieve(self, key: str) -> T: 51 | """Retrieves an item that does not exist in the catalog.""" 52 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Abstract base class that represents a catalog item. """ 2 | 3 | from abc import ABC, abstractmethod 4 | from dataclasses import dataclass 5 | from typing import Generic, TypeVar 6 | 7 | T = TypeVar("T") 8 | 9 | 10 | @dataclass 11 | class CatalogItem(ABC, Generic[T]): 12 | """Abstract base class that represents a catalog item.""" 13 | 14 | friendly_name: str 15 | 16 | @abstractmethod 17 | def get_instance(self) -> T: 18 | """Returns an instance of the item.""" 19 | 20 | def __str__(self): 21 | return self.friendly_name 22 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/flow_catalog.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a flow catalog. 2 | """ 3 | import logging 4 | import time 5 | from dataclasses import dataclass 6 | from logging import Logger, getLogger 7 | from operator import itemgetter 8 | from typing import List 9 | 10 | import boto3 11 | import botocore 12 | 13 | from .catalog import FRIENDLY_NAME_TAG, Catalog 14 | from .flow_catalog_item_simple_chat import SimpleChatFlowItem 15 | from .flow_catalog_item_upload_file import DocUploadItem 16 | from .flow_catalog_item_rag import RagItem 17 | from .flow_catalog_item_agent import AgentsItem 18 | 19 | @dataclass 20 | class FlowCatalog(Catalog): 21 | """Catalog to get flows.""" 22 | 23 | regions: List[str] 24 | 25 | account_id: str 26 | 27 | logger: Logger 28 | 29 | def __init__( 30 | self, 31 | account_id, 32 | regions: list, 33 | logger: Logger = getLogger("FlowCatalogLogger"), 34 | ) -> None: 35 | self.regions = regions 36 | self.account_id = account_id 37 | self.logger = logger 38 | super().__init__() 39 | 40 | def _add_simple_chat_flow_option(self) -> None: 41 | self.append(SimpleChatFlowItem()) 42 | 43 | def _add_doc_upload_flow_option(self) -> None: 44 | self.append(DocUploadItem()) 45 | 46 | def _add_rag_option(self) -> None: 47 | self.append(RagItem()) 48 | 49 | def _add_agent_option(self) -> None: 50 | self.append(AgentsItem()) 51 | 52 | 53 | 54 | def bootstrap(self) -> None: 55 | """Bootstraps the catalog.""" 56 | self._add_simple_chat_flow_option() 57 | self._add_doc_upload_flow_option() 58 | self._add_rag_option() 59 | self._add_agent_option() 60 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/flow_catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Module that contains an abstract base class that represents a flow catalog item. 
2 | """ 3 | from dataclasses import dataclass 4 | from typing import Any, List, Tuple, Union 5 | 6 | from chatbot.llm_app import BaseLLMApp, LLMApp, RAGApp 7 | from langchain.chains.base import Chain 8 | from abc import ABC 9 | from abc import abstractmethod 10 | 11 | from .retriever_catalog_item import RetrieverCatalogItem 12 | from .catalog import CatalogById 13 | from .catalog_item import CatalogItem 14 | from .model_catalog_item import ModelCatalogItem 15 | from .agent_chain_catalog_item import AgentChainCatalogItem 16 | 17 | @dataclass 18 | class FlowCatalogItem(CatalogItem[Chain], ABC): 19 | """Abstract base class that represents a retriever catalog item.""" 20 | 21 | @property 22 | def available_filter_options(self) -> Union[List[Tuple[str, Any]], None]: 23 | return None 24 | 25 | @property 26 | def enable_retriever(self) -> bool: 27 | return False 28 | 29 | @property 30 | def enable_file_upload(self) -> bool: 31 | return False 32 | 33 | @property 34 | def enable_agents_chains(self) -> bool: 35 | return False 36 | 37 | @property 38 | def current_filter(self) -> List[Tuple[str, Any]]: 39 | return [] 40 | 41 | @current_filter.setter 42 | def current_filter(self, value: List[Tuple[str, Any]]): 43 | pass 44 | 45 | @abstractmethod 46 | def llm_app_factory( 47 | self, 48 | retriever: RetrieverCatalogItem, 49 | model: ModelCatalogItem, 50 | agent_chain: AgentChainCatalogItem, 51 | prompt_catalog: CatalogById, 52 | sql_connection_uri: str, 53 | sql_model: ModelCatalogItem 54 | ) -> LLMApp: 55 | """ 56 | Returns the llm app to use for this retriever. 57 | """ 58 | rag_prompt = prompt_catalog[model.rag_prompt_identifier].get_instance() 59 | llm = model.get_instance() 60 | condense_question_prompt = prompt_catalog[ 61 | "prompts/condense_question.yaml" 62 | ].get_instance() 63 | 64 | retriever = retriever.get_instance() 65 | 66 | return RAGApp( 67 | prompt=rag_prompt, 68 | llm=llm, 69 | condense_question_prompt_template=condense_question_prompt, 70 | retriever=retriever, 71 | ) 72 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/flow_catalog_item_agent.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a class that represents an agent catalog item. """ 2 | from dataclasses import dataclass 3 | from typing import Union 4 | 5 | from langchain.chains.base import Chain 6 | 7 | from .flow_catalog_item import FlowCatalogItem 8 | from .retriever_catalog_item import RetrieverCatalogItem 9 | from .catalog import CatalogById 10 | from .model_catalog_item import ModelCatalogItem 11 | from chatbot.llm_app import MRKLApp, SQLMRKLApp 12 | from .agent_chain_catalog_item import AgentChainCatalogItem 13 | 14 | from .agent_chain_catalog_item_sql_generator import AGENT_CHAIN_SQL_GENERATOR_NAME 15 | 16 | AGENT_NAME = "[Experimental] Agents" 17 | 18 | 19 | @dataclass 20 | class AgentsItem(FlowCatalogItem): 21 | """ 22 | Class that represents using an agent flow. 
23 | """ 24 | 25 | def __init__(self): 26 | super().__init__(AGENT_NAME) 27 | 28 | def enable_file_upload(self) -> bool: 29 | return False 30 | 31 | def enable_agents_chains(self) -> bool: 32 | return True 33 | 34 | def get_instance(self) -> Chain: 35 | return None 36 | 37 | def llm_app_factory( 38 | self, 39 | model: ModelCatalogItem, 40 | retriever: RetrieverCatalogItem, 41 | agent_chain: AgentChainCatalogItem, 42 | prompt_catalog: CatalogById, 43 | sql_connection_uri: str, 44 | sql_model: ModelCatalogItem 45 | ) -> Union[MRKLApp, SQLMRKLApp]: 46 | """ 47 | Returns the llm app (i.e. agent in this case) to use for this flow. 48 | """ 49 | 50 | if str(agent_chain)==AGENT_CHAIN_SQL_GENERATOR_NAME: 51 | agent_chain_type = "SQL" 52 | else: 53 | agent_chain_type = "TOOLS" 54 | 55 | chat_prompt = prompt_catalog[agent_chain.get_prompt_path()].get_instance() 56 | llm = model.get_instance() 57 | 58 | if agent_chain_type =="TOOLS": 59 | agent_chain = agent_chain.get_instance() 60 | return MRKLApp( 61 | prompt=chat_prompt, 62 | llm=llm, 63 | agent_chain=agent_chain 64 | ) 65 | else: 66 | sql_llm = sql_model.get_instance() 67 | return SQLMRKLApp( 68 | prompt=chat_prompt, 69 | llm=llm, 70 | sql_connection_uri=sql_connection_uri, 71 | sql_llm=sql_llm 72 | ) 73 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/flow_catalog_item_rag.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a class that represents a File Upload retriever catalog item. """ 2 | from dataclasses import dataclass 3 | 4 | from langchain.chains.base import Chain 5 | 6 | from chatbot.catalog.flow_catalog_item_simple_chat import SimpleChatFlowItem 7 | 8 | from .flow_catalog_item import FlowCatalogItem 9 | from .retriever_catalog_item import RetrieverCatalogItem 10 | from .catalog import CatalogById 11 | from .model_catalog_item import ModelCatalogItem 12 | from chatbot.llm_app import BaseLLMApp, LLMApp, RAGApp 13 | from .agent_chain_catalog_item import AgentChainCatalogItem 14 | 15 | 16 | RETRIEVAL_AUGMENTED_GENERATION = "Retrieval Augmented Generation" 17 | 18 | 19 | @dataclass 20 | class RagItem(FlowCatalogItem): 21 | """ 22 | Class that represents using a LLM with a retriever. 23 | """ 24 | 25 | def __init__(self): 26 | super().__init__(RETRIEVAL_AUGMENTED_GENERATION) 27 | 28 | def enable_file_upload(self) -> bool: 29 | return False 30 | 31 | def enable_retriever(self) -> bool: 32 | return True 33 | 34 | def get_instance(self) -> Chain: 35 | return None 36 | 37 | def llm_app_factory( 38 | self, 39 | model: ModelCatalogItem, 40 | retriever: RetrieverCatalogItem, 41 | agent_chain: AgentChainCatalogItem, 42 | prompt_catalog: CatalogById, 43 | sql_connection_uri: str, 44 | sql_model: ModelCatalogItem 45 | ) -> LLMApp: 46 | """ 47 | Returns the llm app to use for this retriever. 
48 | """ 49 | if retriever is None: 50 | # Fallback 51 | chat_flow = SimpleChatFlowItem() 52 | return chat_flow.llm_app_factory(model, retriever, prompt_catalog) 53 | 54 | rag_prompt = prompt_catalog[model.rag_prompt_identifier].get_instance() 55 | llm = model.get_instance() 56 | condense_question_prompt = prompt_catalog[ 57 | "prompts/condense_question.yaml" 58 | ].get_instance() 59 | 60 | retriever = retriever.get_instance() 61 | 62 | # Checking if retriever is initialized, if not app will print retriever errors 63 | # TODO: implemenent error handling on app level then retriver can throw an error 64 | if retriever: 65 | return RAGApp( 66 | prompt=rag_prompt, 67 | llm=llm, 68 | condense_question_prompt_template=condense_question_prompt, 69 | retriever=retriever, 70 | ) 71 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/flow_catalog_item_simple_chat.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a class that represents No search retriever catalog item. """ 2 | from dataclasses import dataclass 3 | 4 | from langchain.chains.base import Chain 5 | 6 | from .flow_catalog_item import FlowCatalogItem 7 | from .retriever_catalog_item import RetrieverCatalogItem 8 | from .catalog import CatalogById 9 | from .model_catalog_item import ModelCatalogItem 10 | from chatbot.llm_app import BaseLLMApp, LLMApp 11 | from .agent_chain_catalog_item import AgentChainCatalogItem 12 | 13 | 14 | SIMPLE_CHATBOT = "Only Chat" 15 | 16 | 17 | @dataclass 18 | class SimpleChatFlowItem(FlowCatalogItem): 19 | """ 20 | Class that represents using a LLM without a retriever. 21 | """ 22 | 23 | def __init__(self): 24 | super().__init__(SIMPLE_CHATBOT) 25 | 26 | def enable_file_upload(self) -> bool: 27 | return False 28 | 29 | def get_instance(self) -> Chain: 30 | return None 31 | 32 | def llm_app_factory( 33 | self, 34 | model: ModelCatalogItem, 35 | retriever: RetrieverCatalogItem, 36 | agent_chain: AgentChainCatalogItem, 37 | prompt_catalog: CatalogById, 38 | sql_connection_uri: str, 39 | sql_model: ModelCatalogItem 40 | ) -> LLMApp: 41 | """ 42 | Returns the llm app to use without retriever. 43 | """ 44 | 45 | llm = model.get_instance() 46 | chat_prompt = prompt_catalog[model.chat_prompt_identifier].get_instance() 47 | 48 | return BaseLLMApp(prompt=chat_prompt, 49 | llm=llm, 50 | ) 51 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/flow_catalog_item_upload_file.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a class that represents a File Upload retriever catalog item. """ 2 | from dataclasses import dataclass 3 | 4 | from langchain.chains.base import Chain 5 | 6 | from .flow_catalog_item import FlowCatalogItem 7 | from .retriever_catalog_item import RetrieverCatalogItem 8 | from .catalog import CatalogById 9 | from .model_catalog_item import ModelCatalogItem 10 | from chatbot.llm_app import BaseLLMApp, LLMApp 11 | from .agent_chain_catalog_item import AgentChainCatalogItem 12 | 13 | 14 | UPLOAD_DOCUMENT_SEARCH = "Upload a document and search it" 15 | 16 | 17 | @dataclass 18 | class DocUploadItem(FlowCatalogItem): 19 | """ 20 | Class that represents using a LLM without a retriever, but using an uploaded document. 
21 | """ 22 | 23 | def __init__(self): 24 | super().__init__(UPLOAD_DOCUMENT_SEARCH) 25 | 26 | def enable_file_upload(self) -> bool: 27 | return True 28 | 29 | def get_instance(self) -> Chain: 30 | return None 31 | 32 | def llm_app_factory( 33 | self, 34 | model: ModelCatalogItem, 35 | retriever: RetrieverCatalogItem, 36 | agent_chain: AgentChainCatalogItem, 37 | prompt_catalog: CatalogById, 38 | sql_connection_uri: str, 39 | sql_model: ModelCatalogItem 40 | ) -> LLMApp: 41 | """ 42 | Returns the llm app to use without a retriever, but using an uploaded document. 43 | """ 44 | llm = model.get_instance() 45 | chat_prompt = prompt_catalog[model.chat_prompt_identifier].get_instance() 46 | 47 | return BaseLLMApp(prompt=chat_prompt, llm=llm) 48 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/memory_catalog.py: -------------------------------------------------------------------------------- 1 | """ Module that contains catalog for chat history memory. """ 2 | import logging 3 | import sys 4 | import time 5 | from dataclasses import dataclass 6 | from logging import Logger, getLogger 7 | from typing import List 8 | 9 | import boto3 10 | 11 | from .catalog import Catalog 12 | from .memory_catalog_item_dynamodb_table import DynamoDBTableMemoryItem 13 | 14 | 15 | @dataclass 16 | class MemoryCatalog(Catalog): 17 | """Class for chat history memory catalog.""" 18 | 19 | regions: List[str] 20 | 21 | account_id: str 22 | 23 | logger: Logger 24 | 25 | def __init__( 26 | self, 27 | account_id: str, 28 | regions: list, 29 | logger: Logger = getLogger("MemoryCatalogLogger"), 30 | ) -> None: 31 | self.regions = regions 32 | self.account_id = account_id 33 | self.logger = logger 34 | super().__init__() 35 | 36 | def _get_dynamodb_memory_table(self, account): 37 | """Get Amazon DynamoDB table available in the account that is part of Genie.""" 38 | start_time = time.time() 39 | self.logger.info("Retrieving DynamoDB memory table...") 40 | 41 | memory_tables = [] 42 | 43 | for region in self.regions: 44 | dynamodb_client = boto3.client("dynamodb", region) 45 | 46 | paginator = dynamodb_client.get_paginator("list_tables") 47 | tables = paginator.paginate().build_full_result()["TableNames"] 48 | 49 | table_name_to_arn = ( 50 | lambda table_name, region: f"arn:aws:dynamodb:{region}:{account}:table/{table_name}" 51 | ) 52 | genaix_dynamodb_tag_filter = lambda tag: tag["Key"] == "genie:memory-table" 53 | for table_name in tables: 54 | table_arn = table_name_to_arn(table_name, region=region) 55 | tags_paginator = dynamodb_client.get_paginator("list_tags_of_resource") 56 | tags = tags_paginator.paginate( 57 | ResourceArn=table_arn 58 | ).build_full_result()["Tags"] 59 | genaix_tags = list(filter(genaix_dynamodb_tag_filter, tags)) 60 | if len(genaix_tags) > 0: 61 | memory_tables.append(DynamoDBTableMemoryItem(table_name=table_name)) 62 | 63 | self.logger.info( 64 | "%s DynamoDB tables retrieved in %s seconds", 65 | len(memory_tables), 66 | time.time() - start_time, 67 | ) 68 | self.logger.info(memory_tables) 69 | if len(memory_tables) > 0: 70 | self.logger.info( 71 | "Using first DynamoDB table %s to store chat history", 72 | str(memory_tables[0]), 73 | ) 74 | self.append(memory_tables[0]) 75 | 76 | def bootstrap(self) -> None: 77 | """Bootstraps the catalog.""" 78 | 79 | self._get_dynamodb_memory_table(self.account_id) 80 | -------------------------------------------------------------------------------- 
/03_chatbot/src/chatbot/catalog/memory_catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Module that contains an abstract base class that represents a memory catalog item. """ 2 | 3 | from abc import abstractmethod 4 | from dataclasses import dataclass 5 | 6 | from langchain.schema import BaseChatMessageHistory 7 | 8 | from .catalog_item import CatalogItem 9 | 10 | 11 | @dataclass 12 | class MemoryCatalogItem(CatalogItem[BaseChatMessageHistory]): 13 | """Abstract base class that represents a memory catalog item.""" 14 | 15 | @abstractmethod 16 | def get_instance(self, session_id) -> BaseChatMessageHistory: 17 | """Returns an instance of the item.""" 18 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/memory_catalog_item_dynamodb_table.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a class that represents a DynamoDB table memory catalog item. """ 2 | from dataclasses import dataclass 3 | 4 | from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory 5 | from langchain.schema import BaseChatMessageHistory 6 | 7 | from .memory_catalog_item import MemoryCatalogItem 8 | 9 | 10 | @dataclass 11 | class DynamoDBTableMemoryItem(MemoryCatalogItem): 12 | """Class that represents an Amazon DynamoDB table memory catalog item.""" 13 | 14 | table_name: str 15 | """ DynamoDB table name """ 16 | 17 | def __init__(self, table_name): 18 | super().__init__(f"Memory table: {table_name}") 19 | self.table_name = table_name 20 | 21 | def get_instance(self, session_id) -> BaseChatMessageHistory: 22 | return DynamoDBChatMessageHistory( 23 | table_name=self.table_name, session_id=session_id 24 | ) 25 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/model_catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Abstract base class that represents a model catalog item. """ 2 | from dataclasses import dataclass 3 | 4 | from langchain.llms.base import LLM 5 | 6 | from .catalog_item import CatalogItem 7 | 8 | 9 | @dataclass 10 | class ModelCatalogItem(CatalogItem[LLM]): 11 | """Abstract base class that represents a model catalog item.""" 12 | 13 | chat_prompt_identifier: str 14 | """ Identifies which prompt to use when chatting with the model. """ 15 | 16 | rag_prompt_identifier: str 17 | """ Identifies which prompt to use with the model when using document retrieval. """ 18 | 19 | supports_streaming: bool = False 20 | """ Whether the model supports streaming the response. """ 21 | 22 | streaming_on: bool = False 23 | """ Whether the model is streaming the response. """ 24 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/prompt_catalog.py: -------------------------------------------------------------------------------- 1 | """ Module that contains catalog for prompts. 
""" 2 | import json 3 | import os 4 | import re 5 | from dataclasses import dataclass 6 | from pathlib import Path 7 | from typing import Union 8 | 9 | import boto3 10 | import yaml 11 | from langchain.prompts import BasePromptTemplate 12 | from langchain.prompts import load_prompt as langchain_load_prompt 13 | from langchain.prompts.loading import load_prompt_from_config 14 | 15 | from .catalog import CatalogById 16 | from .prompt_catalog_item import PromptCatalogItem 17 | 18 | dirname = os.path.dirname(__file__) 19 | basedir = os.path.join(dirname, "../") 20 | 21 | 22 | @dataclass 23 | class PromptCatalog(CatalogById[PromptCatalogItem]): 24 | """Class to get and load prompt templates.""" 25 | 26 | def _retrieve(self, key: str) -> PromptCatalogItem: 27 | prompt = self._load_prompt(key) 28 | return PromptCatalogItem(friendly_name=key, prompt=prompt) 29 | 30 | def _try_load_from_s3( 31 | self, 32 | path: Union[str, Path], 33 | ) -> BasePromptTemplate: 34 | """Load configuration from S3.""" 35 | S3_URI = re.compile(r"s3://(?P.+)/(?P.*)") 36 | s3Client = boto3.client("s3") 37 | 38 | if not isinstance(path, str) or not (match := S3_URI.match(path)): 39 | return None 40 | bucket, object_key = match.groups() 41 | 42 | object_path = Path(object_key) 43 | 44 | valid_suffixes = {"json", "yaml"} 45 | 46 | if object_path.suffix[1:] not in valid_suffixes: 47 | raise ValueError("Unsupported file type.") 48 | 49 | response = s3Client.get_object(Bucket=bucket, Key=object_key) 50 | 51 | content = response["Body"].read() 52 | if object_path.suffix == ".json": 53 | config = json.load(content) 54 | elif object_path.suffix == ".yaml": 55 | config = yaml.safe_load(content) 56 | return load_prompt_from_config(config) 57 | 58 | def _load_prompt(self, path: Union[str, Path]) -> BasePromptTemplate: 59 | """Unified method for loading a prompt from Amazon S3, LangChainHub or local fs.""" 60 | if s3_result := self._try_load_from_s3(path): 61 | return s3_result 62 | else: 63 | path_from_base = os.path.join(basedir, path) 64 | return langchain_load_prompt(path_from_base) 65 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/prompt_catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Abstract base class that represents a prompt item. 2 | """ 3 | from dataclasses import dataclass 4 | 5 | from langchain.prompts import BasePromptTemplate 6 | 7 | from .catalog_item import CatalogItem 8 | 9 | 10 | @dataclass 11 | class PromptCatalogItem(CatalogItem[BasePromptTemplate]): 12 | """Represents a prompt template.""" 13 | 14 | prompt: BasePromptTemplate 15 | """Prompt template""" 16 | 17 | def get_instance(self) -> BasePromptTemplate: 18 | return self.prompt 19 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/retriever_catalog_item.py: -------------------------------------------------------------------------------- 1 | """ Module that contains an abstract base class that represents a retriever catalog item. 
2 | """ 3 | from dataclasses import dataclass 4 | from typing import Any, List, Tuple, Union 5 | 6 | from chatbot.llm_app import BaseLLMApp, LLMApp, RAGApp 7 | from langchain.schema import BaseRetriever 8 | 9 | from .catalog import CatalogById 10 | from .catalog_item import CatalogItem 11 | from .model_catalog_item import ModelCatalogItem 12 | 13 | 14 | @dataclass 15 | class RetrieverCatalogItem(CatalogItem[BaseRetriever]): 16 | """Abstract base class that represents a retriever catalog item.""" 17 | 18 | @property 19 | def available_filter_options(self) -> Union[List[Tuple[str, Any]], None]: 20 | return None 21 | 22 | @property 23 | def current_filter(self) -> List[Tuple[str, Any]]: 24 | return [] 25 | 26 | @current_filter.setter 27 | def current_filter(self, value: List[Tuple[str, Any]]): 28 | pass 29 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/retriever_catalog_item_kendra.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a class that represents a Kendra retriever catalog item. """ 2 | from dataclasses import dataclass, field 3 | from typing import Any, List, Tuple, Union 4 | 5 | from langchain.retrievers.kendra import AmazonKendraRetriever 6 | from langchain.schema import BaseRetriever 7 | 8 | from .retriever_catalog_item import RetrieverCatalogItem 9 | 10 | import boto3 11 | from botocore.config import Config 12 | 13 | 14 | @dataclass 15 | class KendraRetrieverItem(RetrieverCatalogItem): 16 | """Class that represents a Kendra retriever catalog item.""" 17 | 18 | region: str 19 | """ AWS Region """ 20 | index_id: str 21 | """ Kendra Index ID """ 22 | 23 | top_k: int 24 | """ Number of documents to be retrieved """ 25 | 26 | _data_sources: Any 27 | """ Kendra Data Sources in this index """ 28 | 29 | _selected_data_sources: List[Tuple[str, Any]] 30 | """ Kendra Data Source IDs that the retriever is supposed to use at the moment. 
""" 31 | 32 | def __init__(self, friendly_name, index_id, data_sources, region=None, top_k=3): 33 | super().__init__(friendly_name) 34 | self._data_sources = data_sources 35 | self._selected_data_sources = [] 36 | self.index_id = index_id 37 | self.region = region 38 | self.top_k = top_k 39 | 40 | @property 41 | def available_filter_options(self) -> Union[List[Tuple[str, Any]], None]: 42 | return [(data_src["Name"], data_src) for data_src in self._data_sources] 43 | 44 | @property 45 | def current_filter(self) -> List[str]: 46 | return [ 47 | (data_src["Name"], data_src) for data_src in self._selected_data_sources 48 | ] 49 | 50 | @current_filter.setter 51 | def current_filter(self, value: List[Tuple[str, Any]]): 52 | selected_and_part_of_index = list( 53 | filter(lambda x: x[1] in self._data_sources, value) 54 | ) 55 | self._selected_data_sources = selected_and_part_of_index 56 | 57 | def get_instance(self) -> BaseRetriever: 58 | data_src_filters = [ 59 | { 60 | "EqualsTo": { 61 | "Key": "_data_source_id", 62 | "Value": {"StringValue": src[1]["Id"]}, 63 | } 64 | } 65 | for src in self._selected_data_sources 66 | ] 67 | 68 | attribute_filter = {"OrAllFilters": data_src_filters} 69 | filter = attribute_filter 70 | 71 | if len(self._selected_data_sources) == 1: 72 | # Add data source language filter if the language is not English 73 | language = self._selected_data_sources[0][1].get("LanguageCode", None) 74 | if language and language != "en": 75 | lang_filter = { 76 | "EqualsTo": { 77 | "Key": "_language_code", 78 | "Value": { 79 | "StringValue": language 80 | } 81 | } 82 | } 83 | 84 | filter = {"AndAllFilters": [attribute_filter, lang_filter]} 85 | 86 | return AmazonKendraRetriever( 87 | index_id=self.index_id, 88 | region_name=self.region, 89 | attribute_filter=filter, 90 | top_k=self.top_k 91 | ) 92 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/catalog/retriever_catalog_item_open_search.py: -------------------------------------------------------------------------------- 1 | """ Module that contains a class that represents a OpenSearch retriever catalog item. """ 2 | from dataclasses import dataclass 3 | from typing import List 4 | 5 | from chatbot.open_search import OpenSearchIndexRetriever 6 | from langchain.schema import BaseRetriever 7 | 8 | from .retriever_catalog_item import RetrieverCatalogItem 9 | from typing import Any, List, Tuple, Union 10 | import streamlit as st 11 | 12 | @dataclass 13 | class OpenSearchRetrieverItem(RetrieverCatalogItem): 14 | """Class that represents a Amazon OpenSearch retriever catalog item.""" 15 | 16 | region: str 17 | """ AWS Region """ 18 | 19 | rag_config: dict 20 | """ Global RAG config (Character Limit, Number of documents to retrieve, etc.) """ 21 | 22 | embedding_config: dict 23 | """ SageMaker or Bedrock embedding config """ 24 | 25 | _data_sources: Any 26 | """ OpenSearch indexes in this domain """ 27 | 28 | _selected_data_sources: List[Tuple[str, Any]] 29 | """ OpenSearch indexes that are selected. 
""" 30 | 31 | index_name: str 32 | """ Selected OpenSearch Index Name """ 33 | 34 | def __init__( 35 | self, 36 | friendly_name, 37 | data_sources, 38 | rag_config, 39 | embedding_config, 40 | top_k=3 41 | ): 42 | super().__init__(friendly_name) 43 | self.index_name = "" 44 | self._data_sources = data_sources 45 | self._selected_data_sources = [] 46 | self.region = embedding_config["region"] 47 | self.rag_config = rag_config 48 | self.embedding_config = embedding_config 49 | self.top_k = top_k 50 | 51 | @property 52 | def available_filter_options(self) -> Union[List[Tuple[str, Any]], None]: 53 | return [(data_src["index"], data_src) for data_src in self._data_sources] 54 | 55 | @property 56 | def current_filter(self) -> List[str]: 57 | return [ 58 | (data_src["index"], data_src) for data_src in self._selected_data_sources 59 | ] 60 | 61 | @current_filter.setter 62 | def current_filter(self, value: List[Tuple[str, Any]]): 63 | selected_and_part_of_index = list( 64 | filter(lambda x: x[1] in self._data_sources, value) 65 | ) 66 | self._selected_data_sources = selected_and_part_of_index 67 | 68 | 69 | def get_instance(self) -> BaseRetriever: 70 | if len(self._selected_data_sources) != 1: 71 | st.error('Please select exactly one data source.') 72 | return None 73 | 74 | index_name = self._selected_data_sources[0][0] 75 | 76 | retriever = OpenSearchIndexRetriever( 77 | index_name, rag_config=self.rag_config, embedding_config=self.embedding_config, k=self.top_k 78 | ) 79 | return retriever 80 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/__init__.py: -------------------------------------------------------------------------------- 1 | from chatbot.config.aws_region import AWSRegion 2 | 3 | from .amazon_bedrock import AmazonBedrock, AmazonBedrockParameters 4 | from .app_config import AppConfig, AppConfigProvider 5 | from .appearance import AWSsomeChatAppearance, AWSsomeChatAppearanceParameters 6 | from .aws_config import AWSConfig 7 | from .iam import Iam, IamParameters 8 | from .llm_config import LLMConfig, LLMConfigMap, LLMConfigParameters 9 | from .flow_config import FlowConfig, FlowConfigParameters 10 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/amazon_bedrock.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Optional, List 3 | 4 | from .aws_region import AWSRegion 5 | from .iam import Iam 6 | from .parser_helpers import from_none, from_str, from_union, to_class, to_enum, from_list 7 | 8 | 9 | @dataclass 10 | class AmazonBedrockParameters: 11 | region: AWSRegion 12 | """AWS Region to access Amazon Bedrock.""" 13 | endpoint_url: Optional[str] = None 14 | """Optional endpoint url to access Amazon Bedrock.""" 15 | iam: Optional[Iam] = None 16 | """Optional IAM configuration to access Amazon Bedrock.""" 17 | hide_models: Optional[List[str]] = None 18 | """Bedrock models to hide.""" 19 | 20 | @staticmethod 21 | def from_dict(obj: Any) -> "AmazonBedrockParameters": 22 | assert isinstance(obj, dict) 23 | region = AWSRegion(obj.get("region")) 24 | endpoint_url = from_union([from_str, from_none], obj.get("endpointURL")) 25 | iam = from_union([Iam.from_dict, from_none], obj.get("iam")) 26 | hide_models = from_union([lambda x: from_list(from_str, x), from_none], obj.get("hide_models")) 27 | return AmazonBedrockParameters(region, endpoint_url, iam, hide_models) 28 | 29 | 
def to_dict(self) -> dict: 30 | result: dict = {} 31 | result["region"] = to_enum(AWSRegion, self.region) 32 | result["endpointURL"] = from_union([from_str, from_none], self.endpoint_url) 33 | result["iam"] = from_union([lambda x: to_class(Iam, x), from_none], self.iam) 34 | result["hide_models"] = from_union([lambda x: from_list(from_str, x), from_none], self.hide_models) 35 | return result 36 | 37 | 38 | @dataclass 39 | class AmazonBedrock: 40 | """Optional Configuration for Amazon Bedrock.""" 41 | 42 | parameters: AmazonBedrockParameters 43 | type: str 44 | 45 | def __init__(self, parameters: AmazonBedrockParameters): 46 | self.parameters = parameters 47 | self.type = AmazonBedrock.typename() 48 | 49 | @classmethod 50 | def typename(cls) -> str: 51 | return "AmazonBedrock" 52 | 53 | @staticmethod 54 | def from_dict(obj: Any) -> "AmazonBedrock": 55 | assert isinstance(obj, dict) 56 | parameters = AmazonBedrockParameters.from_dict(obj.get("parameters")) 57 | type = from_str(obj.get("type")) 58 | assert type == AmazonBedrock.typename() 59 | return AmazonBedrock(parameters) 60 | 61 | def to_dict(self) -> dict: 62 | result: dict = {} 63 | result["parameters"] = to_class(AmazonBedrockParameters, self.parameters) 64 | result["type"] = from_str(self.type) 65 | return result 66 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/appearance.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from functools import partial 3 | from typing import Any 4 | 5 | from .parser_helpers import ( 6 | Defaultable, 7 | from_default, 8 | from_none, 9 | from_str, 10 | from_union, 11 | to_class, 12 | ) 13 | 14 | # To use this code, make sure you 15 | # 16 | # import json 17 | # 18 | # and then, to convert JSON from a string, do 19 | # 20 | # result = app_config_from_dict(json.loads(json_string)) 21 | 22 | 23 | @dataclass 24 | class AWSsomeChatAppearanceParameters: 25 | __DEFAULT_FAVICON_URL = "src/aws.png" 26 | __DEFAULT_NAME = "AWSomeChat" 27 | """URL or path to image that is the logo for your chat application. Relative to the 28 | 03_chatbot directory. 
29 | """ 30 | favicon_url: str = None 31 | """The name of your src.""" 32 | name: str = None 33 | 34 | def __init__(self, favicon_url=__DEFAULT_FAVICON_URL, name=__DEFAULT_NAME): 35 | apply_default_favicon_url = partial( 36 | from_default, defaultValue=self.__DEFAULT_FAVICON_URL 37 | ) 38 | apply_default_name = partial(from_default, defaultValue=self.__DEFAULT_NAME) 39 | 40 | self.name = apply_default_name(x=name) 41 | self.favicon_url = apply_default_favicon_url(x=favicon_url) 42 | 43 | @staticmethod 44 | def from_dict(obj: Any) -> "AWSsomeChatAppearanceParameters": 45 | assert isinstance(obj, dict) 46 | favicon_url = from_union([from_str, from_none], obj.get("faviconUrl")) 47 | name = from_union([from_str, from_none], obj.get("name")) 48 | return AWSsomeChatAppearanceParameters(favicon_url, name) 49 | 50 | def to_dict(self) -> dict: 51 | result: dict = {} 52 | result["faviconUrl"] = from_union([from_str, from_none], self.favicon_url) 53 | result["name"] = from_union([from_str, from_none], self.name) 54 | return result 55 | 56 | 57 | @dataclass 58 | class AWSsomeChatAppearance: 59 | pass 60 | 61 | 62 | @dataclass 63 | class AWSsomeChatAppearance(Defaultable[AWSsomeChatAppearance]): 64 | """Personalize how the app looks like to your use case.""" 65 | 66 | parameters: AWSsomeChatAppearanceParameters 67 | type: str 68 | 69 | @classmethod 70 | def typename(cls) -> str: 71 | return "AWSomeChatAppearance" 72 | 73 | @staticmethod 74 | def from_dict(obj: Any) -> "AWSsomeChatAppearance": 75 | assert isinstance(obj, dict) 76 | type = from_str(obj.get("type")) 77 | assert type == AWSsomeChatAppearance.typename() 78 | parameters = AWSsomeChatAppearanceParameters.from_dict(obj.get("parameters")) 79 | return AWSsomeChatAppearance(parameters, type) 80 | 81 | def to_dict(self) -> dict: 82 | result: dict = {} 83 | result["parameters"] = to_class( 84 | AWSsomeChatAppearanceParameters, self.parameters 85 | ) 86 | result["type"] = from_str(self.type) 87 | return result 88 | 89 | @staticmethod 90 | def from_default(x: Any) -> "AWSsomeChatAppearance": 91 | params = AWSsomeChatAppearanceParameters() 92 | return AWSsomeChatAppearance(params, AWSsomeChatAppearance.typename()) 93 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/aws_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class AWSConfig: 6 | account_id: str 7 | """ AWS account id """ 8 | region: str 9 | """ AWS region""" 10 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/aws_region.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class AWSRegion(Enum): 5 | """Represents an AWS Region""" 6 | 7 | AF_SOUTH_1 = "af-south-1" 8 | AP_EAST_1 = "ap-east-1" 9 | AP_NORTHEAST_1 = "ap-northeast-1" 10 | AP_NORTHEAST_2 = "ap-northeast-2" 11 | AP_NORTHEAST_3 = "ap-northeast-3" 12 | AP_SOUTHEAST_1 = "ap-southeast-1" 13 | AP_SOUTHEAST_2 = "ap-southeast-2" 14 | AP_SOUTHEAST_3 = "ap-southeast-3" 15 | AP_SOUTHEAST_4 = "ap-southeast-4" 16 | AP_SOUTH_1 = "ap-south-1" 17 | AP_SOUTH_2 = "ap-south-2" 18 | CA_CENTRAL_1 = "ca-central-1" 19 | EU_CENTRAL_1 = "eu-central-1" 20 | EU_CENTRAL_2 = "eu-central-2" 21 | EU_NORTH_1 = "eu-north-1" 22 | EU_SOUTH_1 = "eu-south-1" 23 | EU_SOUTH_2 = "eu-south-2" 24 | EU_WEST_1 = "eu-west-1" 25 | EU_WEST_2 = "eu-west-2" 26 | EU_WEST_3 = 
"eu-west-3" 27 | ME_CENTRAL_1 = "me-central-1" 28 | ME_SOUTH_1 = "me-south-1" 29 | SA_EAST_1 = "sa-east-1" 30 | US_EAST_1 = "us-east-1" 31 | US_EAST_2 = "us-east-2" 32 | US_WEST_1 = "us-west-1" 33 | US_WEST_2 = "us-west-2" 34 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/fin_analyzer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Optional 3 | 4 | from .parser_helpers import from_none, from_str, from_union, to_class, to_enum 5 | 6 | 7 | @dataclass 8 | class FinAnalyzerParameters: 9 | friendly_name: str = "Stock Analysis" 10 | """The name of the RAG option in the menu.""" 11 | s3_bucket: Optional[str] = None 12 | """S3 bucket with Finance Data.""" 13 | s3_prefix: Optional[str] = None 14 | """S3 bucket prefix (path) with Finance Data.""" 15 | 16 | @staticmethod 17 | def from_dict(obj: Any) -> "FinAnalyzerParameters": 18 | assert isinstance(obj, dict) 19 | friendly_name = from_union([from_str, from_none], obj.get("friendlyName")) 20 | s3_bucket = from_union([from_str, from_none], obj.get("s3Bucket")) 21 | s3_prefix = from_union([from_str, from_none], obj.get("s3Prefix")) 22 | return FinAnalyzerParameters(friendly_name, s3_bucket, s3_prefix) 23 | 24 | def to_dict(self) -> dict: 25 | result: dict = {} 26 | result["friendlyName"] = from_union([from_str, from_none], self.friendly_name) 27 | result["s3Bucket"] = from_union([from_str, from_none], self.s3_bucket) 28 | result["s3Prefix"] = from_union([from_str, from_none], self.s3_prefix) 29 | return result 30 | 31 | 32 | @dataclass 33 | class FinAnalyzer: 34 | """Configuration for Finance Analyzer.""" 35 | 36 | parameters: FinAnalyzerParameters 37 | type: str 38 | 39 | def __init__(self, parameters: FinAnalyzerParameters): 40 | self.parameters = parameters 41 | self.type = FinAnalyzer.typename() 42 | 43 | @classmethod 44 | def typename(cls) -> str: 45 | return "FinAnalyzer" 46 | 47 | @staticmethod 48 | def from_dict(obj: Any) -> "FinAnalyzer": 49 | assert isinstance(obj, dict) 50 | parameters = FinAnalyzerParameters.from_dict(obj.get("parameters")) 51 | type = from_str(obj.get("type")) 52 | assert type == FinAnalyzer.typename() 53 | return FinAnalyzer(parameters) 54 | 55 | def to_dict(self) -> dict: 56 | result: dict = {} 57 | result["parameters"] = to_class(FinAnalyzerParameters, self.parameters) 58 | result["type"] = from_str(self.type) 59 | return result 60 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/flow_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Optional 3 | 4 | from .parser_helpers import from_dict, from_none, from_str, from_union, to_class, to_enum 5 | 6 | 7 | @dataclass 8 | class FlowConfigParameters: 9 | """Flow confitufation dictionary.""" 10 | flows: dict 11 | rag: dict 12 | 13 | @staticmethod 14 | def from_dict(obj: Any) -> "FlowConfigParameters": 15 | assert isinstance(obj, dict) 16 | flows = from_union([lambda x: from_dict(dict, x), from_none], obj.get("flows")) 17 | rag = obj.get("rag") 18 | # s3_bucket = from_union([from_str, from_none], obj.get("s3Bucket")) 19 | return FlowConfigParameters(flows, rag) 20 | 21 | def to_dict(self) -> dict: 22 | result: dict = {} 23 | result["flows"] = from_union([lambda x: from_dict(dict, x), from_none], self.flows) 24 | result["rag"] = self.rag 25 | # 
result["s3Bucket"] = from_union([from_str, from_none], self.s3_bucket) 26 | return result 27 | 28 | 29 | @dataclass 30 | class FlowConfig: 31 | """Configuration for Finance Analyzer.""" 32 | 33 | parameters: FlowConfigParameters 34 | type: str 35 | 36 | def __init__(self, parameters: FlowConfigParameters): 37 | self.parameters = parameters 38 | self.type = FlowConfig.typename() 39 | 40 | @classmethod 41 | def typename(cls) -> str: 42 | return "FlowConfig" 43 | 44 | @staticmethod 45 | def from_dict(obj: Any) -> "FlowConfig": 46 | assert isinstance(obj, dict) 47 | parameters = FlowConfigParameters.from_dict(obj.get("parameters")) 48 | type = from_str(obj.get("type")) 49 | assert type == FlowConfig.typename() 50 | return FlowConfig(parameters) 51 | 52 | def to_dict(self) -> dict: 53 | result: dict = {} 54 | result["parameters"] = to_class(FlowConfigParameters, self.parameters) 55 | result["type"] = from_str(self.type) 56 | return result 57 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/iam.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Optional 3 | 4 | from .parser_helpers import from_none, from_str, from_union, to_class 5 | 6 | 7 | @dataclass 8 | class IamParameters: 9 | """Optional credentials profile name to use for access.""" 10 | 11 | profile: Optional[str] = None 12 | """Optional IAM role to assume.""" 13 | role_arn: Optional[str] = None 14 | 15 | @staticmethod 16 | def from_dict(obj: Any) -> "IamParameters": 17 | assert isinstance(obj, dict) 18 | profile = from_union([from_str, from_none], obj.get("profile")) 19 | role_arn = from_union([from_str, from_none], obj.get("roleARN")) 20 | return IamParameters(profile, role_arn) 21 | 22 | def to_dict(self) -> dict: 23 | result: dict = {} 24 | result["profile"] = from_union([from_str, from_none], self.profile) 25 | result["roleARN"] = from_union([from_str, from_none], self.role_arn) 26 | return result 27 | 28 | 29 | @dataclass 30 | class Iam: 31 | """Optional IAM configuration to access AWS resources. Supports profile or assuming an IAM 32 | role. 
33 | """ 34 | 35 | parameters: IamParameters 36 | type: str 37 | 38 | @classmethod 39 | def typename(cls) -> str: 40 | return "BotoIAM" 41 | 42 | @staticmethod 43 | def from_dict(obj: Any) -> "Iam": 44 | assert isinstance(obj, dict) 45 | parameters = IamParameters.from_dict(obj.get("parameters")) 46 | type = from_str(obj.get("type")) 47 | assert type == Iam.typename() 48 | return Iam(parameters, type) 49 | 50 | def to_dict(self) -> dict: 51 | result: dict = {} 52 | result["parameters"] = to_class(IamParameters, self.parameters) 53 | result["type"] = from_str(self.type) 54 | return result 55 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/config/parser_helpers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from enum import Enum 3 | from typing import Any, Callable, Dict, Generic, List, Type, TypeVar, cast 4 | 5 | T = TypeVar("T") 6 | EnumT = TypeVar("EnumT", bound=Enum) 7 | 8 | 9 | def from_str(x: Any) -> str: 10 | assert isinstance(x, str) 11 | return x 12 | 13 | 14 | def to_enum(c: Type[EnumT], x: Any) -> EnumT: 15 | assert isinstance(x, c) 16 | return x.value 17 | 18 | 19 | def to_class(c: Type[T], x: Any) -> dict: 20 | assert isinstance(x, c) 21 | return cast(Any, x).to_dict() 22 | 23 | 24 | def from_none(x: Any) -> Any: 25 | assert x is None 26 | return x 27 | 28 | 29 | def from_union(fs, x): 30 | for f in fs: 31 | try: 32 | return f(x) 33 | except Exception as e: 34 | pass 35 | assert False 36 | 37 | 38 | def from_default(defaultValue: Type[T], x: Type[T]) -> Type[T]: 39 | assert defaultValue is not None 40 | return x or defaultValue 41 | 42 | 43 | def from_list(f: Callable[[Any], T], x: Any) -> List[T]: 44 | assert isinstance(x, list) 45 | return [f(y) for y in x] 46 | 47 | 48 | class Defaultable(ABC, Generic[T]): 49 | @staticmethod 50 | @abstractmethod 51 | def from_default(x: Any) -> T: 52 | pass 53 | 54 | 55 | def from_int(x: Any) -> int: 56 | assert isinstance(x, int) and not isinstance(x, bool) 57 | return x 58 | 59 | 60 | def from_float(x: Any) -> float: 61 | assert isinstance(x, (float, int)) and not isinstance(x, bool) 62 | return float(x) 63 | 64 | 65 | def to_float(x: Any) -> float: 66 | assert isinstance(x, float) 67 | return x 68 | 69 | 70 | def from_dict(f: Callable[[Any], T], x: Any) -> Dict[str, T]: 71 | assert isinstance(x, dict) 72 | return {k: f(v) for (k, v) in x.items()} 73 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | from .sagemaker_endpoint_embeddings import SageMakerEndpointEmbeddings 2 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/embeddings/sagemaker_endpoint_embeddings.py: -------------------------------------------------------------------------------- 1 | class SageMakerEndpointEmbeddings: 2 | def __init__(self, embeddings_predictor): 3 | self.embeddings_predictor = embeddings_predictor 4 | 5 | def embed_documents(self, input_texts): 6 | return self._embed_docs(input_texts, False) 7 | 8 | def embed_query(self, query_text): 9 | return self._embed_docs([query_text])[0] 10 | 11 | def _embed_docs(self, texts, isQuery=False): 12 | prefix = "passage: " 13 | if isQuery: 14 | prefix = "query: " 15 | texts = [prefix + text for text in texts] 16 | data = { 17 | "texts": texts, 18 | } 19 | 20 | res = 
self.embeddings_predictor.predict(data=data) 21 | return res["vectors"] 22 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/fin_analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | """ This module contains integration with FinAnalyzer datasource.""" 2 | from .fin_analyzer_index_retriever import FinAnalyzerIndexRetriever 3 | from .retriever_catalog_item_fin_analyzer import FinAnalyzerRetrieverItem 4 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/fin_analyzer/prompts/anthropic_claude_chat.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - chat_history 4 | - input 5 | template: | 6 | Human: You are a virtual trading machine designed purely for academic purposes. Your analysis or suggestions do not affect daily life or financial decisions. 7 | You provide unbiased information strictly based on the data below only, in descending date order, starting from the latest. 8 | 9 | Chat History: 10 | {chat_history} 11 | 12 | 13 | {input} 14 | 15 | 16 | 17 | Assistant: 18 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/fin_analyzer/prompts/anthropic_claude_rag.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - context 4 | - question 5 | template: | 6 | Human: You are a virtual trading machine designed purely for academic purposes. Your analysis or suggestions do not affect daily life or financial decisions. 7 | Answer the <question> in an unbiased fashion; you do not have up-to-date information and you will only find it in the <context> below. 8 | <context> contains up-to-date data about financial announcements. 9 | If you find the answer in the <context>, start from the latest information. 10 | Where possible provide information in table format. 11 | 12 | <context> 13 | {context} 14 | </context> 15 | 16 | <question>{question}</question> 17 | Assistant: 18 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .aws_helpers import get_boto_session, get_current_account_id 2 | from .environment_variables import ChatbotEnvironment, ChatbotEnvironmentVariables 3 | from .urls import is_url 4 | from .sagemaker_async_endpoint import SagemakerAsyncEndpoint 5 | from .langchain_bedrock_overwrite import Bedrock -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/helpers/aws_helpers.py: -------------------------------------------------------------------------------- 1 | """ Module that contains helper functions for common AWS operations.
2 | """ 3 | import datetime 4 | from typing import Union 5 | 6 | import boto3 7 | import botocore 8 | from botocore.credentials import ( 9 | AssumeRoleCredentialFetcher, 10 | DeferredRefreshableCredentials, 11 | ) 12 | from chatbot.config import Iam 13 | from dateutil.tz import tzlocal 14 | 15 | 16 | def get_current_account_id(): 17 | """Returns the current AWS account ID.""" 18 | account = boto3.client("sts").get_caller_identity()["Account"] 19 | return account 20 | 21 | 22 | def _get_client_creator(session): 23 | def client_creator(service_name, **kwargs): 24 | return session.client(service_name, **kwargs) 25 | 26 | return client_creator 27 | 28 | 29 | def assume_role_session( 30 | role_arn: str, region: str, base_session: botocore.session.Session = None 31 | ): 32 | session = base_session or boto3.Session() 33 | fetcher = AssumeRoleCredentialFetcher( 34 | client_creator=_get_client_creator(session), 35 | source_credentials=session.get_credentials(), 36 | role_arn=role_arn, 37 | ) 38 | botocore_session = botocore.session.Session() 39 | botocore_session._credentials = DeferredRefreshableCredentials( 40 | method="assume-role", refresh_using=fetcher.fetch_credentials 41 | ) 42 | 43 | return boto3.Session(botocore_session=botocore_session) 44 | 45 | 46 | def get_boto_session(iam_config: Union[Iam, None], region: str): 47 | if not iam_config: 48 | return boto3.Session() 49 | iam_profile_name = iam_config.parameters.profile or None 50 | iam_role_arn = iam_config.parameters.role_arn or None 51 | 52 | if iam_profile_name: 53 | return boto3.Session(profile_name=iam_profile_name) 54 | 55 | if iam_role_arn: 56 | return assume_role_session(iam_role_arn, region=region) 57 | return boto3.Session() 58 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/helpers/environment_variables.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from typing import Dict 4 | 5 | 6 | class ChatbotEnvironmentVariables(Enum): 7 | """ 8 | The names of all the environment variables the chatbot app uses. 9 | """ 10 | 11 | AmazonBedrockRegion = "BEDROCK_REGION" 12 | AWSRegion = "AWS_DEFAULT_REGION" 13 | AmazonTextractS3Bucket = "AMAZON_TEXTRACT_S3_BUCKET" 14 | BaseUrl = "BASE_URL" 15 | AWSAppConfigApplication = "AWS_APP_CONFIG_APPLICATION" 16 | AWSAppConfigEnvironment = "AWS_APP_CONFIG_ENVIRONMENT" 17 | AWSAppConfigProfile = "AWS_APP_CONFIG_PROFILE" 18 | AppPrefix = "APP_PREFIX" 19 | SERPAPI_API_KEY = "SERPAPI_API_KEY" 20 | 21 | 22 | class ChatbotEnvironment: 23 | """ 24 | Access environment variables available to the chatbot without having to provide defaults. 25 | 26 | Central place were all defaults for the environment variables are defined. 
27 | """ 28 | 29 | __defaults: Dict[ChatbotEnvironmentVariables, str] = { 30 | ChatbotEnvironmentVariables.AmazonBedrockRegion: None, 31 | ChatbotEnvironmentVariables.AWSRegion: "eu-west-1", 32 | ChatbotEnvironmentVariables.AppPrefix: "genie", 33 | ChatbotEnvironmentVariables.SERPAPI_API_KEY: "" 34 | } 35 | 36 | def get_env_variable(self, variable_name: ChatbotEnvironmentVariables) -> str: 37 | return os.environ.get(variable_name.value, self.__defaults.get(variable_name)) 38 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/helpers/logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .app_logging import TECHNICAL_LOGGER_NAME 2 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/helpers/logger/app_logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module to help logging application activity. 3 | """ 4 | import logging 5 | 6 | from .llm_logging_handler import LlmLoggingHandler 7 | from .log_to_ui_handler import LogToUiHandler 8 | 9 | LLM_LOGGER_NAME = "LLM_LOGGER" 10 | TECHNICAL_LOGGER_NAME = "TECHNICAL_LOGGER" 11 | 12 | 13 | class SessionLogFilter(logging.Filter): 14 | """ 15 | This filter only show log entries for specified thread name. 16 | """ 17 | 18 | def __init__(self, session_id, *args, **kwargs): 19 | logging.Filter.__init__(self, *args, **kwargs) 20 | self.session_id = session_id 21 | 22 | def filter(self, record): 23 | return not record.getMessage().startswith(self.session_id) 24 | 25 | 26 | def get_llm_logger( 27 | ui_handler: LogToUiHandler, session_id: str, level: int = logging.DEBUG 28 | ) -> logging.Logger: 29 | """ 30 | NOTE: Call this function only once per thread in the application. 31 | This function initializes a global Python logger for LLM usage with the provided ui_handler. 32 | 33 | 34 | Args: 35 | ui_handler: UI handler that will receive the LLM logs. 36 | level: logging level. Default is logging.DEBUG. 37 | 38 | Returns: 39 | A logger that calls the ui_handler on every log line. 40 | """ 41 | logger = _get_logger(ui_handler, session_id, LLM_LOGGER_NAME, level=level) 42 | return logger 43 | 44 | 45 | def get_llm_log_handler(logger: logging.Logger) -> LlmLoggingHandler: 46 | """ 47 | NOTE: Call this function only once per thread in the application. 48 | This function creates an LlmLoggingHandler object that captures LLM outputs. 49 | 50 | Args: 51 | logger: the logger to use for capturing LLM outputs. 52 | 53 | Returns: 54 | A handler object that captures LLM outputs and passes it to the UI handler. 55 | This handler needs to be passed to LangChain. 56 | """ 57 | handler = LlmLoggingHandler(logger=logger) 58 | return handler 59 | 60 | 61 | def get_technical_logger( 62 | ui_handler: LogToUiHandler, session_id: str, level: int = logging.DEBUG 63 | ) -> logging.Logger: 64 | """ 65 | NOTE: Call this function only once per thread in the application. 66 | This function initializes a global Python logger for technical usage with the provided ui_handler. 67 | 68 | Returns 69 | A logger that calls the ui_handler on every log line. 
70 | """ 71 | logger = _get_logger(ui_handler, TECHNICAL_LOGGER_NAME, session_id, level=level) 72 | logger.addHandler(logging.StreamHandler()) 73 | return logger 74 | 75 | 76 | def _get_logger( 77 | ui_handler: LogToUiHandler, 78 | logger_name: str, 79 | session_id: str, 80 | level: int = logging.DEBUG, 81 | ) -> logging.Logger: 82 | """ 83 | NOTE: Call this function only once per thread in the application. 84 | This function initializes a global Python logger with the provided logger_name. 85 | 86 | Args: 87 | ui_handler: UI handler that will receive the logs. 88 | logger_name: name of the logger. 89 | level: logging level. Default is logging.DEBUG. 90 | 91 | Returns: 92 | A logger that calls the ui_handler on every log line. 93 | """ 94 | ui_handler.addFilter(SessionLogFilter(session_id)) 95 | logger = logging.getLogger(f"{session_id}.{logger_name}") 96 | logger.addHandler(ui_handler) 97 | logger.setLevel(level) 98 | return logger 99 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/helpers/logger/log_to_ui_handler.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module includes a handler to integrate log messages into UI components, e.g.: Streamlit. 3 | """ 4 | import logging 5 | from typing import Callable, Union 6 | 7 | 8 | class LogToUiHandler(logging.Handler): 9 | """ 10 | Python custom logger to send log events to a list of callbacks. 11 | This is intended to integrate logs into UI components. 12 | """ 13 | 14 | def __init__(self, callback: Union[Callable[[str], None], None] = None) -> None: 15 | super().__init__() 16 | logging.Handler.__init__(self=self) 17 | self._messages: list[str] = [] 18 | self._callback = callback 19 | 20 | def emit(self, record: logging.LogRecord) -> None: 21 | """ 22 | Emit a record by storing it in the messages list it can be rendered afterwards. 23 | """ 24 | self._callback(record.getMessage()) 25 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/helpers/urls.py: -------------------------------------------------------------------------------- 1 | """ Helper for dealing with URLs. """ 2 | import urllib 3 | 4 | 5 | def is_url(potential_url): 6 | """Return whether the string is a URL. 7 | 8 | Args: 9 | potential_url: the string to check. 10 | 11 | Returns: 12 | True if the string is a URL, False otherwise. 13 | """ 14 | return urllib.parse.urlparse(potential_url).scheme != "" 15 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/i18n/__init__.py: -------------------------------------------------------------------------------- 1 | """ Internationalization module exports. """ 2 | from .internationalization import install_language 3 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/i18n/chatbot.pot: -------------------------------------------------------------------------------- 1 | # Translations template for LLM APP Genie. 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # This file is distributed under the same license as the LLM App Genie project. 4 | # Malte Reimann malterei@amazon.com, 2023. 
5 | #, fuzzy 6 | msgid "" 7 | msgstr "" 8 | "Project-Id-Version: LLM App Genie 1.2.1\n" 9 | "Report-Msgid-Bugs-To: malterei@amazon.com\n" 10 | "POT-Creation-Date: 2023-08-17 20:02+0200\n" 11 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 12 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" 13 | "Language-Team: LANGUAGE <LL@li.org>\n" 14 | "MIME-Version: 1.0\n" 15 | "Content-Type: text/plain; charset=utf-8\n" 16 | "Content-Transfer-Encoding: 8bit\n" 17 | "Generated-By: Babel 2.12.1\n" 18 | 19 | #: src/chatbot/__main__.py:53 20 | msgid "{chatbot_name} - An LLM-powered app for your custom data" 21 | msgstr "" 22 | 23 | #: src/chatbot/__main__.py:54 24 | msgid "" 25 | "Made with ❤️ by your AWS WWSO AIML EMEA and Swiss SA Team (and Amazon " 26 | "CodeWhisperer 🤫)" 27 | msgstr "" 28 | 29 | #: src/chatbot/__main__.py:55 30 | msgid "LLM-powered app built using:" 31 | msgstr "" 32 | 33 | #: src/chatbot/__main__.py:155 34 | msgid "Reset Session" 35 | msgstr "" 36 | 37 | #: src/chatbot/__main__.py:176 38 | msgid "Ask a question or prompt the LLM" 39 | msgstr "" 40 | 41 | #: src/chatbot/ui/chat_messages.py:55 42 | msgid "Hi, I'm {chatbot_name}. How may I help you?" 43 | msgstr "" 44 | 45 | #: src/chatbot/ui/chat_messages.py:58 46 | msgid "The information source is {retriever_name}." 47 | msgstr "" 48 | 49 | #: src/chatbot/ui/chat_messages.py:59 50 | msgid "You are chatting with {model_name}." 51 | msgstr "" 52 | 53 | #: src/chatbot/ui/sidebar.py:50 54 | msgid "Knowledge Base" 55 | msgstr "" 56 | 57 | #: src/chatbot/ui/sidebar.py:54 58 | msgid "Language Model" 59 | msgstr "" 60 | 61 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/i18n/de_DE/LC_MESSAGES/chatbot.po: -------------------------------------------------------------------------------- 1 | # German (Germany) translations for LLM App Genie. 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # This file is distributed under the same license as the LLM App Genie project. 4 | # Malte Reimann <malterei@amazon.com>, 2023. 5 | msgid "" 6 | msgstr "" 7 | "Project-Id-Version: v1.3.0\n" 8 | "Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" 9 | "POT-Creation-Date: 2023-08-17 20:02+0200\n" 10 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" 11 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" 12 | "Language: de_DE\n" 13 | "Language-Team: de_DE <LL@li.org>\n" 14 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 15 | "MIME-Version: 1.0\n" 16 | "Content-Type: text/plain; charset=utf-8\n" 17 | "Content-Transfer-Encoding: 8bit\n" 18 | "Generated-By: Babel 2.12.1\n" 19 | 20 | #: src/chatbot/__main__.py:53 21 | msgid "{chatbot_name} - An LLM-powered app for your custom data" 22 | msgstr "{chatbot_name} - Eine LLM-powered App für Ihre Daten" 23 | 24 | #: src/chatbot/__main__.py:54 25 | msgid "" 26 | "Made with ❤️ by your AWS WWSO AIML EMEA and Swiss SA Team (and Amazon " 27 | "CodeWhisperer 🤫)" 28 | msgstr "Entwickelt mit ❤️ von Ihrem AWS WWSO AIML EMEA und Swiss SA Team (und Amazon CodeWhisperer 🤫)" 29 | 30 | #: src/chatbot/__main__.py:55 31 | msgid "LLM-powered app built using:" 32 | msgstr "LLM-powered App entwickelt mit:" 33 | 34 | #: src/chatbot/__main__.py:155 35 | msgid "Reset Session" 36 | msgstr "Session zurücksetzen" 37 | 38 | #: src/chatbot/__main__.py:176 39 | msgid "Ask a question or prompt the LLM" 40 | msgstr "Stelle eine Frage oder gib Anweisungen an das LLM" 41 | 42 | #: src/chatbot/ui/chat_messages.py:55 43 | msgid "Hi, I'm {chatbot_name}. How may I help you?" 44 | msgstr "Hallo, ich bin {chatbot_name}. Wie kann ich Ihnen helfen?"
45 | 46 | #: src/chatbot/ui/chat_messages.py:58 47 | msgid "The information source is {retriever_name}." 48 | msgstr "Die Informationsquelle ist {retriever_name}." 49 | 50 | #: src/chatbot/ui/chat_messages.py:59 51 | msgid "You are chatting with {model_name}." 52 | msgstr "Sie chatten mit {model_name}." 53 | 54 | #: src/chatbot/ui/sidebar.py:50 55 | msgid "Knowledge Base" 56 | msgstr "Informationsquelle" 57 | 58 | #: src/chatbot/ui/sidebar.py:54 59 | msgid "Language Model" 60 | msgstr "" 61 | 62 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/i18n/en_US/LC_MESSAGES/chatbot.po: -------------------------------------------------------------------------------- 1 | # English (United States) translations for LLM App Genie. 2 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # This file is distributed under the same license as the LLM App Genie project. 4 | # Malte Reimann <malterei@amazon.com>, 2023. 5 | msgid "" 6 | msgstr "" 7 | "Project-Id-Version: LLM App Genie 0.0.1\n" 8 | "Report-Msgid-Bugs-To: malterei@amazon.com\n" 9 | "POT-Creation-Date: 2023-08-17 20:02+0200\n" 10 | "PO-Revision-Date: 2023-08-17 20:15+0200\n" 11 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" 12 | "Language: en_US\n" 13 | "Language-Team: en_US <LL@li.org>\n" 14 | "Plural-Forms: nplurals=2; plural=(n != 1);\n" 15 | "MIME-Version: 1.0\n" 16 | "Content-Type: text/plain; charset=utf-8\n" 17 | "Content-Transfer-Encoding: 8bit\n" 18 | "Generated-By: Babel 2.12.1\n" 19 | 20 | #: src/chatbot/__main__.py:53 21 | msgid "{chatbot_name} - An LLM-powered app for your custom data" 22 | msgstr "" 23 | 24 | #: src/chatbot/__main__.py:54 25 | msgid "" 26 | "Made with ❤️ by your AWS WWSO AIML EMEA and Swiss SA Team (and Amazon " 27 | "CodeWhisperer 🤫)" 28 | msgstr "" 29 | 30 | #: src/chatbot/__main__.py:55 31 | msgid "LLM-powered app built using:" 32 | msgstr "" 33 | 34 | #: src/chatbot/__main__.py:155 35 | msgid "Reset Session" 36 | msgstr "" 37 | 38 | #: src/chatbot/__main__.py:176 39 | msgid "Ask a question or prompt the LLM" 40 | msgstr "" 41 | 42 | #: src/chatbot/ui/chat_messages.py:55 43 | msgid "Hi, I'm {chatbot_name}. How may I help you?" 44 | msgstr "" 45 | 46 | #: src/chatbot/ui/chat_messages.py:58 47 | msgid "The information source is {retriever_name}." 48 | msgstr "" 49 | 50 | #: src/chatbot/ui/chat_messages.py:59 51 | msgid "You are chatting with {model_name}." 52 | msgstr "" 53 | 54 | #: src/chatbot/ui/sidebar.py:50 55 | msgid "Knowledge Base" 56 | msgstr "" 57 | 58 | #: src/chatbot/ui/sidebar.py:54 59 | msgid "Language Model" 60 | msgstr "" 61 | 62 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/i18n/internationalization.py: -------------------------------------------------------------------------------- 1 | """ Internationalization of text to other locales. """ 2 | import gettext 3 | import logging 4 | import os 5 | 6 | 7 | def install_language(locale: str): 8 | """Installs a given locale and returns the gettext translation function for that language. 9 | 10 | Args: 11 | locale: The locale to install. 12 | 13 | Returns: 14 | gettext.gettext: The gettext translation function for the given locale.
15 | 16 | Example: 17 | ```python 18 | gettext = install_language("en_US") 19 | _ = gettext 20 | _("translates string") 21 | ``` 22 | """ 23 | i18n_dir = os.path.dirname(__file__) 24 | try: 25 | lang = gettext.translation("chatbot", localedir=i18n_dir, languages=[locale]) 26 | lang.install() 27 | 28 | return lang.gettext 29 | except FileNotFoundError: 30 | logging.info("Running without internationalization") 31 | return gettext.gettext 32 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/json_schema/Readme: -------------------------------------------------------------------------------- 1 | # JSON Schema for App Config 2 | The [JSON schemas](http://json-schema.org/) in the `json_schema` folder describe the structure for configuring the AWSomeChat application using an `appconfig.json` file in the root folder of the application. 3 | 4 | Use the `$schema` property in the `appconfig.json` file to point to the [aws_awsomechat_app_config.schema.json](./aws_awsomechat_app_config.schema.json) JSON schema. Here is an example of an `appconfig.json` file: 5 | ```json 6 | { 7 | "$schema": "./json_schema/aws_awsomechat_app_config.schema.json", 8 | "appearance": { 9 | "type": "AWSomeChatAppearance", 10 | "parameters": { 11 | "name": "My Chat", 12 | "faviconUrl": "chatbot/aws.png" 13 | } 14 | } 15 | } 16 | ``` 17 | 18 | ## Why 19 | JSON schema describes and validates the format of a JSON document. This enhances the developer experience when configuring the AWSomeChat application using the `appconfig.json` file. We also use the JSON schema to automatically generate code that parses the `appconfig.json` file. 20 | 21 | ## Development 22 | The entire `json_schema/` directory can be excluded from any build artifacts (e.g. the Docker container) because the application does not use it at run time. The JSON schema is used during development. 23 | 24 | [Quicktype](https://quicktype.io/) is the tool to generate the code that parses the `appconfig.json` file at runtime. To install Quicktype run `npm install -g quicktype`. 25 | 26 | You can modify [aws_awsomechat_app_config.schema.json](./aws_awsomechat_app_config.schema.json) to expand the schema. For backwards compatibility we should only extend and not delete/modify the existing schema if possible. After making changes to [aws_awsomechat_app_config.schema.json](./aws_awsomechat_app_config.schema.json) you can generate the code with the following command: 27 | ```bash 28 | quicktype -o app_config_change.py -t AppConfig -s schema --src ./json_schema/aws_awsomechat_app_config.schema.json --telemetry disable --python-version 3.7 29 | ``` 30 | Now you can take the code from the `app_config_change.py` file and integrate the changes into the [app_config.py](../app_config.py) file. Do not replace all the content in `app_config.py` with the new code because `app_config.py` contains manual changes. Add the new code that is the result of your schema additions to the existing code.
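Beyond editor support, you can also check a configuration against the schema programmatically before starting the app. Below is a minimal sketch, assuming the third-party `jsonschema` package is installed (it is not a dependency of this project) and that the paths match your checkout:
```python
import json
from pathlib import Path

from jsonschema import validate  # pip install jsonschema

# Load the schema and the configuration to check.
schema = json.loads(
    Path("src/chatbot/json_schema/aws_awsomechat_app_config.schema.json").read_text()
)
config = json.loads(Path("appconfig.json").read_text())

# Raises jsonschema.exceptions.ValidationError with a descriptive
# message if appconfig.json does not match the schema.
validate(instance=config, schema=schema)
```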
31 | 32 | 33 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/open_search/__init__.py: -------------------------------------------------------------------------------- 1 | """ This module contains integration with OpenSearch.""" 2 | from .open_search_index_retriever import OpenSearchIndexRetriever, get_credentials, get_open_search_index_list 3 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/ai21_jurassic_chat.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - chat_history 4 | - input 5 | template: | 6 | The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know. 7 | 8 | Chat history: 9 | {chat_history} 10 | 11 | Question: {input} 12 | 13 | Answer: 14 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/ai21_jurassic_rag.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - context 4 | - question 5 | template: | 6 | You are an assistant that helps answer questions about businesses and institutions. 7 | You receive a question and a piece of context that may contain recent news, rules, procedures, and other types of documentation. 8 | That question and its context may be written in mixed languages. 9 | Use only the information in the context to answer the question. If you don't know the answer, just say that you don't know, don't try to make up an answer. 10 | 11 | 12 | Question: {question} 13 | 14 | Context: {context} 15 | 16 | Answer: 17 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/anthropic_claude_agent_financial_analyzer.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - agent_scratchpad 4 | - input 5 | template: | 6 | Human: You are a financial advisor. Give stock recommendations for the given query based on the following instructions. Don't be shy and always answer the questions. 7 | 8 | Every time, first identify the company name and get the stock ticker symbol for the stock using the Search tool. 9 | Answer the following questions as best you can. You have access to the following tools: 10 | 11 | get stock data: Use when you are asked to evaluate or analyze a stock. This will output historic share price data. You should input the stock ticker to it 12 | Search: Use only to get recent stock related news or the stock ticker. Don't use it for any other analysis or task 13 | get recent news: Use this to fetch recent news about stocks 14 | get financial statements: Use this to get the financial statement of the company. With the help of this data the company's historic performance can be evaluated. You should input the stock ticker to it 15 | 16 | 17 | 18 | Note- if you fail in satisfying any of the steps below, don't be shy and just move to the next one 19 | 1) Get the company name and search the ticker using Search. Don't hallucinate; extract the stock ticker as it is from the text. Output- stock ticker 20 | 2) Use the "get stock data" tool to gather stock info.
Output- Stock data 21 | 3) Get the company's historic financial data using "get financial statements". Output- Financial statement 22 | 4) Use the "get recent news" tool to search for the latest stock related news. Output- Stock news 23 | 5) Analyze the stock based on the gathered data and give a detailed analysis for the investment choice. Provide numbers and reasons to justify your answer. Output- Detailed stock analysis 24 | 25 | 26 | Strictly use the following format: 27 | Question: the input question you must answer 28 | Thought: you should always think about what to do; also try to follow the steps mentioned above 29 | Action: the action to take, should be one of [Search, get stock data, get recent news, get financial statements] 30 | Action Input: the input to the action 31 | Observation: the result of the action 32 | ... (this Thought/Action/Action Input/Observation can repeat N times) 33 | Thought: I now know the final answer 34 | Final Answer: the final answer to the original input question 35 | 36 | Question: {input} 37 | 38 | Assistant: 39 | {agent_scratchpad} -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/anthropic_claude_agent_sql.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - input 4 | template: | 5 | Human: You are an SQL query generator. Only provide the required SQL statement as output. 6 | Question: {input} 7 | 8 | Assistant: 9 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/anthropic_claude_chat.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - chat_history 4 | - input 5 | template: | 6 | Human: The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know. 7 | 8 | 9 | {chat_history} 10 | 11 | 12 | {input} 13 | 14 | Assistant: 15 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/anthropic_claude_rag.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - context 4 | - question 5 | template: | 6 | Human: You are an assistant that helps answer questions about businesses and institutions. 7 | You receive a question and a piece of <context> that may contain recent news, rules, procedures, and other types of documentation. 8 | That question and its <context> may be written in mixed languages. 9 | Use only the information in the <context> to answer the question. If you don't know the answer, just say that you don't know, don't try to make up an answer. 10 | <question> 11 | {question} 12 | </question> 13 | <context> 14 | {context} 15 | </context> 16 | 17 | 18 | Assistant: -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/condense_question.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - chat_history 4 | - question 5 | template: | 6 | Given the following conversation in diverse languages and a follow-up question in English, rephrase the follow-up question to be a standalone question that summarizes the context. Summarize the question into one simple query.
7 | 8 | 9 | {chat_history} 10 | 11 | {question} 12 | 13 | Standalone question: 14 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/default_chat.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - chat_history 4 | - input 5 | template: | 6 | The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know. 7 | 8 | Chat History: 9 | {chat_history} 10 | 11 | Human: {input} 12 | AI: 13 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/default_rag.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - context 4 | - question 5 | template: | 6 | The following pieces of context may contain real-time financial and other types of data, use them, including Document Title and Document Excerpt, to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 7 | 8 | {context} 9 | 10 | Question: {question} 11 | Helpful Answer: 12 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/falcon_chat.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - chat_history 4 | - input 5 | template: | 6 | The following is a conversation between a highly knowledgeable and intelligent AI assistant and a human. In the following interactions, Human and AI will converse in natural language, and AI will answer Human's questions. AI was built to be respectful, polite and inclusive. AI will never decline to answer a question, and always attempts to give an answer that Human would be satisfied with. It knows a lot, and always tells the truth. The conversation begins. 7 | 8 | Chat History: 9 | {chat_history} 10 | 11 | [|Human|] {input} 12 | [|AI|] 13 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/falcon_fine_tuned_greetings_rag.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - context 4 | - question 5 | template: | 6 | >>INTRODUCTION<< 7 | The following is a conversation between a highly knowledgeable and intelligent AI assistant and a human. In the following interactions, Human and AI will converse in natural language, and AI will answer Human's questions. 8 | 9 | Keep the answer short and concise. Respond "Unsure about answer" if not sure about the answer. 10 | 11 | Use information from the Document Excerpt to answer the question. 12 | 13 | The conversation between human and AI assistant. 14 | [|AI|] 15 | {context} 16 | [|Human|] {question} 17 | [|AI|] Hello, 18 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/falcon_instruct_rag.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - context 4 | - question 5 | template: | 6 | Answer the question based on the context below. Keep the answer short and concise. Respond "Unsure about answer" if not sure about the answer.
7 | 8 | Context: 9 | {context} 10 | 11 | Question: {question} 12 | Helpful Answer: 13 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/meta_llama2_chat.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - chat_history 4 | - input 5 | template: | 6 | [INST] <<SYS>> 7 | The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know. 8 | Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 9 | 10 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 11 | <</SYS>> 12 | 13 | Chat History: 14 | {chat_history} 15 | 16 | {input} [/INST] 17 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/prompts/meta_llama2_rag.yaml: -------------------------------------------------------------------------------- 1 | _type: prompt 2 | input_variables: 3 | - context 4 | - question 5 | template: | 6 | [INST] <<SYS>> 7 | You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 8 | The following pieces of context may contain real-time financial and other types of data, use them, including Document Title and Document Excerpt, to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 9 | 10 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 11 | <</SYS>> 12 | 13 | <context> 14 | {context} 15 | </context> 16 | Question: {question} 17 | [/INST] 18 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/ui/__init__.py: -------------------------------------------------------------------------------- 1 | """ Module that contains all UI components for the chatbot front-end.""" 2 | from .chatbot_app import write_chatbot 3 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/ui/about_page.py: -------------------------------------------------------------------------------- 1 | """ Display information about the app in the Streamlit settings page.""" 2 | from typing import Callable 3 | 4 | import streamlit as st 5 | 6 | 7 | def about_page( 8 | chatbot_name: str, favicon_url: str, gettext: Callable[[str], str] = lambda x: x 9 | ): 10 | """Configures the Streamlit app. 11 | NOTE: The page config is visible without signing in. DO NOT PUT confidential information in here.
12 | """ 13 | 14 | _ = gettext 15 | page_name = _("{chatbot_name} - An LLM-powered app for your custom data").format( 16 | chatbot_name=chatbot_name 17 | ) 18 | made_by_text = _( 19 | "Made with ❤️ by your AWS WWSO AIML EMEA and Swiss Alps Team (and Amazon CodeWhisperer 🤫)" 20 | ) 21 | powered_by_text = _("LLM-powered app built using:") 22 | st.set_page_config( 23 | page_name, 24 | page_icon=favicon_url, 25 | menu_items={ 26 | # 'Get Help': 'https://www.example.com/', 27 | # 'Report a bug': "mailto:malterei@amazon.com", 28 | "About": f""" 29 | {powered_by_text} 30 | - [Streamlit](https://streamlit.io/) 31 | - [LangChain](https://github.com/hwchase17/langchain) 32 | - [Amazon SageMaker](https://aws.amazon.com/sagemaker/) 33 | - [Amazon Kendra](https://aws.amazon.com/kendra/) 34 | - [Amazon OpenSearch](https://aws.amazon.com/opensearch-service/) 35 | - [Amazon Bedrock](https://aws.amazon.com/bedrock/) 36 | 37 | {made_by_text} 38 | """ 39 | }, 40 | ) 41 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/ui/auth.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | 4 | def check_password(app_name: str): 5 | """Returns `True` if the user had a correct password.""" 6 | 7 | login_screen = st.empty() 8 | 9 | if "username" not in st.session_state: 10 | st.session_state["username"] = "" 11 | 12 | if "passwords" not in st.session_state: 13 | st.session_state["passwords"] = "" 14 | 15 | def password_entered(): 16 | """Checks whether a password entered by the user is correct.""" 17 | password_check_not_active = ( 18 | "username" not in st.session_state or "password" not in st.session_state 19 | ) 20 | if password_check_not_active: 21 | return 22 | 23 | username_and_password_correct = ( 24 | st.session_state["username"] in st.secrets["passwords"] 25 | and st.session_state["password"] 26 | == st.secrets["passwords"][st.session_state["username"]] 27 | ) 28 | if username_and_password_correct: 29 | st.session_state["password_correct"] = True 30 | del st.session_state["password"] # don't store username + password 31 | del st.session_state["username"] 32 | else: 33 | st.session_state["password_correct"] = False 34 | 35 | need_to_check_password = ( 36 | "password_correct" not in st.session_state 37 | or not st.session_state["password_correct"] 38 | ) 39 | if need_to_check_password: 40 | with login_screen.container(): 41 | st.title(app_name) 42 | st.text_input("Username", key="username", on_change=password_entered) 43 | st.text_input( 44 | "Password", type="password", key="password", on_change=password_entered 45 | ) 46 | username = st.session_state["username"] 47 | password = st.session_state["password"] 48 | 49 | entered_username_password_not_correct = ( 50 | "password_correct" in st.session_state 51 | and not st.session_state["password_correct"] 52 | and username 53 | and password 54 | ) 55 | if entered_username_password_not_correct: 56 | # Only show if customer already entered username + password 57 | st.error("😕 User not known or password incorrect") 58 | return False 59 | else: 60 | # Password correct. 
61 | login_screen.empty() 62 | 63 | return True 64 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/ui/stream_handler.py: -------------------------------------------------------------------------------- 1 | from langchain.callbacks.base import BaseCallbackHandler 2 | 3 | 4 | # reference https://github.com/streamlit/StreamlitLangChain/blob/main/streaming_demo.py 5 | class StreamHandler(BaseCallbackHandler): 6 | """LangChain callback handler that forwards each streamed LLM token to a UI callback.""" 7 | 8 | def __init__(self, callback, initial_text=""): 9 | self.llm_callback = callback 10 | self.text = initial_text 11 | 12 | def on_llm_new_token(self, token: str, **kwargs): 13 | self.text += token 14 | self.llm_callback(self.text) 15 | -------------------------------------------------------------------------------- /03_chatbot/src/chatbot/ui/topbar.py: -------------------------------------------------------------------------------- 1 | """ UI component that shows session information and session reset button.""" 2 | from typing import Callable 3 | 4 | import streamlit as st 5 | from streamlit_extras.colored_header import colored_header 6 | 7 | 8 | import yaml 9 | 10 | 11 | def write_top_bar(session_id: str, on_refresh: Callable[[], None], gettext): 12 | """Write streamlit elements to show in the top bar. 13 | This is stateless. 14 | 15 | Args: 16 | session_id: Session ID. 17 | on_refresh: Callback to run when the session is reset. 18 | gettext: Translator function. 19 | """ 20 | _ = gettext 21 | col1, col2 = st.columns([4, 1]) 22 | 23 | with col1: 24 | st.markdown(f"#### Session ID: {session_id}") 25 | 26 | with col2: 27 | reset_session_text = _("Reset Session") 28 | if st.button(reset_session_text): 29 | on_refresh() 30 | colored_header(label="", description="", color_name="blue-30") 31 | 32 | def write_prompt_hints(sidebar, config): 33 | """Write prompt hints to show in the top bar. 34 | Args: 35 | sidebar: Streamlit sidebar object. 
36 | """ 37 | 38 | flows = config.flow_config.parameters.flows 39 | # with open(config.appearance.parameters.prompt_config_path, 'r') as yaml_file: 40 | # hints = yaml.safe_load(yaml_file) 41 | 42 | # TODO: this part should be moved to CDK 43 | # TODO: existing table should be cleaned 44 | # table_name = "genie-prompt-hints" 45 | 46 | # # TODO: Moved to the data part to convert to Dynamo DB in the next phase 47 | # dynamodb = boto3.resource('dynamodb') 48 | # table = dynamodb.Table(table_name) 49 | 50 | # # Insert data into DynamoDB table 51 | # for flow, flow_hints in hints.items(): 52 | # for knowledge_base, data_sources in flow_hints.items(): 53 | # for data_source, prompt_hints in data_sources.items(): 54 | # table.put_item(Item={'hints': prompt_hints, 'knowledge_base': flow + "|" + knowledge_base, 'data_source': data_source}) 55 | 56 | # Define your query parameters 57 | flow = sidebar.flow.friendly_name 58 | knowledge_base = sidebar.retriever.friendly_name if sidebar.retriever else None 59 | 60 | data_sources = [] 61 | if hasattr(sidebar.retriever, "_selected_data_sources"): 62 | data_sources += [opt[0] for opt in sidebar.retriever._selected_data_sources] 63 | 64 | # Checking for the available hints based on user selection 65 | results = [] 66 | if flow in flows: 67 | if "hints" in flows[flow]: # MM:: why flows is double, same down 68 | results += flows[flow]["hints"] 69 | elif knowledge_base and knowledge_base in flows[flow]: # MM:: why flows is double 70 | if "hints" in flows[flow][knowledge_base]: 71 | results += flows[flow][knowledge_base]["hints"] 72 | for data_source in data_sources: 73 | if data_source in flows[flow][knowledge_base]: 74 | results += flows[flow][knowledge_base][data_source]["hints"] 75 | 76 | # TODO: this part should be moved to CDK 77 | # # Define the query condition expression and expression attribute values 78 | # expression = "knowledge_base = :kb and data_source = :ds" 79 | 80 | # for data_source in data_sources: 81 | # expression_attr_values = {":kb": flow + "|" + knowledge_base, ":ds": data_source} 82 | 83 | # # Query the DynamoDB table 84 | # response = table.query( 85 | # TableName=table_name, 86 | # KeyConditionExpression=expression, 87 | # ExpressionAttributeValues=expression_attr_values 88 | # ) 89 | 90 | # if response["Items"]: 91 | # results += response["Items"][0]["hints"] 92 | 93 | if not results: 94 | return None 95 | 96 | # converting hints into buttongs 97 | pairs = zip(results, st.columns(len(results))) 98 | prompt = None 99 | for i, (text, col) in enumerate(pairs): 100 | if col.button(text["name"], f'''{text["name"]}-{i}'''): 101 | prompt = text['prompt'] 102 | 103 | if prompt: 104 | return prompt 105 | -------------------------------------------------------------------------------- /03_chatbot/src/icons/X-Ray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/03_chatbot/src/icons/X-Ray.png -------------------------------------------------------------------------------- /03_chatbot/src/run_module.py: -------------------------------------------------------------------------------- 1 | """ Runs the chatbot module. 2 | This is necessary to run a Streamlit app that is packaged a Python module.""" 3 | import runpy 4 | 5 | # Make streamlit run our chatbot module. 
6 | # See also https://github.com/streamlit/streamlit/issues/662#issuecomment-553356419 7 | 8 | runpy.run_module("chatbot", run_name="__main__", alter_sys=True) 9 | -------------------------------------------------------------------------------- /03_chatbot/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # conftest.py: attach a Playwright screenshot and page URL to the pytest-html report for failed tests 2 | from pathlib import Path 3 | 4 | import pytest 5 | from slugify import slugify 6 | 7 | @pytest.hookimpl(hookwrapper=True) 8 | def pytest_runtest_makereport(item, call): 9 | pytest_html = item.config.pluginmanager.getplugin("html") 10 | outcome = yield 11 | screen_file = '' 12 | report = outcome.get_result() 13 | extra = getattr(report, "extra", []) 14 | if report.when == "call": 15 | if report.failed and "page" in item.funcargs: 16 | page = item.funcargs["page"] 17 | extra.append(pytest_html.extras.url(page.url)) 18 | screenshot_dir = Path("screenshots") 19 | screenshot_dir.mkdir(exist_ok=True) 20 | screen_file = str(screenshot_dir / f"{slugify(item.nodeid)}.png") 21 | page.screenshot(path=screen_file) 22 | xfail = hasattr(report, "wasxfail") 23 | if screen_file and ((report.skipped and xfail) or (report.failed and not xfail)): 24 | # add the screenshots to the html report 25 | extra.append(pytest_html.extras.png(screen_file)) 26 | report.extra = extra 27 | -------------------------------------------------------------------------------- /03_chatbot/tests/test_chat_end_to_end.py: -------------------------------------------------------------------------------- 1 | """End-to-end browser tests for the chatbot login and chat flow.""" 2 | import time 3 | import pytest 4 | 5 | from playwright.sync_api import Page, expect 6 | 7 | import os 8 | 9 | admin_password = os.environ["admin_password"] 10 | host_url = os.environ["host_url"] 11 | 12 | @pytest.mark.browser_context_args(ignore_https_errors=True) 13 | def test_who_are_you_claude(page: Page) -> None: 14 | """ 15 | Tests that the model confirms it is a large language model. 16 | """ 17 | page.goto(host_url) 18 | page.get_by_label("Username").click() 19 | page.get_by_label("Username").fill("admin") 20 | page.get_by_label("Password", exact=True).click() 21 | page.get_by_label("Password", exact=True).fill(admin_password) 22 | page.get_by_label("Password", exact=True).press("Enter") 23 | page.get_by_test_id("stChatInputTextArea").click() 24 | page.get_by_test_id("stChatInputTextArea").fill("Please only respond with true or false. 
You are a large language model.") 25 | page.get_by_test_id("stChatInputTextArea").press("Enter") 26 | time.sleep(2) 27 | expect(page.get_by_test_id("stChatMessage").last).to_contain_text("true", ignore_case=True) 28 | 29 | @pytest.mark.browser_context_args(ignore_https_errors=True) 30 | def test_who_are_you_fail(page: Page) -> None: 31 | """ 32 | Tests who are you with claude 33 | """ 34 | page.goto(host_url) 35 | page.get_by_label("Username").click() 36 | page.get_by_label("Username").fill("admin") 37 | page.get_by_label("Password", exact=True).click() 38 | page.get_by_label("Password", exact=True).fill(admin_password) 39 | page.get_by_label("Password", exact=True).press("Enter") 40 | page.get_by_test_id("stChatInputTextArea").click() 41 | page.get_by_test_id("stChatInputTextArea").fill("Who are you?") 42 | page.get_by_test_id("stChatInputTextArea").press("Enter") 43 | time.sleep(2) 44 | locator = page.get_by_test_id("stChatMessage").last 45 | expect(locator).to_be_visible() 46 | text = locator.text_content() 47 | assert "Cohere" not in text 48 | -------------------------------------------------------------------------------- /04_finetuning/train_llms_with_qlora/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/transformers.git@2ab75add4b30c2fc44a8bf575156d448d9ed87a7 2 | peft==0.4.0 3 | accelerate==0.21.0 4 | bitsandbytes==0.40.2 5 | safetensors>=0.3.1 6 | tokenizers>=0.13.3 -------------------------------------------------------------------------------- /05_doc/app-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/05_doc/app-screenshot.png -------------------------------------------------------------------------------- /05_doc/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/05_doc/architecture.png -------------------------------------------------------------------------------- /05_doc/companion_architecture_simple.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/05_doc/companion_architecture_simple.drawio.png -------------------------------------------------------------------------------- /05_doc/deployment-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/05_doc/deployment-overview.png -------------------------------------------------------------------------------- /06_automation/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "poetry run python app.py", 3 | "watch": { 4 | "include": ["**"], 5 | "exclude": [ 6 | "README.md", 7 | "cdk*.json", 8 | "requirements*.txt", 9 | "source.bat", 10 | "**/__init__.py", 11 | "python/__pycache__", 12 | "tests" 13 | ] 14 | }, 15 | "context": { 16 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 17 | "@aws-cdk/core:checkSecretUsage": true, 18 | "@aws-cdk/core:target-partitions": ["aws", "aws-cn"], 19 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 20 | 
"@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 21 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 22 | "@aws-cdk/aws-iam:minimizePolicies": true, 23 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 24 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 25 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 26 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 27 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 28 | "@aws-cdk/core:enablePartitionLiterals": true, 29 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 30 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 31 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 32 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 33 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 34 | "@aws-cdk/aws-route53-patters:useCertificate": true, 35 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false, 36 | "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, 37 | "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, 38 | "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, 39 | "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, 40 | "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, 41 | "@aws-cdk/aws-redshift:columnId": true, 42 | "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, 43 | "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, 44 | "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, 45 | "@aws-cdk/aws-kms:aliasNameRef": true, 46 | "@aws-cdk/core:includePrefixInUniqueNameGeneration": true 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /06_automation/modules/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import boto3 5 | 6 | # ------------------------------------------------------------------------------ 7 | # import environment configuration based on the STAGE 8 | # ------------------------------------------------------------------------------ 9 | # Load the Environment Configuration from the JSON file 10 | with open( 11 | "configs/" + (os.environ["STAGE"] if "STAGE" in os.environ else "dev") + ".json", 12 | "r", 13 | encoding="utf8" 14 | ) as file: 15 | config = json.load(file) 16 | 17 | # Adding environment and prefix 18 | config["appPrefix"] = os.environ["CDK_APP_PREFIX"] if "CDK_APP_PREFIX" in os.environ else "Genie" 19 | config["appPrefixLowerCase"] = config["appPrefix"].lower() 20 | 21 | 22 | # adding CODEBUILD_BUILD_NUMBER to the version tag 23 | config["globalTags"]["version"] += "." 
+ ( 24 | os.environ["CODEBUILD_BUILD_NUMBER"] 25 | if "CODEBUILD_BUILD_NUMBER" in os.environ 26 | else "" 27 | ) 28 | 29 | # add application prefix to all tags 30 | global_tags = {} 31 | for key, value in config["globalTags"].items(): 32 | key = config["appPrefixLowerCase"] + ":" + key 33 | global_tags[key] = value 34 | 35 | config["globalTags"] = global_tags 36 | 37 | quotas_client = boto3.client( 38 | "service-quotas", region_name=os.environ["CDK_DEFAULT_REGION"] 39 | ) 40 | quotas = {  # service quota codes for the ml instance types used by this solution 41 | "ml.g5.12xlarge": "L-65C4BD00", 42 | "ml.g5.48xlarge": "L-0100B823", 43 | "ml.g4dn.xlarge": "L-B67CFA0C", 44 | } 45 | -------------------------------------------------------------------------------- /06_automation/modules/kendra/__init__.py: -------------------------------------------------------------------------------- 1 | from .kendra_data_source import KendraDataSource -------------------------------------------------------------------------------- /06_automation/modules/kendra/data_source_is_complete_lambda/function.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lambda function that implements an is complete check on a Kendra Data Source custom resource. 3 | """ 4 | 5 | import boto3 6 | 7 | kendra = boto3.client('kendra') 8 | 9 | def lambda_handler(event, context): 10 | physical_id = event["PhysicalResourceId"] 11 | 12 | is_complete = False 13 | 14 | request_type = event['RequestType'].lower() 15 | props = event['ResourceProperties'] 16 | 17 | exception = None 18 | response = None 19 | 20 | try: 21 | response = kendra.describe_data_source( 22 | Id=physical_id, 23 | IndexId=props['index_id'] 24 | ) 25 | except kendra.exceptions.ResourceNotFoundException as ex: 26 | exception = ex 27 | 28 | if request_type == 'create': 29 | 30 | if isinstance(exception, kendra.exceptions.ResourceNotFoundException): 31 | is_complete = False 32 | else: 33 | if response is not None: 34 | is_complete = response['Status'] == "ACTIVE" 35 | 36 | if request_type == 'update': 37 | if isinstance(exception, kendra.exceptions.ResourceNotFoundException): 38 | is_complete = True 39 | else: 40 | if response is not None: 41 | is_complete = response['Status'] != "UPDATING" 42 | if request_type == 'delete': 43 | if isinstance(exception, kendra.exceptions.ResourceNotFoundException): 44 | is_complete = True 45 | else: 46 | is_complete = False 47 | 48 | return { 'IsComplete': is_complete } 49 | 50 | -------------------------------------------------------------------------------- /06_automation/modules/kendra/data_source_lambda/function.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lambda function that implements a Kendra Data Source custom resource that 3 | creates a Kendra Data Source. 4 | """ 5 | import boto3 6 | import json 7 | 8 | kendra = boto3.client("kendra") 9 | 10 | 11 | def lambda_handler(event, context): 12 | """ 13 | Lambda function handler that implements the actions when CloudFormation 14 | requests a resource Create/Update/Delete. 
15 | """ 16 | props = event["ResourceProperties"] 17 | request_type = event["RequestType"] 18 | tags = props["tags"] 19 | 20 | config = json.loads(props["config"]) 21 | 22 | if request_type == "Create": 23 | data_source_id = create_data_source(name=props["name"], index_id=props["index_id"], config=config, role_arn=props["role_arn"], tags=tags) 24 | elif request_type == "Update": 25 | # Deletes and regenerates a self-signed certificate 26 | data_source_id = event["PhysicalResourceId"] 27 | index_id = props["index_id"] 28 | region = props["region"] 29 | account_id = props["account_id"] 30 | 31 | data_source_arn = f"arn:aws:kendra:{region}:{account_id}:index/{index_id}/data-source/{data_source_id}" 32 | 33 | update_data_source(name=props["name"], index_id=index_id, config=config, data_source_id=data_source_id) 34 | update_tags(data_source_arn=data_source_arn, tags=tags) 35 | 36 | 37 | else: # Delete 38 | data_source_id = event["PhysicalResourceId"] 39 | delete_data_source(props["index_id"], data_source_id) 40 | 41 | output = { 42 | 'PhysicalResourceId': data_source_id 43 | } 44 | 45 | return output 46 | 47 | 48 | def create_data_source(name: str, index_id: str, config, role_arn: str, tags: list[dict[str, str]]) -> str: 49 | """ 50 | Creates a data source in Kendra. 51 | """ 52 | response = kendra.create_data_source( 53 | Name=name, 54 | IndexId=index_id, 55 | Type="TEMPLATE", 56 | Configuration=config, 57 | RoleArn=role_arn, 58 | Tags=tags 59 | ) 60 | return response["Id"] 61 | 62 | def update_data_source(name: str, index_id: str, config, data_source_id: str) -> None: 63 | """ 64 | Updates a data source in Kendra. 65 | """ 66 | kendra.update_data_source( 67 | Id=data_source_id, 68 | Configuration=config, 69 | Name=name, 70 | IndexId=index_id 71 | ) 72 | 73 | def delete_data_source(index_id: str, data_source_id: str) -> None: 74 | """ 75 | Deletes a data source in Kendra. 76 | """ 77 | kendra.delete_data_source( 78 | Id=data_source_id, 79 | IndexId=index_id 80 | ) 81 | 82 | 83 | def update_tags(data_source_arn: str, tags: list[dict[str, str]]) -> None: 84 | """ 85 | Updates the tags of a data source in Kendra. 
86 | """ 87 | response = kendra.list_tags_for_resource( 88 | ResourceARN=data_source_arn 89 | ) 90 | current_tags = response["Tags"] 91 | tag_keys = [tag["Key"] for tag in current_tags] 92 | new_tag_keys = [tag["Key"] for tag in tags] 93 | keys_to_delete = list(set(tag_keys) - set(new_tag_keys)) 94 | 95 | if len(keys_to_delete) > 0: 96 | kendra.untag_resource( 97 | ResourceARN=data_source_arn, 98 | TagKeys=keys_to_delete 99 | ) 100 | 101 | if len(tags) > 0: 102 | kendra.tag_resource( 103 | ResourceARN=data_source_arn, 104 | Tags=tags 105 | ) -------------------------------------------------------------------------------- /06_automation/modules/ssm_parameter_reader.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from datetime import datetime 3 | 4 | from aws_cdk import custom_resources as custom_rsrc 5 | from constructs import Construct 6 | 7 | 8 | @dataclass 9 | class SSMParameterReaderProps: 10 | parameter_name: str 11 | region: str 12 | 13 | 14 | class SSMParameterReader(custom_rsrc.AwsCustomResource): 15 | def __init__( 16 | self, scope: Construct, id: str, props: SSMParameterReaderProps 17 | ) -> None: 18 | ssm_aws_sdk_call = custom_rsrc.AwsSdkCall( 19 | service="SSM", 20 | action="getParameter", 21 | parameters={"Name": props.parameter_name}, 22 | region=props.region, 23 | physical_resource_id=custom_rsrc.PhysicalResourceId.of(str(datetime.now())), 24 | ) 25 | 26 | super().__init__( 27 | scope, 28 | id, 29 | on_update=ssm_aws_sdk_call, 30 | policy=custom_rsrc.AwsCustomResourcePolicy.from_sdk_calls( 31 | resources=custom_rsrc.AwsCustomResourcePolicy.ANY_RESOURCE 32 | ), 33 | ) 34 | 35 | def get_parameter_value(self) -> str: 36 | return self.get_response_field("Parameter.Value") 37 | -------------------------------------------------------------------------------- /06_automation/modules/stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import App, NestedStack, Stack, Tags 2 | from modules.config import config 3 | 4 | 5 | class GenAiStack(Stack): 6 | def __init__(self, app: App, id: str, stack, **kwargs) -> None: 7 | # ------------------------------------------------------------------------------ 8 | # Call parent AWS Stack constructor with enhanced name and save the original id 9 | # ------------------------------------------------------------------------------ 10 | super().__init__( 11 | app, config["appPrefix"] + id, description=stack["description"], **kwargs 12 | ) 13 | self.original_id = id 14 | 15 | # ------------------------------------------------------------------------------ 16 | # assign stack level tags to stack 17 | # ------------------------------------------------------------------------------ 18 | for key, value in stack["tags"].items(): 19 | Tags.of(self).add(key, value) 20 | 21 | 22 | class GenAiNestedStack(GenAiStack, NestedStack): 23 | def __init__(self, app: App, id: str, stack, **kwargs) -> None: 24 | # ------------------------------------------------------------------------------ 25 | # Call parent AWS Stack constructor with enhanced name and save the original id 26 | # ------------------------------------------------------------------------------ 27 | super().__init__(app, id, stack, **kwargs) 28 | -------------------------------------------------------------------------------- /06_automation/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "aws-cdk": 
"^2.133.0" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /06_automation/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llm-app-genie-automation" 3 | version = "1" 4 | description = "" 5 | authors = [ 6 | "Arlind Nocaj ", 7 | "Paolo Di Francesco ", 8 | ] 9 | readme = "README.md" 10 | packages = [{ include = "modules" }, { include = "stacks" }] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.10, <4.0" 14 | aws-cdk-lib = "^2.100.0" 15 | boto3 = "^1.28.9" 16 | "aws-cdk.aws-lambda-python-alpha" = "^2.100.0a0" 17 | cryptography = "^42.0.5" 18 | botocore = "^1.34.59" 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | 24 | [tool.poetry.group.test.dependencies] 25 | cdk-nag = ">=2.27.0" 26 | -------------------------------------------------------------------------------- /06_automation/stacks/README.md: -------------------------------------------------------------------------------- 1 | # Automated deployment of the knowledge base 2 | 3 | There are two types of knowledge bases in this solution based on two AWS services: Amazon Kendra and Amazon OpenSearch Service 4 | 5 | ## Deploying Amazon Kendra 6 | 7 | The knowledge base is based on two CDK stacks, one for an [Amazon Kendra](https://docs.aws.amazon.com/kendra/latest/dg/what-is-kendra.html) index and another for its data sources. 8 | 9 | The CDK stack in `kendra_index_stack.py` deploys an Amazon Kendra index and names it after a the customer name, which you can input as a stack parameter. 10 | By default, this Kendra index uses the Developer **Edition version**. This version can index up to **10,000 documents** and run a maximum of **4,000 queries per day**. It costs **$810 per month**. 11 | 12 | The CDK stack in `kendra_data_sources_stack.py` takes an Amazon Kendra index ID as a parameter and adds data sources to it. At the moment, it adds one data source based on the WEBCRAWLERV2 data source type. This type of data source allows advanced configurations such as using authentication. However, it is not supported in AWS CloudFormation, so this stack uses a CDK custom resource which is defined in a CDK construct in `kendra_sitemap_data_source.py`. 13 | 14 | ### KendraSitemapDataSource 15 | 16 | This CDK Construct builds an Amazon Kendra data source that takes a list of site map URLs. These URLs should point to an XML file that map the content of the website. 17 | 18 | For code clarity, the settings of the data source are in the `template_configuration.json` file. You can modify settings there and redeploy the stack for updating your data source. 
The full schema of such settings is available in the [Amazon Kendra Web Crawler template schema documentation](https://docs.aws.amazon.com/kendra/latest/dg/ds-schemas.html#ds-schema-web-crawler). 19 | 20 | ### Deployment instructions 21 | 22 | Run the following command in a shell with the [AWS CDK Toolkit](https://docs.aws.amazon.com/cdk/v2/guide/cli.html) installed: 23 | 24 | ```shell 25 | cdk deploy KendraIndexStack KendraDataSourcesStack --parameters KendraIndexStack:CustomerName=<your-customer-name> --parameters KendraDataSourcesStack:SitemapUrls=https://example1.com/sitemap.xml,https://example2.com/sitemap.xml 26 | ``` 27 | 28 | ## Deploying Amazon OpenSearch Service 29 | 30 | **To deploy the OpenSearch index, follow the instructions in [Deploy the knowledge base](../../README.md#deploy-the-knowledge-base).** 31 | -------------------------------------------------------------------------------- /06_automation/stacks/chatbot/cert_lambda/requirements.txt: -------------------------------------------------------------------------------- 1 | pyOpenSSL 2 | cryptography -------------------------------------------------------------------------------- /06_automation/stacks/deployment_pipeline/buildspec-develop.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | install: 5 | on-failure: ABORT 6 | runtime-versions: 7 | python: 3.11 8 | commands: 9 | # install cdk 10 | - npm install -g aws-cdk 11 | - cdk --version 12 | # install poetry 13 | - curl -sSL https://install.python-poetry.org | python3 - 14 | - export PATH="/root/.local/bin:$PATH" 15 | - cd 06_automation 16 | - poetry install 17 | pre_build: 18 | # FIXME: Delete pipelines that do not get updated on underlying code changes 19 | on-failure: ABORT 20 | commands: 21 | - poetry run cdk destroy $LLM_STACK_NAME $INGESTION_STACK_NAME --force 22 | build: 23 | on-failure: ABORT 24 | commands: 25 | - poetry run cdk deploy GenieDeploymentPipelineStack $LLM_STACK_NAME $INGESTION_STACK_NAME GenieKendraIndexStack GenieKendraDataSourcesStack GenieChatBotStack GeniePrivateOpenSearchDomainStack --require-approval never 26 | -------------------------------------------------------------------------------- /06_automation/stacks/deployment_pipeline/buildspec-main.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | install: 5 | on-failure: ABORT 6 | runtime-versions: 7 | python: 3.11 8 | commands: 9 | # install cdk 10 | - npm install -g aws-cdk 11 | - cdk --version 12 | # install poetry 13 | - curl -sSL https://install.python-poetry.org | python3 - 14 | - export PATH="/root/.local/bin:$PATH" 15 | - cd 06_automation 16 | - poetry install 17 | build: 18 | on-failure: ABORT 19 | commands: 20 | # Only deploys the chatbot stack from the main branch 21 | - export CDK_APP_PREFIX=ProdGenie 22 | - poetry run cdk deploy ProdGenieChatBotStack --require-approval never -------------------------------------------------------------------------------- /06_automation/stacks/kendra_datasources/kendra_datasources_stack.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=line-too-long 2 | # pylint: disable=invalid-name 3 | # pylint: disable=redefined-builtin 4 | # from aws_cdk import ( 5 | # CfnParameter, 6 | # Tags 7 | # ) 8 | 9 | import aws_cdk.aws_kendra as kendra 10 | from constructs import Construct 11 | from modules.config import config 12 | from modules.kendra import KendraDataSource 13 | from 
modules.stack import GenAiStack 14 | 15 | stack = { 16 | "description": f"Kendra Data Source for {config['customer']['name']} website", 17 | "tags": {}, 18 | } 19 | 20 | 21 | class KendraDataSourcesStack(GenAiStack): 22 | """Stack that adds web crawler data sources (seed URLs or site maps) to an existing Kendra index.""" 23 | 24 | def __init__( 25 | self, scope: Construct, construct_id: str, index: kendra.CfnIndex, **kwargs 26 | ) -> None: 27 | super().__init__(scope, construct_id, stack, **kwargs) 28 | index_id = index.ref 29 | index_arn = index.attr_arn 30 | 31 | all_tags = config["globalTags"] | stack["tags"] 32 | 33 | for ds in config["kendra"]["data_sources"]: 34 | urls = ds["TemplateConfiguration"]["Template"]["connectionConfiguration"][ 35 | "repositoryEndpointMetadata" 36 | ]["seedUrlConnections"] 37 | sitemaps = ds["TemplateConfiguration"]["Template"][ 38 | "connectionConfiguration" 39 | ]["repositoryEndpointMetadata"]["siteMapUrls"] 40 | 41 | name = ds["name"] 42 | del ds["name"] 43 | 44 | 45 | if urls is not None and len(urls) > 0: 46 | # transformed_list = [{"seedUrl": url} for url in urls] 47 | del ds["TemplateConfiguration"]["Template"]["connectionConfiguration"][ 48 | "repositoryEndpointMetadata" 49 | ]["siteMapUrls"] 50 | del ds["TemplateConfiguration"]["Template"]["connectionConfiguration"][ 51 | "repositoryEndpointMetadata" 52 | ]["s3SiteMapUrl"] 53 | 54 | KendraDataSource( 55 | self, 56 | name, 57 | index_id=index_id, 58 | index_arn=index_arn, 59 | data_source_name=name, 60 | config=ds, 61 | tags=all_tags 62 | ) 63 | 64 | # crawler accepts either urls or site maps 65 | elif sitemaps is not None and len(sitemaps) > 0: 66 | KendraDataSource( 67 | self, 68 | name, 69 | index_id=index_id, 70 | index_arn=index_arn, 71 | data_source_name=name, 72 | config=ds, 73 | tags=all_tags 74 | ) -------------------------------------------------------------------------------- /06_automation/stacks/opensearch_domain/opensearch_private_vpc_stack.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from aws_cdk import aws_ec2 as ec2 4 | from aws_cdk import aws_ssm as ssm 5 | from constructs import Construct 6 | from modules.ssm_parameter_reader import SSMParameterReader, SSMParameterReaderProps 7 | from modules.stack import GenAiStack 8 | from stacks.shared.vpc_peering_stack import VPCPeeringStack 9 | 10 | stack = { 11 | "description": "VPC for private OpenSearch domains", 12 | "tags": {}, 13 | } 14 | 15 | 16 | @dataclass 17 | class OpenSearchPrivateVPCStackOutput: 18 | vpc: ec2.IVpc 19 | subnet_selection: ec2.SubnetSelection 20 | 21 | 22 | class OpenSearchPrivateVPCStack(GenAiStack): 23 | output_props: OpenSearchPrivateVPCStackOutput 24 | 25 | # def __init__(self, scope: Construct, construct_id: str, config: Config, **kwargs) -> None: 26 | def __init__( 27 | self, 28 | scope: Construct, 29 | construct_id: str, 30 | cidr_range: str = "10.4.0.0/16", 31 | **kwargs, 32 | ) -> None: 33 | super().__init__(scope, construct_id, stack, **kwargs) 34 | 35 | private_subnet = ec2.SubnetConfiguration( 36 | name="Private", subnet_type=ec2.SubnetType.PRIVATE_ISOLATED, cidr_mask=24 37 | ) 38 | 39 | public_subnet = ec2.SubnetConfiguration( 40 | name="Public", subnet_type=ec2.SubnetType.PUBLIC, cidr_mask=24 41 | ) 42 | 43 | private_with_egress_subnet = ec2.SubnetConfiguration( 44 | name="PrivateWithEgress", subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS, cidr_mask=24 45 | ) 46 | 47 | vpc = ec2.Vpc( 48 | scope=self, 49 | id="OpenSearchVPC", 50 | 
ip_addresses=ec2.IpAddresses.cidr(cidr_range), 51 | max_azs=3, 52 | nat_gateway_provider=ec2.NatProvider.gateway(), 53 | nat_gateways=1, 54 | enable_dns_hostnames=True, 55 | enable_dns_support=True, 56 | subnet_configuration=[private_subnet, private_with_egress_subnet, public_subnet], 57 | flow_logs={ 58 | "cloudwatch": ec2.FlowLogOptions( 59 | destination=ec2.FlowLogDestination.to_cloud_watch_logs() 60 | # Use default configuration. See also https://docs.aws.amazon.com/cdk/api/v2/python/aws_cdk.aws_ec2/FlowLogOptions.html#aws_cdk.aws_ec2.FlowLogOptions 61 | ) 62 | } 63 | ) 64 | 65 | opensearch_subnets = vpc.private_subnets + vpc.isolated_subnets 66 | 67 | subnet_selection = ec2.SubnetSelection(subnets=opensearch_subnets) 68 | 69 | 70 | self.output_props = OpenSearchPrivateVPCStackOutput( 71 | vpc=vpc, subnet_selection=subnet_selection 72 | ) 73 | 74 | @property 75 | def output(self) -> OpenSearchPrivateVPCStackOutput: 76 | return self.output_props 77 | -------------------------------------------------------------------------------- /06_automation/stacks/sagemaker_studio_domain/sagemaker_studio_domain_stack.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | import aws_cdk.aws_ec2 as ec2 5 | import aws_cdk.aws_iam as iam 6 | import aws_cdk.aws_sagemaker as sagemaker 7 | from constructs import Construct 8 | from modules.stack import GenAiStack 9 | from modules.config import config 10 | 11 | stack = {"description": "Sagemaker Studio Domain"} 12 | 13 | 14 | class SageMakerStudioStack(GenAiStack): 15 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 16 | super().__init__(scope, construct_id, stack, **kwargs) 17 | 18 | secrets_policy = iam.PolicyDocument( 19 | statements=[ 20 | iam.PolicyStatement( 21 | effect=iam.Effect.ALLOW, 22 | actions=["ssm:GetParameters", "ssm:GetParameter"], 23 | resources=[ 24 | f"arn:aws:ssm:{self.region}:{self.account}:parameter/{config['appPrefix']}HfPredictorEndpointName", 25 | f"arn:aws:ssm:{self.region}:{self.account}:parameter/{config['appPrefix']}OpenSearchDomainName", 26 | f"arn:aws:ssm:{self.region}:{self.account}:parameter/{config['appPrefix']}OpenSearchEndpoint", 27 | ], 28 | ), 29 | iam.PolicyStatement( 30 | effect=iam.Effect.ALLOW, 31 | actions=["secretsmanager:ListSecrets"], 32 | resources=["*"], 33 | ), 34 | iam.PolicyStatement( 35 | effect=iam.Effect.ALLOW, 36 | actions=[ 37 | "secretsmanager:GetResourcePolicy", 38 | "secretsmanager:GetSecretValue", 39 | "secretsmanager:DescribeSecret", 40 | "secretsmanager:ListSecretVersionIds", 41 | ], 42 | resources=[ 43 | f"arn:aws:secretsmanager:{self.region}:{self.account}:secret:{config['appPrefix']}OpenSearchCredentials*" 44 | ], 45 | ), 46 | ] 47 | ) 48 | # Create an execution role for the SageMaker Studio domain users 49 | role_sagemaker_studio_domain = iam.Role( 50 | self, 51 | "RoleForSagemakerStudioUsers", 52 | assumed_by=iam.CompositePrincipal( 53 | iam.ServicePrincipal("sagemaker.amazonaws.com"), 54 | ), 55 | managed_policies=[ 56 | iam.ManagedPolicy.from_aws_managed_policy_name( 57 | "AmazonSageMakerFullAccess" 58 | ), 59 | iam.ManagedPolicy.from_aws_managed_policy_name( 60 | "AmazonOpenSearchServiceFullAccess" 61 | ), # needed to ingest data and create indexes 62 | ], 63 | inline_policies={"parametersAndSecrets": secrets_policy}, 64 | ) 65 | 66 | team = config['appPrefix'] + "DataScientist" 67 | sagemaker_domain_name = config['appPrefix'] + "SagemakerStudio" 68 | 69 | default_vpc = 
ec2.Vpc.from_lookup(self, id="VPC", is_default=True) 70 | public_subnet_ids = [ 71 | public_subnet.subnet_id for public_subnet in default_vpc.public_subnets 72 | ] 73 | 74 | my_sagemaker_domain = sagemaker.CfnDomain( 75 | self, 76 | "SageMakerStudioDomain", 77 | auth_mode="IAM", 78 | default_user_settings=sagemaker.CfnDomain.UserSettingsProperty( 79 | execution_role=role_sagemaker_studio_domain.role_arn 80 | ), 81 | domain_name=sagemaker_domain_name, 82 | subnet_ids=public_subnet_ids, 83 | vpc_id=default_vpc.vpc_id, 84 | app_network_access_type="VpcOnly", 85 | app_security_group_management="Service" 86 | ) 87 | 88 | sagemaker.CfnUserProfile( 89 | self, 90 | "CfnUserProfile", 91 | domain_id=my_sagemaker_domain.attr_domain_id, 92 | user_profile_name=team, 93 | user_settings=sagemaker.CfnUserProfile.UserSettingsProperty( 94 | execution_role=role_sagemaker_studio_domain.role_arn 95 | ), 96 | ) 97 | -------------------------------------------------------------------------------- /06_automation/stacks/shared/s3_access_logs_stack.py: -------------------------------------------------------------------------------- 1 | 2 | from aws_cdk import Duration, RemovalPolicy 3 | from aws_cdk import aws_s3 as s3 4 | from constructs import Construct 5 | from modules.stack import GenAiNestedStack 6 | 7 | stack = { 8 | "description": "Substack for S3 access logs", 9 | "tags": {}, 10 | } 11 | 12 | 13 | class S3AccessLogsStack(GenAiNestedStack): 14 | 15 | bucket: s3.Bucket 16 | def __init__( 17 | self, 18 | scope: Construct, 19 | construct_id: str, 20 | **kwargs, 21 | ) -> None: 22 | super().__init__(scope, construct_id, stack, **kwargs) 23 | 24 | access_logs_lifecycle_rule = s3.LifecycleRule( 25 | id="S3AccessLogsBucketLifecycleRule", 26 | abort_incomplete_multipart_upload_after=Duration.days(1), 27 | enabled=True, 28 | expiration=Duration.days(30), 29 | # expired_object_delete_marker=True, 30 | noncurrent_version_expiration=Duration.days(90), 31 | noncurrent_versions_to_retain=123, 32 | noncurrent_version_transitions=[s3.NoncurrentVersionTransition( 33 | storage_class=s3.StorageClass.GLACIER, 34 | transition_after=Duration.days(7), 35 | 36 | # the properties below are optional 37 | #noncurrent_versions_to_retain=123 38 | )], 39 | transitions=[s3.Transition( 40 | storage_class=s3.StorageClass.GLACIER, 41 | 42 | transition_after=Duration.days(14), 43 | )] 44 | ) 45 | 46 | access_logs_bucket = s3.Bucket(self, "AccessLogsBucket", 47 | block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 48 | encryption=s3.BucketEncryption.S3_MANAGED, 49 | enforce_ssl=True, 50 | versioned=True, 51 | removal_policy=RemovalPolicy.RETAIN, 52 | lifecycle_rules=[access_logs_lifecycle_rule] 53 | ) 54 | 55 | self.bucket = access_logs_bucket 56 | 57 | -------------------------------------------------------------------------------- /06_automation/stacks/shared/vpc_peering_stack.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from dataclasses import dataclass 3 | from typing import Sequence 4 | 5 | from aws_cdk import aws_ec2 as ec2 6 | from aws_cdk import aws_ssm as ssm 7 | from constructs import Construct 8 | from modules.stack import GenAiNestedStack, GenAiStack 9 | 10 | stack = { 11 | "description": "VPC peering", 12 | "tags": {}, 13 | } 14 | 15 | 16 | class VPCPeeringStack(GenAiNestedStack): 17 | # def __init__(self, scope: Construct, construct_id: str, config: Config, **kwargs) -> None: 18 | def __init__( 19 | self, 20 | scope: Construct, 21 | construct_id: str, 22 | 
peer_vpc_id: str, 23 | peer_vpc_cidr: str, 24 | peer_region: str, 25 | vpc_id: str, 26 | vpc_route_table_ids: Sequence[str], 27 | peering_connection_ssm_parameter_name: str, 28 | **kwargs, 29 | ) -> None: 30 | super().__init__(scope, construct_id, stack, **kwargs) 31 | 32 | cfn_vPCPeering_connection = ec2.CfnVPCPeeringConnection( 33 | self, 34 | "VPCPeeringConnection", 35 | peer_vpc_id=peer_vpc_id, 36 | vpc_id=vpc_id, 37 | peer_region=peer_region, 38 | ) 39 | 40 | ssm.StringParameter( 41 | self, 42 | "VPCPeeringParameter", 43 | parameter_name=peering_connection_ssm_parameter_name, 44 | description="Reference of VPC peering connection.", 45 | string_value=cfn_vPCPeering_connection.attr_id, 46 | ) 47 | 48 | 49 | 50 | # routes from the endpoint subnets to the peer vpc 51 | for index, rt_id in enumerate(vpc_route_table_ids): 52 | ec2.CfnRoute( 53 | self, 54 | f"RouteFromPeerVPC{index}", 55 | route_table_id=rt_id, 56 | destination_cidr_block=peer_vpc_cidr, 57 | vpc_peering_connection_id=cfn_vPCPeering_connection.ref, 58 | ) 59 | -------------------------------------------------------------------------------- /06_automation/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/06_automation/tests/__init__.py -------------------------------------------------------------------------------- /06_automation/tests/cdk_nag_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Scan stacks with cdk_nag https://github.com/cdklabs/cdk-nag. 3 | 4 | Run `poetry install --with test` to install cdk_nag. 5 | 6 | Run cdk_nag with `poetry run python tests/cdk_nag_test.py`. 7 | """ 8 | import os 9 | 10 | from aws_cdk import App, Aspects, Environment, Tags 11 | from cdk_nag import AwsSolutionsChecks 12 | from modules.config import config 13 | from stacks.chatbot.chatbot_cicd_stack import ChatbotCiCdStack 14 | from stacks.chatbot.chatbot_codecommit_repo_stack import ChatbotCodeCommitRepoStack 15 | from stacks.chatbot.chatbot_stack import ChatbotStack 16 | from stacks.kendra_datasources.kendra_datasources_stack import KendraDataSourcesStack 17 | from stacks.kendra_index.kendra_index_stack import KendraIndexStack 18 | from stacks.llm_pipeline.llm_pipeline_stack import LLMSageMakerStack 19 | from stacks.opensearch_domain.opensearch_domain_stack import OpenSearchStack 20 | from stacks.opensearch_ingestion_pipeline.opensearch_ingestion_pipeline_stack import ( 21 | OpenSearchIngestionPipelineStack, 22 | ) 23 | 24 | # load the details from the default AWS config 25 | env = Environment( 26 | account=os.environ["CDK_DEFAULT_ACCOUNT"], region=os.environ["CDK_DEFAULT_REGION"] 27 | ) 28 | 29 | print( 30 | f""" 31 | +-------------------------------------------------------------------------+ 32 | You are deploying the {config["appPrefix"]} solution into: 33 | {env.account} account in {env.region} region. 
34 | +-------------------------------------------------------------------------+ 35 | """ 36 | ) 37 | 38 | app = App() 39 | 40 | Aspects.of(app).add(AwsSolutionsChecks(verbose=True)) 41 | 42 | 43 | # ------------------------------------------------------------------------------ 44 | # assign global tags to stack 45 | # ------------------------------------------------------------------------------ 46 | for key, value in config["globalTags"].items(): 47 | Tags.of(app).add(key, value) 48 | 49 | ## Basic Infrastructure 50 | llm_pipeline = LLMSageMakerStack( 51 | app, 52 | "AiLlmPipelineStack", 53 | env=env, 54 | ) 55 | 56 | opensearch_stack = OpenSearchStack( 57 | app, 58 | "OpenSearchDomainStack", 59 | env=env, 60 | ) 61 | ingestion_pipeline = OpenSearchIngestionPipelineStack( 62 | app, 63 | "OpenSearchIngestionPipelineStack", 64 | env=env, 65 | ) 66 | # Add dependency between OpenSearch and ingestion pipeline 67 | ingestion_pipeline.add_dependency(opensearch_stack) 68 | 69 | index_stack = KendraIndexStack( 70 | app, 71 | "KendraIndexStack", 72 | env=env, 73 | ) 74 | 75 | datasource_stack = KendraDataSourcesStack( 76 | app, 77 | "KendraDataSourcesStack", 78 | index=index_stack.index, 79 | env=env, 80 | ) 81 | 82 | # Add dependency between index and datasource 83 | datasource_stack.add_dependency(index_stack) 84 | 85 | ## Dev Environment 86 | # SageMakerStudioStack(app, "SageMakerStudioDomainStack", env=env) 87 | 88 | ## Streamlit chatbot 89 | chatbot = ChatbotStack(app, "ChatBotStack", env=env) 90 | 91 | chatbot_code_commit_repo = ChatbotCodeCommitRepoStack( 92 | app, "ChatBotCodeCommitRepoStack", env=env 93 | ) 94 | chatbot_cicd_stack = ChatbotCiCdStack( 95 | app, 96 | "ChatBotCiCdStack", 97 | chatbot_app_stack_source=chatbot_code_commit_repo.code_pipeline_source, 98 | env=env, 99 | ) 100 | chatbot_cicd_stack.add_dependency(chatbot_code_commit_repo) 101 | 102 | 103 | app.synth() 104 | -------------------------------------------------------------------------------- /06_automation/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/llm-app-genie/413ecf86da7e6315b70cb026859526211db44a4e/06_automation/tests/unit/__init__.py -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | 3 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 4 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 5 | opensource-codeofconduct@amazon.com with any additional questions or comments. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | ## Reporting Bugs/Feature Requests 10 | 11 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 
12 | 13 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 14 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 15 | 16 | - A reproducible test case or series of steps 17 | - The version of our code being used 18 | - Any modifications you've made relevant to the bug 19 | - Anything unusual about your environment or deployment 20 | 21 | ## Contributing via Pull Requests 22 | 23 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 24 | 25 | 1. You are working against the latest source on the _main_ branch. 26 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 27 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 28 | 29 | To send us a pull request, please: 30 | 31 | 1. Fork the repository. 32 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 33 | 3. Ensure local tests pass. 34 | 4. Commit to your fork using clear commit messages. 35 | 5. Send us a pull request, answering any default questions in the pull request interface. 36 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 37 | 38 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 39 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 40 | 41 | ## Finding contributions to work on 42 | 43 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 44 | 45 | ## Code of Conduct 46 | 47 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 48 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 49 | opensource-codeofconduct@amazon.com with any additional questions or comments. 50 | 51 | ## Security issue notifications 52 | 53 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 54 | 55 | ## Licensing 56 | 57 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 
10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /resize-disk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Specify the desired volume size in GiB as a command line argument. If not specified, default to 20 GiB. 4 | SIZE=${1:-20} 5 | 6 | # Get the ID of the environment host Amazon EC2 instance. 7 | TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 60") 8 | INSTANCEID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null) 9 | REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/placement/region 2>/dev/null) 10 | 11 | # Get the ID of the Amazon EBS volume associated with the instance. 12 | VOLUMEID=$(aws ec2 describe-instances \ 13 | --instance-id $INSTANCEID \ 14 | --query "Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId" \ 15 | --output text \ 16 | --region $REGION) 17 | 18 | # Resize the EBS volume. 19 | aws ec2 modify-volume --volume-id $VOLUMEID --size $SIZE 20 | 21 | # Wait for the resize to finish. 22 | while [ \ 23 | "$(aws ec2 describe-volumes-modifications \ 24 | --volume-id $VOLUMEID \ 25 | --filters Name=modification-state,Values="optimizing","completed" \ 26 | --query "length(VolumesModifications)" \ 27 | --output text)" != "1" ]; do 28 | sleep 1 29 | done 30 | 31 | # Check whether the root volume is a Xen-style /dev/xvda device (otherwise assume NVMe) 32 | if [[ -e "/dev/xvda" && $(readlink -f /dev/xvda) == "/dev/xvda" ]]; then 33 | # Rewrite the partition table so that the partition takes up all the space that it can. 34 | sudo growpart /dev/xvda 1 35 | # Expand the size of the file system. 36 | # Check if we're on AL2 37 | STR=$(cat /etc/os-release) 38 | SUB='VERSION_ID="2"' 39 | if [[ $STR == *"$SUB"* ]]; then 40 | sudo xfs_growfs -d / 41 | else 42 | sudo resize2fs /dev/xvda1 43 | fi 44 | 45 | else 46 | # Rewrite the partition table so that the partition takes up all the space that it can. 47 | sudo growpart /dev/nvme0n1 1 48 | 49 | # Expand the size of the file system. 50 | # Check if we're on AL2 51 | STR=$(cat /etc/os-release) 52 | SUB='VERSION_ID="2"' 53 | if [[ $STR == *"$SUB"* ]]; then 54 | sudo xfs_growfs -d / 55 | else 56 | sudo resize2fs /dev/nvme0n1p1 57 | fi 58 | fi 59 | --------------------------------------------------------------------------------