├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── artifacts ├── architecture.png ├── demo.gif ├── webui.png └── webui_news.png ├── build-script ├── build.sh └── dir_md5.sh ├── business_logic ├── lambdas │ ├── embed_docs │ │ ├── Dockerfile │ │ ├── embed_docs.py │ │ └── requirements.txt │ ├── pre_process_docs │ │ ├── Dockerfile │ │ ├── pre_process_docs.py │ │ └── requirements.txt │ ├── summarization │ │ ├── Dockerfile │ │ ├── news summarization streaming.ipynb │ │ ├── requirements.txt │ │ └── summarization.py │ └── trigger_sfn │ │ ├── Dockerfile │ │ ├── requirements.txt │ │ └── trigger_sfn.py ├── model_artifacts │ ├── embedding │ │ └── model │ │ │ ├── code │ │ │ ├── inference.py │ │ │ └── requirements.txt │ │ │ └── model │ │ │ ├── __init__.py │ │ │ ├── embed_documents.py │ │ │ └── embedding_model_utils.py │ └── multi_gpu_embedding │ │ └── model │ │ └── code │ │ ├── inference.py │ │ └── requirements.txt ├── stream_consumer │ ├── clustering.py │ ├── process_records.py │ └── requirements.txt └── temp.json ├── data ├── clear_data.py ├── download_public_data.sh ├── example_article.json ├── put_records.py ├── script.py └── send_articles.sh ├── front_end ├── Dockerfile ├── README.md ├── nginx.conf ├── package-lock.json ├── package.json ├── public │ ├── favicon.ico │ ├── index.html │ ├── logo192.png │ ├── logo512.png │ ├── manifest.json │ └── robots.txt └── src │ ├── App.css │ ├── App.js │ ├── components │ ├── ClusterList.js │ └── ClusterModal.js │ ├── index.css │ └── index.js └── iac ├── roots ├── README.md └── main │ ├── clustering_compute.tf │ ├── embedding_endpoint.tf │ ├── eventbridge.tf │ ├── iam.tf │ ├── kms.tf │ ├── lambda.tf │ ├── main.tf │ ├── outputs.tf │ ├── summarization_pipeline.tf │ ├── templates │ ├── ClusterList-js.template │ ├── ConfigureNode.sh │ ├── aws-exports-js.template │ ├── cognito-policy.json │ ├── ecs-role.json │ └── init.cfg │ ├── terraform.tfvars │ ├── variables.tf │ └── versions.tf └── templates ├── README.md ├── components └── README.md └── modules ├── README.md ├── ecr ├── main.tf ├── outputs.tf └── variables.tf ├── lambda ├── README.md ├── main.tf ├── outputs.tf └── variables.tf └── s3_bucket ├── README.md ├── main.tf ├── outputs.tf └── variables.tf /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Local .terraform directories 3 | **/.terraform/* 4 | 5 | # .tfstate files 6 | *.tfstate 7 | *.tfstate.* 8 | *.lock.hcl 9 | *.venv 10 | # Crash log files 11 | crash.log 12 | 13 | *.zip 14 | *.pem 15 | *.tar.gz 16 | 17 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most 18 | # .tfvars files are managed as part of configuration and so should be included in 19 | # version control. 
20 | # 21 | # example.tfvars 22 | 23 | # Ignore override files as they are usually used to override resources locally and so 24 | # are not checked in 25 | override.tf 26 | override.tf.json 27 | *_override.tf 28 | *_override.tf.json 29 | 30 | # Include override files you do wish to add to version control using negated pattern 31 | # 32 | # !example_override.tf 33 | 34 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan 35 | # example: *tfplan* 36 | 37 | # Pycharm 38 | .venv 39 | .idea 40 | .idea/ 41 | 42 | # Go files 43 | go.mod 44 | go.sum 45 | 46 | # node modules 47 | **/node_modules/* 48 | 49 | # build folder 50 | **/build/* 51 | 52 | # MacOS folder 53 | .DS_Store 54 | 55 | # Local developer temporary files such as for scratchpads 56 | temp-* 57 | 58 | # Files created by gitlab-runner 59 | builds 60 | 61 | # project generated files 62 | environment/.environment-*.json 63 | environment/.current-environment 64 | environment/.cli-profiles.json 65 | environment/app-env-var-names-backup.txt 66 | environment/.choice-cache.json 67 | environment/make-env 68 | environment/.log.txt 69 | cicd/iam-role/final-cicd-iam-role.json 70 | iac/bootstrap/final-tf-backend-cf-stack.json 71 | *.bak 72 | config/.env 73 | package 74 | 75 | model_evaluation/ 76 | 77 | .env 78 | 1000_embeddings.json 79 | 5000_embeddings.json 80 | 81 | articles/ 82 | customer_data/ 83 | public_data/ 84 | featured_data/ 85 | eps_screen_results/ 86 | test_results/ 87 | cluster_results 88 | 89 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 90 | 91 | # dependencies 92 | **/node_modules 93 | **/.pnp 94 | .pnp.js 95 | 96 | # testing 97 | **/coverage 98 | 99 | # production 100 | **/build 101 | 102 | # misc 103 | .DS_Store 104 | .env.local 105 | .env.development.local 106 | .env.test.local 107 | .env.production.local 108 | 109 | npm-debug.log* 110 | yarn-debug.log* 111 | yarn-error.log* 112 | 113 | *.env 114 | **/*.env 115 | **/aws-exports.js 116 | venv -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # List all Make targets in alphabetical order 2 | .PHONY: list send-article 3 | 4 | # Terraform Init 5 | init: 6 | terraform -chdir=iac/roots/main init 7 | 8 | # Deploy all targets in the correct order 9 | deploy-all: 10 | terraform -chdir=iac/roots/main apply -auto-approve 11 | 12 | # Destroy all targets in the correct order 13 | destroy-all: 14 | terraform -chdir=iac/roots/main apply -destroy 15 | 16 | send-articles: 17 | @echo "Sending articles..." 18 | cd data && ./send_articles.sh && cd .. 19 | 20 | download-public-dataset: 21 | @echo "Downloading public dataset..." 22 | cd data && ./download_public_data.sh && cd .. 23 | 24 | clear-data: 25 | @echo "Clearing DynamoDB table, SQS queue, S3 bucket DBSCAN memory and removing EC2 instance from ASG..." 26 | cd data && python clear_data.py && cd .. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # News Clustering And Summarization 2 | 3 | ## Table of Contents 4 | 1. [About this Repository](#About) 5 | 2. [Architecture](#Architecture) 6 | 3. [Demo](#Demo) 7 | 4. [Tool Versions](#Versions) 8 | 5. [Prerequisites](#Prerequisites) 9 | 6. [Build and Deploy](#Build_Deploy) 10 | 7. [Test](#Test) 11 | 8. [Destroy](#Destroy) 12 | 9. [License](#License) 13 | 14 | ## About this Repository 15 | 16 | ### News Clustering And Summarization 17 | 18 | This solution aims to launch a news Event feature that clusters related news stories into summaries, providing customers with near real-time updates on unfolding events. 19 | This augmented news consumption experience will enable users to easily follow evolving stories while maximizing relevance and reducing the firehose of information for articles covering the same event. By tailoring news clusters around key events, this application can improve customer satisfaction and engagement. 20 | 21 | This project was built by the AWS GFS SA team, FSI PACE team and the Generative AI Innovation Center. 22 | 23 | ### Repository Details 24 | 25 | Below are some descriptions of the content in this repository. 
26 | ```
27 | artifacts/ # Contains the architecture diagram and demo GIF
28 | build-script/ # Hosts required build scripts
29 | |--- build.sh # Creates and uploads the Docker image for each Lambda function
30 | |--- dir_md5.sh # Triggers a rebuild of a Docker image when the underlying code changes
31 | business_logic/ # Hosts the necessary artifacts and logic to run the solution
32 | |--- lambdas/ # Each folder contains a .py file and a requirements.txt to run the code
33 | |--- embed_docs/ # Takes a list of processed documents and embeds them with the SageMaker endpoint
34 | |--- pre_process_docs/ # Generates the string to embed by concatenating relevant fields
35 | |--- summarization/ # Generates the summary of a list of articles
36 | |--- trigger_sfn/ # Determines if the added articles should trigger a new summarization or if we should wait
37 | |--- model_artifacts/ # Necessary artifacts to deploy the embedding model
38 | |--- embedding/
39 | |--- model/
40 | |--- code/
41 | | inference.py # In the specific SageMaker format for handling inputs and outputs, and invoking the model
42 | | requirements.txt # List of requirements for the model to run
43 | |--- model/ # Contains the code for importing and running the model
44 | |--- stream_consumer/ # Contains the code and requirements for the clustering compute
45 | |--- example.json
46 | data/ # For testing data upload to the Kinesis stream
47 | |--- customer_data/ # DOES NOT CURRENTLY EXIST, place JSON documents in here to have them accessible for testing
48 | |--- put_records.py # Pushes files from customer_data to the Kinesis stream for processing
49 | |--- clear_data.py # Clears the DynamoDB table, SQS queue, S3 bucket DBSCAN memory and removes the EC2 instance from the ASG
50 | |--- send_articles.sh # Sends articles to the Kinesis stream for processing to simulate a data feed
51 | front_end/ # Front end code for demo purposes
52 | iac/ # All infrastructure as code
53 | |--- roots/
54 | |--- main/
55 | |--- _globals.tf
56 | |--- backend.tf
57 | |--- clustering_compute.tf # Deploys the clustering compute
58 | |--- dynamodb.tf # Creates the table that contains cluster information
59 | |--- embedding_endpoint.tf # Deploys the embedding SageMaker endpoint
60 | |--- eventbridge.tf # Creates the EventBridge pipe
61 | |--- iam.tf # Defines IAM policies and roles
62 | |--- lambda.tf # Builds all necessary Lambda functions
63 | |--- main.tf # Creates some S3 buckets, Kinesis streams, and the Step Function for ingestion
64 | |--- outputs.tf # Not used in this solution
65 | |--- summarization_pipeline.tf # Deploys the summarization Step Functions
66 | |--- terraform.tfvars # Defines app and environment names
67 | |--- variables.tf # Necessary variables
68 | |--- vpc.tf # Creates the VPC and other necessary networking
69 | |--- README.md
70 | |--- templates/
71 | Makefile # Simplifies scripts for easy deployment
72 | README.md # The file you are currently reading, with details on how to operate the solution
73 | ```
74 | 
75 | ## Architecture
76 | 
77 | ![architecture](artifacts/architecture.png)
78 | 
79 | This solution leverages a combination of AWS managed services and serverless options to create a scalable, event-driven microservice architecture capable of processing dozens of news articles per second. The architecture utilizes AWS Lambda, Step Functions, Amazon Kinesis, EventBridge (Pipes), DynamoDB, EC2 with Auto Scaling Groups, S3, and Amazon Bedrock.
80 | 
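Article embedding is central to the workflow described below. For reference, the call that the embedding Lambda (```business_logic/lambdas/embed_docs/embed_docs.py```) makes to Titan Text Embeddings V2 on Bedrock is essentially the sketch that follows; it is simplified from the Lambda code, and the hard-coded 1024 stands in for the Lambda's configurable ```MAX_LENGTH``` environment variable.

```python
import json

import boto3

bedrock = boto3.client("bedrock-runtime")


def embed_article(text: str, dimensions: int = 1024) -> list:
    """Return a normalized Titan Text Embeddings V2 vector for one article."""
    response = bedrock.invoke_model(
        modelId="amazon.titan-embed-text-v2:0",
        accept="application/json",
        contentType="application/json",
        body=json.dumps(
            {"inputText": text, "dimensions": dimensions, "normalize": True}
        ),
    )
    # The response body is a streaming payload containing the embedding vector
    return json.loads(response["body"].read())["embedding"]
```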
81 | The workflow begins with raw JSON article ingestion through Amazon Kinesis, bridged to Step Functions via EventBridge Pipes. The first Step Functions state machine preprocesses documents and embeds articles using Titan Embeddings on Bedrock. Data is temporarily stored in S3 between steps to handle large payloads. Processed articles are then sent to SQS for micro-batch clustering.
82 | 
83 | Clustering occurs on EC2 instances, which pull batches from SQS and apply the DBSCAN algorithm. Results update a DynamoDB table, with periodic checkpoints saved to S3. DynamoDB Streams trigger summarization pipelines when clusters reach a specified threshold. Summaries are generated using Claude Haiku through another Step Functions workflow and stored back in DynamoDB for UI access.
84 | 
85 | This architecture ensures high scalability, fault tolerance, and near real-time processing of large volumes of articles, making it suitable for applications requiring rapid content analysis and clustering.
86 | 
87 | ## Demo
88 | 
89 | Below is a GIF that demonstrates the solution in action.
90 | 
91 | ![demo](artifacts/demo.gif)
92 | 
93 | The demo demonstrates the solution by sending articles to the Kinesis stream and waiting for those articles to be clustered and summarized. The solution will begin clustering once it has received 500 articles (this can be changed). The web UI refreshes every 5 seconds by reading the DynamoDB table, which contains the clusters, articles, and summaries.
94 | 
95 | ## Tool Versions
96 | 
97 | To build and deploy this template, the following tools are required.
98 | 
99 | 1. AWS CLI >= 2
100 | 2. Terraform >= 1.4.6
101 | 3. Docker
102 | 4. md5
103 | 
104 | ## Prerequisites
105 | 
106 | ### Credentials
107 | 
108 | Use the access key and secret access key of an IAM user, or export temporary credentials, before running the commands below.
109 | 
110 | ### Environment
111 | 
112 | The environment and application are currently defined as follows in ```iac/roots/main/terraform.tfvars```:
113 | ```
114 | appName = "clustering"
115 | envName = "demo2"
116 | ```
117 | To edit these values, navigate to ```iac/roots/main/terraform.tfvars``` and change them manually.
118 | 
119 | ### Understanding the Makefile
120 | 
121 | At the root of the repository there is a ```Makefile```. It provides custom commands that abstract some of the Terraform commands for ease of use.
122 | 
123 | This includes the following commands:
124 | ```
125 | # Terraform Init
126 | init:
127 |     terraform -chdir=iac/roots/main init
128 | 
129 | # Deploy all targets in the correct order
130 | deploy-all:
131 |     terraform -chdir=iac/roots/main apply -auto-approve
132 | 
133 | # Destroy all targets in the correct order
134 | destroy-all:
135 |     terraform -chdir=iac/roots/main apply -destroy
136 | 
137 | # Send Articles
138 | send-articles:
139 |     cd data && ./send_articles.sh && cd ..
140 | 
141 | # Download Public Dataset
142 | download-public-dataset:
143 |     cd data && ./download_public_data.sh && cd ..
144 | 
145 | # Clear Data
146 | clear-data:
147 |     cd data && python clear_data.py && cd ..
148 | ```
149 | 
150 | In the next sections, we will explain when to use these commands.
151 | 
152 | 
153 | ### Init Terraform
154 | 
155 | To initialize Terraform, run:
156 | ```
157 | make init
158 | ```
159 | 
160 | This will run ```terraform -chdir=iac/roots/main init```.
161 | 
162 | 
163 | ## Build and Deploy
164 | 
165 | ### Deploy
166 | 
167 | To deploy the resources, run:
168 | 
169 | ```
170 | make deploy-all
171 | ```
172 | 
173 | This will run ```terraform -chdir=iac/roots/main apply -auto-approve```.
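If you need to see the deployment outputs again later (for example, the frontend URL and sample credentials shown in the next section), Terraform can re-print them at any time without re-applying:

```
terraform -chdir=iac/roots/main output
```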
174 | 
175 | ### Accessing the Frontend
176 | 
177 | To access the frontend, check the Terraform output, which should look something like:
178 | 
179 | ```
180 | dns_record_for_application = "https://front-end-clustering-demo2-1234567890.us-east-1.elb.amazonaws.com"
181 | sample_user_creds = tomap({
182 | "user1" = {
183 | "email" = "donotreply@amazon.com"
184 | "name" = "aws-user"
185 | "password" = "awsiscool$"
186 | }
187 | })
188 | ```
189 | 
190 | Open the link in your browser. You should see a login screen; sign in with the email and password provided in the output.
191 | 
192 | The page should look like the following:
193 | 
194 | ![webui](artifacts/webui.png)
195 | 
196 | After you have logged in, go ahead and start sending articles to the solution. You can follow the instructions in the sections below.
197 | 
198 | ## Test
199 | 
200 | ### Testing Infrastructure
201 | 
202 | Running ```make download-public-dataset``` will download a public dataset to ```data/public_data```.
203 | 
204 | The following is an example of one of the items in the dataset. The solution expects the ```text```, ```title```, and ```date``` keys (marked below) to be filled:
205 | ```json
206 | {
207 | "id": "66536",
208 | "text": "this is the body of the article", // ! Required
209 | "title": "Article Title", // ! Required
210 | "date": "2013-12-18 08:14:00", // ! Required
211 | "event_id": "9102",
212 | "duplicate": false,
213 | "lang": "deu",
214 | "bag_id": "b738a3b7-2db3-4d38-88c8-76c4eb4f835b-2325",
215 | "source": "finanzen.net",
216 | "cluster": "322"
217 | }
218 | 
219 | ```
220 | 
221 | > Note: Use this data for exploration/testing. You should use your own data in production. Just ensure it has the same expected fields: text, title, and date.
222 | 
223 | ### Installing Required Python Libraries
224 | 
225 | Before sending articles, ensure you have the `boto3` and `tqdm` Python libraries installed. You can install them using the following command:
226 | 
227 | ```bash
228 | pip install boto3 tqdm
229 | ```
230 | 
231 | ### Sending Articles
232 | 
233 | Running ```make send-articles``` will call ```data/put_records.py```.
234 | 
235 | ```put_records.py``` relies on the following global variables:
236 | ```
237 | STREAM_NAME = "input-stream-clustering-demo2" # Name of Kinesis stream
238 | PARTITION_KEY = "a" # Partition key of Kinesis stream (does not need editing)
239 | JSON_DIR = "./customer_data" # Path to article json files
240 | COUNT = 1200000 # Number of articles to test with (actual number run is min(COUNT, num articles in JSON_DIR))
241 | BATCH_SIZE = 5 # Number of articles to send as a batch to the Kinesis stream
242 | ```
243 | Once you have sent articles, you should see them in the frontend. The frontend will display clusters as they are formed and updated in real time.
244 | A screenshot of the frontend displaying news clusters is shown below:
245 | 
246 | ![webui_news](artifacts/webui_news.png)
247 | 
248 | Each row in the web UI displays a cluster, its summary, the number of articles in the cluster, and a link to see each article in the cluster. If you click the "View Articles" button, you can see each article in detail, with its title, date, and full text.
249 | 
250 | *Note: after testing, it may be necessary to clear the SQS queue and the DynamoDB table.*
251 | 
252 | ### Clearing Data
253 | 
254 | Running ```make clear-data``` will clear the DynamoDB table, SQS queue, and S3 bucket DBSCAN memory, and remove the EC2 instance from the ASG. Please wait 5-10 minutes after deleting the data before sending more news to the solution; it takes a few minutes for the ASG to create a new EC2 instance.
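If you want to confirm that the Auto Scaling group has launched a replacement instance before sending new articles, one option is to check it with the AWS CLI. This is only a sketch; it assumes default credentials and that the Auto Scaling group name contains the app and environment names from ```terraform.tfvars```:

```bash
aws autoscaling describe-auto-scaling-groups \
  --query "AutoScalingGroups[?contains(AutoScalingGroupName, 'clustering-demo2')].Instances[].LifecycleState"
```

Once an instance reports ```InService```, the clustering consumer should be able to pull new batches from SQS.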
255 | 
256 | If you changed the variables to use a specific project name, you might need to edit the ```data/clear_data.py``` file to match that project name.
257 | 
258 | ### Testing Business Logic
259 | 
260 | The ```test``` folder has automated embedding and epsilon tests, with notebooks for evaluating clustering and summarization.
261 | 
262 | For more details, navigate to the ```README.md``` in the ```test``` folder.
263 | 
264 | 
265 | ## Destroy
266 | 
267 | To destroy the resources, run:
268 | 
269 | ```
270 | make destroy-all
271 | ```
272 | 
273 | This will run ```terraform -chdir=iac/roots/main apply -destroy```.
274 | 
275 | # Contributors
276 | 
277 | - [Samuel Baruffi](https://www.linkedin.com/in/samuelbaruffi/)
278 | - [Kareem Abdol-Hamid](https://www.linkedin.com/in/kabdolha/)
279 | - [Alexandar (Ally) Meringer](https://www.linkedin.com/in/kabdolha/)
280 | - [Hector Lopez Hernandez](https://www.linkedin.com/in/hlopezhernandez/)
281 | - [Yanxiang Yu](https://www.linkedin.com/in/yyu2/)
282 | - [Nitin Jain](https://www.linkedin.com/in/annjay/)
283 | 
284 | ## License
285 | 
286 | This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE) file.
287 | 
288 | 
289 | 
290 | 
291 | 
-------------------------------------------------------------------------------- /artifacts/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/architecture.png -------------------------------------------------------------------------------- /artifacts/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/demo.gif -------------------------------------------------------------------------------- /artifacts/webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/webui.png -------------------------------------------------------------------------------- /artifacts/webui_news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/webui_news.png -------------------------------------------------------------------------------- /build-script/build.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # This is the order of arguments
4 | ECR_BASE_ARN=${1}
5 | BUILD_FOLDER=${2}
6 | IMAGE_NAME=${3}
7 | IMAGE_URI=${4}
8 | TARGET_AWS_REGION=${5}
9 | MYTAG=$(date +%Y%m%d%H%M%S)
10 | 
11 | # Check that git is installed
12 | which git >/dev/null || {
13 |     echo 'ERROR: git is not installed'
14 |     exit 1
15 | }
16 | 
17 | # Check that aws is installed
18 | which aws >/dev/null || {
19 |     echo 'ERROR: aws-cli is not installed'
20 |     exit 1
21 | }
22 | 
23 | # Check that docker is installed
24 | which docker >/dev/null || {
25 |     echo 'ERROR: docker is not installed'
26 |     exit 1
27 | }
28 | 
29 | # Connect to AWS and log in to ECR
30 | SLEEP_INT=$((1 + RANDOM % 11))
31 | for CTR in {1..5}; do
32 | 
33 |     # Check that docker is running
34 |     docker ps >/dev/null
35 |     DOCKER_STATUS=$?
36 | 
37 |     # Check that ECR creds are obtained
38 |     aws ecr get-login-password --region ${TARGET_AWS_REGION} | docker login --username AWS --password-stdin ${ECR_BASE_ARN}
39 |     ECR_GET_CREDS_STATUS=$?
40 | 
41 |     if [ ${ECR_GET_CREDS_STATUS} -ne 0 ] || [ ${DOCKER_STATUS} -ne 0 ]; then
42 |         echo "ERROR: aws ecr login failed, trying again in ${SLEEP_INT} seconds"
43 |         sleep ${SLEEP_INT}
44 |         ((CTR=CTR+1))
45 |         continue
46 |     else
47 |         echo "SUCCESS: aws ecr login succeeded on attempt ${CTR}"
48 |         break
49 |     fi
50 |     exit 1
51 | done
52 | 
53 | # Build image
54 | docker build --no-cache -t ${IMAGE_NAME} ${BUILD_FOLDER} --platform linux/amd64 || {
55 |     echo 'ERROR: docker build failed'
56 |     exit 1
57 | }
58 | 
59 | # Docker Tag and Push
60 | docker tag ${IMAGE_NAME} ${IMAGE_URI}:${MYTAG}
61 | docker push ${IMAGE_URI}:${MYTAG} || {
62 |     echo 'ERROR: docker push failed'
63 |     exit 1
64 | }
65 | 
66 | # Get the sha of the image
67 | SHA_IMAGE=$(docker inspect --format='{{.RepoDigests}}' ${IMAGE_URI}:${MYTAG})
68 | echo "Tag used for the ${IMAGE_NAME} image is ${MYTAG} with SHA: ${SHA_IMAGE}"
-------------------------------------------------------------------------------- /build-script/dir_md5.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # This script facilitates checking if the contents are different between `terraform apply` runs
4 | 
5 | # List of arguments
6 | build_folder=${1}
7 | 
8 | # Linux has command md5sum and OSX has command md5
9 | if command -v md5sum >/dev/null 2>&1; then
10 |     MD5_PROGRAM=md5sum
11 | elif command -v md5 >/dev/null 2>&1; then
12 |     MD5_PROGRAM=md5
13 | else
14 |     echo "ERROR: neither md5sum nor md5 is installed"
15 |     exit 255
16 | fi
17 | 
18 | # Take md5 from each object inside the program and then take a md5 of that output
19 | md5_output="$(eval ${MD5_PROGRAM} $build_folder/** | ${MD5_PROGRAM})"
20 | 
21 | # Output result as JSON back to terraform
22 | echo "{ \"md5\": \"${md5_output}\" }"
23 | 
-------------------------------------------------------------------------------- /business_logic/lambdas/embed_docs/Dockerfile: --------------------------------------------------------------------------------
1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images
2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used
3 | 
4 | FROM amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d
5 | 
6 | USER root
7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all
8 | 
9 | # Set a non-root user
10 | ARG USERNAME=lambda
11 | ARG USER_UID=1000
12 | ARG USER_GID=$USER_UID
13 | 
14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \
15 |     && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \
16 |     && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \
17 |     && chmod 0440 /etc/sudoers.d/$USERNAME
18 | 
19 | WORKDIR /var/task
20 | 
21 | COPY requirements.txt /var/task
22 | COPY embed_docs.py /var/task
23 | 
24 | RUN chown -R ${USERNAME}:${USERNAME} /var/task && \
25 |     chmod 755 /var/task/embed_docs.py /var/task/requirements.txt
26 | 
27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt
28 | 
29 | USER ${USERNAME}
30 | 
31 | CMD ["embed_docs.handler"]
32 | 
-------------------------------------------------------------------------------- /business_logic/lambdas/embed_docs/embed_docs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import boto3
4 | 
5 | SQS_QUEUE_URL = os.environ["SQS_QUEUE_URL"]
6 | MAX_ARTICLES = int(os.environ["MAX_ARTICLES"])
7 | EMBEDDING_ENDPOINT_NAME = os.environ["EMBEDDING_ENDPOINT_NAME"]
8 | EMBEDDING_MODEL = os.environ["EMBEDDING_MODEL"]
9 | MAX_LENGTH = int(os.environ["MAX_LENGTH"])
10 | EMBEDDING_FIELDS = [
11 |     "title",
12 |     "summary",
13 |     "text",
14 |     # * Useful for embeddings but not present in the public dataset
15 |     # "subjects",
16 |     # "industries",
17 |     # "organizations",
18 |     # "people",
19 |     # "locations",
20 | ]
21 | PREPROCESS_BUCKET = os.environ["PREPROCESS_BUCKET"]
22 | EMBEDDING_BUCKET = os.environ["EMBEDDING_BUCKET"]
23 | 
24 | s3_client = boto3.client("s3")
25 | sagemaker_client = boto3.client("sagemaker-runtime")
26 | sqs_client = boto3.client("sqs")
27 | bedrock_client = boto3.client("bedrock-runtime")
28 | 
29 | 
30 | def create_concat_text(doc_list):
31 |     concat_list = []
32 |     for doc in doc_list:
33 |         concat_text = []
34 |         for field in EMBEDDING_FIELDS:
35 |             if isinstance(doc[field], str):
36 |                 concat_text.append(doc[field])
37 | 
38 |         # concat_text = [doc[f] for f in EMBEDDING_FIELDS]
39 |         print("Concat Text", concat_text)
40 |         concatenated = "\n".join(concat_text)
41 |         concat_list.append(concatenated)
42 |     return concat_list
43 | 
44 | 
45 | # Event is list of S3 keys
46 | def handler(event, context):
47 | 
48 |     document_list = []
49 |     for s3_key in event:
50 |         print("Getting article from ", s3_key)
51 |         response = s3_client.get_object(Bucket=PREPROCESS_BUCKET, Key=s3_key)
52 |         data = response["Body"].read().decode("utf-8")
53 |         doc = json.loads(data)
54 |         document_list.append(doc)
55 | 
56 |     text_list = create_concat_text(document_list)
57 |     print("Text list: ", text_list)
58 | 
59 |     data = {"input_texts": text_list, "max_length": MAX_LENGTH}
60 | 
61 |     # Print the content
62 |     print("Data:")
63 |     print(data)
64 |     json_data = json.dumps(data)
65 |     print("Embedding endpoint name: ", EMBEDDING_ENDPOINT_NAME)
66 | 
67 |     if len(document_list) > MAX_ARTICLES:
68 |         document_list, text_list = document_list[:MAX_ARTICLES], text_list[:MAX_ARTICLES]  # truncate both so documents and embeddings stay aligned
69 | 
70 |     # If titan use bedrock, otherwise use sagemaker
71 |     prediction = {"embeddings": []}
72 |     if EMBEDDING_MODEL == "titan":
73 |         for text in text_list:
74 |             response = bedrock_client.invoke_model(
75 |                 body=json.dumps(
76 |                     {"inputText": text, "dimensions": MAX_LENGTH, "normalize": True}
77 |                 ),
78 |                 modelId="amazon.titan-embed-text-v2:0",
79 |                 accept="application/json",
80 |                 contentType="application/json",
81 |             )
82 |             response_body = json.loads(response.get("body").read().decode("utf-8"))
83 |             prediction["embeddings"].append(response_body["embedding"])
84 |     else:
85 |         # Push content to the SageMaker endpoint
86 |         response = sagemaker_client.invoke_endpoint(
87 |             EndpointName=EMBEDDING_ENDPOINT_NAME,
88 |             ContentType="application/json",
89 |             Body=json_data,
90 |         )
91 |         prediction = json.loads(response["Body"].read().decode("utf-8"))
92 | 
93 |     print("Prediction:")
94 |     print(prediction)
95 |     embedding_list = prediction["embeddings"]
96 | 
97 |     for i, doc in enumerate(document_list):
98 |         doc["concat_embedding"] = [embedding_list[i]]
99 |         message_body = json.dumps(doc)
100 |         if len(message_body.encode("utf-8")) > 262144:
101 |             print(f"Skipping item at index {i} due to size limit")
102 |             continue
103 |         s3_key = doc["id"] + ".json"
104 |         json_data = json.dumps(doc)
105 |         sqs_client.send_message(QueueUrl=SQS_QUEUE_URL, MessageBody=json_data)
106 |         s3_client.put_object(Bucket=EMBEDDING_BUCKET,
Key=s3_key, Body=json_data) 107 | 108 | print("End of function") 109 | return "Success" 110 | -------------------------------------------------------------------------------- /business_logic/lambdas/embed_docs/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 -------------------------------------------------------------------------------- /business_logic/lambdas/pre_process_docs/Dockerfile: -------------------------------------------------------------------------------- 1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images 2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used 3 | 4 | FROM amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d 5 | 6 | USER root 7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all 8 | 9 | # Set a non-root user 10 | ARG USERNAME=lambda 11 | ARG USER_UID=1000 12 | ARG USER_GID=$USER_UID 13 | 14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \ 15 | && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \ 16 | && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \ 17 | && chmod 0440 /etc/sudoers.d/$USERNAME 18 | 19 | WORKDIR /var/task 20 | 21 | COPY requirements.txt /var/task 22 | COPY pre_process_docs.py /var/task 23 | 24 | RUN chown -R ${user}:${user} /var/task && \ 25 | chmod 755 /var/task/pre_process_docs.py /var/task/requirements.txt 26 | 27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt 28 | 29 | USER ${USERNAME} 30 | 31 | CMD ["pre_process_docs.handler"] 32 | -------------------------------------------------------------------------------- /business_logic/lambdas/pre_process_docs/pre_process_docs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import boto3 5 | from typing import List, Dict 6 | from bs4 import BeautifulSoup 7 | import re 8 | import base64 9 | 10 | kinesis_client = boto3.client("kinesis") 11 | s3_client = boto3.client("s3") 12 | 13 | PREPROCESS_BUCKET = os.environ["PREPROCESS_BUCKET"] 14 | 15 | 16 | def clean_text(text): 17 | # apply to title 18 | text = text.replace(""", '"') 19 | text = re.sub(r'[^:a-zA-Z0-9\s"\'-]', "", text) 20 | return text 21 | 22 | 23 | def extract_top_subjects(subject_entry: List[dict], threshold: float): 24 | subjects = [] 25 | for e in subject_entry: 26 | if e["relevance"] >= threshold: 27 | subjects.append(e["long_name"]) 28 | 29 | return "StorySubjects: " + ", ".join(subjects) 30 | 31 | 32 | def extract_top_industries(industries_entry: List[dict], threshold: float): 33 | industries = [] 34 | for e in industries_entry: 35 | if e["relevance"] >= threshold: 36 | industries.append(e["long_name"]) 37 | 38 | result = "RelevantIndustries: " + ", ".join(industries) if industries else "" 39 | 40 | return result 41 | 42 | 43 | def extract_top_organizations(orgs_entry: List[dict], threshold: float): 44 | orgs = [] 45 | for e in orgs_entry: 46 | if e["relevance"] >= threshold: 47 | orgs.append(e["name"]) 48 | 49 | result = "RelevantOrganizations: " + ", ".join(orgs) if orgs else "" 50 | 51 | return result 52 | 53 | 54 | def remove_tags(text: str): 55 | soup = BeautifulSoup(text, "html.parser") 56 | return soup.get_text() 57 | 58 | 59 | def get_names(people: List[Dict], threshold=0.5): 60 | 61 | names = [person["name"] for person in people if person["relevance"] > threshold] 
62 | 63 | result = "PeopleOfInterest: " + ", ".join(names) if names else "" 64 | 65 | return result 66 | 67 | 68 | def get_locations(locations: List[dict], threshold=0.8): 69 | result = [] 70 | if locations: 71 | names = [ 72 | location["long_name"] 73 | for location in locations 74 | if location["relevance"] > threshold 75 | ] 76 | 77 | result = "Location: " + ", ".join(names) if names else "" 78 | 79 | return result 80 | 81 | 82 | def process_data(data: dict): 83 | 84 | # irrelevant columns for embedding 85 | drop = [ 86 | "vendor_data", 87 | "headline_only", 88 | "deckline", 89 | "version", 90 | "story_link", 91 | "copyright_line", 92 | "display_date", 93 | "received_date", 94 | "publication_reason", 95 | "media", 96 | "spam", 97 | "control_flags", 98 | "issuer", 99 | "market", 100 | "business_relevance", 101 | "cluster_signature", 102 | "headline_cluster_signature", 103 | "signals", 104 | "cik", 105 | "feed", 106 | ] 107 | 108 | processed_data = {} 109 | for k, v in data.items(): 110 | if k not in drop: 111 | processed_data[k] = v 112 | 113 | processed_data["title"] = clean_text(data["title"]) 114 | processed_data["summary"] = clean_text( 115 | data["text"] 116 | ) # No summary in public dataset using text 117 | processed_data["text"] = remove_tags(data["text"]) 118 | processed_data["publication_date"] = remove_tags(data["date"]) 119 | 120 | ## * Additional data that's useful for embeddings but isn't in public data 121 | # processed_data["subjects"] = extract_top_subjects(data["subjects"], threshold=0.8) 122 | # processed_data["summary"] = clean_text(data["summary"]) 123 | # processed_data["industries"] = extract_top_industries( 124 | # data["industries"], threshold=0.8 125 | # ) 126 | # processed_data["organizations"] = extract_top_organizations( 127 | # data["organizations"], threshold=0.6 128 | # ) 129 | # processed_data["people"] = get_names(data["people"], threshold=0.5) 130 | # processed_data["locations"] = get_locations(data.get("locations"), threshold=0.8) 131 | 132 | return processed_data 133 | 134 | 135 | def handler(events, context): 136 | event = events[0] 137 | print("EVENT: ", event) 138 | 139 | encrypted_list = event["data"] 140 | document_json = base64.b64decode(encrypted_list).decode("utf-8") 141 | 142 | document_list = json.loads(document_json) 143 | print("Document List: ", document_list) 144 | s3_key_list = [] 145 | 146 | for doc in document_list: 147 | processed_data = process_data(doc) 148 | 149 | print("Processed Data:") 150 | print(processed_data) 151 | 152 | s3_key = processed_data["id"] + ".json" 153 | json_data = json.dumps(processed_data) 154 | print("Pushing data to ", PREPROCESS_BUCKET + "/" + s3_key) 155 | s3_client.put_object(Bucket=PREPROCESS_BUCKET, Key=s3_key, Body=json_data) 156 | 157 | s3_key_list.append(s3_key) 158 | 159 | print("End of function: ", s3_key_list) 160 | return s3_key_list 161 | -------------------------------------------------------------------------------- /business_logic/lambdas/pre_process_docs/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | bs4 3 | chardet -------------------------------------------------------------------------------- /business_logic/lambdas/summarization/Dockerfile: -------------------------------------------------------------------------------- 1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images 2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used 3 | 4 | FROM 
amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d 5 | 6 | USER root 7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all 8 | 9 | # Set a non-root user 10 | ARG USERNAME=lambda 11 | ARG USER_UID=1000 12 | ARG USER_GID=$USER_UID 13 | 14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \ 15 | && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \ 16 | && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \ 17 | && chmod 0440 /etc/sudoers.d/$USERNAME 18 | 19 | WORKDIR /var/task 20 | 21 | COPY requirements.txt /var/task 22 | COPY summarization.py /var/task 23 | 24 | RUN chown -R ${user}:${user} /var/task && \ 25 | chmod 755 /var/task/summarization.py /var/task/requirements.txt 26 | 27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt 28 | 29 | USER ${USERNAME} 30 | 31 | CMD ["summarization.handler"] 32 | -------------------------------------------------------------------------------- /business_logic/lambdas/summarization/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 -------------------------------------------------------------------------------- /business_logic/lambdas/summarization/summarization.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import boto3 4 | from datetime import datetime 5 | from collections import Counter 6 | 7 | bedrock_client = boto3.client("bedrock-runtime") 8 | dynamodb = boto3.resource("dynamodb") 9 | model_id = os.environ["MODEL_ID"] 10 | table_name = os.environ["DYNAMODB_TABLE_NAME"] 11 | 12 | 13 | def generate_average_cluster_data(articles): 14 | # Initialize counters and variables for tracking 15 | location_counter = Counter() 16 | organization_counter = Counter() 17 | earliest_date = datetime.max 18 | latest_date = datetime.min 19 | 20 | # Check if articles list is empty 21 | if not articles: 22 | return { 23 | "most_common_location": "", 24 | "most_common_organization": "", 25 | "earliest_date": "", 26 | "latest_date": "", 27 | } 28 | 29 | # Process each article 30 | for article in articles: 31 | publication_date = None 32 | if article.get("publication_date"): 33 | publication_date = datetime.fromisoformat( 34 | article.get("publication_date").rstrip("Z") 35 | ) 36 | location_counter.update(article.get("locations")) 37 | organization_counter.update(article.get("organizations")) 38 | 39 | if publication_date and publication_date < earliest_date: 40 | earliest_date = publication_date 41 | if publication_date and publication_date > latest_date: 42 | latest_date = publication_date 43 | 44 | # Handle case where no locations or organizations were found 45 | if location_counter: 46 | most_common_location, _ = location_counter.most_common(1)[0] 47 | else: 48 | most_common_location = "" 49 | 50 | if organization_counter: 51 | most_common_organization, _ = organization_counter.most_common(1)[0] 52 | else: 53 | most_common_organization = "" 54 | 55 | # Adjusted return to include a check for the date range 56 | return { 57 | "most_common_location": most_common_location, 58 | "most_common_organization": most_common_organization, 59 | "earliest_date": earliest_date.strftime("%Y-%m-%d %H:%M:%S"), 60 | "latest_date": latest_date.strftime("%Y-%m-%d %H:%M:%S"), 61 | } 62 | 63 | 64 | def get_cluster_data(cluster_id): 65 | # Initialize a DynamoDB client 66 | table = dynamodb.Table(table_name) 67 | 68 | # 
Query the table 69 | response = table.query( 70 | KeyConditionExpression=boto3.dynamodb.conditions.Key("PK").eq(cluster_id), 71 | ) 72 | cluster_data = response.get("Items", []) 73 | 74 | # Extract the first item 75 | metadata = cluster_data[0] 76 | articles = cluster_data[1:] 77 | summary_count = metadata.get("summary_count", 0) 78 | 79 | return metadata.get("generated_summary", ""), summary_count, articles 80 | 81 | 82 | def generate_bedrock_claude(input_tokens): 83 | claude_body = { 84 | "modelId": model_id, 85 | "body": json.dumps( 86 | { 87 | "anthropic_version": "bedrock-2023-05-31", 88 | "messages": [{"role": "user", "content": input_tokens}], 89 | "max_tokens": 500, # the higher this is the longer it takes 90 | "temperature": 0.1, # these parameters affect response diversity 91 | "top_p": 1, 92 | "top_k": 100, 93 | } 94 | ), 95 | } 96 | bedrock_response = bedrock_client.invoke_model( 97 | **claude_body, 98 | accept="*/*", 99 | contentType="application/json", 100 | ) 101 | body = bedrock_response.get("body") 102 | rd = body.read() 103 | body_json = json.loads(rd) 104 | try: 105 | response = body_json["content"][0].get("text") 106 | output_token_cnt = int( 107 | bedrock_response["ResponseMetadata"]["HTTPHeaders"].get( 108 | "x-amzn-bedrock-output-token-count" 109 | ) 110 | ) 111 | input_token_cnt = int( 112 | bedrock_response["ResponseMetadata"]["HTTPHeaders"].get( 113 | "x-amzn-bedrock-input-token-count" 114 | ) 115 | ) 116 | except Exception: 117 | print(rd) 118 | return input_token_cnt, output_token_cnt, response 119 | 120 | 121 | def parse_res(res): 122 | try: 123 | title = res.split("")[-1].split("")[0] 124 | summary = res.split("")[-1].split("")[0] 125 | return title, summary 126 | except Exception: 127 | return "", res 128 | 129 | 130 | def generate_cluster_summary(previous_summary, articles, limit): 131 | input_context = [] 132 | # If we've done summaries before we'll limit the input tokens for each summary 133 | limit_number = 2000 134 | if limit: 135 | limit_number = 1500 136 | instructions = "You will be provided with multiple sets of titles and summaries from different articles in <context> tag, and the current title and summary for a story in <story> tag. Compile, summarize and update the current title and summary for the story. The summary should be less than 100 words. Put the generated context inside <title> and <summary> tag. 
Do not hallucinate or make up content.\n\n" 137 | texts = "\n".join( 138 | [ 139 | f"title: {article.get('title')}, summary: {article.get('summary', "")[:limit_number]}" 140 | for article in articles 141 | ] 142 | ) 143 | prompt = f"{instructions} <story> \n{previous_summary} </story> \n\n <context>\n{texts}\n</context>\n" 144 | print("Prompt Length:", len(prompt)) 145 | input_context.append(prompt) 146 | output = generate_bedrock_claude(prompt[:12000]) 147 | title, summary = parse_res(output[2]) 148 | 149 | return {"title": title, "summary": summary} 150 | 151 | 152 | """ 153 | Event Expected in following format 154 | { 155 | cluster_id: "198be4aa-95e8-4d8e-9e0b-a37eef6c29e2" 156 | } 157 | """ 158 | 159 | 160 | def handler(event, context): 161 | print("Input Event", event) 162 | 163 | previous_summary, summary_count, articles = get_cluster_data(event["cluster_id"]) 164 | generated_summary = generate_cluster_summary(previous_summary, articles, summary_count > 0) 165 | averages = generate_average_cluster_data(articles) 166 | 167 | print("Generated Summary", generated_summary) 168 | print("Averages", averages) 169 | 170 | return {**generated_summary, **averages, "summary_count": summary_count + 1} 171 | -------------------------------------------------------------------------------- /business_logic/lambdas/trigger_sfn/Dockerfile: -------------------------------------------------------------------------------- 1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images 2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used 3 | 4 | FROM amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d 5 | 6 | USER root 7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all 8 | 9 | # Set a non-root user 10 | ARG USERNAME=lambda 11 | ARG USER_UID=1000 12 | ARG USER_GID=$USER_UID 13 | 14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \ 15 | && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \ 16 | && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \ 17 | && chmod 0440 /etc/sudoers.d/$USERNAME 18 | 19 | WORKDIR /var/task 20 | 21 | COPY requirements.txt /var/task 22 | COPY trigger_sfn.py /var/task 23 | 24 | RUN chown -R ${user}:${user} /var/task && \ 25 | chmod 755 /var/task/trigger_sfn.py /var/task/requirements.txt 26 | 27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt 28 | 29 | USER ${USERNAME} 30 | 31 | CMD ["trigger_sfn.handler"] 32 | -------------------------------------------------------------------------------- /business_logic/lambdas/trigger_sfn/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 -------------------------------------------------------------------------------- /business_logic/lambdas/trigger_sfn/trigger_sfn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import os 4 | 5 | 6 | def handler(event, context): 7 | # Initialize the DynamoDB and Step Functions clients 8 | dynamodb_client = boto3.client("dynamodb") 9 | sfn_client = boto3.client("stepfunctions") 10 | 11 | # State Machine ARN and the threshold for number_of_articles from environment variables 12 | state_machine_arn = os.environ["STATE_MACHINE_ARN"] 13 | articles_threshold = int(os.environ["ARTICLES_THRESHOLD"]) 14 | article_cap = 3 # A multiple of articles_threshold, to stop processing summaries 15 | 
# DynamoDB table name 16 | table_name = os.environ["DYNAMODB_TABLE_NAME"] 17 | 18 | # Process each record in the DynamoDB Stream 19 | for record in event[ 20 | "Records" 21 | ]: # ! ToDo aggregate records and send them in batch instead of one at at Time 22 | if record["eventName"] == "INSERT": 23 | new_image = record["dynamodb"].get("NewImage", {}) 24 | print("New Record") 25 | if "type" in new_image and new_image["type"]["S"] == "article": 26 | print("Record is an Article") 27 | 28 | # Extract primary key (PK) 29 | pk_value = new_image["PK"]["S"] 30 | metadata_key = f"#METADATA#{pk_value}" 31 | print("PK is", pk_value) 32 | 33 | # Get the item with PK and #METADATA#[PK] sort key 34 | response = dynamodb_client.get_item( 35 | TableName=table_name, 36 | Key={"PK": {"S": pk_value}, "SK": {"S": metadata_key}}, 37 | ) 38 | item = response.get("Item", {}) 39 | print("Cluster: ", item) 40 | 41 | # If we get an empty item with no articles move to the next record 42 | if "number_of_articles" not in item: 43 | continue 44 | 45 | summary_count = int(item.get("summary_count", {"N": "0"})["N"]) 46 | lower_limit_flag = int(item["number_of_articles"]["N"]) > articles_threshold * (summary_count + 1) 47 | upper_limit_flag = int(item["number_of_articles"]["N"]) < 3 * articles_threshold 48 | 49 | print("Summary Count:", summary_count) 50 | print("Lower Limit Flag:", lower_limit_flag) 51 | print("Upper Limit Flag:", upper_limit_flag) 52 | print("Overall flag:", (lower_limit_flag and upper_limit_flag) or (lower_limit_flag and summary_count == 0)) 53 | 54 | # Check if number_of_articles is within a range or if it is outside the upper limit but still hasn't been summarized 55 | if (lower_limit_flag and upper_limit_flag) or (lower_limit_flag and summary_count == 0): 56 | # Prepare data for Step Functions 57 | input_data = { 58 | "cluster_id": pk_value, 59 | } 60 | 61 | # Start execution of the state machine 62 | response = sfn_client.start_execution( 63 | stateMachineArn=state_machine_arn, input=json.dumps(input_data) 64 | ) 65 | 66 | print( 67 | f"Started Step Functions execution for 'article' record: {response['executionArn']}" 68 | ) 69 | else: 70 | print( 71 | "Not enough articles in the cluster yet, less than ", 72 | articles_threshold, 73 | ) 74 | 75 | return { 76 | "statusCode": 200, 77 | "body": json.dumps( 78 | 'Processed DynamoDB stream records of type "article" with sufficient count.' 
79 | ), 80 | } 81 | -------------------------------------------------------------------------------- /business_logic/model_artifacts/embedding/model/code/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import torch.multiprocessing as mp 6 | 7 | mp.set_start_method("spawn", force=True) 8 | 9 | MODEL_NAME = os.environ.get("MODEL_NAME") 10 | BIT_LOADING = os.environ.get("BIT_LOADING") 11 | print(f"MODEL_NAME: {MODEL_NAME}") 12 | print(f"BIT_LOADING: {BIT_LOADING}") 13 | MODEL_MAP = { 14 | "mistralinstruct": None, 15 | "bge": None, 16 | } 17 | 18 | 19 | print("Current working directory: ", os.getcwd()) 20 | print("List current working directory: ", os.listdir(os.getcwd())) 21 | 22 | 23 | def model_fn(model_dir): 24 | try: 25 | print(f"In model_fn, model_dir={model_dir}") 26 | print(f"CWD: {os.getcwd()}") 27 | print(f"List CWD: {os.listdir(os.getcwd())}") 28 | print(f"List model_dir: {os.listdir(model_dir)}") 29 | sys.path.append(model_dir + "/model") 30 | print(f"Sys path: {sys.path}") 31 | print(f"List model_dir/model: {os.listdir(model_dir+'/model')}") 32 | print(f"List model_dir/code: {os.listdir(model_dir+'/code')}") 33 | 34 | from embed_documents import EmbedDocuments 35 | 36 | print("Successfully imported EmbedDocuments") 37 | model_cls = EmbedDocuments(MODEL_NAME, MODEL_MAP[MODEL_NAME], BIT_LOADING) 38 | 39 | except Exception as e: 40 | print(f"WEIRD, error: {e}") 41 | return model_cls 42 | 43 | 44 | def input_fn(input_data, content_type="application/json"): 45 | """A default input_fn that can handle JSON, CSV and NPZ formats. 46 | 47 | Args: 48 | input_data: the request payload serialized in the content_type format 49 | content_type: the request content_type 50 | 51 | Returns: input_data deserialized into torch.FloatTensor or torch.cuda.FloatTensor depending if cuda is available. 52 | """ 53 | print(f"input_fn, input_data={input_data}, content_type={content_type}") 54 | # Process the input data (e.g., convert from JSON) 55 | print("input_fn") 56 | print("request body: ", input_data) 57 | if content_type == "application/json": 58 | print("request_content_type is application/json") 59 | data = json.loads(input_data) 60 | texts = data["input_texts"] 61 | return texts 62 | else: 63 | raise ValueError(f"Unsupported content type: {content_type}") 64 | 65 | 66 | def predict_fn(data, model): 67 | """A default predict_fn for PyTorch. Calls a model on data deserialized in input_fn. 68 | Runs prediction on GPU if cuda is available. 69 | 70 | Args: 71 | data: input data (torch.Tensor) for prediction deserialized by input_fn 72 | model: PyTorch model loaded in memory by model_fn 73 | 74 | Returns: a prediction 75 | """ 76 | print(f"predict_fn, data={data}, model={model}") 77 | start_time = time.time() 78 | new_doc = model.model_handler.encode(data) 79 | end_time = time.time() 80 | new_data = {"embeddings": new_doc, "time": end_time - start_time} 81 | return new_data 82 | 83 | 84 | def output_fn(prediction, content_type="application/json"): 85 | """A default output_fn for PyTorch. Serializes predictions from predict_fn to JSON, CSV or NPY format. 
86 | 87 | Args: 88 | prediction: a prediction result from predict_fn 89 | accept: type which the output data needs to be serialized 90 | 91 | Returns: output data serialized 92 | """ 93 | print(f"output_fn, prediction={prediction}, content_type={content_type}") 94 | if content_type == "application/json": 95 | print("content_type is application/json") 96 | return json.dumps(prediction) 97 | else: 98 | raise ValueError(f"Unsupported content type: {content_type}") 99 | -------------------------------------------------------------------------------- /business_logic/model_artifacts/embedding/model/code/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | bitsandbytes 3 | spacy 4 | torch 5 | transformers -------------------------------------------------------------------------------- /business_logic/model_artifacts/embedding/model/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/business_logic/model_artifacts/embedding/model/model/__init__.py -------------------------------------------------------------------------------- /business_logic/model_artifacts/embedding/model/model/embed_documents.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Literal 3 | import importlib 4 | from embedding_model_utils import PretrainedHandler 5 | 6 | print("END EMBED_DOCUMENNTS IMPORTS") 7 | 8 | 9 | class EmbedDocuments: 10 | def __init__( 11 | self, 12 | model_name: Literal["bge", "mistralinstruct"], 13 | pretrained_path=None, 14 | bit_loading=None, 15 | device=None, 16 | model_handler_module: str = "embedding_model_utils", 17 | ): 18 | 19 | self.supported_models = dict( 20 | bge="PretrainedBGELarge", 21 | mistralinstruct="PretrainedMistral7bInstruct", 22 | ) 23 | 24 | self.model_name = model_name.lower().strip() 25 | assert ( 26 | model_name in self.supported_models 27 | ), f"model_name is not supported. Choose from {list(self.supported_models.keys())}" 28 | 29 | self.bit_loading = bit_loading 30 | self.model_handler: PretrainedHandler = None 31 | 32 | if device is None: 33 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 34 | else: 35 | self.device = device 36 | 37 | self.models_module = importlib.import_module(model_handler_module) 38 | self.load_model(pretrained_path=pretrained_path) 39 | 40 | def load_model(self, pretrained_path=None): 41 | model_class_name = self.supported_models[self.model_name] 42 | 43 | if hasattr(self.models_module, model_class_name): 44 | model_class = getattr(self.models_module, model_class_name) 45 | else: 46 | raise NotImplementedError( 47 | "Model loading method does not exist. 
Check for typos or implement" 48 | ) 49 | 50 | self.model_handler = model_class( 51 | pretrained_path=pretrained_path, bit_loading=self.bit_loading 52 | ) 53 | 54 | assert self.model_handler is not None 55 | 56 | def delete_model(self): 57 | self.model_handler.model.to("cpu") 58 | del self.model_handler.model 59 | torch.cuda.empty_cache() 60 | -------------------------------------------------------------------------------- /business_logic/model_artifacts/embedding/model/model/embedding_model_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import Tensor 4 | from transformers import AutoTokenizer, AutoModel 5 | from typing import List 6 | from abc import ABC, abstractmethod 7 | 8 | 9 | class PretrainedHandler(ABC): 10 | def __init__(self, pretrained_path=None, bit_loading=None, device=None): 11 | self.model = None 12 | self.tokenizer = None 13 | 14 | if device is None: 15 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 16 | else: 17 | assert device in set( 18 | ["cuda", "cpu"] 19 | ), "Incorrect device chosen. Choose from [cuda, cpu]" 20 | self.device = device 21 | 22 | self.bit_loading = bit_loading 23 | self.get_model(pretrained_path=pretrained_path) 24 | 25 | @abstractmethod 26 | def get_model(self, pretrained_path=None) -> None: 27 | """ 28 | Instantiates self.model and self.tokenizer 29 | """ 30 | raise NotImplementedError 31 | 32 | def encode(self, texts: List[str]): 33 | """encode texts""" 34 | return self._encode()(texts) 35 | 36 | def _encode(self): 37 | """return the encoding method for the target model 38 | Can differ between models (e.g. model.encode, model, model.forward)""" 39 | return self.model.encode 40 | 41 | 42 | class PretrainedMistral7bInstruct(PretrainedHandler): 43 | 44 | @classmethod 45 | def last_token_pool( 46 | cls, last_hidden_states: Tensor, attention_mask: Tensor 47 | ) -> Tensor: 48 | left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] 49 | if left_padding: 50 | return last_hidden_states[:, -1] 51 | else: 52 | sequence_lengths = attention_mask.sum(dim=1) - 1 53 | batch_size = last_hidden_states.shape[0] 54 | return last_hidden_states[ 55 | torch.arange(batch_size, device=last_hidden_states.device), 56 | sequence_lengths, 57 | ] 58 | 59 | @classmethod 60 | def get_detailed_instruct(cls, task_description: str, query: str) -> str: 61 | return f"Instruct: {task_description}\nNewsPassage: {query}" 62 | 63 | def get_model(self, pretrained_path=None): 64 | 65 | model_source = ( 66 | "intfloat/e5-mistral-7b-instruct" 67 | if pretrained_path is None 68 | else pretrained_path 69 | ) 70 | 71 | # Each query must come with a one-sentence instruction that describes the task 72 | # Example 73 | # task = 'Given a web search query, retrieve relevant passages that answer the query' 74 | # input_texts = [self.get_detailed_instruct(task, 'how much protein should a female eat'), 75 | # self.get_detailed_instruct(task, 'summit define'), 76 | # "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", 77 | # "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. 
: 3 a meeting or series of meetings between the leaders of two or more governments."] 78 | self.tokenizer = AutoTokenizer.from_pretrained( 79 | pretrained_model_name_or_path=model_source 80 | ) 81 | 82 | assert ( 83 | torch.cuda.is_available() 84 | ), "GPU is needed to load model in 4-bit or 8-bit" 85 | 86 | if self.bit_loading == "4": 87 | print("loading in 4bit") 88 | 89 | self.model = AutoModel.from_pretrained( 90 | pretrained_model_name_or_path=model_source, 91 | load_in_4bit=True, 92 | bnb_4bit_compute_dtype=torch.float16, 93 | device_map=self.device, 94 | ) 95 | else: 96 | print("loading in 8bit") 97 | self.model = AutoModel.from_pretrained( 98 | pretrained_model_name_or_path=model_source, load_in_8bit=True 99 | ) 100 | 101 | self.model.eval() 102 | 103 | def encode(self, texts: List[str]): 104 | max_length = 4096 105 | 106 | task = "Given this news passage, retrieve relevant news passages that pertain to the same event (who, what, where, when)" 107 | texts = [self.get_detailed_instruct(task, text) for text in texts] 108 | 109 | # Tokenize the input texts 110 | batch_dict = self.tokenizer( 111 | texts, 112 | max_length=max_length - 1, 113 | return_attention_mask=False, 114 | padding=False, 115 | truncation=True, 116 | ) 117 | 118 | # append eos_token_id to every input_ids 119 | batch_dict["input_ids"] = [ 120 | input_ids + [self.tokenizer.eos_token_id] 121 | for input_ids in batch_dict["input_ids"] 122 | ] 123 | batch_dict = self.tokenizer.pad( 124 | batch_dict, padding=True, return_attention_mask=True, return_tensors="pt" 125 | ) 126 | 127 | return self._encode(encoded_input=batch_dict) 128 | 129 | def _encode(self, encoded_input=None): 130 | with torch.no_grad(): 131 | outputs = self.model(**encoded_input) 132 | 133 | embeddings = self.last_token_pool( 134 | outputs.last_hidden_state, encoded_input["attention_mask"] 135 | ) 136 | 137 | # normalize embeddings 138 | embeddings = F.normalize(embeddings, p=2, dim=1) 139 | 140 | embeddings = embeddings.to("cpu").tolist() 141 | 142 | return embeddings 143 | 144 | 145 | class PretrainedBGELarge(PretrainedHandler): 146 | 147 | def get_model(self, pretrained_path=None): 148 | 149 | model_source = ( 150 | "BAAI/bge-large-zh-v1.5" if pretrained_path is None else pretrained_path 151 | ) 152 | 153 | # Load model from HuggingFace Hub 154 | tokenizer = AutoTokenizer.from_pretrained(model_source) 155 | model = AutoModel.from_pretrained(model_source) 156 | model.eval() 157 | 158 | self.model = model 159 | self.tokenizer = tokenizer 160 | 161 | model.to(self.device) 162 | 163 | def encode(self, texts: List[str]): 164 | 165 | # # Tokenize sentencesxs 166 | # encoded_input = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt') 167 | 168 | # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages) 169 | instruction = "Embed this passage for clustering on the topic of discussion in the news article: " 170 | encoded_input = self.tokenizer( 171 | [instruction + t for t in texts], 172 | padding=True, 173 | truncation=True, 174 | max_length=512, 175 | return_tensors="pt", 176 | ) 177 | 178 | encoded_input.to(self.device) 179 | 180 | return self._encode()(encoded_input) 181 | 182 | def _encode(self): 183 | def forward(encoded_input): 184 | # Compute token embeddings 185 | with torch.no_grad(): 186 | model_output = self.model(**encoded_input) 187 | # Perform pooling. In this case, cls pooling. 
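# Descriptive note (added): model_output[0] is the last_hidden_state tensor of
# shape (batch_size, sequence_length, hidden_size); indexing [:, 0] selects the
# [CLS] token vector, which BGE models use as the sentence embedding.
# The L2 normalization below makes downstream cosine similarity a plain dot
# product, which is how the stream consumer computes distances.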
188 | sentence_embeddings = model_output[0][:, 0] 189 | 190 | # normalize embeddings 191 | sentence_embeddings = ( 192 | torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) 193 | .to("cpu") 194 | .tolist() 195 | ) 196 | 197 | return sentence_embeddings 198 | 199 | return forward 200 | -------------------------------------------------------------------------------- /business_logic/model_artifacts/multi_gpu_embedding/model/code/inference.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig 2 | import torch 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | from transformers import AutoTokenizer, AutoModel 6 | import traceback 7 | from accelerate import Accelerator 8 | 9 | accelerate = Accelerator() 10 | 11 | 12 | def model_fn(model_dir, context): 13 | 14 | # load tokenizer and model from model_dir 15 | try: 16 | device = f"cuda:{context._system_properties['gpu_id']}" 17 | print(f"LOADING MODEL onto: {device}") 18 | model = AutoModel.from_pretrained( 19 | model_dir, 20 | quantization_config=BitsAndBytesConfig(load_in_8bit=True), 21 | device_map=device, 22 | ) 23 | model.eval() 24 | 25 | except Exception as e: 26 | print("FAILED: LOADING MODEL") 27 | print(e) 28 | print(traceback.format_exc()) 29 | 30 | tokenizer = AutoTokenizer.from_pretrained(model_dir) 31 | 32 | return tokenizer, model 33 | 34 | 35 | def predict_fn(data, tokenizer_and_model): 36 | torch.cuda.empty_cache() 37 | 38 | # unpack tokenizer and model 39 | tokenizer, model = tokenizer_and_model 40 | 41 | # Grab the data 42 | texts = data.pop("input_texts") 43 | max_length = data.pop("max_length") 44 | 45 | def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: 46 | 47 | left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] 48 | if left_padding: 49 | return last_hidden_states[:, -1] 50 | else: 51 | sequence_lengths = attention_mask.sum(dim=1) - 1 52 | batch_size = last_hidden_states.shape[0] 53 | return last_hidden_states[ 54 | torch.arange(batch_size, device=last_hidden_states.device), 55 | sequence_lengths, 56 | ] 57 | 58 | def get_detailed_instruct(task_description: str, query: str) -> str: 59 | return f"Instruct: {task_description}\nQuery: {query}" 60 | 61 | print("PROCESSING texts") 62 | task = "Given this news passage, retrieve relevant news passages that pertain to the same event (who, what, where, when)" 63 | texts = [get_detailed_instruct(task, text) for text in texts] 64 | 65 | # Tokenize the input texts 66 | batch_dict = tokenizer( 67 | texts, 68 | max_length=max_length - 1, 69 | return_attention_mask=False, 70 | padding=False, 71 | truncation=True, 72 | ) 73 | 74 | print("TOKENIZED texts") 75 | # append eos_token_id to every input_ids 76 | batch_dict["input_ids"] = [ 77 | input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict["input_ids"] 78 | ] 79 | batch_dict = tokenizer.pad( 80 | batch_dict, padding=True, return_attention_mask=True, return_tensors="pt" 81 | ) 82 | 83 | try: 84 | print("FORWARD PASS") 85 | with torch.no_grad(): 86 | outputs = model(**batch_dict) 87 | 88 | print("GET EMBEDDINGS") 89 | embeddings = last_token_pool( 90 | outputs.last_hidden_state, batch_dict["attention_mask"] 91 | ) 92 | 93 | # normalize embeddings 94 | embeddings = F.normalize(embeddings.to(torch.float32), p=2, dim=1) 95 | 96 | embeddings = embeddings.to("cpu").tolist() 97 | except Exception as e: 98 | print("FORWARD ERROR") 99 | 
print(traceback.format_exc()) 100 | print(e) 101 | embeddings = [None for _ in range(len(texts))] 102 | 103 | del batch_dict 104 | 105 | return {"embeddings": embeddings} 106 | -------------------------------------------------------------------------------- /business_logic/model_artifacts/multi_gpu_embedding/model/code/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.27.2 2 | transformers==4.48.0 3 | bitsandbytes==0.42.0 4 | --extra-index-url https://download.pytorch.org/whl/cu118 5 | torch==2.2.1 6 | huggingface-hub -------------------------------------------------------------------------------- /business_logic/stream_consumer/clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from sklearn.neighbors import sort_graph_by_row_values 4 | from scipy.sparse import csr_matrix, tril 5 | from joblib import Parallel, delayed 6 | import functools 7 | 8 | 9 | def timer(func): 10 | @functools.wraps(func) 11 | def wrapper(*args, **kwargs): 12 | start = time.time() 13 | result = func(*args, **kwargs) 14 | end = time.time() 15 | print(f"{func.__name__}\t{end - start:f}") 16 | return result 17 | 18 | return wrapper 19 | 20 | 21 | def sort_row(data_slice, indices_slice): 22 | order = np.argsort(data_slice, kind="mergesort") 23 | return data_slice[order], indices_slice[order] 24 | 25 | 26 | def parallel_sort_rows(graph): 27 | # Get the slices of data and indices 28 | data_slices = [ 29 | graph.data[start:stop] 30 | for start, stop in zip(graph.indptr[:-1], graph.indptr[1:]) 31 | ] 32 | indices_slices = [ 33 | graph.indices[start:stop] 34 | for start, stop in zip(graph.indptr[:-1], graph.indptr[1:]) 35 | ] 36 | 37 | # Sort each slice in parallel 38 | sorted_slices = Parallel(n_jobs=-1)( 39 | delayed(sort_row)(data_slice, indices_slice) 40 | for data_slice, indices_slice in zip(data_slices, indices_slices) 41 | ) 42 | 43 | # Update the graph with sorted slices 44 | for (start, stop), (sorted_data, sorted_indices) in zip( 45 | zip(graph.indptr[:-1], graph.indptr[1:]), sorted_slices 46 | ): 47 | graph.data[start:stop] = sorted_data 48 | graph.indices[start:stop] = sorted_indices 49 | 50 | return graph 51 | 52 | 53 | def batch_update_numpy_distance_matrix(new_embeds, cluster_pool, batch_size=120): 54 | 55 | # Convert the vectors to NumPy arrays 56 | vectors_numpy = np.array(new_embeds) 57 | cluster_pool_numpy = np.array(cluster_pool) 58 | norms = np.linalg.norm(vectors_numpy, axis=1, keepdims=True) # L2 Norm 59 | normalized_vectors = vectors_numpy / norms # Unit vectors 60 | norms = np.linalg.norm(cluster_pool_numpy, axis=1, keepdims=True) # L2 Norm 61 | normalized_pool = cluster_pool_numpy / norms 62 | 63 | # Initialize an empty similarity matrix 64 | distance_matrix = np.zeros( 65 | (len(vectors_numpy), len(cluster_pool_numpy)), dtype=np.float16 66 | ) 67 | 68 | # Iterate through the vectors in batches 69 | for start in range(0, len(cluster_pool_numpy), batch_size): 70 | end = min(start + batch_size, len(cluster_pool_numpy)) 71 | batch_cluster_pool = normalized_pool[start:end] 72 | 73 | # Compute cosine similarity for the batch 74 | similarity_batch = np.dot(normalized_vectors, batch_cluster_pool.T) 75 | 76 | # Convert similarity to distance 77 | distance_batch = 1 - similarity_batch 78 | 79 | # Fill in the corresponding section of the distance matrix 80 | distance_matrix[:, start:end] = distance_batch 81 | 82 | # Clip values to prevent numerical issues 
that might result in values slightly outside [0, 1] 83 | distance_matrix = np.clip(distance_matrix, 0, 1) 84 | 85 | return distance_matrix 86 | 87 | 88 | def get_sparse_distance_matrix(dense, n_priors): 89 | 90 | values = dense.flatten().astype(np.float32) 91 | 92 | row_indices = [*range(0, dense.shape[1])] * dense.shape[0] 93 | 94 | column_pointers = [0] * (n_priors + 1) + [ 95 | *range(dense.shape[1], dense.shape[0] * (dense.shape[1] + 1), dense.shape[1]) 96 | ] 97 | 98 | sparse_matrix = csr_matrix( 99 | (values, row_indices, column_pointers), shape=(dense.shape[1], dense.shape[1]) 100 | ) 101 | sparse_matrix = make_symmetric(sparse_matrix=sparse_matrix) 102 | 103 | if n_priors < 15000: 104 | res = sort_graph_by_row_values( 105 | sparse_matrix, copy=True, warn_when_not_sorted=False 106 | ) 107 | else: 108 | res = parallel_sort_rows(sparse_matrix) 109 | 110 | return res 111 | 112 | 113 | def make_symmetric(sparse_matrix): 114 | 115 | low_tri = tril(sparse_matrix, k=0) 116 | symmetric_matrix = low_tri + tril(low_tri, k=-1).T 117 | 118 | return symmetric_matrix 119 | 120 | 121 | def prep_for_streaming(documents, interval=40): 122 | 123 | # split for streaming 124 | doc_splits = {} 125 | 126 | aug_records = documents 127 | estimated_time = len(aug_records) / interval 128 | for j, i in enumerate(range(0, len(aug_records), interval)): 129 | doc_splits[j] = aug_records[i : i + interval] 130 | 131 | return doc_splits, estimated_time 132 | -------------------------------------------------------------------------------- /business_logic/stream_consumer/process_records.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import json 3 | import botocore 4 | from clustering import ( 5 | batch_update_numpy_distance_matrix, 6 | get_sparse_distance_matrix, 7 | ) 8 | import numpy as np 9 | import time 10 | import boto3 11 | from sklearn.cluster import DBSCAN 12 | import uuid 13 | from datetime import datetime 14 | import ast # Use Abstract Syntax Trees module to safely evaluate string representation of dictionaries 15 | import functools 16 | import os 17 | import pickle 18 | import threading 19 | import copy 20 | from botocore.exceptions import ClientError 21 | 22 | # Initialize AWS clients 23 | s3 = boto3.client("s3") 24 | ssm = boto3.client("ssm") 25 | dynamodb = boto3.resource("dynamodb") 26 | sqs = boto3.client("sqs") 27 | 28 | # Configuration variables 29 | S3_BUCKET_NAME = os.environ["S3_BUCKET_NAME"] 30 | S3_FILE_KEY = os.environ["S3_FILE_KEY"] 31 | SQS_QUEUE = os.environ["SQS_QUEUE"] 32 | DYNAMODB_TABLE = os.environ["DYNAMODB_TABLE"] 33 | 34 | # Setup for clustering 35 | label_tracker: List[tuple] = [] 36 | is_cluster: List[bool] = [] 37 | embeds: List = None 38 | 39 | distance_matrix = None 40 | 41 | unique_article_id = 0 42 | unique_cluster_id = 0 43 | cluster_count = 0 44 | 45 | # Stream 46 | batch_times = [] # 47 | processed_pool_sizes = [] 48 | incoming_articles = [] 49 | 50 | 51 | def timer(func): 52 | @functools.wraps(func) 53 | def wrapper(*args, **kwargs): 54 | start = time.time() 55 | result = func(*args, **kwargs) 56 | end = time.time() 57 | print(f"{func.__name__}\t{end - start:f}") 58 | return result 59 | 60 | return wrapper 61 | 62 | 63 | # Format docs for clustering 64 | @timer 65 | def format_documents(messages): 66 | print("Format Docs") 67 | converted_messages = [] 68 | associated_articles = {} 69 | seen_ids = set() # Keep track of seen ids 70 | 71 | for msg in messages: 72 | try: 73 | message_body = 
json.loads(msg.get("Body", "{}")) 74 | except json.JSONDecodeError: 75 | continue # Skip this message if there's a problem parsing it 76 | 77 | message_id = message_body.get("id") 78 | 79 | # Check for duplicate ids and skip if found 80 | if message_id in seen_ids: 81 | continue 82 | else: 83 | seen_ids.add(message_id) 84 | 85 | # Proceed if id is not a duplicate 86 | embeddings = np.asarray(message_body["concat_embedding"][0]) 87 | 88 | converted_messages.append( 89 | { 90 | "id": message_id, 91 | "concat_embedding": embeddings, 92 | } 93 | ) 94 | associated_articles[message_id] = message_body 95 | 96 | return converted_messages, associated_articles 97 | 98 | 99 | @timer 100 | def batch_get_meta_data(keys_to_get): 101 | items = [] # List to store the successfully retrieved items 102 | missing_keys = [] # List to store keys of items that were not found 103 | unprocessed_keys = keys_to_get # Start with all keys as unprocessed 104 | 105 | while unprocessed_keys: 106 | # Prepare the current batch request 107 | request = { 108 | "RequestItems": { 109 | DYNAMODB_TABLE: { 110 | "Keys": unprocessed_keys[ 111 | :100 112 | ] # DynamoDB limits to 100 items per batch 113 | } 114 | } 115 | } 116 | 117 | # Perform the batch get operation 118 | response = dynamodb.batch_get_item(RequestItems=request["RequestItems"]) 119 | 120 | # Add the successfully retrieved items to our results list 121 | items.extend(response["Responses"][DYNAMODB_TABLE]) 122 | 123 | # Update unprocessed_keys based on UnprocessedKeys from the response 124 | unprocessed_keys_info = response.get("UnprocessedKeys", {}) 125 | unprocessed_keys = unprocessed_keys_info.get(DYNAMODB_TABLE, {}).get("Keys", []) 126 | 127 | # If there are more than 100 unprocessed keys, prepare the next batch 128 | if unprocessed_keys: 129 | unprocessed_keys = unprocessed_keys[100:] 130 | 131 | # Assuming items is the list of items returned from DynamoDB 132 | found_keys = [{"PK": item["PK"], "SK": item["SK"]} for item in items] 133 | 134 | # Assuming keys_to_get is the list of keys you originally requested 135 | requested_keys = keys_to_get # No change needed if keys_to_get already structured as [{'PK': ..., 'SK': ...}, ...] 
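# Descriptive note (added): keys that were requested but not returned belong to
# clusters that do not yet have a #METADATA item in the table;
# add_items_to_dynamodb creates fresh metadata entries for those missing keys.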
136 | 137 | # To find missing keys, we'll convert these dictionaries to a comparable format (e.g., string) because dictionaries cannot be directly compared in sets 138 | found_keys_str = set([str(k) for k in found_keys]) 139 | requested_keys_str = set([str(k) for k in requested_keys]) 140 | 141 | # Identify missing keys by comparing their string representations 142 | missing_keys_str = requested_keys_str - found_keys_str 143 | 144 | # Convert back to dictionaries using ast.literal_eval for safety 145 | missing_keys = [ast.literal_eval(k) for k in missing_keys_str] 146 | 147 | return items, missing_keys 148 | 149 | 150 | def check_for_repeats(strings): 151 | seen = set() 152 | for string in strings: 153 | if string in seen: 154 | return True # Found a repeat 155 | seen.add(string) 156 | return False # No repeats found 157 | 158 | 159 | def find_duplicates(items): 160 | # Track occurrences of (PK, SK) tuples 161 | occurrences = {} 162 | # Track duplicates 163 | duplicates = {} 164 | 165 | for item in items: 166 | pk_sk_tuple = (item["PK"], item["SK"]) 167 | if pk_sk_tuple in occurrences: 168 | occurrences[pk_sk_tuple] += 1 169 | duplicates[pk_sk_tuple] = occurrences[pk_sk_tuple] 170 | else: 171 | occurrences[pk_sk_tuple] = 1 172 | 173 | # Check if there are duplicates and throw an error 174 | if duplicates: 175 | duplicate_details = ", ".join( 176 | [f"{duplicate}: {count}" for duplicate, count in duplicates.items()] 177 | ) 178 | raise ValueError(f"Duplicates found - {duplicate_details}") 179 | 180 | 181 | @timer 182 | def add_items_to_dynamodb(articles, clusters, associated_articles): 183 | # Get the table 184 | table = dynamodb.Table(DYNAMODB_TABLE) 185 | 186 | keys_to_get = [ 187 | {"PK": cluster[0], "SK": f"#METADATA#{cluster[0]}"} for cluster in clusters 188 | ] 189 | 190 | # Convert to the desired dictionary format 191 | cluster_associations = {} 192 | 193 | # Initialize a dictionary to keep track of items to batch write 194 | items_to_batch_write = {} 195 | for item in clusters: 196 | key, article_ids = item 197 | cluster_associations[key] = article_ids 198 | 199 | existing_metadata, missing_keys = batch_get_meta_data(keys_to_get) 200 | print("Missing Keys: ", len(missing_keys)) 201 | print("Existing Metadata: ", len(existing_metadata)) 202 | 203 | for item in existing_metadata: 204 | pk_sk = (item["PK"], item["SK"]) 205 | 206 | # Assume 'NumAttribute' exists, increment it 207 | if "number_of_articles" in item: 208 | item["number_of_articles"] += ( 209 | len(cluster_associations[item["PK"]]) - 1 210 | ) # Subtract one for metadata 211 | # Check for duplicates 212 | if pk_sk in items_to_batch_write: 213 | print(f"Duplicate found for existing metadata: {pk_sk}") 214 | items_to_batch_write[pk_sk] = item 215 | 216 | # For unprocessed keys write a new METADATA entry 217 | for key in missing_keys: 218 | pk_sk = (key["PK"], f"#METADATA#{key['PK']}") 219 | item = { 220 | "PK": key["PK"], 221 | "SK": f"#METADATA#{key['PK']}", 222 | "type": "metadata", 223 | "created_at": datetime.now().isoformat(), 224 | "number_of_articles": len(cluster_associations[key["PK"]]) + 1, 225 | "generated_summary": "", 226 | "summary_count": 0, 227 | "description": "", 228 | "is_cluster": True, 229 | } # Partition Key # Sort Key 230 | if pk_sk in items_to_batch_write: 231 | print(f"Duplicate found for new metadata: {pk_sk}") 232 | items_to_batch_write[pk_sk] = item 233 | 234 | for cluster_id, ids in clusters + articles: 235 | for article_id in ids: 236 | pk_sk = (cluster_id, f"ARTICLE#{article_id}") 237 | article 
= associated_articles.get(article_id) 238 | 239 | # ! This is accounting for a bug, should not have to be done!! 240 | if article is not None: 241 | # Define the item to be inserted 242 | item = { 243 | "PK": cluster_id, 244 | "SK": f"ARTICLE#{article_id}", 245 | "type": "article", 246 | "article_id": article_id, 247 | "title": article.get("title"), 248 | "summary": article.get("summary"), 249 | "text": article.get("text"), 250 | "organizations": article.get("organizations_fd"), 251 | "locations": article.get("locations_fd"), 252 | # "article_sentiment": article.get("article_sentiment"), 253 | "publication_date": article.get("publication_date"), 254 | "entry_creation_date": datetime.now().isoformat(), 255 | } # Partition Key # Sort Key 256 | else: 257 | item = { 258 | "PK": cluster_id, 259 | "SK": f"ARTICLE#{article_id}", 260 | "type": "article", 261 | "article_id": article_id, 262 | "entry_creation_date": datetime.now().isoformat(), 263 | } # Partition Key # Sort Key 264 | 265 | # Check for duplicates 266 | if pk_sk in items_to_batch_write: 267 | print(f"Duplicate found for article: {pk_sk}") 268 | items_to_batch_write[pk_sk] = item 269 | 270 | # Write aggregated items to DynamoDB using batch writer 271 | with table.batch_writer() as batch: 272 | for pk_sk, item in items_to_batch_write.items(): 273 | batch.put_item(Item=item) 274 | 275 | 276 | def find_string_duplicates(strings): 277 | seen = set() 278 | duplicates = set(string for string in strings if string in seen or seen.add(string)) 279 | if duplicates: 280 | raise ValueError(f"Duplicates: {', '.join(duplicates)}") 281 | 282 | 283 | @timer 284 | def cluster(records): 285 | # Set Global Variables # ToDO Find "pythonic" way of doing this 286 | global label_tracker 287 | global is_cluster 288 | global distance_matrix 289 | global embeds 290 | 291 | global unique_article_id 292 | global unique_cluster_id 293 | global cluster_count 294 | global batch_times 295 | global processed_pool_sizes 296 | 297 | batch_update_distance_matrix = ( 298 | batch_update_numpy_distance_matrix # For now we will always use this function 299 | ) 300 | 301 | eps = 0.10 # ToDo Parameterize 302 | 303 | print("***\t***") 304 | print(f"Starting eps:\t{eps}") 305 | 306 | # Configure logging 307 | metric = "precomputed" 308 | clustering_args = dict(eps=eps, min_samples=2, metric=metric, n_jobs=-1) 309 | 310 | batch_time = time.time() 311 | 312 | # report cluster pool metrics 313 | processed_pool_size = len(label_tracker) 314 | number_of_singletons = processed_pool_size - cluster_count 315 | print(f"Number of clusters in pool:\t{cluster_count}") 316 | print(f"Number of singletons in pool:\t{number_of_singletons}") 317 | 318 | # add this batch to bookkeeping 319 | processed_pool_sizes.append(processed_pool_size) 320 | 321 | label_tracker.extend( 322 | [(str(uuid.uuid4()), [doc["id"]]) for i, doc in enumerate(records)] 323 | ) 324 | 325 | is_cluster.extend([False for _ in range(len(records))]) 326 | 327 | # Size of existing cluster_pool. 328 | old_size = len(embeds) if embeds is not None else 0 329 | 330 | # update embedding list 331 | new_embeds = [doc["concat_embedding"] for doc in records] 332 | 333 | if embeds is not None: 334 | embeds.extend(new_embeds) 335 | else: 336 | embeds = new_embeds 337 | 338 | unique_article_id += len(records) # increment by number of samples added 339 | 340 | # get distances from new samples to old samples 341 | # M X [[N], [M]] = M x N+M matrix 342 | # TODO: This implementation vs. 
Database 343 | # TODO: Thresholding to make it more sparse 344 | add_to_distance_matrix = batch_update_distance_matrix( 345 | np.ascontiguousarray(new_embeds), 346 | cluster_pool=np.ascontiguousarray(embeds), 347 | ) 348 | 349 | # Convert (M, N+M) -> (N+M, N+M), make sparse if possible 350 | if distance_matrix is None: 351 | distance_matrix = add_to_distance_matrix 352 | else: 353 | distance_matrix = get_sparse_distance_matrix( 354 | add_to_distance_matrix, old_size if old_size > 0 else None 355 | ) 356 | 357 | # Cluster 358 | clusterer = DBSCAN(**clustering_args).fit(distance_matrix) 359 | 360 | # Update clusters and singletons 361 | update_time = time.time() 362 | unique_labels = np.unique(clusterer.labels_) 363 | to_remove = set() 364 | updated_clusters = [] # Indicies to update database 365 | 366 | # Cluster formation 367 | for label in unique_labels: 368 | if label != -1: 369 | indices = np.nonzero(clusterer.labels_ == label)[0] 370 | 371 | update_idx = indices[0] 372 | 373 | # * Don't need for DB 374 | to_remove.update( 375 | [i for i in indices[1:] if not is_cluster[i]] 376 | ) # keep track of items to remove from all items 377 | 378 | added_articles = [ 379 | label_tracker[id_idx][1][0] 380 | for id_idx in indices[1:] 381 | if not is_cluster[id_idx] 382 | ] 383 | 384 | updated_clusters.append((label_tracker[update_idx][0], added_articles)) 385 | 386 | # extend first instance with all like labels 387 | label_tracker[update_idx][1].extend(added_articles) 388 | 389 | # rename if not labeled cluster yet 390 | if is_cluster[update_idx] is False: 391 | cluster_count += 1 392 | 393 | unique_cluster_id += 1 394 | is_cluster[update_idx] = True 395 | 396 | # Update embeddings with the mean of all the embeddings in cluster 397 | embeddings_for_this_cluster_label = [embeds[id_idx] for id_idx in indices] 398 | 399 | centroid = np.mean(embeddings_for_this_cluster_label, axis=0) 400 | embeds[update_idx] = centroid.tolist() 401 | 402 | print(f"update_time:\t{time.time() - update_time}") 403 | 404 | # delete indices that were merged 405 | cleanup_time = time.time() 406 | update_label_time = time.time() 407 | 408 | label_tracker = [ 409 | label_tracker[i] for i in range(len(label_tracker)) if i not in to_remove 410 | ] 411 | 412 | is_cluster = [is_cluster[i] for i in range(len(is_cluster)) if i not in to_remove] 413 | print(f"Labeling cleanup\t{time.time() - update_label_time:.2f}") 414 | 415 | embed_cleanup = time.time() 416 | embeds = [e for i, e in enumerate(embeds) if i not in to_remove] 417 | 418 | print(f"embed cleanup\t{time.time() - embed_cleanup:.2f}") 419 | print(f"cleanup_time:\t{time.time() - cleanup_time}") 420 | 421 | # Track times 422 | batch_time = time.time() - batch_time 423 | batch_times.append(batch_time) 424 | print(f"Batch time:\t{batch_time}") 425 | print(f"mean batch time:\t{sum(batch_times)/len(batch_times)}") 426 | 427 | # dont use aggregated variables here, recalculate to double check accuracy 428 | number_of_clusters = len(np.nonzero(is_cluster)[0]) 429 | number_of_singletons = len(np.nonzero(~np.asarray(is_cluster, dtype=bool))[0]) 430 | print(f"Number of clusters\t{number_of_clusters}") 431 | print(f"Number of singletons\t{number_of_singletons}") 432 | 433 | number_of_stories_in_saved = sum([len(samples[1]) for samples in label_tracker]) 434 | print(f"total_stories_clustered\t{number_of_stories_in_saved}") 435 | 436 | new_entries_articles = [ 437 | label_tracker[i] 438 | for i in range(old_size, len(label_tracker)) 439 | if is_cluster[i] is False 440 | ] 441 | 442 | 
total_new_articles = sum([len(a[1]) for a in new_entries_articles]) 443 | print("Total New Articles Actual", total_new_articles) 444 | print("Total New Articles Expected", len(new_entries_articles)) 445 | return new_entries_articles, updated_clusters 446 | 447 | 448 | @timer 449 | def process_messages(records): 450 | formatted_records, associated_articles = format_documents(records) 451 | new_entries_articles, updated_clusters = cluster(formatted_records) 452 | add_items_to_dynamodb(new_entries_articles, updated_clusters, associated_articles) 453 | 454 | 455 | @timer 456 | def delete_messages_in_batches(messages): 457 | # Split messages into batches of 10 for deletion 458 | batch_size = 10 459 | for i in range(0, len(messages), batch_size): 460 | batch = messages[i : i + batch_size] 461 | entries = [ 462 | {"Id": str(index), "ReceiptHandle": msg["ReceiptHandle"]} 463 | for index, msg in enumerate(batch) 464 | ] 465 | sqs.delete_message_batch(QueueUrl=SQS_QUEUE, Entries=entries) 466 | print("Deleted messages from queue") 467 | 468 | 469 | def consume_records(batch_size=20): 470 | global incoming_articles 471 | 472 | # ----------------------------------------------------------------- 473 | # Get the records. 474 | # Get max_records from the shard, or run continuously if you wish. 475 | # ----------------------------------------------------------------- 476 | all_messages = [] 477 | while len(all_messages) < batch_size: 478 | 479 | response = sqs.receive_message( 480 | QueueUrl=SQS_QUEUE, 481 | MaxNumberOfMessages=min(10, int(batch_size - len(all_messages))), 482 | WaitTimeSeconds=0, # Short polling to avoid long waits 483 | ) 484 | 485 | messages = response.get("Messages", []) 486 | if not messages: 487 | # print("The queue is empty.") 488 | break 489 | 490 | all_messages.extend(messages) 491 | if len(all_messages) >= batch_size: 492 | break 493 | 494 | incoming_articles.extend(all_messages) 495 | 496 | 497 | @timer 498 | def checkpoint(): 499 | global label_tracker 500 | global is_cluster 501 | global distance_matrix 502 | global embeds 503 | global incoming_articles 504 | 505 | data_to_serialize = { 506 | "label_tracker": label_tracker, 507 | "is_cluster": is_cluster, 508 | "embeds": embeds, 509 | } 510 | 511 | serialized_data = pickle.dumps(data_to_serialize) 512 | 513 | # Upload the updated data back to S3 as a checkpoint 514 | s3.put_object(Body=serialized_data, Bucket=S3_BUCKET_NAME, Key=S3_FILE_KEY) 515 | print(f"Updated file uploaded successfully to {S3_BUCKET_NAME}/{S3_FILE_KEY}") 516 | 517 | 518 | @timer 519 | def load_from_checkpoint(): 520 | global label_tracker 521 | global is_cluster 522 | global embeds 523 | global distance_matrix 524 | global cluster_count 525 | 526 | try: 527 | # Retrieve the object from S3 528 | s3_response_object = s3.get_object(Bucket=S3_BUCKET_NAME, Key=S3_FILE_KEY) 529 | 530 | # Read the file's content 531 | serialized_data = s3_response_object["Body"].read() 532 | loaded_data = pickle.loads(serialized_data) 533 | 534 | label_tracker = loaded_data["label_tracker"] 535 | is_cluster = loaded_data["is_cluster"] 536 | embeds = loaded_data["embeds"] 537 | # distance_matrix = "" # ToDo ask hector best way to deal with this 538 | distance_matrix = "" if embeds is not None and len(embeds) > 0 else None 539 | 540 | print( 541 | "Successfully loaded from checkpoint, cluster pool size: ", 542 | len(label_tracker), 543 | ) 544 | number_of_clusters = len(np.nonzero(is_cluster)[0]) 545 | number_of_singletons = len(np.nonzero(~np.asarray(is_cluster, dtype=bool))[0]) 
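# Descriptive note (added): recompute cluster/singleton counts directly from the
# restored is_cluster flags (True = cluster, False = singleton); cluster_count is
# re-seeded from this below so the in-memory counter matches the checkpoint.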
546 | print(f"Number of clusters\t{number_of_clusters}") 547 | print(f"Number of singletons\t{number_of_singletons}") 548 | 549 | cluster_count = number_of_clusters 550 | except s3.exceptions.NoSuchKey: 551 | print( 552 | f"No existing checkpoint found at {S3_BUCKET_NAME}/{S3_FILE_KEY}. Starting with new data." 553 | ) 554 | 555 | 556 | if __name__ == "__main__": 557 | 558 | batch_size = 500 559 | checkpoint_rate = 5 # How many batches before checkpointing 560 | batches_processed = 0 561 | number_of_threads = 50 562 | number_of_articles = batch_size / number_of_threads 563 | print("Batch Size", batch_size) 564 | print("Checkpoint Rate", checkpoint_rate) 565 | 566 | load_from_checkpoint() 567 | 568 | print(f"Article queue: {len(incoming_articles)}") 569 | 570 | ### Define number of threads 571 | # articles_received = number_of_threads * batch_size 572 | threads = [ 573 | threading.Thread(target=lambda: consume_records(number_of_articles)) 574 | for _ in range(number_of_threads) 575 | ] 576 | # start all threads 577 | start = time.time() 578 | [t.start() for t in threads] 579 | # collect threads to finish 580 | [t.join() for t in threads] 581 | 582 | print(f"Processed batches: {len(incoming_articles)}") 583 | print(f"total time: {time.time() - start:.2f} seconds") 584 | 585 | # Consumer Server 586 | while True: 587 | threads = [ 588 | threading.Thread(target=lambda: consume_records(number_of_articles)) 589 | for _ in range(number_of_threads) 590 | ] 591 | if batches_processed % checkpoint_rate == 0: 592 | checkpoint_thread = threading.Thread(target=lambda: checkpoint()) 593 | threads.append(checkpoint_thread) 594 | 595 | # start all threads 596 | start = time.time() 597 | [t.start() for t in threads] 598 | 599 | if len(incoming_articles) >= batch_size: # Check we have enough articles 600 | process_messages(incoming_articles) 601 | delete_messages_in_batches(incoming_articles) 602 | 603 | batches_processed += 1 604 | incoming_articles = [] 605 | 606 | [t.join() for t in threads] 607 | print(f"Processed batches: {len(incoming_articles)}") 608 | print(f"TOTAL TIME FOR CLUSTERING BATCH: {time.time() - start:.2f} seconds") 609 | -------------------------------------------------------------------------------- /business_logic/stream_consumer/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | scikit-learn 3 | scipy 4 | numpy 5 | joblib 6 | utils -------------------------------------------------------------------------------- /data/clear_data.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | 4 | def clear_dynamodb_table(table_name): 5 | # Initialize a DynamoDB resource 6 | dynamodb = boto3.resource("dynamodb") 7 | table = dynamodb.Table(table_name) 8 | 9 | # Scan the table for all items (note: this is resource-intensive and not recommended for large tables) 10 | scan = table.scan() 11 | items = scan["Items"] 12 | 13 | # Continue scanning if all items were not returned in the first scan 14 | while "LastEvaluatedKey" in scan: 15 | scan = table.scan(ExclusiveStartKey=scan["LastEvaluatedKey"]) 16 | items.extend(scan["Items"]) 17 | 18 | # Delete items in batches 19 | with table.batch_writer() as batch: 20 | for item in items: 21 | batch.delete_item( 22 | Key={ 23 | "PK": item["PK"], # Primary Key 24 | "SK": item["SK"], # Sort Key, if applicable 25 | } 26 | ) 27 | 28 | print(f"Cleared {len(items)} items from the table {table_name}.") 29 | 30 | 31 | def clear_sqs_queue(queue_name): 32 | 
sqs = boto3.client("sqs") 33 | response = sqs.get_queue_url(QueueName=queue_name) 34 | queue_url = response['QueueUrl'] 35 | response = sqs.receive_message(QueueUrl=queue_url, MaxNumberOfMessages=10) 36 | messages = response.get("Messages", []) 37 | for message in messages: 38 | sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message["ReceiptHandle"]) 39 | print(f"Cleared {len(messages)} messages from the queue {queue_name}.") 40 | 41 | 42 | def remove_s3_objects(bucket_name): 43 | s3 = boto3.client("s3") 44 | s3.delete_object(Bucket=bucket_name, Key="checkpoint.pkl") 45 | print(f"Deleted the 'checkpoint.pkl' object from the bucket {bucket_name}.") 46 | 47 | def terminate_ec2_instance(instance_name): 48 | ec2 = boto3.client("ec2") 49 | response = ec2.describe_instances(Filters=[ 50 | {'Name': 'tag:Name', 'Values': [instance_name]}, 51 | {'Name': 'instance-state-name', 'Values': ['running']} 52 | ]) 53 | if response['Reservations']: 54 | instance_id = response['Reservations'][0]['Instances'][0]['InstanceId'] 55 | ec2.terminate_instances(InstanceIds=[instance_id]) 56 | print(f"Terminated the running EC2 instance {instance_name}.") 57 | else: 58 | print(f"No running EC2 instance found with the name {instance_name}.") 59 | 60 | 61 | # Clean DynamoDB table 62 | table_name = "cluster-table-clustering-demo2" 63 | clear_dynamodb_table(table_name) 64 | 65 | # Clean SQS queue 66 | queue_name = "clustering-demo2-queue" 67 | clear_sqs_queue(queue_name) 68 | 69 | # Clean S3 bucket, need to find the bucket name dynamically starting with "code-bucket-clustering-demo" 70 | def get_s3_bucket_name(prefix): 71 | s3 = boto3.client('s3') 72 | response = s3.list_buckets() 73 | for bucket in response['Buckets']: 74 | if bucket['Name'].startswith(prefix): 75 | print(f"Found S3 bucket: {bucket['Name']}") 76 | return bucket['Name'] 77 | return None 78 | 79 | bucket_prefix = "code-bucket-clustering-demo" 80 | bucket_name = get_s3_bucket_name(bucket_prefix) 81 | if bucket_name: 82 | remove_s3_objects(bucket_name) 83 | else: 84 | print(f"No S3 bucket found with prefix: {bucket_prefix}") 85 | 86 | # Terminate EC2 instance 87 | instance_name = "stream-consumer-instance-clustering-demo2" 88 | terminate_ec2_instance(instance_name) 89 | -------------------------------------------------------------------------------- /data/download_public_data.sh: -------------------------------------------------------------------------------- 1 | mkdir public_data 2 | wget -P public_data ftp://"ftp.priberam.pt|anonymous"@ftp.priberam.pt/SUMMAPublic/Corpora/Clustering/2018.0/dataset/dataset.dev.json -------------------------------------------------------------------------------- /data/example_article.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "12345", 3 | "text": "(FAKE) In a surprising turn of events, recent studies have shown a significant surge in penguin populations across several species, bringing a glimmer of hope amidst ongoing climate change concerns. This unexpected rise has been observed in regions spanning from Antarctica to the coasts of South America and Africa. Marine biologists attribute this population increase to several factors. Firstly, conservation efforts have played a crucial role. Strict regulations on fishing in certain areas have reduced competition for food, allowing penguin populations to recover. 
Marine protected areas (MPAs) have provided safe havens where penguins can breed and forage without human interference.", 4 | "title": "Surge in Penguin Populations Brings Hope Amidst Climate Change Concerns", 5 | "event_id": "1234", 6 | "duplicate": false, 7 | "lang": "en", 8 | "bag_id": "9e1f2c6b-4b27-4d5f-91d4-3e0aafae1987-2325", 9 | "date": "2024-06-06 16:09:00", 10 | "source": "naturewatchnews.com", 11 | "cluster": "444" 12 | } 13 | -------------------------------------------------------------------------------- /data/put_records.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | import time 4 | import random 5 | import string 6 | from tqdm import tqdm 7 | 8 | STREAM_NAME = "input-stream-clustering-demo2" 9 | PARTITION_KEY = "a" 10 | JSON_FILE_PATH = "./public_data/dataset.dev.json" # Path to the single JSON file 11 | COUNT = 1200000 12 | BATCH_SIZE = 5 13 | 14 | # Create a Kinesis client 15 | kinesis = boto3.client("kinesis") 16 | 17 | 18 | # Helper function to generate a random partition key 19 | def generate_partition_key(): 20 | return "".join(random.choices(string.ascii_letters + string.digits, k=16)) 21 | 22 | 23 | # Helper function to check record size does not exceed 1MB 24 | def is_record_size_valid(record): 25 | return len(record) < 1024 * 1024 # less than 1MB 26 | 27 | 28 | # Helper function to check batch size does not exceed 5MB 29 | def is_batch_size_valid(batch): 30 | return ( 31 | sum(len(record["Data"]) for record in batch) < 5 * 1024 * 1024 32 | ) # less than 5MB 33 | 34 | 35 | # Read the JSON data from the file 36 | with open(JSON_FILE_PATH, "r") as f: 37 | data_list = json.load(f) 38 | 39 | # Iterate through the JSON data in batches 40 | for batch_index in tqdm(range(0, min(COUNT, len(data_list)), BATCH_SIZE)): 41 | batch_list = data_list[batch_index : batch_index + BATCH_SIZE] 42 | data_json = json.dumps(batch_list) 43 | 44 | # Check if the individual record size is valid 45 | if not is_record_size_valid(data_json): 46 | print( 47 | f"Batch starting at index {batch_index} exceeds the maximum allowed size of 1MB." 48 | ) 49 | continue # Skip this batch 50 | 51 | # Create a record to put to Kinesis 52 | record = { 53 | "Data": data_json, 54 | "PartitionKey": generate_partition_key(), 55 | } 56 | 57 | records_to_put = [record] 58 | 59 | # Add the record to the batch if it doesn't exceed the batch size 60 | if is_batch_size_valid(records_to_put): 61 | # Delay for 0.2 seconds 62 | time.sleep(0.2) 63 | 64 | # Create the PutRecords request 65 | put_records_request = { 66 | "Records": records_to_put, 67 | "StreamName": STREAM_NAME, 68 | } 69 | 70 | # Put the records to Kinesis 71 | response = kinesis.put_records(**put_records_request) 72 | 73 | # Check for any failed records 74 | failed_records = response.get("Records", []) 75 | for record in failed_records: 76 | if "ErrorCode" in record: 77 | print( 78 | f"Error: {record['ErrorCode']}, Message: {record['ErrorMessage']}" 79 | ) 80 | else: 81 | print( 82 | f"Batch starting at index {batch_index} would exceed the batch size limit of 5MB." 
83 | ) 84 | -------------------------------------------------------------------------------- /data/script.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("public_data/dataset.test.json", "r") as f: 4 | data = json.load(f) 5 | print(len(data)) 6 | -------------------------------------------------------------------------------- /data/send_articles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python put_records.py 4 | 5 | 6 | # aws kinesis put-record --stream-name "$STREAM_NAME" --data file://"$FILE_PATH" --partition-key "$PARTITION_KEY" 7 | 8 | # aws kinesis put-record --stream-name "$STREAM_NAME" --data file://"$FILE_PATH" --partition-key "id" 9 | 10 | # aws kinesis put-records \ 11 | # --stream-name "$STREAM_NAME" \ 12 | # --records file://"$FILE_PATH" 13 | -------------------------------------------------------------------------------- /front_end/Dockerfile: -------------------------------------------------------------------------------- 1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images 2 | #checkov:skip=CKV_DOCKER_3: Ensure that a user for the container has been created 3 | 4 | # Use a multi-stage build 5 | FROM node:18-alpine3.19 as builder 6 | 7 | # Set the working directory 8 | WORKDIR /app 9 | 10 | # Copy package.json and package-lock.json 11 | COPY package.json package-lock.json ./ 12 | 13 | # Install dependencies 14 | RUN npm ci 15 | 16 | # Copy the application code 17 | COPY . . 18 | 19 | # Build the application 20 | RUN npm run build 21 | 22 | # Create the production image 23 | FROM nginx:1.23-alpine 24 | 25 | # Install OpenSSL to generate a self-signed certificate 26 | RUN apk add --no-cache openssl 27 | 28 | # Create a directory for SSL certificates 29 | RUN mkdir -p /etc/nginx/ssl 30 | 31 | # Generate a self-signed SSL certificate 32 | RUN openssl req \ 33 | -x509 \ 34 | -nodes \ 35 | -days 365 \ 36 | -newkey rsa:2048 \ 37 | -keyout /etc/nginx/ssl/nginx-selfsigned.key \ 38 | -out /etc/nginx/ssl/nginx-selfsigned.crt \ 39 | -subj "/C=US/ST=State/L=City/O=Company/OU=Department/CN=localhost" 40 | 41 | # Remove the default NGINX config and replace with custom config 42 | RUN rm -rf /etc/nginx/conf.d/default.conf 43 | COPY nginx.conf /etc/nginx/conf.d/ 44 | 45 | # Set the working directory 46 | WORKDIR /usr/share/nginx/html 47 | 48 | # Copy the built assets from the builder stage 49 | COPY --from=builder /app/build . 50 | 51 | # Expose the port 52 | EXPOSE 443 53 | 54 | # Start Nginx 55 | CMD ["nginx", "-g", "daemon off;"] 56 | -------------------------------------------------------------------------------- /front_end/README.md: -------------------------------------------------------------------------------- 1 | # React Application Setup and Deployment 2 | 3 | This guide covers the setup of a React application using Amazon Cognito for authentication, and deployment options using Amazon S3 with CloudFront or a containerized approach with Nginx and a Load Balancer. 4 | 5 | ## Prerequisites 6 | 7 | - AWS Account 8 | - Node.js installed 9 | - NPM or Yarn installed 10 | - AWS CLI installed and configured 11 | 12 | ## Setup 13 | 14 | ### Step 1: Create a Cognito User Pool 15 | 16 | 1. Go to the Amazon Cognito Console. 17 | 2. Click **Manage User Pools** and then **Create a user pool**. 18 | 3. Name your user pool and click **Review defaults**. 19 | 4. Click **Create pool**. 20 | 5. 
Note the **Pool Id** and **Pool ARN**.
21 | 
22 | ### Step 2: Create a Cognito Identity Pool
23 | 
24 | 1. Go back to the main Cognito console and select **Manage Identity Pools**.
25 | 2. Click **Create new identity pool**.
26 | 3. Give your identity pool a name, and check **Enable access to unauthenticated identities** if required.
27 | 4. Under **Authentication providers**, in the **Cognito** tab, enter your User Pool ID and App client ID.
28 | 5. Click **Create Pool**.
29 | 6. On the next screen, you will be prompted to set up IAM roles for your identity pool. AWS can create default roles for you, or you can choose to edit these roles. It is critical to attach the appropriate permissions to these roles depending on what AWS resources your application will access.
30 | 
31 | #### Configuring IAM Roles
32 | 
33 | After the Identity Pool is created, AWS assigns two roles: one for authenticated users and another for unauthenticated users (if enabled). To allow authenticated users to access DynamoDB resources, you must attach a policy with the necessary permissions to the authenticated role.
34 | 
35 | 1. Go to the IAM console.
36 | 2. Find the role associated with your Cognito Identity Pool for authenticated users.
37 | 3. Click **Attach policies** and then **Create policy**.
38 | 4. In the policy editor, paste the following JSON. This policy allows actions on the DynamoDB table used by your application:
39 | 
40 | ```json
41 | {
42 |   "Version": "2012-10-17",
43 |   "Statement": [
44 |     {
45 |       "Sid": "VisualEditor0",
46 |       "Effect": "Allow",
47 |       "Action": [
48 |         "dynamodb:Scan"
49 |       ],
50 |       "Resource": "arn:aws:dynamodb:us-east-1:<AWS-ACCOUNT-ID>:table/cluster-table-clustering-demo"
51 |     }
52 |   ]
53 | }
54 | ```
55 | 
56 | Be sure to replace `<AWS-ACCOUNT-ID>` with your actual AWS account ID.
57 | 
58 | 5. Click **Review policy**, give your policy a name, and then click **Create policy**.
59 | 6. Attach the newly created policy to the IAM role for authenticated users.
60 | 
61 | This setup ensures that your application has the necessary permissions to interact with the specified DynamoDB table, following the principle of least privilege by granting only the permissions needed.
62 | 
63 | 
64 | ### Step 3: Configuration File
65 | 
66 | 1. Create a file named `aws-exports.js` in your React app's `src` directory.
67 | 2. Add the following configuration:
68 | 
69 | ```javascript
70 | const awsConfig = {
71 |   aws_project_region: 'AWS_REGION', // AWS region of Cognito
72 |   aws_cognito_region: 'AWS_REGION', // AWS region of Cognito
73 |   aws_cognito_identity_pool_id: 'AWS_COGNITO_IDENTITY_POOL', // Identity pool ID
74 |   aws_user_pools_id: 'AWS_COGNITO_USER_POOL_ID', // User Pool ID
75 |   aws_user_pools_web_client_id: 'AWS_COGNITO_USER_POOL_APP_CLIENT_ID', // App client ID
76 |   federationTarget: "COGNITO_USER_POOLS" // keep as "COGNITO_USER_POOLS"
77 | };
78 | 
79 | export default awsConfig;
80 | ```
81 | 3. Make sure all the fields above are properly filled. If you're using Terraform to deploy the tool, generate this file dynamically from your Terraform outputs (see the sketch after Step 4).
82 | 
83 | 
84 | ### Step 4: Build the React Application
85 | 
86 | 1. Navigate to your project directory.
87 | 2. Run `npm install` to install all required dependencies.
88 | 3. Build your React application by running:
89 | ```bash
90 | npm run build
91 | ```
92 | 4. This command creates a `build` directory containing your static files (HTML, CSS, JS).
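
If you provision the backend with Terraform (see the note in Step 3), one convenient approach is to generate `aws-exports.js` from your Terraform outputs before running the build. The script below is only a minimal sketch and is not part of this repository; the output names (`aws_region`, `cognito_identity_pool_id`, `cognito_user_pool_id`, `cognito_app_client_id`) and the Terraform working directory are assumptions you would adapt to your own `outputs.tf`.

```python
# generate_aws_exports.py -- hypothetical helper, not part of this repository.
import json
import subprocess
from pathlib import Path

TF_DIR = "iac/roots/main"  # assumption: the Terraform root used for this demo
TARGET = Path("front_end/src/aws-exports.js")

# `terraform output -json` returns {"name": {"value": ..., "type": ...}, ...}
raw = subprocess.run(
    ["terraform", "output", "-json"],
    cwd=TF_DIR,
    capture_output=True,
    text=True,
    check=True,
).stdout
outputs = {name: meta["value"] for name, meta in json.loads(raw).items()}

# Assumed output names -- rename these to match your outputs.tf
region = outputs["aws_region"]
identity_pool_id = outputs["cognito_identity_pool_id"]
user_pool_id = outputs["cognito_user_pool_id"]
app_client_id = outputs["cognito_app_client_id"]

TARGET.write_text(
    f"""const awsConfig = {{
  aws_project_region: '{region}',
  aws_cognito_region: '{region}',
  aws_cognito_identity_pool_id: '{identity_pool_id}',
  aws_user_pools_id: '{user_pool_id}',
  aws_user_pools_web_client_id: '{app_client_id}',
  federationTarget: "COGNITO_USER_POOLS"
}};

export default awsConfig;
"""
)
print(f"Wrote {TARGET}")
```

Run it from the repository root, then continue with `npm run build`.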
93 | 
94 | ## Running the Application Locally
95 | 
96 | Before deploying your React application, it is crucial to ensure everything functions correctly in a local development environment. Follow these steps to run your application locally:
97 | 
98 | ### Prerequisites for Running Locally
99 | 
100 | 1. **Configure aws-exports.js:**
101 |    - Ensure that you have created `aws-exports.js` in the `src` directory of your project. This file should include all necessary configurations for Amazon Cognito:
102 | ```javascript
103 | const awsConfig = {
104 |   aws_project_region: 'AWS_REGION', // AWS region of Cognito
105 |   aws_cognito_region: 'AWS_REGION', // AWS region of Cognito
106 |   aws_cognito_identity_pool_id: 'AWS_COGNITO_IDENTITY_POOL', // Identity pool ID
107 |   aws_user_pools_id: 'AWS_COGNITO_USER_POOL_ID', // User Pool ID
108 |   aws_user_pools_web_client_id: 'AWS_COGNITO_USER_POOL_APP_CLIENT_ID', // App client ID
109 |   federationTarget: "COGNITO_USER_POOLS" // keep as "COGNITO_USER_POOLS"
110 | };
111 | export default awsConfig;
112 | ```
113 |    - Replace the `AWS_REGION`, `AWS_COGNITO_IDENTITY_POOL`, `AWS_COGNITO_USER_POOL_ID`, and `AWS_COGNITO_USER_POOL_APP_CLIENT_ID` placeholders with the actual values from your Cognito setup.
114 | 
115 | 2. **Install Project Dependencies:**
116 |    - Open a terminal and navigate to your project directory.
117 |    - Install all necessary dependencies by running:
118 | ```bash
119 | npm install
120 | ```
121 | 
122 | 3. **Start the React Application:**
123 |    - Run the following command to start your React application:
124 | ```bash
125 | npm start
126 | ```
127 |    - This will compile the application and start a development server.
128 | 
129 | 4. **Access the Application:**
130 |    - Open a web browser and navigate to [http://localhost:3000](http://localhost:3000).
131 |    - You should see your React application running locally. Make sure to test all functionalities, especially those interacting with AWS services, to ensure everything is working as expected.
132 | 
133 | By following these steps, you can run and test your React application locally before moving on to deploy it in a production environment. This local setup is crucial for development and debugging purposes.
134 | 
135 | 
136 | ## Deployment Options
137 | 
138 | ### Option 1: Deploy to Amazon S3 with CloudFront using Origin Access Identity (OAI)
139 | 
140 | This method utilizes an Origin Access Identity (OAI) to securely serve your React application's static files from an S3 bucket via CloudFront, without the bucket being publicly accessible.
141 | 
142 | 1. **Create an S3 Bucket:**
143 |    - Navigate to the Amazon S3 service within the AWS Management Console and create a new bucket:
144 | ```bash
145 | aws s3 mb s3://your-bucket-name --region your-region
146 | ```
147 |    - Replace `your-bucket-name` and `your-region` with your specific details.
148 |    - Do not enable public access; keep the default settings, which block all public access.
149 | 
150 | 2. **Upload the Build Directory to S3:**
151 |    - Upload your React application's `build` directory to the S3 bucket using the AWS CLI:
152 | ```bash
153 | aws s3 sync build/ s3://your-bucket-name/
154 | ```
155 | 
156 | 3. **Create an Origin Access Identity (OAI):**
157 |    - Navigate to the CloudFront service in the AWS Management Console.
158 |    - Go to the **Security** section, then click on **Origin Access Identity**.
159 |    - Click **Create Origin Access Identity**.
160 |    - Provide a comment to describe the OAI (e.g., "OAI for React App"), then create it.
161 | 
162 | 4. **Configure S3 Bucket Permissions:**
163 |    - Go to your S3 bucket in the AWS Management Console.
164 |    - Under the **Permissions** tab, click on **Bucket Policy**.
165 |    - Use the following policy, replacing `your-oai-id` and `your-bucket-name` with your specific OAI ID and bucket name:
166 | ```json
167 | {
168 |   "Version": "2012-10-17",
169 |   "Statement": [
170 |     {
171 |       "Effect": "Allow",
172 |       "Principal": {
173 |         "AWS": "arn:aws:iam::cloudfront:user/CloudFront Origin Access Identity your-oai-id"
174 |       },
175 |       "Action": "s3:GetObject",
176 |       "Resource": "arn:aws:s3:::your-bucket-name/*"
177 |     }
178 |   ]
179 | }
180 | ```
181 | 
182 | 5. **Create a CloudFront Distribution:**
183 |    - Go back to the CloudFront console and create a new distribution.
184 |    - For the origin source, select your S3 bucket.
185 |    - Enter the Origin Access Identity you just created.
186 |    - Set the origin to use HTTPS only.
187 |    - Set the Viewer Protocol Policy to "Redirect HTTP to HTTPS" for security.
188 |    - Optionally, specify your index document under the Default Root Object, such as `index.html`.
189 |    - Create the distribution.
190 |    - Note the distribution's domain name provided by CloudFront.
191 | 
192 | 6. **Update DNS Records:**
193 |    - If you have a domain name, update your DNS settings to create a CNAME record that points to your CloudFront distribution's domain name.
194 | 
195 | ### Option 2: Containerize with Nginx and Deploy Using a Load Balancer
196 | 
197 | - Create a Docker image based on Nginx (see the provided `front_end/Dockerfile`)
198 | - Host the static files from the React `build` folder
199 | - Expose port 443
200 | - Create an Application Load Balancer (ALB) in front of the container
201 | - ACM is used to store the certificate for your load balancer. For demonstration purposes, we are utilizing a self-signed certificate stored in ACM. However, for production applications, it is recommended to obtain a certificate from a trusted Certificate Authority (CA), which can be either external or internal.
202 | 
203 | ## Package Considerations
204 | 
205 | We leverage the AWS Amplify package for the frontend, which has certain dependencies that will trigger an NPM audit. Either update to a newer version or leverage a different frontend library to avoid the following:
206 | ```
207 | Dependency: fast-xml-parser Version: 4.2.5 (npm)
208 | Dependency: nth-check Version: 1.0.2 (npm)
209 | Dependency: fast-xml-parser Version: 4.3.6 (npm)
210 | Dependency: webpack Version: 5.91.0 (npm)
211 | Dependency: postcss Version: 7.0.39 (npm)
212 | Dependency: braces Version: 3.0.2 (npm)
213 | ```
214 | 
215 | ## Conclusion
216 | 
217 | These steps guide you through deploying your React application using AWS Cognito for authentication. Choose between a secure, serverless deployment using Amazon S3 with CloudFront or a containerized approach using Nginx for traditional server-based hosting. One way to script the S3 upload and CloudFront cache invalidation from Option 1 is sketched below.
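
As mentioned in the Conclusion, the following is a minimal sketch of how you might script the upload from step 2 of Option 1 together with a CloudFront cache invalidation using `boto3`. It is not part of this repository: the bucket name and distribution ID are placeholders, and the build path assumes you run it from the repository root.

```python
# deploy_frontend.py -- hypothetical helper, not part of this repository.
import mimetypes
import time
from pathlib import Path

import boto3

BUCKET = "your-bucket-name"          # placeholder: the S3 bucket from step 1
DISTRIBUTION_ID = "EXXXXXXXXXXXXX"   # placeholder: your CloudFront distribution
BUILD_DIR = Path("front_end/build")  # output of `npm run build`

s3 = boto3.client("s3")
cloudfront = boto3.client("cloudfront")

# Upload every file in the build output, preserving relative paths as object
# keys and setting a Content-Type so browsers render the assets correctly.
for path in BUILD_DIR.rglob("*"):
    if path.is_file():
        key = path.relative_to(BUILD_DIR).as_posix()
        content_type = mimetypes.guess_type(path.name)[0] or "binary/octet-stream"
        s3.upload_file(str(path), BUCKET, key, ExtraArgs={"ContentType": content_type})
        print(f"Uploaded {key}")

# Invalidate cached objects so CloudFront serves the new build immediately.
cloudfront.create_invalidation(
    DistributionId=DISTRIBUTION_ID,
    InvalidationBatch={
        "Paths": {"Quantity": 1, "Items": ["/*"]},
        "CallerReference": str(time.time()),
    },
)
print("Created CloudFront invalidation for /*")
```

Invalidating `/*` keeps the example simple; for frequent deployments you may prefer hashed asset filenames and a narrower invalidation to reduce cost.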
218 | 219 | -------------------------------------------------------------------------------- /front_end/nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 443 ssl; 3 | server_name localhost; 4 | 5 | ssl_certificate /etc/nginx/ssl/nginx-selfsigned.crt; 6 | ssl_certificate_key /etc/nginx/ssl/nginx-selfsigned.key; 7 | 8 | ssl_protocols TLSv1.2 TLSv1.3; 9 | ssl_ciphers HIGH:!aNULL:!MD5; 10 | 11 | root /usr/share/nginx/html; 12 | index index.html; 13 | 14 | location / { 15 | try_files $uri $uri/ =404; 16 | } 17 | } -------------------------------------------------------------------------------- /front_end/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "news-cluster-ui", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "@aws-amplify/ui-react": "6.1.9", 7 | "@cloudscape-design/components": "3.0.625", 8 | "@cloudscape-design/global-styles": "1.0.27", 9 | "@testing-library/jest-dom": "5.17.0", 10 | "@testing-library/react": "13.4.0", 11 | "@testing-library/user-event": "13.5.0", 12 | "amazon-cognito-identity-js": "6.3.12", 13 | "aws-amplify": "6.6.3", 14 | "aws-sdk": "2.1613.0", 15 | "dotenv": "16.4.5", 16 | "react": "^18.3.1", 17 | "react-dom": "^18.3.1", 18 | "react-scripts": "5.0.1", 19 | "web-vitals": "2.1.4" 20 | }, 21 | "scripts": { 22 | "start": "react-scripts start", 23 | "build": "react-scripts build", 24 | "test": "react-scripts test", 25 | "eject": "react-scripts eject" 26 | }, 27 | "eslintConfig": { 28 | "extends": [ 29 | "react-app", 30 | "react-app/jest" 31 | ] 32 | }, 33 | "browserslist": { 34 | "production": [ 35 | ">0.2%", 36 | "not dead", 37 | "not op_mini all" 38 | ], 39 | "development": [ 40 | "last 1 chrome version", 41 | "last 1 firefox version", 42 | "last 1 safari version" 43 | ] 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /front_end/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/front_end/public/favicon.ico -------------------------------------------------------------------------------- /front_end/public/index.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html lang="en"> 3 | <head> 4 | <meta charset="utf-8" /> 5 | <link rel="icon" href="%PUBLIC_URL%/favicon.ico" /> 6 | <meta name="viewport" content="width=device-width, initial-scale=1" /> 7 | <meta name="theme-color" content="#000000" /> 8 | <meta 9 | name="description" 10 | content="Web site created using create-react-app" 11 | /> 12 | <link rel="apple-touch-icon" href="%PUBLIC_URL%/logo192.png" /> 13 | <!-- 14 | manifest.json provides metadata used when your web app is installed on a 15 | user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/ 16 | --> 17 | <link rel="manifest" href="%PUBLIC_URL%/manifest.json" /> 18 | <!-- 19 | Notice the use of %PUBLIC_URL% in the tags above. 20 | It will be replaced with the URL of the `public` folder during the build. 21 | Only files inside the `public` folder can be referenced from the HTML. 22 | 23 | Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will 24 | work correctly both with client-side routing and a non-root public URL. 
25 | Learn how to configure a non-root public URL by running `npm run build`. 26 | --> 27 | <title>News Clustering and Summarization Demo 28 | 29 | 30 | 31 |
32 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /front_end/public/logo192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/front_end/public/logo192.png -------------------------------------------------------------------------------- /front_end/public/logo512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/front_end/public/logo512.png -------------------------------------------------------------------------------- /front_end/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /front_end/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /front_end/src/App.css: -------------------------------------------------------------------------------- 1 | .App { 2 | text-align: center; 3 | } 4 | 5 | .App-logo { 6 | height: 40vmin; 7 | pointer-events: none; 8 | } 9 | 10 | @media (prefers-reduced-motion: no-preference) { 11 | .App-logo { 12 | animation: App-logo-spin infinite 20s linear; 13 | } 14 | } 15 | 16 | .App-header { 17 | background-color: #282c34; 18 | min-height: 100vh; 19 | display: flex; 20 | flex-direction: column; 21 | align-items: center; 22 | justify-content: center; 23 | font-size: calc(10px + 2vmin); 24 | color: white; 25 | } 26 | 27 | .App-link { 28 | color: #61dafb; 29 | } 30 | 31 | @keyframes App-logo-spin { 32 | from { 33 | transform: rotate(0deg); 34 | } 35 | to { 36 | transform: rotate(360deg); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /front_end/src/App.js: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from 'react'; 2 | import './App.css'; 3 | import ClusterList from './components/ClusterList'; 4 | import { AppLayout, Toggle, Box, SpaceBetween, Header, Icon, Button } from '@cloudscape-design/components'; 5 | import { applyMode, Mode } from '@cloudscape-design/global-styles'; 6 | import awsConfig from './aws-exports'; // Path to your aws-exports file 7 | import { Amplify } from 'aws-amplify'; 8 | import { Authenticator, View, Image, Heading, components } from '@aws-amplify/ui-react'; 9 | import '@aws-amplify/ui-react/styles.css'; 10 | 11 | 12 | 13 | 14 | console.log('Configuring Amplify with:', awsConfig); 15 | Amplify.configure(awsConfig); 16 | // console.log('Amplify configuration:', Amplify.configure()); 17 | 18 | function App() { 19 | 
const [darkMode, setDarkMode] = useState(false); 20 | 21 | const toggleDarkMode = () => { 22 | setDarkMode(!darkMode); 23 | }; 24 | 25 | applyMode(darkMode ? Mode.Dark : Mode.Light); 26 | 27 | 28 | // Custom Header for the Sign In page 29 | const SignInHeader = () => { 30 | return ( 31 | 32 | App Logo 37 | Near Real Time News Clustering and Summarization Demo 38 | 39 | ); 40 | }; 41 | 42 | 43 | return ( 44 | 51 | {({ signOut }) => ( 52 |
53 | 57 | 58 |
59 | } 60 | navigationHide 61 | tools={ 62 | 63 |
}>Settings
64 |

65 | 66 | Dark Mode 67 | 72 | 73 |

74 | 75 | 76 |
77 | } 78 | /> 79 | 80 | )} 81 | 82 |
83 | 84 | 85 | 86 | ); 87 | } 88 | 89 | export default App; 90 | -------------------------------------------------------------------------------- /front_end/src/components/ClusterList.js: -------------------------------------------------------------------------------- 1 | // src/components/ClusterList.js 2 | import React, { useState, useEffect, useRef } from "react"; 3 | import AWS from "aws-sdk"; 4 | import { 5 | Button, 6 | Table, 7 | Box, 8 | ProgressBar, 9 | SpaceBetween, 10 | } from "@cloudscape-design/components"; 11 | import { fetchAuthSession } from "@aws-amplify/auth"; 12 | import ClusterModal from "./ClusterModal"; 13 | import awsConfig from "../aws-exports"; 14 | 15 | const refreshInterval = 5000; 16 | 17 | const ClusterList = () => { 18 | const [clusters, setClusters] = useState([]); 19 | const [selectedCluster, setSelectedCluster] = useState(null); 20 | const [totalArticles, setTotalArticles] = useState(0); 21 | const [isModalVisible, setModalVisible] = useState(false); 22 | const [progress, setProgress] = useState(0); // Initialize progress at 0% 23 | const [secondsRemaining, setSecondsRemaining] = useState( 24 | refreshInterval / 1000 25 | ); // Initialize countdown 26 | 27 | const dynamoDbRef = useRef(); 28 | 29 | useEffect(() => { 30 | const configureAWS = async () => { 31 | const session = await fetchAuthSession(); 32 | const { accessKeyId, secretAccessKey, sessionToken } = 33 | session.credentials; 34 | AWS.config.update({ 35 | region: awsConfig.aws_cognito_region, 36 | credentials: new AWS.Credentials( 37 | accessKeyId, 38 | secretAccessKey, 39 | sessionToken 40 | ), 41 | }); 42 | dynamoDbRef.current = new AWS.DynamoDB.DocumentClient(); 43 | fetchClusters(); 44 | }; 45 | configureAWS(); 46 | }, []); 47 | 48 | useEffect(() => { 49 | const intervalId = setInterval(() => { 50 | fetchClusters(); 51 | }, refreshInterval); 52 | 53 | const progressId = setInterval(() => { 54 | setProgress( 55 | (prevProgress) => (prevProgress + (1000 / refreshInterval) * 100) % 100 56 | ); 57 | setSecondsRemaining((prevSeconds) => 58 | prevSeconds <= 1 ? 
refreshInterval / 1000 : prevSeconds - 1 59 | ); 60 | }, 1000); 61 | 62 | return () => { 63 | clearInterval(intervalId); 64 | clearInterval(progressId); 65 | }; 66 | }, []); 67 | 68 | const fetchClusters = async () => { 69 | if (!dynamoDbRef.current) { 70 | console.log("DynamoDB client not initialized"); 71 | return; 72 | } 73 | let lastEvaluatedKey = null; 74 | const allItems = []; 75 | let articlesCount = 0; 76 | const params = { 77 | TableName: "cluster-table-clustering-demo2", 78 | }; 79 | 80 | do { 81 | if (lastEvaluatedKey) { 82 | params.ExclusiveStartKey = lastEvaluatedKey; 83 | } 84 | const data = await dynamoDbRef.current.scan(params).promise(); 85 | allItems.push(...data.Items); 86 | lastEvaluatedKey = data.LastEvaluatedKey; 87 | } while (lastEvaluatedKey); 88 | 89 | const articlesByCluster = allItems.reduce((acc, item) => { 90 | if (item.is_cluster) { 91 | acc[item.PK] = acc[item.PK] || []; 92 | } else if (item.SK.startsWith("ARTICLE#")) { 93 | if (item.publication_date) { 94 | articlesCount++; 95 | if (acc[item.PK]) { 96 | acc[item.PK].push(item); 97 | } 98 | } 99 | } 100 | return acc; 101 | }, {}); 102 | 103 | const newClusters = allItems 104 | .filter( 105 | (item) => 106 | item.is_cluster && 107 | item.generated_summary && 108 | articlesByCluster[item.PK] && 109 | articlesByCluster[item.PK].length > 2 110 | ) 111 | .map((cluster) => ({ 112 | ...cluster, 113 | articles: articlesByCluster[cluster.PK], 114 | number_of_articles: articlesByCluster[cluster.PK].length, 115 | })) 116 | .sort((a, b) => b.number_of_articles - a.number_of_articles); 117 | 118 | setClusters(newClusters); 119 | setTotalArticles(articlesCount); 120 | }; 121 | 122 | const handleViewArticles = (cluster) => { 123 | console.log("Opening modal for cluster:", cluster.PK); 124 | setSelectedCluster(cluster); 125 | setModalVisible(true); // Set the modal to be visible 126 | }; 127 | 128 | const wrapStyleSummary = { 129 | whiteSpace: "normal", // Allow the text to wrap to the next line 130 | wordBreak: "break-word", // Ensure words break correctly at the end of the line 131 | maxWidth: "600px", // Set a maximum width for the cell content 132 | textAlign: "justify", // Center the text 133 | }; 134 | 135 | const wrapStyleTitle = { 136 | whiteSpace: "normal", // Allow the text to wrap to the next line 137 | wordBreak: "break-word", // Ensure words break correctly at the end of the line 138 | maxWidth: "150px", // Set a maximum width for the cell content 139 | textAlign: "center", 140 | }; 141 | 142 | const wrapStyleNumberOfArticles = { 143 | whiteSpace: "normal", // Allow the text to wrap to the next line 144 | wordBreak: "break-word", // Ensure words break correctly at the end of the line 145 | maxWidth: "100px", // Set a maximum width for the cell content 146 | textAlign: "center", 147 | }; 148 | 149 | // Column definitions using inline styles 150 | const columnDefinitions = [ 151 | { 152 | header: "Title", 153 | cell: (item) =>
{item.description}
, 154 | }, 155 | { 156 | header: "Summary", 157 | cell: (item) => ( 158 |
{item.generated_summary}
159 | ), 160 | }, 161 | { 162 | header: "Articles", 163 | cell: (item) => ( 164 |
{item.number_of_articles}
165 | ), 166 | }, 167 | { 168 | header: "View", 169 | cell: (item) => ( 170 | 171 | ), 172 | }, 173 | ]; 174 | 175 | return ( 176 | 177 | 178 |

179 | {" "} 180 | Near Real Time News Clustering and Summarization Demo 181 |

182 | 183 | Total Clusters: {clusters.length} | Total Articles: {totalArticles} 184 | 185 |
186 | 190 |
191 | 192 | 197 | {selectedCluster && ( 198 | { 202 | setSelectedCluster(null); 203 | setModalVisible(false); // Hide the modal when closed 204 | }} 205 | visible={isModalVisible} // Control visibility with state 206 | /> 207 | )} 208 | 209 | 210 | ); 211 | }; 212 | 213 | export default ClusterList; 214 | -------------------------------------------------------------------------------- /front_end/src/components/ClusterModal.js: -------------------------------------------------------------------------------- 1 | // src/components/ClusterModal.js 2 | import { Modal, Button } from "@cloudscape-design/components"; 3 | import React, { useState } from "react"; 4 | 5 | const ClusterModal = ({ cluster, articles, onClose, visible }) => { 6 | // State to manage visibility of each article's full text 7 | const [visibleArticles, setVisibleArticles] = useState({}); 8 | 9 | // Function to toggle article text visibility 10 | const toggleArticleVisibility = (id) => { 11 | setVisibleArticles((prev) => ({ ...prev, [id]: !prev[id] })); 12 | }; 13 | 14 | // Helper function to format date 15 | const formatDate = (dateString) => { 16 | const date = new Date(dateString); 17 | return date.toLocaleDateString("en-US", { 18 | year: "numeric", 19 | month: "long", 20 | day: "numeric", 21 | hour: "2-digit", 22 | minute: "2-digit", 23 | }); 24 | }; 25 | 26 | return ( 27 | Close} 31 | visible={visible} 32 | size="large" 33 | > 34 | {articles && articles.length > 0 ? ( 35 | articles.map((article) => ( 36 |
37 |

{article.title}

38 | {formatDate(article.publication_date)} 39 |

{article.summary}

40 | 45 | {visibleArticles[article.SK] &&

{article.text}

} 46 |
47 |
48 | )) 49 | ) : ( 50 |

No articles found.

51 | )} 52 |
53 | ); 54 | }; 55 | 56 | export default ClusterModal; 57 | -------------------------------------------------------------------------------- /front_end/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', 5 | sans-serif; 6 | -webkit-font-smoothing: antialiased; 7 | -moz-osx-font-smoothing: grayscale; 8 | } 9 | 10 | code { 11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New', 12 | monospace; 13 | } 14 | -------------------------------------------------------------------------------- /front_end/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom/client'; 3 | import './index.css'; 4 | import App from './App'; 5 | 6 | const root = ReactDOM.createRoot(document.getElementById('root')); 7 | root.render( 8 | 9 | 10 | 11 | ); 12 | 13 | -------------------------------------------------------------------------------- /iac/roots/README.md: -------------------------------------------------------------------------------- 1 | The roots directory should include all top-level Terraform projects. 2 | 3 | A top-level project is defined as a directory containing a main.tf file 4 | that you would want to run "terraform apply" on. Each top-level project 5 | has its own separate Terraform state file. 6 | 7 | Top-level projects should make use of reusable components and modules, 8 | which are located under the "templates" directory. Essentially, your 9 | top-level projects should not define any behavior on their own. They simply 10 | define input variables and make calls to reusable templates. 
-------------------------------------------------------------------------------- /iac/roots/main/clustering_compute.tf: -------------------------------------------------------------------------------- 1 | # Code Deployment 2 | module "cluster_code_bucket" { 3 | source = "../../templates/modules/s3_bucket" 4 | name_prefix = "code-bucket-${var.app_name}-${var.env_name}" 5 | log_bucket = module.log_bucket.name 6 | } 7 | 8 | resource "aws_s3_object" "clustering_code" { 9 | bucket = module.cluster_code_bucket.name 10 | 11 | for_each = fileset("../../../business_logic/stream_consumer", "**/*.*") 12 | key = "stream_consumer/${each.value}" 13 | source = "../../../business_logic/stream_consumer/${each.value}" 14 | source_hash = filemd5("../../../business_logic/stream_consumer/${each.value}") 15 | content_type = each.value 16 | force_destroy = true 17 | } 18 | 19 | # SQS Queue 20 | resource "aws_sqs_queue" "tags" { 21 | name = "${var.app_name}-${var.env_name}-queue" 22 | sqs_managed_sse_enabled = true 23 | } 24 | 25 | # EC2 Instance 26 | data "aws_ami" "amazon_linux" { 27 | most_recent = true 28 | filter { 29 | name = "owner-alias" 30 | values = ["amazon"] 31 | } 32 | filter { 33 | name = "name" 34 | values = ["al2023-ami-2023*"] # Arm 35 | } 36 | filter { 37 | name = "architecture" 38 | values = ["arm64"] # Arm 39 | } 40 | 41 | } 42 | 43 | resource "aws_iam_instance_profile" "this_aws_iam_instance_profile_stream_consumer" { 44 | name = "stream-consumer-instance-profile-${var.app_name}-${var.env_name}" 45 | role = aws_iam_role.stream_consumer_role.name 46 | } 47 | 48 | # User Data 49 | data "cloudinit_config" "this_cloudinit_config" { 50 | gzip = false 51 | base64_encode = false 52 | # Main cloud-config configuration file. 53 | part { 54 | filename = "init.cfg" 55 | content_type = "text/cloud-config" 56 | content = templatefile("${path.module}/templates/init.cfg", 57 | { 58 | CONFIGURE_NODE_SCRIPT = base64gzip(templatefile("${path.module}/templates/ConfigureNode.sh", 59 | { 60 | config = { 61 | "S3_BUCKET_PATH" = "${module.cluster_code_bucket.id}/stream_consumer/" 62 | "S3_BUCKET_NAME" = module.cluster_code_bucket.id 63 | "S3_FILE_KEY" = "checkpoint.pkl" 64 | "SQS_QUEUE" = aws_sqs_queue.tags.url 65 | "DYNAMODB_TABLE" = aws_dynamodb_table.cluster_table.name 66 | "AWS_DEFAULT_REGION" = local.region 67 | } 68 | } 69 | ) 70 | ) 71 | } 72 | ) 73 | } 74 | } 75 | 76 | resource "aws_security_group" "this_aws_security_group_ec2" { 77 | name = "${local.standard_resource_name}-ec2" 78 | description = "Security group for EC2" 79 | vpc_id = module.vpc.vpc_id 80 | egress { 81 | description = "Internet access" 82 | from_port = 0 83 | to_port = 0 84 | protocol = "-1" 85 | cidr_blocks = ["0.0.0.0/0"] 86 | } 87 | tags = merge(local.tags, { Name = "${local.standard_resource_name}-ec2" }) 88 | 89 | } 90 | 91 | resource "aws_launch_template" "this_aws_launch_template" { 92 | name_prefix = "stream-consumer-instance-${local.standard_resource_name}-" 93 | description = "Launch template for stream-consumer-instance-${local.standard_resource_name}" 94 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" }) 95 | image_id = data.aws_ami.amazon_linux.id 96 | instance_type = var.instance_type 97 | vpc_security_group_ids = [aws_security_group.this_aws_security_group_ec2.id] 98 | user_data = base64encode(data.cloudinit_config.this_cloudinit_config.rendered) 99 | ebs_optimized = true 100 | instance_initiated_shutdown_behavior = "stop" 101 | update_default_version = true 102 | 
disable_api_termination = false 103 | 104 | iam_instance_profile { 105 | arn = aws_iam_instance_profile.this_aws_iam_instance_profile_stream_consumer.arn 106 | } 107 | 108 | tag_specifications { 109 | resource_type = "instance" 110 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" }) 111 | } 112 | 113 | tag_specifications { 114 | resource_type = "volume" 115 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" }) 116 | } 117 | 118 | tag_specifications { 119 | resource_type = "network-interface" 120 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" }) 121 | } 122 | 123 | block_device_mappings { 124 | device_name = "/dev/xvda" 125 | ebs { 126 | volume_size = var.volume_size 127 | volume_type = "gp3" 128 | delete_on_termination = true 129 | encrypted = true 130 | kms_key_id = aws_kms_key.this_aws_kms_key.arn 131 | } 132 | } 133 | 134 | metadata_options { 135 | http_endpoint = "enabled" 136 | http_tokens = "required" # Enforces IMDSv2 137 | http_put_response_hop_limit = 1 138 | } 139 | 140 | monitoring { 141 | enabled = true 142 | } 143 | } 144 | 145 | resource "aws_autoscaling_group" "this_aws_autoscaling_group_stream_consumer" { 146 | depends_on = [aws_s3_object.clustering_code] # Code needs to be uploaded to s3 first 147 | name_prefix = "stream-consumer-instance-${local.standard_resource_name}" 148 | launch_template { 149 | id = aws_launch_template.this_aws_launch_template.id 150 | version = "$Latest" 151 | } 152 | vpc_zone_identifier = [module.vpc.private_subnets[0]] 153 | max_size = var.number_of_nodes 154 | min_size = 0 155 | desired_capacity = var.number_of_nodes 156 | service_linked_role_arn = aws_iam_service_linked_role.this_asg_aws_iam_service_linked_role.arn 157 | dynamic "tag" { 158 | for_each = local.tags 159 | iterator = tags 160 | content { 161 | key = tags.key 162 | value = tags.value 163 | propagate_at_launch = true 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /iac/roots/main/embedding_endpoint.tf: -------------------------------------------------------------------------------- 1 | # Create an S3 bucket for storing model artifacts 2 | module "model_bucket" { 3 | source = "../../templates/modules/s3_bucket" 4 | name_prefix = "models-${var.app_name}-${var.env_name}" 5 | log_bucket = module.log_bucket.name 6 | } 7 | 8 | resource "aws_s3_object" "uncompressed_model_artifact" { 9 | bucket = module.model_bucket.name 10 | 11 | for_each = fileset("../../../business_logic/model_artifacts/embedding/model", "**/*.*") 12 | key = "model/${each.value}" 13 | source = "../../../business_logic/model_artifacts/embedding/model/${each.value}" 14 | source_hash = filemd5("../../../business_logic/model_artifacts/embedding/model/${each.value}") 15 | content_type = each.value 16 | force_destroy = true 17 | } 18 | 19 | 20 | # Define IAM Role for SageMaker 21 | resource "aws_iam_role" "sagemaker_execution_role" { 22 | 23 | assume_role_policy = jsonencode({ 24 | "Version" : "2012-10-17", 25 | "Statement" : [{ 26 | "Effect" : "Allow", 27 | "Principal" : { 28 | "Service" : "sagemaker.amazonaws.com" 29 | }, 30 | "Action" : "sts:AssumeRole" 31 | }] 32 | }) 33 | } 34 | 35 | resource "aws_iam_policy" "sagemaker_policy" { 36 | 37 | description = "Policy for SageMaker access to S3 and IAM role assumption" 38 | 39 | policy = jsonencode({ 40 | Version = "2012-10-17", 41 | Statement = [ 42 | { 43 | Effect = "Allow", 44 | 
Action = ["s3:GetObject", "s3:ListBucket"], 45 | Resource = [module.model_bucket.arn, "${module.model_bucket.arn}/*"], 46 | } 47 | ] 48 | }) 49 | } 50 | 51 | # Attach SageMaker permissions to IAM role 52 | resource "aws_iam_policy_attachment" "sagemaker_permissions" { 53 | 54 | name = "sagemaker_permissions" 55 | roles = [aws_iam_role.sagemaker_execution_role.name] 56 | policy_arn = "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess" # Adjust permissions as needed 57 | } 58 | 59 | resource "aws_iam_policy_attachment" "sagemaker_policy_attachment" { 60 | 61 | name = "SageMakerPolicyAttachment" 62 | roles = [aws_iam_role.sagemaker_execution_role.name] # Replace with your IAM role name 63 | policy_arn = aws_iam_policy.sagemaker_policy.arn 64 | } 65 | 66 | # Define SageMaker model 67 | resource "aws_sagemaker_model" "pytorch_model" { 68 | count = var.model_name != "titan" ? 1 : 0 69 | 70 | name = "model-${var.app_name}-${var.env_name}" 71 | 72 | execution_role_arn = aws_iam_role.sagemaker_execution_role.arn 73 | primary_container { 74 | image = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:2.1.0-transformers4.37.0-gpu-py310-cu118-ubuntu20.04" 75 | environment = { 76 | BIT_LOADING = "4" 77 | MODEL_NAME = var.model_name 78 | SAGEMAKER_CONTAINER_LOG_LEVEL = "20" 79 | SAGEMAKER_PROGRAM = "inference.py" 80 | SAGEMAKER_REGION = data.aws_region.current.name 81 | SAGEMAKER_SUBMIT_DIRECTORY = "/opt/ml/model/code" 82 | # Only for Multi-GPU processing (mistral) 83 | # HF_MODEL_ID = "intfloat/e5-mistral-7b-instruct" # ToDO Paramaterize 84 | # PYTORCH_CUDA_ALLOC_CONF = "max_split_size_mb:50" 85 | # SAGEMAKER_MODEL_SERVER_WORKERS = 4 86 | } 87 | 88 | model_data_source { 89 | s3_data_source { 90 | s3_uri = "s3://${module.model_bucket.id}/model/" 91 | s3_data_type = "S3Prefix" 92 | compression_type = "None" 93 | } 94 | } 95 | } 96 | 97 | depends_on = [ 98 | aws_s3_object.uncompressed_model_artifact, 99 | module.model_bucket, 100 | aws_iam_role.sagemaker_execution_role 101 | ] 102 | } 103 | 104 | # Create SageMaker endpoint configuration 105 | resource "aws_sagemaker_endpoint_configuration" "pytorch_endpoint_config" { 106 | count = var.model_name != "titan" ? 1 : 0 107 | 108 | #checkov:skip=CKV_AWS_98: "Ensure all data stored in the Sagemaker Endpoint is securely encrypted at rest" 109 | name = "endpoint-config-${var.app_name}-${var.env_name}" 110 | production_variants { 111 | variant_name = "${var.app_name}-${var.env_name}-traffic" 112 | instance_type = var.embedding_endpoint_instance_type 113 | initial_instance_count = var.embedding_endpoint_instance_count 114 | model_name = aws_sagemaker_model.pytorch_model[count.index].name 115 | } 116 | } 117 | 118 | # Create SageMaker endpoint 119 | resource "aws_sagemaker_endpoint" "pytorch_endpoint" { 120 | count = var.model_name != "titan" ? 
1 : 0 121 | 122 | name = "endpoint-${var.app_name}-${var.env_name}" 123 | endpoint_config_name = aws_sagemaker_endpoint_configuration.pytorch_endpoint_config[count.index].name 124 | } 125 | 126 | # Auto scaling 127 | # resource "aws_appautoscaling_target" "sagemaker_target" { 128 | # max_capacity = var.max_embedding_instance_count 129 | # min_capacity = var.min_embedding_instance_count 130 | # resource_id = "endpoint/${aws_sagemaker_endpoint.pytorch_endpoint.name}/variant/${var.app_name}-${var.env_name}-traffic" 131 | # scalable_dimension = "sagemaker:variant:DesiredInstanceCount" 132 | # service_namespace = "sagemaker" 133 | # } 134 | 135 | # resource "aws_appautoscaling_policy" "sagemaker_policy" { 136 | # name = "${var.app_name}-${var.env_name}-target-tracking" 137 | # policy_type = "TargetTrackingScaling" 138 | # resource_id = aws_appautoscaling_target.sagemaker_target.resource_id 139 | # scalable_dimension = aws_appautoscaling_target.sagemaker_target.scalable_dimension 140 | # service_namespace = aws_appautoscaling_target.sagemaker_target.service_namespace 141 | 142 | # target_tracking_scaling_policy_configuration { 143 | # predefined_metric_specification { 144 | # predefined_metric_type = "SageMakerVariantInvocationsPerInstance" 145 | # } 146 | # target_value = 3 147 | # scale_in_cooldown = 300 148 | # scale_out_cooldown = 60 149 | # } 150 | # } 151 | -------------------------------------------------------------------------------- /iac/roots/main/eventbridge.tf: -------------------------------------------------------------------------------- 1 | resource "time_sleep" "wait_30_seconds" { 2 | depends_on = [aws_sqs_queue.dead_letter_queue, aws_kinesis_stream.input_stream, aws_iam_role.cloudwatch_event_role, aws_iam_role_policy.eventbridge_sfn_policy, aws_sfn_state_machine.pre_processing_sfn] 3 | 4 | create_duration = "30s" 5 | } 6 | 7 | #SQS Queue 8 | resource "aws_sqs_queue" "dead_letter_queue" { 9 | name = "dead-letter-pipe-${local.standard_resource_name}" 10 | sqs_managed_sse_enabled = true 11 | } 12 | 13 | # Eventbridge rule used to trigger step functions off kinesis 14 | resource "aws_pipes_pipe" "event_pipe" { 15 | depends_on = [time_sleep.wait_30_seconds, aws_sqs_queue.dead_letter_queue, aws_kinesis_stream.input_stream, aws_iam_role.cloudwatch_event_role, aws_iam_role_policy.eventbridge_sfn_policy, aws_sfn_state_machine.pre_processing_sfn] 16 | name = "event-pipe-${local.standard_resource_name}" 17 | role_arn = aws_iam_role.cloudwatch_event_role.arn 18 | source = aws_kinesis_stream.input_stream.arn 19 | target = aws_sfn_state_machine.pre_processing_sfn.arn 20 | 21 | source_parameters { 22 | kinesis_stream_parameters { 23 | batch_size = 1 24 | parallelization_factor = 1 25 | starting_position = "TRIM_HORIZON" 26 | maximum_retry_attempts = 0 27 | dead_letter_config { 28 | arn = aws_sqs_queue.dead_letter_queue.arn 29 | } 30 | } 31 | } 32 | 33 | target_parameters { 34 | step_function_state_machine_parameters { 35 | invocation_type = "FIRE_AND_FORGET" 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /iac/roots/main/iam.tf: -------------------------------------------------------------------------------- 1 | # IAM Roles 2 | resource "aws_iam_role" "preprocessing_lambda_role" { 3 | name = "preprocessing-lambda_role-${var.app_name}-${var.env_name}" 4 | assume_role_policy = jsonencode({ 5 | Version = "2012-10-17", 6 | Statement = [ 7 | { 8 | Action = "sts:AssumeRole", 9 | Effect = "Allow", 10 | Principal = { 11 | Service = 
"lambda.amazonaws.com" 12 | }, 13 | } 14 | ], 15 | }) 16 | } 17 | 18 | resource "aws_iam_role" "embedding_lambda_role" { 19 | name = "embedding-lambda_role-${var.app_name}-${var.env_name}" 20 | assume_role_policy = jsonencode({ 21 | Version = "2012-10-17", 22 | Statement = [ 23 | { 24 | Action = "sts:AssumeRole", 25 | Effect = "Allow", 26 | Principal = { 27 | Service = "lambda.amazonaws.com" 28 | }, 29 | } 30 | ], 31 | }) 32 | } 33 | 34 | resource "aws_iam_role" "step_functions_role" { 35 | name = "step_functions_role-${var.app_name}-${var.env_name}" 36 | 37 | assume_role_policy = jsonencode({ 38 | Version = "2012-10-17", 39 | Statement = [ 40 | { 41 | Action = "sts:AssumeRole", 42 | Effect = "Allow", 43 | Principal = { 44 | Service = "states.amazonaws.com" 45 | }, 46 | }, 47 | ], 48 | }) 49 | } 50 | 51 | resource "aws_iam_policy" "step_functions_lambda_policy" { 52 | description = "Policy for Lamba Access" 53 | policy = jsonencode({ 54 | Version = "2012-10-17", 55 | Statement = [ 56 | { 57 | "Effect" : "Allow", 58 | "Action" : [ 59 | "lambda:InvokeFunction" 60 | ], 61 | "Resource" : [ 62 | aws_lambda_function.pre_processing_lambda.arn, aws_lambda_function.embedding_lambda.arn 63 | ] 64 | }, 65 | ] 66 | }) 67 | } 68 | 69 | resource "aws_iam_policy_attachment" "step_functions_lambda_policy_attachment" { 70 | name = "step_functions_lambda_policy_attachment" 71 | roles = [aws_iam_role.step_functions_role.name] 72 | policy_arn = aws_iam_policy.step_functions_lambda_policy.arn 73 | } 74 | 75 | 76 | resource "aws_iam_role" "cloudwatch_event_role" { 77 | name = "cloudwatch_event_role-${var.app_name}-${var.env_name}" 78 | 79 | assume_role_policy = jsonencode({ 80 | Version : "2012-10-17", 81 | Statement : [ 82 | { 83 | Action : "sts:AssumeRole", 84 | Effect : "Allow", 85 | Principal : { 86 | Service : "pipes.amazonaws.com" 87 | } 88 | } 89 | ] 90 | }) 91 | } 92 | 93 | resource "aws_iam_role_policy" "eventbridge_sfn_policy" { 94 | name = "eventbridge_sfn_policy-${var.app_name}-${var.env_name}" 95 | role = aws_iam_role.cloudwatch_event_role.id 96 | policy = jsonencode({ 97 | Version = "2012-10-17", 98 | Statement = [ 99 | { 100 | Action = ["states:StartExecution"] 101 | Resource = [aws_sfn_state_machine.pre_processing_sfn.arn] 102 | Effect = "Allow" 103 | }, 104 | { 105 | Action = ["sqs:SendMessage"] 106 | Resource = [aws_sqs_queue.dead_letter_queue.arn] 107 | Effect = "Allow" 108 | } 109 | ] 110 | } 111 | ) 112 | } 113 | 114 | data "aws_iam_policy_document" "eventbridge_kinesis_policy_document" { 115 | statement { 116 | actions = [ 117 | "kinesis:DescribeStream", 118 | "kinesis:DescribeStreamSummary", 119 | "kinesis:GetRecords", 120 | "kinesis:GetShardIterator", 121 | "kinesis:ListStreams", 122 | "kinesis:ListShards" 123 | ] 124 | resources = [aws_kinesis_stream.input_stream.arn] 125 | } 126 | } 127 | 128 | resource "aws_iam_role_policy" "eventbridge_kinesis_policy" { 129 | name = "eventbridge_kinesis_policy-${var.app_name}-${var.env_name}" 130 | role = aws_iam_role.cloudwatch_event_role.id 131 | policy = data.aws_iam_policy_document.eventbridge_kinesis_policy_document.json 132 | } 133 | 134 | # Consumer Role 135 | resource "aws_iam_role" "stream_consumer_role" { 136 | name = "stream-consumer-role-${var.app_name}-${var.env_name}" 137 | 138 | assume_role_policy = jsonencode({ 139 | Version = "2012-10-17", 140 | Statement = [ 141 | { 142 | Action = "sts:AssumeRole", 143 | Effect = "Allow", 144 | Principal = { 145 | Service = "ec2.amazonaws.com" 146 | } 147 | }, 148 | ] 149 | }) 150 | } 151 | 152 | 
resource "aws_iam_role_policy" "stream_consumer_policy" { 153 | name = "stream_consumer_policy-${var.app_name}-${var.env_name}" 154 | role = aws_iam_role.stream_consumer_role.id 155 | 156 | policy = jsonencode({ 157 | Version = "2012-10-17", 158 | Statement = [ 159 | { 160 | Action = [ 161 | "s3:GetObject", 162 | "s3:ListBucket", 163 | "s3:DeleteObject", 164 | "s3:GetBucketLocation", 165 | "s3:PutObject" 166 | ], 167 | Effect = "Allow", 168 | Resource = ["${module.cluster_code_bucket.arn}/*", module.cluster_code_bucket.arn] 169 | }, 170 | { 171 | "Effect" : "Allow", 172 | "Action" : [ 173 | "logs:CreateLogGroup", 174 | "logs:CreateLogStream", 175 | "logs:PutLogEvents", 176 | "logs:DescribeLogStreams" 177 | ], 178 | "Resource" : [ 179 | "*" 180 | ] 181 | }, 182 | { 183 | Action = [ 184 | "dynamodb:PutItem", 185 | "dynamodb:UpdateItem", 186 | "dynamodb:BatchWriteItem", 187 | "dynamodb:BatchGetItem", 188 | "dynamodb:CreateTable", 189 | "dynamodb:DescribeTable", 190 | "dynamodb:GetItem", 191 | "dynamodb:Scan", 192 | ], 193 | Effect = "Allow", 194 | Resource = [ 195 | aws_dynamodb_table.cluster_table.arn, 196 | "${aws_dynamodb_table.cluster_table.arn}/*" 197 | ], 198 | }, 199 | { 200 | Action = [ 201 | "sqs:ReceiveMessage", 202 | "sqs:DeleteMessage", 203 | "sqs:DeleteMessageBatch", 204 | ] 205 | Effect = "Allow" 206 | Resource = aws_sqs_queue.tags.arn 207 | }, 208 | { 209 | "Effect" : "Allow", 210 | "Action" : [ 211 | "ssm:DescribeAssociation", 212 | "ssm:GetDeployablePatchSnapshotForInstance", 213 | "ssm:GetDocument", 214 | "ssm:DescribeDocument", 215 | "ssm:GetManifest", 216 | "ssm:GetParameter", 217 | "ssm:GetParameters", 218 | "ssm:ListAssociations", 219 | "ssm:ListInstanceAssociations", 220 | "ssm:PutInventory", 221 | "ssm:PutComplianceItems", 222 | "ssm:PutConfigurePackageResult", 223 | "ssm:UpdateAssociationStatus", 224 | "ssm:UpdateInstanceAssociationStatus", 225 | "ssm:UpdateInstanceInformation" 226 | ], 227 | "Resource" : "*" 228 | }, 229 | { 230 | "Effect" : "Allow", 231 | "Action" : [ 232 | "ssmmessages:CreateControlChannel", 233 | "ssmmessages:CreateDataChannel", 234 | "ssmmessages:OpenControlChannel", 235 | "ssmmessages:OpenDataChannel" 236 | ], 237 | "Resource" : "*" 238 | }, 239 | { 240 | "Effect" : "Allow", 241 | "Action" : [ 242 | "ec2messages:AcknowledgeMessage", 243 | "ec2messages:DeleteMessage", 244 | "ec2messages:FailMessage", 245 | "ec2messages:GetEndpoint", 246 | "ec2messages:GetMessages", 247 | "ec2messages:SendReply" 248 | ], 249 | "Resource" : "*" 250 | } 251 | ] 252 | }) 253 | } 254 | 255 | resource "aws_iam_policy" "lambda_execution_policy" { 256 | #checkov:skip=CKV_AWS_355: "Ensure no IAM policies documents allow "*" as a statement's resource for restrictable actions" 257 | #checkov:skip=CKV_AWS_290: "Ensure IAM policies does not allow write access without constraints" 258 | description = "Policy for Lambda Execution" 259 | policy = jsonencode({ 260 | Version = "2012-10-17", 261 | Statement = [ 262 | { 263 | "Effect" : "Allow", 264 | "Action" : [ 265 | "logs:CreateLogGroup", 266 | "logs:CreateLogStream", 267 | "logs:PutLogEvents" 268 | ], 269 | "Resource" : "arn:aws:logs:*:*:*" 270 | }, 271 | { 272 | "Sid" : "AllowModelnvocation", 273 | "Effect" : "Allow", 274 | "Action" : [ 275 | "bedrock:InvokeModel" 276 | ], 277 | "Resource" : "*" 278 | }, 279 | { 280 | Effect = "Allow", 281 | Action = [ 282 | "ecr:GetAuthorizationToken", 283 | "ec2:CreateNetworkInterface", 284 | "ec2:DescribeNetworkInterfaces", 285 | "ec2:DeleteNetworkInterface", 286 | 
"ec2:AssignPrivateIpAddresses", 287 | "ec2:UnassignPrivateIpAddresses" 288 | ], 289 | Resource = "*" 290 | }, 291 | { 292 | "Sid" : "ECRGrantsToConnectAndDownload", 293 | "Effect" : "Allow", 294 | "Action" : [ 295 | "ecr:BatchCheckLayerAvailability", 296 | "ecr:BatchGetImage", 297 | "ecr:GetDownloadUrlForLayer" 298 | ], 299 | "Resource" : "arn:aws:ecr:*:*:repository/*" 300 | }, 301 | { 302 | "Sid" : "AccessToEncryptAndDeccryptKMSKeys", 303 | "Effect" : "Allow", 304 | "Action" : [ 305 | "kms:Decrypt", 306 | "kms:DescribeKey", 307 | "kms:Encrypt", 308 | "kms:GenerateDataKey", 309 | "kms:GetKeyPolicy", 310 | "kms:GetKeyRotationStatus", 311 | "kms:ListGrants", 312 | "kms:ListKeys", 313 | "kms:ListAliases", 314 | "kms:ListKeyPolicies", 315 | "kms:ListResourceTags", 316 | "kms:ListRetirableGrants", 317 | "kms:ReEncryptTo" 318 | ], 319 | "Resource" : [ 320 | aws_kms_key.this_aws_kms_key.arn 321 | ] 322 | }, 323 | ] 324 | }) 325 | } 326 | 327 | resource "aws_iam_policy" "lambda_kinesis_policy" { 328 | description = "Policy for Kinesis Stream Access" 329 | policy = jsonencode({ 330 | Version = "2012-10-17", 331 | Statement = [ 332 | { 333 | "Effect" : "Allow", 334 | "Action" : [ 335 | "kinesis:GetShardIterator", 336 | "kinesis:GetRecords" 337 | ], 338 | "Resource" : [ 339 | aws_kinesis_stream.input_stream.arn 340 | ] 341 | }, 342 | ] 343 | }) 344 | } 345 | 346 | resource "aws_iam_policy" "lambda_sagemaker_policy" { 347 | description = "Policy for Sagemaker Endpoint Access" 348 | policy = jsonencode({ 349 | Version = "2012-10-17", 350 | Statement = [ 351 | { 352 | "Effect" : "Allow", 353 | "Action" : [ 354 | "Sagemaker:InvokeEndpoint" 355 | ], 356 | "Resource" : [ 357 | var.model_name != "titan" ? aws_sagemaker_endpoint.pytorch_endpoint[0].arn : "arn:aws:sagemaker:us-west-2:123456789012:endpoint/dummy-endpoint" # Generate a dummy arn if we aren't using ours 358 | ] 359 | }, 360 | ] 361 | }) 362 | } 363 | 364 | resource "aws_iam_policy" "lambda_sqs_policy" { 365 | description = "Policy for Sagemaker Endpoint Access" 366 | policy = jsonencode({ 367 | Version = "2012-10-17", 368 | Statement = [ 369 | { 370 | "Effect" : "Allow", 371 | "Action" : [ 372 | "sqs:SendMessage" 373 | ], 374 | "Resource" : [ 375 | aws_sqs_queue.tags.arn 376 | ] 377 | }, 378 | ] 379 | }) 380 | } 381 | 382 | resource "aws_iam_policy" "lambda_s3_policy" { 383 | description = "Policy for S3 Access" 384 | policy = jsonencode({ 385 | Version = "2012-10-17", 386 | Statement = [ 387 | { 388 | "Effect" : "Allow", 389 | "Action" : [ 390 | "s3:GetObject", 391 | "s3:ListBucket", 392 | "s3:PutObject" 393 | ], 394 | "Resource" : [ 395 | module.preprocess_data_bucket.arn, 396 | "${module.preprocess_data_bucket.arn}/*", 397 | module.embedding_data_bucket.arn, 398 | "${module.embedding_data_bucket.arn}/*", 399 | ] 400 | }, 401 | ] 402 | }) 403 | } 404 | 405 | resource "aws_iam_policy_attachment" "lambda_execution_policy_attachment" { 406 | name = "lambda_execution_policy_attachment" 407 | roles = [aws_iam_role.summarization_lambda_role.name, aws_iam_role.trigger_sfn_lambda_role.name, aws_iam_role.preprocessing_lambda_role.name, aws_iam_role.embedding_lambda_role.name, aws_iam_role.step_functions_role.name] 408 | policy_arn = aws_iam_policy.lambda_execution_policy.arn 409 | } 410 | 411 | resource "aws_iam_policy_attachment" "lambda_kinesis_policy_attachment" { 412 | name = "lambda_kinesis_policy_attachment" 413 | roles = [aws_iam_role.preprocessing_lambda_role.name] 414 | policy_arn = aws_iam_policy.lambda_kinesis_policy.arn 415 | } 416 | 417 | 
resource "aws_iam_policy_attachment" "lambda_sagemaker_policy_attachment" { 418 | name = "lambda_sagemaker_policy_attachment" 419 | roles = [aws_iam_role.embedding_lambda_role.name] 420 | policy_arn = aws_iam_policy.lambda_sagemaker_policy.arn 421 | } 422 | 423 | resource "aws_iam_policy_attachment" "lambda_s3_policy_attachment" { 424 | name = "lambda_s3_policy_attachment" 425 | roles = [aws_iam_role.embedding_lambda_role.name, aws_iam_role.preprocessing_lambda_role.name] 426 | policy_arn = aws_iam_policy.lambda_s3_policy.arn 427 | } 428 | 429 | resource "aws_iam_policy_attachment" "lambda_sqs_policy_attachment" { 430 | name = "lambda_sqs_policy_attachment" 431 | roles = [aws_iam_role.embedding_lambda_role.name, aws_iam_role.step_functions_role.name] 432 | policy_arn = aws_iam_policy.lambda_sqs_policy.arn 433 | } 434 | 435 | resource "aws_iam_role" "summarization_lambda_role" { 436 | name = "summarization-role-${var.app_name}-${var.env_name}" 437 | 438 | assume_role_policy = jsonencode({ 439 | Version = "2012-10-17", 440 | Statement = [ 441 | { 442 | Action = "sts:AssumeRole", 443 | Effect = "Allow", 444 | Principal = { 445 | Service = "lambda.amazonaws.com" 446 | }, 447 | }, 448 | ], 449 | }) 450 | } 451 | 452 | resource "aws_iam_role_policy" "summarization_policy" { 453 | #checkov:skip=CKV_AWS_355: "Ensure no IAM policies documents allow "*" as a statement's resource for restrictable actions" 454 | #checkov:skip=CKV_AWS_290: "Ensure IAM policies does not allow write access without constraints" 455 | #checkov:skip=CKV_AWS_355: "Ensure no IAM policies documents allow "*" as a statement's resource for restrictable actions" 456 | name = "summarization-policy-${var.app_name}-${var.env_name}" 457 | role = aws_iam_role.summarization_lambda_role.id 458 | 459 | policy = jsonencode({ 460 | Version = "2012-10-17", 461 | Statement = [ 462 | { 463 | Action = [ 464 | "dynamodb:Query", 465 | ], 466 | Resource = [ 467 | aws_dynamodb_table.cluster_table.arn, 468 | "${aws_dynamodb_table.cluster_table.arn}/*" 469 | ], 470 | Effect = "Allow", 471 | }, 472 | { 473 | Action = "bedrock:InvokeModel", 474 | Resource = "*", 475 | Effect = "Allow", 476 | }, 477 | { 478 | Action = "logs:*", 479 | Resource = "arn:aws:logs:${local.region}:${local.account_id}:*", 480 | Effect = "Allow", 481 | }, 482 | ], 483 | }) 484 | } 485 | 486 | resource "aws_iam_role" "summary_sfn_exec_role" { 487 | name = "summary_sfn_exec_role-${var.app_name}-${var.env_name}" 488 | 489 | assume_role_policy = jsonencode({ 490 | Version = "2012-10-17", 491 | Statement = [ 492 | { 493 | Action = "sts:AssumeRole", 494 | Effect = "Allow", 495 | Principal = { 496 | Service = "states.amazonaws.com" 497 | }, 498 | }, 499 | ], 500 | }) 501 | } 502 | 503 | # IAM Policy for Step Functions to write to DynamoDB 504 | resource "aws_iam_role_policy" "summary_sfn_exec_policy" { 505 | name = "summary_sfn_exec_policy-${var.app_name}-${var.env_name}" 506 | role = aws_iam_role.summary_sfn_exec_role.id 507 | 508 | policy = jsonencode({ 509 | Version = "2012-10-17", 510 | Statement = [ 511 | { 512 | Action = [ 513 | "dynamodb:PutItem", 514 | "dynamodb:UpdateItem", 515 | "dynamodb:DeleteItem" 516 | ], 517 | Effect = "Allow", 518 | Resource = [ 519 | aws_dynamodb_table.cluster_table.arn, 520 | "${aws_dynamodb_table.cluster_table.arn}/*" 521 | ], 522 | }, 523 | { 524 | Action = [ 525 | "lambda:InvokeFunction" 526 | ], 527 | Effect = "Allow", 528 | Resource = [aws_lambda_function.summarization_function.arn] 529 | }, 530 | { 531 | Action : [ 532 | 
"xray:GetSamplingRules", 533 | "xray:GetSamplingTargets", 534 | "xray:PutTelemetryRecords", 535 | "xray:PutTraceSegments" 536 | ], 537 | Resource : "*", 538 | Effect : "Allow" 539 | } 540 | ] 541 | }) 542 | } 543 | 544 | resource "aws_iam_role" "trigger_sfn_lambda_role" { 545 | name = "triggers-sfn-role-${var.app_name}-${var.env_name}" 546 | 547 | assume_role_policy = jsonencode({ 548 | Version = "2012-10-17", 549 | Statement = [ 550 | { 551 | Action = "sts:AssumeRole", 552 | Effect = "Allow", 553 | Principal = { 554 | Service = "lambda.amazonaws.com" 555 | }, 556 | }, 557 | ], 558 | }) 559 | } 560 | 561 | resource "aws_iam_role_policy" "trigger_sfn_policy" { 562 | name = "trigger-sfn-policy-${var.app_name}-${var.env_name}" 563 | role = aws_iam_role.trigger_sfn_lambda_role.id 564 | 565 | policy = jsonencode({ 566 | Version = "2012-10-17", 567 | Statement = [ 568 | { 569 | Action = [ 570 | "states:StartExecution", 571 | ], 572 | Resource = aws_sfn_state_machine.summary_sfn.arn, 573 | Effect = "Allow", 574 | }, 575 | { 576 | Action = [ 577 | "dynamodb:GetItem", 578 | "dynamodb:Query", 579 | "dynamodb:PutItem", 580 | "dynamodb:UpdateItem", 581 | "dynamodb:DeleteItem", 582 | "dynamodb:GetRecords", 583 | "dynamodb:GetShardIterator", 584 | "dynamodb:DescribeStream", 585 | "dynamodb:ListStreams" 586 | ], 587 | Resource = [ 588 | aws_dynamodb_table.cluster_table.arn, 589 | "${aws_dynamodb_table.cluster_table.arn}/*" 590 | ], 591 | Effect = "Allow", 592 | }, 593 | { 594 | Action = "logs:*", 595 | Resource = "arn:aws:logs:${local.region}:${local.account_id}:*", 596 | Effect = "Allow", 597 | }, 598 | ], 599 | }) 600 | } 601 | 602 | resource "aws_iam_service_linked_role" "this_asg_aws_iam_service_linked_role" { 603 | aws_service_name = "autoscaling.amazonaws.com" 604 | custom_suffix = local.standard_resource_name 605 | description = "A service linked role for autoscaling to use to call other AWS services" 606 | tags = local.tags 607 | } 608 | -------------------------------------------------------------------------------- /iac/roots/main/kms.tf: -------------------------------------------------------------------------------- 1 | # Create KMS Key and allow the use of it 2 | resource "aws_kms_key" "this_aws_kms_key" { 3 | description = "clustering-summarization-${local.standard_resource_name}" 4 | deletion_window_in_days = 30 5 | multi_region = true 6 | enable_key_rotation = true 7 | tags = merge(local.tags) 8 | } 9 | 10 | resource "aws_kms_key_policy" "this_aws_kms_key_policy" { 11 | key_id = aws_kms_key.this_aws_kms_key.key_id 12 | policy = jsonencode({ 13 | "Version" : "2012-10-17", 14 | "Id" : "key-default-1", 15 | "Statement" : [ 16 | { 17 | "Sid" : "Enable IAM User Permissions", 18 | "Effect" : "Allow", 19 | "Principal" : { 20 | "AWS" : "arn:aws:iam::${local.account_id}:root" 21 | }, 22 | "Action" : "kms:*", 23 | "Resource" : "*" 24 | }, 25 | { 26 | "Effect" : "Allow", 27 | "Principal" : { 28 | "Service" : "logs.${local.region}.amazonaws.com" 29 | }, 30 | "Action" : [ 31 | "kms:Encrypt*", 32 | "kms:Decrypt*", 33 | "kms:ReEncrypt*", 34 | "kms:GenerateDataKey*", 35 | "kms:Describe*" 36 | ], 37 | "Resource" : "*", 38 | "Condition" : { 39 | "ArnEquals" : { 40 | "kms:EncryptionContext:aws:logs:arn" : "arn:aws:logs:${local.region}:${local.account_id}:log-group:*${local.standard_resource_name}*" 41 | } 42 | } 43 | }, 44 | { 45 | "Sid" : "Allow service-linked role use of the customer managed key", 46 | "Effect" : "Allow", 47 | "Principal" : { 48 | "AWS" : 
aws_iam_service_linked_role.this_asg_aws_iam_service_linked_role.arn 49 | }, 50 | "Action" : [ 51 | "kms:Encrypt", 52 | "kms:Decrypt", 53 | "kms:ReEncrypt*", 54 | "kms:GenerateDataKey*", 55 | "kms:DescribeKey" 56 | ], 57 | "Resource" : "*" 58 | }, 59 | { 60 | "Sid" : "Allow attachment of persistent resources", 61 | "Effect" : "Allow", 62 | "Principal" : { 63 | "AWS" : aws_iam_service_linked_role.this_asg_aws_iam_service_linked_role.arn 64 | }, 65 | "Action" : "kms:CreateGrant", 66 | "Resource" : "*", 67 | "Condition" : { 68 | "Bool" : { 69 | "kms:GrantIsForAWSResource" : "true" 70 | } 71 | } 72 | } 73 | ] 74 | }) 75 | } 76 | 77 | resource "aws_kms_alias" "this_aws_kms_alias" { 78 | name = "alias/clustering-summarization-${local.standard_resource_name}" 79 | target_key_id = aws_kms_key.this_aws_kms_key.key_id 80 | } -------------------------------------------------------------------------------- /iac/roots/main/lambda.tf: -------------------------------------------------------------------------------- 1 | # Lambda functions 2 | 3 | module "pre_process_docs_ecr" { 4 | source = "../../templates/modules/ecr" 5 | region = local.region 6 | ecr_name = "pre-process-docs-${local.standard_resource_name}" 7 | build_script_path = "${path.module}/${var.build_script_path}" 8 | business_logic_path = "${path.module}/${var.lambda_code_path}/pre_process_docs/" 9 | tags = local.tags 10 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn 11 | ecr_count_number = 2 12 | ecr_base_arn = local.ecr_base_arn 13 | } 14 | 15 | resource "aws_lambda_function" "pre_processing_lambda" { 16 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)" 17 | #checkov:skip=CKV_AWS_173: "Check encryption settings for Lambda environmental variable" 18 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC" 19 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing" 20 | description = "Executes the pre-process_docs-${local.standard_resource_name} Function" 21 | function_name = "pre-process-docs-${local.standard_resource_name}" 22 | role = aws_iam_role.preprocessing_lambda_role.arn 23 | timeout = 300 # Timeout in seconds (5 minutes) 24 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn 25 | image_uri = module.pre_process_docs_ecr.latest_image_uri 26 | package_type = "Image" 27 | tags = local.tags 28 | reserved_concurrent_executions = -1 29 | # vpc_config { 30 | # # If the list of security group ids and subnets are empty, 31 | # # this property is effectively ignored 32 | # subnet_ids = [aws_subnet.subnet.id] 33 | # security_group_ids = [aws_security_group.sg.id] 34 | # } 35 | 36 | tracing_config { 37 | mode = "Active" 38 | } 39 | 40 | environment { 41 | variables = { 42 | PREPROCESS_BUCKET = module.preprocess_data_bucket.name 43 | } 44 | } 45 | } 46 | 47 | module "embedding_lambda_ecr" { 48 | source = "../../templates/modules/ecr" 49 | region = local.region 50 | ecr_name = "embed-docs-${local.standard_resource_name}" 51 | build_script_path = "${path.module}/${var.build_script_path}" 52 | business_logic_path = "${path.module}/${var.lambda_code_path}/embed_docs/" 53 | tags = local.tags 54 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn 55 | ecr_count_number = 2 56 | ecr_base_arn = local.ecr_base_arn 57 | } 58 | 59 | resource "aws_lambda_function" "embedding_lambda" { 60 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)" 61 | #checkov:skip=CKV_AWS_173: "Check 
encryption settings for Lambda environmental variable" 62 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC" 63 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing" 64 | description = "Executes the embed-docs-${local.standard_resource_name} Function" 65 | function_name = "embed-docs-${local.standard_resource_name}" 66 | role = aws_iam_role.embedding_lambda_role.arn 67 | timeout = 300 # Timeout in seconds (5 minutes) 68 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn 69 | image_uri = module.embedding_lambda_ecr.latest_image_uri 70 | package_type = "Image" 71 | tags = local.tags 72 | reserved_concurrent_executions = -1 73 | # vpc_config { 74 | # # If the list of security group ids and subnets are empty, 75 | # # this property is effectively ignored 76 | # subnet_ids = [aws_subnet.subnet.id] 77 | # security_group_ids = [aws_security_group.sg.id] 78 | # } 79 | 80 | tracing_config { 81 | mode = "Active" 82 | } 83 | 84 | environment { 85 | variables = { 86 | EMBEDDING_ENDPOINT_NAME = var.model_name != "titan" ? aws_sagemaker_endpoint.pytorch_endpoint[0].name : "" 87 | MAX_LENGTH = var.max_length_embedding 88 | SQS_QUEUE_URL = aws_sqs_queue.tags.url 89 | PREPROCESS_BUCKET = module.preprocess_data_bucket.name 90 | EMBEDDING_BUCKET = module.embedding_data_bucket.name 91 | MAX_ARTICLES = var.max_articles_embedding_endpoint 92 | EMBEDDING_MODEL = var.model_name 93 | } 94 | } 95 | } 96 | 97 | module "trigger_sfn_ecr" { 98 | source = "../../templates/modules/ecr" 99 | region = local.region 100 | ecr_name = "trigger-sfn-${local.standard_resource_name}" 101 | build_script_path = "${path.module}/${var.build_script_path}" 102 | business_logic_path = "${path.module}/${var.lambda_code_path}/trigger_sfn/" 103 | tags = local.tags 104 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn 105 | ecr_count_number = 2 106 | ecr_base_arn = local.ecr_base_arn 107 | } 108 | 109 | resource "aws_lambda_function" "trigger_sfn_function" { 110 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)" 111 | #checkov:skip=CKV_AWS_173: "Check encryption settings for Lambda environmental variable" 112 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC" 113 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing" 114 | description = "Executes the trigger-sfn-${local.standard_resource_name} Function" 115 | function_name = "trigger-sfn-${local.standard_resource_name}" 116 | role = aws_iam_role.trigger_sfn_lambda_role.arn 117 | timeout = 30 118 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn 119 | image_uri = module.trigger_sfn_ecr.latest_image_uri 120 | package_type = "Image" 121 | tags = local.tags 122 | 123 | reserved_concurrent_executions = -1 124 | # vpc_config { 125 | # # If the list of security group ids and subnets are empty, 126 | # # this property is effectively ignored 127 | # subnet_ids = [aws_subnet.subnet.id] 128 | # security_group_ids = [aws_security_group.sg.id] 129 | # } 130 | 131 | tracing_config { 132 | mode = "Active" 133 | } 134 | 135 | environment { 136 | variables = { 137 | STATE_MACHINE_ARN = aws_sfn_state_machine.summary_sfn.arn 138 | ARTICLES_THRESHOLD = 5 139 | DYNAMODB_TABLE_NAME = aws_dynamodb_table.cluster_table.name 140 | } 141 | } 142 | } 143 | 144 | module "summarization_function_ecr" { 145 | source = "../../templates/modules/ecr" 146 | region = local.region 147 | ecr_name = 
"summarization-function-docs-${local.standard_resource_name}" 148 | build_script_path = "${path.module}/${var.build_script_path}" 149 | business_logic_path = "${path.module}/${var.lambda_code_path}/summarization/" 150 | tags = local.tags 151 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn 152 | ecr_count_number = 2 153 | ecr_base_arn = local.ecr_base_arn 154 | } 155 | 156 | resource "aws_lambda_function" "summarization_function" { 157 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)" 158 | #checkov:skip=CKV_AWS_173: "Check encryption settings for Lambda environmental variable" 159 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing" 160 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC" 161 | description = "Executes the summarization-function-${local.standard_resource_name} Function" 162 | function_name = "summarization-function-${local.standard_resource_name}" 163 | role = aws_iam_role.summarization_lambda_role.arn 164 | timeout = 30 165 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn 166 | image_uri = module.summarization_function_ecr.latest_image_uri 167 | package_type = "Image" 168 | tags = local.tags 169 | reserved_concurrent_executions = -1 170 | 171 | tracing_config { 172 | mode = "Active" 173 | } 174 | 175 | # vpc_config { 176 | # # If the list of security group ids and subnets are empty, 177 | # # this property is effectively ignored 178 | # subnet_ids = [aws_subnet.subnet.id] 179 | # security_group_ids = [aws_security_group.sg.id] 180 | # } 181 | 182 | environment { 183 | variables = { 184 | DYNAMODB_TABLE_NAME = aws_dynamodb_table.cluster_table.name 185 | MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0" 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /iac/roots/main/outputs.tf: -------------------------------------------------------------------------------- 1 | output "sample_user_creds" { 2 | description = "Sample User Credentials" 3 | value = var.cognito_users 4 | } 5 | 6 | output "dns_record_for_application" { 7 | description = "DNS Address to Access the Application" 8 | value = "https://${aws_alb.this_aws_alb_front_end.dns_name}" 9 | } -------------------------------------------------------------------------------- /iac/roots/main/summarization_pipeline.tf: -------------------------------------------------------------------------------- 1 | resource "aws_sfn_state_machine" "summary_sfn" { 2 | #checkov:skip=CKV_AWS_285: "Ensure State Machine has execution history logging enabled" 3 | 4 | name = "summary-sfn-${var.app_name}-${var.env_name}" 5 | role_arn = aws_iam_role.summary_sfn_exec_role.arn 6 | 7 | tracing_configuration { 8 | enabled = true 9 | } 10 | definition = jsonencode({ 11 | Comment = "An example state machine that invokes a Lambda function and updates DynamoDB.", 12 | StartAt = "SummarizeCluster", 13 | States = { 14 | SummarizeCluster = { 15 | Type = "Task", 16 | Resource = "arn:aws:states:::lambda:invoke", 17 | Parameters = { 18 | FunctionName = aws_lambda_function.summarization_function.arn, 19 | "Payload.$" = "$" 20 | }, 21 | ResultPath = "$.LambdaOutput", 22 | Next = "UpdateDynamoDB" 23 | }, 24 | UpdateDynamoDB = { 25 | Type = "Task", 26 | Resource = "arn:aws:states:::dynamodb:updateItem", 27 | Parameters = { 28 | TableName = aws_dynamodb_table.cluster_table.id 29 | Key = { 30 | "PK" : { 31 | "S.$" : "$.cluster_id" 32 | }, 33 | "SK" : { 34 | "S.$" : 
"States.Format('#METADATA#{}', $.cluster_id)" 35 | } 36 | }, 37 | "UpdateExpression" : "SET #description = :description_val, #generated_summary = :generated_summary_val, #summary_count = :summary_count_val, #most_common_location = :most_common_location_val, #most_common_organization = :most_common_organization_val, #earliest_date = :earliest_date_val, #latest_date = :latest_date_val", 38 | "ExpressionAttributeNames" : { 39 | "#description" : "description", 40 | "#generated_summary" : "generated_summary", 41 | "#summary_count" : "summary_count", 42 | "#most_common_location" : "most_common_location", 43 | "#most_common_organization" : "most_common_organization", 44 | "#earliest_date" : "earliest_date", 45 | "#latest_date" : "latest_date" 46 | }, 47 | "ExpressionAttributeValues" : { 48 | ":description_val" : { "S.$" : "$.LambdaOutput.Payload.title" }, 49 | ":generated_summary_val" : { "S.$" : "$.LambdaOutput.Payload.summary" }, 50 | ":summary_count_val" : { "N.$" : "States.Format('{}', $.LambdaOutput.Payload.summary_count)" }, // Convert to a string 51 | ":most_common_location_val" : { "S.$" : "$.LambdaOutput.Payload.most_common_location" }, 52 | ":most_common_organization_val" : { "S.$" : "$.LambdaOutput.Payload.most_common_organization" }, 53 | ":earliest_date_val" : { "S.$" : "$.LambdaOutput.Payload.earliest_date" }, 54 | ":latest_date_val" : { "S.$" : "$.LambdaOutput.Payload.latest_date" } 55 | } 56 | }, 57 | End = true 58 | } 59 | } 60 | }) 61 | } 62 | -------------------------------------------------------------------------------- /iac/roots/main/templates/ClusterList-js.template: -------------------------------------------------------------------------------- 1 | // src/components/ClusterList.js 2 | import React, { useState, useEffect, useRef } from "react"; 3 | import AWS from "aws-sdk"; 4 | import { 5 | Button, 6 | Table, 7 | Box, 8 | ProgressBar, 9 | SpaceBetween, 10 | } from "@cloudscape-design/components"; 11 | import { fetchAuthSession } from "@aws-amplify/auth"; 12 | import ClusterModal from "./ClusterModal"; 13 | import awsConfig from "../aws-exports"; 14 | 15 | const refreshInterval = 5000; 16 | 17 | const ClusterList = () => { 18 | const [clusters, setClusters] = useState([]); 19 | const [selectedCluster, setSelectedCluster] = useState(null); 20 | const [totalArticles, setTotalArticles] = useState(0); 21 | const [isModalVisible, setModalVisible] = useState(false); 22 | const [progress, setProgress] = useState(0); // Initialize progress at 0% 23 | const [secondsRemaining, setSecondsRemaining] = useState( 24 | refreshInterval / 1000 25 | ); // Initialize countdown 26 | 27 | const dynamoDbRef = useRef(); 28 | 29 | useEffect(() => { 30 | const configureAWS = async () => { 31 | const session = await fetchAuthSession(); 32 | const { accessKeyId, secretAccessKey, sessionToken } = 33 | session.credentials; 34 | AWS.config.update({ 35 | region: awsConfig.aws_cognito_region, 36 | credentials: new AWS.Credentials( 37 | accessKeyId, 38 | secretAccessKey, 39 | sessionToken 40 | ), 41 | }); 42 | dynamoDbRef.current = new AWS.DynamoDB.DocumentClient(); 43 | fetchClusters(); 44 | }; 45 | configureAWS(); 46 | }, []); 47 | 48 | useEffect(() => { 49 | const intervalId = setInterval(() => { 50 | fetchClusters(); 51 | }, refreshInterval); 52 | 53 | const progressId = setInterval(() => { 54 | setProgress( 55 | (prevProgress) => (prevProgress + (1000 / refreshInterval) * 100) % 100 56 | ); 57 | setSecondsRemaining((prevSeconds) => 58 | prevSeconds <= 1 ? 
refreshInterval / 1000 : prevSeconds - 1 59 | ); 60 | }, 1000); 61 | 62 | return () => { 63 | clearInterval(intervalId); 64 | clearInterval(progressId); 65 | }; 66 | }, []); 67 | 68 | const fetchClusters = async () => { 69 | if (!dynamoDbRef.current) { 70 | console.log("DynamoDB client not initialized"); 71 | return; 72 | } 73 | let lastEvaluatedKey = null; 74 | const allItems = []; 75 | let articlesCount = 0; 76 | const params = { 77 | TableName: "${DYNAMODB_TABLE_NAME}", 78 | }; 79 | 80 | do { 81 | if (lastEvaluatedKey) { 82 | params.ExclusiveStartKey = lastEvaluatedKey; 83 | } 84 | const data = await dynamoDbRef.current.scan(params).promise(); 85 | allItems.push(...data.Items); 86 | lastEvaluatedKey = data.LastEvaluatedKey; 87 | } while (lastEvaluatedKey); 88 | 89 | const articlesByCluster = allItems.reduce((acc, item) => { 90 | if (item.is_cluster) { 91 | acc[item.PK] = acc[item.PK] || []; 92 | } else if (item.SK.startsWith("ARTICLE#")) { 93 | if (item.publication_date) { 94 | articlesCount++; 95 | if (acc[item.PK]) { 96 | acc[item.PK].push(item); 97 | } 98 | } 99 | } 100 | return acc; 101 | }, {}); 102 | 103 | const newClusters = allItems 104 | .filter( 105 | (item) => 106 | item.is_cluster && 107 | item.generated_summary && 108 | articlesByCluster[item.PK] && 109 | articlesByCluster[item.PK].length > 2 110 | ) 111 | .map((cluster) => ({ 112 | ...cluster, 113 | articles: articlesByCluster[cluster.PK], 114 | number_of_articles: articlesByCluster[cluster.PK].length, 115 | })) 116 | .sort((a, b) => b.number_of_articles - a.number_of_articles); 117 | 118 | setClusters(newClusters); 119 | setTotalArticles(articlesCount); 120 | }; 121 | 122 | const handleViewArticles = (cluster) => { 123 | console.log("Opening modal for cluster:", cluster.PK); 124 | setSelectedCluster(cluster); 125 | setModalVisible(true); // Set the modal to be visible 126 | }; 127 | 128 | const wrapStyleSummary = { 129 | whiteSpace: "normal", // Allow the text to wrap to the next line 130 | wordBreak: "break-word", // Ensure words break correctly at the end of the line 131 | maxWidth: "600px", // Set a maximum width for the cell content 132 | textAlign: "justify", // Center the text 133 | }; 134 | 135 | const wrapStyleTitle = { 136 | whiteSpace: "normal", // Allow the text to wrap to the next line 137 | wordBreak: "break-word", // Ensure words break correctly at the end of the line 138 | maxWidth: "150px", // Set a maximum width for the cell content 139 | textAlign: "center", 140 | }; 141 | 142 | const wrapStyleNumberOfArticles = { 143 | whiteSpace: "normal", // Allow the text to wrap to the next line 144 | wordBreak: "break-word", // Ensure words break correctly at the end of the line 145 | maxWidth: "100px", // Set a maximum width for the cell content 146 | textAlign: "center", 147 | }; 148 | 149 | // Column definitions using inline styles 150 | const columnDefinitions = [ 151 | { 152 | header: "Title", 153 | cell: (item) =>
<div style={wrapStyleTitle}>{item.description}</div>
, 154 | }, 155 | { 156 | header: "Summary", 157 | cell: (item) => ( 158 |
<div style={wrapStyleSummary}>{item.generated_summary}</div>
159 | ), 160 | }, 161 | { 162 | header: "Articles", 163 | cell: (item) => ( 164 |
<div style={wrapStyleNumberOfArticles}>{item.number_of_articles}</div>
165 | ), 166 | }, 167 | { 168 | header: "View", 169 | cell: (item) => ( 170 | <Button onClick={() => handleViewArticles(item)}>View Articles</Button> 171 | ), 172 | }, 173 | ]; 174 | 175 | return ( 176 | 177 | 178 |

179 | {" "} 180 | Near Real Time News Clustering and Summarization Demo 181 |

182 | 183 | Total Clusters: {clusters.length} | Total Articles: {totalArticles} 184 | 185 |
186 | 190 |
191 | 192 |
197 | {selectedCluster && ( 198 | { 202 | setSelectedCluster(null); 203 | setModalVisible(false); // Hide the modal when closed 204 | }} 205 | visible={isModalVisible} // Control visibility with state 206 | /> 207 | )} 208 | 209 | 210 | ); 211 | }; 212 | 213 | export default ClusterList; 214 | -------------------------------------------------------------------------------- /iac/roots/main/templates/ConfigureNode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Update the system 4 | dnf install python3.11 -y 5 | dnf install python3.11-pip -y 6 | dnf install amazon-cloudwatch-agent -y 7 | 8 | # Set the environment variables 9 | %{ for config_key, config_value in config } 10 | export ${config_key}="${config_value}" 11 | %{ endfor ~} 12 | 13 | # Download and set up the code 14 | cat > /usr/local/bin/clustering-compute.sh << EOF 15 | #!/bin/bash 16 | for i in 1;do 17 | %{ for config_key, config_value in config } 18 | export ${config_key}="${config_value}" 19 | %{ endfor ~} 20 | 21 | cd /home/ec2-user 22 | mkdir -p stream_consumer 23 | cd stream_consumer 24 | aws s3 sync s3://$${S3_BUCKET_PATH} . 25 | 26 | # Run script 27 | python3.11 -m pip install -r requirements.txt 28 | python3.11 process_records.py >> /var/log/clustering-compute-python.log 2>&1 29 | 30 | done 31 | EOF 32 | 33 | # Permission the script 34 | chmod +x /usr/local/bin/clustering-compute.sh 35 | 36 | # Sleeping just for things to get settled 37 | sleep 30 38 | 39 | # Sending the logs to Cloudwatch 40 | touch /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json 41 | cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json < /etc/systemd/system/clustering-compute.service << EOF 78 | [Unit] 79 | Description=Clustering Compute Process 80 | After=syslog.target network.target remote-fs.target nss-lookup.target 81 | 82 | [Service] 83 | ExecStart=/usr/local/bin/clustering-compute.sh 84 | RestartSec=300 85 | Restart=always 86 | 87 | [Install] 88 | WantedBy=multi-user.target 89 | EOF 90 | 91 | # Start the clustering-compute.service 92 | systemctl daemon-reload 93 | systemctl enable clustering-compute.service 94 | systemctl start clustering-compute.service 95 | systemctl status clustering-compute.service 96 | -------------------------------------------------------------------------------- /iac/roots/main/templates/aws-exports-js.template: -------------------------------------------------------------------------------- 1 | const awsConfig = { 2 | aws_project_region: '${AWS_REGION}', // AWS region of Cognito 3 | aws_cognito_region: '${AWS_REGION}', // AWS region of Cognito 4 | aws_cognito_identity_pool_id: '${AWS_COGNITO_IDENTITY_POOL}', // Identity pool ID 5 | aws_user_pools_id: '${AWS_COGNITO_USER_POOL_ID}', // User Pool ID 6 | aws_user_pools_web_client_id: '${AWS_CONGITO_USER_POOL_APP_CLIENT_ID}', // App client ID 7 | federationTarget: "COGNITO_USER_POOLS" // keep as "COGNITO_USER_POOLS" 8 | }; 9 | export default awsConfig; -------------------------------------------------------------------------------- /iac/roots/main/templates/cognito-policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "cognito-identity:GetCredentialsForIdentity" 8 | ], 9 | "Resource": [ 10 | "*" 11 | ] 12 | }, 13 | { 14 | "Sid": "VisualEditor0", 15 | "Effect": "Allow", 16 | "Action": [ 17 | "dynamodb:Scan" 18 | ], 19 | "Resource": 
"${dd_table_arn}" 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /iac/roots/main/templates/ecs-role.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "BasicDescribePolicy", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "ecr:GetAuthorizationToken", 9 | "logs:CreateLogGroup", 10 | "logs:CreateLogStream", 11 | "logs:DescribeLogStreams", 12 | "logs:PutLogEvents" 13 | ], 14 | "Resource": "*" 15 | }, 16 | { 17 | "Sid": "AccessToEncryptAndDeccryptKMSKeys", 18 | "Effect": "Allow", 19 | "Action": [ 20 | "kms:Decrypt", 21 | "kms:DescribeKey", 22 | "kms:Encrypt", 23 | "kms:GenerateDataKey", 24 | "kms:GetKeyPolicy", 25 | "kms:GetKeyRotationStatus", 26 | "kms:ListGrants", 27 | "kms:ListKeys", 28 | "kms:ListAliases", 29 | "kms:ListKeyPolicies", 30 | "kms:ListResourceTags", 31 | "kms:ListRetirableGrants", 32 | "kms:ReEncryptTo" 33 | ], 34 | "Resource": [ 35 | "${kms_key_arn}" 36 | ] 37 | }, 38 | { 39 | "Sid": "ECRGrantsToConnectAndDownload", 40 | "Effect": "Allow", 41 | "Action": [ 42 | "ecr:BatchCheckLayerAvailability", 43 | "ecr:BatchGetImage", 44 | "ecr:GetDownloadUrlForLayer" 45 | ], 46 | "Resource": "arn:aws:ecr:*:*:repository/*${standard_resource_name}*" 47 | }, 48 | { 49 | "Sid": "ECSGrants", 50 | "Effect": "Allow", 51 | "Action": [ 52 | "ecs:CreateCluster", 53 | "ecs:DeregisterContainerInstance", 54 | "ecs:DescribeServices", 55 | "ecs:DiscoverPollEndpoint", 56 | "ecs:Poll", 57 | "ecs:RegisterContainerInstance", 58 | "ecs:RegisterContainerInstance", 59 | "ecs:StartTelemetrySession", 60 | "ecs:Submit*", 61 | "ecs:UpdateContainerInstancesState", 62 | "ecs:UpdateService" 63 | ], 64 | "Resource": "*", 65 | "Condition": { 66 | "ForAllValues:StringEquals": { 67 | "aws:ResourceTag/common_identifier": "*${standard_resource_name}*" 68 | } 69 | } 70 | } 71 | ] 72 | } -------------------------------------------------------------------------------- /iac/roots/main/templates/init.cfg: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | write_files: 3 | - content: | 4 | ${CONFIGURE_NODE_SCRIPT} 5 | encoding: gz+b64 6 | path: /usr/local/bin/ConfigureNode.sh 7 | permissions: "0755" 8 | runcmd: 9 | - /usr/local/bin/ConfigureNode.sh 10 | -------------------------------------------------------------------------------- /iac/roots/main/terraform.tfvars: -------------------------------------------------------------------------------- 1 | // Copyright 2023 Amazon.com and its affiliates; all rights reserved. 2 | // This file is Amazon Web Services Content and may not be duplicated or distributed without permission. 3 | 4 | app_name = "clustering" 5 | env_name = "demo2" 6 | cidr_block = "10.0.0.0/16" 7 | public_subnet = ["10.0.2.0/24", "10.0.3.0/24", "10.0.4.0/24"] 8 | private_subnet = ["10.0.10.0/24", "10.0.11.0/24", "10.0.12.0/24"] 9 | -------------------------------------------------------------------------------- /iac/roots/main/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Amazon.com and its affiliates; all rights reserved. 2 | # This file is Amazon Web Services Content and may not be duplicated or distributed without permission. 
3 | 4 | variable "app_name" { 5 | type = string 6 | description = "Name of the app" 7 | } 8 | 9 | variable "env_name" { 10 | type = string 11 | description = "Name of the environment" 12 | } 13 | 14 | # VPC Variables 15 | variable "cidr_block" { 16 | description = "The CIDR block for the VPC. Default value is a valid CIDR, but not acceptable by AWS and should be overridden" 17 | type = string 18 | default = "0.0.0.0/0" 19 | } 20 | 21 | variable "public_subnet" { 22 | description = "A list of public subnets inside the VPC" 23 | type = list(string) 24 | default = [] 25 | } 26 | 27 | variable "private_subnet" { 28 | description = "A list of private subnets inside the VPC" 29 | type = list(string) 30 | default = [] 31 | } 32 | 33 | variable "lambda_code_path" { 34 | description = "Relative path to the Lambda functions' code" 35 | type = string 36 | default = "../../../business_logic/lambdas" 37 | } 38 | 39 | variable "build_script_path" { 40 | description = "Relative path to the Build functions' code" 41 | type = string 42 | default = "../../../build-script" 43 | } 44 | 45 | variable "model_name" { 46 | description = "'bge', 'titan', 'mistralinstruct'" 47 | type = string 48 | default = "titan" 49 | } 50 | 51 | variable "max_length_embedding" { 52 | description = "Max length on the encode call within the Sagemaker endpoint: 512, 1024, 2048, 4096" 53 | type = string 54 | default = "512" 55 | } 56 | 57 | variable "embedding_endpoint_instance_type" { 58 | description = "Instance type for embedding endpoint" 59 | type = string 60 | # default = "ml.inf2.xlarge" 61 | default = "ml.g5.2xlarge" 62 | # default = "ml.g5.12xlarge" 63 | } 64 | 65 | variable "embedding_endpoint_instance_count" { 66 | description = "Number of instances of embedding endpoint" 67 | type = number 68 | default = 2 69 | } 70 | 71 | /* 72 | variable "azs" { 73 | description = "A list of availability zones in the region" 74 | type = list(string) 75 | default = [] 76 | } 77 | 78 | variable "embedding_strategy" { 79 | description = "'concat' or 'pooling'" 80 | type = string 81 | default = "concat" 82 | } 83 | 84 | variable "pooling_strategy" { 85 | description = "'mean' or 'max'" 86 | type = string 87 | default = "mean" 88 | } 89 | 90 | variable "min_embedding_instance_count" { 91 | description = "Number of instances of embedding endpoint" 92 | type = number 93 | default = 1 94 | } 95 | 96 | variable "max_embedding_instance_count" { 97 | description = "Number of instances of embedding endpoint" 98 | type = number 99 | default = 8 100 | } 101 | */ 102 | 103 | variable "max_articles_embedding_endpoint" { 104 | description = "Maximum number of articles the embedding endpoint can take in one API call" 105 | type = number 106 | default = 200 107 | } 108 | 109 | variable "instance_type" { 110 | type = string 111 | default = "c7g.4xlarge" 112 | description = "Instance type for the for the clustering compute" 113 | } 114 | 115 | variable "volume_size" { 116 | type = number 117 | description = "Volume Size of the EBS Volume" 118 | default = 35 119 | } 120 | 121 | variable "number_of_nodes" { 122 | type = number 123 | description = "Number of Nodes Needed for the clustering compute" 124 | default = 1 125 | } 126 | 127 | variable "auto_verified_attributes" { 128 | type = list(any) 129 | default = ["email"] 130 | description = "Attributes to be auto-verified. Valid values: email, phone_number." 
131 | } 132 | 133 | variable "mfa_configuration" { 134 | type = string 135 | default = "OFF" 136 | description = "Multi-Factor Authentication (MFA) configuration for the User Pool. Defaults of OFF. Valid values are OFF, ON and OPTIONAL." 137 | } 138 | 139 | variable "advanced_security_mode" { 140 | type = string 141 | default = "OFF" 142 | description = "Mode for advanced security, must be one of OFF, AUDIT or ENFORCED." 143 | } 144 | 145 | variable "allow_software_mfa_token" { 146 | description = "(Optional) Boolean whether to enable software token Multi-Factor (MFA) tokens, such as Time-based One-Time Password (TOTP). To disable software token MFA when 'sms_configuration' is not present, the 'mfa_configuration' argument must be set to OFF and the 'software_token_mfa_configuration' configuration block must be fully removed." 147 | type = bool 148 | default = false 149 | } 150 | 151 | variable "case_sensitive" { 152 | type = bool 153 | default = true 154 | description = "Whether username case sensitivity will be applied for all users in the user pool through Cognito APIs." 155 | } 156 | 157 | variable "sms_authentication_message" { 158 | type = string 159 | default = "Your username is {username}. Sign up at {####}" 160 | description = "String representing the SMS authentication message. The Message must contain the {####} placeholder, which will be replaced with the code." 161 | } 162 | 163 | variable "minimum_length" { 164 | type = number 165 | description = "(Optional) The minimum length of the password policy that you have set." 166 | default = 6 167 | } 168 | 169 | variable "require_lowercase" { 170 | type = bool 171 | description = "(Optional) Whether you have required users to use at least one lowercase letter in their password." 172 | default = false 173 | } 174 | 175 | variable "require_numbers" { 176 | type = bool 177 | default = false 178 | description = "Whether you have required users to use at least one number in their password." 179 | } 180 | 181 | variable "require_symbols" { 182 | type = bool 183 | default = false 184 | description = "Whether you have required users to use at least one symbol in their password." 185 | } 186 | 187 | variable "require_uppercase" { 188 | type = bool 189 | default = false 190 | description = "Whether you have required users to use at least one uppercase letter in their password." 191 | } 192 | 193 | variable "temporary_password_validity_days" { 194 | type = number 195 | description = "(Optional) In the password policy you have set, refers to the number of days a temporary password is valid. If the user does not sign-in during this time, their password will need to be reset by an administrator." 196 | default = 100 197 | } 198 | 199 | variable "cognito_users" { 200 | description = "A map of user attributes for each user in the User Pool. Each attribute is a name-value pair." 
201 | type = map(object({ 202 | name = string 203 | email = string 204 | password = string 205 | })) 206 | default = { 207 | user1 = { 208 | name = "aws-user" 209 | email = "donotreply@amazon.com" 210 | password = "awsiscool$" 211 | } 212 | } 213 | } 214 | 215 | variable "front_end_path" { 216 | description = "Relative path to the Lambda functions' code" 217 | type = string 218 | default = "../../../front_end" 219 | } 220 | 221 | variable "task_cpu" { 222 | type = number 223 | description = "VCPUs for Task" 224 | default = 512 225 | } 226 | 227 | variable "task_memory" { 228 | type = number 229 | description = "Memory for Task" 230 | default = 2048 231 | } 232 | 233 | variable "launch_type" { 234 | type = string 235 | description = "Launch type for the service." 236 | default = "FARGATE" 237 | } 238 | 239 | variable "desired_count" { 240 | type = number 241 | description = "The number of instances of the task definition to place and keep running" 242 | default = 1 243 | } -------------------------------------------------------------------------------- /iac/roots/main/versions.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.4.2" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 5.60" 8 | } 9 | cloudinit = { 10 | source = "hashicorp/cloudinit" 11 | version = "2.3.4" 12 | } 13 | time = { 14 | source = "hashicorp/time" 15 | version = ">= 0.11" 16 | } 17 | local = { 18 | source = "hashicorp/local" 19 | version = "2.5.1" 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /iac/templates/README.md: -------------------------------------------------------------------------------- 1 | The templates directory contains reusable Terraform configurations. 2 | They are reusable because they avoid hardcoding values and instead 3 | expose input parameters so that these values can be set. 4 | 5 | Templates can be used multiple times by top-level projects. For example, 6 | you might have a template that creates an SSM parameter. Your top-level 7 | project could call your template once for a primary region and a second 8 | time for a DR region. -------------------------------------------------------------------------------- /iac/templates/components/README.md: -------------------------------------------------------------------------------- 1 | Components are higher-level reusable Terraform configurations. Components 2 | combine modules to create higher-level abstractions. -------------------------------------------------------------------------------- /iac/templates/modules/README.md: -------------------------------------------------------------------------------- 1 | Modules are reusable infrastructure building blocks that are used 2 | by higher-level components or top-level projects. 
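
A minimal sketch of how a top-level project might consume one of these modules. The argument names below mirror the `ecr` module calls in `iac/roots/main/lambda.tf`; the literal values are placeholders for illustration only, not defaults shipped with the module:

```hcl
# Hypothetical root-level call of the ecr module (all values are illustrative).
module "example_lambda_ecr" {
  source = "../../templates/modules/ecr"

  region              = "us-east-1"                                     # placeholder region
  ecr_name            = "example-function-myapp-dev"                    # repository to create
  build_script_path   = "../../../build-script"                         # contains dir_md5.sh
  business_logic_path = "../../../business_logic/lambdas/embed_docs/"   # Docker build context
  tags                = { app = "myapp", env = "dev" }
  aws_kms_key_arn     = "arn:aws:kms:us-east-1:111122223333:key/00000000-0000-0000-0000-000000000000"
  ecr_count_number    = 2
  ecr_base_arn        = "arn:aws:ecr:us-east-1:111122223333"
}

# The module exposes the pushed image URI, which a container-image Lambda can reference:
#   image_uri = module.example_lambda_ecr.latest_image_uri
```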
-------------------------------------------------------------------------------- /iac/templates/modules/ecr/main.tf: -------------------------------------------------------------------------------- 1 | # Checks if build folder has changed 2 | data "external" "this_external" { 3 | program = ["bash", "${var.build_script_path}/dir_md5.sh", "${var.business_logic_path}"] 4 | } 5 | 6 | resource "aws_ecr_repository" "this_aws_ecr_repository" { 7 | name = var.ecr_name 8 | tags = var.tags 9 | image_tag_mutability = "IMMUTABLE" 10 | force_delete = true 11 | image_scanning_configuration { 12 | scan_on_push = true 13 | } 14 | encryption_configuration { 15 | encryption_type = "KMS" 16 | kms_key = var.aws_kms_key_arn 17 | } 18 | } 19 | 20 | resource "aws_ecr_lifecycle_policy" "this_aws_ecr_lifecycle_policy" { 21 | policy = <