├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── artifacts
│   ├── architecture.png
│   ├── demo.gif
│   ├── webui.png
│   └── webui_news.png
├── build-script
│   ├── build.sh
│   └── dir_md5.sh
├── business_logic
│   ├── lambdas
│   │   ├── embed_docs
│   │   │   ├── Dockerfile
│   │   │   ├── embed_docs.py
│   │   │   └── requirements.txt
│   │   ├── pre_process_docs
│   │   │   ├── Dockerfile
│   │   │   ├── pre_process_docs.py
│   │   │   └── requirements.txt
│   │   ├── summarization
│   │   │   ├── Dockerfile
│   │   │   ├── news summarization streaming.ipynb
│   │   │   ├── requirements.txt
│   │   │   └── summarization.py
│   │   └── trigger_sfn
│   │       ├── Dockerfile
│   │       ├── requirements.txt
│   │       └── trigger_sfn.py
│   ├── model_artifacts
│   │   ├── embedding
│   │   │   └── model
│   │   │       ├── code
│   │   │       │   ├── inference.py
│   │   │       │   └── requirements.txt
│   │   │       └── model
│   │   │           ├── __init__.py
│   │   │           ├── embed_documents.py
│   │   │           └── embedding_model_utils.py
│   │   └── multi_gpu_embedding
│   │       └── model
│   │           └── code
│   │               ├── inference.py
│   │               └── requirements.txt
│   ├── stream_consumer
│   │   ├── clustering.py
│   │   ├── process_records.py
│   │   └── requirements.txt
│   └── temp.json
├── data
│   ├── clear_data.py
│   ├── download_public_data.sh
│   ├── example_article.json
│   ├── put_records.py
│   ├── script.py
│   └── send_articles.sh
├── front_end
│   ├── Dockerfile
│   ├── README.md
│   ├── nginx.conf
│   ├── package-lock.json
│   ├── package.json
│   ├── public
│   │   ├── favicon.ico
│   │   ├── index.html
│   │   ├── logo192.png
│   │   ├── logo512.png
│   │   ├── manifest.json
│   │   └── robots.txt
│   └── src
│       ├── App.css
│       ├── App.js
│       ├── components
│       │   ├── ClusterList.js
│       │   └── ClusterModal.js
│       ├── index.css
│       └── index.js
└── iac
    ├── roots
    │   ├── README.md
    │   └── main
    │       ├── clustering_compute.tf
    │       ├── embedding_endpoint.tf
    │       ├── eventbridge.tf
    │       ├── iam.tf
    │       ├── kms.tf
    │       ├── lambda.tf
    │       ├── main.tf
    │       ├── outputs.tf
    │       ├── summarization_pipeline.tf
    │       ├── templates
    │       │   ├── ClusterList-js.template
    │       │   ├── ConfigureNode.sh
    │       │   ├── aws-exports-js.template
    │       │   ├── cognito-policy.json
    │       │   ├── ecs-role.json
    │       │   └── init.cfg
    │       ├── terraform.tfvars
    │       ├── variables.tf
    │       └── versions.tf
    └── templates
        ├── README.md
        ├── components
        │   └── README.md
        └── modules
            ├── README.md
            ├── ecr
            │   ├── main.tf
            │   ├── outputs.tf
            │   └── variables.tf
            ├── lambda
            │   ├── README.md
            │   ├── main.tf
            │   ├── outputs.tf
            │   └── variables.tf
            └── s3_bucket
                ├── README.md
                ├── main.tf
                ├── outputs.tf
                └── variables.tf
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Local .terraform directories
3 | **/.terraform/*
4 |
5 | # .tfstate files
6 | *.tfstate
7 | *.tfstate.*
8 | *.lock.hcl
9 | *.venv
10 | # Crash log files
11 | crash.log
12 |
13 | *.zip
14 | *.pem
15 | *.tar.gz
16 |
17 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most
18 | # .tfvars files are managed as part of configuration and so should be included in
19 | # version control.
20 | #
21 | # example.tfvars
22 |
23 | # Ignore override files as they are usually used to override resources locally and so
24 | # are not checked in
25 | override.tf
26 | override.tf.json
27 | *_override.tf
28 | *_override.tf.json
29 |
30 | # Include override files you do wish to add to version control using negated pattern
31 | #
32 | # !example_override.tf
33 |
34 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
35 | # example: *tfplan*
36 |
37 | # Pycharm
38 | .venv
39 | .idea
40 | .idea/
41 |
42 | # Go files
43 | go.mod
44 | go.sum
45 |
46 | # node modules
47 | **/node_modules/*
48 |
49 | # build folder
50 | **/build/*
51 |
52 | # MacOS folder
53 | .DS_Store
54 |
55 | # Local developer temporary files such as for scratchpads
56 | temp-*
57 |
58 | # Files created by gitlab-runner
59 | builds
60 |
61 | # project generated files
62 | environment/.environment-*.json
63 | environment/.current-environment
64 | environment/.cli-profiles.json
65 | environment/app-env-var-names-backup.txt
66 | environment/.choice-cache.json
67 | environment/make-env
68 | environment/.log.txt
69 | cicd/iam-role/final-cicd-iam-role.json
70 | iac/bootstrap/final-tf-backend-cf-stack.json
71 | *.bak
72 | config/.env
73 | package
74 |
75 | model_evaluation/
76 |
77 | .env
78 | 1000_embeddings.json
79 | 5000_embeddings.json
80 |
81 | articles/
82 | customer_data/
83 | public_data/
84 | featured_data/
85 | eps_screen_results/
86 | test_results/
87 | cluster_results
88 |
89 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
90 |
91 | # dependencies
92 | **/node_modules
93 | **/.pnp
94 | .pnp.js
95 |
96 | # testing
97 | **/coverage
98 |
99 | # production
100 | **/build
101 |
102 | # misc
103 | .DS_Store
104 | .env.local
105 | .env.development.local
106 | .env.test.local
107 | .env.production.local
108 |
109 | npm-debug.log*
110 | yarn-debug.log*
111 | yarn-error.log*
112 |
113 | *.env
114 | **/*.env
115 | **/aws-exports.js
116 | venv
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT No Attribution
2 |
3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so.
10 |
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17 |
18 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Declare all Make targets as phony (they do not produce files)
2 | .PHONY: init deploy-all destroy-all send-articles download-public-dataset clear-data
3 |
4 | # Terraform Init
5 | init:
6 | terraform -chdir=iac/roots/main init
7 |
8 | # Deploy all targets in the correct order
9 | deploy-all:
10 | terraform -chdir=iac/roots/main apply -auto-approve
11 |
12 | # Destroy all targets in the correct order
13 | destroy-all:
14 | terraform -chdir=iac/roots/main apply -destroy
15 |
16 | send-articles:
17 | @echo "Sending articles..."
18 | cd data && ./send_articles.sh && cd ..
19 |
20 | download-public-dataset:
21 | @echo "Downloading public dataset..."
22 | cd data && ./download_public_data.sh && cd ..
23 |
24 | clear-data:
25 | @echo "Clearing DynamoDB table, SQS queue, S3 bucket DBSCAN memory and removing EC2 instance from ASG..."
26 | cd data && python clear_data.py && cd ..
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # News Clustering And Summarization
2 |
3 | ## Table of Contents
4 | 1. [About this Repository](#About)
5 | 2. [Architecture](#Architecture)
6 | 3. [Demo](#Demo)
7 | 4. [Tool Versions](#Versions)
8 | 5. [Prerequisites](#Prerequisites)
9 | 6. [Build and Deploy](#Build_Deploy)
10 | 7. [Test](#Test)
11 | 8. [Destroy](#Destroy)
12 | 9. [License](#License)
13 |
14 | ## About this Repository
15 |
16 | ### News Clustering And Summarization
17 |
18 | This solution aims to launch a news Event feature that clusters related news stories and summarizes them, providing customers with near real-time updates on unfolding events.
19 | This augmented news consumption experience will enable users to easily follow evolving stories while maximizing relevance and reducing the firehose of information for articles covering the same event. By tailoring news clusters around key events, this application can improve customer satisfaction and engagement.
20 |
21 | This project was built by the AWS GFS SA team, FSI PACE team and the Generative AI Innovation Center.
22 |
23 | ### Repository Details
24 |
25 | Below are some descriptions of the content in this repository.
26 | ```
27 | artifacts/ # Contains the architecture diagram and demo gif
28 | build-script/ # Hosts required build scripts
29 | |--- build.sh # Creates and Uploads docker image for the Lambda function
30 | |--- dir_md5.sh # Trigger the rebuild of docker image once the underlying code changes
31 | business_logic/ # Hosts the necessary artifacts and logic to run the solution
32 | |--- lambdas/ # Each folder contains a .py file and a requirements.txt to run the code
33 | |--- embed_docs/ # Takes a list of processed documents and embeds them via Bedrock Titan or a SageMaker endpoint
34 | |--- pre_process_docs/ # Generates the string to embed by concatenating relevant fields
35 | |--- summarization/ # Generates the summary of a list of articles
36 | |--- trigger_sfn/ # Determines if the added articles should trigger a new summarization or if we should wait
37 | |--- model_artifacts/ # Necessary artifacts to deploy the embedding model
38 | |--- embedding/
39 | |--- model/
40 | |--- code/
41 | | inference.py # In specific SageMaker format for handling inputs and outputs, and invoking model
42 | | requirements.txt # List of requirements for model to run
43 | |--- model/ # Contains the code for importing and running model
44 | |--- stream_consumer/ # Contains the code and requirements for the clustering compute
45 | |--- example.json
46 | data/ # For testing data upload to kinesis stream
47 | |--- customer_data/ # DOES NOT CURRENTLY EXIST, place json documents in here to have them accessible for testing
48 | |--- put_records.py # Pushes files from customer_data to kinesis stream for processing
49 | |--- clear_data.py # Clears the DynamoDB table, SQS queue, S3 bucket DBSCAN memory and removes the EC2 instance from the ASG
50 | |--- send_articles.sh # Sends articles to the kinesis stream for processing to simulate a data feed
51 | frontend/ # Front end code for demo purposes
52 | iac/ # All infrastructure as code
53 | |--- roots/
54 | |--- main/
55 | |--- _globals.tf
56 | |--- backend.tf
57 | |--- clustering_compute.tf # Deploys the clustering compute
58 | |--- dynamodb.tf # Creates the table that contains cluster information
59 | |--- embedding_endpoint.tf # Deploys the embedding SageMaker endpoint
60 | |--- eventbridge.tf # Creates the EventBridge pipe
61 | |--- iam.tf # Defines IAM policies and roles
62 | |--- lambda.tf # Builds all necessary lambda functions
63 | |--- main.tf # Creates some S3 buckets, Kinesis streams, and Step Function for ingestion
64 | |--- outputs.tf # Not used in this solution
65 | |--- summarization_pipeline.tf # Deploys the summarization Step Functions
66 | |--- terraform.tfvars # Defines app and environment names
67 | |--- variables.tf # Necessary variables
68 | |--- vpc.tf # Creates VPC and other necessary networking
69 | |--- README.md
70 | |--- templates/
71 | Makefile # Simplifies scripts for easy deployment
72 | README.md # File you are currently reading with details on how to operate the solution
73 | ```
74 |
75 | ## Architecture
76 |
77 | 
78 |
79 | This solution leverages a combination of AWS managed services and serverless options to create a scalable, event-driven microservice architecture capable of processing dozens of news articles per second. The architecture utilizes AWS Lambda, Step Functions, Amazon Kinesis, EventBridge (Pipes), DynamoDB, EC2 with Auto Scaling Groups, S3, and Amazon Bedrock.
80 |
81 | The workflow begins with raw JSON article ingestion through Amazon Kinesis, bridged to Step Functions via EventBridge Pipes. The first Step Functions state machine preprocesses documents and embeds articles using Titan Embeddings on Bedrock. Data is temporarily stored in S3 between steps to handle large payloads. Processed articles are then sent to SQS for micro-batch clustering.
82 |
83 | Clustering occurs on EC2 instances, which pull batches from SQS and apply the DBSCAN algorithm. Results update a DynamoDB table, with periodic checkpoints saved to S3. DynamoDB Streams trigger summarization pipelines when clusters reach a specified threshold. Summaries are generated using Claude Haiku through another Step Functions workflow and stored back in DynamoDB for UI access.
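
The clustering logic itself lives in ```business_logic/stream_consumer/clustering.py```. As a rough, hypothetical sketch of the idea (not the repository's implementation, and with placeholder `eps`/`min_samples` values), a micro-batch of article embeddings can be grouped with DBSCAN like this:

```python
# Hypothetical sketch of micro-batch clustering; eps and min_samples are
# placeholders, not the values used by stream_consumer/clustering.py.
import numpy as np
from sklearn.cluster import DBSCAN


def cluster_batch(embeddings, eps=0.5, min_samples=2):
    """Return one cluster label per embedding; -1 marks noise (no cluster yet)."""
    labels = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit_predict(
        np.array(embeddings)
    )
    return labels.tolist()
```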
84 |
85 | This architecture ensures high scalability, fault tolerance, and near real-time processing of large volumes of articles, making it suitable for applications requiring rapid content analysis and clustering.
86 |
87 | ## Demo
88 |
89 | Below is a GIF that demonstrates the solution in action.
90 |
91 | 
92 |
93 | The demo demonstrates the solution by sending articles to the Kinesis stream and waiting for those articles to be clustered and summarized. The solution begins clustering once it reaches 500 articles (this threshold can be changed). The web UI refreshes every 5 seconds by reading the DynamoDB table, which contains the clusters, articles, and summaries.
94 |
95 | ## Tool Versions
96 |
97 | To build and deploy this template the following tools are required.
98 |
99 | 1. AWS CLI >= 2
100 | 2. Terraform >= 1.4.6
101 | 3. Docker
102 | 4. md5
103 |
104 | ## Prerequisites
105 |
106 | ### Credentials
107 |
108 | Use the access key and secret access key of an IAM user, or export temporary credentials.
109 |
110 | ### Environment
111 |
112 | The environment and application names are currently defined as follows in ```iac/roots/main/terraform.tfvars```:
113 | ```
114 | appName = "clustering"
115 | envName = "demo2"
116 | ```
117 | To edit these values navigate to ```iac/roots/main/terraform.tfvars``` and manually change them.
118 |
119 | ### Understanding the MakeFile
120 |
121 | At the root of the repository there is a ```Makefile```. It provides custom targets that wrap some of the Terraform commands for ease of use.
122 |
123 | This includes the following commands:
124 | ```
125 | # Terraform Init
126 | init:
127 | terraform -chdir=iac/roots/main init
128 |
129 | # Deploy all targets in the correct order
130 | deploy-all:
131 | terraform -chdir=iac/roots/main apply -auto-approve
132 |
133 | # Destroy all targets in the correct order
134 | destroy-all:
135 | terraform -chdir=iac/roots/main apply -destroy
136 |
137 | # Send Articles
138 | send-articles:
139 | cd data && ./send_articles.sh && cd ..
140 |
141 | # Download Public Dataset
142 | download-public-dataset:
143 | cd data && ./download_public_data.sh && cd ..
144 |
145 | # Clear Data
146 | clear-data:
147 | cd data && python clear_data.py && cd ..
148 | ```
149 |
150 | In the next sections, we will explain when to use these commands.
151 |
152 |
153 | ### Init Terraform
154 |
155 | To initialize terraform, run
156 | ```
157 | make init
158 | ```
159 |
160 | This will run ```terraform -chdir=iac/roots/main init```.
161 |
162 |
163 | ## Build and Deploy
164 |
165 | ### Deploy
166 |
167 | To deploy the resources, run:
168 |
169 | ```
170 | make deploy-all
171 | ```
172 |
173 | This will run ```terraform -chdir=iac/roots/main apply -auto-approve```.
174 |
175 | ### Accessing the Frontend
176 |
177 | To access the frontend, look for the Terraform output, which should look something like:
178 |
179 | ```
180 | dns_record_for_application = "https://front-end-clustering-demo2-1234567890.us-east-1.elb.amazonaws.com"
181 | sample_user_creds = tomap({
182 | "user1" = {
183 | "email" = "donotreply@amazon.com"
184 | "name" = "aws-user"
185 | "password" = "awsiscool$"
186 | }
187 | })
188 | ```
189 |
190 | Open the link in your browser. You should see a login screen; sign in using the email and password provided in the output.
191 |
192 | The page should look like the following:
193 |
194 | 
195 |
196 | After you have logged in, please go ahead and start sending articles to the solution. You can follow the instructions in the sections below.
197 |
198 | ## Test
199 |
200 | ### Testing Infrastructure
201 |
202 | Running ```make download-public-dataset``` will download a public dataset to ```data/public-data```.
203 |
204 | The following is an example of an item in the dataset. The solution expects the `text`, `title`, and `date` keys to be populated:
205 | ```json
206 | {
207 | "id": "66536",
208 | "text": "this is the body of the article", // ! Required
209 | "title": "Article Title", // ! Required
210 | "date": "2013-12-18 08:14:00", // ! Required
211 | "event_id": "9102",
212 | "duplicate": false,
213 | "lang": "deu",
214 | "bag_id": "b738a3b7-2db3-4d38-88c8-76c4eb4f835b-2325",
215 | "source": "finanzen.net",
216 | "cluster": "322"
217 | }
218 |
219 | ```
220 |
221 | > Note: Use this data for exploration and testing; use your own data for production. Just ensure it has the same expected fields: text, title, and date.
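
A minimal, hypothetical pre-flight check for your own data (not part of this repository) could look like the following; note that the pipeline also uses the `id` field as the S3 object key:

```python
# Hypothetical helper: check that an article dict has the fields the pipeline relies on.
REQUIRED_FIELDS = ("id", "text", "title", "date")  # "id" is used as the S3 object key


def missing_fields(article: dict) -> list:
    """Return the required fields that are absent or empty."""
    return [field for field in REQUIRED_FIELDS if not article.get(field)]
```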
222 |
223 | ### Installing Required Python Libraries
224 |
225 | Before sending articles, ensure you have the `boto3` and `tqdm` Python libraries installed. You can install them using the following command:
226 |
227 | ```bash
228 | pip install boto3 tqdm
229 | ```
230 |
231 | ### Sending Articles
232 |
233 | Running ```make send-articles``` will call on ```data/put_records.py```.
234 |
235 | ```put_records.py``` relies on the following global variables
236 | ```
237 | STREAM_NAME = "input-stream-clustering-demo2" # Name of Kinesis stream
238 | PARTITION_KEY = "a" # Partition key of Kinesis stream (does not need editing)
239 | JSON_DIR = "./customer_data" # Path to article json files
240 | COUNT = 1200000 # Number of articles to test with (actual number run is min(COUNT, num articles in JSON_DIR))
241 | BATCH_SIZE = 5 # Number of articles to send as a batch to the Kinesis stream
242 | ```
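
As a rough sketch of what the send step amounts to (the actual ```data/put_records.py``` may batch and retry differently), each Kinesis record carries a JSON array of articles, which matches what ```pre_process_docs.py``` expects after base64-decoding the record data:

```python
# Hypothetical sketch: push articles to the Kinesis stream in small JSON batches.
# The stream name, partition key, and batch size mirror the globals shown above.
import json

import boto3

kinesis = boto3.client("kinesis")


def send_articles(articles, stream_name="input-stream-clustering-demo2",
                  partition_key="a", batch_size=5):
    """Send articles as JSON arrays of up to batch_size items per Kinesis record."""
    for start in range(0, len(articles), batch_size):
        batch = articles[start:start + batch_size]
        kinesis.put_record(
            StreamName=stream_name,
            Data=json.dumps(batch),
            PartitionKey=partition_key,
        )
```
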
243 | Once you have sent articles, you should see them in the frontend. The frontend will display clusters as they are formed and updated in real time.
244 | A screenshot of the frontend displaying news clusters is shown below:
245 |
246 | 
247 |
248 | Each row in the web UI displays a cluster, its summary, the number of articles in the cluster, and a link to view the articles in the cluster. If you click the "View Articles" button, you can see each article in detail with its title, date, and full text.
249 |
250 | *Note: after testing, you may need to clear the SQS queue and the DynamoDB table.*
251 |
252 | ### Clearing Data
253 |
254 | Running ```make clear-data``` will clear the DynamoDB table, the SQS queue, and the S3 bucket holding DBSCAN memory, and remove the EC2 instance from the ASG. Please wait 5-10 minutes after clearing the data before sending more news to the solution; it takes a few minutes for the ASG to create a new EC2 instance.
255 |
256 | If you changed the application or environment name variables, you might need to edit the ```data/clear_data.py``` file to match the new names.
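
For reference, the cleanup boils down to boto3 calls along these lines (a hypothetical sketch; the queue URL and table name are placeholders, and the real ```data/clear_data.py``` may differ):

```python
# Hypothetical sketch of the kind of cleanup clear_data.py performs.
# The queue URL and table name are placeholders for this project's naming scheme.
import boto3

sqs = boto3.client("sqs")
dynamodb = boto3.resource("dynamodb")


def purge_queue(queue_url):
    """Drop all pending messages from the clustering input queue."""
    sqs.purge_queue(QueueUrl=queue_url)


def clear_table(table_name):
    """Delete every item from the cluster table (fine for demo-sized data)."""
    table = dynamodb.Table(table_name)
    items = table.scan(ProjectionExpression="PK, SK").get("Items", [])
    with table.batch_writer() as batch:
        for item in items:
            batch.delete_item(Key={"PK": item["PK"], "SK": item["SK"]})
```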
257 |
258 | ### Testing Business Logic
259 |
260 | The ```test``` folder has automated embedding and epsilon tests with notebooks for evaluating clustering and summarization.
261 |
262 | For more details, navigate to the ```README.md``` in the ```test``` folder.
263 |
264 |
265 | ## Destroy
266 |
267 | To destroy the resources, run:
268 |
269 | ```
270 | make destroy-all
271 | ```
272 |
273 | This will run ```terraform -chdir=iac/roots/main apply -destroy ```.
274 |
275 | # Contributors
276 |
277 | - [Samuel Baruffi](https://www.linkedin.com/in/samuelbaruffi/)
278 | - [Kareem Abdol-Hamid](https://www.linkedin.com/in/kabdolha/)
279 | - [Alexandar (Ally) Meringer](https://www.linkedin.com/in/kabdolha/)
280 | - [Hector Lopez Hernandez](https://www.linkedin.com/in/hlopezhernandez/)
281 | - [Yanxiang Yu](https://www.linkedin.com/in/yyu2/)
282 | - [Nitin Jain](https://www.linkedin.com/in/annjay/)
283 |
284 | ## License
285 |
286 | This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE) file.
287 |
288 |
289 |
290 |
291 |
--------------------------------------------------------------------------------
/artifacts/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/architecture.png
--------------------------------------------------------------------------------
/artifacts/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/demo.gif
--------------------------------------------------------------------------------
/artifacts/webui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/webui.png
--------------------------------------------------------------------------------
/artifacts/webui_news.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/artifacts/webui_news.png
--------------------------------------------------------------------------------
/build-script/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This is the order of arguments
4 | ECR_BASE_ARN=${1}
5 | BUILD_FOLDER=${2}
6 | IMAGE_NAME=${3}
7 | IMAGE_URI=${4}
8 | TARGET_AWS_REGION=${5}
9 | MYTAG=$(date +%Y%m%d%H%M%S)
10 |
11 | # Check that git is installed
12 | which git >/dev/null || {
13 | echo 'ERROR: git is not installed'
14 | exit 1
15 | }
16 |
17 | # Check that aws is installed
18 | which aws >/dev/null || {
19 | echo 'ERROR: aws-cli is not installed'
20 | exit 1
21 | }
22 |
23 | # Check that docker is installed
24 | which docker >/dev/null || {
25 | echo 'ERROR: docker is not installed'
26 | exit 1
27 | }
28 |
29 | # Connect into aws and login into ECR
30 | SLEEP_INT=$((1 + RANDOM % 11))
31 | for CTR in {1..5}; do
32 |
33 | # Check that docker is running
34 | docker ps >/dev/null
35 | DOCKER_STATUS=$?
36 |
37 | # Check that ECR creds are obtained
38 | aws ecr get-login-password --region ${TARGET_AWS_REGION} | docker login --username AWS --password-stdin ${ECR_BASE_ARN}
39 | ECR_GET_CREDS_STATUS=$?
40 |
41 | if [ ${ECR_GET_CREDS_STATUS} -ne 0 ] || [ ${DOCKER_STATUS} -ne 0 ]; then
42 | echo "ERROR: aws ecr login failed, trying again in ${SLEEP_INT} Seconds"
43 | sleep ${SLEEP_INT}
44 | ((CTR=CTR+1))
45 | continue
46 | else
47 | echo "SUCCESS: aws ecr login succeded in ${CTR} attempt"
48 | break
49 | fi
50 | exit 1
51 | done
52 |
53 | # Build image
54 | docker build --no-cache -t ${IMAGE_NAME} ${BUILD_FOLDER} --platform linux/amd64 || {
55 |     echo 'ERROR: docker build failed'
56 | exit 1
57 | }
58 |
59 | # Docker Tag and Push
60 | docker tag ${IMAGE_NAME} ${IMAGE_URI}:${MYTAG}
61 | docker push ${IMAGE_URI}:${MYTAG} || {
62 |     echo 'ERROR: docker push failed'
63 | exit 1
64 | }
65 |
66 | # Get the sha of the image
67 | SHA_IMAGE=$(docker inspect --format='{{.RepoDigests}}' ${IMAGE_URI}:${MYTAG})
68 | echo "Tags Used for ${IMAGE_NAME} Image are ${MYTAG} with this SHA : ${SHA_IMAGE}"
--------------------------------------------------------------------------------
/build-script/dir_md5.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script facilitates checking if the contents are different between `terraform apply` runs
4 |
5 | # List of arguments
6 | build_folder=${1}
7 |
8 | # Linux has command md5sum and OSX has command md5
9 | if command -v md5sum >/dev/null 2>&1; then
10 | MD5_PROGRAM=md5sum
11 | elif command -v md5 >/dev/null 2>&1; then
12 | MD5_PROGRAM=md5
13 | else
14 | echo "ERROR: md5sum is not installed"
15 | exit 255
16 | fi
17 |
18 | # Take the md5 of each object inside the build folder and then take an md5 of that output
19 | md5_output="$(eval ${MD5_PROGRAM} $build_folder/** | ${MD5_PROGRAM})"
20 |
21 | # Output result as JSON back to terraform
22 | echo "{ \"md5\": \"${md5_output}\" }"
23 |
--------------------------------------------------------------------------------
/business_logic/lambdas/embed_docs/Dockerfile:
--------------------------------------------------------------------------------
1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images
2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used
3 |
4 | FROM amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d
5 |
6 | USER root
7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all
8 |
9 | # Set a non-root user
10 | ARG USERNAME=lambda
11 | ARG USER_UID=1000
12 | ARG USER_GID=$USER_UID
13 |
14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \
15 | && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \
16 | && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \
17 | && chmod 0440 /etc/sudoers.d/$USERNAME
18 |
19 | WORKDIR /var/task
20 |
21 | COPY requirements.txt /var/task
22 | COPY embed_docs.py /var/task
23 |
24 | RUN chown -R ${USERNAME}:${USERNAME} /var/task && \
25 | chmod 755 /var/task/embed_docs.py /var/task/requirements.txt
26 |
27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt
28 |
29 | USER ${USERNAME}
30 |
31 | CMD ["embed_docs.handler"]
32 |
--------------------------------------------------------------------------------
/business_logic/lambdas/embed_docs/embed_docs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import boto3
4 |
5 | SQS_QUEUE_URL = os.environ["SQS_QUEUE_URL"]
6 | MAX_ARTICLES = int(os.environ["MAX_ARTICLES"])
7 | EMBEDDING_ENDPOINT_NAME = os.environ["EMBEDDING_ENDPOINT_NAME"]
8 | EMBEDDING_MODEL = os.environ["EMBEDDING_MODEL"]
9 | MAX_LENGTH = int(os.environ["MAX_LENGTH"])
10 | EMBEDDING_FIELDS = [
11 | "title",
12 | "summary",
13 | "text",
14 |     # * Useful for embeddings but not present in the public dataset
15 | # "subjects",
16 | # "industries",
17 | # "organizations",
18 | # "people",
19 | # "locations",
20 | ]
21 | PREPROCESS_BUCKET = os.environ["PREPROCESS_BUCKET"]
22 | EMBEDDING_BUCKET = os.environ["EMBEDDING_BUCKET"]
23 |
24 | s3_client = boto3.client("s3")
25 | sagemaker_client = boto3.client("sagemaker-runtime")
26 | sqs_client = boto3.client("sqs")
27 | bedrock_client = boto3.client("bedrock-runtime")
28 |
29 |
30 | def create_concat_text(doc_list):
31 | concat_list = []
32 | for doc in doc_list:
33 | concat_text = []
34 | for field in EMBEDDING_FIELDS:
35 | if isinstance(doc[field], str):
36 | concat_text.append(doc[field])
37 |
38 | # concat_text = [doc[f] for f in EMBEDDING_FIELDS]
39 | print("Concat Text", concat_text)
40 | concatenated = "\n".join(concat_text)
41 | concat_list.append(concatenated)
42 | return concat_list
43 |
44 |
45 | # Event is list of S3 keys
46 | def handler(event, context):
47 |
48 | document_list = []
49 | for s3_key in event:
50 | print("Getting article from ", s3_key)
51 | response = s3_client.get_object(Bucket=PREPROCESS_BUCKET, Key=s3_key)
52 | data = response["Body"].read().decode("utf-8")
53 | doc = json.loads(data)
54 | document_list.append(doc)
55 |
56 | text_list = create_concat_text(document_list)
57 | print("Text list: ", text_list)
58 |
59 | data = {"input_texts": text_list, "max_length": MAX_LENGTH}
60 |
61 | # Print the content
62 | print("Data:")
63 | print(data)
64 | json_data = json.dumps(data)
65 | print("Embedding endpoint name: ", EMBEDDING_ENDPOINT_NAME)
66 |
67 |     if len(document_list) > MAX_ARTICLES:
68 |         document_list = document_list[:MAX_ARTICLES]
69 |
70 | # If titan use bedrock, otherwise use sagemaker
71 | prediction = {"embeddings": []}
72 | if EMBEDDING_MODEL == "titan":
73 | for text in text_list:
74 | response = bedrock_client.invoke_model(
75 | body=json.dumps(
76 | {"inputText": text, "dimensions": MAX_LENGTH, "normalize": True}
77 | ),
78 | modelId="amazon.titan-embed-text-v2:0",
79 | accept="application/json",
80 | contentType="application/json",
81 | )
82 | response_body = json.loads(response.get("body").read().decode("utf-8"))
83 | prediction["embeddings"].append(response_body["embedding"])
84 | else:
85 | # Push content to the SageMaker endpoint
86 | response = sagemaker_client.invoke_endpoint(
87 | EndpointName=EMBEDDING_ENDPOINT_NAME,
88 | ContentType="application/json",
89 | Body=json_data,
90 | )
91 | prediction = json.loads(response["Body"].read().decode("utf-8"))
92 |
93 | print("Prediction:")
94 | print(prediction)
95 | embedding_list = prediction["embeddings"]
96 |
97 | for i, doc in enumerate(document_list):
98 | doc["concat_embedding"] = [embedding_list[i]]
99 | message_body = json.dumps(doc)
100 | if len(message_body.encode("utf-8")) > 262144:
101 | print(f"Skipping item at index {i} due to size limit")
102 | continue
103 | s3_key = doc["id"] + ".json"
104 | json_data = json.dumps(doc)
105 | sqs_client.send_message(QueueUrl=SQS_QUEUE_URL, MessageBody=json_data)
106 | s3_client.put_object(Bucket=EMBEDDING_BUCKET, Key=s3_key, Body=json_data)
107 |
108 | print("End of function")
109 | return "Success"
110 |
--------------------------------------------------------------------------------
/business_logic/lambdas/embed_docs/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
--------------------------------------------------------------------------------
/business_logic/lambdas/pre_process_docs/Dockerfile:
--------------------------------------------------------------------------------
1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images
2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used
3 |
4 | FROM amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d
5 |
6 | USER root
7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all
8 |
9 | # Set a non-root user
10 | ARG USERNAME=lambda
11 | ARG USER_UID=1000
12 | ARG USER_GID=$USER_UID
13 |
14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \
15 | && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \
16 | && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \
17 | && chmod 0440 /etc/sudoers.d/$USERNAME
18 |
19 | WORKDIR /var/task
20 |
21 | COPY requirements.txt /var/task
22 | COPY pre_process_docs.py /var/task
23 |
24 | RUN chown -R ${USERNAME}:${USERNAME} /var/task && \
25 | chmod 755 /var/task/pre_process_docs.py /var/task/requirements.txt
26 |
27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt
28 |
29 | USER ${USERNAME}
30 |
31 | CMD ["pre_process_docs.handler"]
32 |
--------------------------------------------------------------------------------
/business_logic/lambdas/pre_process_docs/pre_process_docs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import pickle
4 | import boto3
5 | from typing import List, Dict
6 | from bs4 import BeautifulSoup
7 | import re
8 | import base64
9 |
10 | kinesis_client = boto3.client("kinesis")
11 | s3_client = boto3.client("s3")
12 |
13 | PREPROCESS_BUCKET = os.environ["PREPROCESS_BUCKET"]
14 |
15 |
16 | def clean_text(text):
17 | # apply to title
18 |     text = text.replace("&quot;", '"')
19 | text = re.sub(r'[^:a-zA-Z0-9\s"\'-]', "", text)
20 | return text
21 |
22 |
23 | def extract_top_subjects(subject_entry: List[dict], threshold: float):
24 | subjects = []
25 | for e in subject_entry:
26 | if e["relevance"] >= threshold:
27 | subjects.append(e["long_name"])
28 |
29 | return "StorySubjects: " + ", ".join(subjects)
30 |
31 |
32 | def extract_top_industries(industries_entry: List[dict], threshold: float):
33 | industries = []
34 | for e in industries_entry:
35 | if e["relevance"] >= threshold:
36 | industries.append(e["long_name"])
37 |
38 | result = "RelevantIndustries: " + ", ".join(industries) if industries else ""
39 |
40 | return result
41 |
42 |
43 | def extract_top_organizations(orgs_entry: List[dict], threshold: float):
44 | orgs = []
45 | for e in orgs_entry:
46 | if e["relevance"] >= threshold:
47 | orgs.append(e["name"])
48 |
49 | result = "RelevantOrganizations: " + ", ".join(orgs) if orgs else ""
50 |
51 | return result
52 |
53 |
54 | def remove_tags(text: str):
55 | soup = BeautifulSoup(text, "html.parser")
56 | return soup.get_text()
57 |
58 |
59 | def get_names(people: List[Dict], threshold=0.5):
60 |
61 | names = [person["name"] for person in people if person["relevance"] > threshold]
62 |
63 | result = "PeopleOfInterest: " + ", ".join(names) if names else ""
64 |
65 | return result
66 |
67 |
68 | def get_locations(locations: List[dict], threshold=0.8):
69 | result = []
70 | if locations:
71 | names = [
72 | location["long_name"]
73 | for location in locations
74 | if location["relevance"] > threshold
75 | ]
76 |
77 | result = "Location: " + ", ".join(names) if names else ""
78 |
79 | return result
80 |
81 |
82 | def process_data(data: dict):
83 |
84 | # irrelevant columns for embedding
85 | drop = [
86 | "vendor_data",
87 | "headline_only",
88 | "deckline",
89 | "version",
90 | "story_link",
91 | "copyright_line",
92 | "display_date",
93 | "received_date",
94 | "publication_reason",
95 | "media",
96 | "spam",
97 | "control_flags",
98 | "issuer",
99 | "market",
100 | "business_relevance",
101 | "cluster_signature",
102 | "headline_cluster_signature",
103 | "signals",
104 | "cik",
105 | "feed",
106 | ]
107 |
108 | processed_data = {}
109 | for k, v in data.items():
110 | if k not in drop:
111 | processed_data[k] = v
112 |
113 | processed_data["title"] = clean_text(data["title"])
114 | processed_data["summary"] = clean_text(
115 | data["text"]
116 | ) # No summary in public dataset using text
117 | processed_data["text"] = remove_tags(data["text"])
118 | processed_data["publication_date"] = remove_tags(data["date"])
119 |
120 | ## * Additional data that's useful for embeddings but isn't in public data
121 | # processed_data["subjects"] = extract_top_subjects(data["subjects"], threshold=0.8)
122 | # processed_data["summary"] = clean_text(data["summary"])
123 | # processed_data["industries"] = extract_top_industries(
124 | # data["industries"], threshold=0.8
125 | # )
126 | # processed_data["organizations"] = extract_top_organizations(
127 | # data["organizations"], threshold=0.6
128 | # )
129 | # processed_data["people"] = get_names(data["people"], threshold=0.5)
130 | # processed_data["locations"] = get_locations(data.get("locations"), threshold=0.8)
131 |
132 | return processed_data
133 |
134 |
135 | def handler(events, context):
136 | event = events[0]
137 | print("EVENT: ", event)
138 |
139 | encrypted_list = event["data"]
140 | document_json = base64.b64decode(encrypted_list).decode("utf-8")
141 |
142 | document_list = json.loads(document_json)
143 | print("Document List: ", document_list)
144 | s3_key_list = []
145 |
146 | for doc in document_list:
147 | processed_data = process_data(doc)
148 |
149 | print("Processed Data:")
150 | print(processed_data)
151 |
152 | s3_key = processed_data["id"] + ".json"
153 | json_data = json.dumps(processed_data)
154 | print("Pushing data to ", PREPROCESS_BUCKET + "/" + s3_key)
155 | s3_client.put_object(Bucket=PREPROCESS_BUCKET, Key=s3_key, Body=json_data)
156 |
157 | s3_key_list.append(s3_key)
158 |
159 | print("End of function: ", s3_key_list)
160 | return s3_key_list
161 |
--------------------------------------------------------------------------------
/business_logic/lambdas/pre_process_docs/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | bs4
3 | chardet
--------------------------------------------------------------------------------
/business_logic/lambdas/summarization/Dockerfile:
--------------------------------------------------------------------------------
1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images
2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used
3 |
4 | FROM amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d
5 |
6 | USER root
7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all
8 |
9 | # Set a non-root user
10 | ARG USERNAME=lambda
11 | ARG USER_UID=1000
12 | ARG USER_GID=$USER_UID
13 |
14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \
15 | && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \
16 | && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \
17 | && chmod 0440 /etc/sudoers.d/$USERNAME
18 |
19 | WORKDIR /var/task
20 |
21 | COPY requirements.txt /var/task
22 | COPY summarization.py /var/task
23 |
24 | RUN chown -R ${USERNAME}:${USERNAME} /var/task && \
25 | chmod 755 /var/task/summarization.py /var/task/requirements.txt
26 |
27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt
28 |
29 | USER ${USERNAME}
30 |
31 | CMD ["summarization.handler"]
32 |
--------------------------------------------------------------------------------
/business_logic/lambdas/summarization/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
--------------------------------------------------------------------------------
/business_logic/lambdas/summarization/summarization.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import boto3
4 | from datetime import datetime
5 | from collections import Counter
6 |
7 | bedrock_client = boto3.client("bedrock-runtime")
8 | dynamodb = boto3.resource("dynamodb")
9 | model_id = os.environ["MODEL_ID"]
10 | table_name = os.environ["DYNAMODB_TABLE_NAME"]
11 |
12 |
13 | def generate_average_cluster_data(articles):
14 | # Initialize counters and variables for tracking
15 | location_counter = Counter()
16 | organization_counter = Counter()
17 | earliest_date = datetime.max
18 | latest_date = datetime.min
19 |
20 | # Check if articles list is empty
21 | if not articles:
22 | return {
23 | "most_common_location": "",
24 | "most_common_organization": "",
25 | "earliest_date": "",
26 | "latest_date": "",
27 | }
28 |
29 | # Process each article
30 | for article in articles:
31 | publication_date = None
32 | if article.get("publication_date"):
33 | publication_date = datetime.fromisoformat(
34 | article.get("publication_date").rstrip("Z")
35 | )
36 | location_counter.update(article.get("locations"))
37 | organization_counter.update(article.get("organizations"))
38 |
39 | if publication_date and publication_date < earliest_date:
40 | earliest_date = publication_date
41 | if publication_date and publication_date > latest_date:
42 | latest_date = publication_date
43 |
44 | # Handle case where no locations or organizations were found
45 | if location_counter:
46 | most_common_location, _ = location_counter.most_common(1)[0]
47 | else:
48 | most_common_location = ""
49 |
50 | if organization_counter:
51 | most_common_organization, _ = organization_counter.most_common(1)[0]
52 | else:
53 | most_common_organization = ""
54 |
55 | # Adjusted return to include a check for the date range
56 | return {
57 | "most_common_location": most_common_location,
58 | "most_common_organization": most_common_organization,
59 | "earliest_date": earliest_date.strftime("%Y-%m-%d %H:%M:%S"),
60 | "latest_date": latest_date.strftime("%Y-%m-%d %H:%M:%S"),
61 | }
62 |
63 |
64 | def get_cluster_data(cluster_id):
65 | # Initialize a DynamoDB client
66 | table = dynamodb.Table(table_name)
67 |
68 | # Query the table
69 | response = table.query(
70 | KeyConditionExpression=boto3.dynamodb.conditions.Key("PK").eq(cluster_id),
71 | )
72 | cluster_data = response.get("Items", [])
73 |
74 | # Extract the first item
75 | metadata = cluster_data[0]
76 | articles = cluster_data[1:]
77 | summary_count = metadata.get("summary_count", 0)
78 |
79 | return metadata.get("generated_summary", ""), summary_count, articles
80 |
81 |
82 | def generate_bedrock_claude(input_tokens):
83 | claude_body = {
84 | "modelId": model_id,
85 | "body": json.dumps(
86 | {
87 | "anthropic_version": "bedrock-2023-05-31",
88 | "messages": [{"role": "user", "content": input_tokens}],
89 | "max_tokens": 500, # the higher this is the longer it takes
90 | "temperature": 0.1, # these parameters affect response diversity
91 | "top_p": 1,
92 | "top_k": 100,
93 | }
94 | ),
95 | }
96 | bedrock_response = bedrock_client.invoke_model(
97 | **claude_body,
98 | accept="*/*",
99 | contentType="application/json",
100 | )
101 | body = bedrock_response.get("body")
102 | rd = body.read()
103 | body_json = json.loads(rd)
104 | try:
105 | response = body_json["content"][0].get("text")
106 | output_token_cnt = int(
107 | bedrock_response["ResponseMetadata"]["HTTPHeaders"].get(
108 | "x-amzn-bedrock-output-token-count"
109 | )
110 | )
111 | input_token_cnt = int(
112 | bedrock_response["ResponseMetadata"]["HTTPHeaders"].get(
113 | "x-amzn-bedrock-input-token-count"
114 | )
115 | )
116 | except Exception:
117 | print(rd)
118 | return input_token_cnt, output_token_cnt, response
119 |
120 |
121 | def parse_res(res):
122 | try:
123 | title = res.split("
")[-1].split("")[0]
124 | summary = res.split("")[-1].split("")[0]
125 | return title, summary
126 | except Exception:
127 | return "", res
128 |
129 |
130 | def generate_cluster_summary(previous_summary, articles, limit):
131 | input_context = []
132 | # If we've done summaries before we'll limit the input tokens for each summary
133 | limit_number = 2000
134 | if limit:
135 | limit_number = 1500
136 |     instructions = "You will be provided with multiple sets of titles and summaries from different articles in the <articles> tag, and the current title and summary for a story in the <story> tag. Compile, summarize and update the current title and summary for the story. The summary should be less than 100 words. Put the generated content inside <title></title> and <summary></summary> tags. Do not hallucinate or make up content.\n\n"
137 | texts = "\n".join(
138 | [
139 | f"title: {article.get('title')}, summary: {article.get('summary', "")[:limit_number]}"
140 | for article in articles
141 | ]
142 | )
143 |     prompt = f"{instructions} <story>\n{previous_summary}\n</story>\n\n<articles>\n{texts}\n</articles>\n\n"
144 | print("Prompt Length:", len(prompt))
145 | input_context.append(prompt)
146 | output = generate_bedrock_claude(prompt[:12000])
147 | title, summary = parse_res(output[2])
148 |
149 | return {"title": title, "summary": summary}
150 |
151 |
152 | """
153 | Event Expected in following format
154 | {
155 | cluster_id: "198be4aa-95e8-4d8e-9e0b-a37eef6c29e2"
156 | }
157 | """
158 |
159 |
160 | def handler(event, context):
161 | print("Input Event", event)
162 |
163 | previous_summary, summary_count, articles = get_cluster_data(event["cluster_id"])
164 | generated_summary = generate_cluster_summary(previous_summary, articles, summary_count > 0)
165 | averages = generate_average_cluster_data(articles)
166 |
167 | print("Generated Summary", generated_summary)
168 | print("Averages", averages)
169 |
170 | return {**generated_summary, **averages, "summary_count": summary_count + 1}
171 |
--------------------------------------------------------------------------------
/business_logic/lambdas/trigger_sfn/Dockerfile:
--------------------------------------------------------------------------------
1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images
2 | #checkov:skip=CKV2_DOCKER_1: Ensure that sudo isn't used
3 |
4 | FROM amazon/aws-lambda-python:3.12@sha256:a108241bf16fab9559420cbd64d8a608d175f56551ae35bc304c5dcf55f0ec0d
5 |
6 | USER root
7 | RUN dnf update -y && dnf install shadow-utils sudo util-linux -y && dnf clean all
8 |
9 | # Set a non-root user
10 | ARG USERNAME=lambda
11 | ARG USER_UID=1000
12 | ARG USER_GID=$USER_UID
13 |
14 | RUN /usr/sbin/groupadd --gid $USER_GID $USERNAME \
15 | && /usr/sbin/useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} \
16 | && echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers.d/$USERNAME \
17 | && chmod 0440 /etc/sudoers.d/$USERNAME
18 |
19 | WORKDIR /var/task
20 |
21 | COPY requirements.txt /var/task
22 | COPY trigger_sfn.py /var/task
23 |
24 | RUN chown -R ${USERNAME}:${USERNAME} /var/task && \
25 | chmod 755 /var/task/trigger_sfn.py /var/task/requirements.txt
26 |
27 | RUN pip install --no-cache-dir -r /var/task/requirements.txt
28 |
29 | USER ${USERNAME}
30 |
31 | CMD ["trigger_sfn.handler"]
32 |
--------------------------------------------------------------------------------
/business_logic/lambdas/trigger_sfn/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
--------------------------------------------------------------------------------
/business_logic/lambdas/trigger_sfn/trigger_sfn.py:
--------------------------------------------------------------------------------
1 | import json
2 | import boto3
3 | import os
4 |
5 |
6 | def handler(event, context):
7 | # Initialize the DynamoDB and Step Functions clients
8 | dynamodb_client = boto3.client("dynamodb")
9 | sfn_client = boto3.client("stepfunctions")
10 |
11 | # State Machine ARN and the threshold for number_of_articles from environment variables
12 | state_machine_arn = os.environ["STATE_MACHINE_ARN"]
13 | articles_threshold = int(os.environ["ARTICLES_THRESHOLD"])
14 | article_cap = 3 # A multiple of articles_threshold, to stop processing summaries
15 | # DynamoDB table name
16 | table_name = os.environ["DYNAMODB_TABLE_NAME"]
17 |
18 | # Process each record in the DynamoDB Stream
19 | for record in event[
20 | "Records"
21 |     ]:  # ! ToDo aggregate records and send them in batch instead of one at a time
22 | if record["eventName"] == "INSERT":
23 | new_image = record["dynamodb"].get("NewImage", {})
24 | print("New Record")
25 | if "type" in new_image and new_image["type"]["S"] == "article":
26 | print("Record is an Article")
27 |
28 | # Extract primary key (PK)
29 | pk_value = new_image["PK"]["S"]
30 | metadata_key = f"#METADATA#{pk_value}"
31 | print("PK is", pk_value)
32 |
33 | # Get the item with PK and #METADATA#[PK] sort key
34 | response = dynamodb_client.get_item(
35 | TableName=table_name,
36 | Key={"PK": {"S": pk_value}, "SK": {"S": metadata_key}},
37 | )
38 | item = response.get("Item", {})
39 | print("Cluster: ", item)
40 |
41 | # If we get an empty item with no articles move to the next record
42 | if "number_of_articles" not in item:
43 | continue
44 |
45 | summary_count = int(item.get("summary_count", {"N": "0"})["N"])
46 | lower_limit_flag = int(item["number_of_articles"]["N"]) > articles_threshold * (summary_count + 1)
47 |                 upper_limit_flag = int(item["number_of_articles"]["N"]) < article_cap * articles_threshold
48 |
49 | print("Summary Count:", summary_count)
50 | print("Lower Limit Flag:", lower_limit_flag)
51 | print("Upper Limit Flag:", upper_limit_flag)
52 | print("Overall flag:", (lower_limit_flag and upper_limit_flag) or (lower_limit_flag and summary_count == 0))
53 |
54 | # Check if number_of_articles is within a range or if it is outside the upper limit but still hasn't been summarized
55 | if (lower_limit_flag and upper_limit_flag) or (lower_limit_flag and summary_count == 0):
56 | # Prepare data for Step Functions
57 | input_data = {
58 | "cluster_id": pk_value,
59 | }
60 |
61 | # Start execution of the state machine
62 | response = sfn_client.start_execution(
63 | stateMachineArn=state_machine_arn, input=json.dumps(input_data)
64 | )
65 |
66 | print(
67 | f"Started Step Functions execution for 'article' record: {response['executionArn']}"
68 | )
69 | else:
70 | print(
71 | "Not enough articles in the cluster yet, less than ",
72 | articles_threshold,
73 | )
74 |
75 | return {
76 | "statusCode": 200,
77 | "body": json.dumps(
78 | 'Processed DynamoDB stream records of type "article" with sufficient count.'
79 | ),
80 | }
81 |
--------------------------------------------------------------------------------
/business_logic/model_artifacts/embedding/model/code/inference.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import time
5 | import torch.multiprocessing as mp
6 |
7 | mp.set_start_method("spawn", force=True)
8 |
9 | MODEL_NAME = os.environ.get("MODEL_NAME")
10 | BIT_LOADING = os.environ.get("BIT_LOADING")
11 | print(f"MODEL_NAME: {MODEL_NAME}")
12 | print(f"BIT_LOADING: {BIT_LOADING}")
13 | MODEL_MAP = {
14 | "mistralinstruct": None,
15 | "bge": None,
16 | }
17 |
18 |
19 | print("Current working directory: ", os.getcwd())
20 | print("List current working directory: ", os.listdir(os.getcwd()))
21 |
22 |
23 | def model_fn(model_dir):
24 | try:
25 | print(f"In model_fn, model_dir={model_dir}")
26 | print(f"CWD: {os.getcwd()}")
27 | print(f"List CWD: {os.listdir(os.getcwd())}")
28 | print(f"List model_dir: {os.listdir(model_dir)}")
29 | sys.path.append(model_dir + "/model")
30 | print(f"Sys path: {sys.path}")
31 | print(f"List model_dir/model: {os.listdir(model_dir+'/model')}")
32 | print(f"List model_dir/code: {os.listdir(model_dir+'/code')}")
33 |
34 | from embed_documents import EmbedDocuments
35 |
36 | print("Successfully imported EmbedDocuments")
37 | model_cls = EmbedDocuments(MODEL_NAME, MODEL_MAP[MODEL_NAME], BIT_LOADING)
38 |
39 | except Exception as e:
40 | print(f"WEIRD, error: {e}")
41 | return model_cls
42 |
43 |
44 | def input_fn(input_data, content_type="application/json"):
45 | """A default input_fn that can handle JSON, CSV and NPZ formats.
46 |
47 | Args:
48 | input_data: the request payload serialized in the content_type format
49 | content_type: the request content_type
50 |
51 |     Returns: the list of input texts extracted from the JSON payload.
52 | """
53 | print(f"input_fn, input_data={input_data}, content_type={content_type}")
54 | # Process the input data (e.g., convert from JSON)
55 | print("input_fn")
56 | print("request body: ", input_data)
57 | if content_type == "application/json":
58 | print("request_content_type is application/json")
59 | data = json.loads(input_data)
60 | texts = data["input_texts"]
61 | return texts
62 | else:
63 | raise ValueError(f"Unsupported content type: {content_type}")
64 |
65 |
66 | def predict_fn(data, model):
67 | """A default predict_fn for PyTorch. Calls a model on data deserialized in input_fn.
68 | Runs prediction on GPU if cuda is available.
69 |
70 | Args:
71 |         data: the list of input texts deserialized by input_fn
72 | model: PyTorch model loaded in memory by model_fn
73 |
74 | Returns: a prediction
75 | """
76 | print(f"predict_fn, data={data}, model={model}")
77 | start_time = time.time()
78 | new_doc = model.model_handler.encode(data)
79 | end_time = time.time()
80 | new_data = {"embeddings": new_doc, "time": end_time - start_time}
81 | return new_data
82 |
83 |
84 | def output_fn(prediction, content_type="application/json"):
85 | """A default output_fn for PyTorch. Serializes predictions from predict_fn to JSON, CSV or NPY format.
86 |
87 | Args:
88 | prediction: a prediction result from predict_fn
89 |         content_type: the content type to which the output is serialized
90 |
91 | Returns: output data serialized
92 | """
93 | print(f"output_fn, prediction={prediction}, content_type={content_type}")
94 | if content_type == "application/json":
95 | print("content_type is application/json")
96 | return json.dumps(prediction)
97 | else:
98 | raise ValueError(f"Unsupported content type: {content_type}")
99 |
--------------------------------------------------------------------------------
/business_logic/model_artifacts/embedding/model/code/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate
2 | bitsandbytes
3 | spacy
4 | torch
5 | transformers
--------------------------------------------------------------------------------
/business_logic/model_artifacts/embedding/model/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/business_logic/model_artifacts/embedding/model/model/__init__.py
--------------------------------------------------------------------------------
/business_logic/model_artifacts/embedding/model/model/embed_documents.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing import Literal
3 | import importlib
4 | from embedding_model_utils import PretrainedHandler
5 |
6 | print("END EMBED_DOCUMENTS IMPORTS")
7 |
8 |
9 | class EmbedDocuments:
10 | def __init__(
11 | self,
12 | model_name: Literal["bge", "mistralinstruct"],
13 | pretrained_path=None,
14 | bit_loading=None,
15 | device=None,
16 | model_handler_module: str = "embedding_model_utils",
17 | ):
18 |
19 | self.supported_models = dict(
20 | bge="PretrainedBGELarge",
21 | mistralinstruct="PretrainedMistral7bInstruct",
22 | )
23 |
24 | self.model_name = model_name.lower().strip()
25 | assert (
26 |             self.model_name in self.supported_models
27 | ), f"model_name is not supported. Choose from {list(self.supported_models.keys())}"
28 |
29 | self.bit_loading = bit_loading
30 | self.model_handler: PretrainedHandler = None
31 |
32 | if device is None:
33 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34 | else:
35 | self.device = device
36 |
37 | self.models_module = importlib.import_module(model_handler_module)
38 | self.load_model(pretrained_path=pretrained_path)
39 |
40 | def load_model(self, pretrained_path=None):
41 | model_class_name = self.supported_models[self.model_name]
42 |
43 | if hasattr(self.models_module, model_class_name):
44 | model_class = getattr(self.models_module, model_class_name)
45 | else:
46 | raise NotImplementedError(
47 | "Model loading method does not exist. Check for typos or implement"
48 | )
49 |
50 | self.model_handler = model_class(
51 | pretrained_path=pretrained_path, bit_loading=self.bit_loading
52 | )
53 |
54 | assert self.model_handler is not None
55 |
56 | def delete_model(self):
57 | self.model_handler.model.to("cpu")
58 | del self.model_handler.model
59 | torch.cuda.empty_cache()
60 |
--------------------------------------------------------------------------------
/business_logic/model_artifacts/embedding/model/model/embedding_model_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from torch import Tensor
4 | from transformers import AutoTokenizer, AutoModel
5 | from typing import List
6 | from abc import ABC, abstractmethod
7 |
8 |
9 | class PretrainedHandler(ABC):
10 | def __init__(self, pretrained_path=None, bit_loading=None, device=None):
11 | self.model = None
12 | self.tokenizer = None
13 |
14 | if device is None:
15 | self.device = "cuda" if torch.cuda.is_available() else "cpu"
16 | else:
17 | assert device in set(
18 | ["cuda", "cpu"]
19 | ), "Incorrect device chosen. Choose from [cuda, cpu]"
20 | self.device = device
21 |
22 | self.bit_loading = bit_loading
23 | self.get_model(pretrained_path=pretrained_path)
24 |
25 | @abstractmethod
26 | def get_model(self, pretrained_path=None) -> None:
27 | """
28 | Instantiates self.model and self.tokenizer
29 | """
30 | raise NotImplementedError
31 |
32 | def encode(self, texts: List[str]):
33 | """encode texts"""
34 | return self._encode()(texts)
35 |
36 | def _encode(self):
37 | """return the encoding method for the target model
38 | Can differ between models (e.g. model.encode, model, model.forward)"""
39 | return self.model.encode
40 |
41 |
42 | class PretrainedMistral7bInstruct(PretrainedHandler):
43 |
44 | @classmethod
45 | def last_token_pool(
46 | cls, last_hidden_states: Tensor, attention_mask: Tensor
47 | ) -> Tensor:
48 | left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
49 | if left_padding:
50 | return last_hidden_states[:, -1]
51 | else:
52 | sequence_lengths = attention_mask.sum(dim=1) - 1
53 | batch_size = last_hidden_states.shape[0]
54 | return last_hidden_states[
55 | torch.arange(batch_size, device=last_hidden_states.device),
56 | sequence_lengths,
57 | ]
58 |
59 | @classmethod
60 | def get_detailed_instruct(cls, task_description: str, query: str) -> str:
61 | return f"Instruct: {task_description}\nNewsPassage: {query}"
62 |
63 | def get_model(self, pretrained_path=None):
64 |
65 | model_source = (
66 | "intfloat/e5-mistral-7b-instruct"
67 | if pretrained_path is None
68 | else pretrained_path
69 | )
70 |
71 | # Each query must come with a one-sentence instruction that describes the task
72 | # Example
73 | # task = 'Given a web search query, retrieve relevant passages that answer the query'
74 | # input_texts = [self.get_detailed_instruct(task, 'how much protein should a female eat'),
75 | # self.get_detailed_instruct(task, 'summit define'),
76 | # "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
77 | # "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments."]
78 | self.tokenizer = AutoTokenizer.from_pretrained(
79 | pretrained_model_name_or_path=model_source
80 | )
81 |
82 | assert (
83 | torch.cuda.is_available()
84 | ), "GPU is needed to load model in 4-bit or 8-bit"
85 |
86 | if self.bit_loading == "4":
87 | print("loading in 4bit")
88 |
89 | self.model = AutoModel.from_pretrained(
90 | pretrained_model_name_or_path=model_source,
91 | load_in_4bit=True,
92 | bnb_4bit_compute_dtype=torch.float16,
93 | device_map=self.device,
94 | )
95 | else:
96 | print("loading in 8bit")
97 | self.model = AutoModel.from_pretrained(
98 | pretrained_model_name_or_path=model_source, load_in_8bit=True
99 | )
100 |
101 | self.model.eval()
102 |
103 | def encode(self, texts: List[str]):
104 | max_length = 4096
105 |
106 | task = "Given this news passage, retrieve relevant news passages that pertain to the same event (who, what, where, when)"
107 | texts = [self.get_detailed_instruct(task, text) for text in texts]
108 |
109 | # Tokenize the input texts
110 | batch_dict = self.tokenizer(
111 | texts,
112 | max_length=max_length - 1,
113 | return_attention_mask=False,
114 | padding=False,
115 | truncation=True,
116 | )
117 |
118 | # append eos_token_id to every input_ids
119 | batch_dict["input_ids"] = [
120 | input_ids + [self.tokenizer.eos_token_id]
121 | for input_ids in batch_dict["input_ids"]
122 | ]
123 | batch_dict = self.tokenizer.pad(
124 | batch_dict, padding=True, return_attention_mask=True, return_tensors="pt"
125 | )
126 |
127 | return self._encode(encoded_input=batch_dict)
128 |
129 | def _encode(self, encoded_input=None):
130 | with torch.no_grad():
131 | outputs = self.model(**encoded_input)
132 |
133 | embeddings = self.last_token_pool(
134 | outputs.last_hidden_state, encoded_input["attention_mask"]
135 | )
136 |
137 | # normalize embeddings
138 | embeddings = F.normalize(embeddings, p=2, dim=1)
139 |
140 | embeddings = embeddings.to("cpu").tolist()
141 |
142 | return embeddings
143 |
144 |
145 | class PretrainedBGELarge(PretrainedHandler):
146 |
147 | def get_model(self, pretrained_path=None):
148 |
149 | model_source = (
150 | "BAAI/bge-large-zh-v1.5" if pretrained_path is None else pretrained_path
151 | )
152 |
153 | # Load model from HuggingFace Hub
154 | tokenizer = AutoTokenizer.from_pretrained(model_source)
155 | model = AutoModel.from_pretrained(model_source)
156 | model.eval()
157 |
158 | self.model = model
159 | self.tokenizer = tokenizer
160 |
161 | model.to(self.device)
162 |
163 | def encode(self, texts: List[str]):
164 |
165 |         # # Tokenize sentences
166 | # encoded_input = self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
167 |
168 | # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
169 | instruction = "Embed this passage for clustering on the topic of discussion in the news article: "
170 | encoded_input = self.tokenizer(
171 | [instruction + t for t in texts],
172 | padding=True,
173 | truncation=True,
174 | max_length=512,
175 | return_tensors="pt",
176 | )
177 |
178 | encoded_input.to(self.device)
179 |
180 | return self._encode()(encoded_input)
181 |
182 | def _encode(self):
183 | def forward(encoded_input):
184 | # Compute token embeddings
185 | with torch.no_grad():
186 | model_output = self.model(**encoded_input)
187 | # Perform pooling. In this case, cls pooling.
188 | sentence_embeddings = model_output[0][:, 0]
189 |
190 | # normalize embeddings
191 | sentence_embeddings = (
192 | torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
193 | .to("cpu")
194 | .tolist()
195 | )
196 |
197 | return sentence_embeddings
198 |
199 | return forward
200 |
--------------------------------------------------------------------------------
/business_logic/model_artifacts/multi_gpu_embedding/model/code/inference.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
2 | import torch
3 | import torch.nn.functional as F
4 | from torch import Tensor
5 | from transformers import AutoTokenizer, AutoModel
6 | import traceback
7 | from accelerate import Accelerator
8 |
9 | accelerate = Accelerator()
10 |
11 |
12 | def model_fn(model_dir, context):
13 |
14 |     model = None  # load tokenizer and model from model_dir; keeps the return below from raising NameError if loading fails
15 | try:
16 | device = f"cuda:{context._system_properties['gpu_id']}"
17 | print(f"LOADING MODEL onto: {device}")
18 | model = AutoModel.from_pretrained(
19 | model_dir,
20 | quantization_config=BitsAndBytesConfig(load_in_8bit=True),
21 | device_map=device,
22 | )
23 | model.eval()
24 |
25 | except Exception as e:
26 | print("FAILED: LOADING MODEL")
27 | print(e)
28 | print(traceback.format_exc())
29 |
30 | tokenizer = AutoTokenizer.from_pretrained(model_dir)
31 |
32 | return tokenizer, model
33 |
34 |
35 | def predict_fn(data, tokenizer_and_model):
36 | torch.cuda.empty_cache()
37 |
38 | # unpack tokenizer and model
39 | tokenizer, model = tokenizer_and_model
40 |
41 | # Grab the data
42 | texts = data.pop("input_texts")
43 | max_length = data.pop("max_length")
44 |
45 | def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
46 |
47 | left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
48 | if left_padding:
49 | return last_hidden_states[:, -1]
50 | else:
51 | sequence_lengths = attention_mask.sum(dim=1) - 1
52 | batch_size = last_hidden_states.shape[0]
53 | return last_hidden_states[
54 | torch.arange(batch_size, device=last_hidden_states.device),
55 | sequence_lengths,
56 | ]
57 |
58 | def get_detailed_instruct(task_description: str, query: str) -> str:
59 | return f"Instruct: {task_description}\nQuery: {query}"
60 |
61 | print("PROCESSING texts")
62 | task = "Given this news passage, retrieve relevant news passages that pertain to the same event (who, what, where, when)"
63 | texts = [get_detailed_instruct(task, text) for text in texts]
64 |
65 | # Tokenize the input texts
66 | batch_dict = tokenizer(
67 | texts,
68 | max_length=max_length - 1,
69 | return_attention_mask=False,
70 | padding=False,
71 | truncation=True,
72 | )
73 |
74 | print("TOKENIZED texts")
75 | # append eos_token_id to every input_ids
76 | batch_dict["input_ids"] = [
77 | input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict["input_ids"]
78 | ]
79 | batch_dict = tokenizer.pad(
80 | batch_dict, padding=True, return_attention_mask=True, return_tensors="pt"
81 | )
82 |
83 | try:
84 | print("FORWARD PASS")
85 | with torch.no_grad():
86 | outputs = model(**batch_dict)
87 |
88 | print("GET EMBEDDINGS")
89 | embeddings = last_token_pool(
90 | outputs.last_hidden_state, batch_dict["attention_mask"]
91 | )
92 |
93 | # normalize embeddings
94 | embeddings = F.normalize(embeddings.to(torch.float32), p=2, dim=1)
95 |
96 | embeddings = embeddings.to("cpu").tolist()
97 | except Exception as e:
98 | print("FORWARD ERROR")
99 | print(traceback.format_exc())
100 | print(e)
101 | embeddings = [None for _ in range(len(texts))]
102 |
103 | del batch_dict
104 |
105 | return {"embeddings": embeddings}
106 |
--------------------------------------------------------------------------------
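
Unlike the single-GPU handler, this variant relies on the default SageMaker PyTorch (de)serializers, and its `predict_fn` expects both `input_texts` and `max_length` in the request body. A minimal payload sketch (the endpoint name is a hypothetical placeholder):

```python
import json

import boto3

runtime = boto3.client("sagemaker-runtime")

body = {"input_texts": ["Example news passage."], "max_length": 4096}

response = runtime.invoke_endpoint(
    EndpointName="multi-gpu-embedding-endpoint",  # hypothetical endpoint name
    ContentType="application/json",
    Body=json.dumps(body),
)
print(json.loads(response["Body"].read())["embeddings"])
```
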
/business_logic/model_artifacts/multi_gpu_embedding/model/code/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.27.2
2 | transformers==4.48.0
3 | bitsandbytes==0.42.0
4 | --extra-index-url https://download.pytorch.org/whl/cu118
5 | torch==2.2.1
6 | huggingface-hub
--------------------------------------------------------------------------------
/business_logic/stream_consumer/clustering.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | from sklearn.neighbors import sort_graph_by_row_values
4 | from scipy.sparse import csr_matrix, tril
5 | from joblib import Parallel, delayed
6 | import functools
7 |
8 |
9 | def timer(func):
10 | @functools.wraps(func)
11 | def wrapper(*args, **kwargs):
12 | start = time.time()
13 | result = func(*args, **kwargs)
14 | end = time.time()
15 | print(f"{func.__name__}\t{end - start:f}")
16 | return result
17 |
18 | return wrapper
19 |
20 |
21 | def sort_row(data_slice, indices_slice):
22 | order = np.argsort(data_slice, kind="mergesort")
23 | return data_slice[order], indices_slice[order]
24 |
25 |
26 | def parallel_sort_rows(graph):
27 | # Get the slices of data and indices
28 | data_slices = [
29 | graph.data[start:stop]
30 | for start, stop in zip(graph.indptr[:-1], graph.indptr[1:])
31 | ]
32 | indices_slices = [
33 | graph.indices[start:stop]
34 | for start, stop in zip(graph.indptr[:-1], graph.indptr[1:])
35 | ]
36 |
37 | # Sort each slice in parallel
38 | sorted_slices = Parallel(n_jobs=-1)(
39 | delayed(sort_row)(data_slice, indices_slice)
40 | for data_slice, indices_slice in zip(data_slices, indices_slices)
41 | )
42 |
43 | # Update the graph with sorted slices
44 | for (start, stop), (sorted_data, sorted_indices) in zip(
45 | zip(graph.indptr[:-1], graph.indptr[1:]), sorted_slices
46 | ):
47 | graph.data[start:stop] = sorted_data
48 | graph.indices[start:stop] = sorted_indices
49 |
50 | return graph
51 |
52 |
53 | def batch_update_numpy_distance_matrix(new_embeds, cluster_pool, batch_size=120):
54 |
55 | # Convert the vectors to NumPy arrays
56 | vectors_numpy = np.array(new_embeds)
57 | cluster_pool_numpy = np.array(cluster_pool)
58 | norms = np.linalg.norm(vectors_numpy, axis=1, keepdims=True) # L2 Norm
59 | normalized_vectors = vectors_numpy / norms # Unit vectors
60 | norms = np.linalg.norm(cluster_pool_numpy, axis=1, keepdims=True) # L2 Norm
61 | normalized_pool = cluster_pool_numpy / norms
62 |
63 | # Initialize an empty similarity matrix
64 | distance_matrix = np.zeros(
65 | (len(vectors_numpy), len(cluster_pool_numpy)), dtype=np.float16
66 | )
67 |
68 | # Iterate through the vectors in batches
69 | for start in range(0, len(cluster_pool_numpy), batch_size):
70 | end = min(start + batch_size, len(cluster_pool_numpy))
71 | batch_cluster_pool = normalized_pool[start:end]
72 |
73 | # Compute cosine similarity for the batch
74 | similarity_batch = np.dot(normalized_vectors, batch_cluster_pool.T)
75 |
76 | # Convert similarity to distance
77 | distance_batch = 1 - similarity_batch
78 |
79 | # Fill in the corresponding section of the distance matrix
80 | distance_matrix[:, start:end] = distance_batch
81 |
82 | # Clip values to prevent numerical issues that might result in values slightly outside [0, 1]
83 | distance_matrix = np.clip(distance_matrix, 0, 1)
84 |
85 | return distance_matrix
86 |
87 |
88 | def get_sparse_distance_matrix(dense, n_priors):
89 |
90 | values = dense.flatten().astype(np.float32)
91 |
92 | row_indices = [*range(0, dense.shape[1])] * dense.shape[0]
93 |
94 | column_pointers = [0] * (n_priors + 1) + [
95 | *range(dense.shape[1], dense.shape[0] * (dense.shape[1] + 1), dense.shape[1])
96 | ]
97 |
98 | sparse_matrix = csr_matrix(
99 | (values, row_indices, column_pointers), shape=(dense.shape[1], dense.shape[1])
100 | )
101 | sparse_matrix = make_symmetric(sparse_matrix=sparse_matrix)
102 |
103 | if n_priors < 15000:
104 | res = sort_graph_by_row_values(
105 | sparse_matrix, copy=True, warn_when_not_sorted=False
106 | )
107 | else:
108 | res = parallel_sort_rows(sparse_matrix)
109 |
110 | return res
111 |
112 |
113 | def make_symmetric(sparse_matrix):
114 |
115 | low_tri = tril(sparse_matrix, k=0)
116 | symmetric_matrix = low_tri + tril(low_tri, k=-1).T
117 |
118 | return symmetric_matrix
119 |
120 |
121 | def prep_for_streaming(documents, interval=40):
122 |
123 | # split for streaming
124 | doc_splits = {}
125 |
126 | aug_records = documents
127 | estimated_time = len(aug_records) / interval
128 | for j, i in enumerate(range(0, len(aug_records), interval)):
129 | doc_splits[j] = aug_records[i : i + interval]
130 |
131 | return doc_splits, estimated_time
132 |
--------------------------------------------------------------------------------
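
These helpers produce the precomputed distances that `process_records.cluster` feeds to DBSCAN: `batch_update_numpy_distance_matrix` returns an M x N matrix of `1 - cosine similarity` between the M newly arrived embeddings and all N embeddings in the pool (old plus new), and `get_sparse_distance_matrix` expands that into a symmetric N x N sparse matrix. A minimal sketch with random vectors, assuming it is run next to `clustering.py`:

```python
import numpy as np

from clustering import batch_update_numpy_distance_matrix, get_sparse_distance_matrix

rng = np.random.default_rng(0)
pool = rng.normal(size=(8, 4)).tolist()  # all embeddings in the pool (old + new)
new = pool[-3:]                          # pretend the last 3 just arrived

# (3 x 8) matrix of 1 - cosine similarity, clipped to [0, 1]
dense = batch_update_numpy_distance_matrix(new, pool)
print(dense.shape)

# Symmetric (8 x 8) sparse matrix, suitable for DBSCAN(metric="precomputed")
sparse = get_sparse_distance_matrix(dense, n_priors=len(pool) - len(new))
print(sparse.shape)
```
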
/business_logic/stream_consumer/process_records.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | import json
3 | import botocore
4 | from clustering import (
5 | batch_update_numpy_distance_matrix,
6 | get_sparse_distance_matrix,
7 | )
8 | import numpy as np
9 | import time
10 | import boto3
11 | from sklearn.cluster import DBSCAN
12 | import uuid
13 | from datetime import datetime
14 | import ast # Use Abstract Syntax Trees module to safely evaluate string representation of dictionaries
15 | import functools
16 | import os
17 | import pickle
18 | import threading
19 | import copy
20 | from botocore.exceptions import ClientError
21 |
22 | # Initialize AWS clients
23 | s3 = boto3.client("s3")
24 | ssm = boto3.client("ssm")
25 | dynamodb = boto3.resource("dynamodb")
26 | sqs = boto3.client("sqs")
27 |
28 | # Configuration variables
29 | S3_BUCKET_NAME = os.environ["S3_BUCKET_NAME"]
30 | S3_FILE_KEY = os.environ["S3_FILE_KEY"]
31 | SQS_QUEUE = os.environ["SQS_QUEUE"]
32 | DYNAMODB_TABLE = os.environ["DYNAMODB_TABLE"]
33 |
34 | # Setup for clustering
35 | label_tracker: List[tuple] = []
36 | is_cluster: List[bool] = []
37 | embeds: Optional[List] = None
38 |
39 | distance_matrix = None
40 |
41 | unique_article_id = 0
42 | unique_cluster_id = 0
43 | cluster_count = 0
44 |
45 | # Stream
46 | batch_times = []
47 | processed_pool_sizes = []
48 | incoming_articles = []
49 |
50 |
51 | def timer(func):
52 | @functools.wraps(func)
53 | def wrapper(*args, **kwargs):
54 | start = time.time()
55 | result = func(*args, **kwargs)
56 | end = time.time()
57 | print(f"{func.__name__}\t{end - start:f}")
58 | return result
59 |
60 | return wrapper
61 |
62 |
63 | # Format docs for clustering
64 | @timer
65 | def format_documents(messages):
66 | print("Format Docs")
67 | converted_messages = []
68 | associated_articles = {}
69 | seen_ids = set() # Keep track of seen ids
70 |
71 | for msg in messages:
72 | try:
73 | message_body = json.loads(msg.get("Body", "{}"))
74 | except json.JSONDecodeError:
75 | continue # Skip this message if there's a problem parsing it
76 |
77 | message_id = message_body.get("id")
78 |
79 | # Check for duplicate ids and skip if found
80 | if message_id in seen_ids:
81 | continue
82 | else:
83 | seen_ids.add(message_id)
84 |
85 | # Proceed if id is not a duplicate
86 | embeddings = np.asarray(message_body["concat_embedding"][0])
87 |
88 | converted_messages.append(
89 | {
90 | "id": message_id,
91 | "concat_embedding": embeddings,
92 | }
93 | )
94 | associated_articles[message_id] = message_body
95 |
96 | return converted_messages, associated_articles
97 |
98 |
99 | @timer
100 | def batch_get_meta_data(keys_to_get):
101 | items = [] # List to store the successfully retrieved items
102 | missing_keys = [] # List to store keys of items that were not found
103 | unprocessed_keys = keys_to_get # Start with all keys as unprocessed
104 |
105 | while unprocessed_keys:
106 | # Prepare the current batch request
107 | request = {
108 | "RequestItems": {
109 | DYNAMODB_TABLE: {
110 | "Keys": unprocessed_keys[
111 | :100
112 | ] # DynamoDB limits to 100 items per batch
113 | }
114 | }
115 | }
116 |
117 | # Perform the batch get operation
118 | response = dynamodb.batch_get_item(RequestItems=request["RequestItems"])
119 |
120 | # Add the successfully retrieved items to our results list
121 | items.extend(response["Responses"][DYNAMODB_TABLE])
122 |
123 | # Update unprocessed_keys based on UnprocessedKeys from the response
124 | unprocessed_keys_info = response.get("UnprocessedKeys", {})
125 | unprocessed_keys = unprocessed_keys_info.get(DYNAMODB_TABLE, {}).get("Keys", [])
126 |
127 | # If there are more than 100 unprocessed keys, prepare the next batch
128 | if unprocessed_keys:
129 | unprocessed_keys = unprocessed_keys[100:]
130 |
131 | # Assuming items is the list of items returned from DynamoDB
132 | found_keys = [{"PK": item["PK"], "SK": item["SK"]} for item in items]
133 |
134 | # Assuming keys_to_get is the list of keys you originally requested
135 | requested_keys = keys_to_get # No change needed if keys_to_get already structured as [{'PK': ..., 'SK': ...}, ...]
136 |
137 | # To find missing keys, we'll convert these dictionaries to a comparable format (e.g., string) because dictionaries cannot be directly compared in sets
138 | found_keys_str = set([str(k) for k in found_keys])
139 | requested_keys_str = set([str(k) for k in requested_keys])
140 |
141 | # Identify missing keys by comparing their string representations
142 | missing_keys_str = requested_keys_str - found_keys_str
143 |
144 | # Convert back to dictionaries using ast.literal_eval for safety
145 | missing_keys = [ast.literal_eval(k) for k in missing_keys_str]
146 |
147 | return items, missing_keys
148 |
149 |
150 | def check_for_repeats(strings):
151 | seen = set()
152 | for string in strings:
153 | if string in seen:
154 | return True # Found a repeat
155 | seen.add(string)
156 | return False # No repeats found
157 |
158 |
159 | def find_duplicates(items):
160 | # Track occurrences of (PK, SK) tuples
161 | occurrences = {}
162 | # Track duplicates
163 | duplicates = {}
164 |
165 | for item in items:
166 | pk_sk_tuple = (item["PK"], item["SK"])
167 | if pk_sk_tuple in occurrences:
168 | occurrences[pk_sk_tuple] += 1
169 | duplicates[pk_sk_tuple] = occurrences[pk_sk_tuple]
170 | else:
171 | occurrences[pk_sk_tuple] = 1
172 |
173 | # Check if there are duplicates and throw an error
174 | if duplicates:
175 | duplicate_details = ", ".join(
176 | [f"{duplicate}: {count}" for duplicate, count in duplicates.items()]
177 | )
178 | raise ValueError(f"Duplicates found - {duplicate_details}")
179 |
180 |
181 | @timer
182 | def add_items_to_dynamodb(articles, clusters, associated_articles):
183 | # Get the table
184 | table = dynamodb.Table(DYNAMODB_TABLE)
185 |
186 | keys_to_get = [
187 | {"PK": cluster[0], "SK": f"#METADATA#{cluster[0]}"} for cluster in clusters
188 | ]
189 |
190 | # Convert to the desired dictionary format
191 | cluster_associations = {}
192 |
193 | # Initialize a dictionary to keep track of items to batch write
194 | items_to_batch_write = {}
195 | for item in clusters:
196 | key, article_ids = item
197 | cluster_associations[key] = article_ids
198 |
199 | existing_metadata, missing_keys = batch_get_meta_data(keys_to_get)
200 | print("Missing Keys: ", len(missing_keys))
201 | print("Existing Metadata: ", len(existing_metadata))
202 |
203 | for item in existing_metadata:
204 | pk_sk = (item["PK"], item["SK"])
205 |
206 | # Assume 'NumAttribute' exists, increment it
207 | if "number_of_articles" in item:
208 | item["number_of_articles"] += (
209 | len(cluster_associations[item["PK"]]) - 1
210 | ) # Subtract one for metadata
211 | # Check for duplicates
212 | if pk_sk in items_to_batch_write:
213 | print(f"Duplicate found for existing metadata: {pk_sk}")
214 | items_to_batch_write[pk_sk] = item
215 |
216 | # For unprocessed keys write a new METADATA entry
217 | for key in missing_keys:
218 | pk_sk = (key["PK"], f"#METADATA#{key['PK']}")
219 | item = {
220 | "PK": key["PK"],
221 | "SK": f"#METADATA#{key['PK']}",
222 | "type": "metadata",
223 | "created_at": datetime.now().isoformat(),
224 | "number_of_articles": len(cluster_associations[key["PK"]]) + 1,
225 | "generated_summary": "",
226 | "summary_count": 0,
227 | "description": "",
228 | "is_cluster": True,
229 | } # Partition Key # Sort Key
230 | if pk_sk in items_to_batch_write:
231 | print(f"Duplicate found for new metadata: {pk_sk}")
232 | items_to_batch_write[pk_sk] = item
233 |
234 | for cluster_id, ids in clusters + articles:
235 | for article_id in ids:
236 | pk_sk = (cluster_id, f"ARTICLE#{article_id}")
237 | article = associated_articles.get(article_id)
238 |
239 |             # NOTE: workaround for articles missing from associated_articles; should not be needed once the upstream bug is fixed
240 | if article is not None:
241 | # Define the item to be inserted
242 | item = {
243 | "PK": cluster_id,
244 | "SK": f"ARTICLE#{article_id}",
245 | "type": "article",
246 | "article_id": article_id,
247 | "title": article.get("title"),
248 | "summary": article.get("summary"),
249 | "text": article.get("text"),
250 | "organizations": article.get("organizations_fd"),
251 | "locations": article.get("locations_fd"),
252 | # "article_sentiment": article.get("article_sentiment"),
253 | "publication_date": article.get("publication_date"),
254 | "entry_creation_date": datetime.now().isoformat(),
255 | } # Partition Key # Sort Key
256 | else:
257 | item = {
258 | "PK": cluster_id,
259 | "SK": f"ARTICLE#{article_id}",
260 | "type": "article",
261 | "article_id": article_id,
262 | "entry_creation_date": datetime.now().isoformat(),
263 | } # Partition Key # Sort Key
264 |
265 | # Check for duplicates
266 | if pk_sk in items_to_batch_write:
267 | print(f"Duplicate found for article: {pk_sk}")
268 | items_to_batch_write[pk_sk] = item
269 |
270 | # Write aggregated items to DynamoDB using batch writer
271 | with table.batch_writer() as batch:
272 | for pk_sk, item in items_to_batch_write.items():
273 | batch.put_item(Item=item)
274 |
275 |
276 | def find_string_duplicates(strings):
277 | seen = set()
278 | duplicates = set(string for string in strings if string in seen or seen.add(string))
279 | if duplicates:
280 | raise ValueError(f"Duplicates: {', '.join(duplicates)}")
281 |
282 |
283 | @timer
284 | def cluster(records):
285 |     # Set global variables  # TODO: find a more Pythonic way to manage this state
286 | global label_tracker
287 | global is_cluster
288 | global distance_matrix
289 | global embeds
290 |
291 | global unique_article_id
292 | global unique_cluster_id
293 | global cluster_count
294 | global batch_times
295 | global processed_pool_sizes
296 |
297 | batch_update_distance_matrix = (
298 | batch_update_numpy_distance_matrix # For now we will always use this function
299 | )
300 |
301 | eps = 0.10 # ToDo Parameterize
302 |
303 | print("***\t***")
304 | print(f"Starting eps:\t{eps}")
305 |
306 | # Configure logging
307 | metric = "precomputed"
308 | clustering_args = dict(eps=eps, min_samples=2, metric=metric, n_jobs=-1)
309 |
310 | batch_time = time.time()
311 |
312 | # report cluster pool metrics
313 | processed_pool_size = len(label_tracker)
314 | number_of_singletons = processed_pool_size - cluster_count
315 | print(f"Number of clusters in pool:\t{cluster_count}")
316 | print(f"Number of singletons in pool:\t{number_of_singletons}")
317 |
318 | # add this batch to bookkeeping
319 | processed_pool_sizes.append(processed_pool_size)
320 |
321 | label_tracker.extend(
322 | [(str(uuid.uuid4()), [doc["id"]]) for i, doc in enumerate(records)]
323 | )
324 |
325 | is_cluster.extend([False for _ in range(len(records))])
326 |
327 | # Size of existing cluster_pool.
328 | old_size = len(embeds) if embeds is not None else 0
329 |
330 | # update embedding list
331 | new_embeds = [doc["concat_embedding"] for doc in records]
332 |
333 | if embeds is not None:
334 | embeds.extend(new_embeds)
335 | else:
336 | embeds = new_embeds
337 |
338 | unique_article_id += len(records) # increment by number of samples added
339 |
340 | # get distances from new samples to old samples
341 | # M X [[N], [M]] = M x N+M matrix
342 | # TODO: This implementation vs. Database
343 | # TODO: Thresholding to make it more sparse
344 | add_to_distance_matrix = batch_update_distance_matrix(
345 | np.ascontiguousarray(new_embeds),
346 | cluster_pool=np.ascontiguousarray(embeds),
347 | )
348 |
349 | # Convert (M, N+M) -> (N+M, N+M), make sparse if possible
350 | if distance_matrix is None:
351 | distance_matrix = add_to_distance_matrix
352 | else:
353 | distance_matrix = get_sparse_distance_matrix(
354 | add_to_distance_matrix, old_size if old_size > 0 else None
355 | )
356 |
357 | # Cluster
358 | clusterer = DBSCAN(**clustering_args).fit(distance_matrix)
359 |
360 | # Update clusters and singletons
361 | update_time = time.time()
362 | unique_labels = np.unique(clusterer.labels_)
363 | to_remove = set()
364 |     updated_clusters = []  # (cluster_id, added_article_ids) pairs to write to the database
365 |
366 | # Cluster formation
367 | for label in unique_labels:
368 | if label != -1:
369 | indices = np.nonzero(clusterer.labels_ == label)[0]
370 |
371 | update_idx = indices[0]
372 |
373 | # * Don't need for DB
374 | to_remove.update(
375 | [i for i in indices[1:] if not is_cluster[i]]
376 | ) # keep track of items to remove from all items
377 |
378 | added_articles = [
379 | label_tracker[id_idx][1][0]
380 | for id_idx in indices[1:]
381 | if not is_cluster[id_idx]
382 | ]
383 |
384 | updated_clusters.append((label_tracker[update_idx][0], added_articles))
385 |
386 | # extend first instance with all like labels
387 | label_tracker[update_idx][1].extend(added_articles)
388 |
389 | # rename if not labeled cluster yet
390 | if is_cluster[update_idx] is False:
391 | cluster_count += 1
392 |
393 | unique_cluster_id += 1
394 | is_cluster[update_idx] = True
395 |
396 | # Update embeddings with the mean of all the embeddings in cluster
397 | embeddings_for_this_cluster_label = [embeds[id_idx] for id_idx in indices]
398 |
399 | centroid = np.mean(embeddings_for_this_cluster_label, axis=0)
400 | embeds[update_idx] = centroid.tolist()
401 |
402 | print(f"update_time:\t{time.time() - update_time}")
403 |
404 | # delete indices that were merged
405 | cleanup_time = time.time()
406 | update_label_time = time.time()
407 |
408 | label_tracker = [
409 | label_tracker[i] for i in range(len(label_tracker)) if i not in to_remove
410 | ]
411 |
412 | is_cluster = [is_cluster[i] for i in range(len(is_cluster)) if i not in to_remove]
413 | print(f"Labeling cleanup\t{time.time() - update_label_time:.2f}")
414 |
415 | embed_cleanup = time.time()
416 | embeds = [e for i, e in enumerate(embeds) if i not in to_remove]
417 |
418 | print(f"embed cleanup\t{time.time() - embed_cleanup:.2f}")
419 | print(f"cleanup_time:\t{time.time() - cleanup_time}")
420 |
421 | # Track times
422 | batch_time = time.time() - batch_time
423 | batch_times.append(batch_time)
424 | print(f"Batch time:\t{batch_time}")
425 | print(f"mean batch time:\t{sum(batch_times)/len(batch_times)}")
426 |
427 |     # don't use aggregated variables here; recalculate to double-check accuracy
428 | number_of_clusters = len(np.nonzero(is_cluster)[0])
429 | number_of_singletons = len(np.nonzero(~np.asarray(is_cluster, dtype=bool))[0])
430 | print(f"Number of clusters\t{number_of_clusters}")
431 | print(f"Number of singletons\t{number_of_singletons}")
432 |
433 | number_of_stories_in_saved = sum([len(samples[1]) for samples in label_tracker])
434 | print(f"total_stories_clustered\t{number_of_stories_in_saved}")
435 |
436 | new_entries_articles = [
437 | label_tracker[i]
438 | for i in range(old_size, len(label_tracker))
439 | if is_cluster[i] is False
440 | ]
441 |
442 | total_new_articles = sum([len(a[1]) for a in new_entries_articles])
443 | print("Total New Articles Actual", total_new_articles)
444 | print("Total New Articles Expected", len(new_entries_articles))
445 | return new_entries_articles, updated_clusters
446 |
447 |
448 | @timer
449 | def process_messages(records):
450 | formatted_records, associated_articles = format_documents(records)
451 | new_entries_articles, updated_clusters = cluster(formatted_records)
452 | add_items_to_dynamodb(new_entries_articles, updated_clusters, associated_articles)
453 |
454 |
455 | @timer
456 | def delete_messages_in_batches(messages):
457 | # Split messages into batches of 10 for deletion
458 | batch_size = 10
459 | for i in range(0, len(messages), batch_size):
460 | batch = messages[i : i + batch_size]
461 | entries = [
462 | {"Id": str(index), "ReceiptHandle": msg["ReceiptHandle"]}
463 | for index, msg in enumerate(batch)
464 | ]
465 | sqs.delete_message_batch(QueueUrl=SQS_QUEUE, Entries=entries)
466 | print("Deleted messages from queue")
467 |
468 |
469 | def consume_records(batch_size=20):
470 | global incoming_articles
471 |
472 |     # -----------------------------------------------------------------
473 |     # Poll the SQS queue until batch_size messages have been collected
474 |     # or the queue is currently empty.
475 |     # -----------------------------------------------------------------
476 | all_messages = []
477 | while len(all_messages) < batch_size:
478 |
479 | response = sqs.receive_message(
480 | QueueUrl=SQS_QUEUE,
481 | MaxNumberOfMessages=min(10, int(batch_size - len(all_messages))),
482 | WaitTimeSeconds=0, # Short polling to avoid long waits
483 | )
484 |
485 | messages = response.get("Messages", [])
486 | if not messages:
487 | # print("The queue is empty.")
488 | break
489 |
490 | all_messages.extend(messages)
491 | if len(all_messages) >= batch_size:
492 | break
493 |
494 | incoming_articles.extend(all_messages)
495 |
496 |
497 | @timer
498 | def checkpoint():
499 | global label_tracker
500 | global is_cluster
501 | global distance_matrix
502 | global embeds
503 | global incoming_articles
504 |
505 | data_to_serialize = {
506 | "label_tracker": label_tracker,
507 | "is_cluster": is_cluster,
508 | "embeds": embeds,
509 | }
510 |
511 | serialized_data = pickle.dumps(data_to_serialize)
512 |
513 | # Upload the updated data back to S3 as a checkpoint
514 | s3.put_object(Body=serialized_data, Bucket=S3_BUCKET_NAME, Key=S3_FILE_KEY)
515 | print(f"Updated file uploaded successfully to {S3_BUCKET_NAME}/{S3_FILE_KEY}")
516 |
517 |
518 | @timer
519 | def load_from_checkpoint():
520 | global label_tracker
521 | global is_cluster
522 | global embeds
523 | global distance_matrix
524 | global cluster_count
525 |
526 | try:
527 | # Retrieve the object from S3
528 | s3_response_object = s3.get_object(Bucket=S3_BUCKET_NAME, Key=S3_FILE_KEY)
529 |
530 | # Read the file's content
531 | serialized_data = s3_response_object["Body"].read()
532 | loaded_data = pickle.loads(serialized_data)
533 |
534 | label_tracker = loaded_data["label_tracker"]
535 | is_cluster = loaded_data["is_cluster"]
536 | embeds = loaded_data["embeds"]
537 |         # TODO: decide how best to restore the distance matrix after loading a checkpoint
538 | distance_matrix = "" if embeds is not None and len(embeds) > 0 else None
539 |
540 | print(
541 | "Successfully loaded from checkpoint, cluster pool size: ",
542 | len(label_tracker),
543 | )
544 | number_of_clusters = len(np.nonzero(is_cluster)[0])
545 | number_of_singletons = len(np.nonzero(~np.asarray(is_cluster, dtype=bool))[0])
546 | print(f"Number of clusters\t{number_of_clusters}")
547 | print(f"Number of singletons\t{number_of_singletons}")
548 |
549 | cluster_count = number_of_clusters
550 | except s3.exceptions.NoSuchKey:
551 | print(
552 | f"No existing checkpoint found at {S3_BUCKET_NAME}/{S3_FILE_KEY}. Starting with new data."
553 | )
554 |
555 |
556 | if __name__ == "__main__":
557 |
558 | batch_size = 500
559 | checkpoint_rate = 5 # How many batches before checkpointing
560 | batches_processed = 0
561 | number_of_threads = 50
562 |     number_of_articles = batch_size // number_of_threads
563 | print("Batch Size", batch_size)
564 | print("Checkpoint Rate", checkpoint_rate)
565 |
566 | load_from_checkpoint()
567 |
568 | print(f"Article queue: {len(incoming_articles)}")
569 |
570 | ### Define number of threads
571 | # articles_received = number_of_threads * batch_size
572 | threads = [
573 | threading.Thread(target=lambda: consume_records(number_of_articles))
574 | for _ in range(number_of_threads)
575 | ]
576 | # start all threads
577 | start = time.time()
578 | [t.start() for t in threads]
579 | # collect threads to finish
580 | [t.join() for t in threads]
581 |
582 |     print(f"Articles fetched at startup: {len(incoming_articles)}")
583 | print(f"total time: {time.time() - start:.2f} seconds")
584 |
585 | # Consumer Server
586 | while True:
587 | threads = [
588 | threading.Thread(target=lambda: consume_records(number_of_articles))
589 | for _ in range(number_of_threads)
590 | ]
591 | if batches_processed % checkpoint_rate == 0:
592 | checkpoint_thread = threading.Thread(target=lambda: checkpoint())
593 | threads.append(checkpoint_thread)
594 |
595 | # start all threads
596 | start = time.time()
597 | [t.start() for t in threads]
598 |
599 | if len(incoming_articles) >= batch_size: # Check we have enough articles
600 | process_messages(incoming_articles)
601 | delete_messages_in_batches(incoming_articles)
602 |
603 | batches_processed += 1
604 | incoming_articles = []
605 |
606 | [t.join() for t in threads]
607 |         print(f"Articles currently queued locally: {len(incoming_articles)}")
608 | print(f"TOTAL TIME FOR CLUSTERING BATCH: {time.time() - start:.2f} seconds")
609 |
--------------------------------------------------------------------------------
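
`format_documents` expects every SQS message `Body` to be a JSON document with at least an `id` and a `concat_embedding` field (a list whose first element is the embedding vector); any other fields are kept as article metadata and written to DynamoDB. A sketch of putting one such message on the queue; the queue URL and the 3-dimensional embedding are hypothetical placeholders:

```python
import json

import boto3

sqs = boto3.client("sqs")

message_body = {
    "id": "12345",
    "title": "Surge in Penguin Populations Brings Hope Amidst Climate Change Concerns",
    "text": "(FAKE) In a surprising turn of events, ...",
    "publication_date": "2024-06-06 16:09:00",
    "concat_embedding": [[0.0123, -0.0456, 0.0789]],  # normally produced by the embedding endpoint
}

sqs.send_message(
    QueueUrl="https://sqs.us-east-1.amazonaws.com/123456789012/clustering-demo2-queue",  # hypothetical URL
    MessageBody=json.dumps(message_body),
)
```
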
/business_logic/stream_consumer/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | scikit-learn
3 | scipy
4 | numpy
5 | joblib
6 | utils
--------------------------------------------------------------------------------
/data/clear_data.py:
--------------------------------------------------------------------------------
1 | import boto3
2 |
3 |
4 | def clear_dynamodb_table(table_name):
5 | # Initialize a DynamoDB resource
6 | dynamodb = boto3.resource("dynamodb")
7 | table = dynamodb.Table(table_name)
8 |
9 | # Scan the table for all items (note: this is resource-intensive and not recommended for large tables)
10 | scan = table.scan()
11 | items = scan["Items"]
12 |
13 | # Continue scanning if all items were not returned in the first scan
14 | while "LastEvaluatedKey" in scan:
15 | scan = table.scan(ExclusiveStartKey=scan["LastEvaluatedKey"])
16 | items.extend(scan["Items"])
17 |
18 | # Delete items in batches
19 | with table.batch_writer() as batch:
20 | for item in items:
21 | batch.delete_item(
22 | Key={
23 | "PK": item["PK"], # Primary Key
24 | "SK": item["SK"], # Sort Key, if applicable
25 | }
26 | )
27 |
28 | print(f"Cleared {len(items)} items from the table {table_name}.")
29 |
30 |
31 | def clear_sqs_queue(queue_name):
32 | sqs = boto3.client("sqs")
33 | response = sqs.get_queue_url(QueueName=queue_name)
34 | queue_url = response['QueueUrl']
35 | response = sqs.receive_message(QueueUrl=queue_url, MaxNumberOfMessages=10)
36 | messages = response.get("Messages", [])
37 | for message in messages:
38 | sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message["ReceiptHandle"])
39 | print(f"Cleared {len(messages)} messages from the queue {queue_name}.")
40 |
41 |
42 | def remove_s3_objects(bucket_name):
43 | s3 = boto3.client("s3")
44 | s3.delete_object(Bucket=bucket_name, Key="checkpoint.pkl")
45 | print(f"Deleted the 'checkpoint.pkl' object from the bucket {bucket_name}.")
46 |
47 | def terminate_ec2_instance(instance_name):
48 | ec2 = boto3.client("ec2")
49 | response = ec2.describe_instances(Filters=[
50 | {'Name': 'tag:Name', 'Values': [instance_name]},
51 | {'Name': 'instance-state-name', 'Values': ['running']}
52 | ])
53 | if response['Reservations']:
54 | instance_id = response['Reservations'][0]['Instances'][0]['InstanceId']
55 | ec2.terminate_instances(InstanceIds=[instance_id])
56 | print(f"Terminated the running EC2 instance {instance_name}.")
57 | else:
58 | print(f"No running EC2 instance found with the name {instance_name}.")
59 |
60 |
61 | # Clean DynamoDB table
62 | table_name = "cluster-table-clustering-demo2"
63 | clear_dynamodb_table(table_name)
64 |
65 | # Clean SQS queue
66 | queue_name = "clustering-demo2-queue"
67 | clear_sqs_queue(queue_name)
68 |
69 | # Clean S3 bucket, need to find the bucket name dynamically starting with "code-bucket-clustering-demo"
70 | def get_s3_bucket_name(prefix):
71 | s3 = boto3.client('s3')
72 | response = s3.list_buckets()
73 | for bucket in response['Buckets']:
74 | if bucket['Name'].startswith(prefix):
75 | print(f"Found S3 bucket: {bucket['Name']}")
76 | return bucket['Name']
77 | return None
78 |
79 | bucket_prefix = "code-bucket-clustering-demo"
80 | bucket_name = get_s3_bucket_name(bucket_prefix)
81 | if bucket_name:
82 | remove_s3_objects(bucket_name)
83 | else:
84 | print(f"No S3 bucket found with prefix: {bucket_prefix}")
85 |
86 | # Terminate EC2 instance
87 | instance_name = "stream-consumer-instance-clustering-demo2"
88 | terminate_ec2_instance(instance_name)
89 |
--------------------------------------------------------------------------------
/data/download_public_data.sh:
--------------------------------------------------------------------------------
1 | mkdir public_data
2 | wget -P public_data ftp://"ftp.priberam.pt|anonymous"@ftp.priberam.pt/SUMMAPublic/Corpora/Clustering/2018.0/dataset/dataset.dev.json
--------------------------------------------------------------------------------
/data/example_article.json:
--------------------------------------------------------------------------------
1 | {
2 | "id": "12345",
3 | "text": "(FAKE) In a surprising turn of events, recent studies have shown a significant surge in penguin populations across several species, bringing a glimmer of hope amidst ongoing climate change concerns. This unexpected rise has been observed in regions spanning from Antarctica to the coasts of South America and Africa. Marine biologists attribute this population increase to several factors. Firstly, conservation efforts have played a crucial role. Strict regulations on fishing in certain areas have reduced competition for food, allowing penguin populations to recover. Marine protected areas (MPAs) have provided safe havens where penguins can breed and forage without human interference.",
4 | "title": "Surge in Penguin Populations Brings Hope Amidst Climate Change Concerns",
5 | "event_id": "1234",
6 | "duplicate": false,
7 | "lang": "en",
8 | "bag_id": "9e1f2c6b-4b27-4d5f-91d4-3e0aafae1987-2325",
9 | "date": "2024-06-06 16:09:00",
10 | "source": "naturewatchnews.com",
11 | "cluster": "444"
12 | }
13 |
--------------------------------------------------------------------------------
/data/put_records.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | import json
3 | import time
4 | import random
5 | import string
6 | from tqdm import tqdm
7 |
8 | STREAM_NAME = "input-stream-clustering-demo2"
9 | PARTITION_KEY = "a"
10 | JSON_FILE_PATH = "./public_data/dataset.dev.json" # Path to the single JSON file
11 | COUNT = 1200000
12 | BATCH_SIZE = 5
13 |
14 | # Create a Kinesis client
15 | kinesis = boto3.client("kinesis")
16 |
17 |
18 | # Helper function to generate a random partition key
19 | def generate_partition_key():
20 | return "".join(random.choices(string.ascii_letters + string.digits, k=16))
21 |
22 |
23 | # Helper function to check record size does not exceed 1MB
24 | def is_record_size_valid(record):
25 | return len(record) < 1024 * 1024 # less than 1MB
26 |
27 |
28 | # Helper function to check batch size does not exceed 5MB
29 | def is_batch_size_valid(batch):
30 | return (
31 | sum(len(record["Data"]) for record in batch) < 5 * 1024 * 1024
32 | ) # less than 5MB
33 |
34 |
35 | # Read the JSON data from the file
36 | with open(JSON_FILE_PATH, "r") as f:
37 | data_list = json.load(f)
38 |
39 | # Iterate through the JSON data in batches
40 | for batch_index in tqdm(range(0, min(COUNT, len(data_list)), BATCH_SIZE)):
41 | batch_list = data_list[batch_index : batch_index + BATCH_SIZE]
42 | data_json = json.dumps(batch_list)
43 |
44 | # Check if the individual record size is valid
45 | if not is_record_size_valid(data_json):
46 | print(
47 | f"Batch starting at index {batch_index} exceeds the maximum allowed size of 1MB."
48 | )
49 | continue # Skip this batch
50 |
51 | # Create a record to put to Kinesis
52 | record = {
53 | "Data": data_json,
54 | "PartitionKey": generate_partition_key(),
55 | }
56 |
57 | records_to_put = [record]
58 |
59 | # Add the record to the batch if it doesn't exceed the batch size
60 | if is_batch_size_valid(records_to_put):
61 | # Delay for 0.2 seconds
62 | time.sleep(0.2)
63 |
64 | # Create the PutRecords request
65 | put_records_request = {
66 | "Records": records_to_put,
67 | "StreamName": STREAM_NAME,
68 | }
69 |
70 | # Put the records to Kinesis
71 | response = kinesis.put_records(**put_records_request)
72 |
73 | # Check for any failed records
74 | failed_records = response.get("Records", [])
75 | for record in failed_records:
76 | if "ErrorCode" in record:
77 | print(
78 | f"Error: {record['ErrorCode']}, Message: {record['ErrorMessage']}"
79 | )
80 | else:
81 | print(
82 | f"Batch starting at index {batch_index} would exceed the batch size limit of 5MB."
83 | )
84 |
--------------------------------------------------------------------------------
/data/script.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | with open("public_data/dataset.test.json", "r") as f:
4 | data = json.load(f)
5 | print(len(data))
6 |
--------------------------------------------------------------------------------
/data/send_articles.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python put_records.py
4 |
5 |
6 | # aws kinesis put-record --stream-name "$STREAM_NAME" --data file://"$FILE_PATH" --partition-key "$PARTITION_KEY"
7 |
8 | # aws kinesis put-record --stream-name "$STREAM_NAME" --data file://"$FILE_PATH" --partition-key "id"
9 |
10 | # aws kinesis put-records \
11 | # --stream-name "$STREAM_NAME" \
12 | # --records file://"$FILE_PATH"
13 |
--------------------------------------------------------------------------------
/front_end/Dockerfile:
--------------------------------------------------------------------------------
1 | #checkov:skip=CKV_DOCKER_2: Ensure that HEALTHCHECK instructions have been added to container images
2 | #checkov:skip=CKV_DOCKER_3: Ensure that a user for the container has been created
3 |
4 | # Use a multi-stage build
5 | FROM node:18-alpine3.19 as builder
6 |
7 | # Set the working directory
8 | WORKDIR /app
9 |
10 | # Copy package.json and package-lock.json
11 | COPY package.json package-lock.json ./
12 |
13 | # Install dependencies
14 | RUN npm ci
15 |
16 | # Copy the application code
17 | COPY . .
18 |
19 | # Build the application
20 | RUN npm run build
21 |
22 | # Create the production image
23 | FROM nginx:1.23-alpine
24 |
25 | # Install OpenSSL to generate a self-signed certificate
26 | RUN apk add --no-cache openssl
27 |
28 | # Create a directory for SSL certificates
29 | RUN mkdir -p /etc/nginx/ssl
30 |
31 | # Generate a self-signed SSL certificate
32 | RUN openssl req \
33 | -x509 \
34 | -nodes \
35 | -days 365 \
36 | -newkey rsa:2048 \
37 | -keyout /etc/nginx/ssl/nginx-selfsigned.key \
38 | -out /etc/nginx/ssl/nginx-selfsigned.crt \
39 | -subj "/C=US/ST=State/L=City/O=Company/OU=Department/CN=localhost"
40 |
41 | # Remove the default NGINX config and replace with custom config
42 | RUN rm -rf /etc/nginx/conf.d/default.conf
43 | COPY nginx.conf /etc/nginx/conf.d/
44 |
45 | # Set the working directory
46 | WORKDIR /usr/share/nginx/html
47 |
48 | # Copy the built assets from the builder stage
49 | COPY --from=builder /app/build .
50 |
51 | # Expose the port
52 | EXPOSE 443
53 |
54 | # Start Nginx
55 | CMD ["nginx", "-g", "daemon off;"]
56 |
--------------------------------------------------------------------------------
/front_end/README.md:
--------------------------------------------------------------------------------
1 | # React Application Setup and Deployment
2 |
3 | This guide covers the setup of a React application using Amazon Cognito for authentication, and deployment options using Amazon S3 with CloudFront or a containerized approach with Nginx and a Load Balancer.
4 |
5 | ## Prerequisites
6 |
7 | - AWS Account
8 | - Node.js installed
9 | - NPM or Yarn installed
10 | - AWS CLI installed and configured
11 |
12 | ## Setup
13 |
14 | ### Step 1: Create a Cognito User Pool
15 |
16 | 1. Go to the Amazon Cognito Console.
17 | 2. Click **Manage User Pools** and then **Create a user pool**.
18 | 3. Name your user pool and click **Review defaults**.
19 | 4. Click **Create pool**.
20 | 5. Note the **Pool Id** and **Pool ARN**.
21 |
22 | ### Step 2: Create a Cognito Identity Pool
23 |
24 | 1. Go back to the main Cognito console and select **Manage Identity Pools**.
25 | 2. Click **Create new identity pool**.
26 | 3. Give your identity pool a name, and check **Enable access to unauthenticated identities** if required.
27 | 4. Under **Authentication providers**, in the **Cognito** tab, enter your User Pool ID and App client id.
28 | 5. Click **Create Pool**.
29 | 6. On the next screen, you will be prompted to set up IAM roles for your identity pool. AWS can create default roles for you, or you can choose to edit these roles. It is critical to attach the appropriate permissions to these roles depending on what AWS resources your application will access.
30 |
31 | #### Configuring IAM Roles
32 |
33 | After the Identity Pool is created, AWS assigns two roles: one for authenticated users and another for unauthenticated users (if enabled). To allow authenticated users to access DynamoDB resources, you must attach a policy with the necessary permissions to the authenticated role.
34 |
35 | 1. Go to the IAM console.
36 | 2. Find the role associated with your Cognito Identity Pool for authenticated users.
37 | 3. Click **Attach policies** and then **Create policy**.
38 | 4. In the policy editor, paste the following JSON. This policy allows actions on the DynamoDB table used by your application:
39 |
40 | ```json
41 | {
42 | "Version": "2012-10-17",
43 | "Statement": [
44 | {
45 | "Sid": "VisualEditor0",
46 | "Effect": "Allow",
47 | "Action": [
48 | "dynamodb:Scan"
49 | ],
50 |         "Resource": "arn:aws:dynamodb:us-east-1:your-aws-account-id:table/cluster-table-clustering-demo"
51 | }
52 | ]
53 | }
54 | ```
55 |
56 | Be sure to replace `your-aws-account-id` with your actual AWS account ID.
57 |
58 | 5. Click **Review policy**, give your policy a name, and then click **Create policy**.
59 | 6. Attach the newly created policy to the IAM role for authenticated users.
60 |
61 | This setup ensures that your application has the necessary permissions to interact with the specified DynamoDB table, following the principle of least privilege by granting only the permissions needed.
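
If you prefer the AWS CLI to the console, the same policy can be created and attached with two commands. This is a sketch only: it assumes the JSON above is saved as `dynamodb-scan-policy.json`, and the role name and account ID below are hypothetical placeholders.

```bash
# Create the policy from the JSON document shown above
aws iam create-policy \
  --policy-name news-cluster-dynamodb-scan \
  --policy-document file://dynamodb-scan-policy.json

# Attach it to the Identity Pool's authenticated role (substitute your real role name and account ID)
aws iam attach-role-policy \
  --role-name Cognito_NewsCluster_Auth_Role \
  --policy-arn arn:aws:iam::your-aws-account-id:policy/news-cluster-dynamodb-scan
```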
62 |
63 |
64 | ### Step 3: Configuration File
65 |
66 | 1. Create a file named `aws-exports.js` in your React app's `src` directory.
67 | 2. Add the following configuration:
68 |
69 | ```javascript
70 | const awsConfig = {
71 | aws_project_region: 'AWS_REGION', // AWS region of Cognito
72 | aws_cognito_region: 'AWS_REGION', // AWS region of Cognito
73 | aws_cognito_identity_pool_id: 'AWS_COGNITO_IDENTITY_POOL', // Identity pool ID
74 | aws_user_pools_id: 'AWS_COGNITO_USER_POOL_ID', // User Pool ID
75 |   aws_user_pools_web_client_id: 'AWS_COGNITO_USER_POOL_APP_CLIENT_ID', // App client ID
76 | federationTarget: "COGNITO_USER_POOLS" // keep as "COGNITO_USER_POOLS"
77 | };
78 |
79 | export default awsConfig;
80 | ```
81 | 3. Make sure all the fields above are properly filled in. If you're using Terraform to deploy the tool, generate this file dynamically from the Terraform outputs.
82 |
83 |
84 | ### Step 4: Build the React Application
85 |
86 | 1. Navigate to your project directory.
87 | 2. Run `npm install` to install all required dependencies.
88 | 3. Build your React application by running:
89 | ```bash
90 | npm run build
91 | ```
92 | 4. This command creates a build directory containing your static files (HTML, CSS, JS).
93 |
94 | ## Running the Application Locally
95 |
96 | Before deploying your React application, it is crucial to ensure everything functions correctly in a local development environment. Follow these steps to run your application locally:
97 |
98 | ### Prerequisites for Running Locally
99 |
100 | 1. **Configure aws-exports.js:**
101 | - Ensure that you have created `aws-exports.js` in the src directory of your project. This file should include all necessary configurations for Amazon Cognito:
102 | ```javascript
103 | const awsConfig = {
104 | aws_project_region: 'AWS_REGION', // AWS region of Cognito
105 | aws_cognito_region: 'AWS_REGION', // AWS region of Cognito
106 | aws_cognito_identity_pool_id: 'AWS_COGNITO_IDENTITY_POOL', // Identity pool ID
107 | aws_user_pools_id: 'AWS_COGNITO_USER_POOL_ID', // User Pool ID
108 |        aws_user_pools_web_client_id: 'AWS_COGNITO_USER_POOL_APP_CLIENT_ID', // App client ID
109 | federationTarget: "COGNITO_USER_POOLS" // keep as "COGNITO_USER_POOLS"
110 | };
111 | export default awsConfig;
112 | ```
113 |    - Replace `AWS_REGION`, `AWS_COGNITO_IDENTITY_POOL`, `AWS_COGNITO_USER_POOL_ID`, and `AWS_COGNITO_USER_POOL_APP_CLIENT_ID` with the actual values from your Cognito setup.
114 |
115 | 2. **Install Project Dependencies:**
116 | - Open a terminal and navigate to your project directory.
117 | - Install all necessary dependencies by running:
118 | ```bash
119 | npm install
120 | ```
121 |
122 | 3. **Start the React Application:**
123 | - Run the following command to start your React application:
124 | ```bash
125 | npm start
126 | ```
127 | - This will compile the application and start a development server.
128 |
129 | 4. **Access the Application:**
130 | - Open a web browser and navigate to [http://localhost:3000](http://localhost:3000).
131 | - You should see your React application running locally. Make sure to test all functionalities, especially those interacting with AWS services, to ensure everything is working as expected.
132 |
133 | By following these steps, you can run and test your React application locally before moving on to deploy it in a production environment. This local setup is crucial for development and debugging purposes.
134 |
135 |
136 | ## Deployment Options
137 |
138 | ### Option 1: Deploy to Amazon S3 with CloudFront using Origin Access Identity (OAI)
139 |
140 | This method utilizes an Origin Access Identity (OAI) to securely serve your React application's static files from an S3 bucket via CloudFront, without the bucket being publicly accessible.
141 |
142 | 1. **Create an S3 Bucket:**
143 | - Navigate to the Amazon S3 service within the AWS Management Console and create a new bucket:
144 | ```bash
145 | aws s3 mb s3://your-bucket-name --region your-region
146 | ```
147 | - Replace `your-bucket-name` and `your-region` with your specific details.
148 | - Do not enable public access; keep the default settings which block all public access.
149 |
150 | 2. **Upload the Build Directory to S3:**
151 | - Upload your React application's build directory to the S3 bucket using the AWS CLI:
152 | ```bash
153 | aws s3 sync build/ s3://your-bucket-name/
154 | ```
155 |
156 | 3. **Create an Origin Access Identity (OAI):**
157 | - Navigate to the CloudFront service in the AWS Management Console.
158 | - Go to the **Security** section, then click on **Origin Access Identity**.
159 | - Click **Create Origin Access Identity**.
160 | - Provide a comment to describe the OAI (e.g., "OAI for React App"), then create it.
161 |
162 | 4. **Configure S3 Bucket Permissions:**
163 | - Go to your S3 bucket in the AWS Management Console.
164 | - Under the **Permissions** tab, click on **Bucket Policy**.
165 | - Use the following policy, replacing `your-oai-id` and `your-bucket-name` with your specific OAI ID and bucket name:
166 | ```json
167 | {
168 | "Version": "2012-10-17",
169 | "Statement": [
170 | {
171 | "Effect": "Allow",
172 | "Principal": {
173 | "AWS": "arn:aws:iam::cloudfront:user/CloudFront Origin Access Identity your-oai-id"
174 | },
175 | "Action": "s3:GetObject",
176 | "Resource": "arn:aws:s3:::your-bucket-name/*"
177 | }
178 | ]
179 | }
180 | ```
181 |
182 | 5. **Create a CloudFront Distribution:**
183 | - Go back to the CloudFront console and create a new distribution.
184 | - For the origin source, select your S3 bucket.
185 | - Enter the Origin Access Identity you just created.
186 | - Set the origin to use HTTPS only.
187 | - Set the Viewer Protocol Policy to "Redirect HTTP to HTTPS" for security.
188 | - Optionally, specify your index document under the Default Root Object, such as `index.html`.
189 | - Create the distribution.
190 | - Note the distribution's domain name provided by CloudFront.
191 |
192 | 6. **Update DNS Records:**
193 | - If you have a domain name, update your DNS settings to create a CNAME record that points to your CloudFront distribution's domain name.
194 |
195 | ### Option 2: Containerize with Nginx and Deploy Using a Load Balancer
196 |
197 | - Create a Docker image based on Nginx that serves the app (a build-and-run sketch follows this list)
198 | - Host the static files from the React `build` folder
199 | - Expose port 443 (matching the `listen` directive in `nginx.conf`)
200 | - Create an Application Load Balancer (ALB) that forwards HTTPS traffic to the container
201 | - ACM is used to store the certificate for your load balancer. For demonstration purposes, we are utilizing a self-signed certificate stored in ACM. However, for production applications, it is recommended to obtain a certificate from a trusted Certificate Authority (CA), which can be either external or internal.
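
The following is a minimal local sketch of the container steps above. It assumes the repository's `front_end/Dockerfile` and `nginx.conf` are used as-is; the image name `news-cluster-ui` and the host port `8443` are illustrative choices, not fixed by the project.
```bash
# Build the static assets, then bake them into the Nginx image
npm run build
docker build -t news-cluster-ui .

# Run locally; container port 443 matches the listen directive in nginx.conf
docker run --rm -p 8443:443 news-cluster-ui
# Browse to https://localhost:8443 (expect a browser warning for the self-signed certificate)
```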
202 |
203 | ## Package Considerations
204 |
205 | We leverage the AWS Amplify package for the frontend, which has transitive dependencies that will trigger findings in an `npm audit`. Either update to newer versions or use a different frontend library to avoid the following:
206 | ```
207 | Dependency: fast-xml-parser Version: 4.2.5 (npm)
208 | Dependency: nth-check Version: 1.0.2 (npm)
209 | Dependency: fast-xml-parser Version: 4.3.6 (npm)
210 | Dependency: webpack Version: 5.91.0 (npm)
211 | Dependency: postcss Version: 7.0.39 (npm)
212 | Dependency: braces Version: 3.0.2 (npm)
213 | ```
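
To reproduce these findings locally, run the audit from the `front_end` directory. These are standard npm commands; note that `npm audit fix` only applies upgrades compatible with the existing version ranges:
```bash
npm audit        # list known vulnerabilities in the dependency tree
npm audit fix    # apply compatible, non-breaking upgrades where available
```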
214 |
215 | ## Conclusion
216 |
217 | These steps guide you through deploying your React application using AWS Cognito for authentication. Choose between a secure, serverless deployment using Amazon S3 with CloudFront or a containerized approach using Nginx for traditional server-based hosting.
218 |
219 |
--------------------------------------------------------------------------------
/front_end/nginx.conf:
--------------------------------------------------------------------------------
1 | server {
2 | listen 443 ssl;
3 | server_name localhost;
4 |
5 | ssl_certificate /etc/nginx/ssl/nginx-selfsigned.crt;
6 | ssl_certificate_key /etc/nginx/ssl/nginx-selfsigned.key;
7 |
8 | ssl_protocols TLSv1.2 TLSv1.3;
9 | ssl_ciphers HIGH:!aNULL:!MD5;
10 |
11 | root /usr/share/nginx/html;
12 | index index.html;
13 |
14 | location / {
15 | try_files $uri $uri/ =404;
16 | }
17 | }
--------------------------------------------------------------------------------
/front_end/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "news-cluster-ui",
3 | "version": "0.1.0",
4 | "private": true,
5 | "dependencies": {
6 | "@aws-amplify/ui-react": "6.1.9",
7 | "@cloudscape-design/components": "3.0.625",
8 | "@cloudscape-design/global-styles": "1.0.27",
9 | "@testing-library/jest-dom": "5.17.0",
10 | "@testing-library/react": "13.4.0",
11 | "@testing-library/user-event": "13.5.0",
12 | "amazon-cognito-identity-js": "6.3.12",
13 | "aws-amplify": "6.6.3",
14 | "aws-sdk": "2.1613.0",
15 | "dotenv": "16.4.5",
16 | "react": "^18.3.1",
17 | "react-dom": "^18.3.1",
18 | "react-scripts": "5.0.1",
19 | "web-vitals": "2.1.4"
20 | },
21 | "scripts": {
22 | "start": "react-scripts start",
23 | "build": "react-scripts build",
24 | "test": "react-scripts test",
25 | "eject": "react-scripts eject"
26 | },
27 | "eslintConfig": {
28 | "extends": [
29 | "react-app",
30 | "react-app/jest"
31 | ]
32 | },
33 | "browserslist": {
34 | "production": [
35 | ">0.2%",
36 | "not dead",
37 | "not op_mini all"
38 | ],
39 | "development": [
40 | "last 1 chrome version",
41 | "last 1 firefox version",
42 | "last 1 safari version"
43 | ]
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/front_end/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/front_end/public/favicon.ico
--------------------------------------------------------------------------------
/front_end/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
12 |
13 |
17 |
18 |
27 | News Clustering and Summarization Demo
28 |
29 |
30 |
31 |
32 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/front_end/public/logo192.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/front_end/public/logo192.png
--------------------------------------------------------------------------------
/front_end/public/logo512.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/news-clustering-and-summarization/6174ccd16510f2255b6de19ce71350286760266e/front_end/public/logo512.png
--------------------------------------------------------------------------------
/front_end/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "React App",
3 | "name": "Create React App Sample",
4 | "icons": [
5 | {
6 | "src": "favicon.ico",
7 | "sizes": "64x64 32x32 24x24 16x16",
8 | "type": "image/x-icon"
9 | },
10 | {
11 | "src": "logo192.png",
12 | "type": "image/png",
13 | "sizes": "192x192"
14 | },
15 | {
16 | "src": "logo512.png",
17 | "type": "image/png",
18 | "sizes": "512x512"
19 | }
20 | ],
21 | "start_url": ".",
22 | "display": "standalone",
23 | "theme_color": "#000000",
24 | "background_color": "#ffffff"
25 | }
26 |
--------------------------------------------------------------------------------
/front_end/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 |
--------------------------------------------------------------------------------
/front_end/src/App.css:
--------------------------------------------------------------------------------
1 | .App {
2 | text-align: center;
3 | }
4 |
5 | .App-logo {
6 | height: 40vmin;
7 | pointer-events: none;
8 | }
9 |
10 | @media (prefers-reduced-motion: no-preference) {
11 | .App-logo {
12 | animation: App-logo-spin infinite 20s linear;
13 | }
14 | }
15 |
16 | .App-header {
17 | background-color: #282c34;
18 | min-height: 100vh;
19 | display: flex;
20 | flex-direction: column;
21 | align-items: center;
22 | justify-content: center;
23 | font-size: calc(10px + 2vmin);
24 | color: white;
25 | }
26 |
27 | .App-link {
28 | color: #61dafb;
29 | }
30 |
31 | @keyframes App-logo-spin {
32 | from {
33 | transform: rotate(0deg);
34 | }
35 | to {
36 | transform: rotate(360deg);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/front_end/src/App.js:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from 'react';
2 | import './App.css';
3 | import ClusterList from './components/ClusterList';
4 | import { AppLayout, Toggle, Box, SpaceBetween, Header, Icon, Button } from '@cloudscape-design/components';
5 | import { applyMode, Mode } from '@cloudscape-design/global-styles';
6 | import awsConfig from './aws-exports'; // Path to your aws-exports file
7 | import { Amplify } from 'aws-amplify';
8 | import { Authenticator, View, Image, Heading, components } from '@aws-amplify/ui-react';
9 | import '@aws-amplify/ui-react/styles.css';
10 |
11 |
12 |
13 |
14 | console.log('Configuring Amplify with:', awsConfig);
15 | Amplify.configure(awsConfig);
16 | // console.log('Amplify configuration:', Amplify.configure());
17 |
18 | function App() {
19 | const [darkMode, setDarkMode] = useState(false);
20 |
21 | const toggleDarkMode = () => {
22 | setDarkMode(!darkMode);
23 | };
24 |
25 | applyMode(darkMode ? Mode.Dark : Mode.Light);
26 |
27 |
28 | // Custom Header for the Sign In page
29 | const SignInHeader = () => {
30 | return (
31 |
32 |
37 | Near Real Time News Clustering and Summarization Demo
38 |
39 | );
40 | };
41 |
42 |
43 | return (
44 |
51 | {({ signOut }) => (
52 |
59 | }
60 | navigationHide
61 | tools={
62 |
63 | }>Settings
64 |
65 |
66 | Dark Mode
67 |
72 |
73 |
74 |
75 |
76 |
77 | }
78 | />
79 |
80 | )}
81 |
82 |
83 |
84 |
85 |
86 | );
87 | }
88 |
89 | export default App;
90 |
--------------------------------------------------------------------------------
/front_end/src/components/ClusterList.js:
--------------------------------------------------------------------------------
1 | // src/components/ClusterList.js
2 | import React, { useState, useEffect, useRef } from "react";
3 | import AWS from "aws-sdk";
4 | import {
5 | Button,
6 | Table,
7 | Box,
8 | ProgressBar,
9 | SpaceBetween,
10 | } from "@cloudscape-design/components";
11 | import { fetchAuthSession } from "@aws-amplify/auth";
12 | import ClusterModal from "./ClusterModal";
13 | import awsConfig from "../aws-exports";
14 |
15 | const refreshInterval = 5000;
16 |
17 | const ClusterList = () => {
18 | const [clusters, setClusters] = useState([]);
19 | const [selectedCluster, setSelectedCluster] = useState(null);
20 | const [totalArticles, setTotalArticles] = useState(0);
21 | const [isModalVisible, setModalVisible] = useState(false);
22 | const [progress, setProgress] = useState(0); // Initialize progress at 0%
23 | const [secondsRemaining, setSecondsRemaining] = useState(
24 | refreshInterval / 1000
25 | ); // Initialize countdown
26 |
27 | const dynamoDbRef = useRef();
28 |
29 | useEffect(() => {
30 | const configureAWS = async () => {
31 | const session = await fetchAuthSession();
32 | const { accessKeyId, secretAccessKey, sessionToken } =
33 | session.credentials;
34 | AWS.config.update({
35 | region: awsConfig.aws_cognito_region,
36 | credentials: new AWS.Credentials(
37 | accessKeyId,
38 | secretAccessKey,
39 | sessionToken
40 | ),
41 | });
42 | dynamoDbRef.current = new AWS.DynamoDB.DocumentClient();
43 | fetchClusters();
44 | };
45 | configureAWS();
46 | }, []);
47 |
48 | useEffect(() => {
49 | const intervalId = setInterval(() => {
50 | fetchClusters();
51 | }, refreshInterval);
52 |
53 | const progressId = setInterval(() => {
54 | setProgress(
55 | (prevProgress) => (prevProgress + (1000 / refreshInterval) * 100) % 100
56 | );
57 | setSecondsRemaining((prevSeconds) =>
58 | prevSeconds <= 1 ? refreshInterval / 1000 : prevSeconds - 1
59 | );
60 | }, 1000);
61 |
62 | return () => {
63 | clearInterval(intervalId);
64 | clearInterval(progressId);
65 | };
66 | }, []);
67 |
68 | const fetchClusters = async () => {
69 | if (!dynamoDbRef.current) {
70 | console.log("DynamoDB client not initialized");
71 | return;
72 | }
73 | let lastEvaluatedKey = null;
74 | const allItems = [];
75 | let articlesCount = 0;
76 | const params = {
77 | TableName: "cluster-table-clustering-demo2",
78 | };
79 |
80 | do {
81 | if (lastEvaluatedKey) {
82 | params.ExclusiveStartKey = lastEvaluatedKey;
83 | }
84 | const data = await dynamoDbRef.current.scan(params).promise();
85 | allItems.push(...data.Items);
86 | lastEvaluatedKey = data.LastEvaluatedKey;
87 | } while (lastEvaluatedKey);
88 |
89 | const articlesByCluster = allItems.reduce((acc, item) => {
90 | if (item.is_cluster) {
91 | acc[item.PK] = acc[item.PK] || [];
92 | } else if (item.SK.startsWith("ARTICLE#")) {
93 | if (item.publication_date) {
94 | articlesCount++;
95 | if (acc[item.PK]) {
96 | acc[item.PK].push(item);
97 | }
98 | }
99 | }
100 | return acc;
101 | }, {});
102 |
103 | const newClusters = allItems
104 | .filter(
105 | (item) =>
106 | item.is_cluster &&
107 | item.generated_summary &&
108 | articlesByCluster[item.PK] &&
109 | articlesByCluster[item.PK].length > 2
110 | )
111 | .map((cluster) => ({
112 | ...cluster,
113 | articles: articlesByCluster[cluster.PK],
114 | number_of_articles: articlesByCluster[cluster.PK].length,
115 | }))
116 | .sort((a, b) => b.number_of_articles - a.number_of_articles);
117 |
118 | setClusters(newClusters);
119 | setTotalArticles(articlesCount);
120 | };
121 |
122 | const handleViewArticles = (cluster) => {
123 | console.log("Opening modal for cluster:", cluster.PK);
124 | setSelectedCluster(cluster);
125 | setModalVisible(true); // Set the modal to be visible
126 | };
127 |
128 | const wrapStyleSummary = {
129 | whiteSpace: "normal", // Allow the text to wrap to the next line
130 | wordBreak: "break-word", // Ensure words break correctly at the end of the line
131 | maxWidth: "600px", // Set a maximum width for the cell content
132 |     textAlign: "justify", // Justify the text
133 | };
134 |
135 | const wrapStyleTitle = {
136 | whiteSpace: "normal", // Allow the text to wrap to the next line
137 | wordBreak: "break-word", // Ensure words break correctly at the end of the line
138 | maxWidth: "150px", // Set a maximum width for the cell content
139 | textAlign: "center",
140 | };
141 |
142 | const wrapStyleNumberOfArticles = {
143 | whiteSpace: "normal", // Allow the text to wrap to the next line
144 | wordBreak: "break-word", // Ensure words break correctly at the end of the line
145 | maxWidth: "100px", // Set a maximum width for the cell content
146 | textAlign: "center",
147 | };
148 |
149 | // Column definitions using inline styles
150 | const columnDefinitions = [
151 | {
152 | header: "Title",
153 |       cell: (item) => {item.description},
154 | },
155 | {
156 | header: "Summary",
157 | cell: (item) => (
158 | {item.generated_summary}
159 | ),
160 | },
161 | {
162 | header: "Articles",
163 | cell: (item) => (
164 | {item.number_of_articles}
165 | ),
166 | },
167 | {
168 | header: "View",
169 | cell: (item) => (
170 |
171 | ),
172 | },
173 | ];
174 |
175 | return (
176 |
177 |
178 |
179 | {" "}
180 | Near Real Time News Clustering and Summarization Demo
181 |
182 |
183 | Total Clusters: {clusters.length} | Total Articles: {totalArticles}
184 |
185 |
191 |
192 |
197 | {selectedCluster && (
198 | {
202 | setSelectedCluster(null);
203 | setModalVisible(false); // Hide the modal when closed
204 | }}
205 | visible={isModalVisible} // Control visibility with state
206 | />
207 | )}
208 |
209 |
210 | );
211 | };
212 |
213 | export default ClusterList;
214 |
--------------------------------------------------------------------------------
/front_end/src/components/ClusterModal.js:
--------------------------------------------------------------------------------
1 | // src/components/ClusterModal.js
2 | import { Modal, Button } from "@cloudscape-design/components";
3 | import React, { useState } from "react";
4 |
5 | const ClusterModal = ({ cluster, articles, onClose, visible }) => {
6 | // State to manage visibility of each article's full text
7 | const [visibleArticles, setVisibleArticles] = useState({});
8 |
9 | // Function to toggle article text visibility
10 | const toggleArticleVisibility = (id) => {
11 | setVisibleArticles((prev) => ({ ...prev, [id]: !prev[id] }));
12 | };
13 |
14 | // Helper function to format date
15 | const formatDate = (dateString) => {
16 | const date = new Date(dateString);
17 | return date.toLocaleDateString("en-US", {
18 | year: "numeric",
19 | month: "long",
20 | day: "numeric",
21 | hour: "2-digit",
22 | minute: "2-digit",
23 | });
24 | };
25 |
26 | return (
27 | Close}
31 | visible={visible}
32 | size="large"
33 | >
34 | {articles && articles.length > 0 ? (
35 | articles.map((article) => (
36 |
37 |
{article.title}
38 |
{formatDate(article.publication_date)}
39 |
{article.summary}
40 |
45 | {visibleArticles[article.SK] &&
{article.text}
}
46 |
47 |
48 | ))
49 | ) : (
50 | No articles found.
51 | )}
52 |
53 | );
54 | };
55 |
56 | export default ClusterModal;
57 |
--------------------------------------------------------------------------------
/front_end/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | }
9 |
10 | code {
11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 | monospace;
13 | }
14 |
--------------------------------------------------------------------------------
/front_end/src/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ReactDOM from 'react-dom/client';
3 | import './index.css';
4 | import App from './App';
5 |
6 | const root = ReactDOM.createRoot(document.getElementById('root'));
7 | root.render(
8 |
9 |
10 |
11 | );
12 |
13 |
--------------------------------------------------------------------------------
/iac/roots/README.md:
--------------------------------------------------------------------------------
1 | The roots directory should include all top-level Terraform projects.
2 |
3 | A top-level project is defined as a directory containing a main.tf file
4 | that you would want to run "terraform apply" on. Each top-level project
5 | has its own separate Terraform state file.
6 |
7 | Top-level projects should make use of reusable components and modules,
8 | which are located under the "templates" directory. Essentially, your
9 | top-level projects should not define any behavior on their own. They simply
10 | define input variables and make calls to reusable templates.
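
As a sketch of that workflow, a top-level project such as iac/roots/main is applied from its own
directory with the standard Terraform commands (variable values are assumed to come from its
checked-in terraform.tfvars):

```bash
cd iac/roots/main
terraform init      # download the providers and the reusable modules under templates/
terraform plan      # review changes against this project's separate state file
terraform apply     # create or update the resources
```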
--------------------------------------------------------------------------------
/iac/roots/main/clustering_compute.tf:
--------------------------------------------------------------------------------
1 | # Code Deployment
2 | module "cluster_code_bucket" {
3 | source = "../../templates/modules/s3_bucket"
4 | name_prefix = "code-bucket-${var.app_name}-${var.env_name}"
5 | log_bucket = module.log_bucket.name
6 | }
7 |
8 | resource "aws_s3_object" "clustering_code" {
9 | bucket = module.cluster_code_bucket.name
10 |
11 | for_each = fileset("../../../business_logic/stream_consumer", "**/*.*")
12 | key = "stream_consumer/${each.value}"
13 | source = "../../../business_logic/stream_consumer/${each.value}"
14 | source_hash = filemd5("../../../business_logic/stream_consumer/${each.value}")
15 | content_type = each.value
16 | force_destroy = true
17 | }
18 |
19 | # SQS Queue
20 | resource "aws_sqs_queue" "tags" {
21 | name = "${var.app_name}-${var.env_name}-queue"
22 | sqs_managed_sse_enabled = true
23 | }
24 |
25 | # EC2 Instance
26 | data "aws_ami" "amazon_linux" {
27 | most_recent = true
28 | filter {
29 | name = "owner-alias"
30 | values = ["amazon"]
31 | }
32 | filter {
33 | name = "name"
34 | values = ["al2023-ami-2023*"] # Arm
35 | }
36 | filter {
37 | name = "architecture"
38 | values = ["arm64"] # Arm
39 | }
40 |
41 | }
42 |
43 | resource "aws_iam_instance_profile" "this_aws_iam_instance_profile_stream_consumer" {
44 | name = "stream-consumer-instance-profile-${var.app_name}-${var.env_name}"
45 | role = aws_iam_role.stream_consumer_role.name
46 | }
47 |
48 | # User Data
49 | data "cloudinit_config" "this_cloudinit_config" {
50 | gzip = false
51 | base64_encode = false
52 | # Main cloud-config configuration file.
53 | part {
54 | filename = "init.cfg"
55 | content_type = "text/cloud-config"
56 | content = templatefile("${path.module}/templates/init.cfg",
57 | {
58 | CONFIGURE_NODE_SCRIPT = base64gzip(templatefile("${path.module}/templates/ConfigureNode.sh",
59 | {
60 | config = {
61 | "S3_BUCKET_PATH" = "${module.cluster_code_bucket.id}/stream_consumer/"
62 | "S3_BUCKET_NAME" = module.cluster_code_bucket.id
63 | "S3_FILE_KEY" = "checkpoint.pkl"
64 | "SQS_QUEUE" = aws_sqs_queue.tags.url
65 | "DYNAMODB_TABLE" = aws_dynamodb_table.cluster_table.name
66 | "AWS_DEFAULT_REGION" = local.region
67 | }
68 | }
69 | )
70 | )
71 | }
72 | )
73 | }
74 | }
75 |
76 | resource "aws_security_group" "this_aws_security_group_ec2" {
77 | name = "${local.standard_resource_name}-ec2"
78 | description = "Security group for EC2"
79 | vpc_id = module.vpc.vpc_id
80 | egress {
81 | description = "Internet access"
82 | from_port = 0
83 | to_port = 0
84 | protocol = "-1"
85 | cidr_blocks = ["0.0.0.0/0"]
86 | }
87 | tags = merge(local.tags, { Name = "${local.standard_resource_name}-ec2" })
88 |
89 | }
90 |
91 | resource "aws_launch_template" "this_aws_launch_template" {
92 | name_prefix = "stream-consumer-instance-${local.standard_resource_name}-"
93 | description = "Launch template for stream-consumer-instance-${local.standard_resource_name}"
94 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" })
95 | image_id = data.aws_ami.amazon_linux.id
96 | instance_type = var.instance_type
97 | vpc_security_group_ids = [aws_security_group.this_aws_security_group_ec2.id]
98 | user_data = base64encode(data.cloudinit_config.this_cloudinit_config.rendered)
99 | ebs_optimized = true
100 | instance_initiated_shutdown_behavior = "stop"
101 | update_default_version = true
102 | disable_api_termination = false
103 |
104 | iam_instance_profile {
105 | arn = aws_iam_instance_profile.this_aws_iam_instance_profile_stream_consumer.arn
106 | }
107 |
108 | tag_specifications {
109 | resource_type = "instance"
110 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" })
111 | }
112 |
113 | tag_specifications {
114 | resource_type = "volume"
115 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" })
116 | }
117 |
118 | tag_specifications {
119 | resource_type = "network-interface"
120 | tags = merge(local.tags, { Name = "stream-consumer-instance-${local.standard_resource_name}" })
121 | }
122 |
123 | block_device_mappings {
124 | device_name = "/dev/xvda"
125 | ebs {
126 | volume_size = var.volume_size
127 | volume_type = "gp3"
128 | delete_on_termination = true
129 | encrypted = true
130 | kms_key_id = aws_kms_key.this_aws_kms_key.arn
131 | }
132 | }
133 |
134 | metadata_options {
135 | http_endpoint = "enabled"
136 | http_tokens = "required" # Enforces IMDSv2
137 | http_put_response_hop_limit = 1
138 | }
139 |
140 | monitoring {
141 | enabled = true
142 | }
143 | }
144 |
145 | resource "aws_autoscaling_group" "this_aws_autoscaling_group_stream_consumer" {
146 | depends_on = [aws_s3_object.clustering_code] # Code needs to be uploaded to s3 first
147 | name_prefix = "stream-consumer-instance-${local.standard_resource_name}"
148 | launch_template {
149 | id = aws_launch_template.this_aws_launch_template.id
150 | version = "$Latest"
151 | }
152 | vpc_zone_identifier = [module.vpc.private_subnets[0]]
153 | max_size = var.number_of_nodes
154 | min_size = 0
155 | desired_capacity = var.number_of_nodes
156 | service_linked_role_arn = aws_iam_service_linked_role.this_asg_aws_iam_service_linked_role.arn
157 | dynamic "tag" {
158 | for_each = local.tags
159 | iterator = tags
160 | content {
161 | key = tags.key
162 | value = tags.value
163 | propagate_at_launch = true
164 | }
165 | }
166 | }
167 |
--------------------------------------------------------------------------------
/iac/roots/main/embedding_endpoint.tf:
--------------------------------------------------------------------------------
1 | # Create an S3 bucket for storing model artifacts
2 | module "model_bucket" {
3 | source = "../../templates/modules/s3_bucket"
4 | name_prefix = "models-${var.app_name}-${var.env_name}"
5 | log_bucket = module.log_bucket.name
6 | }
7 |
8 | resource "aws_s3_object" "uncompressed_model_artifact" {
9 | bucket = module.model_bucket.name
10 |
11 | for_each = fileset("../../../business_logic/model_artifacts/embedding/model", "**/*.*")
12 | key = "model/${each.value}"
13 | source = "../../../business_logic/model_artifacts/embedding/model/${each.value}"
14 | source_hash = filemd5("../../../business_logic/model_artifacts/embedding/model/${each.value}")
15 | content_type = each.value
16 | force_destroy = true
17 | }
18 |
19 |
20 | # Define IAM Role for SageMaker
21 | resource "aws_iam_role" "sagemaker_execution_role" {
22 |
23 | assume_role_policy = jsonencode({
24 | "Version" : "2012-10-17",
25 | "Statement" : [{
26 | "Effect" : "Allow",
27 | "Principal" : {
28 | "Service" : "sagemaker.amazonaws.com"
29 | },
30 | "Action" : "sts:AssumeRole"
31 | }]
32 | })
33 | }
34 |
35 | resource "aws_iam_policy" "sagemaker_policy" {
36 |
37 | description = "Policy for SageMaker access to S3 and IAM role assumption"
38 |
39 | policy = jsonencode({
40 | Version = "2012-10-17",
41 | Statement = [
42 | {
43 | Effect = "Allow",
44 | Action = ["s3:GetObject", "s3:ListBucket"],
45 | Resource = [module.model_bucket.arn, "${module.model_bucket.arn}/*"],
46 | }
47 | ]
48 | })
49 | }
50 |
51 | # Attach SageMaker permissions to IAM role
52 | resource "aws_iam_policy_attachment" "sagemaker_permissions" {
53 |
54 | name = "sagemaker_permissions"
55 | roles = [aws_iam_role.sagemaker_execution_role.name]
56 | policy_arn = "arn:aws:iam::aws:policy/AmazonSageMakerFullAccess" # Adjust permissions as needed
57 | }
58 |
59 | resource "aws_iam_policy_attachment" "sagemaker_policy_attachment" {
60 |
61 | name = "SageMakerPolicyAttachment"
62 | roles = [aws_iam_role.sagemaker_execution_role.name] # Replace with your IAM role name
63 | policy_arn = aws_iam_policy.sagemaker_policy.arn
64 | }
65 |
66 | # Define SageMaker model
67 | resource "aws_sagemaker_model" "pytorch_model" {
68 | count = var.model_name != "titan" ? 1 : 0
69 |
70 | name = "model-${var.app_name}-${var.env_name}"
71 |
72 | execution_role_arn = aws_iam_role.sagemaker_execution_role.arn
73 | primary_container {
74 | image = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:2.1.0-transformers4.37.0-gpu-py310-cu118-ubuntu20.04"
75 | environment = {
76 | BIT_LOADING = "4"
77 | MODEL_NAME = var.model_name
78 | SAGEMAKER_CONTAINER_LOG_LEVEL = "20"
79 | SAGEMAKER_PROGRAM = "inference.py"
80 | SAGEMAKER_REGION = data.aws_region.current.name
81 | SAGEMAKER_SUBMIT_DIRECTORY = "/opt/ml/model/code"
82 | # Only for Multi-GPU processing (mistral)
83 |       # HF_MODEL_ID = "intfloat/e5-mistral-7b-instruct" # TODO: Parameterize
84 | # PYTORCH_CUDA_ALLOC_CONF = "max_split_size_mb:50"
85 | # SAGEMAKER_MODEL_SERVER_WORKERS = 4
86 | }
87 |
88 | model_data_source {
89 | s3_data_source {
90 | s3_uri = "s3://${module.model_bucket.id}/model/"
91 | s3_data_type = "S3Prefix"
92 | compression_type = "None"
93 | }
94 | }
95 | }
96 |
97 | depends_on = [
98 | aws_s3_object.uncompressed_model_artifact,
99 | module.model_bucket,
100 | aws_iam_role.sagemaker_execution_role
101 | ]
102 | }
103 |
104 | # Create SageMaker endpoint configuration
105 | resource "aws_sagemaker_endpoint_configuration" "pytorch_endpoint_config" {
106 | count = var.model_name != "titan" ? 1 : 0
107 |
108 | #checkov:skip=CKV_AWS_98: "Ensure all data stored in the Sagemaker Endpoint is securely encrypted at rest"
109 | name = "endpoint-config-${var.app_name}-${var.env_name}"
110 | production_variants {
111 | variant_name = "${var.app_name}-${var.env_name}-traffic"
112 | instance_type = var.embedding_endpoint_instance_type
113 | initial_instance_count = var.embedding_endpoint_instance_count
114 | model_name = aws_sagemaker_model.pytorch_model[count.index].name
115 | }
116 | }
117 |
118 | # Create SageMaker endpoint
119 | resource "aws_sagemaker_endpoint" "pytorch_endpoint" {
120 | count = var.model_name != "titan" ? 1 : 0
121 |
122 | name = "endpoint-${var.app_name}-${var.env_name}"
123 | endpoint_config_name = aws_sagemaker_endpoint_configuration.pytorch_endpoint_config[count.index].name
124 | }
125 |
126 | # Auto scaling
127 | # resource "aws_appautoscaling_target" "sagemaker_target" {
128 | # max_capacity = var.max_embedding_instance_count
129 | # min_capacity = var.min_embedding_instance_count
130 | # resource_id = "endpoint/${aws_sagemaker_endpoint.pytorch_endpoint.name}/variant/${var.app_name}-${var.env_name}-traffic"
131 | # scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
132 | # service_namespace = "sagemaker"
133 | # }
134 |
135 | # resource "aws_appautoscaling_policy" "sagemaker_policy" {
136 | # name = "${var.app_name}-${var.env_name}-target-tracking"
137 | # policy_type = "TargetTrackingScaling"
138 | # resource_id = aws_appautoscaling_target.sagemaker_target.resource_id
139 | # scalable_dimension = aws_appautoscaling_target.sagemaker_target.scalable_dimension
140 | # service_namespace = aws_appautoscaling_target.sagemaker_target.service_namespace
141 |
142 | # target_tracking_scaling_policy_configuration {
143 | # predefined_metric_specification {
144 | # predefined_metric_type = "SageMakerVariantInvocationsPerInstance"
145 | # }
146 | # target_value = 3
147 | # scale_in_cooldown = 300
148 | # scale_out_cooldown = 60
149 | # }
150 | # }
151 |
--------------------------------------------------------------------------------
/iac/roots/main/eventbridge.tf:
--------------------------------------------------------------------------------
1 | resource "time_sleep" "wait_30_seconds" {
2 | depends_on = [aws_sqs_queue.dead_letter_queue, aws_kinesis_stream.input_stream, aws_iam_role.cloudwatch_event_role, aws_iam_role_policy.eventbridge_sfn_policy, aws_sfn_state_machine.pre_processing_sfn]
3 |
4 | create_duration = "30s"
5 | }
6 |
7 | #SQS Queue
8 | resource "aws_sqs_queue" "dead_letter_queue" {
9 | name = "dead-letter-pipe-${local.standard_resource_name}"
10 | sqs_managed_sse_enabled = true
11 | }
12 |
13 | # EventBridge pipe used to trigger Step Functions off Kinesis
14 | resource "aws_pipes_pipe" "event_pipe" {
15 | depends_on = [time_sleep.wait_30_seconds, aws_sqs_queue.dead_letter_queue, aws_kinesis_stream.input_stream, aws_iam_role.cloudwatch_event_role, aws_iam_role_policy.eventbridge_sfn_policy, aws_sfn_state_machine.pre_processing_sfn]
16 | name = "event-pipe-${local.standard_resource_name}"
17 | role_arn = aws_iam_role.cloudwatch_event_role.arn
18 | source = aws_kinesis_stream.input_stream.arn
19 | target = aws_sfn_state_machine.pre_processing_sfn.arn
20 |
21 | source_parameters {
22 | kinesis_stream_parameters {
23 | batch_size = 1
24 | parallelization_factor = 1
25 | starting_position = "TRIM_HORIZON"
26 | maximum_retry_attempts = 0
27 | dead_letter_config {
28 | arn = aws_sqs_queue.dead_letter_queue.arn
29 | }
30 | }
31 | }
32 |
33 | target_parameters {
34 | step_function_state_machine_parameters {
35 | invocation_type = "FIRE_AND_FORGET"
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/iac/roots/main/iam.tf:
--------------------------------------------------------------------------------
1 | # IAM Roles
2 | resource "aws_iam_role" "preprocessing_lambda_role" {
3 | name = "preprocessing-lambda_role-${var.app_name}-${var.env_name}"
4 | assume_role_policy = jsonencode({
5 | Version = "2012-10-17",
6 | Statement = [
7 | {
8 | Action = "sts:AssumeRole",
9 | Effect = "Allow",
10 | Principal = {
11 | Service = "lambda.amazonaws.com"
12 | },
13 | }
14 | ],
15 | })
16 | }
17 |
18 | resource "aws_iam_role" "embedding_lambda_role" {
19 | name = "embedding-lambda_role-${var.app_name}-${var.env_name}"
20 | assume_role_policy = jsonencode({
21 | Version = "2012-10-17",
22 | Statement = [
23 | {
24 | Action = "sts:AssumeRole",
25 | Effect = "Allow",
26 | Principal = {
27 | Service = "lambda.amazonaws.com"
28 | },
29 | }
30 | ],
31 | })
32 | }
33 |
34 | resource "aws_iam_role" "step_functions_role" {
35 | name = "step_functions_role-${var.app_name}-${var.env_name}"
36 |
37 | assume_role_policy = jsonencode({
38 | Version = "2012-10-17",
39 | Statement = [
40 | {
41 | Action = "sts:AssumeRole",
42 | Effect = "Allow",
43 | Principal = {
44 | Service = "states.amazonaws.com"
45 | },
46 | },
47 | ],
48 | })
49 | }
50 |
51 | resource "aws_iam_policy" "step_functions_lambda_policy" {
52 |   description = "Policy for Lambda Access"
53 | policy = jsonencode({
54 | Version = "2012-10-17",
55 | Statement = [
56 | {
57 | "Effect" : "Allow",
58 | "Action" : [
59 | "lambda:InvokeFunction"
60 | ],
61 | "Resource" : [
62 | aws_lambda_function.pre_processing_lambda.arn, aws_lambda_function.embedding_lambda.arn
63 | ]
64 | },
65 | ]
66 | })
67 | }
68 |
69 | resource "aws_iam_policy_attachment" "step_functions_lambda_policy_attachment" {
70 | name = "step_functions_lambda_policy_attachment"
71 | roles = [aws_iam_role.step_functions_role.name]
72 | policy_arn = aws_iam_policy.step_functions_lambda_policy.arn
73 | }
74 |
75 |
76 | resource "aws_iam_role" "cloudwatch_event_role" {
77 | name = "cloudwatch_event_role-${var.app_name}-${var.env_name}"
78 |
79 | assume_role_policy = jsonencode({
80 | Version : "2012-10-17",
81 | Statement : [
82 | {
83 | Action : "sts:AssumeRole",
84 | Effect : "Allow",
85 | Principal : {
86 | Service : "pipes.amazonaws.com"
87 | }
88 | }
89 | ]
90 | })
91 | }
92 |
93 | resource "aws_iam_role_policy" "eventbridge_sfn_policy" {
94 | name = "eventbridge_sfn_policy-${var.app_name}-${var.env_name}"
95 | role = aws_iam_role.cloudwatch_event_role.id
96 | policy = jsonencode({
97 | Version = "2012-10-17",
98 | Statement = [
99 | {
100 | Action = ["states:StartExecution"]
101 | Resource = [aws_sfn_state_machine.pre_processing_sfn.arn]
102 | Effect = "Allow"
103 | },
104 | {
105 | Action = ["sqs:SendMessage"]
106 | Resource = [aws_sqs_queue.dead_letter_queue.arn]
107 | Effect = "Allow"
108 | }
109 | ]
110 | }
111 | )
112 | }
113 |
114 | data "aws_iam_policy_document" "eventbridge_kinesis_policy_document" {
115 | statement {
116 | actions = [
117 | "kinesis:DescribeStream",
118 | "kinesis:DescribeStreamSummary",
119 | "kinesis:GetRecords",
120 | "kinesis:GetShardIterator",
121 | "kinesis:ListStreams",
122 | "kinesis:ListShards"
123 | ]
124 | resources = [aws_kinesis_stream.input_stream.arn]
125 | }
126 | }
127 |
128 | resource "aws_iam_role_policy" "eventbridge_kinesis_policy" {
129 | name = "eventbridge_kinesis_policy-${var.app_name}-${var.env_name}"
130 | role = aws_iam_role.cloudwatch_event_role.id
131 | policy = data.aws_iam_policy_document.eventbridge_kinesis_policy_document.json
132 | }
133 |
134 | # Consumer Role
135 | resource "aws_iam_role" "stream_consumer_role" {
136 | name = "stream-consumer-role-${var.app_name}-${var.env_name}"
137 |
138 | assume_role_policy = jsonencode({
139 | Version = "2012-10-17",
140 | Statement = [
141 | {
142 | Action = "sts:AssumeRole",
143 | Effect = "Allow",
144 | Principal = {
145 | Service = "ec2.amazonaws.com"
146 | }
147 | },
148 | ]
149 | })
150 | }
151 |
152 | resource "aws_iam_role_policy" "stream_consumer_policy" {
153 | name = "stream_consumer_policy-${var.app_name}-${var.env_name}"
154 | role = aws_iam_role.stream_consumer_role.id
155 |
156 | policy = jsonencode({
157 | Version = "2012-10-17",
158 | Statement = [
159 | {
160 | Action = [
161 | "s3:GetObject",
162 | "s3:ListBucket",
163 | "s3:DeleteObject",
164 | "s3:GetBucketLocation",
165 | "s3:PutObject"
166 | ],
167 | Effect = "Allow",
168 | Resource = ["${module.cluster_code_bucket.arn}/*", module.cluster_code_bucket.arn]
169 | },
170 | {
171 | "Effect" : "Allow",
172 | "Action" : [
173 | "logs:CreateLogGroup",
174 | "logs:CreateLogStream",
175 | "logs:PutLogEvents",
176 | "logs:DescribeLogStreams"
177 | ],
178 | "Resource" : [
179 | "*"
180 | ]
181 | },
182 | {
183 | Action = [
184 | "dynamodb:PutItem",
185 | "dynamodb:UpdateItem",
186 | "dynamodb:BatchWriteItem",
187 | "dynamodb:BatchGetItem",
188 | "dynamodb:CreateTable",
189 | "dynamodb:DescribeTable",
190 | "dynamodb:GetItem",
191 | "dynamodb:Scan",
192 | ],
193 | Effect = "Allow",
194 | Resource = [
195 | aws_dynamodb_table.cluster_table.arn,
196 | "${aws_dynamodb_table.cluster_table.arn}/*"
197 | ],
198 | },
199 | {
200 | Action = [
201 | "sqs:ReceiveMessage",
202 | "sqs:DeleteMessage",
203 | "sqs:DeleteMessageBatch",
204 | ]
205 | Effect = "Allow"
206 | Resource = aws_sqs_queue.tags.arn
207 | },
208 | {
209 | "Effect" : "Allow",
210 | "Action" : [
211 | "ssm:DescribeAssociation",
212 | "ssm:GetDeployablePatchSnapshotForInstance",
213 | "ssm:GetDocument",
214 | "ssm:DescribeDocument",
215 | "ssm:GetManifest",
216 | "ssm:GetParameter",
217 | "ssm:GetParameters",
218 | "ssm:ListAssociations",
219 | "ssm:ListInstanceAssociations",
220 | "ssm:PutInventory",
221 | "ssm:PutComplianceItems",
222 | "ssm:PutConfigurePackageResult",
223 | "ssm:UpdateAssociationStatus",
224 | "ssm:UpdateInstanceAssociationStatus",
225 | "ssm:UpdateInstanceInformation"
226 | ],
227 | "Resource" : "*"
228 | },
229 | {
230 | "Effect" : "Allow",
231 | "Action" : [
232 | "ssmmessages:CreateControlChannel",
233 | "ssmmessages:CreateDataChannel",
234 | "ssmmessages:OpenControlChannel",
235 | "ssmmessages:OpenDataChannel"
236 | ],
237 | "Resource" : "*"
238 | },
239 | {
240 | "Effect" : "Allow",
241 | "Action" : [
242 | "ec2messages:AcknowledgeMessage",
243 | "ec2messages:DeleteMessage",
244 | "ec2messages:FailMessage",
245 | "ec2messages:GetEndpoint",
246 | "ec2messages:GetMessages",
247 | "ec2messages:SendReply"
248 | ],
249 | "Resource" : "*"
250 | }
251 | ]
252 | })
253 | }
254 |
255 | resource "aws_iam_policy" "lambda_execution_policy" {
256 | #checkov:skip=CKV_AWS_355: "Ensure no IAM policies documents allow "*" as a statement's resource for restrictable actions"
257 | #checkov:skip=CKV_AWS_290: "Ensure IAM policies does not allow write access without constraints"
258 | description = "Policy for Lambda Execution"
259 | policy = jsonencode({
260 | Version = "2012-10-17",
261 | Statement = [
262 | {
263 | "Effect" : "Allow",
264 | "Action" : [
265 | "logs:CreateLogGroup",
266 | "logs:CreateLogStream",
267 | "logs:PutLogEvents"
268 | ],
269 | "Resource" : "arn:aws:logs:*:*:*"
270 | },
271 | {
272 |         "Sid" : "AllowModelInvocation",
273 | "Effect" : "Allow",
274 | "Action" : [
275 | "bedrock:InvokeModel"
276 | ],
277 | "Resource" : "*"
278 | },
279 | {
280 | Effect = "Allow",
281 | Action = [
282 | "ecr:GetAuthorizationToken",
283 | "ec2:CreateNetworkInterface",
284 | "ec2:DescribeNetworkInterfaces",
285 | "ec2:DeleteNetworkInterface",
286 | "ec2:AssignPrivateIpAddresses",
287 | "ec2:UnassignPrivateIpAddresses"
288 | ],
289 | Resource = "*"
290 | },
291 | {
292 | "Sid" : "ECRGrantsToConnectAndDownload",
293 | "Effect" : "Allow",
294 | "Action" : [
295 | "ecr:BatchCheckLayerAvailability",
296 | "ecr:BatchGetImage",
297 | "ecr:GetDownloadUrlForLayer"
298 | ],
299 | "Resource" : "arn:aws:ecr:*:*:repository/*"
300 | },
301 | {
302 |         "Sid" : "AccessToEncryptAndDecryptKMSKeys",
303 | "Effect" : "Allow",
304 | "Action" : [
305 | "kms:Decrypt",
306 | "kms:DescribeKey",
307 | "kms:Encrypt",
308 | "kms:GenerateDataKey",
309 | "kms:GetKeyPolicy",
310 | "kms:GetKeyRotationStatus",
311 | "kms:ListGrants",
312 | "kms:ListKeys",
313 | "kms:ListAliases",
314 | "kms:ListKeyPolicies",
315 | "kms:ListResourceTags",
316 | "kms:ListRetirableGrants",
317 | "kms:ReEncryptTo"
318 | ],
319 | "Resource" : [
320 | aws_kms_key.this_aws_kms_key.arn
321 | ]
322 | },
323 | ]
324 | })
325 | }
326 |
327 | resource "aws_iam_policy" "lambda_kinesis_policy" {
328 | description = "Policy for Kinesis Stream Access"
329 | policy = jsonencode({
330 | Version = "2012-10-17",
331 | Statement = [
332 | {
333 | "Effect" : "Allow",
334 | "Action" : [
335 | "kinesis:GetShardIterator",
336 | "kinesis:GetRecords"
337 | ],
338 | "Resource" : [
339 | aws_kinesis_stream.input_stream.arn
340 | ]
341 | },
342 | ]
343 | })
344 | }
345 |
346 | resource "aws_iam_policy" "lambda_sagemaker_policy" {
347 | description = "Policy for Sagemaker Endpoint Access"
348 | policy = jsonencode({
349 | Version = "2012-10-17",
350 | Statement = [
351 | {
352 | "Effect" : "Allow",
353 | "Action" : [
354 |           "sagemaker:InvokeEndpoint"
355 | ],
356 | "Resource" : [
357 | var.model_name != "titan" ? aws_sagemaker_endpoint.pytorch_endpoint[0].arn : "arn:aws:sagemaker:us-west-2:123456789012:endpoint/dummy-endpoint" # Generate a dummy arn if we aren't using ours
358 | ]
359 | },
360 | ]
361 | })
362 | }
363 |
364 | resource "aws_iam_policy" "lambda_sqs_policy" {
365 |   description = "Policy for SQS Queue Access"
366 | policy = jsonencode({
367 | Version = "2012-10-17",
368 | Statement = [
369 | {
370 | "Effect" : "Allow",
371 | "Action" : [
372 | "sqs:SendMessage"
373 | ],
374 | "Resource" : [
375 | aws_sqs_queue.tags.arn
376 | ]
377 | },
378 | ]
379 | })
380 | }
381 |
382 | resource "aws_iam_policy" "lambda_s3_policy" {
383 | description = "Policy for S3 Access"
384 | policy = jsonencode({
385 | Version = "2012-10-17",
386 | Statement = [
387 | {
388 | "Effect" : "Allow",
389 | "Action" : [
390 | "s3:GetObject",
391 | "s3:ListBucket",
392 | "s3:PutObject"
393 | ],
394 | "Resource" : [
395 | module.preprocess_data_bucket.arn,
396 | "${module.preprocess_data_bucket.arn}/*",
397 | module.embedding_data_bucket.arn,
398 | "${module.embedding_data_bucket.arn}/*",
399 | ]
400 | },
401 | ]
402 | })
403 | }
404 |
405 | resource "aws_iam_policy_attachment" "lambda_execution_policy_attachment" {
406 | name = "lambda_execution_policy_attachment"
407 | roles = [aws_iam_role.summarization_lambda_role.name, aws_iam_role.trigger_sfn_lambda_role.name, aws_iam_role.preprocessing_lambda_role.name, aws_iam_role.embedding_lambda_role.name, aws_iam_role.step_functions_role.name]
408 | policy_arn = aws_iam_policy.lambda_execution_policy.arn
409 | }
410 |
411 | resource "aws_iam_policy_attachment" "lambda_kinesis_policy_attachment" {
412 | name = "lambda_kinesis_policy_attachment"
413 | roles = [aws_iam_role.preprocessing_lambda_role.name]
414 | policy_arn = aws_iam_policy.lambda_kinesis_policy.arn
415 | }
416 |
417 | resource "aws_iam_policy_attachment" "lambda_sagemaker_policy_attachment" {
418 | name = "lambda_sagemaker_policy_attachment"
419 | roles = [aws_iam_role.embedding_lambda_role.name]
420 | policy_arn = aws_iam_policy.lambda_sagemaker_policy.arn
421 | }
422 |
423 | resource "aws_iam_policy_attachment" "lambda_s3_policy_attachment" {
424 | name = "lambda_s3_policy_attachment"
425 | roles = [aws_iam_role.embedding_lambda_role.name, aws_iam_role.preprocessing_lambda_role.name]
426 | policy_arn = aws_iam_policy.lambda_s3_policy.arn
427 | }
428 |
429 | resource "aws_iam_policy_attachment" "lambda_sqs_policy_attachment" {
430 | name = "lambda_sqs_policy_attachment"
431 | roles = [aws_iam_role.embedding_lambda_role.name, aws_iam_role.step_functions_role.name]
432 | policy_arn = aws_iam_policy.lambda_sqs_policy.arn
433 | }
434 |
435 | resource "aws_iam_role" "summarization_lambda_role" {
436 | name = "summarization-role-${var.app_name}-${var.env_name}"
437 |
438 | assume_role_policy = jsonencode({
439 | Version = "2012-10-17",
440 | Statement = [
441 | {
442 | Action = "sts:AssumeRole",
443 | Effect = "Allow",
444 | Principal = {
445 | Service = "lambda.amazonaws.com"
446 | },
447 | },
448 | ],
449 | })
450 | }
451 |
452 | resource "aws_iam_role_policy" "summarization_policy" {
453 | #checkov:skip=CKV_AWS_355: "Ensure no IAM policies documents allow "*" as a statement's resource for restrictable actions"
454 | #checkov:skip=CKV_AWS_290: "Ensure IAM policies does not allow write access without constraints"
455 | #checkov:skip=CKV_AWS_355: "Ensure no IAM policies documents allow "*" as a statement's resource for restrictable actions"
456 | name = "summarization-policy-${var.app_name}-${var.env_name}"
457 | role = aws_iam_role.summarization_lambda_role.id
458 |
459 | policy = jsonencode({
460 | Version = "2012-10-17",
461 | Statement = [
462 | {
463 | Action = [
464 | "dynamodb:Query",
465 | ],
466 | Resource = [
467 | aws_dynamodb_table.cluster_table.arn,
468 | "${aws_dynamodb_table.cluster_table.arn}/*"
469 | ],
470 | Effect = "Allow",
471 | },
472 | {
473 | Action = "bedrock:InvokeModel",
474 | Resource = "*",
475 | Effect = "Allow",
476 | },
477 | {
478 | Action = "logs:*",
479 | Resource = "arn:aws:logs:${local.region}:${local.account_id}:*",
480 | Effect = "Allow",
481 | },
482 | ],
483 | })
484 | }
485 |
486 | resource "aws_iam_role" "summary_sfn_exec_role" {
487 | name = "summary_sfn_exec_role-${var.app_name}-${var.env_name}"
488 |
489 | assume_role_policy = jsonencode({
490 | Version = "2012-10-17",
491 | Statement = [
492 | {
493 | Action = "sts:AssumeRole",
494 | Effect = "Allow",
495 | Principal = {
496 | Service = "states.amazonaws.com"
497 | },
498 | },
499 | ],
500 | })
501 | }
502 |
503 | # IAM Policy for Step Functions to write to DynamoDB
504 | resource "aws_iam_role_policy" "summary_sfn_exec_policy" {
505 | name = "summary_sfn_exec_policy-${var.app_name}-${var.env_name}"
506 | role = aws_iam_role.summary_sfn_exec_role.id
507 |
508 | policy = jsonencode({
509 | Version = "2012-10-17",
510 | Statement = [
511 | {
512 | Action = [
513 | "dynamodb:PutItem",
514 | "dynamodb:UpdateItem",
515 | "dynamodb:DeleteItem"
516 | ],
517 | Effect = "Allow",
518 | Resource = [
519 | aws_dynamodb_table.cluster_table.arn,
520 | "${aws_dynamodb_table.cluster_table.arn}/*"
521 | ],
522 | },
523 | {
524 | Action = [
525 | "lambda:InvokeFunction"
526 | ],
527 | Effect = "Allow",
528 | Resource = [aws_lambda_function.summarization_function.arn]
529 | },
530 | {
531 | Action : [
532 | "xray:GetSamplingRules",
533 | "xray:GetSamplingTargets",
534 | "xray:PutTelemetryRecords",
535 | "xray:PutTraceSegments"
536 | ],
537 | Resource : "*",
538 | Effect : "Allow"
539 | }
540 | ]
541 | })
542 | }
543 |
544 | resource "aws_iam_role" "trigger_sfn_lambda_role" {
545 | name = "triggers-sfn-role-${var.app_name}-${var.env_name}"
546 |
547 | assume_role_policy = jsonencode({
548 | Version = "2012-10-17",
549 | Statement = [
550 | {
551 | Action = "sts:AssumeRole",
552 | Effect = "Allow",
553 | Principal = {
554 | Service = "lambda.amazonaws.com"
555 | },
556 | },
557 | ],
558 | })
559 | }
560 |
561 | resource "aws_iam_role_policy" "trigger_sfn_policy" {
562 | name = "trigger-sfn-policy-${var.app_name}-${var.env_name}"
563 | role = aws_iam_role.trigger_sfn_lambda_role.id
564 |
565 | policy = jsonencode({
566 | Version = "2012-10-17",
567 | Statement = [
568 | {
569 | Action = [
570 | "states:StartExecution",
571 | ],
572 | Resource = aws_sfn_state_machine.summary_sfn.arn,
573 | Effect = "Allow",
574 | },
575 | {
576 | Action = [
577 | "dynamodb:GetItem",
578 | "dynamodb:Query",
579 | "dynamodb:PutItem",
580 | "dynamodb:UpdateItem",
581 | "dynamodb:DeleteItem",
582 | "dynamodb:GetRecords",
583 | "dynamodb:GetShardIterator",
584 | "dynamodb:DescribeStream",
585 | "dynamodb:ListStreams"
586 | ],
587 | Resource = [
588 | aws_dynamodb_table.cluster_table.arn,
589 | "${aws_dynamodb_table.cluster_table.arn}/*"
590 | ],
591 | Effect = "Allow",
592 | },
593 | {
594 | Action = "logs:*",
595 | Resource = "arn:aws:logs:${local.region}:${local.account_id}:*",
596 | Effect = "Allow",
597 | },
598 | ],
599 | })
600 | }
601 |
602 | resource "aws_iam_service_linked_role" "this_asg_aws_iam_service_linked_role" {
603 | aws_service_name = "autoscaling.amazonaws.com"
604 | custom_suffix = local.standard_resource_name
605 | description = "A service linked role for autoscaling to use to call other AWS services"
606 | tags = local.tags
607 | }
608 |
--------------------------------------------------------------------------------
/iac/roots/main/kms.tf:
--------------------------------------------------------------------------------
1 | # Create KMS Key and allow the use of it
2 | resource "aws_kms_key" "this_aws_kms_key" {
3 | description = "clustering-summarization-${local.standard_resource_name}"
4 | deletion_window_in_days = 30
5 | multi_region = true
6 | enable_key_rotation = true
7 | tags = merge(local.tags)
8 | }
9 |
10 | resource "aws_kms_key_policy" "this_aws_kms_key_policy" {
11 | key_id = aws_kms_key.this_aws_kms_key.key_id
12 | policy = jsonencode({
13 | "Version" : "2012-10-17",
14 | "Id" : "key-default-1",
15 | "Statement" : [
16 | {
17 | "Sid" : "Enable IAM User Permissions",
18 | "Effect" : "Allow",
19 | "Principal" : {
20 | "AWS" : "arn:aws:iam::${local.account_id}:root"
21 | },
22 | "Action" : "kms:*",
23 | "Resource" : "*"
24 | },
25 | {
26 | "Effect" : "Allow",
27 | "Principal" : {
28 | "Service" : "logs.${local.region}.amazonaws.com"
29 | },
30 | "Action" : [
31 | "kms:Encrypt*",
32 | "kms:Decrypt*",
33 | "kms:ReEncrypt*",
34 | "kms:GenerateDataKey*",
35 | "kms:Describe*"
36 | ],
37 | "Resource" : "*",
38 | "Condition" : {
39 | "ArnEquals" : {
40 | "kms:EncryptionContext:aws:logs:arn" : "arn:aws:logs:${local.region}:${local.account_id}:log-group:*${local.standard_resource_name}*"
41 | }
42 | }
43 | },
44 | {
45 | "Sid" : "Allow service-linked role use of the customer managed key",
46 | "Effect" : "Allow",
47 | "Principal" : {
48 | "AWS" : aws_iam_service_linked_role.this_asg_aws_iam_service_linked_role.arn
49 | },
50 | "Action" : [
51 | "kms:Encrypt",
52 | "kms:Decrypt",
53 | "kms:ReEncrypt*",
54 | "kms:GenerateDataKey*",
55 | "kms:DescribeKey"
56 | ],
57 | "Resource" : "*"
58 | },
59 | {
60 | "Sid" : "Allow attachment of persistent resources",
61 | "Effect" : "Allow",
62 | "Principal" : {
63 | "AWS" : aws_iam_service_linked_role.this_asg_aws_iam_service_linked_role.arn
64 | },
65 | "Action" : "kms:CreateGrant",
66 | "Resource" : "*",
67 | "Condition" : {
68 | "Bool" : {
69 | "kms:GrantIsForAWSResource" : "true"
70 | }
71 | }
72 | }
73 | ]
74 | })
75 | }
76 |
77 | resource "aws_kms_alias" "this_aws_kms_alias" {
78 | name = "alias/clustering-summarization-${local.standard_resource_name}"
79 | target_key_id = aws_kms_key.this_aws_kms_key.key_id
80 | }
--------------------------------------------------------------------------------
/iac/roots/main/lambda.tf:
--------------------------------------------------------------------------------
1 | # Lambda functions
2 |
3 | module "pre_process_docs_ecr" {
4 | source = "../../templates/modules/ecr"
5 | region = local.region
6 | ecr_name = "pre-process-docs-${local.standard_resource_name}"
7 | build_script_path = "${path.module}/${var.build_script_path}"
8 | business_logic_path = "${path.module}/${var.lambda_code_path}/pre_process_docs/"
9 | tags = local.tags
10 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn
11 | ecr_count_number = 2
12 | ecr_base_arn = local.ecr_base_arn
13 | }
14 |
15 | resource "aws_lambda_function" "pre_processing_lambda" {
16 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)"
17 | #checkov:skip=CKV_AWS_173: "Check encryption settings for Lambda environmental variable"
18 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC"
19 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing"
20 | description = "Executes the pre-process_docs-${local.standard_resource_name} Function"
21 | function_name = "pre-process-docs-${local.standard_resource_name}"
22 | role = aws_iam_role.preprocessing_lambda_role.arn
23 | timeout = 300 # Timeout in seconds (5 minutes)
24 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn
25 | image_uri = module.pre_process_docs_ecr.latest_image_uri
26 | package_type = "Image"
27 | tags = local.tags
28 | reserved_concurrent_executions = -1
29 | # vpc_config {
30 | # # If the list of security group ids and subnets are empty,
31 | # # this property is effectively ignored
32 | # subnet_ids = [aws_subnet.subnet.id]
33 | # security_group_ids = [aws_security_group.sg.id]
34 | # }
35 |
36 | tracing_config {
37 | mode = "Active"
38 | }
39 |
40 | environment {
41 | variables = {
42 | PREPROCESS_BUCKET = module.preprocess_data_bucket.name
43 | }
44 | }
45 | }
46 |
47 | module "embedding_lambda_ecr" {
48 | source = "../../templates/modules/ecr"
49 | region = local.region
50 | ecr_name = "embed-docs-${local.standard_resource_name}"
51 | build_script_path = "${path.module}/${var.build_script_path}"
52 | business_logic_path = "${path.module}/${var.lambda_code_path}/embed_docs/"
53 | tags = local.tags
54 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn
55 | ecr_count_number = 2
56 | ecr_base_arn = local.ecr_base_arn
57 | }
58 |
59 | resource "aws_lambda_function" "embedding_lambda" {
60 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)"
61 | #checkov:skip=CKV_AWS_173: "Check encryption settings for Lambda environmental variable"
62 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC"
63 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing"
64 | description = "Executes the embed-docs-${local.standard_resource_name} Function"
65 | function_name = "embed-docs-${local.standard_resource_name}"
66 | role = aws_iam_role.embedding_lambda_role.arn
67 | timeout = 300 # Timeout in seconds (5 minutes)
68 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn
69 | image_uri = module.embedding_lambda_ecr.latest_image_uri
70 | package_type = "Image"
71 | tags = local.tags
72 | reserved_concurrent_executions = -1
73 | # vpc_config {
74 | # # If the list of security group ids and subnets are empty,
75 | # # this property is effectively ignored
76 | # subnet_ids = [aws_subnet.subnet.id]
77 | # security_group_ids = [aws_security_group.sg.id]
78 | # }
79 |
80 | tracing_config {
81 | mode = "Active"
82 | }
83 |
84 | environment {
85 | variables = {
86 | EMBEDDING_ENDPOINT_NAME = var.model_name != "titan" ? aws_sagemaker_endpoint.pytorch_endpoint[0].name : ""
87 | MAX_LENGTH = var.max_length_embedding
88 | SQS_QUEUE_URL = aws_sqs_queue.tags.url
89 | PREPROCESS_BUCKET = module.preprocess_data_bucket.name
90 | EMBEDDING_BUCKET = module.embedding_data_bucket.name
91 | MAX_ARTICLES = var.max_articles_embedding_endpoint
92 | EMBEDDING_MODEL = var.model_name
93 | }
94 | }
95 | }
96 |
97 | module "trigger_sfn_ecr" {
98 | source = "../../templates/modules/ecr"
99 | region = local.region
100 | ecr_name = "trigger-sfn-${local.standard_resource_name}"
101 | build_script_path = "${path.module}/${var.build_script_path}"
102 | business_logic_path = "${path.module}/${var.lambda_code_path}/trigger_sfn/"
103 | tags = local.tags
104 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn
105 | ecr_count_number = 2
106 | ecr_base_arn = local.ecr_base_arn
107 | }
108 |
109 | resource "aws_lambda_function" "trigger_sfn_function" {
110 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)"
111 | #checkov:skip=CKV_AWS_173: "Check encryption settings for Lambda environmental variable"
112 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC"
113 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing"
114 | description = "Executes the trigger-sfn-${local.standard_resource_name} Function"
115 | function_name = "trigger-sfn-${local.standard_resource_name}"
116 | role = aws_iam_role.trigger_sfn_lambda_role.arn
117 | timeout = 30
118 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn
119 | image_uri = module.trigger_sfn_ecr.latest_image_uri
120 | package_type = "Image"
121 | tags = local.tags
122 |
123 | reserved_concurrent_executions = -1
124 | # vpc_config {
125 | # # If the list of security group ids and subnets are empty,
126 | # # this property is effectively ignored
127 | # subnet_ids = [aws_subnet.subnet.id]
128 | # security_group_ids = [aws_security_group.sg.id]
129 | # }
130 |
131 | tracing_config {
132 | mode = "Active"
133 | }
134 |
135 | environment {
136 | variables = {
137 | STATE_MACHINE_ARN = aws_sfn_state_machine.summary_sfn.arn
138 | ARTICLES_THRESHOLD = 5
139 | DYNAMODB_TABLE_NAME = aws_dynamodb_table.cluster_table.name
140 | }
141 | }
142 | }
143 |
144 | module "summarization_function_ecr" {
145 | source = "../../templates/modules/ecr"
146 | region = local.region
147 | ecr_name = "summarization-function-docs-${local.standard_resource_name}"
148 | build_script_path = "${path.module}/${var.build_script_path}"
149 | business_logic_path = "${path.module}/${var.lambda_code_path}/summarization/"
150 | tags = local.tags
151 | aws_kms_key_arn = aws_kms_key.this_aws_kms_key.arn
152 | ecr_count_number = 2
153 | ecr_base_arn = local.ecr_base_arn
154 | }
155 |
156 | resource "aws_lambda_function" "summarization_function" {
157 | #checkov:skip=CKV_AWS_116: "Ensure that AWS Lambda function is configured for a Dead Letter Queue(DLQ)"
158 | #checkov:skip=CKV_AWS_173: "Check encryption settings for Lambda environmental variable"
159 | #checkov:skip=CKV_AWS_272: "Ensure AWS Lambda function is configured to validate code-signing"
160 | #checkov:skip=CKV_AWS_117: "Ensure that AWS Lambda function is configured inside a VPC"
161 | description = "Executes the summarization-function-${local.standard_resource_name} Function"
162 | function_name = "summarization-function-${local.standard_resource_name}"
163 | role = aws_iam_role.summarization_lambda_role.arn
164 | timeout = 30
165 | kms_key_arn = aws_kms_key.this_aws_kms_key.arn
166 | image_uri = module.summarization_function_ecr.latest_image_uri
167 | package_type = "Image"
168 | tags = local.tags
169 | reserved_concurrent_executions = -1
170 |
171 | tracing_config {
172 | mode = "Active"
173 | }
174 |
175 | # vpc_config {
176 | # # If the list of security group ids and subnets are empty,
177 | # # this property is effectively ignored
178 | # subnet_ids = [aws_subnet.subnet.id]
179 | # security_group_ids = [aws_security_group.sg.id]
180 | # }
181 |
182 | environment {
183 | variables = {
184 | DYNAMODB_TABLE_NAME = aws_dynamodb_table.cluster_table.name
185 | MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0"
186 | }
187 | }
188 | }
189 |
--------------------------------------------------------------------------------
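The trigger-sfn function above only gets its wiring from Terraform (STATE_MACHINE_ARN, ARTICLES_THRESHOLD, DYNAMODB_TABLE_NAME); the handler itself lives in business_logic/lambdas/trigger_sfn/trigger_sfn.py and is not reproduced in this listing. As a rough, illustrative sketch of how a handler could use those variables (the event shape and field names here are assumptions, not the project's actual code):

```python
# Illustrative sketch only -- the real handler lives in
# business_logic/lambdas/trigger_sfn/trigger_sfn.py and is not shown here.
# The event shape ("clusters" with "cluster_id"/"article_count") is an assumption.
import json
import os

import boto3

sfn = boto3.client("stepfunctions")

STATE_MACHINE_ARN = os.environ["STATE_MACHINE_ARN"]
ARTICLES_THRESHOLD = int(os.environ["ARTICLES_THRESHOLD"])
# DYNAMODB_TABLE_NAME is also available if the handler needs to look clusters up itself.


def handler(event, context):
    """Start one summarization execution per cluster that has grown large enough."""
    started = []
    for cluster in event.get("clusters", []):
        if cluster.get("article_count", 0) < ARTICLES_THRESHOLD:
            continue
        response = sfn.start_execution(
            stateMachineArn=STATE_MACHINE_ARN,
            # The state machine defined below expects cluster_id in its input.
            input=json.dumps({"cluster_id": cluster["cluster_id"]}),
        )
        started.append(response["executionArn"])
    return {"started_executions": started}
```

Whatever the real handler does, the essential contract is that the execution input carries the cluster_id consumed by the Step Functions definition in summarization_pipeline.tf below.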
/iac/roots/main/outputs.tf:
--------------------------------------------------------------------------------
1 | output "sample_user_creds" {
2 | description = "Sample User Credentials"
3 | value = var.cognito_users
4 | }
5 |
6 | output "dns_record_for_application" {
7 | description = "DNS Address to Access the Application"
8 | value = "https://${aws_alb.this_aws_alb_front_end.dns_name}"
9 | }
--------------------------------------------------------------------------------
/iac/roots/main/summarization_pipeline.tf:
--------------------------------------------------------------------------------
1 | resource "aws_sfn_state_machine" "summary_sfn" {
2 | #checkov:skip=CKV_AWS_285: "Ensure State Machine has execution history logging enabled"
3 |
4 | name = "summary-sfn-${var.app_name}-${var.env_name}"
5 | role_arn = aws_iam_role.summary_sfn_exec_role.arn
6 |
7 | tracing_configuration {
8 | enabled = true
9 | }
10 | definition = jsonencode({
11 | Comment = "An example state machine that invokes a Lambda function and updates DynamoDB.",
12 | StartAt = "SummarizeCluster",
13 | States = {
14 | SummarizeCluster = {
15 | Type = "Task",
16 | Resource = "arn:aws:states:::lambda:invoke",
17 | Parameters = {
18 | FunctionName = aws_lambda_function.summarization_function.arn,
19 | "Payload.$" = "$"
20 | },
21 | ResultPath = "$.LambdaOutput",
22 | Next = "UpdateDynamoDB"
23 | },
24 | UpdateDynamoDB = {
25 | Type = "Task",
26 | Resource = "arn:aws:states:::dynamodb:updateItem",
27 | Parameters = {
28 | TableName = aws_dynamodb_table.cluster_table.id
29 | Key = {
30 | "PK" : {
31 | "S.$" : "$.cluster_id"
32 | },
33 | "SK" : {
34 | "S.$" : "States.Format('#METADATA#{}', $.cluster_id)"
35 | }
36 | },
37 | "UpdateExpression" : "SET #description = :description_val, #generated_summary = :generated_summary_val, #summary_count = :summary_count_val, #most_common_location = :most_common_location_val, #most_common_organization = :most_common_organization_val, #earliest_date = :earliest_date_val, #latest_date = :latest_date_val",
38 | "ExpressionAttributeNames" : {
39 | "#description" : "description",
40 | "#generated_summary" : "generated_summary",
41 | "#summary_count" : "summary_count",
42 | "#most_common_location" : "most_common_location",
43 | "#most_common_organization" : "most_common_organization",
44 | "#earliest_date" : "earliest_date",
45 | "#latest_date" : "latest_date"
46 | },
47 | "ExpressionAttributeValues" : {
48 | ":description_val" : { "S.$" : "$.LambdaOutput.Payload.title" },
49 | ":generated_summary_val" : { "S.$" : "$.LambdaOutput.Payload.summary" },
50 | ":summary_count_val" : { "N.$" : "States.Format('{}', $.LambdaOutput.Payload.summary_count)" }, // Convert to a string
51 | ":most_common_location_val" : { "S.$" : "$.LambdaOutput.Payload.most_common_location" },
52 | ":most_common_organization_val" : { "S.$" : "$.LambdaOutput.Payload.most_common_organization" },
53 | ":earliest_date_val" : { "S.$" : "$.LambdaOutput.Payload.earliest_date" },
54 | ":latest_date_val" : { "S.$" : "$.LambdaOutput.Payload.latest_date" }
55 | }
56 | },
57 | End = true
58 | }
59 | }
60 | })
61 | }
62 |
--------------------------------------------------------------------------------
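Read together, the two states above define an implicit contract: the execution input must contain a cluster_id, and the summarization Lambda's Payload must return title, summary, summary_count, most_common_location, most_common_organization, earliest_date and latest_date, since UpdateDynamoDB maps exactly those fields into the cluster's #METADATA# item. A minimal sketch of that contract (the ARN and values are placeholders; executions are normally started by the trigger Lambda, not by hand):

```python
# Sketch of the input/output contract implied by the state machine definition above.
# The field names come from the ASL itself; the ARN and example values are placeholders.
import json

import boto3

sfn = boto3.client("stepfunctions")

# Execution input: the cluster's PK, also used to build the "#METADATA#" SK.
execution_input = {"cluster_id": "example-cluster-id"}

# The SummarizeCluster Lambda is expected to return a Payload shaped like this;
# UpdateDynamoDB maps each field onto the cluster's metadata item.
expected_payload = {
    "title": "...",
    "summary": "...",
    "summary_count": 7,  # formatted to a string by States.Format before the update
    "most_common_location": "...",
    "most_common_organization": "...",
    "earliest_date": "...",
    "latest_date": "...",
}

response = sfn.start_execution(
    stateMachineArn="arn:aws:states:REGION:ACCOUNT:stateMachine:summary-sfn-...",
    input=json.dumps(execution_input),
)
print(response["executionArn"])
```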
/iac/roots/main/templates/ClusterList-js.template:
--------------------------------------------------------------------------------
1 | // src/components/ClusterList.js
2 | import React, { useState, useEffect, useRef } from "react";
3 | import AWS from "aws-sdk";
4 | import {
5 | Button,
6 | Table,
7 | Box,
8 | ProgressBar,
9 | SpaceBetween,
10 | } from "@cloudscape-design/components";
11 | import { fetchAuthSession } from "@aws-amplify/auth";
12 | import ClusterModal from "./ClusterModal";
13 | import awsConfig from "../aws-exports";
14 |
15 | const refreshInterval = 5000;
16 |
17 | const ClusterList = () => {
18 | const [clusters, setClusters] = useState([]);
19 | const [selectedCluster, setSelectedCluster] = useState(null);
20 | const [totalArticles, setTotalArticles] = useState(0);
21 | const [isModalVisible, setModalVisible] = useState(false);
22 | const [progress, setProgress] = useState(0); // Initialize progress at 0%
23 | const [secondsRemaining, setSecondsRemaining] = useState(
24 | refreshInterval / 1000
25 | ); // Initialize countdown
26 |
27 | const dynamoDbRef = useRef();
28 |
29 | useEffect(() => {
30 | const configureAWS = async () => {
31 | const session = await fetchAuthSession();
32 | const { accessKeyId, secretAccessKey, sessionToken } =
33 | session.credentials;
34 | AWS.config.update({
35 | region: awsConfig.aws_cognito_region,
36 | credentials: new AWS.Credentials(
37 | accessKeyId,
38 | secretAccessKey,
39 | sessionToken
40 | ),
41 | });
42 | dynamoDbRef.current = new AWS.DynamoDB.DocumentClient();
43 | fetchClusters();
44 | };
45 | configureAWS();
46 | }, []);
47 |
48 | useEffect(() => {
49 | const intervalId = setInterval(() => {
50 | fetchClusters();
51 | }, refreshInterval);
52 |
53 | const progressId = setInterval(() => {
54 | setProgress(
55 | (prevProgress) => (prevProgress + (1000 / refreshInterval) * 100) % 100
56 | );
57 | setSecondsRemaining((prevSeconds) =>
58 | prevSeconds <= 1 ? refreshInterval / 1000 : prevSeconds - 1
59 | );
60 | }, 1000);
61 |
62 | return () => {
63 | clearInterval(intervalId);
64 | clearInterval(progressId);
65 | };
66 | }, []);
67 |
68 | const fetchClusters = async () => {
69 | if (!dynamoDbRef.current) {
70 | console.log("DynamoDB client not initialized");
71 | return;
72 | }
73 | let lastEvaluatedKey = null;
74 | const allItems = [];
75 | let articlesCount = 0;
76 | const params = {
77 | TableName: "${DYNAMODB_TABLE_NAME}",
78 | };
79 |
80 | do {
81 | if (lastEvaluatedKey) {
82 | params.ExclusiveStartKey = lastEvaluatedKey;
83 | }
84 | const data = await dynamoDbRef.current.scan(params).promise();
85 | allItems.push(...data.Items);
86 | lastEvaluatedKey = data.LastEvaluatedKey;
87 | } while (lastEvaluatedKey);
88 |
89 | const articlesByCluster = allItems.reduce((acc, item) => {
90 | if (item.is_cluster) {
91 | acc[item.PK] = acc[item.PK] || [];
92 | } else if (item.SK.startsWith("ARTICLE#")) {
93 | if (item.publication_date) {
94 | articlesCount++;
95 | if (acc[item.PK]) {
96 | acc[item.PK].push(item);
97 | }
98 | }
99 | }
100 | return acc;
101 | }, {});
102 |
103 | const newClusters = allItems
104 | .filter(
105 | (item) =>
106 | item.is_cluster &&
107 | item.generated_summary &&
108 | articlesByCluster[item.PK] &&
109 | articlesByCluster[item.PK].length > 2
110 | )
111 | .map((cluster) => ({
112 | ...cluster,
113 | articles: articlesByCluster[cluster.PK],
114 | number_of_articles: articlesByCluster[cluster.PK].length,
115 | }))
116 | .sort((a, b) => b.number_of_articles - a.number_of_articles);
117 |
118 | setClusters(newClusters);
119 | setTotalArticles(articlesCount);
120 | };
121 |
122 | const handleViewArticles = (cluster) => {
123 | console.log("Opening modal for cluster:", cluster.PK);
124 | setSelectedCluster(cluster);
125 | setModalVisible(true); // Set the modal to be visible
126 | };
127 |
128 | const wrapStyleSummary = {
129 | whiteSpace: "normal", // Allow the text to wrap to the next line
130 | wordBreak: "break-word", // Ensure words break correctly at the end of the line
131 | maxWidth: "600px", // Set a maximum width for the cell content
132 | textAlign: "justify", // Center the text
133 | };
134 |
135 | const wrapStyleTitle = {
136 | whiteSpace: "normal", // Allow the text to wrap to the next line
137 | wordBreak: "break-word", // Ensure words break correctly at the end of the line
138 | maxWidth: "150px", // Set a maximum width for the cell content
139 | textAlign: "center",
140 | };
141 |
142 | const wrapStyleNumberOfArticles = {
143 | whiteSpace: "normal", // Allow the text to wrap to the next line
144 | wordBreak: "break-word", // Ensure words break correctly at the end of the line
145 | maxWidth: "100px", // Set a maximum width for the cell content
146 | textAlign: "center",
147 | };
148 |
149 | // Column definitions using inline styles
150 | const columnDefinitions = [
151 | {
152 | header: "Title",
153 | cell: (item) => <div style={wrapStyleTitle}>{item.description}</div>,
154 | },
155 | {
156 | header: "Summary",
157 | cell: (item) => (
158 | <div style={wrapStyleSummary}>{item.generated_summary}</div>
159 | ),
160 | },
161 | {
162 | header: "Articles",
163 | cell: (item) => (
164 | <div style={wrapStyleNumberOfArticles}>{item.number_of_articles}</div>
165 | ),
166 | },
167 | {
168 | header: "View",
169 | cell: (item) => (
170 | <Button onClick={() => handleViewArticles(item)}>View Articles</Button>
171 | ),
172 | },
173 | ];
174 |
175 | return (
176 | <div style={{ padding: "20px" }}>
177 | <SpaceBetween size="l">
178 | <Box variant="h1" textAlign="center">
179 | {" "}
180 | Near Real Time News Clustering and Summarization Demo
181 | </Box>
182 | <Box textAlign="center">
183 | Total Clusters: {clusters.length} | Total Articles: {totalArticles}
184 | </Box>
185 | <ProgressBar
186 | value={progress}
187 | label="Auto refresh"
188 | description={"Refreshing in " + secondsRemaining + " seconds"}
189 | variant="standalone"
190 | />
191 | <Table
192 | columnDefinitions={columnDefinitions}
193 | items={clusters}
194 | wrapLines
195 | stickyHeader
196 | />
197 | {selectedCluster && (
198 | <ClusterModal
199 | cluster={selectedCluster}
200 | articles={selectedCluster.articles}
201 | onClose={() => {
202 | setSelectedCluster(null);
203 | setModalVisible(false); // Hide the modal when closed
204 | }}
205 | visible={isModalVisible} // Control visibility with state
206 | />
207 | )}
208 | </SpaceBetween>
209 | </div>
210 | );
211 | };
212 |
213 | export default ClusterList;
214 |
--------------------------------------------------------------------------------
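fetchClusters() above encodes the table's access pattern: a paginated Scan, articles grouped under their cluster PK (items whose SK starts with ARTICLE#), and only clusters that have a generated_summary and more than two dated articles are shown, largest first. A rough Python equivalent can be handy for inspecting the table outside the UI; the table name is a placeholder and the helper is an illustrative sketch, not part of the project:

```python
# A rough Python equivalent of fetchClusters() above, for inspecting the table
# from the command line. PK/SK layout, "ARTICLE#" prefix, is_cluster and
# generated_summary fields are taken from the component; the table name is a placeholder.
import boto3

dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("REPLACE-WITH-CLUSTER-TABLE-NAME")


def fetch_clusters(min_articles: int = 3):
    # Paginated scan, mirroring the ExclusiveStartKey loop in the React code.
    items, kwargs = [], {}
    while True:
        page = table.scan(**kwargs)
        items.extend(page["Items"])
        if "LastEvaluatedKey" not in page:
            break
        kwargs["ExclusiveStartKey"] = page["LastEvaluatedKey"]

    # Group dated article items under their cluster PK.
    articles = {}
    for item in items:
        if not item.get("is_cluster") and item["SK"].startswith("ARTICLE#"):
            if item.get("publication_date"):
                articles.setdefault(item["PK"], []).append(item)

    # Keep summarized clusters with enough articles, largest first.
    clusters = [
        {**item, "articles": articles.get(item["PK"], [])}
        for item in items
        if item.get("is_cluster") and item.get("generated_summary")
    ]
    clusters = [c for c in clusters if len(c["articles"]) >= min_articles]
    return sorted(clusters, key=lambda c: len(c["articles"]), reverse=True)
```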
/iac/roots/main/templates/ConfigureNode.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Install required packages
4 | dnf install python3.11 -y
5 | dnf install python3.11-pip -y
6 | dnf install amazon-cloudwatch-agent -y
7 |
8 | # Set the environment variables
9 | %{ for config_key, config_value in config }
10 | export ${config_key}="${config_value}"
11 | %{ endfor ~}
12 |
13 | # Download and set up the code
14 | cat > /usr/local/bin/clustering-compute.sh << EOF
15 | #!/bin/bash
16 | for i in 1;do
17 | %{ for config_key, config_value in config }
18 | export ${config_key}="${config_value}"
19 | %{ endfor ~}
20 |
21 | cd /home/ec2-user
22 | mkdir -p stream_consumer
23 | cd stream_consumer
24 | aws s3 sync s3://$${S3_BUCKET_PATH} .
25 |
26 | # Run script
27 | python3.11 -m pip install -r requirements.txt
28 | python3.11 process_records.py >> /var/log/clustering-compute-python.log 2>&1
29 |
30 | done
31 | EOF
32 |
33 | # Make the script executable
34 | chmod +x /usr/local/bin/clustering-compute.sh
35 |
36 | # Sleep briefly to let the instance settle
37 | sleep 30
38 |
39 | # Sending the logs to Cloudwatch
40 | touch /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json
41 | cat > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json << EOF
77 | cat > /etc/systemd/system/clustering-compute.service << EOF
78 | [Unit]
79 | Description=Clustering Compute Process
80 | After=syslog.target network.target remote-fs.target nss-lookup.target
81 |
82 | [Service]
83 | ExecStart=/usr/local/bin/clustering-compute.sh
84 | RestartSec=300
85 | Restart=always
86 |
87 | [Install]
88 | WantedBy=multi-user.target
89 | EOF
90 |
91 | # Start the clustering-compute.service
92 | systemctl daemon-reload
93 | systemctl enable clustering-compute.service
94 | systemctl start clustering-compute.service
95 | systemctl status clustering-compute.service
96 |
--------------------------------------------------------------------------------
/iac/roots/main/templates/aws-exports-js.template:
--------------------------------------------------------------------------------
1 | const awsConfig = {
2 | aws_project_region: '${AWS_REGION}', // AWS region of the project
3 | aws_cognito_region: '${AWS_REGION}', // AWS region of Cognito
4 | aws_cognito_identity_pool_id: '${AWS_COGNITO_IDENTITY_POOL}', // Identity pool ID
5 | aws_user_pools_id: '${AWS_COGNITO_USER_POOL_ID}', // User Pool ID
6 | aws_user_pools_web_client_id: '${AWS_CONGITO_USER_POOL_APP_CLIENT_ID}', // App client ID
7 | federationTarget: "COGNITO_USER_POOLS" // keep as "COGNITO_USER_POOLS"
8 | };
9 | export default awsConfig;
--------------------------------------------------------------------------------
/iac/roots/main/templates/cognito-policy.json:
--------------------------------------------------------------------------------
1 | {
2 | "Version": "2012-10-17",
3 | "Statement": [
4 | {
5 | "Effect": "Allow",
6 | "Action": [
7 | "cognito-identity:GetCredentialsForIdentity"
8 | ],
9 | "Resource": [
10 | "*"
11 | ]
12 | },
13 | {
14 | "Sid": "VisualEditor0",
15 | "Effect": "Allow",
16 | "Action": [
17 | "dynamodb:Scan"
18 | ],
19 | "Resource": "${dd_table_arn}"
20 | }
21 | ]
22 | }
--------------------------------------------------------------------------------
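This policy is attached to the authenticated Cognito identity role: a signed-in user may exchange their identity for temporary credentials and may only Scan the cluster table, which is exactly what the front end's fetchClusters() needs. A hedged Python sketch of the same flow (pool IDs, region, token and table name are all placeholders, not values from this project):

```python
# Illustrative sketch of the credential flow this policy supports: a signed-in
# Cognito user trades an ID token for temporary credentials, then Scans the table.
# All identifiers below are placeholders.
import boto3

REGION = "us-east-1"
IDENTITY_POOL_ID = "us-east-1:00000000-0000-0000-0000-000000000000"
USER_POOL_ID = "us-east-1_EXAMPLE"
ID_TOKEN = "eyJ..."  # ID token from a Cognito User Pool sign-in

ci = boto3.client("cognito-identity", region_name=REGION)
logins = {f"cognito-idp.{REGION}.amazonaws.com/{USER_POOL_ID}": ID_TOKEN}

# Resolve the identity and fetch scoped temporary credentials.
identity_id = ci.get_id(IdentityPoolId=IDENTITY_POOL_ID, Logins=logins)["IdentityId"]
creds = ci.get_credentials_for_identity(IdentityId=identity_id, Logins=logins)["Credentials"]

dynamodb = boto3.client(
    "dynamodb",
    region_name=REGION,
    aws_access_key_id=creds["AccessKeyId"],
    aws_secret_access_key=creds["SecretKey"],
    aws_session_token=creds["SessionToken"],
)
print(dynamodb.scan(TableName="REPLACE-WITH-CLUSTER-TABLE-NAME")["Count"])
```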
/iac/roots/main/templates/ecs-role.json:
--------------------------------------------------------------------------------
1 | {
2 | "Version": "2012-10-17",
3 | "Statement": [
4 | {
5 | "Sid": "BasicDescribePolicy",
6 | "Effect": "Allow",
7 | "Action": [
8 | "ecr:GetAuthorizationToken",
9 | "logs:CreateLogGroup",
10 | "logs:CreateLogStream",
11 | "logs:DescribeLogStreams",
12 | "logs:PutLogEvents"
13 | ],
14 | "Resource": "*"
15 | },
16 | {
17 | "Sid": "AccessToEncryptAndDeccryptKMSKeys",
18 | "Effect": "Allow",
19 | "Action": [
20 | "kms:Decrypt",
21 | "kms:DescribeKey",
22 | "kms:Encrypt",
23 | "kms:GenerateDataKey",
24 | "kms:GetKeyPolicy",
25 | "kms:GetKeyRotationStatus",
26 | "kms:ListGrants",
27 | "kms:ListKeys",
28 | "kms:ListAliases",
29 | "kms:ListKeyPolicies",
30 | "kms:ListResourceTags",
31 | "kms:ListRetirableGrants",
32 | "kms:ReEncryptTo"
33 | ],
34 | "Resource": [
35 | "${kms_key_arn}"
36 | ]
37 | },
38 | {
39 | "Sid": "ECRGrantsToConnectAndDownload",
40 | "Effect": "Allow",
41 | "Action": [
42 | "ecr:BatchCheckLayerAvailability",
43 | "ecr:BatchGetImage",
44 | "ecr:GetDownloadUrlForLayer"
45 | ],
46 | "Resource": "arn:aws:ecr:*:*:repository/*${standard_resource_name}*"
47 | },
48 | {
49 | "Sid": "ECSGrants",
50 | "Effect": "Allow",
51 | "Action": [
52 | "ecs:CreateCluster",
53 | "ecs:DeregisterContainerInstance",
54 | "ecs:DescribeServices",
55 | "ecs:DiscoverPollEndpoint",
56 | "ecs:Poll",
57 | "ecs:RegisterContainerInstance",
58 | "ecs:RegisterContainerInstance",
59 | "ecs:StartTelemetrySession",
60 | "ecs:Submit*",
61 | "ecs:UpdateContainerInstancesState",
62 | "ecs:UpdateService"
63 | ],
64 | "Resource": "*",
65 | "Condition": {
66 | "ForAllValues:StringEquals": {
67 | "aws:ResourceTag/common_identifier": "*${standard_resource_name}*"
68 | }
69 | }
70 | }
71 | ]
72 | }
--------------------------------------------------------------------------------
/iac/roots/main/templates/init.cfg:
--------------------------------------------------------------------------------
1 | #cloud-config
2 | write_files:
3 | - content: |
4 | ${CONFIGURE_NODE_SCRIPT}
5 | encoding: gz+b64
6 | path: /usr/local/bin/ConfigureNode.sh
7 | permissions: "0755"
8 | runcmd:
9 | - /usr/local/bin/ConfigureNode.sh
10 |
--------------------------------------------------------------------------------
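The encoding: gz+b64 line means the CONFIGURE_NODE_SCRIPT value substituted into this cloud-config must be the gzip-compressed, base64-encoded ConfigureNode.sh (on the Terraform side this is typically produced with base64gzip over the rendered template; the exact call this project uses lives in clustering_compute.tf and is not shown here). A standalone Python sketch of the same encoding, for illustration only:

```python
# Illustrative only: produce a gz+b64 payload equivalent to Terraform's base64gzip(),
# suitable for the ${CONFIGURE_NODE_SCRIPT} placeholder in init.cfg above.
import base64
import gzip


def gz_b64(script_text: str) -> str:
    """Gzip-compress and base64-encode a rendered ConfigureNode.sh."""
    return base64.b64encode(gzip.compress(script_text.encode("utf-8"))).decode("ascii")


with open("ConfigureNode.sh") as f:  # a locally rendered copy of the template
    print(gz_b64(f.read()))
```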
/iac/roots/main/terraform.tfvars:
--------------------------------------------------------------------------------
1 | // Copyright 2023 Amazon.com and its affiliates; all rights reserved.
2 | // This file is Amazon Web Services Content and may not be duplicated or distributed without permission.
3 |
4 | app_name = "clustering"
5 | env_name = "demo2"
6 | cidr_block = "10.0.0.0/16"
7 | public_subnet = ["10.0.2.0/24", "10.0.3.0/24", "10.0.4.0/24"]
8 | private_subnet = ["10.0.10.0/24", "10.0.11.0/24", "10.0.12.0/24"]
9 |
--------------------------------------------------------------------------------
/iac/roots/main/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Amazon.com and its affiliates; all rights reserved.
2 | # This file is Amazon Web Services Content and may not be duplicated or distributed without permission.
3 |
4 | variable "app_name" {
5 | type = string
6 | description = "Name of the app"
7 | }
8 |
9 | variable "env_name" {
10 | type = string
11 | description = "Name of the environment"
12 | }
13 |
14 | # VPC Variables
15 | variable "cidr_block" {
16 | description = "The CIDR block for the VPC. Default value is a valid CIDR, but not acceptable by AWS and should be overridden"
17 | type = string
18 | default = "0.0.0.0/0"
19 | }
20 |
21 | variable "public_subnet" {
22 | description = "A list of public subnets inside the VPC"
23 | type = list(string)
24 | default = []
25 | }
26 |
27 | variable "private_subnet" {
28 | description = "A list of private subnets inside the VPC"
29 | type = list(string)
30 | default = []
31 | }
32 |
33 | variable "lambda_code_path" {
34 | description = "Relative path to the Lambda functions' code"
35 | type = string
36 | default = "../../../business_logic/lambdas"
37 | }
38 |
39 | variable "build_script_path" {
40 | description = "Relative path to the Build functions' code"
41 | type = string
42 | default = "../../../build-script"
43 | }
44 |
45 | variable "model_name" {
46 | description = "'bge', 'titan', 'mistralinstruct'"
47 | type = string
48 | default = "titan"
49 | }
50 |
51 | variable "max_length_embedding" {
52 | description = "Max length on the encode call within the Sagemaker endpoint: 512, 1024, 2048, 4096"
53 | type = string
54 | default = "512"
55 | }
56 |
57 | variable "embedding_endpoint_instance_type" {
58 | description = "Instance type for embedding endpoint"
59 | type = string
60 | # default = "ml.inf2.xlarge"
61 | default = "ml.g5.2xlarge"
62 | # default = "ml.g5.12xlarge"
63 | }
64 |
65 | variable "embedding_endpoint_instance_count" {
66 | description = "Number of instances of embedding endpoint"
67 | type = number
68 | default = 2
69 | }
70 |
71 | /*
72 | variable "azs" {
73 | description = "A list of availability zones in the region"
74 | type = list(string)
75 | default = []
76 | }
77 |
78 | variable "embedding_strategy" {
79 | description = "'concat' or 'pooling'"
80 | type = string
81 | default = "concat"
82 | }
83 |
84 | variable "pooling_strategy" {
85 | description = "'mean' or 'max'"
86 | type = string
87 | default = "mean"
88 | }
89 |
90 | variable "min_embedding_instance_count" {
91 | description = "Number of instances of embedding endpoint"
92 | type = number
93 | default = 1
94 | }
95 |
96 | variable "max_embedding_instance_count" {
97 | description = "Number of instances of embedding endpoint"
98 | type = number
99 | default = 8
100 | }
101 | */
102 |
103 | variable "max_articles_embedding_endpoint" {
104 | description = "Maximum number of articles the embedding endpoint can take in one API call"
105 | type = number
106 | default = 200
107 | }
108 |
109 | variable "instance_type" {
110 | type = string
111 | default = "c7g.4xlarge"
112 | description = "Instance type for the for the clustering compute"
113 | }
114 |
115 | variable "volume_size" {
116 | type = number
117 | description = "Volume Size of the EBS Volume"
118 | default = 35
119 | }
120 |
121 | variable "number_of_nodes" {
122 | type = number
123 | description = "Number of Nodes Needed for the clustering compute"
124 | default = 1
125 | }
126 |
127 | variable "auto_verified_attributes" {
128 | type = list(any)
129 | default = ["email"]
130 | description = "Attributes to be auto-verified. Valid values: email, phone_number."
131 | }
132 |
133 | variable "mfa_configuration" {
134 | type = string
135 | default = "OFF"
136 | description = "Multi-Factor Authentication (MFA) configuration for the User Pool. Defaults of OFF. Valid values are OFF, ON and OPTIONAL."
137 | }
138 |
139 | variable "advanced_security_mode" {
140 | type = string
141 | default = "OFF"
142 | description = "Mode for advanced security, must be one of OFF, AUDIT or ENFORCED."
143 | }
144 |
145 | variable "allow_software_mfa_token" {
146 | description = "(Optional) Boolean whether to enable software token Multi-Factor (MFA) tokens, such as Time-based One-Time Password (TOTP). To disable software token MFA when 'sms_configuration' is not present, the 'mfa_configuration' argument must be set to OFF and the 'software_token_mfa_configuration' configuration block must be fully removed."
147 | type = bool
148 | default = false
149 | }
150 |
151 | variable "case_sensitive" {
152 | type = bool
153 | default = true
154 | description = "Whether username case sensitivity will be applied for all users in the user pool through Cognito APIs."
155 | }
156 |
157 | variable "sms_authentication_message" {
158 | type = string
159 | default = "Your username is {username}. Sign up at {####}"
160 | description = "String representing the SMS authentication message. The Message must contain the {####} placeholder, which will be replaced with the code."
161 | }
162 |
163 | variable "minimum_length" {
164 | type = number
165 | description = "(Optional) The minimum length of the password policy that you have set."
166 | default = 6
167 | }
168 |
169 | variable "require_lowercase" {
170 | type = bool
171 | description = "(Optional) Whether you have required users to use at least one lowercase letter in their password."
172 | default = false
173 | }
174 |
175 | variable "require_numbers" {
176 | type = bool
177 | default = false
178 | description = "Whether you have required users to use at least one number in their password."
179 | }
180 |
181 | variable "require_symbols" {
182 | type = bool
183 | default = false
184 | description = "Whether you have required users to use at least one symbol in their password."
185 | }
186 |
187 | variable "require_uppercase" {
188 | type = bool
189 | default = false
190 | description = "Whether you have required users to use at least one uppercase letter in their password."
191 | }
192 |
193 | variable "temporary_password_validity_days" {
194 | type = number
195 | description = "(Optional) In the password policy you have set, refers to the number of days a temporary password is valid. If the user does not sign-in during this time, their password will need to be reset by an administrator."
196 | default = 100
197 | }
198 |
199 | variable "cognito_users" {
200 | description = "A map of user attributes for each user in the User Pool. Each attribute is a name-value pair."
201 | type = map(object({
202 | name = string
203 | email = string
204 | password = string
205 | }))
206 | default = {
207 | user1 = {
208 | name = "aws-user"
209 | email = "donotreply@amazon.com"
210 | password = "awsiscool$"
211 | }
212 | }
213 | }
214 |
215 | variable "front_end_path" {
216 | description = "Relative path to the Lambda functions' code"
217 | type = string
218 | default = "../../../front_end"
219 | }
220 |
221 | variable "task_cpu" {
222 | type = number
223 | description = "VCPUs for Task"
224 | default = 512
225 | }
226 |
227 | variable "task_memory" {
228 | type = number
229 | description = "Memory for Task"
230 | default = 2048
231 | }
232 |
233 | variable "launch_type" {
234 | type = string
235 | description = "Launch type for the service."
236 | default = "FARGATE"
237 | }
238 |
239 | variable "desired_count" {
240 | type = number
241 | description = "The number of instances of the task definition to place and keep running"
242 | default = 1
243 | }
--------------------------------------------------------------------------------
/iac/roots/main/versions.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.4.2"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = ">= 5.60"
8 | }
9 | cloudinit = {
10 | source = "hashicorp/cloudinit"
11 | version = "2.3.4"
12 | }
13 | time = {
14 | source = "hashicorp/time"
15 | version = ">= 0.11"
16 | }
17 | local = {
18 | source = "hashicorp/local"
19 | version = "2.5.1"
20 | }
21 | }
22 | }
--------------------------------------------------------------------------------
/iac/templates/README.md:
--------------------------------------------------------------------------------
1 | The templates directory contains reusable Terraform configurations.
2 | They are reusable because they avoid hardcoding values and instead
3 | expose input parameters so that these values can be set.
4 |
5 | Templates can be used multiple times by top-level projects. For example,
6 | you might have a template that creates an SSM parameter. Your top-level
7 | project could call your template once for a primary region and a second
8 | time for a DR region.
--------------------------------------------------------------------------------
/iac/templates/components/README.md:
--------------------------------------------------------------------------------
1 | Components are higher-level reusable Terraform configurations. Components
2 | combine modules to create higher-level abstractions.
--------------------------------------------------------------------------------
/iac/templates/modules/README.md:
--------------------------------------------------------------------------------
1 | Modules are reusable infrastructure building blocks that are used
2 | by higher-level components or top-level projects.
--------------------------------------------------------------------------------
/iac/templates/modules/ecr/main.tf:
--------------------------------------------------------------------------------
1 | # Checks if build folder has changed
2 | data "external" "this_external" {
3 | program = ["bash", "${var.build_script_path}/dir_md5.sh", "${var.business_logic_path}"]
4 | }
5 |
6 | resource "aws_ecr_repository" "this_aws_ecr_repository" {
7 | name = var.ecr_name
8 | tags = var.tags
9 | image_tag_mutability = "IMMUTABLE"
10 | force_delete = true
11 | image_scanning_configuration {
12 | scan_on_push = true
13 | }
14 | encryption_configuration {
15 | encryption_type = "KMS"
16 | kms_key = var.aws_kms_key_arn
17 | }
18 | }
19 |
20 | resource "aws_ecr_lifecycle_policy" "this_aws_ecr_lifecycle_policy" {
21 | policy = <