├── .gitignore
├── .gitmodules
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── amazon_comprehend_events_tutorial
    ├── README.md
    ├── data
    │   └── sample_finance_dataset.txt
    └── notebooks
    │   ├── compact_nx.html
    │   ├── comprehend-2017-11-27.normal.json
    │   ├── comprehend_events_finance_tutorial.ipynb
    │   ├── events_graph.py
    │   ├── nx.html
    │   └── requirements.txt
├── amazon_comprehend_moderation
    ├── amazon_comprehend_chain.ipynb
    ├── moderated_chat.ipynb
    └── moderation_default_01.png
├── building-custom-classifier
    ├── BuildingCustomClassifier.ipynb
    ├── sample-docs
    │   ├── CMS1500.png
    │   ├── discharge-summary.pdf
    │   ├── doctors-notes.pdf
    │   ├── drivers_license.png
    │   ├── insurance_card.png
    │   ├── insurance_invoice.png
    │   └── passport.pdf
    └── train-data
    │   └── comprehend_train_data.csv
├── building-custom-entity-recognizer-for-PDFs
    ├── BuildingCustomEntityRecognizerForPDFs.ipynb
    └── helperPackage
    │   └── pdfhelper
    │       └── PDFHelper.py
├── comprehend_groundtruth_integration
    ├── README.md
    └── src
    │   ├── __init__.py
    │   └── comprehend_customer_scripts
    │       ├── GroundTruth
    │           ├── DocumentClassifier
    │           │   ├── __init__.py
    │           │   ├── __pycache__
    │           │   │   ├── customer_errors.cpython-37.pyc
    │           │   │   └── groundtruth_to_comprehend_clr_format_converter.cpython-37.pyc
    │           │   ├── convertGroundTruthToComprehendCLRFormat.sh
    │           │   ├── customer_errors.py
    │           │   ├── groundtruth_format_conversion_handler.py
    │           │   └── groundtruth_to_comprehend_clr_format_converter.py
    │           ├── EntityRecognizer
    │           │   ├── __init__.py
    │           │   ├── convertGroundtruthToComprehendERFormat.sh
    │           │   ├── customer_errors.py
    │           │   ├── groundtruth_format_conversion_handler.py
    │           │   └── groundtruth_to_comprehend_format_converter.py
    │           ├── __init__.py
    │           └── __pycache__
    │           │   ├── customer_errors.cpython-37.pyc
    │           │   └── groundtruth_to_comprehend_format_converter.cpython-37.pyc
    │       ├── __init__.py
    │       └── validation
    │           ├── __init__.py
    │           └── semi_structured
    │               ├── __init__.py
    │               └── entity_recognizer
    │                   ├── README.md
    │                   ├── __init__.py
    │                   ├── annotation_model.py
    │                   ├── utils
    │                       ├── __init__.py
    │                       ├── __pycache__
    │                       │   ├── __init__.cpython-38.pyc
    │                       │   ├── annotation_utils.cpython-38.pyc
    │                       │   ├── log_utils.cpython-38.pyc
    │                       │   └── s3_utils.cpython-38.pyc
    │                       ├── annotation_utils.py
    │                       ├── log_utils.py
    │                       └── s3_utils.py
    │                   ├── validate_annotation.py
    │                   └── validate_manifest.py
├── s3_object_lambda_pii_protection_blog
    ├── access-control
    │   ├── innocuous.txt
    │   ├── s3olap-access-control-foundation.yaml
    │   └── survey-results.txt
    └── redaction
    │   ├── s3olap-redaction-foundation.yaml
    │   └── transcript.txt
└── topic_wise_review_analysis
    ├── data_processing.ipynb
    ├── model_training.ipynb
    └── topic_mapping_sentiment_generation.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | .DS_Store


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "lambda_conversion_hook_example"]
2 | 	path = comprehend_groundtruth_integration/lambda_conversion_hook_example
3 | 	url = git@github.com:aws-samples/amazon-sagemaker-groundtruth-and-amazon-comprehend-ner-examples.git
4 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
62 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 | 
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Amazon Comprehend Examples
 2 | 
 3 | This repository contains scripts, tutorials, and data for our customers to use when experimenting with features released by AWS Comprehend.
 4 | 
 5 | # Packages
 6 | 
 7 | 1. [comprehend_groundtruth_integration](./comprehend_groundtruth_integration/README.md): This package contains shell scripts for conversion of SageMaker GroundTruth NER and MultiClass/MultiLabel labeling job output to formats suitable for use with Comprehend's Custom NER and Custom Document Classifier APIs.
 8 | 
 9 | 2. [amazon_comprehend_events_tutorial](./amazon_comprehend_events_tutorial/README.md): This package contains a Jupyter notebook, supporting script, and sample data necessary to produce tabulations and visualizations of Comprehend Events asynchronous API output.
10 | 
11 | 
12 | # Amazon Comprehend Solutions and Resources
13 | Amazon Comprehend Document Search- Using Amazon Comprehend, Amazon Elasticsearch with Kibana, Amazon S3, Amazon Cognito to search over large number of documents such as pdf files.https://github.com/aws-samples/amazon-comprehend-doc-search
14 | 
15 | Amazon Textract Comprehend Image Search with Elasticsearch https://github.com/aws-samples/amazon-textract-comprehend-OCRimage-search-and-analyze
16 | 
17 | Easily setup human review of your NLP based Entity Recognition workflows with Amazon SageMaker Ground Truth, Amazon Comprehend AutoML and Amazon Augmented AI (A2I) - https://github.com/aws-samples/augmentedai-comprehendner-groundtruth
18 | 
19 | Deriving conversational insights from invoices with Amazon Textract, Amazon Comprehend, and Amazon Lex - https://github.com/aws-samples/aws-textract-comprehend-lex-chatbot
20 | 
21 | Active learning workflow for Amazon Comprehend Custom Classification models with Amazon Augmented AI https://github.com/aws-samples/amazon-comprehend-active-learning-framework
22 | 
23 | Easily setup built-in human review loops for NLP based entity recognition workflows using Amazon SageMaker Ground Truth, Amazon Comprehend and Amazon Augmented AI https://github.com/aws-samples/augmentedai-comprehendner-groundtruth
24 | 
25 | Amazon Transcribe Comprehend Podcast- A demo application that transcribes and indexes podcast episodes so the listeners can explore and discover episodes of interest and podcast owners can do analytics on the content over time. This solution leverages Amazon Transcribe, Amazon Comprehend, Amazon Elasticsearch, AWS Step Functions and AWS Lambda.https://github.com/aws-samples/amazon-transcribe-comprehend-podcast
26 | 
27 | Notebooks and recipes for creating custom entity recognizer for Amazon comprehend https://github.com/aws-samples/amazon-comprehend-custom-entity
28 | Document Analysis Solution using Amazon Textract, Amazon Comprehend and Amazon A2I https://github.com/aws-samples/amazon-textract-comprehend-a2i
29 | nlp-analysis-demo - The purpose of this demo is to build a stack that uses Amazon Comprehend and Amazon Textract to analyze unstructured data and generate insights and trends https://github.com/aws-samples/nlp-textract-comprehend-demo
30 | 
31 | 
32 | # Workshops
33 | 
34 | workshop-textract-comprehend-es https://github.com/aws-samples/workshop-textract-comprehend-es
35 | 
36 | 
37 | # LICENSE
38 | This library is licensed under the MIT-0 License. See the LICENSE file. 
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/amazon_comprehend_events_tutorial/README.md:
--------------------------------------------------------------------------------
 1 | # Amazon Comprehend Events Tutorial
 2 | 
 3 | This repo contains a Jupyter notebook, a helper script, and a sample data set designed to help users make the most of [Comprehend Events](http://). Currently, it contains the following artifacts:
 4 | 
 5 | * [comprehend_events_finance_tutorial.ipynb](./notebooks/comprehend_events_finance_tutorial.ipynb). This Jupyter notebook contains functions necessary to transform Comprehend Events service output for various analytic tasks, including highlighting of events and entities in text, tabulation of event structure, and graphing of event structure.
 6 | * [events_graph.py](./notebooks/events_graph.py). A helper module for converting Events output to a graph with `networkx` and `pyvis`.
 7 | * [sample_finance_dataset.txt](./data/sample_finance_dataset.txt). A set of 118 Amazon press releases in doclines format.
 8 | 
 9 | For further information, please see our launch blog post, "[Announcing the launch of Amazon Comprehend Events](http://)".
10 | 
11 | 
12 | ==============================================
13 | 
14 | Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
15 | 
16 | SPDX-License-Identifier: MIT-0
17 | 


--------------------------------------------------------------------------------
/amazon_comprehend_events_tutorial/notebooks/compact_nx.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 | <head>
  3 | <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis.css" type="text/css" />
  4 | <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis-network.min.js"> </script>
  5 | <center>
  6 | <h1></h1>
  7 | </center>
  8 | 
  9 | <!-- <link rel="stylesheet" href="../node_modules/vis/dist/vis.min.css" type="text/css" />
 10 | <script type="text/javascript" src="../node_modules/vis/dist/vis.js"> </script>-->
 11 | 
 12 | <style type="text/css">
 13 | 
 14 |         #mynetwork {
 15 |             width: 800px;
 16 |             height: 600px;
 17 |             background-color: #ffffff;
 18 |             border: 1px solid lightgray;
 19 |             position: relative;
 20 |             float: left;
 21 |         }
 22 | 
 23 |         
 24 | 
 25 |         
 26 | 
 27 |         
 28 | </style>
 29 | 
 30 | </head>
 31 | 
 32 | <body>
 33 | <div id = "mynetwork"></div>
 34 | 
 35 | 
 36 | <script type="text/javascript">
 37 | 
 38 |     // initialize global variables.
 39 |     var edges;
 40 |     var nodes;
 41 |     var network; 
 42 |     var container;
 43 |     var options, data;
 44 | 
 45 |     
 46 |     // This method is responsible for drawing the graph, returns the drawn network
 47 |     function drawGraph() {
 48 |         var container = document.getElementById('mynetwork');
 49 |         
 50 |         
 51 | 
 52 |         // parsing and collecting nodes and edges from the python
 53 |         nodes = new vis.DataSet([{"color": "#5160aa", "group": "trigger", "id": "tr0", "label": "CORPORATE_MERGER", "shape": "dot", "size": 9, "tag": "CORPORATE_MERGER"}, {"color": "#5e4fa2", "group": "entity", "id": "en4", "label": "today", "shape": "dot", "size": 9, "tag": "DATE"}, {"color": "#cf374d", "group": "entity", "id": "en1", "label": "NASDAQ:AMZN", "shape": "dot", "size": 9, "tag": "STOCK_CODE"}, {"color": "#cf374d", "group": "entity", "id": "en3", "label": "NASDAQ:WFM", "shape": "dot", "size": 9, "tag": "STOCK_CODE"}, {"color": "#4471b2", "group": "entity", "id": "en2", "label": "Whole Foods Market, Inc.", "shape": "dot", "size": 9, "tag": "ORGANIZATION"}, {"color": "#5e4fa2", "group": "entity", "id": "en8", "label": "during the second half of 2017", "shape": "dot", "size": 9, "tag": "DATE"}, {"color": "#fcaa5f", "group": "trigger", "id": "tr1", "label": "CORPORATE_ACQUISITION", "shape": "dot", "size": 9, "tag": "CORPORATE_ACQUISITION"}, {"color": "#fdba6b", "group": "entity", "id": "en5", "label": "$13.7 billion", "shape": "dot", "size": 9, "tag": "MONETARY_VALUE"}, {"color": "#4471b2", "group": "entity", "id": "en0", "label": "Amazon", "shape": "dot", "size": 9, "tag": "ORGANIZATION"}, {"color": "#6ac4a5", "group": "trigger", "id": "tr2", "label": "EMPLOYMENT", "shape": "dot", "size": 9, "tag": "EMPLOYMENT"}, {"color": "#7ecca5", "group": "entity", "id": "en6", "label": "John Mackey", "shape": "dot", "size": 9, "tag": "PERSON"}, {"color": "#5ab4ab", "group": "entity", "id": "en7", "label": "CEO", "shape": "dot", "size": 9, "tag": "PERSON_TITLE"}]);
 54 |         edges = new vis.DataSet([{"color": "grey", "from": "tr0", "label": "DATE", "to": "en4", "weight": 99.45779999999999}, {"color": "grey", "from": "tr0", "label": "PARTICIPANT", "to": "en1", "weight": 99.0119}, {"color": "grey", "from": "tr0", "label": "PARTICIPANT", "to": "en3", "weight": 98.0082}, {"color": "grey", "from": "tr0", "label": "PARTICIPANT", "to": "en2", "weight": 99.9654}, {"color": "grey", "from": "tr0", "label": "DATE", "to": "en8", "weight": 99.9491}, {"color": "grey", "from": "tr1", "label": "AMOUNT", "to": "en5", "weight": 99.873}, {"color": "grey", "from": "tr1", "label": "DATE", "to": "en4", "weight": 99.45779999999999}, {"color": "grey", "from": "tr1", "label": "INVESTEE", "to": "en2", "weight": 99.9668}, {"color": "grey", "from": "tr1", "label": "INVESTOR", "to": "en0", "weight": 99.9615}, {"color": "grey", "from": "tr2", "label": "EMPLOYEE", "to": "en6", "weight": 99.9699}, {"color": "grey", "from": "tr2", "label": "EMPLOYEE_TITLE", "to": "en7", "weight": 99.8065}, {"color": "grey", "from": "tr2", "label": "EMPLOYER", "to": "en2", "weight": 99.98129999999999}]);
 55 | 
 56 |         // adding nodes and edges to the graph
 57 |         data = {nodes: nodes, edges: edges};
 58 | 
 59 |         var options = {
 60 |     "configure": {
 61 |         "enabled": false
 62 |     },
 63 |     "edges": {
 64 |         "color": {
 65 |             "inherit": true
 66 |         },
 67 |         "smooth": {
 68 |             "enabled": false,
 69 |             "type": "continuous"
 70 |         }
 71 |     },
 72 |     "interaction": {
 73 |         "dragNodes": true,
 74 |         "hideEdgesOnDrag": false,
 75 |         "hideNodesOnDrag": false
 76 |     },
 77 |     "physics": {
 78 |         "enabled": true,
 79 |         "stabilization": {
 80 |             "enabled": true,
 81 |             "fit": true,
 82 |             "iterations": 1000,
 83 |             "onlyDynamicEdges": false,
 84 |             "updateInterval": 50
 85 |         }
 86 |     }
 87 | };
 88 |         
 89 |         
 90 | 
 91 |         
 92 | 
 93 |         network = new vis.Network(container, data, options);
 94 | 
 95 |         
 96 | 
 97 | 
 98 |         
 99 | 
100 |         return network;
101 | 
102 |     }
103 | 
104 |     drawGraph();
105 | 
106 | </script>
107 | </body>
108 | </html>


--------------------------------------------------------------------------------
/amazon_comprehend_events_tutorial/notebooks/events_graph.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Helper functions and constants for Comprehend Events semantic network graphing.
  3 | """
  4 | 
  5 | from collections import Counter
  6 | from matplotlib import cm, colors
  7 | import networkx as nx
  8 | from pyvis.network import Network
  9 | 
 10 | 
 11 | ENTITY_TYPES = ['DATE', 'FACILITY', 'LOCATION', 'MONETARY_VALUE', 'ORGANIZATION',
 12 |                 'PERSON', 'PERSON_TITLE', 'QUANTITY', 'STOCK_CODE']
 13 | 
 14 | TRIGGER_TYPES = ['BANKRUPTCY', 'EMPLOYMENT', 'CORPORATE_ACQUISITION', 
 15 |                  'INVESTMENT_GENERAL', 'CORPORATE_MERGER', 'IPO', 'RIGHTS_ISSUE', 
 16 |                  'SECONDARY_OFFERING', 'SHELF_OFFERING', 'TENDER_OFFERING', 'STOCK_SPLIT']
 17 | 
 18 | PROPERTY_MAP = {
 19 |     "event": {"size": 10, "shape": "box", "color": "#dbe3e5"},
 20 |     "entity_group": {"size": 6, "shape": "dot", "color": "#776d8a"},
 21 |     "entity": {"size": 4, "shape": "square", "color": "#f3e6e3"},
 22 |     "trigger": {"size": 4, "shape": "diamond", "color": "#f3e6e3"}
 23 | }
 24 | 
 25 | def get_color_map(tags):
 26 |     spectral = cm.get_cmap("Spectral", len(tags))
 27 |     tag_colors = [colors.rgb2hex(spectral(i)) for i in range(len(tags))]
 28 |     color_map = dict(zip(*(tags, tag_colors)))
 29 |     color_map.update({'ROLE': 'grey'})
 30 |     return color_map
 31 | 
 32 | COLOR_MAP = get_color_map(ENTITY_TYPES + TRIGGER_TYPES)
 33 | COLOR_MAP['ROLE'] = "grey"
 34 | 
 35 | IFRAME_DIMS = ("600", "800")
 36 | 
 37 | 
 38 | def get_canonical_mention(mentions, method="longest"):
 39 |     extents = enumerate([m['Text'] for m in mentions])
 40 |     if method == "longest":
 41 |         name = sorted(extents, key=lambda x: len(x[1]))
 42 |     elif method == "most_common": 
 43 |         name = [Counter(extents).most_common()[0][0]]
 44 |     else:
 45 |         name = [list(extents)[0]]
 46 |     return [mentions[name[-1][0]]]
 47 | 
 48 | 
 49 | def get_nodes_and_edges(
 50 |     result, node_types=['event', 'trigger', 'entity_group', 'entity'], thr=0.0
 51 |     ):
 52 |     """Convert results to (nodelist, edgelist) depending on specified entity types."""
 53 |     nodes = []
 54 |     edges = []
 55 |     event_nodes = []
 56 |     entity_nodes = []  
 57 |     entity_group_nodes = [] 
 58 |     trigger_nodes = []
 59 |     
 60 |     # Nodes are (id, type, tag, score, mention_type) tuples.
 61 |     if 'event' in node_types:
 62 |         event_nodes = [
 63 |             (
 64 |                 "ev%d" % i,
 65 |                  t['Type'],
 66 |                  t['Type'],
 67 |                  t['Score'],
 68 |                  "event"
 69 |             )
 70 |             for i, e in enumerate(result['Events'])
 71 |             for t in e['Triggers'][:1]
 72 |             if t['GroupScore'] > thr
 73 |         ]
 74 |         nodes.extend(event_nodes)
 75 |     
 76 |     if 'trigger' in node_types:
 77 |         trigger_nodes = [
 78 |             (
 79 |                 "ev%d-tr%d" % (i, j),
 80 |                 t['Type'],
 81 |                 t['Text'],
 82 |                 t['Score'],
 83 |                 "trigger"
 84 |             )
 85 |             for i, e in enumerate(result['Events'])
 86 |             for j, t in enumerate(e['Triggers'])
 87 |             if t['Score'] > thr
 88 |         ]
 89 |         trigger_nodes = list({t[1:3]: t for t in trigger_nodes}.values())
 90 |         nodes.extend(trigger_nodes)
 91 |         
 92 |     if 'entity_group' in node_types:
 93 |         entity_group_nodes = [
 94 |             (
 95 |                 "gr%d" % i,
 96 |                 m['Type'],
 97 |                 m['Text'] if 'entity' not in node_types else m['Type'],
 98 |                 m['Score'],
 99 |                 "entity_group"
100 |             )
101 |             for i, e in enumerate(result['Entities'])
102 |             for m in get_canonical_mention(e['Mentions'])
103 |             if m['GroupScore'] > thr
104 |         ]
105 |         nodes.extend(entity_group_nodes)
106 |         
107 |     if 'entity' in node_types:
108 |         entity_nodes = [
109 |             (
110 |                 "gr%d-en%d" % (i, j),
111 |                 m['Type'],
112 |                 m['Text'],
113 |                 m['Score'],
114 |                 "entity"
115 |             )
116 |             for i, e in enumerate(result['Entities'])
117 |             for j, m in enumerate(e['Mentions'])
118 |             if m['Score'] > thr
119 |         ]
120 |         entity_nodes = list({t[1:3]: t for t in entity_nodes}.values())
121 |         nodes.extend(entity_nodes)
122 | 
123 |     # Edges are (trigger_id, node_id, role, score, type) tuples.
124 |     if event_nodes and entity_group_nodes:
125 |         edges.extend([
126 |             ("ev%d" % i, "gr%d" % a['EntityIndex'], a['Role'], a['Score'], "argument")
127 |             for i, e in enumerate(result['Events'])
128 |             for j, a in enumerate(e['Arguments'])
129 |             #if a['Score'] > THR
130 |         ])
131 |     
132 |     if entity_nodes and entity_group_nodes:
133 |         entity_keys = set([n[0] for n in entity_nodes])
134 |         edges.extend([
135 |             ("gr%d" % i, "gr%d-en%d" % (i, j), "", m['GroupScore'], "coref")
136 |             for i, e in enumerate(result['Entities'])
137 |             for j, m in enumerate(e['Mentions'])
138 |             if "gr%d-en%d" % (i, j) in entity_keys
139 |             if m['GroupScore'] > thr
140 |         ])
141 | 
142 |     if event_nodes and trigger_nodes:
143 |         trigger_keys = set([n[0] for n in trigger_nodes])
144 |         edges.extend([
145 |             ("ev%d" % i, "ev%d-tr%d" % (i, j), "", a['GroupScore'], "coref")
146 |             for i, e in enumerate(result['Events'])
147 |             for j, a in enumerate(e['Triggers'])
148 |             if "ev%d-tr%d" % (i, j) in trigger_keys
149 |             if a['GroupScore'] > thr
150 |         ])
151 |         
152 |     return nodes, edges
153 | 
154 | 
155 | def build_network_graph(nodelist, edgelist, drop_isolates=True):
156 |     G = nx.Graph()
157 |     # Iterate over triggers and entity mentions.
158 |     for mention_id, tag, extent, score, mtype in nodelist:
159 |         G.add_node(
160 |             mention_id,
161 |             label=extent,
162 |             tag=tag,
163 |             group=mtype,
164 |             size=PROPERTY_MAP[mtype]['size'],
165 |             color=COLOR_MAP[tag],
166 |             shape=PROPERTY_MAP[mtype]['shape']
167 |             )
168 |     # Iterate over argument role assignments
169 |     if edgelist:
170 |         for n1_id, n2_id, role, score, etype in edgelist:
171 |             label = role if etype == "argument" else "coref"
172 |             G.add_edges_from(
173 |                 [(n1_id, n2_id)],
174 |                 label=role,
175 |                 weight=score*100,
176 |                 color="grey"
177 |             )
178 |     # Drop mentions that don't participate in events
179 |     if len(edgelist) > 0 and drop_isolates:
180 |         G.remove_nodes_from(list(nx.isolates(G)))
181 |     return G
182 | 
183 | 
184 | def plot(result, node_types, filename="nx.html", thr=0.0):
185 |     nodes, edges = get_nodes_and_edges(result, node_types, thr)
186 |     G = build_network_graph(
187 |         nodes, edges,
188 |         drop_isolates=True
189 |     )
190 |     nt = Network(*IFRAME_DIMS, notebook=True, heading="")
191 |     nt.from_nx(G)
192 |     display(nt.show(filename))


--------------------------------------------------------------------------------
/amazon_comprehend_events_tutorial/notebooks/nx.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 | <head>
  3 | <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis.css" type="text/css" />
  4 | <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis-network.min.js"> </script>
  5 | <center>
  6 | <h1></h1>
  7 | </center>
  8 | 
  9 | <!-- <link rel="stylesheet" href="../node_modules/vis/dist/vis.min.css" type="text/css" />
 10 | <script type="text/javascript" src="../node_modules/vis/dist/vis.js"> </script>-->
 11 | 
 12 | <style type="text/css">
 13 | 
 14 |         #mynetwork {
 15 |             width: 800;
 16 |             height: 600;
 17 |             background-color: #ffffff;
 18 |             border: 1px solid lightgray;
 19 |             position: relative;
 20 |             float: left;
 21 |         }
 22 | 
 23 |         
 24 | 
 25 |         
 26 | 
 27 |         
 28 | </style>
 29 | 
 30 | </head>
 31 | 
 32 | <body>
 33 | <div id = "mynetwork"></div>
 34 | 
 35 | 
 36 | <script type="text/javascript">
 37 | 
 38 |     // initialize global variables.
 39 |     var edges;
 40 |     var nodes;
 41 |     var network; 
 42 |     var container;
 43 |     var options, data;
 44 | 
 45 |     
 46 |     // This method is responsible for drawing the graph, returns the drawn network
 47 |     function drawGraph() {
 48 |         var container = document.getElementById('mynetwork');
 49 |         
 50 |         
 51 | 
 52 |         // parsing and collecting nodes and edges from the python
 53 |         nodes = new vis.DataSet([{"color": "#b4e1a2", "group": "event", "id": "ev0", "label": "CORPORATE_MERGER", "shape": "box", "size": 10, "tag": "CORPORATE_MERGER"}, {"color": "#9e0142", "group": "entity_group", "id": "gr4", "label": "DATE", "shape": "dot", "size": 6, "tag": "DATE"}, {"color": "#fee796", "group": "entity_group", "id": "gr1", "label": "STOCK_CODE", "shape": "dot", "size": 6, "tag": "STOCK_CODE"}, {"color": "#fee796", "group": "entity_group", "id": "gr3", "label": "STOCK_CODE", "shape": "dot", "size": 6, "tag": "STOCK_CODE"}, {"color": "#f57446", "group": "entity_group", "id": "gr2", "label": "ORGANIZATION", "shape": "dot", "size": 6, "tag": "ORGANIZATION"}, {"color": "#9e0142", "group": "entity_group", "id": "gr8", "label": "DATE", "shape": "dot", "size": 6, "tag": "DATE"}, {"color": "#b4e1a2", "group": "trigger", "id": "ev0-tr0", "label": "merger", "shape": "diamond", "size": 4, "tag": "CORPORATE_MERGER"}, {"color": "#b4e1a2", "group": "trigger", "id": "ev0-tr1", "label": "partnership", "shape": "diamond", "size": 4, "tag": "CORPORATE_MERGER"}, {"color": "#ebf7a0", "group": "event", "id": "ev1", "label": "CORPORATE_ACQUISITION", "shape": "box", "size": 10, "tag": "CORPORATE_ACQUISITION"}, {"color": "#e75948", "group": "entity_group", "id": "gr5", "label": "MONETARY_VALUE", "shape": "dot", "size": 6, "tag": "MONETARY_VALUE"}, {"color": "#f57446", "group": "entity_group", "id": "gr0", "label": "ORGANIZATION", "shape": "dot", "size": 6, "tag": "ORGANIZATION"}, {"color": "#ebf7a0", "group": "trigger", "id": "ev1-tr0", "label": "acquire", "shape": "diamond", "size": 4, "tag": "CORPORATE_ACQUISITION"}, {"color": "#ebf7a0", "group": "trigger", "id": "ev1-tr1", "label": "transaction", "shape": "diamond", "size": 4, "tag": "CORPORATE_ACQUISITION"}, {"color": "#f8fcb5", "group": "event", "id": "ev2", "label": "EMPLOYMENT", "shape": "box", "size": 10, "tag": "EMPLOYMENT"}, {"color": "#fa9656", "group": "entity_group", "id": "gr6", "label": "PERSON", "shape": "dot", "size": 6, "tag": "PERSON"}, {"color": "#fdb668", "group": "entity_group", "id": "gr7", "label": "PERSON_TITLE", "shape": "dot", "size": 6, "tag": "PERSON_TITLE"}, {"color": "#f8fcb5", "group": "trigger", "id": "ev2-tr0", "label": "remain", "shape": "diamond", "size": 4, "tag": "EMPLOYMENT"}, {"color": "#f57446", "group": "entity", "id": "gr0-en2", "label": "Amazon", "shape": "square", "size": 4, "tag": "ORGANIZATION"}, {"color": "#fee796", "group": "entity", "id": "gr1-en0", "label": "NASDAQ:AMZN", "shape": "square", "size": 4, "tag": "STOCK_CODE"}, {"color": "#f57446", "group": "entity", "id": "gr2-en0", "label": "Whole Foods Market, Inc.", "shape": "square", "size": 4, "tag": "ORGANIZATION"}, {"color": "#fee796", "group": "entity", "id": "gr3-en0", "label": "NASDAQ:WFM", "shape": "square", "size": 4, "tag": "STOCK_CODE"}, {"color": "#9e0142", "group": "entity", "id": "gr4-en0", "label": "today", "shape": "square", "size": 4, "tag": "DATE"}, {"color": "#e75948", "group": "entity", "id": "gr5-en0", "label": "$42", "shape": "square", "size": 4, "tag": "MONETARY_VALUE"}, {"color": "#e75948", "group": "entity", "id": "gr5-en1", "label": "$13.7 billion", "shape": "square", "size": 4, "tag": "MONETARY_VALUE"}, {"color": "#fa9656", "group": "entity", "id": "gr6-en1", "label": "John Mackey", "shape": "square", "size": 4, "tag": "PERSON"}, {"color": "#fdb668", "group": "entity", "id": "gr7-en1", "label": "CEO", "shape": "square", "size": 4, "tag": "PERSON_TITLE"}, {"color": "#9e0142", "group": "entity", "id": "gr8-en0", "label": "during the second half of 2017", "shape": "square", "size": 4, "tag": "DATE"}]);
 54 |         edges = new vis.DataSet([{"color": "grey", "from": "ev0", "label": "DATE", "to": "gr4", "weight": 99.45779999999999}, {"color": "grey", "from": "ev0", "label": "PARTICIPANT", "to": "gr1", "weight": 99.0119}, {"color": "grey", "from": "ev0", "label": "PARTICIPANT", "to": "gr3", "weight": 98.0082}, {"color": "grey", "from": "ev0", "label": "PARTICIPANT", "to": "gr2", "weight": 99.9654}, {"color": "grey", "from": "ev0", "label": "DATE", "to": "gr8", "weight": 99.9491}, {"color": "grey", "from": "ev0", "label": "", "to": "ev0-tr0", "weight": 100.0}, {"color": "grey", "from": "ev0", "label": "", "to": "ev0-tr1", "weight": 99.9969}, {"color": "grey", "from": "ev1", "label": "AMOUNT", "to": "gr5", "weight": 99.873}, {"color": "grey", "from": "ev1", "label": "DATE", "to": "gr4", "weight": 99.45779999999999}, {"color": "grey", "from": "ev1", "label": "INVESTEE", "to": "gr2", "weight": 99.9668}, {"color": "grey", "from": "ev1", "label": "INVESTOR", "to": "gr0", "weight": 99.9615}, {"color": "grey", "from": "ev1", "label": "", "to": "ev1-tr0", "weight": 100.0}, {"color": "grey", "from": "ev1", "label": "", "to": "ev1-tr1", "weight": 99.9985}, {"color": "grey", "from": "ev2", "label": "EMPLOYEE", "to": "gr6", "weight": 99.9699}, {"color": "grey", "from": "ev2", "label": "EMPLOYEE_TITLE", "to": "gr7", "weight": 99.8065}, {"color": "grey", "from": "ev2", "label": "EMPLOYER", "to": "gr2", "weight": 99.98129999999999}, {"color": "grey", "from": "ev2", "label": "", "to": "ev2-tr0", "weight": 100.0}, {"color": "grey", "from": "gr0", "label": "", "to": "gr0-en2", "weight": 58.46940000000001}, {"color": "grey", "from": "gr1", "label": "", "to": "gr1-en0", "weight": 100.0}, {"color": "grey", "from": "gr2", "label": "", "to": "gr2-en0", "weight": 100.0}, {"color": "grey", "from": "gr3", "label": "", "to": "gr3-en0", "weight": 100.0}, {"color": "grey", "from": "gr4", "label": "", "to": "gr4-en0", "weight": 100.0}, {"color": "grey", "from": "gr5", "label": "", "to": "gr5-en0", "weight": 100.0}, {"color": "grey", "from": "gr5", "label": "", "to": "gr5-en1", "weight": 54.7346}, {"color": "grey", "from": "gr6", "label": "", "to": "gr6-en1", "weight": 97.7111}, {"color": "grey", "from": "gr7", "label": "", "to": "gr7-en1", "weight": 77.8198}, {"color": "grey", "from": "gr8", "label": "", "to": "gr8-en0", "weight": 100.0}]);
 55 | 
 56 |         // adding nodes and edges to the graph
 57 |         data = {nodes: nodes, edges: edges};
 58 | 
 59 |         var options = {
 60 |     "configure": {
 61 |         "enabled": false
 62 |     },
 63 |     "edges": {
 64 |         "color": {
 65 |             "inherit": true
 66 |         },
 67 |         "smooth": {
 68 |             "enabled": false,
 69 |             "type": "continuous"
 70 |         }
 71 |     },
 72 |     "interaction": {
 73 |         "dragNodes": true,
 74 |         "hideEdgesOnDrag": false,
 75 |         "hideNodesOnDrag": false
 76 |     },
 77 |     "physics": {
 78 |         "enabled": true,
 79 |         "stabilization": {
 80 |             "enabled": true,
 81 |             "fit": true,
 82 |             "iterations": 1000,
 83 |             "onlyDynamicEdges": false,
 84 |             "updateInterval": 50
 85 |         }
 86 |     }
 87 | };
 88 |         
 89 |         
 90 | 
 91 |         
 92 | 
 93 |         network = new vis.Network(container, data, options);
 94 | 
 95 |         
 96 | 
 97 | 
 98 |         
 99 | 
100 |         return network;
101 | 
102 |     }
103 | 
104 |     drawGraph();
105 | 
106 | </script>
107 | </body>
108 | </html>


--------------------------------------------------------------------------------
/amazon_comprehend_events_tutorial/notebooks/requirements.txt:
--------------------------------------------------------------------------------
1 | ipywidgets==7.5.1
2 | networkx==2.5
3 | pandas==1.1.3
4 | pyvis==0.1.8.2
5 | spacy==2.2.4
6 | smart-open==3.0.0
7 | 


--------------------------------------------------------------------------------
/amazon_comprehend_moderation/moderation_default_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/amazon_comprehend_moderation/moderation_default_01.png


--------------------------------------------------------------------------------
/building-custom-classifier/BuildingCustomClassifier.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Amazon Comprehend custom document classification"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "attachments": {},
 12 |    "cell_type": "markdown",
 13 |    "metadata": {},
 14 |    "source": [
 15 |     "## Step 1: Create Amazon Comprehend Classification Training Job <a id=\"step1\"></a>\n",
 16 |     "\n",
 17 |     "In this step, we will import some necessary libraries that will be used throughout this notebook.\n",
 18 |     "\n",
 19 |     "We will then use a prepared dataset, of the appropriate filetype (.csv) and structure - one column containing the raw text of a document, and the other column containing the label of that document.\n",
 20 |     "\n",
 21 |     "The custom classification model we are going to train is in [Multi-class mode](https://docs.aws.amazon.com/comprehend/latest/dg/prep-classifier-data-multi-class.html) and we will use a CSV file to train the model. You can also use an Augmented manifest file to train the model, please review the documentation on how to use augmented manifest file. \n",
 22 |     "\n",
 23 |     "We will look at the CSV training data in the subsequent sections."
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "import boto3\n",
 33 |     "import botocore\n",
 34 |     "import sagemaker\n",
 35 |     "import json\n",
 36 |     "import os\n",
 37 |     "import io\n",
 38 |     "import datetime\n",
 39 |     "import pandas as pd\n",
 40 |     "from PIL import Image\n",
 41 |     "from pathlib import Path\n",
 42 |     "import multiprocessing as mp\n",
 43 |     "from sagemaker import get_execution_role\n",
 44 |     "from IPython.display import Image, display, HTML, JSON\n",
 45 |     "\n",
 46 |     "# variables\n",
 47 |     "data_bucket = sagemaker.Session().default_bucket()\n",
 48 |     "region = boto3.session.Session().region_name\n",
 49 |     "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
 50 |     "\n",
 51 |     "os.environ[\"BUCKET\"] = data_bucket\n",
 52 |     "os.environ[\"REGION\"] = region\n",
 53 |     "role = sagemaker.get_execution_role()\n",
 54 |     "\n",
 55 |     "print(f\"SageMaker role is: {role}\\nDefault SageMaker Bucket: s3://{data_bucket}\")\n",
 56 |     "\n",
 57 |     "s3=boto3.client('s3')\n",
 58 |     "comprehend=boto3.client('comprehend', region_name=region)"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "We will use the pre-prepared dataset and upload it to Amazon S3. The dataset is in `CSV` format and will be named `comprehend_train_data.csv`. Note that you can have more than one `CSV` file in an S3 bucket for training a Comprehend custom classifier. If you have more than one file, you can specify only the bucket/prefix in call to train the custom classifier. Amazon Comprehend will automatically use all the files under the bucket/prefix for training purposes.\n",
 66 |     "\n",
 67 |     "The following code cells will upload the training data to the S3 bucket, and create a Custom Comprehend Classifier. You can also create a custom classifier manually, please see the subsequent sections for instructions on how to do that."
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "# Upload Comprehend training data to S3\n",
 77 |     "key='comprehend/doc-class-train/comprehend_train_data.csv'\n",
 78 |     "s3.upload_file(Filename='./train-data/comprehend_train_data.csv', \n",
 79 |     "               Bucket=data_bucket, \n",
 80 |     "               Key=key)\n"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "Let's review the training data"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "df = pd.read_csv('./train-data/comprehend_train_data.csv', names=[\"Class\", \"Document\"])\n",
 97 |     "df"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "classes = df['Class'].unique()\n",
107 |     "classes_df = pd.DataFrame(classes, columns = ['Classes'])\n",
108 |     "classes_df"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "Our training dataset contains exactly 7 classes that we are going to train the custom classifier with. The first column in the CSV is the class label, and the second column in the CSV is the document's text. Together, each line of the file contains a single class and the text of a document that demonstrates that class. If you have samples in the form of PDF, PNG, JPG, TIFF etc. you can extract the text using OCR technology such as [Amazon Textract](https://docs.aws.amazon.com/textract/latest/dg/what-is.html) to extract the text from the documents to prepare the CSV training data. "
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "---\n",
123 |     "\n",
124 |     "Once we have a labeled dataset ready we are going to create and train a [Amazon Comprehend custom classification model](https://docs.aws.amazon.com/comprehend/latest/dg/how-document-classification.html) with the dataset."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "### Create Amazon Comprehend custom classification Training Job\n",
132 |     "\n",
133 |     "<div class=\"alert alert-block alert-warning\"> <b>💡 NOTE:</b> <p>Executing the model training code block below will start a training job which can take upwards of 40 to 60 minutes to complete. </div>\n",
134 |     "\n",
135 |     "We will use Amazon Comprehend custom classification to train our own model for classifying the documents. We will use Amazon Comprehend `CreateDocumentClassifier` API to create a classifier which will train a custom model using the labeled data CSV file we created above. The training data contains extracted text, that was extracted using Amazon Textract, and then labeled."
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {
142 |     "scrolled": true
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "import uuid\n",
147 |     "uuid_id = uuid.uuid1()\n",
148 |     "\n",
149 |     "# Create a document classifier\n",
150 |     "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
151 |     "id = str(datetime.datetime.now().strftime(\"%s\"))\n",
152 |     "\n",
153 |     "document_classifier_name = f\"custom-doc-class-{uuid_id}\"\n",
154 |     "document_classifier_version = 'v1'\n",
155 |     "document_classifier_arn = ''\n",
156 |     "response = None\n",
157 |     "\n",
158 |     "try:\n",
159 |     "    print(f'Starting training job in region: {region} for account ID: {account_id}, with training data s3://{data_bucket}/{key}')\n",
160 |     "    create_response = comprehend.create_document_classifier(\n",
161 |     "        InputDataConfig={\n",
162 |     "            'DataFormat': 'COMPREHEND_CSV',\n",
163 |     "            'S3Uri': f's3://{data_bucket}/{key}'\n",
164 |     "        },\n",
165 |     "        DataAccessRoleArn=role,\n",
166 |     "        DocumentClassifierName=document_classifier_name,\n",
167 |     "        VersionName=document_classifier_version,\n",
168 |     "        LanguageCode='en',\n",
169 |     "        Mode='MULTI_CLASS'\n",
170 |     "    )\n",
171 |     "    \n",
172 |     "    document_classifier_arn = create_response['DocumentClassifierArn']\n",
173 |     "    %store document_classifier_arn\n",
174 |     "    print(f\"Comprehend Custom Classifier created with ARN: {document_classifier_arn}\")\n",
175 |     "except Exception as error:\n",
176 |     "    if error.response['Error']['Code'] == 'ResourceInUseException':\n",
177 |     "        print(f'A classifier with the name \"{document_classifier_name}\" already exists.')\n",
178 |     "        document_classifier_arn = f'arn:aws:comprehend:{region}:{account_id}:document-classifier/{document_classifier_name}/version/{document_classifier_version}'\n",
179 |     "        print(f'The classifier ARN is: \"{document_classifier_arn}\"')\n",
180 |     "    else:\n",
181 |     "        print(error)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "\n",
189 |     "Alternatively, to create a Comprehend Custom Classifier Job manually using the console go to [Amazon Comprehend Console](https://console.aws.amazon.com/comprehend/v2/home?region=us-east-1#classification)\n",
190 |     "  \n",
191 |     "- On the left menu click \"Custom Classification\"\n",
192 |     "- In the \"Classifier models\" section, click on \"Create new model\"\n",
193 |     "- In Model Setting for Model name, enter a name \n",
194 |     "- In Data Specification; select \"Using Single-label\" mode and for Data format select CSV file\n",
195 |     "- For Training dataset browse to your data-bucket created above and select the file `comprehend_train_data.csv`\n",
196 |     "- For IAM role select \"Create an IAM role\" and specify a prefix (this will create a new IAM Role for Comprehend)\n",
197 |     "- Click create"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "This job can take ~30 minutes to complete. Once the training job is completed move on to next step."
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "markdown",
209 |    "metadata": {},
210 |    "source": [
211 |     "### Check status of the Comprehend Custom Classification Job\n",
212 |     "\n",
213 |     "Let's check the status of the training job."
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {
220 |     "scrolled": true
221 |    },
222 |    "outputs": [],
223 |    "source": [
224 |     "%%time\n",
225 |     "# Loop through and wait for the training to complete.\n",
226 |     "import time\n",
227 |     "from datetime import datetime\n",
228 |     "\n",
229 |     "jobArn = create_response['DocumentClassifierArn']\n",
230 |     "\n",
231 |     "max_time = time.time() + 3*60*60 # 3 hours\n",
232 |     "while time.time() < max_time:\n",
233 |     "    now = datetime.now()\n",
234 |     "    current_time = now.strftime(\"%H:%M:%S\")\n",
235 |     "    describe_custom_classifier = comprehend.describe_document_classifier(\n",
236 |     "        DocumentClassifierArn = jobArn\n",
237 |     "    )\n",
238 |     "    status = describe_custom_classifier[\"DocumentClassifierProperties\"][\"Status\"]\n",
239 |     "    print(f\"{current_time} : Custom document classifier: {status}\")\n",
240 |     "    \n",
241 |     "    if status == \"TRAINED\" or status == \"IN_ERROR\":\n",
242 |     "        break\n",
243 |     "        \n",
244 |     "    time.sleep(60)\n",
245 |     "    "
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "markdown",
250 |    "metadata": {},
251 |    "source": [
252 |     "Alternatively, you can also check the status of the training job from the Amazon Comprehend console. Navigate to the [Amazon Comprehend console](https://console.aws.amazon.com/comprehend) screen and click _\"Custom classification\"_ under the _\"Customization\"_ menu on the left panel."
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "markdown",
257 |    "metadata": {},
258 |    "source": [
259 |     "---\n",
260 |     "## Step 2: Classify Documents using the custom classifier asynchronous analysis job<a id=\"step2\"></a>\n",
261 |     "\n",
262 |     "In this step we will use the Comprehend classifier model that we just trained to classify a group of un-identified documents. We will use Comprehend [StartDocumentClassificationJob](https://docs.aws.amazon.com/comprehend/latest/APIReference/API_StartDocumentClassificationJob.html) API to run an asynchronous job that will classify our documents.\n",
263 |     "\n",
264 |     "Amazon Comprehend Async classification works with PDF, PNG, JPEG, as well as UTF-8 encoded plaintext files. Since our sample documents under the `sample_docs` directory are of wither JPEG, PNG, or PDF format, we will specify a `DocumentReadAction` and use Amazon Textract with the `TEXTRACT_DETECT_DOCUMENT_TEXT`. This will tell Amazon Comprehend to use Amazon Textract [DetectDocumentText](https://docs.aws.amazon.com/textract/latest/dg/API_DetectDocumentText.html) API behind the scenes to extract the text and then perform classification. For `InputFormat`, we will use `ONE_DOC_PER_FILE` mode which signifies that each file is a single document (the other mode is `ONE_DOC_PER_LINE` which means every line in the plaintext file is a document, this is best suited for small documents such as product reviews or customer service chat transcripts etc.). More on this, see [documentation](https://docs.aws.amazon.com/comprehend/latest/dg/how-class-run.html)\n",
265 |     "\n",
266 |     "To begin with the classification of the sample documents, first let's upload them into the S3 bucket."
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {
273 |     "scrolled": true
274 |    },
275 |    "outputs": [],
276 |    "source": [
277 |     "# Upload data to S3 bucket:\n",
278 |     "!aws s3 sync ./sample-docs s3://{data_bucket}/comprehend/doc-class-samples/"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "Once the documents are uploaded, we will start a a classification job using the [StartDocumentClassificationJob](https://docs.aws.amazon.com/comprehend/latest/APIReference/API_StartDocumentClassificationJob.html) API and the configurations discussed above."
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "import uuid\n",
295 |     "\n",
296 |     "jobname = f'classification-job-{uuid.uuid1()}'\n",
297 |     "print(f'Starting Comprehend Classification job {jobname} with model {document_classifier_arn}')\n",
298 |     "\n",
299 |     "response = comprehend.start_document_classification_job(\n",
300 |     "    JobName=jobname,\n",
301 |     "    DocumentClassifierArn=document_classifier_arn,\n",
302 |     "    InputDataConfig={\n",
303 |     "        'S3Uri': f's3://{data_bucket}/comprehend/doc-class-samples/',\n",
304 |     "        'InputFormat': 'ONE_DOC_PER_FILE',\n",
305 |     "        'DocumentReaderConfig': {\n",
306 |     "            'DocumentReadAction': 'TEXTRACT_DETECT_DOCUMENT_TEXT',\n",
307 |     "            'DocumentReadMode': 'FORCE_DOCUMENT_READ_ACTION'\n",
308 |     "        }\n",
309 |     "    },\n",
310 |     "    OutputDataConfig={\n",
311 |     "        'S3Uri': f's3://{data_bucket}/comprehend/doc-class-output/'\n",
312 |     "    },\n",
313 |     "    DataAccessRoleArn=role\n",
314 |     ")\n",
315 |     "\n",
316 |     "JSON(response)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "metadata": {},
322 |    "source": [
323 |     "### Check status of the classification job\n",
324 |     "\n",
325 |     "The code block below will check the status of the classification job. If the job completes then it will download the output predictions. The output is a zip file which will contain the inference result for each of the documents being classified. The zip will also contain the output of the Textract operation performed by Amazon Comprehend."
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": null,
331 |    "metadata": {
332 |     "scrolled": true
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "%%time\n",
337 |     "# Loop through and wait for the training to complete . Takes up to 10 mins \n",
338 |     "import time\n",
339 |     "from datetime import datetime\n",
340 |     "import tarfile\n",
341 |     "import os\n",
342 |     "\n",
343 |     "classify_response=response\n",
344 |     "max_time = time.time() + 3*60*60 # 3 hours\n",
345 |     "documents=[]\n",
346 |     "\n",
347 |     "while time.time() < max_time:\n",
348 |     "    now = datetime.now()\n",
349 |     "    current_time = now.strftime(\"%H:%M:%S\")\n",
350 |     "    describe_job = comprehend.describe_document_classification_job(\n",
351 |     "        JobId=classify_response['JobId']\n",
352 |     "    )\n",
353 |     "    status = describe_job[\"DocumentClassificationJobProperties\"][\"JobStatus\"]\n",
354 |     "\n",
355 |     "    print(f\"{current_time} : Custom document classifier Job: {status}\")\n",
356 |     "    \n",
357 |     "    if status == \"COMPLETED\" or status == \"FAILED\":\n",
358 |     "        if status == \"COMPLETED\":\n",
359 |     "            classify_output_file = describe_job[\"DocumentClassificationJobProperties\"][\"OutputDataConfig\"][\"S3Uri\"]\n",
360 |     "            print(f'Output generated - {classify_output_file}')\n",
361 |     "            !mkdir -p classification-output\n",
362 |     "            !aws s3 cp {classify_output_file} ./classification-output\n",
363 |     "            \n",
364 |     "            opfile = os.path.basename(classify_output_file)\n",
365 |     "            # open file\n",
366 |     "            file = tarfile.open(f'./classification-output/{opfile}')\n",
367 |     "            # extracting file\n",
368 |     "            file.extractall('./classification-output')\n",
369 |     "            file.close()\n",
370 |     "            \n",
371 |     "            for file in os.listdir('./classification-output'):\n",
372 |     "                if file.endswith('.out'):\n",
373 |     "                    with open(f'./classification-output/{file}', 'r') as f:\n",
374 |     "                        documents.append(dict(file=file, classification_output=json.load(f)['Classes']))        \n",
375 |     "        else:\n",
376 |     "            print(\"Classification job failed\")\n",
377 |     "            print(describe_job)\n",
378 |     "        break\n",
379 |     "        \n",
380 |     "    time.sleep(10)"
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "markdown",
385 |    "metadata": {},
386 |    "source": [
387 |     "Let's take a look at the Amazon Comprehend classification output. We have collected the output for all the files in a `documents` variable. The script above will download and un-zip the zip file locally, so you can navigate into the `classification-output` directory from the file browser panel on the left and inspect the files manually."
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "code",
392 |    "execution_count": null,
393 |    "metadata": {
394 |     "scrolled": true
395 |    },
396 |    "outputs": [],
397 |    "source": [
398 |     "for doc in documents:\n",
399 |     "    print(f\"File: {doc['file']}\")\n",
400 |     "    for doc_class in doc['classification_output']:\n",
401 |     "        print(f\"└── Class: {doc_class['Name']} , Score: {round(doc_class['Score'] * 100, 2)}%\")\n",
402 |     "    print(\"\\n\")"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "markdown",
407 |    "metadata": {},
408 |    "source": [
409 |     "---\n",
410 |     "\n",
411 |     "## Step 3: Create Document classification real-time endpoint\n",
412 |     "\n",
413 |     "<div class=\"alert alert-block alert-warning\">\n",
414 |     "    <b>⚠️ Note:</b> Creation of a real-time endpoint can take up to 15 minutes.\n",
415 |     "</div>\n",
416 |     "\n",
417 |     "\n",
418 |     "Once our Comprehend custom classifier is fully trained (i.e. status = `TRAINED`). You can also create a real-time endpoint. You can then use this endpoint to classify documents in real time. The following code cells use the `comprehend` Boto3 client to create an endpoint, but you can also create one manually via the console. Instructions on how to do that can be found in the subsequent section."
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": null,
424 |    "metadata": {},
425 |    "outputs": [],
426 |    "source": [
427 |     "#create comprehend endpoint\n",
428 |     "import uuid\n",
429 |     "temp_id = str(uuid.uuid1())\n",
430 |     "model_arn = document_classifier_arn\n",
431 |     "ep_name = f'classifier-endpoint-{temp_id.split(\"-\")[0]}'\n",
432 |     "\n",
433 |     "try:\n",
434 |     "    endpoint_response = comprehend.create_endpoint(\n",
435 |     "        EndpointName=ep_name,\n",
436 |     "        ModelArn=model_arn,\n",
437 |     "        DesiredInferenceUnits=1,    \n",
438 |     "        DataAccessRoleArn=role\n",
439 |     "    )\n",
440 |     "    ENDPOINT_ARN=endpoint_response['EndpointArn']\n",
441 |     "    print(f'Endpoint created with ARN: {ENDPOINT_ARN}')    \n",
442 |     "except Exception as error:\n",
443 |     "    if error.response['Error']['Code'] == 'ResourceInUseException':\n",
444 |     "        print(f'An endpoint with the name \"{ep_name}\" already exists.')\n",
445 |     "        ENDPOINT_ARN = f'arn:aws:comprehend:{region}:{account_id}:document-classifier-endpoint/{ep_name}'\n",
446 |     "        print(f'The classifier endpoint ARN is: \"{ENDPOINT_ARN}\"')\n",
447 |     "        %store ENDPOINT_ARN\n",
448 |     "    else:\n",
449 |     "        print(error)\n",
450 |     "    "
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "code",
455 |    "execution_count": null,
456 |    "metadata": {},
457 |    "outputs": [],
458 |    "source": [
459 |     "%store ENDPOINT_ARN"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "code",
464 |    "execution_count": null,
465 |    "metadata": {},
466 |    "outputs": [],
467 |    "source": [
468 |     "JSON(endpoint_response)"
469 |    ]
470 |   },
471 |   {
472 |    "cell_type": "markdown",
473 |    "metadata": {},
474 |    "source": [
475 |     "Alternatively, use the steps below to create a Comprehend endpoint using the AWS console.\n",
476 |     "\n",
477 |     "- Go to [Comprehend on AWS Console](https://console.aws.amazon.com/comprehend/v2/home?region=us-east-1#endpoints) and click on Endpoints in the left menu.\n",
478 |     "- Click on \"Create endpoint\"\n",
479 |     "- Give an Endpoint name; for Custom model type select Custom classification; for version select no version or the latest version of the model.\n",
480 |     "- For Classifier model select from the drop down menu\n",
481 |     "- For Inference Unit select 1\n",
482 |     "- Check \"Acknowledge\"\n",
483 |     "- Click \"Create endpoint\"\n",
484 |     "\n",
485 |     "[It may take ~15 minutes](https://console.aws.amazon.com/comprehend/v2/home?region=us-east-1#endpoints) for the endpoint to get created. The code cell below checks the creation status.\n"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": null,
491 |    "metadata": {
492 |     "scrolled": true
493 |    },
494 |    "outputs": [],
495 |    "source": [
496 |     "%%time\n",
497 |     "# Loop through and wait for the training to complete . Takes up to 10 mins \n",
498 |     "import time\n",
499 |     "from datetime import datetime\n",
500 |     "\n",
501 |     "ep_arn = endpoint_response[\"EndpointArn\"]\n",
502 |     "\n",
503 |     "max_time = time.time() + 3*60*60 # 3 hours\n",
504 |     "while time.time() < max_time:\n",
505 |     "    now = datetime.now()\n",
506 |     "    current_time = now.strftime(\"%H:%M:%S\")\n",
507 |     "    describe_endpoint_resp = comprehend.describe_endpoint(\n",
508 |     "        EndpointArn=ep_arn\n",
509 |     "    )\n",
510 |     "    status = describe_endpoint_resp[\"EndpointProperties\"][\"Status\"]\n",
511 |     "    print(f\"{current_time} : Custom document classifier: {status}\")\n",
512 |     "    \n",
513 |     "    if status == \"IN_SERVICE\" or status == \"FAILED\":\n",
514 |     "        break\n",
515 |     "        \n",
516 |     "    time.sleep(10)\n",
517 |     "    "
518 |    ]
519 |   },
520 |   {
521 |    "cell_type": "markdown",
522 |    "metadata": {},
523 |    "source": [
524 |     "---\n",
525 |     "## Step 4: Classify Documents using the real-time endpoint <a id=\"step4\"></a>\n",
526 |     "\n",
527 |     "Once the endpoint has been created, we will use some sample documents under the `/samle-docs` directory and try to classify them."
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": null,
533 |    "metadata": {},
534 |    "outputs": [],
535 |    "source": [
536 |     "\"\"\"\n",
537 |     "Section below will be removed prior to publish, only applicable for beta environment\n",
538 |     "\"\"\"\n",
539 |     "\n",
540 |     "import base64\n",
541 |     "from botocore.exceptions import ClientError\n",
542 |     "\n",
543 |     "os.environ['AWS_DATA_PATH'] = './botodata/'\n",
544 |     "session = boto3.session.Session()\n",
545 |     "comprehend = session.client('comprehend', region_name='us-east-1')\n",
546 |     "\n",
547 |     "\"\"\"\n",
548 |     "Section above will be removed prior to publish, only applicable for beta environment\n",
549 |     "\"\"\"\n",
550 |     "\n",
551 |     "# Replace this with any document name in the /sample-docs/ directory\n",
552 |     "document = \"CMS1500.png\"\n",
553 |     "\n",
554 |     "with open(f\"./sample-docs/{document}\", mode='rb') as file:\n",
555 |     "        document_bytes = file.read()\n",
556 |     "try:\n",
557 |     "    response = comprehend.classify_document(Bytes=document_bytes, \n",
558 |     "                                        DocumentReaderConfig={\n",
559 |     "                                            \"DocumentReadAction\": \"TEXTRACT_ANALYZE_DOCUMENT\",\n",
560 |     "                                            \"DocumentReadMode\": \"FORCE_DOCUMENT_READ_ACTION\",\n",
561 |     "                                            \"FeatureTypes\": [\"FORMS\"]\n",
562 |     "                                        },\n",
563 |     "                                        EndpointArn=ENDPOINT_ARN)\n",
564 |     "    classes = response['Classes']\n",
565 |     "    metadata = response['DocumentMetadata']['ExtractedCharacters'][0]\n",
566 |     "    print(f\"File: {document}\")\n",
567 |     "    print(f\"Page Count: {metadata['Page']}, Character count: {metadata['Count']}\")\n",
568 |     "    for doc_class in classes:\n",
569 |     "        print(f\"└── Class: {doc_class['Name']} , Score: {round(doc_class['Score'] * 100, 2)}%\")\n",
570 |     "except ClientError as e:\n",
571 |     "    print(e)\n",
572 |     "    print(\"Error\", e.response['Reason'], e.response['Detail']['Reason'])"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "markdown",
577 |    "metadata": {},
578 |    "source": [
579 |     "In the above code cell, we classified a document in real-time using the endpoint we created earlier. Real-time endpoints are suitable for use-cases that have low latency, real-time requirements. One important thing to consider is that the size of document when using native semi-structured documents with classify document real-time API is that the max number of pages supported is one. So real-time endpoint is suitable for single page documents. If you have more than 20 documents, and or have multi-page documents, you should look at using Async analysis (aka asynchronous jobs) API, as we have seen earlier in the notebook."
580 |    ]
581 |   },
582 |   {
583 |    "cell_type": "markdown",
584 |    "metadata": {},
585 |    "source": [
586 |     "---\n",
587 |     "\n",
588 |     "## Cleanup\n",
589 |     "\n",
590 |     "In this step we will delete the document classification real-time endpoint since will be charged for any deployed. It could take ~5 to 10 minutes to delete the endpoint"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "code",
595 |    "execution_count": null,
596 |    "metadata": {},
597 |    "outputs": [],
598 |    "source": [
599 |     "ep_del_response = comprehend.delete_endpoint(EndpointArn=ENDPOINT_ARN)\n",
600 |     "JSON(ep_del_response)"
601 |    ]
602 |   },
603 |   {
604 |    "cell_type": "markdown",
605 |    "metadata": {},
606 |    "source": [
607 |     "Once, the endpoint is fully deleted, let's delete the document classifier trained model."
608 |    ]
609 |   },
610 |   {
611 |    "cell_type": "code",
612 |    "execution_count": null,
613 |    "metadata": {},
614 |    "outputs": [],
615 |    "source": [
616 |     "dc_del_response = comprehend.delete_document_classifier(DocumentClassifierArn = document_classifier_arn)\n",
617 |     "JSON(dc_del_response)"
618 |    ]
619 |   },
620 |   {
621 |    "cell_type": "markdown",
622 |    "metadata": {},
623 |    "source": [
624 |     "Delete sample document and classification output files from S3"
625 |    ]
626 |   },
627 |   {
628 |    "cell_type": "code",
629 |    "execution_count": null,
630 |    "metadata": {},
631 |    "outputs": [],
632 |    "source": [
633 |     "!aws s3 rm s3://{data_bucket}/comprehend/ --recursive"
634 |    ]
635 |   },
636 |   {
637 |    "cell_type": "markdown",
638 |    "metadata": {},
639 |    "source": [
640 |     "---\n",
641 |     "## Conslusion"
642 |    ]
643 |   },
644 |   {
645 |    "cell_type": "markdown",
646 |    "metadata": {},
647 |    "source": [
648 |     "In this notebook we learned how to train an Amazon Comprehend custom classifier using our pre-prepared dataset, that was constructed from sample documents by extracting the text from the documents using Amazon Textract and labeling the data into a CSV file format. We then trained an Amazon Comprehend custom classifier with the extracted text and created an Amazon Comprehend Classifier real time endpoint to performe classification of documents. We used documents in their native format (JPG, PNG, PDF..) without any extraction and conversion directly with the classification APIs to determine the document class with both asynchronous analysis job as well as real-time endpoint."
649 |    ]
650 |   },
651 |   {
652 |    "cell_type": "code",
653 |    "execution_count": null,
654 |    "metadata": {},
655 |    "outputs": [],
656 |    "source": []
657 |   }
658 |  ],
659 |  "metadata": {
660 |   "instance_type": "ml.t3.medium",
661 |   "kernelspec": {
662 |    "display_name": "Python 3",
663 |    "language": "python",
664 |    "name": "python3"
665 |   },
666 |   "language_info": {
667 |    "codemirror_mode": {
668 |     "name": "ipython",
669 |     "version": 3
670 |    },
671 |    "file_extension": ".py",
672 |    "mimetype": "text/x-python",
673 |    "name": "python",
674 |    "nbconvert_exporter": "python",
675 |    "pygments_lexer": "ipython3",
676 |    "version": "3.7.3 (default, Mar 21 2021, 16:54:57) \n[Clang 12.0.0 (clang-1200.0.31.1)]"
677 |   },
678 |   "vscode": {
679 |    "interpreter": {
680 |     "hash": "e7ce91952fc711336efd7f2a69f291fbbebe704093ede89b650fd59e96d51ae8"
681 |    }
682 |   }
683 |  },
684 |  "nbformat": 4,
685 |  "nbformat_minor": 4
686 | }
687 | 


--------------------------------------------------------------------------------
/building-custom-classifier/sample-docs/CMS1500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/building-custom-classifier/sample-docs/CMS1500.png


--------------------------------------------------------------------------------
/building-custom-classifier/sample-docs/discharge-summary.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/building-custom-classifier/sample-docs/discharge-summary.pdf


--------------------------------------------------------------------------------
/building-custom-classifier/sample-docs/doctors-notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/building-custom-classifier/sample-docs/doctors-notes.pdf


--------------------------------------------------------------------------------
/building-custom-classifier/sample-docs/drivers_license.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/building-custom-classifier/sample-docs/drivers_license.png


--------------------------------------------------------------------------------
/building-custom-classifier/sample-docs/insurance_card.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/building-custom-classifier/sample-docs/insurance_card.png


--------------------------------------------------------------------------------
/building-custom-classifier/sample-docs/insurance_invoice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/building-custom-classifier/sample-docs/insurance_invoice.png


--------------------------------------------------------------------------------
/building-custom-classifier/sample-docs/passport.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/building-custom-classifier/sample-docs/passport.pdf


--------------------------------------------------------------------------------
/building-custom-entity-recognizer-for-PDFs/helperPackage/pdfhelper/PDFHelper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Helper utility for visualizing custom annotations on PDF files
  3 | """
  4 | 
  5 | import os
  6 | import csv
  7 | 
  8 | class PDFHelper:
  9 |     @staticmethod
 10 |     def add_annotations_to_file(annotations, input_filepath, output_filepath):
 11 |         """
 12 |         annotations: json annotations
 13 |         input_filepath: location of original pdf file to annotate
 14 |         output_filepath: location to write new pdf file with annotations
 15 |         """
 16 |         # First, add bounding box coordinates to the entities (groups word-level bounding boxes)
 17 |         page_entities_map = PDFHelper.enhanceEntitiesWithBoundingBoxInfo(annotations)
 18 | 
 19 |         # Second, add the annotation information to the document
 20 |         doc = PDFHelper.addLabelsToDoc(input_filepath, page_entities_map)
 21 |         doc.save(output_filepath, deflate=True)
 22 |         return output_filepath
 23 |         
 24 |     @staticmethod
 25 |     def enhanceEntitiesWithBoundingBoxInfo(PDF_ANNOTATIONS):
 26 |         
 27 |         
 28 |         def generateBoundingBoxCoordinates(block_ref):
 29 |             '''Function to get bounding box coordinates of an entity. If entity has more than one child blocks, it combines them together.'''
 30 |             if "ChildBlocks" in block_ref:
 31 |                 for index, child_block_ref in enumerate(block_ref["ChildBlocks"]):
 32 |                     block = blocks[block_ref["ChildBlocks"][index]["ChildBlockId"]]
 33 |                     if index==0:
 34 |                         top = block["Geometry"]["BoundingBox"]["Top"]
 35 |                         left = block["Geometry"]["BoundingBox"]["Left"]
 36 |                         right = block["Geometry"]["BoundingBox"]["Left"] + block["Geometry"]["BoundingBox"]["Width"]
 37 |                         bottom = block["Geometry"]["BoundingBox"]["Top"] + block["Geometry"]["BoundingBox"]["Height"]
 38 |                     else:
 39 |                         if block["Geometry"]["BoundingBox"]["Top"]<top:
 40 |                             top = block["Geometry"]["BoundingBox"]["Top"]
 41 |                         if block["Geometry"]["BoundingBox"]["Left"]<left:
 42 |                             left = block["Geometry"]["BoundingBox"]["Left"]
 43 |                         if block["Geometry"]["BoundingBox"]["Left"] + block["Geometry"]["BoundingBox"]["Width"]>right:
 44 |                             right = block["Geometry"]["BoundingBox"]["Left"] + block["Geometry"]["BoundingBox"]["Width"]
 45 |                         if block["Geometry"]["BoundingBox"]["Top"] + block["Geometry"]["BoundingBox"]["Height"]>bottom:
 46 |                             bottom = block["Geometry"]["BoundingBox"]["Top"] + block["Geometry"]["BoundingBox"]["Height"]
 47 |                 return {
 48 |                     "Top": top,
 49 |                     "Left": left,
 50 |                     "Width": right - left,
 51 |                     "Height": bottom - top
 52 |                 }
 53 |             else:
 54 |                 block = blocks[block_ref["BlockId"]]
 55 |                 return block["Geometry"]["BoundingBox"]
 56 |         
 57 |         blocks = {block["Id"]: block for block in PDF_ANNOTATIONS["Blocks"]}
 58 |         entities = PDF_ANNOTATIONS["Entities"]
 59 | 
 60 |         for entity in entities:
 61 |             entity["BoundingBox"] = generateBoundingBoxCoordinates(entity["BlockReferences"][0])
 62 |             entity["Page"] = blocks[entity["BlockReferences"][0]["BlockId"]]["Page"]
 63 | 
 64 |         #Create map of entities in each page
 65 |         page_entities_map = {}
 66 |         for entity in entities:
 67 |             page_entities_map.setdefault(entity["Page"], []).append(entity)
 68 | 
 69 |         return page_entities_map
 70 | 
 71 |     
 72 |     @staticmethod
 73 |     def addLabelsToDoc(PDF_FILE_LOCAL_URL, page_entities_map):
 74 | 
 75 |         RGB_COLORS = [[1, 112.0/255, 166.0/255], [252.0/255, 122.0/255, 87.0/255], [0, 139.0/255, 248.0/255], 
 76 |                       [199.0/255, 62.0/255, 29.0/255], [102.0/255, 16.0/255, 242.0/255]]
 77 | 
 78 |         import fitz
 79 |         print(fitz.__doc__)
 80 |         if fitz.VersionBind.split(".") < ["1", "17", "0"]:
 81 |             print("PyMuPDF v1.17.0+ is needed.")
 82 | 
 83 |         doc = fitz.open(PDF_FILE_LOCAL_URL)
 84 |         num_pages = doc.page_count
 85 | 
 86 |         entity_type_color_map = {}
 87 |         for i in range(num_pages):
 88 |             page = doc.load_page(i)
 89 |             page.set_rotation(0)
 90 |             page_width = page.bound().width
 91 |             page_height = page.bound().height
 92 |             for entity in page_entities_map[i+1]:
 93 |                 # Assign color to entity type
 94 |                 if entity["Type"] not in entity_type_color_map:
 95 |                     entity_type_color_map[entity["Type"]] = RGB_COLORS[0]
 96 |                     del RGB_COLORS[0]
 97 |                 entity_type_color = entity_type_color_map[entity["Type"]]
 98 | 
 99 |                 # Box annotation over entity text
100 |                 box_rect = fitz.Rect(page_width * entity["BoundingBox"]["Left"], page_height * entity["BoundingBox"]["Top"], 
101 |                                      page_width * (entity["BoundingBox"]["Left"] + entity["BoundingBox"]["Width"]), 
102 |                                      page_height * (entity["BoundingBox"]["Top"] + entity["BoundingBox"]["Height"]))
103 |                 rect_annot = page.add_rect_annot(box_rect)
104 |                 rect_annot.set_border(width=0.5)
105 |                 rect_annot.set_colors(stroke=entity_type_color)
106 |                 rect_annot.update()
107 | 
108 |                 text_rect = fitz.Rect(page_width * entity["BoundingBox"]["Left"] - 15, 
109 |                 page_height * entity["BoundingBox"]["Top"] - 20, 
110 |                 page_width * (entity["BoundingBox"]["Left"] + entity["BoundingBox"]["Width"] + 20), 
111 |                 page_height * (entity["BoundingBox"]["Top"] + entity["BoundingBox"]["Height"]))
112 |                 text_annot = page.add_freetext_annot(text_rect, text=entity["Type"], text_color=entity_type_color, fontsize=20)
113 |                 
114 |                 text_annot.update(border_color=[])
115 | 
116 |         return doc
117 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/README.md:
--------------------------------------------------------------------------------
  1 | # ComprehendCustomerScripts
  2 | 
  3 | ## Introduction
  4 | 
  5 | This package contains scripts for our customers to experiment the features released by AWS Comprehend.
  6 | 
  7 | ## Install dependencies:
  8 | 1. Install AWS CLI
  9 | 2. Install python3
 10 | 
 11 | ## Documentation
 12 | To provide our customers a seamless integration between SageMaker GroundTruth and Comprehend's Custom API's, this package contains the following: 1) a shell script (convertGroundtruthToComprehendERFormat.sh) that converts the output of SageMaker GroundTruth NER labeling job to a format which is compatible with Comprehend's EntityRecognizer API. 2) a shell script (convertGroundtruthToComprehendCLRFormat.sh) that converts the output of SageMaker GroundTruth MultiClass and MultiLabel labeling job to a format which is compatible with Comprehend's DocumentClassifier API.
 13 | 
 14 | ### EntityRecognizer
 15 | The script takes the following 3 inputs from the customer:
 16 | - S3Uri of the bucket where the output.manifest file (SageMaker Groundtruth labeling job output) is stored
 17 | - S3Uri of the bucket where the customer expects the dataset.csv (Comprehend's CreateEntityRecognizer API input) to be stored
 18 | - S3Uri of the bucket where the customer expects the annotations.csv (Comprehend's CreateEntityRecognizer API input) to be stored
 19 | 
 20 | The script performs the following tasks:
 21 | 1) Download the output.manifest file from the S3Uri provided by the customer
 22 | 2) Parse the output.manifest file and create dataset.csv and annotations.csv
 23 | 3) Upload the dataset and annotations file in the S3 bucket provided by the customer
 24 | 
 25 | To run the script, execute the following command:
 26 | ```
 27 | ./convertGroundtruthToCompERFormat.sh <inputS3Uri> <outputDatasetS3Uri> <outputAnnotationsS3Uri>
 28 | ```
 29 | 
 30 | ## Example:
 31 | output.manifest.json:
 32 | ```
 33 | {"source":"Bob was born on Jan 1 1990 and lived his whole life in Minneapolis.","EntityRecognizerPOC-1":{"annotations":{"entities":[{"endOffset":22,"startOffset":16,"label":"Date"},{"endOffset":3,"startOffset":0,"label":"Person"},{"endOffset":67,"startOffset":56,"label":"Location"}],"labels":[{"label":"Date"},{"label":"Location"},{"label":"Person"}]}},"EntityRecognizerPOC-1-metadata":{"entities":[{"confidence":0.08},{"confidence":0.08},{"confidence":0.09}],"job-name":"labeling-job/entityrecognizerpoc-1","type":"groundtruth/text-span","creation-date":"2020-04-17T23:27:41.344393","human-annotated":"yes"}}
 34 | {"source":"Bob was born on Jan 1 1990 and lived his whole life in Minneapolis.","EntityRecognizerPOC-1":{"annotations":{"entities":[{"endOffset":26,"startOffset":16,"label":"Date"},{"endOffset":67,"startOffset":56,"label":"Location"}],"labels":[{"label":"Date"},{"label":"Location"},{"label":"Person"}]}},"EntityRecognizerPOC-1-metadata":{"entities":[{"confidence":0.09},{"confidence":0.09}],"job-name":"labeling-job/entityrecognizerpoc-1","type":"groundtruth/text-span","creation-date":"2020-04-17T23:26:35.975508","human-annotated":"yes"}}
 35 | ```
 36 | where each line is a JSON object.
 37 | The shell script takes the S3Uri of where this file is stored as the first argument.
 38 | 
 39 | The script will executes the AWS CLI command to download the file to the local.
 40 | 
 41 | The script will parse the outputS3Uri's provided, to fetch the expected dataset and annotation file name.
 42 | It will parse output.manifest file and generate dataset.csv and annotations.csv file based on the file names obtained from parsing the outputS3Uri.
 43 | 
 44 | dataset.csv:
 45 | ```
 46 | Bob was born on Jan 1 1990 and lived his whole life in Minneapolis
 47 | Bob was born on Jan 1 1990 and lived his whole life in Minneapolis
 48 | ```
 49 | 
 50 | annotations.csv
 51 | ```
 52 | File,Line,Begin Offset,End Offset,Type
 53 | dataset.csv,0,0,3,Person
 54 | dataset.csv,0,16,22,Date
 55 | dataset.csv,0,56,67,Location
 56 | ```
 57 | 
 58 | Eventually, the shell script will execute the AWS CLI command to upload dataset and annotations file to the S3Uri provided as the input.
 59 | 
 60 | ### DocumentClassifier:
 61 | The convertGroundtruthToComprehendCLRFormat.sh script takes the following 3 inputs from the customer:
 62 | - Mode of the training job. Valid values are MULTI_CLASS and MULTI_LABEL
 63 | - S3Uri of the bucket where the output.manifest file (SageMaker Groundtruth labeling job output) is stored
 64 | - S3Uri of the bucket where the customer expects the dataset.csv (Comprehend's CreateDocumentClassifier API input) to be stored
 65 | - LabelDelimiter in case of MultiLabel job. This is an optional field, which is needed only for MULTI_LABEL mode jobs, default value = "|"
 66 | 
 67 | The script performs the following tasks:
 68 | 1) Download the output.manifest file from the S3Uri provided by the customer
 69 | 2) Parse the output.manifest file and create dataset.csv
 70 | 3) Upload the dataset file to the S3 bucket provided by the customer
 71 | 
 72 | To run the script, execute the following command:
 73 | ```
 74 | ./convertGroundtruthToCompCLRFormat.sh <mode> <inputS3Uri> <outputDatasetS3Uri> <label_delimiter>
 75 | ```
 76 | 
 77 | #### Multi_Class Example:
 78 | 
 79 | output.manifest
 80 | 
 81 | ```
 82 | {"source":"Whatever you decide to do make sure it makes you #happy.","cutomDocClassification-multi-class":0,"cutomDocClassification-multi-class-metadata":{"confidence":0,"job-name":"labeling-job/cutomdocclassification-multi-class","class-name":"joy","human-annotated":"yes","creation-date":"2020-08-18T05:14:21.122782","type":"groundtruth/text-classification"}}
 83 | ```
 84 |  
 85 | dataset.csv:
 86 | 
 87 | ```
 88 | joy,Whatever you decide to do make sure it makes you #happy.
 89 | ```
 90 | 
 91 | #### Multi_Label Example:
 92 | 
 93 | output.manifest
 94 | ```
 95 | {"source":"Whatever you decide to do make sure it makes you #happy.","cutomDocClassification":[5,1],"cutomDocClassification-metadata":{"job-name":"labeling-job/cutomdocclassification","class-map":{"1":"optimism","5":"joy"},"human-annotated":"yes","creation-date":"2020-08-14T12:09:02.115245","confidence-map":{"1":0.49,"5":0.91},"type":"groundtruth/text-classification-multilabel"}}
 96 | ```
 97 | 
 98 | dataset.csv
 99 | ```
100 | optimism|joy,Whatever you decide to do make sure it makes you #happy.
101 | ```
102 | 
103 | Each line in the output.manifest file is a JSON object.
104 | The shell script takes the mode of the classifier job as the first argument. It also takes S3Uri of where this manifest file is stored as the second argument.
105 | 
106 | The script will executes the AWS CLI command to download the file to the local.
107 | 
108 | It will parse output.manifest file and generate dataset.csv file based on the file names obtained from parsing the outputS3Uri.
109 | 
110 | # LICENSE
111 | This library is licensed under the MIT-0 License. See the LICENSE file. 
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/__pycache__/customer_errors.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/__pycache__/customer_errors.cpython-37.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/__pycache__/groundtruth_to_comprehend_clr_format_converter.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/__pycache__/groundtruth_to_comprehend_clr_format_converter.cpython-37.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/convertGroundTruthToComprehendCLRFormat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ "$#" -lt 3 ]]; then
 4 |     echo "USAGE: $0 <mode> <inputS3Uri> <outputDatasetS3Uri> <optional: label_delimiter>"
 5 |     echo " <mode>: Provide mode of DocumentClassifier, Valid values: MULTI_CLASS|MULTI_LABEL"
 6 |     echo " <inputS3Bucket>: Provide the S3Uri where the SageMaker GroundTruth output file is located"
 7 |     echo " <outputDatasetS3Uri>: Provide the complete S3Uri where the dataset file should be uploaded"
 8 |     echo " <label_delimiter>: Provide a delimiter for multilabel job. Default value='|' "
 9 |     echo " example: ./convertGroundtruthToCompCLRFormat.sh MULTI_CLASS s3://input-bucket/DocumentClassifier/manifests/output/output.manifest s3://output-bucket/CLR/dataset.csv"
10 |     echo " example: ./convertGroundtruthToCompCLRFormat.sh MULTI_LABEL s3://input-bucket/DocumentClassifier/manifests/output/output.manifest s3://output-bucket/CLR/dataset.csv"
11 |     echo " example: ./convertGroundtruthToCompCLRFormat.sh MULTI_LABEL s3://input-bucket/DocumentClassifier/manifests/output/output.manifest s3://output-bucket/CLR/dataset.csv $"
12 |     exit 1
13 | fi
14 | 
15 | echo "Provided mode=$1, inputS3Uri=$2, outputDatasetS3Uri=$3, label_delimiter=$4"
16 | 
17 | MODE=$1
18 | INPUT_S3_URI=$2
19 | DATASET_OUTPUT_S3_URI=$3
20 | LABEL_DELIMITER=$4
21 | 
22 | if [[ -z ${LABEL_DELIMITER} ]]; then
23 |     LABEL_DELIMITER="|"
24 | fi
25 | 
26 | printf "\nDownloading the output.manifest file from the S3 location: [%s]\n" $2
27 | 
28 | aws s3 cp ${INPUT_S3_URI} "output.manifest" || exit 1
29 | 
30 | printf "\nTransforming the output.manifest file to csv format\n"
31 | 
32 | array=()
33 | while read line ; do
34 |   array+=($line)
35 | done < <(python3 groundtruth_format_conversion_handler.py ${MODE} ${DATASET_OUTPUT_S3_URI} ${LABEL_DELIMITER})
36 | 
37 | printf "\nUploading the files to the destination S3 location: \n"
38 | aws s3 cp ${array[0]} ${DATASET_OUTPUT_S3_URI} || exit 1
39 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/customer_errors.py:
--------------------------------------------------------------------------------
 1 | from string import Template
 2 | 
 3 | CANNOT_PARSE_AUGMENTED_MANIFEST = Template('An augmented manifest file in your request is an invalid JSON file. '
 4 |                                            'Amazon Comprehend is unable to parse line ${line} in the file ${file_name}. '
 5 |                                            'Correct the file and try again.')
 6 | 
 7 | DOCUMENT_TOO_BIG = Template(
 8 |     'The maximum size of an individual document is ${size}MB. The document '
 9 |     'on line: ${line} of file: ${file} was greater than the maximum size.')
10 | 
11 | EMPTY_LABEL_FOUND = Template(
12 |     'Empty label found on line: ${line} of file: ${file}. This could be because '
13 |     'of 1) a leading label delimiter 2) a trailing label delimiter 3) consecutive '
14 |     'label delimiters in the labels list column or 4) an empty string.')
15 | 
16 | EMPTY_LABEL_UNSUPPORTED = \
17 |     Template('Labels cannot be empty. The training file ${filename} contained '
18 |              'at least one empty label.')
19 | 
20 | LABEL_TOO_BIG = Template(
21 |     'The maximum size of an individual label is ${size} characters. The label '
22 |     'on line: ${line} of file: ${file} was greater than the maximum size.')
23 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/groundtruth_format_conversion_handler.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import argparse
 3 | from urllib.parse import urlparse
 4 | 
 5 | from groundtruth_to_comprehend_clr_format_converter import GroundTruthToComprehendCLRFormatConverter
 6 | 
 7 | 
 8 | class GroundTruthToCLRFormatConversionHandler:
 9 | 
10 |     def __init__(self):
11 |         self.convert_object = GroundTruthToComprehendCLRFormatConverter()
12 |         self.dataset_filename = ""
13 | 
14 |     def validate_s3_input(self, args):
15 |         dataset_output_S3Uri = args.dataset_output_S3Uri
16 | 
17 |         dataset_url = urlparse(dataset_output_S3Uri)
18 |         dataset_scheme = dataset_url.scheme
19 |         self.dataset_filename = dataset_url.path.split("/")[-1]
20 | 
21 |         print(self.dataset_filename)
22 | 
23 |         if dataset_scheme != "s3" or self.dataset_filename.split(".")[-1] != "csv":
24 |             raise Exception("Either of the output S3 lo cation provided is incorrect!")
25 | 
26 |     def read_write_multiclass_dataset(self):
27 |         with open('output.manifest', 'r', encoding='utf-8') as groundtruth_output_file, \
28 |                 open(self.dataset_filename, 'a', encoding='utf8') as multiclass_dataset:
29 |             for index, jsonLine in enumerate(groundtruth_output_file):
30 |                 class_name, source = self.convert_object.convert_to_multiclass_dataset(index, jsonLine)
31 |                 source = json.dumps(source).strip('"')
32 |                 multiclass_dataset.write(class_name + ',"' + source + '"')
33 |                 multiclass_dataset.write("\n")
34 | 
35 |     def read_write_multilabel_dataset(self, label_delimiter):
36 |         with open('output.manifest', 'r', encoding='utf-8') as groundtruth_output_file, \
37 |                 open(self.dataset_filename, 'a', encoding='utf8') as multilabel_dataset:
38 |             for index, jsonLine in enumerate(groundtruth_output_file):
39 |                 labels, source = self.convert_object.convert_to_multilabel_dataset(index, jsonLine, label_delimiter)
40 |                 source = json.dumps(source).strip('"')
41 |                 multilabel_dataset.write(labels + ',"' + source + '"')
42 |                 multilabel_dataset.write("\n")
43 | 
44 | 
45 | def main():
46 |     parser = argparse.ArgumentParser(description="Parsing the output S3Uri")
47 |     parser.add_argument('mode')
48 |     parser.add_argument('dataset_output_S3Uri')
49 |     parser.add_argument('label_delimiter')
50 |     args = parser.parse_args()
51 |     handler = GroundTruthToCLRFormatConversionHandler()
52 |     handler.validate_s3_input(args)
53 |     if args.mode == "MULTI_CLASS":
54 |         handler.read_write_multiclass_dataset()
55 |     elif args.mode == "MULTI_LABEL":
56 |         handler.read_write_multilabel_dataset(args.label_delimiter)
57 |     else:
58 |         raise Exception("The value provided for mode is invalid. Valid values are MUTLI_CLASS|MULTI_LABEL")
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     main()
63 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/DocumentClassifier/groundtruth_to_comprehend_clr_format_converter.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | 
  3 | from customer_errors import CANNOT_PARSE_AUGMENTED_MANIFEST, DOCUMENT_TOO_BIG, LABEL_TOO_BIG, EMPTY_LABEL_UNSUPPORTED, \
  4 |     EMPTY_LABEL_FOUND
  5 | 
  6 | 
  7 | SOURCE = 'source'
  8 | CLASS_NAME = 'class-name'
  9 | CLASS_MAP = 'class-map'
 10 | ATTRIBUTE_NAME_PARAMETER = 'attributeNames'
 11 | FAILURE_REASON = 'failure-reason'
 12 | BYTES_TO_MIB = 1024 * 1024
 13 | 
 14 | default_limits = {
 15 |     'MAX_LABEL_SIZE_IN_CHARS': 5000,
 16 |     'MAX_DOCUMENT_SIZE_MB': 10
 17 | }
 18 | 
 19 | 
 20 | class GroundTruthToComprehendCLRFormatConverter:
 21 | 
 22 |     def __init__(self):
 23 |         self.groundtruth_manifest_file_name = "output.manifest"
 24 |         self.labeling_job_name = ""
 25 |         self.label_delimiter = ""
 26 | 
 27 |     def _parse_manifest_input(self, index, input):
 28 |         try:
 29 |             if input is not None:
 30 |                 return json.loads(input)
 31 |         except ValueError:
 32 |             raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
 33 |                                                                        file_name=self.groundtruth_manifest_file_name))
 34 | 
 35 |     # Raise CustomerError if the document size > 10MB
 36 |     def _check_document_size(self, source, index, limits):
 37 |         document_size_mb = len(source.encode('utf-8')) / BYTES_TO_MIB
 38 |         if document_size_mb > limits['MAX_DOCUMENT_SIZE_MB']:
 39 |             raise Exception(DOCUMENT_TOO_BIG.substitute(
 40 |                 size=limits['MAX_DOCUMENT_SIZE_MB'],
 41 |                 line=index,
 42 |                 file=self.groundtruth_manifest_file_name,
 43 |             ))
 44 | 
 45 |     def get_labeling_job_name(self, index, jsonLine_input):
 46 |         job_name = None
 47 |         for key, value in jsonLine_input.items():
 48 |             if "-metadata" in key:
 49 |                 job_name = key
 50 | 
 51 |         if job_name is None:
 52 |             raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
 53 |                                                                        file_name=self.groundtruth_manifest_file_name))
 54 |         return job_name
 55 | 
 56 |     # Raise CustomerError if the class/label size is >5000 characters
 57 |     def _check_label_size(self, label, index, limits):
 58 |         if len(label) > limits['MAX_LABEL_SIZE_IN_CHARS']:
 59 |             raise Exception(LABEL_TOO_BIG.substitute(size=limits['MAX_LABEL_SIZE_IN_CHARS'],
 60 |                                                      line=index,
 61 |                                                      file=self.groundtruth_manifest_file_name))
 62 | 
 63 |     """
 64 |     Convert dict of labels into a string where each label is joined using the label_delimiter
 65 |     Example: "label1|label2|label3"
 66 |     """
 67 | 
 68 |     def _get_labels(self, class_map):
 69 |         return ''.join([value + self.label_delimiter for value in class_map.values()])[:-1]
 70 | 
 71 |     def convert_to_multiclass_dataset(self, index, jsonLine):
 72 | 
 73 |         jsonLine_object = self._parse_manifest_input(index, jsonLine)
 74 |         if jsonLine_object is not None:
 75 |             if SOURCE not in jsonLine_object.keys():
 76 |                 raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
 77 |                                                                            file_name=self.groundtruth_manifest_file_name))
 78 |             source = jsonLine_object[SOURCE]
 79 |             self._check_document_size(source, index, limits=default_limits)
 80 | 
 81 |             self.labeling_job_name = self.get_labeling_job_name(index, jsonLine_object)
 82 |             if CLASS_NAME not in jsonLine_object[self.labeling_job_name].keys():
 83 |                 raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
 84 |                                                                            file_name=self.groundtruth_manifest_file_name))
 85 | 
 86 |             class_name = jsonLine_object[self.labeling_job_name][CLASS_NAME]
 87 |             if not class_name:
 88 |                 raise Exception(EMPTY_LABEL_UNSUPPORTED.substitute(filename=self.groundtruth_manifest_file_name))
 89 |             self._check_label_size(class_name, index, limits=default_limits)
 90 | 
 91 |         return class_name, source
 92 | 
 93 |     def convert_to_multilabel_dataset(self, index, jsonLine, label_delimiter):
 94 |         self.label_delimiter = label_delimiter
 95 | 
 96 |         jsonLine_object = self._parse_manifest_input(index, jsonLine)
 97 |         if jsonLine_object is not None:
 98 |             if SOURCE not in jsonLine_object.keys():
 99 |                 raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
100 |                                                                            file_name=self.groundtruth_manifest_file_name))
101 |             source = jsonLine_object[SOURCE]
102 |             self._check_document_size(source, index, limits=default_limits)
103 | 
104 |             self.labeling_job_name = self.get_labeling_job_name(index, jsonLine_object)
105 | 
106 |             if CLASS_MAP not in jsonLine_object[self.labeling_job_name].keys():
107 |                 raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
108 |                                                                            file_name=self.groundtruth_manifest_file_name))
109 |             class_map = jsonLine_object[self.labeling_job_name][CLASS_MAP]
110 | 
111 |             # Raise CustomerError when no label found for the document
112 |             if len(class_map) == 0:
113 |                 raise Exception(EMPTY_LABEL_UNSUPPORTED.substitute(filename=self.groundtruth_manifest_file_name))
114 | 
115 |             # Raise CustomerError if label size is more than 5000 characters
116 |             for label in class_map.values():
117 |                 self._check_label_size(label, index, limits=default_limits)
118 | 
119 |             labels = self._get_labels(class_map)
120 | 
121 |             # Raise Customer error when empty label found in the list of labels
122 |             label_list = labels.split(self.label_delimiter)
123 |             for label in label_list:
124 |                 if len(label) == 0:
125 |                     raise Exception(EMPTY_LABEL_FOUND.substitute(line=index,
126 |                                                                  file=self.groundtruth_manifest_file_name))
127 | 
128 |         return labels, source
129 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/convertGroundtruthToComprehendERFormat.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ "$#" -lt 3 ]]; then
 4 |     echo "USAGE: $0 <inputS3Uri> <outputDatasetS3Uri> <outputAnnotationsS3Uri>"
 5 |     echo " <inputS3Bucket>: Provide the S3Uri where the SageMaker GroundTruth output file is located"
 6 |     echo " <outputDatasetS3Uri>: Provide the complete S3Uri where the dataset file should be uploaded"
 7 |     echo " <outputAnnotationsS3Uri>: Provide the complete S3Uri, where the annotation file should be upload"
 8 |     echo " example: ./convertGroundtruthToCompERFormat.sh s3://input-bucket/EntityRecognizer/manifests/output/output.manifest s3://output-bucket/ER/dataset.csv s3://output-bucket/ER/annotations.csv"
 9 |     exit 1
10 | fi
11 | 
12 | echo "Provided inputS3Uri=$1, outputDatasetS3Uri=$2, outputAnnotationsS3Uri=$3"
13 | 
14 | INPUT_S3_URI=$1
15 | DATASET_OUTPUT_S3_URI=$2
16 | ANNOTATIONS_OUTPUT_S3_URI=$3
17 | 
18 | printf "\nDownloading the output.manifest file from the S3 location: [%s]\n" $1
19 | 
20 | aws s3 cp ${INPUT_S3_URI} "output.manifest" || exit 1
21 | 
22 | printf "\nTransforming the output.manifest file to csv format\n"
23 | 
24 | array=()
25 | while read line ; do
26 |   array+=($line)
27 | done < <(python3 groundtruth_format_conversion_handler.py ${DATASET_OUTPUT_S3_URI} ${ANNOTATIONS_OUTPUT_S3_URI})
28 | 
29 | printf "\nUploading the files to the destination S3 location: \n"
30 | aws s3 cp ${array[0]} ${DATASET_OUTPUT_S3_URI} || exit 1
31 | aws s3 cp ${array[1]} ${ANNOTATIONS_OUTPUT_S3_URI} || exit 1


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/customer_errors.py:
--------------------------------------------------------------------------------
 1 | from string import Template
 2 | 
 3 | CANNOT_PARSE_AUGMENTED_MANIFEST = Template('An augmented manifest file in your request is an invalid JSON lines file. '
 4 |                                            'Amazon Comprehend is unable to parse line ${line} in the file ${file_name}. '
 5 |                                            'Correct the file and try again.')
 6 | 
 7 | DOC_SIZE_EXCEEDED = Template('A document exceeds the maximum size in the file ${file} on line ${line}. '
 8 |                              'Each document can be up to ${size} bytes.')
 9 | 
10 | WRONG_ANNOTATION = Template('An incorrect annotation is located in the file ${file_name} on line ${line}. The offset '
11 |                             'begins at position ${begin_offset} and ends at position ${end_offset}. ${message}.')
12 | 
13 | INVALID_OFFSETS = Template('An offset exceeds the maximum length in the file ${doc} on line ${line_index}. The offset '
14 |                            'begins at position ${begin_offset} and ends at position  ${end_offset}. An offset can be '
15 |                            'up to ${line_size} in length.')
16 | 
17 | OVERLAPPING_ANNOTATIONS = Template('Overlapping annotations are located in the file ${doc} on line ${line}. '
18 |                                    'Annotations must not overlap. The annotations are: ${annotations1} and ${annotations2}.')
19 | 
20 | 
21 | INVALID_END_OFFSET = 'End Offset cannot be less than Begin Offset.'
22 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/groundtruth_format_conversion_handler.py:
--------------------------------------------------------------------------------
 1 | from groundtruth_to_comprehend_format_converter import GroundTruthToComprehendFormatConverter
 2 | import csv
 3 | import json
 4 | import argparse
 5 | from urllib.parse import urlparse
 6 | 
 7 | ANNOTATION_CSV_HEADER = ['File', 'Line', 'Begin Offset', 'End Offset', 'Type']
 8 | 
 9 | 
10 | class GroundTruthFormatConversionHandler:
11 | 
12 |     def __init__(self):
13 |         self.convert_object = GroundTruthToComprehendFormatConverter()
14 |         self.dataset_filename = ""
15 |         self.annotation_filename = ""
16 | 
17 |     def validate_s3_input(self, args):
18 |         dataset_output_S3Uri = args.dataset_output_S3Uri
19 |         annotations_output_S3Uri = args.annotations_output_S3Uri
20 | 
21 |         dataset_url = urlparse(dataset_output_S3Uri)
22 |         dataset_scheme = dataset_url.scheme
23 |         self.dataset_filename = dataset_url.path.split("/")[-1]
24 | 
25 |         annotations_url = urlparse(annotations_output_S3Uri)
26 |         annotation_scheme = annotations_url.scheme
27 |         self.annotation_filename = annotations_url.path.split("/")[-1]
28 | 
29 |         print(self.dataset_filename)
30 |         print(self.annotation_filename)
31 | 
32 |         if dataset_scheme != "s3" or annotation_scheme != "s3" or self.dataset_filename.split(".")[-1] != "csv" or self.annotation_filename.split(".")[-1] != "csv":
33 |             raise Exception("Either of the output S3 location provided is incorrect!")
34 |         
35 |         # write header
36 |         with open(self.annotation_filename, 'w', encoding='utf8') as annotation_file:
37 |             datawriter = csv.writer(annotation_file, delimiter=',', lineterminator='\n')
38 |             datawriter.writerow(ANNOTATION_CSV_HEADER)
39 |     
40 |     def read_augmented_manifest_file(self):
41 |         with open('output.manifest', 'r', encoding='utf-8') as groundtruth_output_file:
42 |             for index, jsonLine in enumerate(groundtruth_output_file):
43 |                 self.read_write_dataset_annotations(index, jsonLine)
44 | 
45 |     def read_write_dataset_annotations(self, index, jsonLine):
46 |         with open(self.dataset_filename, 'a', encoding='utf8') as dataset, open(self.annotation_filename, 'a', encoding='utf8') as annotation_file:
47 |             datawriter = csv.writer(annotation_file, delimiter=',', lineterminator='\n')
48 |             source, annotations = self.convert_object.convert_to_dataset_annotations(index, jsonLine)
49 |             # write the document in the dataset file
50 |             source = json.dumps(source).strip('"')
51 |             dataset.write('"' + source + '"')
52 |             dataset.write("\n")
53 |             
54 |             # write the annotations of each document in the annotations file
55 |             for entry in annotations:
56 |                 datawriter.writerow(entry)
57 |                 
58 | 
59 | def main():
60 |     parser = argparse.ArgumentParser(description="Parsing the output S3Uri")
61 |     parser.add_argument('dataset_output_S3Uri')
62 |     parser.add_argument('annotations_output_S3Uri')
63 |     args = parser.parse_args()
64 |     handler = GroundTruthFormatConversionHandler()
65 |     handler.validate_s3_input(args)
66 |     handler.read_augmented_manifest_file()
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     main()
71 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/EntityRecognizer/groundtruth_to_comprehend_format_converter.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from operator import itemgetter
  3 | from customer_errors import CANNOT_PARSE_AUGMENTED_MANIFEST, DOC_SIZE_EXCEEDED, WRONG_ANNOTATION, INVALID_END_OFFSET, \
  4 |     INVALID_OFFSETS, OVERLAPPING_ANNOTATIONS
  5 | 
  6 | SOURCE = 'source'
  7 | ANNOTATIONS = 'annotations'
  8 | ENTITIES = 'entities'
  9 | START_OFFSET = 'startOffset'
 10 | END_OFFSET = 'endOffset'
 11 | LABEL = 'label'
 12 | MAX_TRAIN_DOC_SIZE = 5000
 13 | 
 14 | 
 15 | class GroundTruthToComprehendFormatConverter:
 16 | 
 17 |     def __init__(self):
 18 |         self.input_file_name = "dataset.csv"
 19 |         self.groundtruth_manifest_file_name = "output.manifest"
 20 |         self.labeling_job_name = ""
 21 |         self.maximum_offset = 0
 22 | 
 23 |     def convert_to_dataset_annotations(self, index, jsonLine):
 24 |         # parse the jsonLine to generate the dataset entry
 25 |         jsonObj = self.parse_manifest_input(jsonLine)
 26 |         if SOURCE not in jsonObj:
 27 |             raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
 28 |                                                                        file_name=self.groundtruth_manifest_file_name))
 29 |         source = jsonObj[SOURCE]
 30 |         if len(source.encode('utf-8')) > MAX_TRAIN_DOC_SIZE:
 31 |             raise Exception(DOC_SIZE_EXCEEDED.substitute(file=self.groundtruth_manifest_file_name,
 32 |                                                          line=index,
 33 |                                                          size=MAX_TRAIN_DOC_SIZE))
 34 |         self.maximum_offset = len(source.encode('utf-8'))
 35 | 
 36 |         # parse the jsonLine to generate the annotations entry
 37 |         annotations = []
 38 |         
 39 |         self.labeling_job_name = self.get_labeling_job_name(index, jsonObj)
 40 |         number_of_labels = len(jsonObj[self.labeling_job_name][ANNOTATIONS][ENTITIES])
 41 |         labeling_job_info = jsonObj[self.labeling_job_name][ANNOTATIONS][ENTITIES]
 42 |         for ind in range(number_of_labels):
 43 |             begin_offset = int(labeling_job_info[ind][START_OFFSET])
 44 |             end_offset = int(labeling_job_info[ind][END_OFFSET])
 45 |             label = labeling_job_info[ind][LABEL]
 46 |             if end_offset < begin_offset:
 47 |                 raise Exception(WRONG_ANNOTATION.substitute(file_name=self.groundtruth_manifest_file_name,
 48 |                                                             line=int(index),
 49 |                                                             begin_offset=begin_offset,
 50 |                                                             end_offset=end_offset,
 51 |                                                             message=INVALID_END_OFFSET))
 52 |             if (begin_offset >= self.maximum_offset) or (end_offset > self.maximum_offset):
 53 |                 raise Exception(INVALID_OFFSETS.substitute(doc=self.groundtruth_manifest_file_name,
 54 |                                                            line_index=index,
 55 |                                                            begin_offset=begin_offset,
 56 |                                                            end_offset=end_offset,
 57 |                                                            line_size=self.maximum_offset))
 58 |             annotations.append((self.input_file_name, index, begin_offset, end_offset, label))
 59 |         
 60 |         self._check_for_overlapping_annotations(annotations)
 61 |            
 62 |         return source, annotations
 63 | 
 64 |     def parse_manifest_input(self, jsonLine):
 65 |         try:
 66 |             jsonObj = json.loads(jsonLine)
 67 |             return jsonObj
 68 |         except ValueError as e:
 69 |             print(f"Error decoding the string: {jsonLine}, {e}")
 70 |             raise
 71 | 
 72 |     def get_labeling_job_name(self, index, jsonObj):
 73 |         job_name = None
 74 |         for key, value in jsonObj.items():
 75 |             if self.is_json_serializable(value):
 76 |                 if ANNOTATIONS in value:
 77 |                     job_name = key
 78 |         if job_name is None or ANNOTATIONS not in jsonObj[job_name].keys() or ENTITIES not in jsonObj[job_name][ANNOTATIONS].keys():
 79 |             raise Exception(CANNOT_PARSE_AUGMENTED_MANIFEST.substitute(line=index,
 80 |                                                                        file_name=self.groundtruth_manifest_file_name))
 81 |         return job_name
 82 | 
 83 |     def is_json_serializable(self, value):
 84 |         try:
 85 |             json.dumps(value)
 86 |             return True
 87 |         except ValueError as e:
 88 |             print(e)
 89 |             return False
 90 |     
 91 |     """
 92 |         Example: annotations = [(doc.txt,0,25,16,DATE), (doc.txt,0,0,3,PROGRAMMER), (doc.txt,0,55,66,LOCATION)]
 93 |         Sort the annotations based on the begin offset,
 94 |         annotations = [(doc.txt,0,0,3,PROGRAMMER), (doc.txt,0,25,16,DATE), (doc.txt,0,55,66,LOCATION)]
 95 |         Considering 2 annotations at a time, Compare the end offset of 1st annotation with begin offset of 2nd annotation and 
 96 |         raise an exception if they overlap
 97 |     """
 98 | 
 99 |     def _check_for_overlapping_annotations(self, annotations):
100 |         annotations.sort(key=itemgetter(2))  # 2 represents the index of beginOffset in the tuple
101 |         for i in range(1, len(annotations)):
102 |             previous_end_offset = annotations[i - 1][3]  # 3 represents the index of the endOffset in the previous tuple
103 |             current_begin_offset = annotations[i][2]  # 2 represents the index of the beginOffset in the current tuple
104 |             if previous_end_offset > current_begin_offset:
105 |                 raise Exception(OVERLAPPING_ANNOTATIONS.substitute(doc=self.groundtruth_manifest_file_name,
106 |                                                                    line=annotations[i][1],
107 |                                                                    annotations1=annotations[i - 1][4], # represents entity types in the previous tuple that is overlapping
108 |                                                                    annotations2=annotations[i][4]))  # represents other entity type in the current tuple that is overlapping
109 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/__pycache__/customer_errors.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/__pycache__/customer_errors.cpython-37.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/__pycache__/groundtruth_to_comprehend_format_converter.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/GroundTruth/__pycache__/groundtruth_to_comprehend_format_converter.cpython-37.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/README.md:
--------------------------------------------------------------------------------
  1 | ## Install dependencies
  2 | 
  3 | 1. Install Python modules
  4 |   - `boto3`
  5 |   - `marshmallow`
  6 | 
  7 | ## Documentation
  8 | 
  9 | ### Prerequisite
 10 | 1. `cd amazon-comprehend-examples/comprehend_groundtruth_integration/src`
 11 | 
 12 | ### Scripts
 13 | 
 14 | 1. Includes a script to validate a Comprehend custom Entity Recognizer training manifest for semi-structured. The following will be detailed:
 15 |     1. The source reference is given and exists in S3.
 16 |     2. The annotation reference is given and exists in S3.
 17 |     3. The annotation file schema is valid and all entities can be referenced in the annotation file's data.
 18 |         a. Annotation files contain a "Blocks" key which contains a list of https://docs.aws.amazon.com/comprehend/latest/APIReference/API_Block.html.
 19 |         b. Annotation files contain a "Blocks" key which contains a list of https://docs.aws.amazon.com/comprehend/latest/APIReference/API_Entity.html.
 20 |     4. All entities and their counts will be logged.
 21 |         a. Valid entity counts will be logged.
 22 |         b. Invalid entity counts will be logged. An invalid entity consists of an entity which cannot be located within the annotation file's Blocks.
 23 |     5. There will be no failure on validation unless `--fail-on-invalid` is also passed in the script call.
 24 |     6. Local directories for documents/source files and annotations can be used with `--document-local-ref` and `--annotations-local-ref`, respectively, to avoid S3 calls.
 25 | 
 26 |     ### Example script calls and outputs
 27 | 
 28 |     S3 manifest example call:
 29 |     ```
 30 |     python -m comprehend_customer_scripts.validation.semi_structured.entity_recognizer.validate_manifest --manifest-s3-ref s3://bucket/path/to/output.manifest
 31 |     ```
 32 | 
 33 |     Local manifest example call:
 34 |     ```
 35 |     python -m comprehend_customer_scripts.validation.semi_structured.entity_recognizer.validate_manifest --manifest-local-ref /local/path/to/output.manifest
 36 |     ```
 37 | 
 38 |     Example output:
 39 |     ```
 40 |     INFO: root: DATASET AGGREGATE STATS
 41 |     INFO: root: annotation_files_with_format_issues: []
 42 |     INFO: root: annotation_files_containing_invalid_entities: {
 43 |         "OFFERING_PRICE": {
 44 |             "sreg-0063475c-b633-43a5-8709-f19e47a6b38c-1-e3374cd4-ann.json": 2,
 45 |             ...
 46 |         },
 47 |         ...
 48 |     }
 49 |     INFO: root: {
 50 |         'OFFERING_PRICE': {
 51 |             'VALID': 721,
 52 |             'INVALID': 0
 53 |         },
 54 |         'OFFERED_SHARES': {
 55 |             'VALID': 378,
 56 |             'INVALID': 0
 57 |         },
 58 |         'COMMISSION_UNDERWRITER': {
 59 |             'VALID': 88,
 60 |             'INVALID': 0
 61 |         },
 62 |         'COMMISSION_OTHER': {
 63 |             'VALID': 74,
 64 |             'INVALID': 0
 65 |         },
 66 |         'PROCEEDS': {
 67 |             'VALID': 64,
 68 |             'INVALID': 0
 69 |         }
 70 |     }
 71 |     INFO: root: ANNOTATION FILE AGGREGATESTATS INFO: root: {
 72 |         'sreg-0063475c-b633-43a5-8709-f19e47a6b38c-1-e3374cd4-ann.json': {
 73 |             'OFFERING_PRICE': {
 74 |                 'VALID': 4,
 75 |                 'INVALID': 0
 76 |             },
 77 |             'OFFERED_SHARES': {
 78 |                 'VALID': 2,
 79 |                 'INVALID': 0
 80 |             }
 81 |         },
 82 |         ...
 83 |     }
 84 |     ```
 85 |     ### Failure examples
 86 | 
 87 |     1. Failure validation example output in the case of an INVALID entity:
 88 |         ```
 89 |         {
 90 |             ...,
 91 |             "sreg-0063475c-b633-43a5-8709-f19e47a6b38c-1-e3374cd4-ann.json": {
 92 |                 "INVALID": 1,
 93 |                 "VALID": 0,
 94 |             },
 95 |             ...
 96 |         }
 97 |         ```
 98 |         The entity's Text "$4.10" does not match the text "$4.20" tracked in the referenced Block(s).
 99 | 
100 |         Entity:
101 |         ```
102 |         {
103 |             "BlockReferences": [
104 |                 {
105 |                     "BlockId": "089cda72-86ff-494a-8a29-ed0ab39e925b",
106 |                     "ChildBlocks": [
107 |                         {
108 |                             "BeginOffset": 0,
109 |                             "EndOffset": 5,
110 |                             "ChildBlockId": "f4dba2cf-ad51-430e-9a38-8f9554d1b094"
111 |                         }
112 |                     ],
113 |                     "BeginOffset": 0,
114 |                     "EndOffset": 5
115 |                 }
116 |             ],
117 |             "Text": "$4.10",
118 |             "Type": "OFFERING_PRICE",
119 |             "Score": 1,
120 |             "Properties": {
121 |                 "OFFERING_PRICE-SUBTYPE": "PER_SHARE"
122 |             }
123 |         }
124 |         ```
125 | 
126 |         Blocks:
127 |         ``````
128 |         {
129 |             "BlockType": "LINE",
130 |             "Id": "089cda72-86ff-494a-8a29-ed0ab39e925b",
131 |             "Text": "$4.20",
132 |             "Geometry": {
133 |                 "BoundingBox": {
134 |                     "Width": 0.03611570969223976,
135 |                     "Top": 0.15035110712051392,
136 |                     "Left": 0.5531672835350037,
137 |                     "Height": 0.010543919168412685
138 |                 },
139 |                 "Polygon": [
140 |                     {
141 |                         "X": 0.5531672835350037,
142 |                         "Y": 0.15035110712051392
143 |                     },
144 |                     {
145 |                         "X": 0.5892829932272434,
146 |                         "Y": 0.15035110712051392
147 |                     },
148 |                     {
149 |                         "X": 0.5892829932272434,
150 |                         "Y": 0.1608950262889266
151 |                     },
152 |                     {
153 |                         "X": 0.5531672835350037,
154 |                         "Y": 0.1608950262889266
155 |                     }
156 |                 ]
157 |             },
158 |             "Relationships": [
159 |                 {
160 |                     "Ids": [
161 |                         "f4dba2cf-ad51-430e-9a38-8f9554d1b094"
162 |                     ],
163 |                     "Type": "CHILD"
164 |                 }
165 |             ],
166 |             "Page": 1
167 |         },
168 |         {
169 |             "BlockType": "WORD",
170 |             "Id": "f4dba2cf-ad51-430e-9a38-8f9554d1b094",
171 |             "Text": "$4.20",
172 |             "Geometry": {
173 |                 "BoundingBox": {
174 |                     "Width": 0.03611570969223976,
175 |                     "Top": 0.15035110712051392,
176 |                     "Left": 0.5531672835350037,
177 |                     "Height": 0.010543919168412685
178 |                 },
179 |                 "Polygon": [
180 |                     {
181 |                         "X": 0.5531672835350037,
182 |                         "Y": 0.15035110712051392
183 |                     },
184 |                     {
185 |                         "X": 0.5892829932272434,
186 |                         "Y": 0.15035110712051392
187 |                     },
188 |                     {
189 |                         "X": 0.5892829932272434,
190 |                         "Y": 0.1608950262889266
191 |                     },
192 |                     {
193 |                         "X": 0.5531672835350037,
194 |                         "Y": 0.1608950262889266
195 |                     }
196 |                 ]
197 |             },
198 |             "Relationships": [],
199 |             "Page": 1
200 |         }
201 |         ```
202 |     2. If `--fail-on-invalid` argument is passed in script call, failure validation example output in the case of the `Version` attribute not existing within an annotation file:
203 |         ```
204 |         ERROR:root:Failed to validate annotation schema s3://bucket/folder/annotations/annotation.json due to {'Version': ['Missing data for required field.']}.
205 |         ERROR:root:Failed validation at line 8: {"source-ref": "s3://bucket/folder/pdfs/source.pdf", "page": "1", "metadata": {"pages": "1", "use-textract-only": true, "labels": ["COMMISSION_OTHER", "COMMISSION_UNDERWRITER", "OFFERING_PRICE", "OFFERED_SHARES", "PROCEEDS"]}, "primary-annotation-ref": null, "secondary-annotation-ref": null, "semi-structured-job": {"annotation-ref": "s3://bucket/folder/annotations/annotation.json"}, "semi-structured-job-metadata": {"type": "groundtruth/custom", "job-name": "semi-structured-job", "human-annotated": "yes", "creation-date": "2021-08-23T22:41:12.546000"}}
206 |         ```
207 | 
208 | 2. Includes a script to validate a Comprehend custom Entity Recognizer annotation for semi-structured data. It checks that the schema is valid and all entities can be referenced in the annotation file's data.
209 | 
210 |     All entities and their counts will be logged in a successful validation.
211 | 
212 |     ### Example script calls and outputs
213 | 
214 |     S3 annotation example call:
215 |     ```
216 |     python -m comprehend_customer_scripts.validation.semi_structured.entity_recognizer.validate_annotation --annotation-s3-ref s3://bucket/path/to/annotation.json
217 |     ```
218 | 
219 |     Local annotation example call:
220 |     ```
221 |     python -m comprehend_customer_scripts.validation.semi_structured.entity_recognizer.validate_annotation --annotation-local-ref /Users/dnlen/Desktop/untitled_folder/blog/annotations/sreg-0e12539d-bd1a-4048-8055-d3f9e478d4a5-1-2e456931-ann.json
222 |     ```
223 | 
224 |     Failure validation example output in the case of the `Version` attribute not existing within the annotation file:
225 |     ```
226 |     ERROR:root:Failed to validate annotation schema sreg-41d55a2f-cf4a-43d1-8905-4f441c9caccb-1-99b6261c-ann_issue.json due to {'Version': ['Missing data for required field.']}.
227 |     ERROR:root:Validation failed.
228 |     ```
229 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logging.getLogger().setLevel(logging.INFO)
4 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/annotation_model.py:
--------------------------------------------------------------------------------
  1 | import enum
  2 | from marshmallow import EXCLUDE, Schema, fields, validates_schema, ValidationError, validate
  3 | 
  4 | 
  5 | class Fields(enum.Enum):
  6 |     VERSION = 'Version'
  7 |     DOCUMENT_TYPE = 'DocumentType'
  8 |     DOCUMENT_METADATA = 'DocumentMetadata'
  9 |     BLOCKS = 'Blocks'
 10 |     ENTITIES = 'Entities'
 11 | 
 12 | 
 13 | # Entities objects
 14 | 
 15 | 
 16 | class EntitiesFields(enum.Enum):
 17 |     BLOCK_REFERENCES = 'BlockReferences'
 18 |     TEXT = 'Text'
 19 |     TYPE = 'Type'
 20 |     SCORE = 'Score'
 21 | 
 22 | 
 23 | class BlockReferenceFields(enum.Enum):
 24 |     BEGIN_OFFSET = 'BeginOffset'
 25 |     END_OFFSET = 'EndOffset'
 26 |     BLOCK_ID = 'BlockId'
 27 |     CHILD_BLOCKS = 'ChildBlocks'
 28 | 
 29 | 
 30 | class ChildBlockFields(enum.Enum):
 31 |     BEGIN_OFFSET = 'BeginOffset'
 32 |     END_OFFSET = 'EndOffset'
 33 |     CHILD_BLOCK_ID = 'ChildBlockId'
 34 | 
 35 | 
 36 | class ChildBlockSchema(Schema.from_dict({
 37 |     ChildBlockFields.BEGIN_OFFSET.value: fields.Int(required=True),
 38 |     ChildBlockFields.END_OFFSET.value: fields.Int(required=True),
 39 |     ChildBlockFields.CHILD_BLOCK_ID.value: fields.Str(required=True)
 40 | })):
 41 |     class Meta:
 42 |         unknown = EXCLUDE
 43 | 
 44 | 
 45 | class BlockReferenceSchema(Schema.from_dict({
 46 |     BlockReferenceFields.CHILD_BLOCKS.value: fields.List(
 47 |         fields.Nested(ChildBlockSchema), required=False, allow_none=True
 48 |     ),
 49 |     BlockReferenceFields.END_OFFSET.value: fields.Int(required=True),
 50 |     BlockReferenceFields.BEGIN_OFFSET.value: fields.Int(required=True),
 51 |     BlockReferenceFields.BLOCK_ID.value: fields.Str(required=True)
 52 | })):
 53 |     class Meta:
 54 |         unknown = EXCLUDE
 55 | 
 56 | 
 57 | class EntityAnnotationSchema(Schema.from_dict({
 58 |     EntitiesFields.BLOCK_REFERENCES.value: fields.List(
 59 |         fields.Nested(BlockReferenceSchema), required=False, allow_none=True
 60 |     ),
 61 |     EntitiesFields.TEXT.value: fields.Str(required=True),
 62 |     EntitiesFields.TYPE.value: fields.Str(required=True),
 63 |     EntitiesFields.SCORE.value: fields.Float(allow_none=True)
 64 | })):
 65 |     class Meta:
 66 |         unknown = EXCLUDE
 67 | 
 68 | 
 69 | # Blocks objects
 70 | 
 71 | 
 72 | class BlockFields(enum.Enum):
 73 |     # Block Object Fields
 74 |     BLOCK_TYPE = 'BlockType'
 75 |     ID = 'Id'
 76 |     TEXT = 'Text'
 77 |     GEOMETRY = 'Geometry'
 78 |     RELATIONSHIPS = 'Relationships'
 79 |     PAGE = 'Page'
 80 | 
 81 | 
 82 | class GeometryFields(enum.Enum):
 83 |     BOUNDING_BOX = 'BoundingBox'
 84 |     POLYGON = 'Polygon'
 85 | 
 86 | 
 87 | class BoundingBoxFields(enum.Enum):
 88 |     WIDTH = 'Width'
 89 |     TOP = 'Top'
 90 |     LEFT = 'Left'
 91 |     HEIGHT = 'Height'
 92 | 
 93 | 
 94 | class PolygonCoordinateFields(enum.Enum):
 95 |     X = 'X'
 96 |     Y = 'Y'
 97 | 
 98 | 
 99 | class RelationshipFields(enum.Enum):
100 |     IDS = 'Ids'
101 |     TYPE = 'Type'
102 | 
103 | 
104 | class PolygonSchema(Schema.from_dict({
105 |     PolygonCoordinateFields.X.value: fields.Float(required=True),
106 |     PolygonCoordinateFields.Y.value: fields.Float(required=True)
107 | })):
108 |     class Meta:
109 |         unknown = EXCLUDE
110 | 
111 | 
112 | class BoundingBoxSchema(Schema.from_dict({
113 |     BoundingBoxFields.WIDTH.value: fields.Float(required=True),
114 |     BoundingBoxFields.TOP.value: fields.Float(required=True),
115 |     BoundingBoxFields.HEIGHT.value: fields.Float(required=True),
116 |     BoundingBoxFields.LEFT.value: fields.Float(required=True)
117 | })):
118 |     class Meta:
119 |         unknown = EXCLUDE
120 | 
121 | 
122 | class RelationshipSchema(Schema.from_dict({
123 |     RelationshipFields.IDS.value: fields.List(fields.Str(), required=False),
124 |     RelationshipFields.TYPE.value: fields.Str(required=False)
125 | })):
126 |     class Meta:
127 |         unknown = EXCLUDE
128 | 
129 | 
130 | class GeometrySchema(Schema.from_dict({
131 |     GeometryFields.BOUNDING_BOX.value: fields.Nested(BoundingBoxSchema, required=True),
132 |     GeometryFields.POLYGON.value: fields.List(
133 |         fields.Nested(PolygonSchema), required=True
134 |     )
135 | })):
136 |     class Meta:
137 |         unknown = EXCLUDE
138 | 
139 | 
140 | class BlockSchema(Schema.from_dict({
141 |     BlockFields.GEOMETRY.value: fields.Nested(GeometrySchema, required=True),
142 |     BlockFields.ID.value: fields.Str(required=True),
143 |     BlockFields.TEXT.value: fields.Str(allow_none=True),
144 |     BlockFields.PAGE.value: fields.Int(allow_none=True),
145 |     BlockFields.RELATIONSHIPS.value: fields.List(
146 |         fields.Nested(RelationshipSchema), default=[], allow_none=True
147 |     ),
148 |     BlockFields.BLOCK_TYPE.value: fields.Str(required=True)
149 | })):
150 |     class Meta:
151 |         unknown = EXCLUDE
152 | 
153 |     @validates_schema
154 |     def validate_text_field(self, data, **kwargs):
155 |         if data[BlockFields.BLOCK_TYPE.value] in ["WORD", "LINE"]:
156 |             if not data.get(BlockFields.TEXT.value):
157 |                 raise ValidationError("Text must be present for WORD and LINE blocks")
158 | 
159 | 
160 | class AnnotationSchema(Schema.from_dict({
161 |     Fields.VERSION.value: fields.Str(required=True),
162 |     Fields.BLOCKS.value: fields.List(fields.Nested(BlockSchema), required=True),
163 |     Fields.ENTITIES.value: fields.List(fields.Nested(EntityAnnotationSchema), required=True),
164 |     Fields.DOCUMENT_METADATA.value: fields.Dict(required=True),
165 |     Fields.DOCUMENT_TYPE.value: fields.Str(required=True)
166 | })):
167 |     class Meta:
168 |         unknown = EXCLUDE
169 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__init__.py


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/annotation_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/annotation_utils.cpython-38.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/log_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/log_utils.cpython-38.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/s3_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-comprehend-examples/3c92187988e9a391f06d16825012c9ac08d20a19/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/__pycache__/s3_utils.cpython-38.pyc


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/annotation_utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | 
 4 | from comprehend_customer_scripts.validation.semi_structured.entity_recognizer.annotation_model import AnnotationSchema
 5 | 
 6 | 
 7 | def is_valid_entities(annotation_json: dict, annotation_name: str, stats: dict = {}, fail_on_invalid: bool = True):
 8 |     """Validate if all entities are correctly referenced by their line and word blocks and there are no duplicates."""
 9 |     blocks_map = {block["Id"]: block for block in annotation_json["Blocks"]}
10 |     block_reference_id_set = set()
11 |     annotation_stats_valid = stats[annotation_name]["VALID"]
12 |     for entity in annotation_json["Entities"]:
13 |         if entity["Type"] not in annotation_stats_valid:
14 |             annotation_stats_valid[entity["Type"]] = {"VALID": 0, "INVALID": 0}
15 | 
16 |         line_block_strings = [] 
17 |         word_block_strings = []
18 |         block_reference_ids = []
19 |         for block_reference in entity["BlockReferences"]:
20 |             block_reference_ids.append(block_reference["BlockId"])
21 |             line_block = blocks_map.get(block_reference["BlockId"])
22 |             if not line_block:
23 |                 log_content = f"Line block not found for line block id: {block_reference['BlockId']}"
24 |                 logging.error(log_content)
25 |                 continue
26 |             line_block_strings.append(line_block["Text"][block_reference["BeginOffset"]:block_reference["EndOffset"]])
27 | 
28 |             for child_block_reference in block_reference["ChildBlocks"]:
29 |                 block_reference_ids.append(child_block_reference["ChildBlockId"])
30 |                 word_block = blocks_map.get(child_block_reference["ChildBlockId"])
31 |                 if not word_block:
32 |                     log_content = f"Word block not found for word block id: {block_reference['BlockId']}"
33 |                     logging.error(log_content)
34 |                     continue
35 |                 word_block_strings.append(word_block["Text"][child_block_reference["BeginOffset"]:child_block_reference["EndOffset"]])
36 |         block_reference_id = "-".join(block_reference_ids)
37 |         is_duplicate_entity = block_reference_id in block_reference_id_set
38 |         if is_duplicate_entity:
39 |             log_content = f"Duplicate entity: {json.dumps(entity)}"
40 |             logging.error(log_content)
41 |             continue
42 |         if " ".join(line_block_strings) != entity["Text"] or " ".join(word_block_strings) != entity["Text"]:
43 |             log_content = f"For annotation: {annotation_name}, failed to validate entity: {json.dumps(entity)}, " \
44 |                             f"using line_block_strings: {line_block_strings} and word_block_strings: {word_block_strings}"
45 |             logging.error(log_content)
46 |             annotation_stats_valid[entity["Type"]]["INVALID"] += 1
47 | 
48 |             if fail_on_invalid:
49 |                 return False
50 |         block_reference_id_set.add(block_reference_id)
51 | 
52 |         annotation_stats_valid[entity["Type"]]["VALID"] += 1
53 | 
54 |     return True
55 | 
56 | 
57 | def is_valid_annotation(annotation_content: str, annotation_name: str, stats: dict = {}, fail_on_invalid: bool = True):
58 |     """Validate an annotation."""
59 |     if annotation_name not in stats:
60 |         stats[annotation_name] = {"VALID": {}, "INVALID_FORMAT": False}
61 |     try:
62 |         annotation_json = json.loads(annotation_content)
63 |         AnnotationSchema().load(annotation_json)
64 |     except Exception as e:
65 |         logging.error(f"Failed to validate annotation schema {annotation_name} due to {e}.")
66 |         stats[annotation_name]["INVALID_FORMAT"] = True
67 | 
68 |         if fail_on_invalid:
69 |             return False
70 |         return True
71 | 
72 |     return is_valid_entities(
73 |         annotation_json=annotation_json, annotation_name=annotation_name, stats=stats, fail_on_invalid=fail_on_invalid
74 |     )
75 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/log_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | 
 4 | def log_stats(stats: dict):
 5 |     dataset_level_stats = {"annotation_files_with_format_issues": [], "entity_stats": {}, "annotation_files_containing_invalid_entities": {}}
 6 |     annotation_file_level_stats = {}
 7 |     for annotation_name in stats.keys():
 8 |         if stats[annotation_name]["INVALID_FORMAT"]:
 9 |             dataset_level_stats["annotation_files_with_format_issues"].append(annotation_name)
10 | 
11 |         valid_annotation_stats = stats[annotation_name]["VALID"]
12 |         for entity_type in valid_annotation_stats.keys():
13 |             if annotation_name not in annotation_file_level_stats:
14 |                 annotation_file_level_stats[annotation_name] = {}
15 |             if entity_type not in annotation_file_level_stats[annotation_name]:
16 |                 annotation_file_level_stats[annotation_name][entity_type] = {"VALID": 0, "INVALID": 0}
17 |             annotation_file_level_stats[annotation_name][entity_type]["VALID"] = valid_annotation_stats[entity_type]["VALID"]
18 |             annotation_file_level_stats[annotation_name][entity_type]["INVALID"] = valid_annotation_stats[entity_type]["INVALID"]
19 | 
20 |             dataset_level_entity_stats = dataset_level_stats["entity_stats"]
21 |             if entity_type not in dataset_level_entity_stats:
22 |                 dataset_level_entity_stats[entity_type] = {"VALID": 0, "INVALID": 0}
23 |             dataset_level_entity_stats[entity_type]["VALID"] += annotation_file_level_stats[annotation_name][entity_type]["VALID"]
24 |             dataset_level_entity_stats[entity_type]["INVALID"] += annotation_file_level_stats[annotation_name][entity_type]["INVALID"]
25 | 
26 |             if annotation_file_level_stats[annotation_name][entity_type]["INVALID"]:
27 |                 annotation_files_containing_invalid_entities = dataset_level_stats["annotation_files_containing_invalid_entities"]
28 |                 if entity_type not in annotation_files_containing_invalid_entities:
29 |                     annotation_files_containing_invalid_entities[entity_type] = {}
30 |                 if annotation_name not in annotation_files_containing_invalid_entities[entity_type]:
31 |                     annotation_files_containing_invalid_entities[entity_type][annotation_name] = 0
32 | 
33 |                 annotation_files_containing_invalid_entities[entity_type][annotation_name] += 1
34 | 
35 |     logging.info(f"DATASET AGGREGATE STATS")
36 |     logging.info(f"annotation_files_with_format_issues: {list(dataset_level_stats['annotation_files_with_format_issues'])}")
37 |     logging.info(f"annotation_files_containing_invalid_entities: {dataset_level_stats['annotation_files_containing_invalid_entities']}")
38 |     logging.info(f"{dataset_level_stats['entity_stats']}\n")
39 | 
40 |     logging.info(f"ANNOTATION FILE AGGREGATE STATS")
41 |     logging.info(f"{annotation_file_level_stats}\n")
42 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/utils/s3_utils.py:
--------------------------------------------------------------------------------
 1 | from urllib.parse import urlparse
 2 | 
 3 | 
 4 | def bucket_key_from_s3_uri(s3_path: str):
 5 |     """Get bucket and key from s3 URL."""
 6 |     o = urlparse(s3_path, allow_fragments=False)
 7 |     bucket = o.netloc
 8 |     key = o.path.lstrip('/')
 9 |     return bucket, key
10 | 
11 | 
12 | def get_object_content(s3_client, ref: str):
13 |     """Get UTF-8 content from an S3 object."""
14 |     bucket, path = bucket_key_from_s3_uri(ref)
15 |     return s3_client.get_object(Bucket=bucket, Key=path).get('Body').read().decode('utf-8')
16 | 
17 | 
18 | def get_bucket_and_objects_in_folder(s3_client, ref: str, is_file=False):
19 |     """Get bucket and objects in folder prefixed with given reference."""
20 |     bucket, key = bucket_key_from_s3_uri(ref)
21 |     if not is_file:
22 |         key = key + ('' if key.endswith('/') else '/')
23 |     paginator = s3_client.get_paginator('list_objects_v2')
24 |     pages = paginator.paginate(Bucket=bucket, Prefix=key)
25 |     all_objs = []
26 |     for page in pages:
27 |         all_objs.extend(page.get('Contents', []))            
28 |     return bucket, sorted([obj for obj in all_objs if not obj["Key"].endswith("/")], key=lambda obj: obj["Key"])
29 | 
30 | 
31 | def s3_file_exists(s3_client, ref: str):
32 |     bucket, objs = get_bucket_and_objects_in_folder(s3_client=s3_client, ref=ref, is_file=True)
33 |     return len(objs) and f"s3://{bucket}/{objs[0]['Key']}" == ref
34 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/validate_annotation.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import boto3
 3 | import logging
 4 | import os
 5 | 
 6 | from comprehend_customer_scripts.validation.semi_structured.entity_recognizer.utils.annotation_utils import is_valid_annotation
 7 | from comprehend_customer_scripts.validation.semi_structured.entity_recognizer.utils.log_utils import log_stats
 8 | from comprehend_customer_scripts.validation.semi_structured.entity_recognizer.utils.s3_utils import get_object_content
 9 | 
10 | 
11 | def main():
12 |     parser = argparse.ArgumentParser()
13 |     parser.add_argument("--annotation-s3-ref", required=False, type=str, help="S3 reference to annotation file. Usage: --annotation-s3-ref s3://bucket/path/to/annotation.json")
14 |     parser.add_argument("--annotation-local-ref", required=False, type=str, help="Local reference to annotation file. Usage: --annotation-local-ref /local/path/to/annotation.json")
15 |     args = parser.parse_args()
16 | 
17 |     annotation_s3_ref = args.annotation_s3_ref
18 |     annotation_local_ref = args.annotation_local_ref
19 | 
20 |     s3_client = boto3.client("s3")
21 |     if annotation_s3_ref is not None:
22 |         annotation_s3_ref = annotation_s3_ref.rstrip("/")
23 |         annotation_content = get_object_content(s3_client=s3_client, ref=annotation_s3_ref)
24 |         annotation_name = os.path.basename(annotation_s3_ref)
25 |     elif annotation_local_ref is not None:
26 |         annotation_local_ref = annotation_local_ref.rstrip(os.sep)
27 |         with open(annotation_local_ref, "r", encoding="utf-8", errors="strict") as manifest_file:
28 |             annotation_content = manifest_file.read()
29 |         annotation_name = os.path.basename(annotation_local_ref)
30 |     else:
31 |         logging.error(f"Must provide either annotation-s3-ref or annotation-local-ref.")
32 |         return
33 | 
34 |     stats = {}
35 |     if not is_valid_annotation(
36 |         annotation_content=annotation_content,
37 |         annotation_name=annotation_name,
38 |         stats=stats
39 |     ):
40 |         logging.error("Validation failed.")
41 |         return
42 | 
43 |     log_stats(stats=stats)
44 | 
45 | 
46 | if __name__ == "__main__":
47 |     main()
48 | 


--------------------------------------------------------------------------------
/comprehend_groundtruth_integration/src/comprehend_customer_scripts/validation/semi_structured/entity_recognizer/validate_manifest.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import boto3
  3 | import json
  4 | import logging
  5 | import os
  6 | import time
  7 | import traceback
  8 | 
  9 | from comprehend_customer_scripts.validation.semi_structured.entity_recognizer.utils.annotation_utils import is_valid_annotation
 10 | from comprehend_customer_scripts.validation.semi_structured.entity_recognizer.utils.log_utils import log_stats
 11 | from comprehend_customer_scripts.validation.semi_structured.entity_recognizer.utils.s3_utils import get_object_content, s3_file_exists
 12 | 
 13 | 
 14 | def is_valid_annotation_ref(s3_client, ref: str, stats: dict = {}, fail_on_invalid: bool = True, is_local: bool = False):
 15 |     """Validate an annotation S3 reference."""
 16 |     if is_local:
 17 |         with open(ref, "r", encoding="utf-8", errors="strict") as annotation_file:
 18 |             annotation_content = annotation_file.read()
 19 |     else:
 20 |         annotation_content = get_object_content(s3_client=s3_client, ref=ref)
 21 |     return is_valid_annotation(
 22 |         annotation_content=annotation_content,
 23 |         annotation_name=os.path.basename(ref),
 24 |         stats=stats,
 25 |         fail_on_invalid=fail_on_invalid,
 26 |     )
 27 | 
 28 | 
 29 | def file_exists(s3_client, ref: str, is_local: bool):
 30 |     if is_local:
 31 |         return os.path.exists(ref)
 32 |     else:
 33 |         return s3_file_exists(s3_client=s3_client, ref=ref)
 34 | 
 35 | 
 36 | def is_valid_manifest_line(
 37 |     s3_client, line: str, stats: dict = {}, fail_on_invalid: bool = True, documents_local_ref=None, annotations_local_ref=None
 38 | ):
 39 |     """Validate a single line in the custom EntityRecognizer manifest file."""
 40 |     is_local = bool(documents_local_ref and annotations_local_ref)
 41 |     try:
 42 |         obj = json.loads(line)
 43 |         source_ref = obj.get("source-ref")
 44 |         for key in obj.keys():
 45 |             if obj.get(key) and type(obj.get(key)) == dict and obj.get(key).get("annotation-ref"):
 46 |                 annotation_ref = obj.get(key).get("annotation-ref")
 47 |                 break
 48 |         if not annotation_ref:
 49 |             logging.info(f"No annotation-ref found in: {line}. Skipping.")
 50 |             if fail_on_invalid:
 51 |                 return False
 52 |             return True
 53 |         if "source-ref" not in obj:
 54 |             logging.info(f"No source-ref found in: {line}. Skipping.")
 55 |             if fail_on_invalid:
 56 |                 return False
 57 |             return True
 58 | 
 59 |         if is_local:
 60 |             source_ref = os.path.join(documents_local_ref, os.path.basename(source_ref))
 61 |             logging.info(f"Local source path: {source_ref}")
 62 |             annotation_ref = os.path.join(annotations_local_ref, os.path.basename(annotation_ref))
 63 |             logging.info(f"Local annotation path: {annotation_ref}")
 64 | 
 65 |         return source_ref and annotation_ref and \
 66 |             file_exists(s3_client=s3_client, ref=source_ref, is_local=is_local) and \
 67 |             file_exists(s3_client=s3_client, ref=annotation_ref, is_local=is_local) and \
 68 |             is_valid_annotation_ref(s3_client=s3_client, ref=annotation_ref, stats=stats, fail_on_invalid=fail_on_invalid, is_local=is_local)
 69 |     except Exception as e:
 70 |         logging.error(f"Failed to validate manifest line due to {e}.")
 71 |         traceback.print_tb(e.__traceback__)
 72 | 
 73 |     if fail_on_invalid:
 74 |         return False
 75 |     else:
 76 |         return True
 77 | 
 78 | 
 79 | def main():
 80 |     start_time = time.time()
 81 |     parser = argparse.ArgumentParser()
 82 |     parser.add_argument("--manifest-s3-ref", required=False, type=str, help="S3 reference to manifest file. Usage: --manifest-s3-ref s3://bucket/path/to/output.manifest")
 83 |     parser.add_argument("--manifest-local-ref", required=False, type=str, help="Local reference to manifest file. Usage: --manifest-local-ref /local/path/to/output.manifest")
 84 |     parser.add_argument("--documents-local-ref", required=False, type=str, help="Local reference to document files. Usage: --documents-local-ref /local/path/to/documents")
 85 |     parser.add_argument("--annotations-local-ref", required=False, type=str, help="Local reference to annotation files. Usage: --annotations-local-ref /local/path/to/annotations")
 86 |     parser.add_argument("--fail-on-invalid", action='store_true', help="Fail validation on invalid manifest line or annotation file. Usage: --fail-on-invalid")
 87 |     
 88 |     args = parser.parse_args()
 89 | 
 90 |     manifest_s3_ref = args.manifest_s3_ref
 91 |     manifest_local_ref = args.manifest_local_ref
 92 |     documents_local_ref = args.documents_local_ref
 93 |     annotations_local_ref = args.annotations_local_ref
 94 |     fail_on_invalid = bool(args.fail_on_invalid)
 95 | 
 96 |     s3_client = boto3.client("s3")
 97 |     if manifest_s3_ref is not None:
 98 |         manifest_s3_ref = manifest_s3_ref.rstrip("/")
 99 |         manifest_content = get_object_content(s3_client=s3_client, ref=manifest_s3_ref)
100 |     elif manifest_local_ref is not None:
101 |         manifest_local_ref = manifest_local_ref.rstrip(os.sep)
102 |         with open(manifest_local_ref, "r", encoding="utf-8", errors="strict") as manifest_file:
103 |             manifest_content = manifest_file.read()
104 |     else:
105 |         logging.error(f"Must provide either manifest-s3-ref or manifest-local-ref.")
106 |         return
107 | 
108 |     """
109 |     {
110 |         "<annotation_file_name>": {
111 |             "VALID": {
112 |                 "<entity_type>": {
113 |                     "VALID": <int>,
114 |                     "INVALID": <int>
115 |                 }
116 |             },
117 |             "INVALID_FORMAT": <boolean>
118 |         }
119 |     }
120 |     """
121 |     stats = {}
122 |     for i, manifest_line in enumerate(manifest_content.splitlines()):
123 |         if not is_valid_manifest_line(
124 |             s3_client=s3_client, line=manifest_line, stats=stats, fail_on_invalid=fail_on_invalid,
125 |             documents_local_ref=documents_local_ref, annotations_local_ref=annotations_local_ref
126 |         ):
127 |             logging.error(f"Failed validation at line {i + 1}: {manifest_line}")
128 |             return
129 |     log_stats(stats=stats)
130 |     logging.info(f"Processing took {time.time() - start_time} seconds")
131 | 
132 | 
133 | if __name__ == "__main__":
134 |     main()
135 | 


--------------------------------------------------------------------------------
/s3_object_lambda_pii_protection_blog/access-control/innocuous.txt:
--------------------------------------------------------------------------------
1 | no PII here!


--------------------------------------------------------------------------------
/s3_object_lambda_pii_protection_blog/access-control/s3olap-access-control-foundation.yaml:
--------------------------------------------------------------------------------
  1 | AWSTemplateFormatVersion: "2010-09-09" 
  2 | Description: This template creates the foundation necessary to deploy the S3 Object Lambda Access Control Use Case. Deploy this after provisioning the AWS Serverless Application Repository Comprehend Lambda Functions
  3 | 
  4 | Parameters:
  5 | # Standard Access Point Parameters
  6 | 
  7 |   pAccessControlAccessPointName: 
  8 |       Type: String  
  9 |       Description: Access Control Standard Access Point Name.
 10 |       Default: accessctl-s3-ap-survey-results-unknown-pii
 11 | 
 12 | # Role Naming Parameter for Access Points
 13 | 
 14 |   pGeneralRoleName: 
 15 |     Type: String  
 16 |     Description: Privileged IAM Role Name for use with S3 Access Control Access Point.
 17 |     Default: GeneralRole
 18 | 
 19 | # User Supplied Postfix for Uniqueness
 20 | 
 21 |   pPostfix: 
 22 |       Type: String  
 23 |       Description: Supply a Postfix to make resources recognizable and unique (ideally 6 characters or fewer)
 24 | 
 25 | # S3 Bucket for hosting the survey-results text file
 26 | Resources:
 27 |   surveyResultsBucket:
 28 |     Type: AWS::S3::Bucket
 29 |     Properties:
 30 |       PublicAccessBlockConfiguration:
 31 |         BlockPublicAcls: TRUE
 32 |         BlockPublicPolicy: TRUE
 33 |         IgnorePublicAcls: TRUE
 34 |         RestrictPublicBuckets: TRUE
 35 | 
 36 |       BucketName: 
 37 |         !Join
 38 |           - ''
 39 |           - - 'survey-results-unknown-pii-'
 40 |             - !Ref pPostfix
 41 | 
 42 | # S3 Bucket Policy for S3 Bucket only allowing put access, and get object access only through AccessPoints              
 43 |   surveyResultsBucketPolicy:
 44 |     DependsOn: 
 45 |       - surveyResultsBucket
 46 |       - rGeneralRole
 47 |       - rAccessControlAccessPoint
 48 |     Type: AWS::S3::BucketPolicy
 49 |     Properties: 
 50 |       Bucket: 
 51 |         Ref: "surveyResultsBucket"
 52 |       PolicyDocument: 
 53 |         Statement: 
 54 |           - Sid: AWSBucketGetPolicy
 55 |             Action: 's3:GetObject'
 56 |             Effect: Allow
 57 |             Resource: !Join
 58 |               - ''
 59 |               - - 'arn:aws:s3:::'
 60 |                 - !Ref 'surveyResultsBucket'
 61 |                 - /*
 62 |             Principal: 
 63 |               AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root'
 64 |             Condition: 
 65 |               StringEquals:
 66 |                 's3:DataAccessPointAccount': !Ref 'AWS::AccountId'  
 67 |           - Sid: AWSBucketPutPolicy
 68 |             Action: 's3:PutObject'
 69 |             Effect: Allow
 70 |             Resource: !Join
 71 |               - ''
 72 |               - - 'arn:aws:s3:::'
 73 |                 - !Ref 'surveyResultsBucket'
 74 |                 - /*
 75 |             Principal: 
 76 |               AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root'
 77 | 
 78 | 
 79 | # Access Control Access Point and Policy
 80 |   rAccessControlAccessPoint: 
 81 |       DependsOn: 
 82 |       - surveyResultsBucket 
 83 |       - rGeneralRole
 84 |       Type: AWS::S3::AccessPoint
 85 |       Properties: 
 86 |         Bucket: !Ref surveyResultsBucket
 87 |         Name: !Ref pAccessControlAccessPointName
 88 |         NetworkOrigin: Internet
 89 | 
 90 | # IAM Role
 91 |   rGeneralRole:
 92 |     Type: AWS::IAM::Role
 93 |     Properties:
 94 |       RoleName: !Ref pGeneralRoleName 
 95 |       AssumeRolePolicyDocument: 
 96 |         Version: 2012-10-17
 97 |         Statement:
 98 |         - Effect: Allow
 99 |           Principal:
100 |             AWS:
101 |                 Fn::Join: 
102 |                 - ""
103 |                 - 
104 |                     - "arn:aws:iam::"
105 |                     - 
106 |                       !Ref AWS::AccountId
107 |                     - ":root"
108 |               
109 |           Action:
110 |             - 'sts:AssumeRole'
111 | 
112 | 
113 | ## IAM Policy Doc
114 |   rGeneralRolePolicy:
115 |       DependsOn: 
116 |       - rGeneralRole
117 |       Type: AWS::IAM::Policy
118 |       Properties: 
119 |         PolicyName: 'general-role-s3olap-policy'
120 |         PolicyDocument: 
121 |           Version: '2012-10-17'
122 |           Statement:
123 |           - Sid: AllowListingObjects
124 |             Effect: Allow
125 |             Action: s3:ListBucket
126 |             Resource: "*"
127 |           - Sid: AllowListingBucketsAndAccessPoints
128 |             Effect: Allow
129 |             Action:
130 |             - s3:GetAccessPointForObjectLambda
131 |             - s3:GetAccessPointConfigurationForObjectLambda
132 |             - s3:ListAccessPointsForObjectLambda
133 |             - s3:ListAllMyBuckets
134 |             - s3:ListAccessPoints
135 |             - s3:GetAccessPoint
136 |             - s3:GetAccountPublicAccessBlock
137 |             - s3:GetBucketPublicAccessBlock
138 |             - s3:GetBucketPolicyStatus
139 |             - s3:GetBucketAcl
140 |             - s3:GetAccessPointPolicyStatus
141 |             Resource: "*"
142 |           - Sid: AllowObjectLambdaAccess
143 |             Action:
144 |             - s3-object-lambda:Get*
145 |             - s3-object-lambda:List*
146 |             Effect: Allow
147 |             Resource:
148 |               !Sub 'arn:aws:s3-object-lambda:${AWS::Region}:${AWS::AccountId}:accesspoint/accessctl-s3olap-survey-results-unknown-pii'
149 |           
150 |           - Sid: AllowStandardAccessPointAccess
151 |             Action:
152 |             - s3:Get*
153 |             - s3:List*
154 |             Effect: Allow
155 |             Resource: "*"
156 |             Condition:
157 |               ForAnyValue:StringEquals:
158 |                 aws:CalledVia:
159 |                 - s3-object-lambda.amazonaws.com
160 |           - Sid: AllowLambdaInvocation
161 |             Action:
162 |             - lambda:InvokeFunction
163 |             Effect: Allow
164 |             Resource: "*"
165 |             Condition:
166 |               ForAnyValue:StringEquals:
167 |                 aws:CalledVia:
168 |                 - s3-object-lambda.amazonaws.com
169 |         Roles: 
170 |         - 
171 |           !Ref pGeneralRoleName
172 | 


--------------------------------------------------------------------------------
/s3_object_lambda_pii_protection_blog/access-control/survey-results.txt:
--------------------------------------------------------------------------------
1 | How was your experience with using the XYZ product from AnyCompany ?
2 | Options :
3 | 1. Very satisfactory - I enjoyed using the product
4 | 2. Did not enjoy using the product. It was very buggy.
5 | 3. Please provide additional feedback on your experience
6 | Hello I am Victor Baze and for some reason my ssn is not being accepted in the banking application. I am typing my last four digits of my SSN correctly and it is 9921 but it's not working as expected. 
7 | 


--------------------------------------------------------------------------------
/s3_object_lambda_pii_protection_blog/redaction/s3olap-redaction-foundation.yaml:
--------------------------------------------------------------------------------
  1 | # (c) 2021 Amazon Web Services, Inc. or its affiliates. All Rights Reserved. 
  2 | # This AWS Content is provided subject to the terms of the AWS Customer 
  3 | # Agreement available at https://aws.amazon.com/agreement or other written 
  4 | # agreement between Customer and Amazon Web Services, Inc. 
  5 | 
  6 | AWSTemplateFormatVersion: "2010-09-09" 
  7 | Description: This template creates the foundation necessary to deploy the S3 Object Lambda Redaction Use Case. Deploy this after provisioning the AWS Serverless Application Repository Comprehend Lambda Function
  8 | 
  9 | Parameters:
 10 |   # Standard Access Point Parameters
 11 | 
 12 |   pAdminAccessPointName: 
 13 |     Type: String  
 14 |     Description: Admin Standard Access Point Name.
 15 |     Default: admin-s3-access-point-call-transcripts-known-pii
 16 | 
 17 |   pBillingAccessPointName: 
 18 |     Type: String  
 19 |     Description: Billing Standard Access Point Name.
 20 |     Default: billing-s3-access-point-call-transcripts-known-pii
 21 | 
 22 |   pCustomerSupportAccessPointName: 
 23 |     Type: String  
 24 |     Description: Customer Support Access Point Name.
 25 |     Default: cs-s3-access-point-call-transcripts-known-pii
 26 | 
 27 | # Role Parameter Access Points
 28 | 
 29 |   pAdminRoleName: 
 30 |     Type: String  
 31 |     Description: Admin S3 Access Point Role Name.
 32 |     Default: RedactionAdminRole
 33 | 
 34 |   pBillingRoleName: 
 35 |     Type: String  
 36 |     Description: S3 Access Point Role Name.
 37 |     Default: RedactionBillingRole
 38 | 
 39 |   pCustomerSupportRoleName: 
 40 |     Type: String  
 41 |     Description: S3 Access Point Role Name.
 42 |     Default: RedactionCustSupportRole
 43 | 
 44 | # User Supplied Postfix for Uniqueness
 45 | 
 46 |   pPostfix: 
 47 |     Type: String  
 48 |     Description: Supply a Postfix to make resources recognizable and unique (ideally 6 characters or fewer)
 49 | 
 50 | # S3 Bucket for hosting the transcript text file
 51 | Resources:
 52 |   transcriptBucket:
 53 |     Type: AWS::S3::Bucket
 54 |     Properties:
 55 |       PublicAccessBlockConfiguration:
 56 |         BlockPublicAcls: TRUE
 57 |         BlockPublicPolicy: TRUE
 58 |         IgnorePublicAcls: TRUE
 59 |         RestrictPublicBuckets: TRUE
 60 | 
 61 |       BucketName: 
 62 |         !Join
 63 |           - ''
 64 |           - - 'call-transcripts-known-pii-'
 65 |             - !Ref pPostfix
 66 | 
 67 | # S3 Bucket Policy for S3 Bucket only allowing put access, and get object access only through AccessPoints              
 68 |   transcriptBucketPolicy:
 69 |     DependsOn: 
 70 |     - transcriptBucket
 71 |     - rAdminRole
 72 |     - rBillingRole
 73 |     - rCustomerSupportRole
 74 |     - rAdminAccessPoint
 75 |     - rBillingAccessPoint
 76 |     - rCustomerSupportAccessPoint
 77 |     Type: AWS::S3::BucketPolicy
 78 |     Properties: 
 79 |       Bucket: 
 80 |         Ref: "transcriptBucket"
 81 |       PolicyDocument: 
 82 |         Statement: 
 83 |           - Sid: AWSBucketGetPolicy
 84 |             Action: 's3:GetObject'
 85 |             Effect: Allow
 86 |             Resource: !Join
 87 |               - ''
 88 |               - - 'arn:aws:s3:::'
 89 |                 - !Ref 'transcriptBucket'
 90 |                 - /*
 91 |             Principal: 
 92 |               AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root'
 93 |             Condition: 
 94 |               StringEquals:
 95 |                 's3:DataAccessPointAccount': !Ref 'AWS::AccountId'  
 96 |           - Sid: AWSBucketPutPolicy
 97 |             Action: 's3:PutObject'
 98 |             Effect: Allow
 99 |             Resource: !Join
100 |               - ''
101 |               - - 'arn:aws:s3:::'
102 |                 - !Ref 'transcriptBucket'
103 |                 - /*
104 |             Principal: 
105 |               AWS: !Sub 'arn:aws:iam::${AWS::AccountId}:root'
106 | 
107 | # Access Points per Role
108 | 
109 |   rAdminAccessPoint: 
110 |     DependsOn: 
111 |     - transcriptBucket 
112 |     - rAdminRole
113 |     Type: AWS::S3::AccessPoint
114 |     Properties: 
115 |       Bucket: !Ref transcriptBucket
116 |       Name: !Ref pAdminAccessPointName
117 |       NetworkOrigin: Internet
118 | 
119 |   rBillingAccessPoint: 
120 |     DependsOn: 
121 |     - transcriptBucket 
122 |     - rBillingRole
123 |     Type: AWS::S3::AccessPoint
124 |     Properties: 
125 |       Bucket: !Ref transcriptBucket
126 |       Name: !Ref pBillingAccessPointName
127 |       NetworkOrigin: Internet
128 | 
129 |   rCustomerSupportAccessPoint: 
130 |     DependsOn: 
131 |     - transcriptBucket 
132 |     - rCustomerSupportRole
133 |     Type: AWS::S3::AccessPoint
134 |     Properties: 
135 |       Bucket: !Ref transcriptBucket
136 |       Name: !Ref pCustomerSupportAccessPointName
137 |       NetworkOrigin: Internet
138 | 
139 | # IAM Roles per Persona
140 | 
141 |   rAdminRole:
142 |     Type: AWS::IAM::Role
143 |     Properties:
144 |       RoleName: !Ref pAdminRoleName 
145 |       AssumeRolePolicyDocument: 
146 |         Version: 2012-10-17
147 |         Statement:
148 |           - Effect: Allow
149 |             Principal:
150 |               AWS:
151 |                   Fn::Join: 
152 |                   - ""
153 |                   - 
154 |                       - "arn:aws:iam::"
155 |                       - 
156 |                         !Ref AWS::AccountId
157 |                       - ":root"
158 |                 
159 |             Action:
160 |               - 'sts:AssumeRole'
161 | 
162 | 
163 |   rBillingRole:
164 |     Type: AWS::IAM::Role
165 |     Properties:
166 |       RoleName: !Ref pBillingRoleName 
167 |       AssumeRolePolicyDocument: 
168 |         Version: 2012-10-17
169 |         Statement:
170 |           - Effect: Allow
171 |             Principal:
172 |               AWS:
173 |                   Fn::Join: 
174 |                   - ""
175 |                   - 
176 |                       - "arn:aws:iam::"
177 |                       - 
178 |                         !Ref AWS::AccountId
179 |                       - ":root"
180 |                 
181 |             Action:
182 |               - 'sts:AssumeRole'
183 | 
184 | 
185 |   rCustomerSupportRole:
186 |     Type: AWS::IAM::Role
187 |     Properties:
188 |       RoleName: !Ref pCustomerSupportRoleName 
189 |       AssumeRolePolicyDocument: 
190 |         Version: 2012-10-17
191 |         Statement:
192 |           - Effect: Allow
193 |             Principal:
194 |               AWS:
195 |                   Fn::Join: 
196 |                   - ""
197 |                   - 
198 |                       - "arn:aws:iam::"
199 |                       - 
200 |                         !Ref AWS::AccountId
201 |                       - ":root"
202 |                 
203 |             Action:
204 |               - 'sts:AssumeRole'
205 | 
206 | ## IAM Policy Documents for the roles
207 | 
208 |   rAdminRolePolicy:
209 |       DependsOn: 
210 |       - rAdminRole
211 |       Type: AWS::IAM::Policy
212 |       Properties: 
213 |         PolicyName: 'admin-role-s3olap-policy'
214 |         PolicyDocument: 
215 |           Version: '2012-10-17'
216 |           #
217 |           Statement:
218 |           - Sid: AllowListingObjects
219 |             Effect: Allow
220 |             Action: s3:ListBucket
221 |             Resource: "*"
222 |           - Sid: AllowListingBucketsAndAccessPoints
223 |             Effect: Allow
224 |             Action:
225 |             - s3:GetAccessPointForObjectLambda
226 |             - s3:GetAccessPointConfigurationForObjectLambda
227 |             - s3:ListAccessPointsForObjectLambda
228 |             - s3:ListAllMyBuckets
229 |             - s3:ListAccessPoints
230 |             - s3:GetAccessPoint
231 |             - s3:GetAccountPublicAccessBlock
232 |             - s3:GetBucketPublicAccessBlock
233 |             - s3:GetBucketPolicyStatus
234 |             - s3:GetBucketAcl
235 |             - s3:GetAccessPointPolicyStatus
236 |             Resource: "*"
237 |           - Sid: AllowObjectLambdaAccess
238 |             Action:
239 |             - s3-object-lambda:Get*
240 |             - s3-object-lambda:List*
241 |             Effect: Allow
242 |             Resource:
243 |               !Sub 'arn:aws:s3-object-lambda:${AWS::Region}:${AWS::AccountId}:accesspoint/admin-s3olap-call-transcripts-known-pii'
244 | 
245 |           - Sid: AllowStandardAccessPointAccess
246 |             Action:
247 |             - s3:Get*
248 |             - s3:List*
249 |             Effect: Allow
250 |             Resource: "*"
251 |             Condition:
252 |               ForAnyValue:StringEquals:
253 |                 aws:CalledVia:
254 |                 - s3-object-lambda.amazonaws.com
255 |           - Sid: AllowLambdaInvocation
256 |             Action:
257 |             - lambda:InvokeFunction
258 |             Effect: Allow
259 |             Resource: "*"
260 |             Condition:
261 |               ForAnyValue:StringEquals:
262 |                 aws:CalledVia:
263 |                 - s3-object-lambda.amazonaws.com
264 |         Roles: 
265 |         - 
266 |           !Ref pAdminRoleName
267 | 
268 |   rBillingRolePolicy:
269 |       DependsOn: 
270 |       - rBillingRole
271 |       Type: AWS::IAM::Policy
272 |       Properties: 
273 |         PolicyName: 'billing-role-s3olap-policy'
274 |         PolicyDocument: 
275 |           Version: '2012-10-17'
276 |           Statement:
277 |           - Sid: AllowListingObjects
278 |             Effect: Allow
279 |             Action: s3:ListBucket
280 |             Resource: "*"
281 |           - Sid: AllowListingBucketsAndAccessPoints
282 |             Effect: Allow
283 |             Action:
284 |             - s3:GetAccessPointForObjectLambda
285 |             - s3:GetAccessPointConfigurationForObjectLambda
286 |             - s3:ListAccessPointsForObjectLambda
287 |             - s3:ListAllMyBuckets
288 |             - s3:ListAccessPoints
289 |             - s3:GetAccessPoint
290 |             - s3:GetAccountPublicAccessBlock
291 |             - s3:GetBucketPublicAccessBlock
292 |             - s3:GetBucketPolicyStatus
293 |             - s3:GetBucketAcl
294 |             - s3:GetAccessPointPolicyStatus
295 |             Resource: "*"
296 |           - Sid: AllowObjectLambdaAccess
297 |             Action:
298 |             - s3-object-lambda:Get*
299 |             - s3-object-lambda:List*
300 |             Effect: Allow
301 |             Resource:
302 |               !Sub 'arn:aws:s3-object-lambda:${AWS::Region}:${AWS::AccountId}:accesspoint/billing-s3olap-call-transcripts-known-pii'
303 | 
304 |           - Sid: AllowStandardAccessPointAccess
305 |             Action:
306 |             - s3:Get*
307 |             - s3:List*
308 |             Effect: Allow
309 |             Resource: "*"
310 |             Condition:
311 |               ForAnyValue:StringEquals:
312 |                 aws:CalledVia:
313 |                 - s3-object-lambda.amazonaws.com
314 |           - Sid: AllowLambdaInvocation
315 |             Action:
316 |             - lambda:InvokeFunction
317 |             Effect: Allow
318 |             Resource: "*"
319 |             Condition:
320 |               ForAnyValue:StringEquals:
321 |                 aws:CalledVia:
322 |                 - s3-object-lambda.amazonaws.com
323 |         Roles: 
324 |         - 
325 |           !Ref pBillingRoleName
326 | 
327 | 
328 |   rCustomerSupportRolePolicy:
329 |       DependsOn: 
330 |       - rCustomerSupportRole
331 |       Type: AWS::IAM::Policy
332 |       Properties: 
333 |         PolicyName: 'customersupport-role-s3olap-policy'
334 |         PolicyDocument: 
335 |           Version: '2012-10-17'
336 |           Statement:
337 |           - Sid: AllowListingObjects
338 |             Effect: Allow
339 |             Action: s3:ListBucket
340 |             Resource: "*"
341 |           - Sid: AllowListingBucketsAndAccessPoints
342 |             Effect: Allow
343 |             Action:
344 |             - s3:GetAccessPointForObjectLambda
345 |             - s3:GetAccessPointConfigurationForObjectLambda
346 |             - s3:ListAccessPointsForObjectLambda
347 |             - s3:ListAllMyBuckets
348 |             - s3:ListAccessPoints
349 |             - s3:GetAccessPoint
350 |             - s3:GetAccountPublicAccessBlock
351 |             - s3:GetBucketPublicAccessBlock
352 |             - s3:GetBucketPolicyStatus
353 |             - s3:GetBucketAcl
354 |             - s3:GetAccessPointPolicyStatus
355 |             Resource: "*"
356 |           - Sid: AllowObjectLambdaAccess
357 |             Action:
358 |             - s3-object-lambda:Get*
359 |             - s3-object-lambda:List*
360 |             Effect: Allow
361 |             Resource:
362 |               !Sub 'arn:aws:s3-object-lambda:${AWS::Region}:${AWS::AccountId}:accesspoint/custsupport-s3olap-call-transcripts-known-pii'
363 | 
364 |           - Sid: AllowStandardAccessPointAccess
365 |             Action:
366 |             - s3:Get*
367 |             - s3:List*
368 |             Effect: Allow
369 |             Resource: "*"
370 |             Condition:
371 |               ForAnyValue:StringEquals:
372 |                 aws:CalledVia:
373 |                 - s3-object-lambda.amazonaws.com
374 |           - Sid: AllowLambdaInvocation
375 |             Action:
376 |             - lambda:InvokeFunction
377 |             Effect: Allow
378 |             Resource: "*"
379 |             Condition:
380 |               ForAnyValue:StringEquals:
381 |                 aws:CalledVia:
382 |                 - s3-object-lambda.amazonaws.com
383 |         Roles: 
384 |         - 
385 |           !Ref pCustomerSupportRoleName
386 | 


--------------------------------------------------------------------------------
/s3_object_lambda_pii_protection_blog/redaction/transcript.txt:
--------------------------------------------------------------------------------
 1 | *Call center Agent :*
 2 | 
 3 | Thank you for calling customer support. I am Zhang Wei. How can I assist you today ? 
 4 | 
 5 | *Customer :*
 6 | 
 7 | Hello Zhang Wei. My name is Victor Baze. My credit card number 1111-0000-1111-0000 has a minimum payment of $24.53 that is due by July 31st. Based on my autopay settings, that amount should have withdrawn on the due date from my bank account XXXXXX1111 with the routing number XXXXX0000. But that hasn’t happened and now I might have to pay a late fee, this is frustrating
 8 | 
 9 | *Customer Agent :*
10 | 
11 | Can you please confirm your mailing address and email ?
12 | 
13 | *Customer :*
14 | 
15 | victor@xyz.com (mailto:xyz@xyz.com) and 100 Main Street, Anytown, WA 98121
16 | 
17 | *Customer Agent :*
18 | 
19 | Could your tell me the last four digits of your social security number ?
20 | 
21 | *Customer :*
22 | 
23 | 8920
24 | 
25 | *Customer Agent :*
26 | 
27 | Thank you. I have verified your information and I find that there was error in processing your payment from your Bank and I have gone ahead and corrected problem. Rest assured, you won’t be charged the late fee. 
28 | 
29 | Also you will receive a confirmation of the payment as text message on your phone number. The number that I have on file is 206-555-0100 (tel:2065550100). Is that the correct number and did you receive the payment confirmation message ?
30 | 
31 | *Customer :*
32 | 
33 | Yes I received the confirmation and its the right phone number
34 | 


--------------------------------------------------------------------------------
/topic_wise_review_analysis/data_processing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "43eeae79",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Getting insight from customer reviews using Amazon Comprehend"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "d958be25",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "## Introduction\n",
 17 |     "<a id=\"Introduction\"></a>\n",
 18 |     "\n"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "id": "c28d0e7d",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "We will use a NLP AI Service from Amazon Web Services - [Amazon Comprehend](https://aws.amazon.com/comprehend/) to solve the business problem. Amazon Comprehend is a natural language processing (NLP) service that uses machine learning (ML) to find insights and relationships in texts. Amazon Comprehend has ability for you to train it to recognize custom entities and perform custom classification. \n",
 27 |     "\n",
 28 |     "*Notes*: `boto3`, the Python SDK for AWS, is used in the different examples of this notebook. It is already installed if you are executing this Notebook from a Sagemaker Notebook environment. "
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "id": "51e308c9",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "## Problem Statetment\n",
 37 |     "<a id=\"ProblemStatetement\"></a>"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "id": "12fbea3d",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "Consumers are increasingly engaging with businesses through digital surfaces and multiple touch-points. Statistics shows that the majority of shoppers use reviews to determine what products to buy and which services to purchase. Reviews have the power to influence consumer decisions and strengthen brand value. Customer review is a great tool to estimate product quality, identify improvement opportunities, launch promotional campaigns and make great product recommendations. We will use Amazon Comprehend to extract meaningful information from product reviews, analyze it to understand how users of different demographies are reacting to products, and also analyze aggregated information on user affinity towards a product."
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "id": "edf9154a",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "## Use AWS NLP Service Amazon Comprehend as a Solution\n",
 54 |     "<a id=\"Rescue\"></a>"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "id": "838f4396",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "We will use Natural Language Processing to solve the problem by following the below mentioned approach - \n",
 63 |     "\n",
 64 |     "#### 1. Data Processing and Transformation Notebook\n",
 65 |     "Exploratory Data Analysis to understand the dataset\n",
 66 |     "#### 2. Comprehend Topic Modelling Job Notebook\n",
 67 |     "Use Topic Modeling to generate topics\n",
 68 |     "#### 3. Topic Mapping and Sentiment Generation Notebook\n",
 69 |     "Use topics to understand segments and sentiment associated with each item\n",
 70 |     "\n",
 71 |     "\n"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "markdown",
 76 |    "id": "d74b567c",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "### Data Loading"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "id": "bdc7d747",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "#### Initialize Input & Output Paths\n",
 88 |     "<a id=\"InitialiazeS3Data\"></a>"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "id": "1e416495",
 95 |    "metadata": {},
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "# Library imports\n",
 99 |     "import pandas as pd\n",
100 |     "import os"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "id": "16071eb6",
106 |    "metadata": {},
107 |    "source": [
108 |     "### Input-paths"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "id": "4f4e40e5",
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "# Bucket containing the data\n",
119 |     "BUCKET = 'clothing-shoe-jewel-tm-blog'\n",
120 |     "\n",
121 |     "# Item ratings and metadata\n",
122 |     "S3_DATA_FILE = 'Clothing_Shoes_and_Jewelry.json.gz' # Zip\n",
123 |     "S3_META_FILE = 'meta_Clothing_Shoes_and_Jewelry.json.gz' # Zip\n",
124 |     "\n",
125 |     "S3_DATA = 's3://' + BUCKET + '/' + S3_DATA_FILE\n",
126 |     "S3_META = 's3://' + BUCKET + '/' + S3_META_FILE"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "id": "9481086d",
132 |    "metadata": {},
133 |    "source": [
134 |     "### Output-paths"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "id": "42f62365",
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# Transformed review, input for Comprehend\n",
145 |     "LOCAL_TRANSFORMED_REVIEW = os.path.join('data', 'TransformedReviews.txt')\n",
146 |     "S3_OUT = 's3://' + BUCKET + '/out/' + 'TransformedReviews.txt'\n",
147 |     "\n",
148 |     "# Final dataframe where topics and sentiments are going to be joined\n",
149 |     "S3_FEEDBACK_TOPICS = 's3://' + BUCKET + '/out/' + 'FinalDataframe.csv'"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "id": "1e93528f",
155 |    "metadata": {},
156 |    "source": [
157 |     "#### Load Review and Meta Data into Dataframe"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "id": "b7ee3328",
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "def convert_json_to_df(path):\n",
168 |     "    \"\"\"Reads a subset of a json file in a given path in chunks, combines, and returns\n",
169 |     "    \"\"\"\n",
170 |     "    # Creating chunks from 500k data points each of chunk size 10k\n",
171 |     "    chunks = pd.read_json(path, orient='records', \n",
172 |     "                                lines=True, \n",
173 |     "                                nrows=500000, \n",
174 |     "                                chunksize=10000, \n",
175 |     "                                compression='gzip')\n",
176 |     "    # Creating a single dataframe from all the chunks\n",
177 |     "    load_df = pd.DataFrame()\n",
178 |     "    for chunk in chunks:\n",
179 |     "        load_df = pd.concat([load_df, chunk], axis=0)\n",
180 |     "    return load_df"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "id": "5f568612",
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "# Review data\n",
191 |     "original_df = convert_json_to_df(S3_DATA)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "id": "1d5a56b7",
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "# Metadata\n",
202 |     "original_meta = convert_json_to_df(S3_META)"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "markdown",
207 |    "id": "81e5247f",
208 |    "metadata": {},
209 |    "source": [
210 |     "### Exploratory Data Analysis"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "id": "464b3ff0",
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "# Shape of reviews and metadata\n",
221 |     "print('Shape of review data: ', original_df.shape)\n",
222 |     "print('Shape of metadata: ', original_meta.shape)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": null,
228 |    "id": "574c66a7",
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "# We are interested in verified reviews only\n",
233 |     "# Also checking the amount of missing values in the review data\n",
234 |     "print('Frequency of verified/non verified review data: ', original_df['verified'].value_counts())\n",
235 |     "print('Frequency of missing values in review data: ', original_df.isna().sum())"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "id": "5f9a989f",
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "# Sneak peek for review data\n",
246 |     "original_df.head()"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "id": "b42afb23",
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "# Sneak peek for metadata\n",
257 |     "original_meta.head()"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "id": "a9159836",
264 |    "metadata": {},
265 |    "outputs": [],
266 |    "source": [
267 |     "# Count of each categories for EDA.\n",
268 |     "print('Frequncy of different item categories in metadata: ', original_meta['category'].value_counts())"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": null,
274 |    "id": "36b57789",
275 |    "metadata": {},
276 |    "outputs": [],
277 |    "source": [
278 |     "# Checking null values for metadata\n",
279 |     "print('Frequency of missing values in metadata: ', original_meta.isna().sum())"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "id": "96de5d88",
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": [
289 |     "# Checking if there are duplicated data. There are indeed duplicated data in the dataframe.\n",
290 |     "print('Duplicate items in metadata: ', original_meta[original_meta['asin'].duplicated()])"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "markdown",
295 |    "id": "1801a8ce",
296 |    "metadata": {},
297 |    "source": [
298 |     "### Preprocessing"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": null,
304 |    "id": "fa25512c",
305 |    "metadata": {},
306 |    "outputs": [],
307 |    "source": [
308 |     "def clean_text(df):\n",
309 |     "    \"\"\"Preprocessing review text.\n",
310 |     "    The text becomes Comprehend compatible as a result.\n",
311 |     "    This is the most important preprocessing step.\n",
312 |     "    \"\"\"\n",
313 |     "    # Encode and decode reviews\n",
314 |     "    df['reviewText'] = df['reviewText'].str.encode(\"utf-8\", \"ignore\")\n",
315 |     "    df['reviewText'] = df['reviewText'].str.decode('ascii')\n",
316 |     "\n",
317 |     "    # Replacing characters with whitespace\n",
318 |     "    df['reviewText'] = df['reviewText'].replace(r'\\r+|\\n+|\\t+|\\u2028',' ', regex=True)\n",
319 |     "\n",
320 |     "    # Replacing punctuations\n",
321 |     "    df['reviewText'] = df['reviewText'].str.replace('[^\\w\\s]','', regex=True)\n",
322 |     "\n",
323 |     "    # Lowercasing reviews\n",
324 |     "    df['reviewText'] = df['reviewText'].str.lower()\n",
325 |     "    return df"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": null,
331 |    "id": "86bce500",
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "def prepare_input_data(df):\n",
336 |     "    \"\"\"Encoding and getting reviews in byte size.\n",
337 |     "    Review gets encoded to utf-8 format and getting the size of the reviews in bytes. \n",
338 |     "    Comprehend requires each review input to be no more than 5000 Bytes\n",
339 |     "    \"\"\"\n",
340 |     "    df['review_size'] = df['reviewText'].apply(lambda x:len(x.encode('utf-8')))\n",
341 |     "    df = df[(df['review_size'] > 0) & (df['review_size'] < 5000)]\n",
342 |     "    df = df.drop(columns=['review_size'])\n",
343 |     "    return df"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "id": "c3de304e",
350 |    "metadata": {},
351 |    "outputs": [],
352 |    "source": [
353 |     "# Only data points with a verified review will be selected and the review must not be missing\n",
354 |     "filter = (original_df['verified'] == True) & (~original_df['reviewText'].isna())\n",
355 |     "filtered_df = original_df[filter]"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "id": "508d96b0",
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "# Only a subset of fields are selected in this experiment. \n",
366 |     "filtered_df = filtered_df[['asin', 'reviewText', 'summary', 'unixReviewTime', 'overall', 'reviewerID']]"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": null,
372 |    "id": "9eaea908",
373 |    "metadata": {},
374 |    "outputs": [],
375 |    "source": [
376 |     "# Just in case, once again, dropping data points with missing review text\n",
377 |     "filtered_df = filtered_df.dropna(subset=['reviewText'])\n",
378 |     "print('Shape of review data: ', filtered_df.shape)"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "id": "43cec4d8",
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "# Dropping duplicate items from metadata\n",
389 |     "original_meta = original_meta.drop_duplicates(subset=['asin'])"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": null,
395 |    "id": "c5629207",
396 |    "metadata": {},
397 |    "outputs": [],
398 |    "source": [
399 |     "# Only a subset of fields are selected in this experiment. \n",
400 |     "original_meta = original_meta[['asin', 'category', 'title', 'description', 'brand', 'main_cat']]"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": null,
406 |    "id": "1bc9df66",
407 |    "metadata": {},
408 |    "outputs": [],
409 |    "source": [
410 |     "# Clean reviews using text cleaning pipeline\n",
411 |     "df = clean_text(filtered_df)"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": null,
417 |    "id": "eb104e8c",
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "# Reset index as we are merging metadata with reviews shortly\n",
422 |     "df = df.reset_index().drop(columns=['index'])"
423 |    ]
424 |   },
425 |   {
426 |    "cell_type": "code",
427 |    "execution_count": null,
428 |    "id": "ec7efa3f",
429 |    "metadata": {},
430 |    "outputs": [],
431 |    "source": [
432 |     "# Merge metadata with review data\n",
433 |     "df = df.merge(original_meta, how='left', on='asin')"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": null,
439 |    "id": "298cf094",
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "# Dataframe where Comprehend outputs (topics and sentiments) will be added\n",
444 |     "df = prepare_input_data(df)"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "markdown",
449 |    "id": "e187c57d",
450 |    "metadata": {},
451 |    "source": [
452 |     "### Save Data in S3"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "code",
457 |    "execution_count": null,
458 |    "id": "150babe7",
459 |    "metadata": {},
460 |    "outputs": [],
461 |    "source": [
462 |     "# Saving dataframe on S3\n",
463 |     "df.to_csv(S3_FEEDBACK_TOPICS, index=False)"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": null,
469 |    "id": "f5b06d5b",
470 |    "metadata": {},
471 |    "outputs": [],
472 |    "source": [
473 |     "# Reviews are transformed per Comprehend guideline- one review per line\n",
474 |     "# The txt file will be used as input for Comprehend\n",
475 |     "# We first save the input file locally\n",
476 |     "with open(LOCAL_TRANSFORMED_REVIEW, \"w\") as outfile:\n",
477 |     "    outfile.write(\"\\n\".join(df['reviewText'].tolist()))"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": null,
483 |    "id": "407b27ca",
484 |    "metadata": {},
485 |    "outputs": [],
486 |    "source": [
487 |     "# Transferring the transformed review (input to Comprehend) to S3\n",
488 |     "!aws s3 mv {LOCAL_TRANSFORMED_REVIEW} {S3_OUT}"
489 |    ]
490 |   }
491 |  ],
492 |  "metadata": {
493 |   "kernelspec": {
494 |    "display_name": "Python 3.9.10 64-bit",
495 |    "language": "python",
496 |    "name": "python3"
497 |   },
498 |   "language_info": {
499 |    "codemirror_mode": {
500 |     "name": "ipython",
501 |     "version": 3
502 |    },
503 |    "file_extension": ".py",
504 |    "mimetype": "text/x-python",
505 |    "name": "python",
506 |    "nbconvert_exporter": "python",
507 |    "pygments_lexer": "ipython3",
508 |    "version": "3.9.10"
509 |   },
510 |   "vscode": {
511 |    "interpreter": {
512 |     "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
513 |    }
514 |   }
515 |  },
516 |  "nbformat": 4,
517 |  "nbformat_minor": 5
518 | }
519 | 


--------------------------------------------------------------------------------
/topic_wise_review_analysis/model_training.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "f1a266ef",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Getting insight from customer reviews using Amazon Comprehend"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "049ac667",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "## Comprehend Topic Modelling Job Notebook\n",
 17 |     "In the previous Notebook we performed data cleaning, exploration, and analysis. Now in this Notebook we will run a Topic Modeling job in Amazon Comprehend to get - \n",
 18 |     "\n",
 19 |     "\n",
 20 |     "1. List of words associated with each topic with high probability\n",
 21 |     "2. Assignment of each document to topics\n",
 22 |     "\n"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "id": "dde059b8",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "### Initialize\n",
 31 |     "<a id=\"InitialiazeS3Data\"></a>"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "id": "56f42e13",
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "# Library imports\n",
 42 |     "import pandas as pd\n",
 43 |     "import boto3\n",
 44 |     "import json, time, tarfile, os"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": null,
 50 |    "id": "4c60fcf2",
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "# Client and session information\n",
 55 |     "session = boto3.Session()\n",
 56 |     "s3 = boto3.resource('s3')\n",
 57 |     "\n",
 58 |     "# Account id. Required downstream.\n",
 59 |     "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
 60 |     "\n",
 61 |     "# Initializing Comprehend client\n",
 62 |     "comprehend = boto3.client(service_name='comprehend', \n",
 63 |     "                          region_name=session.region_name)"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "id": "315cbe1d",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "### Variables"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "id": "27e17c0d",
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "# Number of topics set to 5 after having a human-in-the-loop\n",
 82 |     "# This needs to be fully aligned with topicMaps dictionary in the third script \n",
 83 |     "NUMBER_OF_TOPICS = 5"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "id": "21cd6e94",
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "# Input file format of one review per line\n",
 94 |     "input_doc_format = \"ONE_DOC_PER_LINE\"\n",
 95 |     "\n",
 96 |     "# Role arn (Hard coded, masked)\n",
 97 |     "data_access_role_arn = \"arn:aws:iam::XXXXXXXXXXXX:role/service-role/AmazonSageMaker-ExecutionRole-XXXXXXXXXXXXXXX\""
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "id": "c6da53f5",
103 |    "metadata": {},
104 |    "source": [
105 |     "### Input and Output"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "id": "7b5f5e47",
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "# Constants for S3 bucket and input data file\n",
116 |     "BUCKET = 'clothing-shoe-jewel-tm-blog'\n",
117 |     "input_s3_url = 's3://' + BUCKET + '/out/' + 'TransformedReviews.txt'\n",
118 |     "output_s3_url = 's3://' + BUCKET + '/out/' + 'output/'\n",
119 |     "\n",
120 |     "# Final dataframe where we will join Comprehend outputs later\n",
121 |     "S3_FEEDBACK_TOPICS = 's3://' + BUCKET + '/out/' + 'FinalDataframe.csv'\n",
122 |     "\n",
123 |     "# Local copy of Comprehend output\n",
124 |     "LOCAL_COMPREHEND_OUTPUT_DIR = os.path.join('comprehend_out', '')\n",
125 |     "LOCAL_COMPREHEND_OUTPUT_FILE = os.path.join(LOCAL_COMPREHEND_OUTPUT_DIR, 'output.tar.gz')"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "id": "26db0a13",
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "INPUT_CONFIG={\n",
136 |     "    # The S3 URI where Comprehend output is placed.\n",
137 |     "    'S3Uri':    input_s3_url,\n",
138 |     "    # Document format\n",
139 |     "    'InputFormat': input_doc_format,\n",
140 |     "}\n",
141 |     "OUTPUT_CONFIG={\n",
142 |     "    # The S3 URI where Comprehend output is placed.\n",
143 |     "    'S3Uri':    output_s3_url,\n",
144 |     "}"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "id": "eb08a3f0",
150 |    "metadata": {},
151 |    "source": [
152 |     "### Data Check"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "id": "3738592e",
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "# Reading the Comprehend input file just to double check if number of reviews \n",
163 |     "# and the number of lines in the input file have an exact match.\n",
164 |     "obj = s3.Object(input_s3_url)\n",
165 |     "comprehend_input = obj.get()['Body'].read().decode('utf-8')\n",
166 |     "comprehend_input_lines = len(comprehend_input.split('\\n'))\n",
167 |     "\n",
168 |     "# Reviews where Comprehend outputs will be merged\n",
169 |     "df = pd.read_csv(S3_FEEDBACK_TOPICS)\n",
170 |     "review_df_length = df.shape[0]\n",
171 |     "\n",
172 |     "# The two lengths must be equal\n",
173 |     "assert comprehend_input_lines == review_df_length"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "id": "78f7e2f5",
179 |    "metadata": {},
180 |    "source": [
181 |     "### Topic Modelling Job"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "id": "359af63e",
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "# Start Comprehend topic modelling job.\n",
192 |     "# Specifies the number of topics, input and output config and IAM role ARN \n",
193 |     "# that grants Amazon Comprehend read access to data.\n",
194 |     "start_topics_detection_job_result = comprehend.start_topics_detection_job(\n",
195 |     "                                                    NumberOfTopics=NUMBER_OF_TOPICS,\n",
196 |     "                                                    InputDataConfig=INPUT_CONFIG,\n",
197 |     "                                                    OutputDataConfig=OUTPUT_CONFIG,\n",
198 |     "                                                    DataAccessRoleArn=data_access_role_arn)"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "id": "895fdb76",
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "print('start_topics_detection_job_result: ' + json.dumps(start_topics_detection_job_result))\n",
209 |     "\n",
210 |     "# Job ID is required downstream for extracting the Comprehend results\n",
211 |     "job_id = start_topics_detection_job_result[\"JobId\"]\n",
212 |     "print('job_id: ', job_id)"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "id": "a84b88d0",
218 |    "metadata": {},
219 |    "source": [
220 |     "### Check Topic Detection Status"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "id": "0ddc57c2",
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "# Topic detection takes a while to complete. \n",
231 |     "# We can track the current status by calling Use the DescribeTopicDetectionJob operation.\n",
232 |     "# Keeping track if Comprehend has finished its job\n",
233 |     "description = comprehend.describe_topics_detection_job(JobId=job_id)\n",
234 |     "\n",
235 |     "topic_detection_job_status = description['TopicsDetectionJobProperties'][\"JobStatus\"]\n",
236 |     "print(topic_detection_job_status)\n",
237 |     "while topic_detection_job_status not in [\"COMPLETED\", \"FAILED\"]:\n",
238 |     "    time.sleep(120)\n",
239 |     "    topic_detection_job_status = comprehend.describe_topics_detection_job(JobId=job_id)['TopicsDetectionJobProperties'][\"JobStatus\"]\n",
240 |     "    print(topic_detection_job_status)\n",
241 |     "\n",
242 |     "topic_detection_job_status = comprehend.describe_topics_detection_job(JobId=job_id)['TopicsDetectionJobProperties'][\"JobStatus\"]\n",
243 |     "print(topic_detection_job_status)"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "markdown",
248 |    "id": "6428a6a2",
249 |    "metadata": {},
250 |    "source": [
251 |     "### Save Output"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": null,
257 |    "id": "f22bd92b",
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "# Bucket prefix where model artifacts are stored\n",
262 |     "prefix = f'{account_id}-TOPICS-{job_id}'\n",
263 |     "\n",
264 |     "# Model artifact zipped file\n",
265 |     "artifact_file = 'output.tar.gz'\n",
266 |     "\n",
267 |     "# Location on S3 where model artifacts are stored\n",
268 |     "target = f's3://{BUCKET}/out/output/{prefix}/{artifact_file}'"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": null,
274 |    "id": "66783c4a",
275 |    "metadata": {},
276 |    "outputs": [],
277 |    "source": [
278 |     "# Copy Comprehend output from S3 to local notebook instance\n",
279 |     "! aws s3 cp {target}  ./comprehend-out/"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "id": "b6ae6f4a",
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": [
289 |     "# Unzip the Comprehend output file. \n",
290 |     "# Two files are now saved locally- \n",
291 |     "#       (1) comprehend-out/doc-topics.csv and \n",
292 |     "#       (2) comprehend-out/topic-terms.csv\n",
293 |     "\n",
294 |     "comprehend_tars = tarfile.open(LOCAL_COMPREHEND_OUTPUT_FILE)\n",
295 |     "comprehend_tars.extractall(LOCAL_COMPREHEND_OUTPUT_DIR)\n",
296 |     "comprehend_tars.close()"
297 |    ]
298 |   }
299 |  ],
300 |  "metadata": {
301 |   "kernelspec": {
302 |    "display_name": "Python 3.9.10 64-bit",
303 |    "language": "python",
304 |    "name": "python3"
305 |   },
306 |   "language_info": {
307 |    "codemirror_mode": {
308 |     "name": "ipython",
309 |     "version": 3
310 |    },
311 |    "file_extension": ".py",
312 |    "mimetype": "text/x-python",
313 |    "name": "python",
314 |    "nbconvert_exporter": "python",
315 |    "pygments_lexer": "ipython3",
316 |    "version": "3.9.10"
317 |   },
318 |   "vscode": {
319 |    "interpreter": {
320 |     "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
321 |    }
322 |   }
323 |  },
324 |  "nbformat": 4,
325 |  "nbformat_minor": 5
326 | }
327 | 


--------------------------------------------------------------------------------
/topic_wise_review_analysis/topic_mapping_sentiment_generation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "45f34600",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Getting Insight from Customer Reviews using Amazon Comprehend"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "bf36e72d",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "## Comprehend Topic Mapping & Sentiment Analysis Notebook\n",
 17 |     "In the previous Notebook we performed topic modeling job. Now in this Notebook we will use the output of the topic modeling job and map it with the topic names. We also understand how sentiment of the reviews are w.r.t to the item and the associated topic to have an aggregated view\n",
 18 |     "\n",
 19 |     "\n",
 20 |     "\n"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "id": "44818f98",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "### Import Libararies"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "id": "af38c3ff",
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "# Library imports\n",
 39 |     "import pandas as pd\n",
 40 |     "import boto3\n",
 41 |     "import os\n",
 42 |     "from collections import Counter\n",
 43 |     "\n",
 44 |     "# boto3 session to access service\n",
 45 |     "session = boto3.Session()\n",
 46 |     "comprehend = boto3.client(  'comprehend',\n",
 47 |     "                            region_name=session.region_name)"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "markdown",
 52 |    "id": "183c5f85",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "### Input Paths"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "id": "5006d430",
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "# S3 bucket\n",
 66 |     "BUCKET = 'clothing-shoe-jewel-tm-blog'\n",
 67 |     "\n",
 68 |     "# Local copy of doc-topic file\n",
 69 |     "DOC_TOPIC_FILE = os.path.join('comprehend-out', 'doc-topics.csv')\n",
 70 |     "\n",
 71 |     "# Final dataframe where we will join Comprehend outputs later\n",
 72 |     "S3_FEEDBACK_TOPICS = 's3://' + BUCKET + '/out/' + 'FinalDataframe.csv'"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "id": "df260363",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "### Output paths"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "id": "50adada3",
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "# Final output\n",
 91 |     "S3_FINAL_OUTPUT = 's3://' + BUCKET + '/out/' + 'reviewTopicsSentiments.csv'"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "id": "703ab209",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "### Variables"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "id": "d3a48893",
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "# Top 3 topics per product will be aggregated\n",
110 |     "TOP_TOPICS = 3\n",
111 |     "\n",
112 |     "# Working on English language only. \n",
113 |     "language_code = 'en'"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "id": "7bc371ff",
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": [
123 |     "# Topic names for 5 topics created by human-in-the-loop or SME feed\n",
124 |     "topicMaps = {\n",
125 |     "    0: 'Product comfortability',\n",
126 |     "    1: 'Product Quality and Price',\n",
127 |     "    2: 'Product Size',\n",
128 |     "    3: 'Product Color',\n",
129 |     "    4: 'Product Return',\n",
130 |     "}"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "id": "209b2bb8",
136 |    "metadata": {},
137 |    "source": [
138 |     "### Process doc-topics to list Document-Topic # Mapping"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "id": "a7df9828",
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "# Loading documents and topics assigned to each of them by Comprehend\n",
149 |     "docTopics = pd.read_csv(DOC_TOPIC_FILE)\n",
150 |     "docTopics.head()"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "id": "c61463aa",
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "# Creating a field with doc number. \n",
161 |     "# This doc number is the line number of the input file to Comprehend.\n",
162 |     "docTopics['doc'] = docTopics['docname'].str.split(':').str[1]\n",
163 |     "docTopics['doc'] = docTopics['doc'].astype(int)\n",
164 |     "docTopics.head()"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "id": "a4025bd4",
170 |    "metadata": {},
171 |    "source": [
172 |     "### Generate Topic Names from Topic Terms"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": null,
178 |    "id": "38565578",
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "# Load topics and associated terms\n",
183 |     "topicTerms = pd.read_csv(DOC_TOPIC_FILE)"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "id": "565aa54b",
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "# Consolidate terms for each topic\n",
194 |     "aggregatedTerms = topicTerms.groupby('topic')['term'].aggregate(lambda term: term.unique().tolist()).reset_index()"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "id": "9f88db1f",
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "# Sneak peek\n",
205 |     "aggregatedTerms.head(10)"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "id": "5520df95",
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "# Map topic names to topic number\n",
216 |     "aggregatedTerms['TopicNames'] = aggregatedTerms['topic'].apply(lambda x:topicMaps[x])"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "id": "75ab6641",
223 |    "metadata": {},
224 |    "outputs": [],
225 |    "source": [
226 |     "# Sneak peek\n",
227 |     "aggregatedTerms.head(10)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "id": "db82e49c",
233 |    "metadata": {},
234 |    "source": [
235 |     "### Load main feedback data"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "id": "33c02ade",
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "# Load final dataframe where Comprehend results will be merged to \n",
246 |     "feedbackTopics = pd.read_csv(S3_FEEDBACK_TOPICS)"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "markdown",
251 |    "id": "ee54f6f9",
252 |    "metadata": {},
253 |    "source": [
254 |     "### Adding Back Topic Number, Terms, and Names to Main Data"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "id": "74f22148",
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": [
264 |     "# Joining topic numbers to main data\n",
265 |     "# The index of feedbackTopics is referring to doc field of docTopics dataframe\n",
266 |     "feedbackTopics = pd.merge(feedbackTopics, \n",
267 |     "                          docTopics, \n",
268 |     "                          left_index=True, \n",
269 |     "                          right_on='doc', \n",
270 |     "                          how='left')"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "id": "4a236388",
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "# Reviews will now have topic numbers, associated terms and topics names\n",
281 |     "feedbackTopics = feedbackTopics.merge(aggregatedTerms, \n",
282 |     "                                      on='topic', \n",
283 |     "                                      how='left')\n",
284 |     "feedbackTopics.head()"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "markdown",
289 |    "id": "057e64a6",
290 |    "metadata": {},
291 |    "source": [
292 |     "### Generate Sentiments for Each Feedback"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": null,
298 |    "id": "7a340619",
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "def detect_sentiment(text, language_code):\n",
303 |     "    \"\"\"Detects sentiment for a given text and language\n",
304 |     "    \"\"\"\n",
305 |     "    comprehend_json_out = comprehend.detect_sentiment(Text=text, LanguageCode=language_code)\n",
306 |     "    return comprehend_json_out"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": null,
312 |    "id": "de549cdf",
313 |    "metadata": {},
314 |    "outputs": [],
315 |    "source": [
316 |     "# Comprehend output for sentiment in raw json \n",
317 |     "feedbackTopics['comprehend_sentiment_json_out'] = feedbackTopics['reviewText'].apply(lambda x: detect_sentiment(x, language_code))"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": null,
323 |    "id": "3e23f0ab",
324 |    "metadata": {},
325 |    "outputs": [],
326 |    "source": [
327 |     "# Extracting the exact sentiment from raw Comprehend Json\n",
328 |     "feedbackTopics['sentiment'] = feedbackTopics['comprehend_sentiment_json_out'].apply(lambda x: x['Sentiment'])"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": null,
334 |    "id": "41d1112b",
335 |    "metadata": {},
336 |    "outputs": [],
337 |    "source": [
338 |     "# Sneak peek\n",
339 |     "feedbackTopics.head(2)"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "markdown",
344 |    "id": "e5c474c8",
345 |    "metadata": {},
346 |    "source": [
347 |     "### Combining Topics and Sentiments"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "id": "5950bca0",
354 |    "metadata": {},
355 |    "outputs": [],
356 |    "source": [
357 |     "# Creating a composite key of topic name and sentiment.\n",
358 |     "# This is because we are counting frequency of this combination.\n",
359 |     "feedbackTopics['TopicSentiment'] = feedbackTopics['TopicNames'] + '_' + feedbackTopics['sentiment']"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "id": "81c49f1b",
366 |    "metadata": {},
367 |    "outputs": [],
368 |    "source": [
369 |     "# Sneak peek\n",
370 |     "feedbackTopics.head(2)"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "markdown",
375 |    "id": "ff4a32b9",
376 |    "metadata": {},
377 |    "source": [
378 |     "### Aggregate Topics and Sentiment for Each Item"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "id": "00d5600b",
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "# Create product id group\n",
389 |     "asinWiseDF = feedbackTopics.groupby('asin')"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": null,
395 |    "id": "7c872d17",
396 |    "metadata": {},
397 |    "outputs": [],
398 |    "source": [
399 |     "# Each product now has a list of topics and sentiment combo (topics can appear multiple times)\n",
400 |     "topicDF = asinWiseDF['TopicSentiment'].apply(lambda x:list(x)).reset_index()"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": null,
406 |    "id": "cfec30f4",
407 |    "metadata": {},
408 |    "outputs": [],
409 |    "source": [
410 |     "# Count appreances of topics-sentiment combo for product\n",
411 |     "topicDF['TopTopics'] = topicDF['TopicSentiment'].apply(Counter)"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": null,
417 |    "id": "9b181786",
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "# Sorting topics-sentiment combo based on their appearance\n",
422 |     "topicDF['TopTopics'] = topicDF['TopTopics'].apply(lambda x: sorted(x, key=x.get, reverse=True))"
423 |    ]
424 |   },
425 |   {
426 |    "cell_type": "code",
427 |    "execution_count": null,
428 |    "id": "5d9f587e",
429 |    "metadata": {},
430 |    "outputs": [],
431 |    "source": [
432 |     "# Select Top k topics-sentiment combo for each product/review\n",
433 |     "topicDF['TopTopics'] = topicDF['TopTopics'].apply(lambda x: x[:TOP_TOPICS])"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": null,
439 |    "id": "cca6ae70",
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "# Sneak peek\n",
444 |     "topicDF.head()"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "code",
449 |    "execution_count": null,
450 |    "id": "909c813d",
451 |    "metadata": {},
452 |    "outputs": [],
453 |    "source": [
454 |     "# Adding the topic-sentiment combo back to product metadata\n",
455 |     "finalDF = S3_FEEDBACK_TOPICS.merge(topicDF, on='asin', how='left')"
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": null,
461 |    "id": "4e982061",
462 |    "metadata": {},
463 |    "outputs": [],
464 |    "source": [
465 |     "# Only selecting a subset of fields\n",
466 |     "finalDF = finalDF[['asin', 'TopTopics', 'category', 'title']]"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "code",
471 |    "execution_count": null,
472 |    "id": "b563e5bd",
473 |    "metadata": {},
474 |    "outputs": [],
475 |    "source": [
476 |     "# Frequency of sentiments for all reviews\n",
477 |     "feedbackTopics['sentiment'].value_counts()"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": null,
483 |    "id": "b5519bfc",
484 |    "metadata": {},
485 |    "outputs": [],
486 |    "source": [
487 |     "# Saving the final output locally\n",
488 |     "finalDF.to_csv(S3_FINAL_OUTPUT, index=False)"
489 |    ]
490 |   },
491 |   {
492 |    "cell_type": "markdown",
493 |    "id": "cc76ee18",
494 |    "metadata": {},
495 |    "source": []
496 |   }
497 |  ],
498 |  "metadata": {
499 |   "kernelspec": {
500 |    "display_name": "Python 3.9.10 64-bit",
501 |    "language": "python",
502 |    "name": "python3"
503 |   },
504 |   "language_info": {
505 |    "codemirror_mode": {
506 |     "name": "ipython",
507 |     "version": 3
508 |    },
509 |    "file_extension": ".py",
510 |    "mimetype": "text/x-python",
511 |    "name": "python",
512 |    "nbconvert_exporter": "python",
513 |    "pygments_lexer": "ipython3",
514 |    "version": "3.9.10"
515 |   },
516 |   "vscode": {
517 |    "interpreter": {
518 |     "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
519 |    }
520 |   }
521 |  },
522 |  "nbformat": 4,
523 |  "nbformat_minor": 5
524 | }
525 | 


--------------------------------------------------------------------------------