├── .gitignore ├── README.md ├── command-line-path ├── ComprehendBucketAccessRole-Permissions.json ├── ComprehendBucketAccessRole-TrustPolicy.json ├── README.md ├── prepare_data.py └── requirements.txt ├── comprehend-experiment-notebook.yml ├── comprehend.ipynb ├── images ├── DUS_Arch.png ├── analyzing-text-amazon-es.png ├── custom-classifier-performances.png ├── custom-classifier-training-process.drawio.png ├── entities.png └── sam-app.drawio.png └── sam-app ├── .aws-sam └── build.toml ├── .gitignore ├── README.md ├── custom_classifier ├── __init__.py ├── app.py └── requirements.txt ├── events └── event.json ├── samconfig.toml ├── template.yaml └── tests └── unit ├── __init__.py └── test_handler.py /.gitignore: -------------------------------------------------------------------------------- 1 | yahoo_answers_csv.tgz 2 | yahoo_answers* 3 | comprehend-*.csv 4 | output.tar.gz 5 | output/* 6 | .ipynb_checkpoints 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon Comprehend Experiment 2 | 3 | 4 | ## Purpose 🎯 5 | 6 | This repository provides resources to quickly analyze text and build a custom text classifier able to assign a specific class to a given text. It relates to the NLP (Natural Language Processing) field. 7 | 8 | 9 | ## AWS Services ☁️ 10 | 11 | This repository explores [Amazon Comprehend](https://aws.amazon.com/comprehend/), a natural language processing (NLP) service that uses machine learning (ML) to find insights and relationships in texts. Amazon Comprehend identifies the language of the text; extracts key phrases, places, people, brands, or events; and understands how positive or negative the text is. For more information about everything Amazon Comprehend can do, see [Amazon Comprehend Features](https://aws.amazon.com/comprehend/features/). 12 | 13 | In order to support that experiments other Amazon services are leveraged: 14 | 15 | Amazon S3 to store the dataset for training and asynchronous analysis. 16 | 17 | >[Amazon Simple Storage Service](https://aws.amazon.com/s3/getting-started/) is storage for the Internet. It is designed to make web-scale computing easier for developers. Amazon S3 has a simple web services interface that you can use to store and retrieve any amount of data, at any time, from anywhere on the web. 18 | 19 | Amazon Sagemaker Notebook Instances to get an integrated to AWS environment able to manipulate and explore data: 20 | 21 | > [Amazon Sagemaker Notebook Instances](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html) provides an integrated Jupyter authoring notebook instance for easy access to your data sources for exploration and analysis, so you don't have to manage servers. 22 | 23 | Amazon API Gateway and AWS Lambda to build a serverless API: 24 | 25 | > [Amazon API Gateway](https://aws.amazon.com/api-gateway/getting-started/) is a fully managed service that makes it easy for developers to create, publish, maintain, monitor, and secure APIs at any scale. APIs act as the "front door" for applications to access data, business logic, or functionality from your backend services. 26 | 27 | > [AWS Lambda](https://aws.amazon.com/lambda/getting-started/) lets you run code without provisioning or managing servers. You pay only for the compute time you consume. 
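As a quick illustration of those built-in capabilities (the full walkthrough lives in `comprehend.ipynb`), here is a minimal boto3 sketch; the sample text is arbitrary and the snippet assumes AWS credentials and a default region are already configured:

```python
# Minimal sketch of the pre-trained Amazon Comprehend APIs (not the custom classifier).
import boto3

comprehend = boto3.client('comprehend')
text = "What is the best off-road motorcycle trail throughout CA?"

# Language detection, key phrases, entities, and sentiment on a single piece of text
print(comprehend.detect_dominant_language(Text=text)['Languages'])
print(comprehend.detect_key_phrases(Text=text, LanguageCode='en')['KeyPhrases'])
print(comprehend.detect_entities(Text=text, LanguageCode='en')['Entities'])
print(comprehend.detect_sentiment(Text=text, LanguageCode='en')['Sentiment'])
```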
28 | 29 | 30 | ## Data and labels 🗄 31 | 32 | We are going to use the Yahoo Answers corpus from the “[Text Understanding from Scratch](https://arxiv.org/abs/1502.01710)” paper by Xiang Zhang and Yann LeCun. This dataset is made available on the [AWS Open Data Registry](https://registry.opendata.aws/fast-ai-nlp/). 33 | 34 | The guided steps aim at helping you train your Custom Classifier on your own dataset; follow the detailed recommendations, they are here to help you. 35 | 36 | 37 | ## Repository structure 🏗 38 | 39 | ```bash 40 | . 41 | ├── README.md <-- This file 42 | ├── comprehend.ipynb <-- Jupyter notebook which provides all details to interact with Amazon Comprehend 43 | ├── comprehend-experiment-notebook.yml <-- CloudFormation template to deploy the notebook 44 | ├── sam-app <-- To support the real-time analysis API presented in the Jupyter notebook, this repository also provides a SAM application to quickly deploy that API 45 | │   ├── README.md 46 | │   ├── custom_classifier <-- Lambda function code 47 | │   │   ├── __init__.py 48 | │   │   ├── app.py 49 | │   │   └── requirements.txt 50 | │   ├── events 51 | │   │   └── event.json <-- event to test the API 52 | │   ├── template.yaml <-- AWS SAM template 53 | │   └── tests <-- Unit tests for the Lambda function 54 | ├── images <-- Images used in the Jupyter notebook. Some are drawio based and can be edited with VS Code + the drawio extension 55 | └── command-line-path <-- Directory for creating a Custom Classifier from the AWS CLI 56 |     ├── ComprehendBucketAccessRole-Permissions.json <-- Permissions for Amazon Comprehend to read the bucket 57 |     ├── ComprehendBucketAccessRole-TrustPolicy.json <-- Trust policy for Amazon Comprehend to read the bucket 58 |     ├── README.md <-- Detailed step by step for the command line version 59 |     ├── prepare_data.py <-- Python script for data preparation 60 |     └── requirements.txt <-- Python script dependencies 61 | ``` 62 | 63 | ## Prerequisites ⚙️ 64 | 65 | You have an [AWS account](http://docs.aws.amazon.com/sagemaker/latest/dg/gs-account.html), and the AWS CLI is [installed and configured](https://docs.aws.amazon.com/cli/latest/userguide/install-macos.html). You have the proper [IAM User and Role](http://docs.aws.amazon.com/sagemaker/latest/dg/authentication-and-access-control.html) set up to both create and run a SageMaker notebook instance. 66 | 67 | ## Deploy the SageMaker notebook instance 68 | 69 | ```bash 70 | aws cloudformation deploy --template-file comprehend-experiment-notebook.yml --stack-name comprehend-experiment --capabilities CAPABILITY_IAM --region us-east-1 71 | ``` 72 | 73 | ## Note ℹ 74 | 75 | Although the `comprehend.ipynb` notebook has been built to run in an [Amazon SageMaker Notebook Instance](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html), you should be able to run it outside of a notebook instance with minimal modifications (updating the IAM role definition and installing the necessary libraries).
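For instance, a minimal setup outside SageMaker could look like the sketch below; the profile name, region, and package list are assumptions to adapt to your environment:

```python
# Hypothetical setup for running the notebook outside a SageMaker notebook instance.
# Assumed prerequisite: pip install boto3 pandas jupyter
import boto3

# Point the session at whichever profile and region hold your S3 and Comprehend permissions.
session = boto3.Session(profile_name='default', region_name='us-east-1')

# Sanity checks: confirm the identity in use and that Comprehend is reachable.
print(session.client('sts').get_caller_identity()['Arn'])
print(session.client('comprehend').list_document_classifiers()['DocumentClassifierPropertiesList'])
```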
76 | -------------------------------------------------------------------------------- /command-line-path/ComprehendBucketAccessRole-Permissions.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": [ 6 | "s3:GetObject", 7 | "s3:PutObject" 8 | ], 9 | "Resource": [ 10 | "arn:aws:s3:::nivonh-poc/*" 11 | ], 12 | "Effect": "Allow" 13 | }, 14 | { 15 | "Action": [ 16 | "s3:ListBucket" 17 | ], 18 | "Resource": [ 19 | "arn:aws:s3:::nivonh-poc" 20 | ], 21 | "Effect": "Allow" 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /command-line-path/ComprehendBucketAccessRole-TrustPolicy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Principal": { 7 | "Service": "comprehend.amazonaws.com" 8 | }, 9 | "Action": "sts:AssumeRole" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /command-line-path/README.md: -------------------------------------------------------------------------------- 1 | # Amazon Comprehend Custom Classification 2 | 3 | This is the command line version for creating a Amazon Comprehend Custom Classifier model. 4 | 5 | * [Overview](#overview) 6 | * [Command line prerequisites](#command-line-prerequisites) 7 | * [Detailed Steps](#detailed-steps) 8 | * [Creating the bucket](#creating-the-bucket) 9 | * [Configure an IAM role](#configure-an-iam-role) 10 | * [Preparing the data](#preparing-the-data) 11 | * [Uploading the data](#uploading-the-data) 12 | * [Training the model](#training-the-model) 13 | * [Inference](#inference) 14 | 15 | ## Overview 16 | 17 | The custom classifier workload is built in two steps: 18 | 19 | 1. Training the custom model – no particular machine learning or deep learning knowledge is necessary 20 | 2. Classifying new data 21 | 22 | Steps to follow are relatively simple: 23 | 24 | 1. Create a bucket that will host training data 25 | 2. Create a bucket that will host training data artifacts and production results. That can be the same 26 | 3. Configure an IAM role allowing Comprehend to [access newly created buckets](https://docs.aws.amazon.com/comprehend/latest/dg/access-control-managing-permissions.html#auth-role-permissions) 27 | 4. Prepare data for training 28 | 5. Upload training data in the S3 bucket 29 | 6. Launch a “Train Classifier” job from the console: “Amazon Comprehend” > “Custom Classification” > “Train Classifier” 30 | 7. Prepare data for classification (one text per line, no header, same format as training data). Some more details [here](https://docs.aws.amazon.com/comprehend/latest/dg/how-class-run.html) 31 | 8. Launch a custom classification job 32 | 9. Gather results: a file name output.tar.gz is generated in the destination bucket. File format is [JSON Line]( https://docs.aws.amazon.com/comprehend/latest/dg/how-class-run.html). 33 | 34 | ## Command line prerequisites 35 | 36 | You have anaconda [available](https://docs.conda.io/projects/conda/en/latest/user-guide/install/macos.html). 
37 | 38 | Create the conda environment for data preparation: 39 | 40 | ```shell 41 | $> conda create --name comprehendCustomClassification python=3.7 pandas tqdm ipython 42 | ``` 43 | 44 | Activate conda environment: 45 | 46 | ```shell 47 | $> conda activate comprehendCustomClassification 48 | ``` 49 | 50 | ## Detailed Steps 51 | 52 | Now, it is time to get our hands dirty. 53 | 54 | ### Creating the bucket 55 | 56 | The following command creates the bucket `hervenivon-poc`. As bucket names are unique, please change it to your desire. 57 | 58 | ```shell 59 | $> aws s3api create-bucket --acl private --bucket `hervenivon-poc` --region us-east-1 60 | ``` 61 | 62 | You should see something like: 63 | 64 | ```json 65 | { 66 | "Location": "/hervenivon-poc" 67 | } 68 | ``` 69 | 70 | Note 💡: if you want to create your bucket in another location you must add a Location Constraint. Example: 71 | 72 | ```shell 73 | $> aws s3api create-bucket --bucket my-bucket --region eu-west-1 --create-bucket-configuration LocationConstraint=eu-west-1 74 | ``` 75 | 76 | ### Configure an IAM role 77 | 78 | In order to authorize Amazon Comprehend to perform bucket reads and writes during the training or during the inference, we must grant Amazon Comprehend access to the Amazon S3 bucket that we created. 79 | 80 | We are going to create a data access role in our account to trust the Amazon Comprehend service principal. 81 | 82 | Create a file `ComprehendBucketAccessRole-TrustPolicy.json` that contains the role’s trust policy: 83 | 84 | ```json 85 | { 86 | "Version": "2012-10-17", 87 | "Statement": [ 88 | { 89 | "Effect": "Allow", 90 | "Principal": { 91 | "Service": "comprehend.amazonaws.com" 92 | }, 93 | "Action": "sts:AssumeRole" 94 | } 95 | ] 96 | } 97 | ``` 98 | 99 | Create a file `ComprehendBucketAccessRole-Permissions.json` that contains the following access policy. Please change bucket name accordingly to the bucket you created. 100 | 101 | ```json 102 | { 103 | "Version": "2012-10-17", 104 | "Statement": [ 105 | { 106 | "Action": [ 107 | "s3:GetObject", 108 | "s3:PutObject" 109 | ], 110 | "Resource": [ 111 | "arn:aws:s3:::hervenivon-poc/*" 112 | ], 113 | "Effect": "Allow" 114 | }, 115 | { 116 | "Action": [ 117 | "s3:ListBucket" 118 | ], 119 | "Resource": [ 120 | "arn:aws:s3:::hervenivon-poc" 121 | ], 122 | "Effect": "Allow" 123 | } 124 | ] 125 | } 126 | ``` 127 | 128 | The following command create the role: 129 | 130 | ```shell 131 | $> aws iam create-role --role-name ComprehendBucketAccessRole --assume-role-policy-document file://ComprehendBucketAccessRole-TrustPolicy.json 132 | ``` 133 | 134 | You should see something like: 135 | 136 | ```shell 137 | { 138 | "Role": { 139 | "Path": "/", 140 | "RoleName": "ComprehendBucketAccessRole", 141 | "RoleId": "AROAUS7UWFDI7L3MYSW7B", 142 | "Arn": "arn:aws:iam::312306070809:role/ComprehendBucketAccessRole", 143 | "CreateDate": "2019-06-27T09:02:50Z", 144 | "AssumeRolePolicyDocument": { 145 | "Version": "2012-10-17", 146 | "Statement": [ 147 | { 148 | "Effect": "Allow", 149 | "Principal": { 150 | "Service": "comprehend.amazonaws.com" 151 | }, 152 | "Action": "sts:AssumeRole" 153 | } 154 | ] 155 | } 156 | } 157 | } 158 | ``` 159 | 160 | Now we must attach permissions to the role: 161 | 162 | ```shell 163 | $> aws iam put-role-policy --role-name ComprehendBucketAccessRole --policy-name BucketAccessPolicy --policy-document file://ComprehendBucketAccessRole-Permissions.json 164 | ``` 165 | 166 | You should see no output. 
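If you prefer boto3 over the raw CLI calls, the same role can be created programmatically. Here is a sketch using the two JSON files above, with the same role and policy names as in the CLI commands:

```python
# Sketch: create the Comprehend data access role with boto3 instead of the AWS CLI.
import boto3

iam = boto3.client('iam')

with open('ComprehendBucketAccessRole-TrustPolicy.json') as f:
    trust_policy = f.read()
with open('ComprehendBucketAccessRole-Permissions.json') as f:
    permissions = f.read()

role = iam.create_role(
    RoleName='ComprehendBucketAccessRole',
    AssumeRolePolicyDocument=trust_policy,
)
print(role['Role']['Arn'])  # keep this ARN, the training and inference jobs need it

iam.put_role_policy(
    RoleName='ComprehendBucketAccessRole',
    PolicyName='BucketAccessPolicy',
    PolicyDocument=permissions,
)
```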
167 | 168 | ### Preparing the data 169 | 170 | Once you have downloaded the data (the dataset is referenced in the repository's main README), you get a compressed archive `yahoo_answers_csv.tar.gz` containing four files: 171 | 172 | - `classes.txt` 173 | - `readme.txt` 174 | - `test.csv` 175 | - `train.csv` 176 | 177 | As per the readme: 178 | 179 | _The files train.csv and test.csv contain all the training samples as comma-separated values. There are 4 columns in them, corresponding to class index (1 to 10), question title, question content and best answer. The text fields are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes (""). New lines are escaped by a backslash followed with an "n" character, that is "\n"._ 180 | 181 | Overview of the file content: 182 | 183 | ```csv 184 | "5","why doesn't an optical mouse work on a glass table?","or even on some surfaces?","Optical mice use an LED 185 | "6","What is the best off-road motorcycle trail ?","long-distance trail throughout CA","i hear that the mojave 186 | "3","What is Trans Fat? How to reduce that?","I heard that tras fat is bad for the body. Why is that? Where ca 187 | "7","How many planes Fedex has?","I heard that it is the largest airline in the world","according to the www.fe 188 | "7","In the san francisco bay area, does it make sense to rent or buy ?","the prices of rent and the price of b 189 | ``` 190 | 191 | The file `classes.txt` contains the label for each line: 192 | 193 | 1. Society & Culture 194 | 2. Science & Mathematics 195 | 3. Health 196 | 4. Education & Reference 197 | 5. Computers & Internet 198 | 6. Sports 199 | 7. Business & Finance 200 | 8. Entertainment & Music 201 | 9. Family & Relationships 202 | 10. Politics & Government 203 | 204 | `train.csv` contains 1,400,000 lines and `test.csv` 60,000 lines. Amazon Comprehend uses between 10 and 20 percent of the documents submitted for training to test the custom classifier. 205 | 206 | The following command shows that the data are evenly distributed: 207 | 208 | ```shell 209 | $> awk -F '","' '{print $1}' yahoo_answers_csv/train.csv | sort | uniq -c 210 | ``` 211 | 212 | 140,000 lines per label. Amazon Comprehend “recommend[s] that you train the model with up to 1,000 training documents for each label” and no more than 1,000,000 documents in total. 213 | 214 | Amazon Comprehend recommends the following: 215 | 216 | > For each class, provide a minimum of 10 documents for training. For example, if you have 10 possible classes, you need a total of at least 100 classified documents to train the model. For more accurate training, we recommend at least 50 documents or more for each class. While a minimum of 10 training documents for each class is required, you get better accuracy with more documents. The total size of the training documents must be less than 5 GB. 217 | 218 | With 20 percent of 1,000,000 documents used for testing, that still leaves plenty of data to train our custom classifier. 219 | 220 | So, we are going to use a shortened version of `train.csv` to train our custom Comprehend model and we are going to use `test.csv` to perform our validation and see how well our custom model performs. 221 | 222 | For training, the file format must conform with the [following](https://docs.aws.amazon.com/comprehend/latest/dg/how-document-classification-training.html): 223 | 224 | - File must contain one label and one text per line – 2 columns 225 | - No header 226 | - UTF-8 format, lines terminated by “\n”.
227 | 228 | Labels “must be uppercase, can be multitoken, have whitespace, consist of multiple words connected by underscores or hyphens or may even contain a comma in it, as long as it is correctly escaped.” 229 | 230 | Here are the proposed labels: 231 | 232 | | Index | Original | For training | 233 | | --- | --- | --- | 234 | | 1 | Society & Culture | SOCIETY_AND_CULTURE | 235 | | 2 | Science & Mathematics | SCIENCE_AND_MATHEMATICS | 236 | | 3 | Health | HEALTH | 237 | | 4 | Education & Reference | EDUCATION_AND_REFERENCE | 238 | | 5 | Computers & Internet | COMPUTERS_AND_INTERNET | 239 | | 6 | Sports | SPORTS | 240 | | 7 | Business & Finance | BUSINESS_AND_FINANCE | 241 | | 8 | Entertainment & Music | ENTERTAINMENT_AND_MUSIC | 242 | | 9 | Family & Relationships | FAMILY_AND_RELATIONSHIPS | 243 | | 10 | Politics & Government | POLITICS_AND_GOVERNMENT | 244 | 245 | For the inference part (when you want your custom model to determine which label corresponds to a given text), the file format must conform with the following: 246 | 247 | - File must contain one text per line 248 | - No header 249 | - UTF-8 format, lines terminated by “\n”. 250 | 251 | Launch the data preparation with the following terminal command. `prepare_data.py` assumes that you are at the root folder of this repository and that you have extracted the Yahoo corpus into the `yahoo_answers_csv` directory. 252 | 253 | ```shell 254 | $> ./prepare_data.py 255 | ``` 256 | 257 | This script is tied to the Yahoo corpus and leverages the [pandas](https://pandas.pydata.org/) library to format the training and testing datasets to match the Amazon Comprehend expectations described above. 258 | 259 | Note 💡: for the moment, we encode comma characters in sentences with the equivalent HTML entity (`&#44;`). A better escaping approach may exist, but I did not find it in the documentation: wrapping the text in double quotes doesn't work, and '\,' doesn't work either. I opened an [issue](https://github.com/awsdocs/amazon-comprehend-developer-guide/issues/18) on the Comprehend documentation to get the recommended approach.
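Before uploading, it can be worth sanity-checking the generated files. The following pandas sketch is not part of `prepare_data.py`; it simply verifies that `comprehend-train.csv` has the expected two columns and an even label distribution:

```python
# Sketch: quick sanity check of the prepared training file before uploading it to S3.
import pandas as pd

df = pd.read_csv('comprehend-train.csv', header=None, names=['label', 'document'],
                 escapechar='\\', doublequote=False)

print(df.shape)                    # expect (100000, 2) with MAXITEM=10000 and 10 labels
print(df['label'].value_counts())  # expect the same count for each of the 10 labels
assert df['label'].str.isupper().all()   # labels must be uppercase
assert df['document'].notna().all()      # every line must carry a text
```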
260 | 261 | ### Uploading the data 262 | 263 | ```shell 264 | $> aws s3 cp comprehend-test.csv s3://hervenivon-poc/ComprehendCustomClassification/ 265 | $> aws s3 cp comprehend-train.csv s3://hervenivon-poc/ComprehendCustomClassification/ 266 | ``` 267 | 268 | ### Training the model 269 | 270 | Launch the classifier training: 271 | 272 | ```shell 273 | aws comprehend create-document-classifier --document-classifier-name "yahoo-answers" --data-access-role-arn arn:aws:iam::312306070809:role/ComprehendBucketAccessRole --input-data-config S3Uri=s3://hervenivon-poc/ComprehendCustomClassification/comprehend-train.csv --output-data-config S3Uri=s3://hervenivon-poc/ComprehendCustomClassification/TrainingOutput/ --language-code en 274 | ``` 275 | 276 | You should see something like: 277 | 278 | ```shell 279 | { 280 | "DocumentClassifierArn": "arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers" 281 | } 282 | ``` 283 | 284 | You can then track the progress with: 285 | 286 | ```shell 287 | aws comprehend describe-document-classifier --document-classifier-arn arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers 288 | ``` 289 | 290 | You should see something like: 291 | 292 | ```shell 293 | { 294 | "DocumentClassifierProperties": { 295 | "DocumentClassifierArn": "arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers", 296 | "LanguageCode": "en", 297 | "Status": "TRAINING", 298 | "SubmitTime": 1561649608.232, 299 | "InputDataConfig": { 300 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/comprehend-train.csv" 301 | }, 302 | "OutputDataConfig": { 303 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/TrainingOutput/312306070809-CLR-92408cee392a4f3a83273ddd1d22bcef/output/output.tar.gz" 304 | }, 305 | "DataAccessRoleArn": "arn:aws:iam::312306070809:role/ComprehendBucketAccessRole" 306 | } 307 | } 308 | ``` 309 | 310 | Or when the training is finished: 311 | 312 | ```shell 313 | { 314 | "DocumentClassifierProperties": { 315 | "DocumentClassifierArn": "arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers", 316 | "LanguageCode": "en", 317 | "Status": "TRAINED", 318 | "SubmitTime": 1561677325.862, 319 | "EndTime": 1561679052.677, 320 | "TrainingStartTime": 1561677482.464, 321 | "TrainingEndTime": 1561679043.669, 322 | "InputDataConfig": { 323 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/comprehend-train.csv" 324 | }, 325 | "OutputDataConfig": { 326 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/TrainingOutput/312306070809-CLR-e53d82b1190e7d69065355d2636d80c9/output/output.tar.gz" 327 | }, 328 | "ClassifierMetadata": { 329 | "NumberOfLabels": 10, 330 | "NumberOfTrainedDocuments": 989873, 331 | "NumberOfTestDocuments": 10000, 332 | "EvaluationMetrics": { 333 | "Accuracy": 0.7235, 334 | "Precision": 0.722, 335 | "Recall": 0.7235, 336 | "F1Score": 0.7219 337 | } 338 | }, 339 | "DataAccessRoleArn": "arn:aws:iam::312306070809:role/ComprehendBucketAccessRole" 340 | } 341 | } 342 | ``` 343 | 344 | In our case the training took 28 minutes. 345 | 346 | We see that our model has a precision of 0.72—in other words, when it predicts a label, it is correct 72% of the time. 347 | 348 | We also see that our model has a recall of 0.72—in other words, it correctly identifies 72% of labels. 
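Rather than re-running `describe-document-classifier` by hand, you can also poll the training status with boto3. A small sketch, using the classifier ARN returned by `create-document-classifier`:

```python
# Sketch: poll the custom classifier until training finishes, then print its metrics.
import time
import boto3

comprehend = boto3.client('comprehend')
classifier_arn = 'arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers'

while True:
    props = comprehend.describe_document_classifier(
        DocumentClassifierArn=classifier_arn
    )['DocumentClassifierProperties']
    print(props['Status'])
    if props['Status'] in ('TRAINED', 'IN_ERROR'):
        break
    time.sleep(60)  # training took about 28 minutes in our case, no need to poll aggressively

if props['Status'] == 'TRAINED':
    print(props['ClassifierMetadata']['EvaluationMetrics'])
```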
349 | 350 | ### Inference 351 | 352 | In order to launch a new job, execute the following 353 | 354 | ```shell 355 | $> aws comprehend start-document-classification-job --document-classifier-arn arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers --input-data-config S3Uri=s3://hervenivon-poc/ComprehendCustomClassification/comprehend-test.csv,InputFormat=ONE_DOC_PER_LINE --output-data-config S3Uri=s3://hervenivon-poc/ComprehendCustomClassification/InferenceOutput/ --data-access-role-arn arn:aws:iam::312306070809:role/ComprehendBucketAccessRole 356 | ``` 357 | 358 | You should see something like this: 359 | 360 | ```shell 361 | { 362 | "DocumentClassificationJobProperties": { 363 | "JobId": "42129ccb06ee9e7ffd74c343497c8aab", 364 | "JobStatus": "IN_PROGRESS", 365 | "SubmitTime": 1561679679.036, 366 | "DocumentClassifierArn": "arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers", 367 | "InputDataConfig": { 368 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/comprehend-test.csv", 369 | "InputFormat": "ONE_DOC_PER_LINE" 370 | }, 371 | "OutputDataConfig": { 372 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/InferenceOutput/312306070809-CLN-42129ccb06ee9e7ffd74c343497c8aab/output/output.tar.gz" 373 | }, 374 | "DataAccessRoleArn": "arn:aws:iam::312306070809:role/ComprehendBucketAccessRole" 375 | } 376 | } 377 | ``` 378 | 379 | If you want to check the newly launched job: 380 | 381 | ```shell 382 | $> aws comprehend describe-document-classification-job --job-id 42129ccb06ee9e7ffd74c343497c8aab 383 | ``` 384 | 385 | You should see something like: 386 | 387 | ```shell 388 | { 389 | "DocumentClassificationJobProperties": { 390 | "JobId": "42129ccb06ee9e7ffd74c343497c8aab", 391 | "JobStatus": "IN_PROGRESS", 392 | "SubmitTime": 1561679679.036, 393 | "DocumentClassifierArn": "arn:aws:comprehend:us-east-1:312306070809:document-classifier/yahoo-answers", 394 | "InputDataConfig": { 395 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/comprehend-test.csv", 396 | "InputFormat": "ONE_DOC_PER_LINE" 397 | }, 398 | "OutputDataConfig": { 399 | "S3Uri": "s3://hervenivon-poc/ComprehendCustomClassification/InferenceOutput/312306070809-CLN-42129ccb06ee9e7ffd74c343497c8aab/output/output.tar.gz" 400 | }, 401 | "DataAccessRoleArn": "arn:aws:iam::312306070809:role/ComprehendBucketAccessRole" 402 | } 403 | } 404 | ``` 405 | 406 | When it is completed, `JobStatus` move to `COMPLETED`. 407 | 408 | Then you can download the results using `OutputDataConfig.S3Uri` path: 409 | 410 | ```shell 411 | aws s3 cp s3://hervenivon-poc/ComprehendCustomClassification/InferenceOutput/312306070809-CLN-42129ccb06ee9e7ffd74c343497c8aab/output/output.tar.gz 412 | ``` 413 | 414 | Then you can pick and choose lines in the `predictions.jsonl` file that you’ll find in the `output.tar.gz` tarball to check if you agree with your newly configured custom Amazon comprehend model. 415 | 416 | One line from the predictions example: 417 | 418 | ```json 419 | {"File": "comprehend-test.csv", "Line": "9", "Classes": [{"Name": "ENTERTAINMENT_AND_MUSIC", "Score": 0.9685}, {"Name": "EDUCATION_AND_REFERENCE", "Score": 0.0159}, {"Name": "BUSINESS_AND_FINANCE", "Score": 0.0102}]} 420 | ``` 421 | 422 | Which means that our custom model predicted with a 96.8% confidence score that the following text was related to the "Entertainment and music" category. 423 | 424 | ```txt 425 | "What was the first Disney animated character to appear in color? 
\n Donald Duck was the first major Disney character to appear in color, in his debut cartoon, \"The Wise Little Hen\" in 1934.\n\nFYI: Mickey Mouse made his color debut in the 1935 'toon, \"The Band Concert,\" and the first color 'toon from Disney was \"Flowers and Trees,\" in 1932." 426 | ``` 427 | 428 | Not that bad! 429 | -------------------------------------------------------------------------------- /command-line-path/prepare_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | SRCTRAINFILE='yahoo_answers_csv/train.csv' 7 | SRCVALIDATIONFILE='yahoo_answers_csv/test.csv' 8 | 9 | DSTTRAINFILE='comprehend-train.csv' 10 | DSTVALIDATIONFILE='comprehend-test.csv' 11 | 12 | # Preparation of the train set 13 | trainFrame = pd.read_csv(SRCTRAINFILE, header=None) 14 | tqdm.pandas() 15 | 16 | # Amazon Comprehend "recommend[s] that you train the model with up to 1,000 training documents for 17 | # each label". and no more than 1000000 documents. 18 | # 19 | # Here, we are limiting to 20000 documents per label in order to reduce costs of this demo. 20 | # 21 | # If you want to test Amazon Comprehend on the full dataset, set MAXITEM to 100000 22 | # MAXITEM=100000 23 | 24 | MAXITEM=10000 25 | # Keeping MAXITEM for each label 26 | for i in range(1, 11): 27 | num = len(trainFrame[trainFrame[0] == i]) 28 | dropnum = num - MAXITEM 29 | indextodrop = trainFrame[trainFrame[0] == i].sample(n=dropnum).index 30 | trainFrame.drop(indextodrop, inplace=True) 31 | 32 | # Applying translation of numerical codes into labels 33 | trainFrame[0] = trainFrame[0].progress_apply({ 34 | 1:'SOCIETY_AND_CULTURE', 35 | 2:'SCIENCE_AND_MATHEMATICS', 36 | 3:'HEALTH', 37 | 4:'EDUCATION_AND_REFERENCE', 38 | 5:'COMPUTERS_AND_INTERNET', 39 | 6:'SPORTS', 40 | 7:'BUSINESS_AND_FINANCE', 41 | 8:'ENTERTAINMENT_AND_MUSIC', 42 | 9:'FAMILY_AND_RELATIONSHIPS', 43 | 10:'POLITICS_AND_GOVERNMENT' 44 | }.get) 45 | # Joining "Question title", "question content", and "best answer". 46 | trainFrame['document'] = trainFrame[trainFrame.columns[1:]].progress_apply( 47 | lambda x: ' \\n '.join(x.dropna().astype(str)), 48 | axis=1 49 | ) 50 | # Keeping only the first two columns: label and joint text 51 | trainFrame.drop([1, 2, 3], axis=1, inplace=True) 52 | # Escaping ',' 53 | trainFrame['document'] = trainFrame['document'].str.replace(',', ',') 54 | # Writing csv file 55 | trainFrame.to_csv(path_or_buf=DSTTRAINFILE, 56 | header=False, 57 | index=False, 58 | escapechar='\\', 59 | doublequote=False, 60 | quotechar='"') 61 | 62 | 63 | # Preparation of the validation set 64 | validationFrame = pd.read_csv(SRCVALIDATIONFILE, header=None) 65 | tqdm.pandas() 66 | 67 | # Here, we are limiting to 100 documents to test in order to reduce costs of this demo. 68 | # If you want to test Amazon Comprehend on the full dataset, set MAXITEM to None 69 | # MAXITEM=None 70 | MAXITEM=100 71 | # Keeping MAXITEM 72 | if MAXITEM: 73 | num = len(validationFrame) 74 | dropnum = num - MAXITEM 75 | indextodrop = validationFrame.sample(n=dropnum).index 76 | validationFrame.drop(indextodrop, inplace=True) 77 | 78 | # Joining "Question title", "question content", and "best answer". 
79 | validationFrame['document'] = validationFrame[validationFrame.columns[1:]].progress_apply( 80 | lambda x: ' \\n '.join(x.dropna().astype(str)), 81 | axis=1 82 | ) 83 | # Removing all column but the aggregated one 84 | validationFrame.drop([0, 1, 2, 3], axis=1, inplace=True) 85 | # Escaping ',' 86 | validationFrame['document'] = validationFrame['document'].str.replace(',', ',') 87 | # Writing csv file 88 | validationFrame.to_csv(path_or_buf=DSTVALIDATIONFILE, 89 | header=False, 90 | index=False, 91 | escapechar='\\', 92 | doublequote=False, 93 | quotechar='"') 94 | -------------------------------------------------------------------------------- /command-line-path/requirements.txt: -------------------------------------------------------------------------------- 1 | panda 2 | tqdm 3 | -------------------------------------------------------------------------------- /comprehend-experiment-notebook.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Description: 'Easily create a comprehend experimentation notebook' 3 | Resources: 4 | SageMakerRole: 5 | Type: AWS::IAM::Role 6 | Properties: 7 | AssumeRolePolicyDocument: 8 | Version: 2012-10-17 9 | Statement: 10 | - Effect: Allow 11 | Principal: 12 | Service: sagemaker.amazonaws.com 13 | Action: sts:AssumeRole 14 | ManagedPolicyArns: 15 | - arn:aws:iam::aws:policy/AmazonS3FullAccess 16 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 17 | - arn:aws:iam::aws:policy/ComprehendFullAccess 18 | - arn:aws:iam::aws:policy/IAMFullAccess 19 | 20 | SagemakerInstance: 21 | Type: AWS::SageMaker::NotebookInstance 22 | Properties: 23 | InstanceType: ml.t3.2xlarge 24 | DirectInternetAccess: Enabled 25 | NotebookInstanceName: comprehend-experiment 26 | VolumeSizeInGB: 5 27 | DefaultCodeRepository: https://github.com/hervenivon/aws-experiments-comprehend-custom-classifier.git 28 | RoleArn: 29 | Fn::GetAtt: 30 | - SageMakerRole 31 | - Arn 32 | LifecycleConfigName: 33 | Fn::GetAtt: 34 | - NotebookInstanceLifecycleConfig 35 | - NotebookInstanceLifecycleConfigName 36 | NotebookInstanceLifecycleConfig: 37 | Type: AWS::SageMaker::NotebookInstanceLifecycleConfig 38 | Properties: 39 | NotebookInstanceLifecycleConfigName: comprehend-experiment-lifecycle-config 40 | OnCreate: 41 | - Content: 42 | Fn::Base64: 43 | '#!/bin/bash 44 | 45 | echo "Custom OnCreate Lifecycle Config for future use"' 46 | OnStart: 47 | - Content: 48 | Fn::Base64: 49 | '#!/bin/bash 50 | 51 | echo "Custom OnStart Lifecycle Config for future use"' 52 | 53 | Outputs: 54 | SagemakerNotebookInstanceARN: 55 | Description: Sagemaker notebook instance arn 56 | Value: !Ref SagemakerInstance 57 | -------------------------------------------------------------------------------- /images/DUS_Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/images/DUS_Arch.png -------------------------------------------------------------------------------- /images/analyzing-text-amazon-es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/images/analyzing-text-amazon-es.png -------------------------------------------------------------------------------- /images/custom-classifier-performances.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/images/custom-classifier-performances.png -------------------------------------------------------------------------------- /images/custom-classifier-training-process.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/images/custom-classifier-training-process.drawio.png -------------------------------------------------------------------------------- /images/entities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/images/entities.png -------------------------------------------------------------------------------- /images/sam-app.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/images/sam-app.drawio.png -------------------------------------------------------------------------------- /sam-app/.aws-sam/build.toml: -------------------------------------------------------------------------------- 1 | # This file is auto generated by SAM CLI build command 2 | 3 | [build_definitions] 4 | [build_definitions.a4246237-4861-414a-ab6a-adb1f0e1199d] 5 | codeuri = "custom_classifier/" 6 | runtime = "python3.7" 7 | functions = ["CustomClassifierFunction"] 8 | -------------------------------------------------------------------------------- /sam-app/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode 3 | 4 | ### Linux ### 5 | *~ 6 | 7 | # temporary files which can be created if a process still has a handle open of a deleted file 8 | .fuse_hidden* 9 | 10 | # KDE directory preferences 11 | .directory 12 | 13 | # Linux trash folder which might appear on any partition or disk 14 | .Trash-* 15 | 16 | # .nfs files are created when an open file is removed but is still being accessed 17 | .nfs* 18 | 19 | ### OSX ### 20 | *.DS_Store 21 | .AppleDouble 22 | .LSOverride 23 | 24 | # Icon must end with two \r 25 | Icon 26 | 27 | # Thumbnails 28 | ._* 29 | 30 | # Files that might appear in the root of a volume 31 | .DocumentRevisions-V100 32 | .fseventsd 33 | .Spotlight-V100 34 | .TemporaryItems 35 | .Trashes 36 | .VolumeIcon.icns 37 | .com.apple.timemachine.donotpresent 38 | 39 | # Directories potentially created on remote AFP share 40 | .AppleDB 41 | .AppleDesktop 42 | Network Trash Folder 43 | Temporary Items 44 | .apdisk 45 | 46 | ### PyCharm ### 47 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 48 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 49 | 50 | # User-specific stuff: 51 | .idea/**/workspace.xml 52 | .idea/**/tasks.xml 53 | .idea/dictionaries 54 | 55 | # Sensitive or high-churn files: 56 | .idea/**/dataSources/ 57 | .idea/**/dataSources.ids 58 | .idea/**/dataSources.xml 59 | .idea/**/dataSources.local.xml 60 | .idea/**/sqlDataSources.xml 61 | 
.idea/**/dynamic.xml 62 | .idea/**/uiDesigner.xml 63 | 64 | # Gradle: 65 | .idea/**/gradle.xml 66 | .idea/**/libraries 67 | 68 | # CMake 69 | cmake-build-debug/ 70 | 71 | # Mongo Explorer plugin: 72 | .idea/**/mongoSettings.xml 73 | 74 | ## File-based project format: 75 | *.iws 76 | 77 | ## Plugin-specific files: 78 | 79 | # IntelliJ 80 | /out/ 81 | 82 | # mpeltonen/sbt-idea plugin 83 | .idea_modules/ 84 | 85 | # JIRA plugin 86 | atlassian-ide-plugin.xml 87 | 88 | # Cursive Clojure plugin 89 | .idea/replstate.xml 90 | 91 | # Ruby plugin and RubyMine 92 | /.rakeTasks 93 | 94 | # Crashlytics plugin (for Android Studio and IntelliJ) 95 | com_crashlytics_export_strings.xml 96 | crashlytics.properties 97 | crashlytics-build.properties 98 | fabric.properties 99 | 100 | ### PyCharm Patch ### 101 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 102 | 103 | # *.iml 104 | # modules.xml 105 | # .idea/misc.xml 106 | # *.ipr 107 | 108 | # Sonarlint plugin 109 | .idea/sonarlint 110 | 111 | ### Python ### 112 | # Byte-compiled / optimized / DLL files 113 | __pycache__/ 114 | *.py[cod] 115 | *$py.class 116 | 117 | # C extensions 118 | *.so 119 | 120 | # Distribution / packaging 121 | .Python 122 | build/ 123 | develop-eggs/ 124 | dist/ 125 | downloads/ 126 | eggs/ 127 | .eggs/ 128 | lib/ 129 | lib64/ 130 | parts/ 131 | sdist/ 132 | var/ 133 | wheels/ 134 | *.egg-info/ 135 | .installed.cfg 136 | *.egg 137 | 138 | # PyInstaller 139 | # Usually these files are written by a python script from a template 140 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 141 | *.manifest 142 | *.spec 143 | 144 | # Installer logs 145 | pip-log.txt 146 | pip-delete-this-directory.txt 147 | 148 | # Unit test / coverage reports 149 | htmlcov/ 150 | .tox/ 151 | .coverage 152 | .coverage.* 153 | .cache 154 | .pytest_cache/ 155 | nosetests.xml 156 | coverage.xml 157 | *.cover 158 | .hypothesis/ 159 | 160 | # Translations 161 | *.mo 162 | *.pot 163 | 164 | # Flask stuff: 165 | instance/ 166 | .webassets-cache 167 | 168 | # Scrapy stuff: 169 | .scrapy 170 | 171 | # Sphinx documentation 172 | docs/_build/ 173 | 174 | # PyBuilder 175 | target/ 176 | 177 | # Jupyter Notebook 178 | .ipynb_checkpoints 179 | 180 | # pyenv 181 | .python-version 182 | 183 | # celery beat schedule file 184 | celerybeat-schedule.* 185 | 186 | # SageMath parsed files 187 | *.sage.py 188 | 189 | # Environments 190 | .env 191 | .venv 192 | env/ 193 | venv/ 194 | ENV/ 195 | env.bak/ 196 | venv.bak/ 197 | 198 | # Spyder project settings 199 | .spyderproject 200 | .spyproject 201 | 202 | # Rope project settings 203 | .ropeproject 204 | 205 | # mkdocs documentation 206 | /site 207 | 208 | # mypy 209 | .mypy_cache/ 210 | 211 | ### VisualStudioCode ### 212 | .vscode/* 213 | !.vscode/settings.json 214 | !.vscode/tasks.json 215 | !.vscode/launch.json 216 | !.vscode/extensions.json 217 | .history 218 | 219 | ### Windows ### 220 | # Windows thumbnail cache files 221 | Thumbs.db 222 | ehthumbs.db 223 | ehthumbs_vista.db 224 | 225 | # Folder config file 226 | Desktop.ini 227 | 228 | # Recycle Bin used on file shares 229 | $RECYCLE.BIN/ 230 | 231 | # Windows Installer files 232 | *.cab 233 | *.msi 234 | *.msm 235 | *.msp 236 | 237 | # Windows shortcuts 238 | *.lnk 239 | 240 | # Build folder 241 | 242 | */build/* 243 | 244 | # End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm,visualstudiocode -------------------------------------------------------------------------------- 
/sam-app/README.md: -------------------------------------------------------------------------------- 1 | # comprehend-custom-classifier-api 2 | 3 | This project contains source code and supporting files for a serverless application that you can deploy with the SAM CLI. It includes the following files and folders. 4 | 5 | - custom_classifier - Code for the application's Lambda function. 6 | - events - Invocation events that you can use to invoke the function. 7 | - tests - Unit tests for the application code. 8 | - template.yaml - A template that defines the application's AWS resources. 9 | 10 | The application uses several AWS resources, including Lambda functions and an API Gateway API. These resources are defined in the `template.yaml` file in this project. You can update the template to add AWS resources through the same deployment process that updates your application code. 11 | 12 | If you prefer to use an integrated development environment (IDE) to build and test your application, you can use the AWS Toolkit. 13 | The AWS Toolkit is an open source plug-in for popular IDEs that uses the SAM CLI to build and deploy serverless applications on AWS. The AWS Toolkit also adds a simplified step-through debugging experience for Lambda function code. See the following links to get started: 14 | 15 | - [PyCharm](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) 16 | - [VS Code](https://docs.aws.amazon.com/toolkit-for-vscode/latest/userguide/welcome.html) 17 | 18 | ## Deploy the sample application 19 | 20 | The Serverless Application Model Command Line Interface (SAM CLI) is an extension of the AWS CLI that adds functionality for building and testing Lambda applications. It uses Docker to run your functions in an Amazon Linux environment that matches AWS Lambda. It can also emulate your application's build environment and API. 21 | 22 | To use the SAM CLI, you need the following tools. 23 | 24 | - SAM CLI - [Install the SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html) 25 | - Python 3 - [Install Python](https://www.python.org/downloads/) 26 | - Docker - [Install Docker community edition](https://hub.docker.com/search/?type=edition&offering=community) 27 | 28 | To build and deploy your application for the first time, run the following command in your shell: 29 | 30 | ```bash 31 | sam build --use-container 32 | sam deploy --guided 33 | ``` 34 | 35 | The first command will build the source of your application. The second command will package and deploy your application to AWS, with a series of prompts: 36 | 37 | - **Stack Name**: The name of the stack to deploy to CloudFormation. This should be unique to your account and region, and a good starting point would be something matching your project name. 38 | - **AWS Region**: The AWS region you want to deploy your app to. 39 | - **Confirm changes before deploy**: If set to yes, any change sets will be shown to you before execution for manual review. If set to no, the AWS SAM CLI will automatically deploy application changes. 40 | - **Allow SAM CLI IAM role creation**: Many AWS SAM templates, including this example, create AWS IAM roles required for the AWS Lambda function(s) included to access AWS services. By default, these are scoped down to minimum required permissions. To deploy an AWS CloudFormation stack which creates or modified IAM roles, the `CAPABILITY_IAM` value for `capabilities` must be provided. 
If permission isn't provided through this prompt, to deploy this example you must explicitly pass `--capabilities CAPABILITY_IAM` to the `sam deploy` command. 41 | - **Save arguments to samconfig.toml**: If set to yes, your choices will be saved to a configuration file inside the project, so that in the future you can just re-run `sam deploy` without parameters to deploy changes to your application. 42 | 43 | The guided deployment will also ask for the comprehend custom endpoint ARN to pass it to the AWS Lambda function at deployment time as an environment variable. 44 | 45 | You can find your API Gateway Endpoint URL in the output values displayed after deployment. 46 | 47 | As the `template.yaml` requires an API Key (`ApiKeyRequired: true`), it is necessary to [create a usage plan]( 48 | https://docs.aws.amazon.com/apigateway/latest/developerguide/api-gateway-api-usage-plans.html) in order to get access to the API 49 | 50 | ## Use the SAM CLI to build and test locally 51 | 52 | Build your application with the `sam build --use-container` command. 53 | 54 | ```bash 55 | sam-app$ sam build --use-container 56 | ``` 57 | 58 | The SAM CLI installs dependencies defined in `custom_classifier/requirements.txt`, creates a deployment package, and saves it in the `.aws-sam/build` folder. 59 | 60 | Test a single function by invoking it directly with a test event. An event is a JSON document that represents the input that the function receives from the event source. Test events are included in the `events` folder in this project. 61 | 62 | Run functions locally and invoke them with the `sam local invoke` command or with the more complete: 63 | 64 | ```bash 65 | sam-app$ sam local invoke CustomClassifierFunction --event events/event.json 66 | ``` 67 | 68 | The SAM CLI can also emulate your application's API. Use the `sam local start-api` to run the API locally on port 3000. 69 | 70 | ```bash 71 | sam-app$ sam local start-api 72 | sam-app$ curl http://localhost:3000/classify?text=What%20is%20the%20answer%20to%20life%20the%20universe%20and%20everything%3F 73 | ``` 74 | 75 | The SAM CLI reads the application template to determine the API's routes and the functions that they invoke. The `Events` property on each function's definition includes the route and method for each path. 76 | 77 | ```yaml 78 | Events: 79 | Classification: 80 | Type: Api 81 | Properties: 82 | Path: /classify 83 | Method: get 84 | Auth: 85 | ApiKeyRequired: true 86 | RequestParameters: 87 | - method.request.querystring.text: 88 | Required: true 89 | ``` 90 | 91 | ## Add a resource to your application 92 | 93 | The application template uses AWS Serverless Application Model (AWS SAM) to define application resources. AWS SAM is an extension of AWS CloudFormation with a simpler syntax for configuring common serverless application resources such as functions, triggers, and APIs. For resources not included in [the SAM specification](https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md), you can use standard [AWS CloudFormation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-template-resource-type-ref.html) resource types. 94 | 95 | ## Fetch, tail, and filter Lambda function logs 96 | 97 | To simplify troubleshooting, SAM CLI has a command called `sam logs`. `sam logs` lets you fetch logs generated by your deployed Lambda function from the command line. In addition to printing the logs on the terminal, this command has several nifty features to help you quickly find the bug. 
98 | 99 | `NOTE`: This command works for all AWS Lambda functions; not just the ones you deploy using SAM. 100 | 101 | ```bash 102 | sam-app$ sam logs -n CustomClassifierFunction --stack-name comprehend-custom-classifier-api --tail 103 | ``` 104 | 105 | You can find more information and examples about filtering Lambda function logs in the [SAM CLI Documentation](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-logging.html). 106 | 107 | ## Unit tests 108 | 109 | Tests are defined in the `tests` folder in this project. Use PIP to install the [pytest](https://docs.pytest.org/en/latest/) and run unit tests. 110 | 111 | ```bash 112 | sam-app$ pip install pytest pytest-mock --user 113 | sam-app$ python -m pytest tests/ -v 114 | ``` 115 | 116 | ## Cleanup 117 | 118 | To delete the sample application that you created, use the AWS CLI. Assuming you used your project name for the stack name, you can run the following: 119 | 120 | ```bash 121 | aws cloudformation delete-stack --stack-name comprehend-custom-classifier-api 122 | ``` 123 | 124 | ## Resources 125 | 126 | See the [AWS SAM developer guide](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/what-is-sam.html) for an introduction to SAM specification, the SAM CLI, and serverless application concepts. 127 | -------------------------------------------------------------------------------- /sam-app/custom_classifier/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/sam-app/custom_classifier/__init__.py -------------------------------------------------------------------------------- /sam-app/custom_classifier/app.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | import os 4 | 5 | # Init 6 | client = boto3.client('comprehend') 7 | endpoint_arn = os.environ['ENDPOINT_ARN'] 8 | 9 | def lambda_handler(event, context): 10 | """Sample pure Lambda function 11 | 12 | Parameters 13 | ---------- 14 | event: dict, required 15 | API Gateway Lambda Proxy Input Format 16 | 17 | Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format 18 | 19 | context: object, required 20 | Lambda Context runtime methods and attributes 21 | 22 | Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html 23 | 24 | Returns 25 | ------ 26 | API Gateway Lambda Proxy Output Format: dict 27 | 28 | Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html 29 | """ 30 | 31 | try: 32 | print('QueryStringParameter: {}'.format(event['queryStringParameters'])) 33 | if event['queryStringParameters'] == None or not 'text' in event['queryStringParameters']: 34 | return { 35 | 'statusCode': 200, 36 | 'body': json.dumps({'message': 'Please provide a text parameter as a querystring.'}), 37 | } 38 | text = event['queryStringParameters']['text'] 39 | if len(text) <= 5: 40 | return { 41 | 'statusCode': 200, 42 | 'body': json.dumps({'message': 'Text length must be superior to 5 caracters.'}), 43 | } 44 | response = client.classify_document( 45 | Text=text, 46 | EndpointArn=endpoint_arn 47 | ) 48 | return { 49 | 'statusCode': 200, 50 | 'body': json.dumps(response['Classes']), 51 | } 52 | except Exception 
as e: 53 | print(e) 54 | return { 55 | 'statusCode': 200, 56 | 'body': json.dumps({'message': 'An error occured, please try again later.'}), 57 | } -------------------------------------------------------------------------------- /sam-app/custom_classifier/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.15.16 2 | requests -------------------------------------------------------------------------------- /sam-app/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "body": "{\"message\": \"hello world\"}", 3 | "resource": "/{proxy+}", 4 | "path": "/path/to/resource", 5 | "httpMethod": "POST", 6 | "isBase64Encoded": false, 7 | "queryStringParameters": { 8 | "foo": "bar" 9 | }, 10 | "pathParameters": { 11 | "proxy": "/path/to/resource" 12 | }, 13 | "stageVariables": { 14 | "baz": "qux" 15 | }, 16 | "headers": { 17 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 18 | "Accept-Encoding": "gzip, deflate, sdch", 19 | "Accept-Language": "en-US,en;q=0.8", 20 | "Cache-Control": "max-age=0", 21 | "CloudFront-Forwarded-Proto": "https", 22 | "CloudFront-Is-Desktop-Viewer": "true", 23 | "CloudFront-Is-Mobile-Viewer": "false", 24 | "CloudFront-Is-SmartTV-Viewer": "false", 25 | "CloudFront-Is-Tablet-Viewer": "false", 26 | "CloudFront-Viewer-Country": "US", 27 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 28 | "Upgrade-Insecure-Requests": "1", 29 | "User-Agent": "Custom User Agent String", 30 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 31 | "X-Amz-Cf-Id": "cDehVQoZnx43VYQb9j2-nvCh-9z396Uhbp027Y2JvkCPNLmGJHqlaA==", 32 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 33 | "X-Forwarded-Port": "443", 34 | "X-Forwarded-Proto": "https" 35 | }, 36 | "requestContext": { 37 | "accountId": "123456789012", 38 | "resourceId": "123456", 39 | "stage": "prod", 40 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 41 | "requestTime": "09/Apr/2015:12:34:56 +0000", 42 | "requestTimeEpoch": 1428582896000, 43 | "identity": { 44 | "cognitoIdentityPoolId": null, 45 | "accountId": null, 46 | "cognitoIdentityId": null, 47 | "caller": null, 48 | "accessKey": null, 49 | "sourceIp": "127.0.0.1", 50 | "cognitoAuthenticationType": null, 51 | "cognitoAuthenticationProvider": null, 52 | "userArn": null, 53 | "userAgent": "Custom User Agent String", 54 | "user": null 55 | }, 56 | "path": "/prod/path/to/resource", 57 | "resourcePath": "/{proxy+}", 58 | "httpMethod": "POST", 59 | "apiId": "1234567890", 60 | "protocol": "HTTP/1.1" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /sam-app/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "custom-comprehend-api" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-1xi4a1760o1sq" 7 | s3_prefix = "custom-comprehend-api" 8 | region = "us-east-1" 9 | confirm_changeset = true 10 | capabilities = "CAPABILITY_IAM" 11 | parameter_overrides = "EndpointARN=\"arn:aws:comprehend:us-east-1:313506734225:document-classifier-endpoint/yahoo-answers-endpoint\"" 12 | -------------------------------------------------------------------------------- /sam-app/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | 
Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | comprehend-custom-classifier-api 5 | 6 | Sample SAM Template for comprehend-custom-classifier-api 7 | 8 | # More info about Globals: https://github.com/awslabs/serverless-application-model/blob/master/docs/globals.rst 9 | Globals: 10 | Function: 11 | Timeout: 3 12 | 13 | # More info about Parameters: https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/sam-cli-command-reference-sam-deploy.html 14 | Parameters: 15 | EndpointARN: 16 | Type: String 17 | Description: The Custom Classifier Endpoint ARN to use 18 | Default: arn:aws:comprehend:us-east-1:123456789012:document-classifier-endpoint/yahoo-answers-endpoint 19 | 20 | Resources: 21 | CustomClassifierFunction: 22 | Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction 23 | Properties: 24 | CodeUri: custom_classifier/ 25 | Handler: app.lambda_handler 26 | Runtime: python3.7 27 | Description: > 28 | Looks for a `text` parameter in the event query string. 29 | 30 | With that parameter, calls the Amazon Comprehend Custom Classifier and returns classification. 31 | Environment: 32 | Variables: 33 | ENDPOINT_ARN: !Ref EndpointARN 34 | Policies: 35 | - AWSLambdaExecute 36 | - Version: '2012-10-17' # Policy Document 37 | Statement: 38 | - Effect: Allow 39 | Action: 40 | - comprehend:ClassifyDocument 41 | Resource: '*' 42 | Events: 43 | Classification: 44 | Type: Api # More info about API Event Source: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#api 45 | Properties: 46 | Path: /classify 47 | Method: get 48 | Auth: 49 | ApiKeyRequired: true 50 | RequestParameters: 51 | - method.request.querystring.text: 52 | Required: true 53 | 54 | Outputs: 55 | # ServerlessRestApi is an implicit API created out of Events key under Serverless::Function 56 | # Find out more about other implicit resources you can reference within SAM 57 | # https://github.com/awslabs/serverless-application-model/blob/master/docs/internals/generated_resources.rst#api 58 | CustomClassifierApi: 59 | Description: "API Gateway endpoint URL for Prod stage for Custom Classifier function" 60 | Value: !Sub "https://${ServerlessRestApi}.execute-api.${AWS::Region}.amazonaws.com/Prod/classify/" 61 | CustomClassifierFunction: 62 | Description: "Custom Classification Lambda Function ARN" 63 | Value: !GetAtt CustomClassifierFunction.Arn 64 | CustomClassificationIamRole: 65 | Description: "Implicit IAM Role created for Custom Classification function" 66 | Value: !GetAtt CustomClassifierFunctionRole.Arn -------------------------------------------------------------------------------- /sam-app/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hervenivon/aws-experiments-comprehend-custom-classifier/663ebf9cab52be09e462ddb7d7ced5853ce53e47/sam-app/tests/unit/__init__.py -------------------------------------------------------------------------------- /sam-app/tests/unit/test_handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | 5 | from custom_classifier import app 6 | 7 | 8 | @pytest.fixture() 9 | def apigw_event(): 10 | """ Generates API GW Event""" 11 | 12 | return { 13 | "body": '{ "test": "body"}', 14 | "resource": "/{proxy+}", 15 | "requestContext": { 16 | "resourceId": 
"123456", 17 | "apiId": "1234567890", 18 | "resourcePath": "/{proxy+}", 19 | "httpMethod": "POST", 20 | "requestId": "c6af9ac6-7b61-11e6-9a41-93e8deadbeef", 21 | "accountId": "123456789012", 22 | "identity": { 23 | "apiKey": "", 24 | "userArn": "", 25 | "cognitoAuthenticationType": "", 26 | "caller": "", 27 | "userAgent": "Custom User Agent String", 28 | "user": "", 29 | "cognitoIdentityPoolId": "", 30 | "cognitoIdentityId": "", 31 | "cognitoAuthenticationProvider": "", 32 | "sourceIp": "127.0.0.1", 33 | "accountId": "", 34 | }, 35 | "stage": "prod", 36 | }, 37 | "queryStringParameters": {"foo": "bar"}, 38 | "headers": { 39 | "Via": "1.1 08f323deadbeefa7af34d5feb414ce27.cloudfront.net (CloudFront)", 40 | "Accept-Language": "en-US,en;q=0.8", 41 | "CloudFront-Is-Desktop-Viewer": "true", 42 | "CloudFront-Is-SmartTV-Viewer": "false", 43 | "CloudFront-Is-Mobile-Viewer": "false", 44 | "X-Forwarded-For": "127.0.0.1, 127.0.0.2", 45 | "CloudFront-Viewer-Country": "US", 46 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 47 | "Upgrade-Insecure-Requests": "1", 48 | "X-Forwarded-Port": "443", 49 | "Host": "1234567890.execute-api.us-east-1.amazonaws.com", 50 | "X-Forwarded-Proto": "https", 51 | "X-Amz-Cf-Id": "aaaaaaaaaae3VYQb9jd-nvCd-de396Uhbp027Y2JvkCPNLmGJHqlaA==", 52 | "CloudFront-Is-Tablet-Viewer": "false", 53 | "Cache-Control": "max-age=0", 54 | "User-Agent": "Custom User Agent String", 55 | "CloudFront-Forwarded-Proto": "https", 56 | "Accept-Encoding": "gzip, deflate, sdch", 57 | }, 58 | "pathParameters": {"proxy": "/examplepath"}, 59 | "httpMethod": "POST", 60 | "stageVariables": {"baz": "qux"}, 61 | "path": "/examplepath", 62 | } 63 | 64 | 65 | def test_lambda_handler(apigw_event, mocker): 66 | 67 | ret = app.lambda_handler(apigw_event, "") 68 | data = json.loads(ret["body"]) 69 | 70 | assert ret["statusCode"] == 200 71 | assert "message" in ret["body"] 72 | assert data["message"] == "Please provide a text parameter as a querystring." 73 | # assert "location" in data.dict_keys() 74 | --------------------------------------------------------------------------------