├── web ├── public │ ├── favicon.ico │ ├── manifest.json │ ├── index.html │ ├── template.html │ └── test_template.html ├── src │ ├── index.js │ ├── App.test.js │ ├── App.css │ ├── index.css │ └── App.js ├── README.md └── package.json ├── server ├── processing │ ├── deploy.sh │ ├── s3_helper.py │ ├── cfn-template.json │ ├── sagemaker-gt-preprocess.py │ └── sagemaker-gt-postprocess.py ├── data │ ├── mini_manifest.json │ └── manifest.json └── prep │ ├── prep_manifest.py │ └── detect_lines.py ├── .gitignore └── README.md /web/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nitinaws/gt-custom-workflow/HEAD/web/public/favicon.ico -------------------------------------------------------------------------------- /web/src/index.js: -------------------------------------------------------------------------------- 1 | import 'bootstrap/dist/css/bootstrap.min.css'; 2 | import React from 'react'; 3 | import ReactDOM from 'react-dom'; 4 | import './index.css'; 5 | import App from './App'; 6 | 7 | ReactDOM.render(, document.getElementById('root')); 8 | -------------------------------------------------------------------------------- /web/src/App.test.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import App from './App'; 4 | 5 | it('renders without crashing', () => { 6 | const div = document.createElement('div'); 7 | ReactDOM.render(, div); 8 | ReactDOM.unmountComponentAtNode(div); 9 | }); 10 | -------------------------------------------------------------------------------- /web/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | } 10 | ], 11 | 
#!/bin/sh
# Package the labeling Lambda sources (*.py in the current directory) and
# upload the archive plus the CloudFormation template to an S3 bucket.
#
# Usage: deploy.sh <s3-bucket-name>
#
# Uploads:
#   labeling_lambda.zip -> s3://<s3-bucket-name>/coderepo/
#   cfn-template.json   -> s3://<s3-bucket-name>/cft/
#
# Requires the `zip` and `aws` command line tools on PATH.

# Stop at the first failing command so a failed zip/upload is not ignored.
set -e

if [ "$#" -ne 1 ]
then
    echo "Usage: deploy.sh <s3-bucket-name>" >&2
    # POSIX sh only accepts exit statuses 0-255, so `exit -1` is not portable.
    exit 1
fi

bucket="$1"
archive="labeling_lambda.zip"

# Always remove the local archive, even when an upload fails.
trap 'rm -f "$archive"' EXIT

# `zip` updates an existing archive in place; start from a clean slate so
# files deleted from the source tree do not linger in the package.
rm -f "$archive"

zip "$archive" *.py

aws s3 cp "$archive" "s3://${bucket}/coderepo/"

aws s3 cp cfn-template.json "s3://${bucket}/cft/"
{ 30 | transform: rotate(360deg); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /web/node_modules 2 | /web/package-lock.json 3 | 4 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 5 | 6 | # dependencies 7 | web/node_modules 8 | web/.pnp 9 | web/.pnp.js 10 | 11 | # testing 12 | web/coverage 13 | 14 | # production 15 | web/build 16 | 17 | # misc 18 | web/.DS_Store 19 | web/.env.local 20 | web/.env.development.local 21 | web/.env.test.local 22 | web/.env.production.local 23 | 24 | web/npm-debug.log* 25 | web/yarn-debug.log* 26 | web/yarn-error.log* 27 | web/yarn.lock 28 | !web/src 29 | !web/src/* 30 | web/.idea 31 | 32 | # server 33 | 34 | server/.idea 35 | server/.serverless 36 | server/processing/labeling_lambda.zip 37 | server/.DS_Store 38 | server/data/.DS_Store 39 | -------------------------------------------------------------------------------- /web/README.md: -------------------------------------------------------------------------------- 1 | #### Steps taken to create this project: 2 | - Install Node, NPM Package Manager and Yarn(https://www.npmjs.com/package/yarn) 3 | 4 | - Clone repository and install pre-requisites 5 | 6 | ``` 7 | git clone https://github.com/nitinaws/gt-custom-workflow.git 8 | cd web 9 | yarn add react-text-annotate 10 | yarn build 11 | ``` 12 | 13 | - Update your s3 bucket/prefix in `package.json` 14 | 15 | ``` 16 | "clean": "aws s3 rm --recursive s3:///web/", 17 | "deploy": "aws s3 cp --recursive build/ s3:///web/ --acl public-read" 18 | ``` 19 | 20 | - Now run to upload project to S3 bucket 21 | ```yarn clean 22 | yarn deploy 23 | ``` 24 | 25 | - Make sure to change CSS and JS URLs to point to your S3 urls in `public\template.html` 26 | 27 | 28 | -------------------------------------------------------------------------------- /web/package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "text-annotator", 3 | "version": "0.1.0", 4 | "private": true, 5 | "dependencies": { 6 | "bootstrap": "^4.2.1", 7 | "react": "^16.7.0", 8 | "react-dom": "^16.7.0", 9 | "react-native": "^0.57.8", 10 | "react-scripts": "2.1.3", 11 | "react-text-annotate": "^0.1.0" 12 | }, 13 | "scripts": { 14 | "start": "react-scripts start", 15 | "build": "react-scripts build", 16 | "test": "react-scripts test", 17 | "eject": "react-scripts eject", 18 | "clean": "aws s3 rm --recursive s3://smgtannotation/web/", 19 | "deploy": "aws s3 cp --recursive build/ s3://smgtannotation/web/ --acl public-read" 20 | }, 21 | "eslintConfig": { 22 | "extends": "react-app" 23 | }, 24 | "browserslist": [ 25 | ">0.2%", 26 | "not dead", 27 | "not ie <= 11", 28 | "not op_mini all" 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /web/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 9 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 23 | Submit 24 | 25 | 34 | -------------------------------------------------------------------------------- /web/public/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 12 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 26 | Submit 27 | 28 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /web/public/test_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 12 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 26 | Submit 27 | 28 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /web/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | padding: 0; 4 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen", 5 | "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue", 6 | sans-serif; 7 | -webkit-font-smoothing: antialiased; 8 | -moz-osx-font-smoothing: grayscale; 9 | } 10 | 11 | code { 12 | font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New", 13 | monospace; 14 | } 15 | 16 | div.img_contain { 17 | overflow-y: auto; 18 | width: 100%; 19 | max-height: 100%; 20 | margin-left: 2%; 21 | margin-right: 2%; 22 | margin-top: 5%; 23 | margin-bottom: 5%; 24 | border-width: 2px; 25 | border-color: black; 26 | border-style: solid; 27 | background-color: white; 28 | } 29 | 30 | img.page { 31 | max-width:100%; 32 | max-length:100%; 33 | } 34 | 35 | div.controls { 36 | width:50%; 37 | display: table-cell; 38 | font-size: 0.75em; 39 | font-weight: bold; 40 | } 41 | 42 | button { 43 | width: 250px; 44 | height: 50px; 45 | background-color: white; 46 | vertical-align: top; 47 | font-size: 1.6em; 48 | font-weight: bold; 49 | border: 2px; 50 | border-style: solid; 51 | border-color: black; 52 | } 53 | 54 | button.yb { 55 | margin-right: 0.5em; 56 | width: 28%; 57 | height: 80%; 58 | background-color:#3DE63D; 59 | vertical-align: top; 60 | font-size: 1.8em; 61 | font-weight: bold; 62 | border: 2px; 63 | border-style: solid; 64 | border-color: black; 65 | } 66 | 67 | button.nb { 68 | width: 28%; 69 | height: 80%; 70 | margin-right: 0.5em; 71 | background-color: #FF4444; 72 | vertical-align: top; 73 | font-size: 1.8em; 74 | font-weight: bold; 75 | border: 2px; 76 | border-style: solid; 77 | border-color: black; 78 | } 79 | 80 | button.ud { 81 | width: 14%; 82 | height: 40%; 83 | background-color: rgb(151, 
151, 151); 84 | vertical-align: bottom; 85 | font-size: 2em; 86 | font-weight: bolder; 87 | border: 2px; 88 | border-style: solid; 89 | border-color: black; 90 | } 91 | -------------------------------------------------------------------------------- /server/data/manifest.json: -------------------------------------------------------------------------------- 1 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00006.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00006.jpg.csv', 'metadata': {'Author': 'Robert Underwood', 'ISBN': '1-358-98355-0'}} 2 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00015.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00015.jpg.csv', 'metadata': {'Author': 'Stephanie Morgan', 'ISBN': '1-242-55362-2'}} 3 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00040.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00040.jpg.csv', 'metadata': {'Author': 'Angela Anderson', 'ISBN': '0-567-58708-8'}} 4 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00041.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00041.jpg.csv', 'metadata': {'Author': 'Kenneth Stanley', 'ISBN': '1-68939-208-8'}} 5 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00052.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00052.jpg.csv', 'metadata': {'Author': 'Bruce Peck', 'ISBN': '0-7126-2438-4'}} 6 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00090.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00090.jpg.csv', 'metadata': {'Author': 'Kerry Phillips', 'ISBN': '1-61581-532-5'}} 7 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00114.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00114.jpg.csv', 'metadata': {'Author': 'Frederick Watson', 'ISBN': '1-342-56153-8'}} 8 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00128.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00128.jpg.csv', 'metadata': {'Author': 
'Jennifer Gray', 'ISBN': '0-10-489174-2'}} 9 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/1801_00146.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/1801_00146.jpg.csv', 'metadata': {'Author': 'William Alexander', 'ISBN': '0-683-11895-1'}} 10 | {'source-ref': 's3://smgtannotation/raw-abstracts-jpgs/arXiv_1801_00067v2__astro-ph_GA__3_Apr_2018.jpg', 'text-file-s3-uri': 's3://smgtannotation/text/arXiv_1801_00067v2__astro-ph_GA__3_Apr_2018.jpg.csv', 'metadata': {'Author': 'Sean Moore', 'ISBN': '0-00-940460-0'}} 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Build your own custom labeling workflow using SageMaker Ground Truth 2 | 3 | Successful machine learning models are built on the shoulders of large volumes of high-quality training data, but the process to create the training data necessary to build these models is expensive, complicated, and time-consuming. The majority of models created today require a human to manually label data in a way that allows the model to learn how to make correct decisions. 4 | 5 | Amazon SageMaker Ground Truth provides built-in workflows for image classification, bounding boxes, text classification, and semantic segmentation use cases. You also have the option of building your own custom workflows where you define the user interface (UI) for performing data labeling. To help you move quickly, SageMaker provides you a number of commonly used custom UI templates for image, text, and audio data labeling use cases. These templates take advantage of SageMaker Ground Truth’s crowd HTML elements that are meant to simplify the process of building data labeling UIs. You can also specify your own arbitrary HTML for the UI. 
def prepare(s3_image_path, s3_data_path, s3_manifest_path):
    """Build an augmented manifest pairing each image with its extracted text.

    Lists the objects under ``s3_image_path`` and ``s3_data_path``. For every
    image whose ``<image file name>.csv`` exists under the data prefix, one
    entry with ``source-ref``, ``text-file-s3-uri`` and generated ``metadata``
    is produced. The entries are written, one JSON document per line, to
    ``<s3_manifest_path>/manifest.json``.

    :param s3_image_path: ``s3://bucket/prefix`` that holds the page images.
    :param s3_data_path: ``s3://bucket/prefix`` that holds the ``.csv`` text files.
    :param s3_manifest_path: ``s3://bucket/prefix`` under which the manifest is written.
    """
    image_url = urlparse(s3_image_path)
    data_url = urlparse(s3_data_path)
    output_url = urlparse(s3_manifest_path)

    # Normalised prefixes (no leading/trailing slash) used to build keys, so
    # "s3://bucket/text/" and "s3://bucket/text" behave the same.
    data_prefix = data_url.path.strip('/')
    output_prefix = output_url.path.strip('/')

    s3 = boto3.client("s3")

    # NOTE(review): list_objects returns a single page of results (1000 keys
    # by default); larger prefixes would need a paginator - confirm data size.
    image_response = s3.list_objects(Bucket=image_url.netloc, Prefix=image_url.path[1:])
    text_response = s3.list_objects(Bucket=data_url.netloc, Prefix=data_url.path[1:])

    image_list = parse_response(image_response)
    # A set gives O(1) membership checks while matching images to text files.
    text_file_keys = set(parse_response(text_response))

    content_list = []

    for item in image_list:
        print(item)
        image_filename = item.split('/')[-1]
        text_filename = "{}.csv".format(image_filename)
        text_key = "/".join(part for part in (data_prefix, text_filename) if part)
        print("Trying to find {}".format(text_key))
        if text_key in text_file_keys:
            print("Adding new Entry")
            entry = {
                'source-ref': "s3://{}/{}".format(image_url.netloc, item),
                'text-file-s3-uri': "s3://{}/{}".format(data_url.netloc, text_key),
                'metadata': fake_metadata(),
            }
            print(entry)
            content_list.append(entry)

    print(content_list)
    # Serialise with json.dumps: str(dict) produces single-quoted Python reprs,
    # which are not valid JSON and cannot be parsed as a JSON Lines manifest.
    content = "".join("{}\n".format(json.dumps(entry)) for entry in content_list)

    body = bytes(content, 'utf-8')

    manifest_key = "/".join(part for part in (output_prefix, "manifest.json") if part)
    s3.put_object(Bucket=output_url.netloc, Key=manifest_key, Body=body)


def parse_response(response):
    """Return the keys of all non-empty objects in an S3 ``list_objects`` response.

    Zero-byte objects (for example "folder" placeholder keys) are skipped.

    :param response: dict returned by ``list_objects``; the ``Contents`` entry
        may be missing when nothing matched the prefix.
    :return: list of object keys, in the order they appear in the response.
    """
    keys = []
    # Tolerate a response without "Contents" instead of raising KeyError.
    for content in response.get('Contents', []):
        if content['Size'] > 0:
            print(content['Key'])
            keys.append(content['Key'])

    return keys


def fake_metadata():
    """Return placeholder metadata (random author name and ISBN-10) for one entry."""
    return {"Author": faker.name(), "ISBN": faker.isbn10()}


def main(args):
    """Command line entry point.

    :param args: ``sys.argv``-style list:
        ``[program, s3_image_path, s3_data_path, s3_manifest_path]``.
    :raises SystemExit: with a usage message when an argument is missing.
    """
    if len(args) < 4:
        raise SystemExit(
            "Usage: prep_manifest.py <s3_image_path> <s3_data_path> <s3_manifest_path>")

    prepare(args[1], args[2], args[3])
class S3Client(object):
    """
    Helper Class for S3 operations
    """
    # Class-level defaults built from the ambient credentials. Instances that
    # are given a role_arn shadow these with clients bound to the assumed role.
    s3_client = boto3.client("s3")
    s3 = boto3.resource("s3")

    def __init__(self, role_arn=None, kms_key_id=None):
        """
        Initialize the S3 resource using provided Role and Kms Key

        :param role_arn: Role which have access to consolidation request S3 payload file.
                         When omitted, the class-level clients (ambient credentials) are used.
        :param kms_key_id: KMS key if S3 bucket is encrypted
        :return:
        """
        self.kms_key_id = kms_key_id

        if role_arn is None:
            # Nothing to assume: calling assume_role without a RoleArn would
            # fail, so keep the default clients defined on the class.
            return

        DEFAULT_SESSION = "Custom_Annotation_Consolidation_Lambda_Session"
        sts_connection = boto3.client('sts')
        assume_role_object = sts_connection.assume_role(RoleArn=role_arn, RoleSessionName=DEFAULT_SESSION)
        credentials = assume_role_object['Credentials']
        session = boto3.Session(
            aws_access_key_id=credentials['AccessKeyId'],
            aws_secret_access_key=credentials['SecretAccessKey'],
            aws_session_token=credentials['SessionToken'])
        self.s3 = session.resource('s3')
        self.s3_client = session.client('s3')

    def put_object_to_s3(self, data, bucket, key, content_type):
        """
        Helper function to persist data in S3

        :param data: object body to upload
        :param bucket: destination bucket name
        :param key: destination object key
        :param content_type: MIME type; falls back to "application/octet-stream" when empty
        :return: the ``s3://bucket/key`` URI of the stored object
        :raises ValueError: when the upload fails with a ClientError
        """
        if not content_type:
            # Default content type
            content_type = "application/octet-stream"

        try:
            image_object = self.s3.Object(bucket, key)
            if self.kms_key_id:
                image_object.put(Body=data, ContentType=content_type, SSEKMSKeyId=self.kms_key_id,
                                 ServerSideEncryption="aws:kms")
            else:
                image_object.put(Body=data, ContentType=content_type)
        except ClientError as e:
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError("Failed to put data in bucket: {} with key {}.".format(bucket, key), e) from e
        return "s3://" + image_object.bucket_name + "/" + image_object.key

    def get_object_from_s3(self, s3_url):
        """ Helper function to retrieve data from S3

        :param s3_url: ``s3://bucket/key`` URI of the object to read
        :return: object body decoded as UTF-8, or None when the object does not exist
        :raises ValueError: for any other ClientError
        """
        bucket, path = S3Client.bucket_key_from_s3_uri(s3_url)

        try:
            payload = self.s3_client.get_object(Bucket=bucket, Key=path).get('Body').read().decode('utf-8')
        except ClientError as e:
            print(e)
            # A missing object is an expected outcome and is reported as None.
            if e.response['Error']['Code'] in ("404", "NoSuchKey"):
                return None
            raise ValueError("Failed to retrieve data from {}.".format(s3_url), e) from e

        return payload

    @staticmethod
    def bucket_key_from_s3_uri(s3_path):
        """ Return bucket and key from s3 URL

        Parameters
        ----------
        s3_path: str, required
            s3 URL of data object ( image/video/text/audio etc )

        Returns
        ------
        bucket: str
            S3 Bucket of the passed URL
        key: str
            S3 Key of the passed URL
        """
        scheme = "s3://"
        # Strip the scheme only as a prefix; str.replace would also remove an
        # "s3://" sequence that happens to appear inside the key.
        if s3_path.startswith(scheme):
            s3_path = s3_path[len(scheme):]
        bucket, _, key = s3_path.partition("/")

        return bucket, key
def lambda_handler(event, context):
    """Sample PreHumanTaskLambda (pre-processing lambda) for custom labeling jobs.

    For custom AWS SageMaker Ground Truth Labeling Jobs, you have to specify a
    PreHumanTaskLambda (pre-processing lambda). AWS SageMaker invokes this lambda
    for each item to be labeled. Output of this lambda is merged with the
    specified custom UI template.

    Parameters
    ----------
    event: dict, required
        Content of event looks something like the following

        {
           "version":"2018-10-16",
           "labelingJobArn":"<labelingJobArn>",
           "dataObject":{
              "source-ref":"s3://<bucket>/<path>/awesome.jpg"
           }
        }

        Besides "source" / "source-ref", this handler also reads the optional
        "metadata" and "text-file-s3-uri" attributes of "dataObject".

        As SageMaker product evolves, content of event object will change.
        For the latest version refer to the following URL
        Event doc: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-custom-templates-step3.html

    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------
    output: dict
        {
           "taskInput":{
              "taskObject": <source or source-ref>,
              "metadata": <dataObject.metadata, when present>,
              "text": <content of text-file-s3-uri, when present>
           },
           "isHumanAnnotationRequired":"true"
        }

        "isHumanAnnotationRequired" is "false" when neither "source" nor
        "source-ref" is present.

        Note: Output of this lambda will be merged with the template you specify
        in your labeling job. You can use the preview button on the SageMaker
        Ground Truth console to make sure the merge is successful.
        Return doc: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-custom-templates-step3.html
    """

    # Event received
    print("Received event: " + json.dumps(event, indent=2))

    data_object = event['dataObject']

    # Optional attributes; each is None when not specified
    source = data_object.get('source')
    source_ref = data_object.get('source-ref')
    metadata = data_object.get('metadata')
    text_file_s3_uri = data_object.get('text-file-s3-uri')

    # if source field present, take that otherwise take source-ref
    task_object = source if source is not None else source_ref

    # Build response object
    output = {
        "taskInput": {
            "taskObject": task_object
        },
        "isHumanAnnotationRequired": "true"
    }

    if metadata is not None:
        # Pass the item's metadata through to the template
        output['taskInput']['metadata'] = metadata

    if text_file_s3_uri is not None:
        # Inline the text content so the template does not need S3 access
        print(text_file_s3_uri)
        output['taskInput']['text'] = getText(text_file_s3_uri)

    print(output)
    # If neither source nor source-ref specified, mark the annotation failed
    if task_object is None:
        # .get: the failure report itself must not raise on a sparse event
        print(" Failed to pre-process {} !".format(event.get("labelingJobArn")))
        output["isHumanAnnotationRequired"] = "false"

    return output


def getText(s3uri):
    """Return the content of the S3 object at ``s3uri`` decoded as UTF-8.

    :param s3uri: ``s3://bucket/key`` URI of a text object.
    :return: the object body as a string.
    :raises Exception: whatever the S3 read or the decoding raises; the error is
        logged and re-raised unchanged.
    """
    parsed = urlparse(s3uri)
    bucket = parsed.netloc
    key = parsed.path.lstrip('/')
    s3 = boto3.resource('s3')
    try:
        return s3.Object(bucket, key).get()['Body'].read().decode('utf8')
    except Exception as e:
        # Report the real cause: the failure is not necessarily a missing object
        print("Failed to read {}: {}".format(s3uri, e))
        raise
def get_rows_columns_map(table_result, blocks_map):
    """Map a table block to ``{row_index: {col_index: cell_text}}``.

    :param table_result: block dict whose CHILD relationships reference CELL blocks.
    :param blocks_map: dict of block id -> block dict.
    :return: nested dict keyed by ``RowIndex`` then ``ColumnIndex``; empty when
        the block has no relationships.
    """
    rows = {}
    # .get keeps this consistent with get_text, which also tolerates blocks
    # that carry no "Relationships" entry.
    for relationship in table_result.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] == 'CELL':
                row = rows.setdefault(cell['RowIndex'], {})
                row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows


def get_text(result, blocks_map):
    """Concatenate the text of all WORD children of ``result``.

    :param result: block dict; may have no "Relationships".
    :param blocks_map: dict of block id -> block dict.
    :return: the words in order, each followed by a single space ('' if none).
    """
    text = ''
    for relationship in result.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            word = blocks_map[child_id]
            if word['BlockType'] == 'WORD':
                text += word['Text'] + ' '
    return text


def get_table_csv_results(bucket, key):
    """Run asynchronous text detection on an S3 document and return its lines.

    Uses the module-level Textract ``client``. Starts a text-detection job,
    polls every 15 seconds until it leaves IN_PROGRESS, then collects every
    LINE block across all result pages.

    :param bucket: S3 bucket of the document.
    :param key: S3 key of the document.
    :return: the text of each detected line followed by a blank line, or the
        marker string " NO Table FOUND " when no LINE block was returned.
    :raises RuntimeError: when the job ends in any status other than
        SUCCEEDED or PARTIAL_SUCCESS.
    """
    response = client.start_document_text_detection(
        DocumentLocation={"S3Object": {"Bucket": bucket, "Name": key}})
    job_id = response['JobId']

    job_response = client.get_document_text_detection(JobId=job_id)
    while job_response['JobStatus'] == 'IN_PROGRESS':
        time.sleep(15)
        job_response = client.get_document_text_detection(JobId=job_id)

    status = job_response['JobStatus']
    if status not in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
        raise RuntimeError(
            "Text detection job {} for s3://{}/{} ended with status {}".format(
                job_id, bucket, key, status))

    # Results are paginated: keep following NextToken so long documents are
    # not truncated to the first page of blocks.
    blocks = list(job_response['Blocks'])
    next_token = job_response.get('NextToken')
    while next_token:
        page = client.get_document_text_detection(JobId=job_id, NextToken=next_token)
        blocks.extend(page['Blocks'])
        next_token = page.get('NextToken')

    blocks_map = {}
    line_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "LINE":
            line_blocks.append(block)

    if not line_blocks:
        return " NO Table FOUND "

    csv = ''
    for index, line_block in enumerate(line_blocks):
        csv += generate_table_csv_2(line_block, blocks_map, index + 1)
        csv += '\n\n'

    return csv


def generate_table_csv(table_result, blocks_map, table_index):
    """Render one table block as CSV text.

    :param table_result: table block dict (see ``get_rows_columns_map``).
    :param blocks_map: dict of block id -> block dict.
    :param table_index: 1-based number used in the "Table_<n>" heading.
    :return: a "Table: Table_<n>" heading, one comma-terminated line per row,
        and three trailing newlines.
    """
    rows = get_rows_columns_map(table_result, blocks_map)

    table_id = 'Table_' + str(table_index)

    csv = 'Table: {0}\n\n'.format(table_id)

    for row_index, cols in rows.items():
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'

    csv += '\n\n\n'
    return csv


def generate_table_csv_2(table_result, blocks_map, table_index):
    """Return the text of a single LINE block.

    ``blocks_map`` and ``table_index`` are accepted for signature parity with
    ``generate_table_csv`` but are not used.
    """
    return table_result['Text']


def main(args):
    """Extract text for every object under an S3 prefix and store it as ``.csv``.

    :param args: ``sys.argv``-style list: ``[program, input_s3_uri, output_s3_uri]``.
        For each non-empty object under the input prefix,
        ``<output prefix>/<file name>.csv`` is written with the detected lines.
    :raises SystemExit: with a usage message when an argument is missing.
    """
    # Local import: the module-level `from urlparse import urlparse` only
    # exists on Python 2, while the rest of this function needs Python 3.
    from urllib.parse import urlparse

    if len(args) < 3:
        raise SystemExit("Usage: detect_lines.py <input_s3_uri> <output_s3_uri>")

    # Drop trailing slashes so prefixes and output keys are built consistently
    input_loc = args[1].rstrip('/')
    output_loc = args[2].rstrip('/')

    input_url = urlparse(input_loc)
    output_url = urlparse(output_loc)

    bucket = input_url.netloc
    key = input_url.path[1:]

    print(key)

    s3 = boto3.client('s3')
    # NOTE(review): list_objects returns a single page of results (1000 keys
    # by default); larger prefixes would need a paginator - confirm data size.
    response = s3.list_objects(Bucket=bucket, Prefix=key)

    for content in response.get('Contents', []):
        if content['Size'] > 0:
            print(content['Key'])
            file_name = content['Key']
            csv_content = get_table_csv_results(bucket, file_name)

            csv_file = os.path.basename(file_name)
            output_file = '{}.csv'.format(csv_file)

            # bytes(str) without an encoding raises TypeError on Python 3
            body = csv_content.encode('utf-8')
            s3.put_object(Bucket=output_url.netloc,
                          Key="{}/{}".format(output_url.path[1:], output_file),
                          Body=body)
            # presumably a pause between jobs to stay under service rate limits
            time.sleep(5)
children }) => ( 18 |
26 | {children} 27 |
28 | ) 29 | 30 | class App extends Component { 31 | constructor(props) { 32 | super(props); 33 | 34 | this.state = { 35 | value: [], 36 | tag: 'BACKGROUND', 37 | claimResult: false, 38 | notes: '', 39 | metadata: METADATA, 40 | numPages: 1, 41 | pageNumber: 1, 42 | }; 43 | } 44 | 45 | handleChange = value => { 46 | this.setState({ value }); 47 | } 48 | 49 | handleTagChange = e => { 50 | this.setState({ tag: e.target.value }); 51 | } 52 | 53 | handleYesBtn = () => { 54 | this.setState({ claimResult: true }); 55 | } 56 | 57 | handleNoBtn = () => { 58 | this.setState({ claimResult: false }); 59 | } 60 | 61 | handleNotes = e => { 62 | this.setState({ notes: e.target.value }); 63 | } 64 | 65 | render() { 66 | return ( 67 |
68 | 69 |
70 |
71 |

Instructions

72 |

The task can be completed with blank, or saved and returned to when time is available to make more 73 | progress. 74 | If there is evidence in the record to support or deny abstract quality, highlight it with the 75 | cursor and select Yes or No. 76 | Add any notes you have for each task in the Notes free text area.

77 |
78 |
79 |
80 | 81 |
82 | 83 |
84 | alt 85 |
86 |
87 | 88 | 89 |
90 | 98 |
99 | 100 |
104 | ({ 114 | ...span, 115 | tag: this.state.tag, 116 | color: TAG_COLORS[this.state.tag], 117 | })} 118 | /> 119 |
120 | 121 |
122 |
123 |
Notes:
124 | 126 |
127 |
128 |

129 |
Is this a good Abstract?
130 |
131 | 132 |
133 |

134 |
135 | 136 |
137 |
138 |
139 |
140 |
141 | 142 |
# Dump boundary: closing lines of /web/src/App.js, kept verbatim from the dump:
#     )
#   }
# }
#
# export default App
#
# --------------------------------------------------------------------------------
# /server/processing/sagemaker-gt-postprocess.py:
# --------------------------------------------------------------------------------
import json
import sys


def lambda_handler(event, context):
    """Sample Annotation Consolidation Lambda for custom labeling jobs.

    Takes all worker responses for the items to be labeled and returns a
    consolidated annotation for each of them.

    Parameters
    ----------
    event: dict, required
        Content of an example event

        {
            "version": "2018-10-16",
            "labelingJobArn": "<labeling job ARN>",
            "labelCategories": ["<category>"],  # null when the job was created from the AWS console
            "labelAttributeName": "<label attribute name>",
            "roleArn": "string",
            "payload": {
                "s3Uri": "<S3 URI of the payload file>"
            },
            "outputConfig": "s3://<consolidated output location of the labeling job>"
        }

        Content of payload.s3Uri

        [
            {
                "datasetObjectId": "<data object id>",
                "dataObject": {
                    "s3Uri": "<S3 URI of the data object>",
                    "content": "<inline data object content>"
                },
                "annotations": [{
                    "workerId": "<worker id>",
                    "annotationData": {
                        "content": "<inline annotation JSON string>",
                        "s3Uri": "<S3 URI of the annotation>"
                    }
                }]
            }
        ]

        As the SageMaker product evolves, the content of the event object and
        of payload.s3Uri will change. For the latest version refer to:

        Event doc: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-custom-templates-step3.html

    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    consolidated_output: list of dict
        AnnotationConsolidation

        [
            {
                "datasetObjectId": "<data object id>",
                "consolidatedAnnotation": {
                    "content": {
                        "<label attribute name>": {
                            # ... label content
                        }
                    }
                }
            }
        ]

        Return doc: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-custom-templates-step3.html
    """
    # Project-local helper, imported here so that do_consolidation stays
    # importable (e.g. for unit tests) where s3_helper is not on the path.
    from s3_helper import S3Client

    # Event received
    print("Received event: " + json.dumps(event, indent=2))

    labeling_job_arn = event["labelingJobArn"]
    label_attribute_name = event["labelAttributeName"]

    # The event key is "labelCategories" (camelCase); it is null/absent for jobs
    # created from the console. It is a list, so stringify before logging.
    label_categories = event.get("labelCategories")
    if label_categories is not None:
        print(" Label Categories are : " + str(label_categories))

    payload = event["payload"]
    role_arn = event["roleArn"]

    # Output s3 location. You can choose to write your annotation to this
    # location (not used by this sample).
    output_config = event.get("outputConfig")

    # If you specified a KMS key in your labeling job, you can use the key to
    # write consolidated_output to the s3 location specified in outputConfig.
    kms_key_id = event.get("kmsKeyId")

    # Create s3 client object
    s3_client = S3Client(role_arn, kms_key_id)

    # Perform consolidation
    return do_consolidation(labeling_job_arn, payload, label_attribute_name, s3_client)


def do_consolidation(labeling_job_arn, payload, label_attribute_name, s3_client):
    """
    Core logic for consolidation.

    No real consolidation is performed: all worker responses for a data object
    are combined under "annotationsFromAllWorkers". A data object whose
    processing raises is logged, counted as a failure and left out of the
    result; it does not abort the remaining objects.

    :param labeling_job_arn: labeling job ARN (used only in log lines)
    :param payload: either a dict holding an "s3Uri" that points at the payload
                    JSON, or the already loaded list of data objects
    :param label_attribute_name: identifier for labels in output JSON
    :param s3_client: S3 helper exposing get_object_from_s3(uri)
    :return: list with one consolidation response dict per successfully
             processed data object
    """

    # Extract payload data
    if "s3Uri" in payload:
        s3_ref = payload["s3Uri"]
        payload = json.loads(s3_client.get_object_from_s3(s3_ref))
        print(payload)

    # Payload data contains a list of data objects.
    # Iterate over it to consolidate annotations for individual data object.
    consolidated_output = []
    success_count = 0  # Number of data objects that were successfully consolidated
    failure_count = 0  # Number of data objects that failed in consolidation

    for p, data_object in enumerate(payload):
        try:
            dataset_object_id = data_object['datasetObjectId']
            log_prefix = "[{}] data object id [{}] :".format(labeling_job_arn, dataset_object_id)
            print("{} Consolidating annotations BEGIN ".format(log_prefix))

            annotations = data_object['annotations']
            print("{} Received Annotations from all workers {}".format(log_prefix, annotations))

            # Iterate over annotations. Log all annotation to your CloudWatch logs
            for worker_annotation in annotations:
                worker_id = worker_annotation["workerId"]
                annotation_data = worker_annotation['annotationData']
                annotation_content = annotation_data.get('content')
                # The documented key is "s3Uri"; the lowercase spelling is still
                # accepted so older payloads keep working.
                annotation_s3_uri = annotation_data.get('s3Uri') or annotation_data.get('s3uri')
                if annotation_s3_uri is None:
                    annotation = annotation_content
                else:
                    annotation = s3_client.get_object_from_s3(annotation_s3_uri)
                annotation_from_single_worker = json.loads(annotation)

                print("{} Received Annotations from worker [{}] is [{}]"
                      .format(log_prefix, worker_id, annotation_from_single_worker))

            # Notice that, no consolidation is performed, worker responses are combined and appended to final output
            # You can put your consolidation logic here
            consolidated_annotation = {"annotationsFromAllWorkers": annotations}  # TODO : Add your consolidation logic

            # Build consolidation response object for an individual data object
            response = {
                "datasetObjectId": dataset_object_id,
                "consolidatedAnnotation": {
                    "content": {
                        label_attribute_name: consolidated_annotation
                    }
                }
            }

            success_count += 1
            print("{} Consolidating annotations END ".format(log_prefix))

            # Append individual data object response to the list of responses.
            consolidated_output.append(response)

        except Exception as exc:  # keep going with the remaining data objects
            failure_count += 1
            print(" Consolidation failed for dataobject {}".format(p))
            print(" Unexpected error: Consolidation failed. {}: {}".format(type(exc).__name__, exc))

    print("Consolidation Complete. Success Count {}  Failure Count {}".format(success_count, failure_count))

    print(" -- Consolidated Output -- ")
    print(consolidated_output)
    print(" ------------------------- ")
    return consolidated_output

# --------------------------------------------------------------------------------