/web/ --acl public-read"
18 | ```
19 |
- Now run the following commands to upload the project to the S3 bucket
```
yarn clean
yarn deploy
```
24 |
- Make sure to change the CSS and JS URLs to point to your S3 URLs in `public/template.html`
26 |
27 |
28 |
--------------------------------------------------------------------------------
/web/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "text-annotator",
3 | "version": "0.1.0",
4 | "private": true,
5 | "dependencies": {
6 | "bootstrap": "^4.2.1",
7 | "react": "^16.7.0",
8 | "react-dom": "^16.7.0",
9 | "react-native": "^0.57.8",
10 | "react-scripts": "2.1.3",
11 | "react-text-annotate": "^0.1.0"
12 | },
13 | "scripts": {
14 | "start": "react-scripts start",
15 | "build": "react-scripts build",
16 | "test": "react-scripts test",
17 | "eject": "react-scripts eject",
18 | "clean": "aws s3 rm --recursive s3://smgtannotation/web/",
19 | "deploy": "aws s3 cp --recursive build/ s3://smgtannotation/web/ --acl public-read"
20 | },
21 | "eslintConfig": {
22 | "extends": "react-app"
23 | },
24 | "browserslist": [
25 | ">0.2%",
26 | "not dead",
27 | "not ie <= 11",
28 | "not op_mini all"
29 | ]
30 | }
31 |
--------------------------------------------------------------------------------
/web/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {{ task.input.text }}
5 |
6 |
7 | {{ task.input.textObject }}
8 |
9 |
10 | {{ task.input.metadata }}
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | Submit
24 |
25 |
34 |
--------------------------------------------------------------------------------
/web/public/template.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | {{ task.input.text }}
8 |
9 |
10 | {{ task.input.taskObject | grant_read_access }}
11 |
12 |
13 | {{ task.input.metadata }}
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 | Submit
27 |
28 |
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/web/public/test_template.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | This is my Test
8 |
9 |
10 | https://s3.amazonaws.com/smgtannotation/raw-abstracts-jpgs/arXiv_1801_00067v2__astro-ph_GA__3_Apr_2018.jpg
11 |
12 |
13 | "{'Author': 'Jennifer Roberts', 'ISBN': '0-7970-7671-9'}"
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 | Submit
27 |
28 |
37 |
38 |
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/web/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | padding: 0;
4 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen",
5 | "Ubuntu", "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue",
6 | sans-serif;
7 | -webkit-font-smoothing: antialiased;
8 | -moz-osx-font-smoothing: grayscale;
9 | }
10 |
11 | code {
12 | font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New",
13 | monospace;
14 | }
15 |
16 | div.img_contain {
17 | overflow-y: auto;
18 | width: 100%;
19 | max-height: 100%;
20 | margin-left: 2%;
21 | margin-right: 2%;
22 | margin-top: 5%;
23 | margin-bottom: 5%;
24 | border-width: 2px;
25 | border-color: black;
26 | border-style: solid;
27 | background-color: white;
28 | }
29 |
img.page {
  max-width: 100%;
  /* was "max-length", which is not a CSS property; max-height is what was intended */
  max-height: 100%;
}
34 |
35 | div.controls {
36 | width:50%;
37 | display: table-cell;
38 | font-size: 0.75em;
39 | font-weight: bold;
40 | }
41 |
42 | button {
43 | width: 250px;
44 | height: 50px;
45 | background-color: white;
46 | vertical-align: top;
47 | font-size: 1.6em;
48 | font-weight: bold;
49 | border: 2px;
50 | border-style: solid;
51 | border-color: black;
52 | }
53 |
54 | button.yb {
55 | margin-right: 0.5em;
56 | width: 28%;
57 | height: 80%;
58 | background-color:#3DE63D;
59 | vertical-align: top;
60 | font-size: 1.8em;
61 | font-weight: bold;
62 | border: 2px;
63 | border-style: solid;
64 | border-color: black;
65 | }
66 |
67 | button.nb {
68 | width: 28%;
69 | height: 80%;
70 | margin-right: 0.5em;
71 | background-color: #FF4444;
72 | vertical-align: top;
73 | font-size: 1.8em;
74 | font-weight: bold;
75 | border: 2px;
76 | border-style: solid;
77 | border-color: black;
78 | }
79 |
80 | button.ud {
81 | width: 14%;
82 | height: 40%;
83 | background-color: rgb(151, 151, 151);
84 | vertical-align: bottom;
85 | font-size: 2em;
86 | font-weight: bolder;
87 | border: 2px;
88 | border-style: solid;
89 | border-color: black;
90 | }
91 |
--------------------------------------------------------------------------------
/server/data/manifest.json:
--------------------------------------------------------------------------------
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00006.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00006.jpg.csv", "metadata": {"Author": "Robert Underwood", "ISBN": "1-358-98355-0"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00015.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00015.jpg.csv", "metadata": {"Author": "Stephanie Morgan", "ISBN": "1-242-55362-2"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00040.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00040.jpg.csv", "metadata": {"Author": "Angela Anderson", "ISBN": "0-567-58708-8"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00041.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00041.jpg.csv", "metadata": {"Author": "Kenneth Stanley", "ISBN": "1-68939-208-8"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00052.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00052.jpg.csv", "metadata": {"Author": "Bruce Peck", "ISBN": "0-7126-2438-4"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00090.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00090.jpg.csv", "metadata": {"Author": "Kerry Phillips", "ISBN": "1-61581-532-5"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00114.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00114.jpg.csv", "metadata": {"Author": "Frederick Watson", "ISBN": "1-342-56153-8"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00128.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00128.jpg.csv", "metadata": {"Author": "Jennifer Gray", "ISBN": "0-10-489174-2"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/1801_00146.jpg", "text-file-s3-uri": "s3://smgtannotation/text/1801_00146.jpg.csv", "metadata": {"Author": "William Alexander", "ISBN": "0-683-11895-1"}}
{"source-ref": "s3://smgtannotation/raw-abstracts-jpgs/arXiv_1801_00067v2__astro-ph_GA__3_Apr_2018.jpg", "text-file-s3-uri": "s3://smgtannotation/text/arXiv_1801_00067v2__astro-ph_GA__3_Apr_2018.jpg.csv", "metadata": {"Author": "Sean Moore", "ISBN": "0-00-940460-0"}}
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Build your own custom labeling workflow using SageMaker Ground Truth
2 |
3 | Successful machine learning models are built on the shoulders of large volumes of high-quality training data, but the process to create the training data necessary to build these models is expensive, complicated, and time-consuming. The majority of models created today require a human to manually label data in a way that allows the model to learn how to make correct decisions.
4 |
5 | Amazon SageMaker Ground Truth provides built-in workflows for image classification, bounding boxes, text classification, and semantic segmentation use cases. You also have the option of building your own custom workflows where you define the user interface (UI) for performing data labeling. To help you move quickly, SageMaker provides you a number of commonly used custom UI templates for image, text, and audio data labeling use cases. These templates take advantage of SageMaker Ground Truth’s crowd HTML elements that are meant to simplify the process of building data labeling UIs. You can also specify your own arbitrary HTML for the UI.
6 |
You may need to build a custom workflow for various reasons, such as:
8 | - Your own custom data labeling requirements
9 | - Complex input consisting of multiple elements per task (e.g., images, text, or custom metadata)
10 | - Dynamic decision making on task input to prevent certain items from going to labelers
11 | - Custom logic for consolidating labeling output to improve labeling accuracy
12 |
13 |
In this blog post, we demonstrate a custom text annotation labeling workflow for building a labeled dataset for a natural language processing (NLP) problem.
15 |
16 |
#### Augmented Manifest
18 | server/data/manifest.json
19 | server/data/mini_manifest.json
20 |
21 | #### Script to extract text using Amazon Textract
22 | server/prep/detect_lines.py
23 |
24 | #### Script to create Manifest
25 | server/prep/prep_manifest.py
26 |
27 | #### Cloudformation script to deploy Lambda
28 | server/processing/cfn-template.json
29 |
30 | #### Pre and post labeling lambdas
31 | server/processing/sagemaker-gt-postprocess.py
32 | server/processing/sagemaker-gt-preprocess.py
33 |
34 | #### React Components
35 | web/README.md
36 | web/package.json
37 | web/src/App.css
38 | web/src/App.js
39 | web/src/App.test.js
40 | web/src/index.css
41 | web/src/index.js
42 | web/public/index.html
43 | web/public/manifest.json
44 | web/public/template.html
web/public/test_template.html
46 |
--------------------------------------------------------------------------------
/server/prep/prep_manifest.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import boto3
3 | import json
4 | from urllib.parse import urlparse
5 | from faker import Faker
6 |
7 | faker = Faker()
8 |
def prepare(s3_image_path, s3_data_path, s3_manifest_path):
    """Build a SageMaker Ground Truth augmented manifest and upload it to S3.

    For every image under ``s3_image_path`` that has a matching ``<image>.csv``
    text file under ``s3_data_path``, emit one JSON Lines entry with
    ``source-ref``, ``text-file-s3-uri`` and fake ``metadata``, then write the
    result to ``<s3_manifest_path>/manifest.json``.

    :param s3_image_path: s3:// URI of the prefix holding the images
    :param s3_data_path: s3:// URI of the prefix holding the text CSV files
    :param s3_manifest_path: s3:// URI of the prefix to write manifest.json to
    """
    image_url = urlparse(s3_image_path)
    data_url = urlparse(s3_data_path)
    output_url = urlparse(s3_manifest_path)

    s3 = boto3.client("s3")

    image_response = s3.list_objects(Bucket=image_url.netloc, Prefix=image_url.path[1:])
    text_response = s3.list_objects(Bucket=data_url.netloc, Prefix=data_url.path[1:])

    image_list = parse_response(image_response)
    text_file_list = parse_response(text_response)

    content_list = []

    for item in image_list:
        print(item)
        image_filename = item.split('/')[-1]
        text_filename = "{}.csv".format(image_filename)
        print("Trying to find {}/{}".format(data_url.path[1:], text_filename))
        # Only images that have a matching text file get a manifest entry.
        if "{}/{}".format(data_url.path[1:], text_filename) in text_file_list:
            print("Adding new Entry")
            entry = {
                'source-ref': "s3://{}/{}".format(image_url.netloc, item),
                'text-file-s3-uri': "s3://{}/{}/{}".format(data_url.netloc, data_url.path[1:], text_filename),
                'metadata': fake_metadata(),
            }
            print(entry)
            content_list.append(entry)

    print(content_list)
    # Serialize each entry with json.dumps so every line is valid JSON.
    # The original used str(dict), which produces single-quoted entries that
    # SageMaker Ground Truth cannot parse as an augmented manifest.
    content = "".join("{}\n".format(json.dumps(line)) for line in content_list)

    body = content.encode('utf-8')

    s3.put_object(Bucket=output_url.netloc, Key="{}/manifest.json".format(output_url.path[1:]), Body=body)
47 |
48 |
def parse_response(response):
    """Extract the keys of all non-empty objects from an S3 list_objects response.

    :param response: dict as returned by boto3 ``list_objects``; each entry in
        ``Contents`` must carry ``Key`` and ``Size``.
    :return: list of S3 keys whose objects have a size greater than zero
        (zero-byte "folder" placeholder objects are skipped)
    """
    keys = []
    for content in response['Contents']:
        # Size-0 objects are prefix placeholders, not real files.
        if content['Size'] > 0:
            print(content['Key'])
            keys.append(content['Key'])

    return keys
59 |
def fake_metadata():
    """Return a synthetic metadata record with a random author name and ISBN-10."""
    return {"Author": faker.name(), "ISBN": faker.isbn10()}
63 |
64 |
def main(args):
    """CLI entry point.

    :param args: ``sys.argv``-style list: ``[prog, s3_image_path, s3_data_path,
        s3_manifest_path]``
    :raises IndexError: if fewer than three S3 paths are supplied
    """
    # The original wrapped this in `try: ... except: raise`, which is a no-op;
    # exceptions propagate unchanged either way.
    s3_image_path = args[1]
    s3_data_path = args[2]
    s3_manifest_path = args[3]

    prepare(s3_image_path, s3_data_path, s3_manifest_path)


if __name__ == "__main__":
    main(sys.argv)
79 |
--------------------------------------------------------------------------------
/server/processing/s3_helper.py:
--------------------------------------------------------------------------------
1 | from botocore.exceptions import ClientError
2 | import boto3
3 |
4 |
class S3Client(object):
    """
    Helper Class for S3 operations
    """
    # Import-time defaults built from the ambient credentials; instances
    # overwrite these in __init__ when a role is assumed.
    s3_client = boto3.client("s3")
    s3 = boto3.resource("s3")

    def __init__(self, role_arn=None, kms_key_id=None):
        """
        Initialize the S3 resource using provided Role and Kms Key

        :param role_arn: Role which have access to consolidation request S3 payload file.
            When None, the default session credentials are used (the original
            code called sts:AssumeRole with RoleArn=None, which fails).
        :param kms_key_id: KMS key if S3 bucket is encrypted
        :return:
        """
        DEFAULT_SESSION = "Custom_Annotation_Consolidation_Lambda_Session"
        if role_arn is not None:
            sts_connection = boto3.client('sts')
            assume_role_object = sts_connection.assume_role(RoleArn=role_arn, RoleSessionName=DEFAULT_SESSION)
            session = boto3.Session(
                aws_access_key_id=assume_role_object['Credentials']['AccessKeyId'],
                aws_secret_access_key=assume_role_object['Credentials']['SecretAccessKey'],
                aws_session_token=assume_role_object['Credentials']['SessionToken'])
            self.s3 = session.resource('s3')
            self.s3_client = session.client('s3')
        self.kms_key_id = kms_key_id

    def put_object_to_s3(self, data, bucket, key, content_type):
        """
        Helper function to persist data in S3

        :param data: object body (bytes or str)
        :param bucket: destination S3 bucket name
        :param key: destination S3 key
        :param content_type: MIME type; falls back to application/octet-stream when falsy
        :return: s3:// URI of the stored object
        :raises ValueError: if the underlying put fails
        """
        try:
            if not content_type:
                # Default content type
                content_type = "application/octet-stream"
            image_object = self.s3.Object(bucket, key)
            if self.kms_key_id:
                # Encrypt server-side with the provided KMS key
                image_object.put(Body=data, ContentType=content_type, SSEKMSKeyId=self.kms_key_id,
                                 ServerSideEncryption="aws:kms")
            else:
                image_object.put(Body=data, ContentType=content_type)
        except ClientError as e:
            raise ValueError("Failed to put data in bucket: {} with key {}.".format(bucket, key), e)
        return "s3://" + image_object.bucket_name + "/" + image_object.key

    def get_object_from_s3(self, s3_url):
        """ Helper function to retrieve data from S3

        :param s3_url: s3:// URI of the object to fetch
        :return: object body decoded as UTF-8, or None when the key does not exist
        :raises ValueError: for any S3 error other than a missing key
        """
        bucket, path = S3Client.bucket_key_from_s3_uri(s3_url)

        try:
            payload = self.s3_client.get_object(Bucket=bucket, Key=path).get('Body').read().decode('utf-8')
        except ClientError as e:
            print(e)
            if e.response['Error']['Code'] == "404" or e.response['Error']['Code'] == 'NoSuchKey':
                return None
            else:
                raise ValueError("Failed to retrieve data from {}.".format(s3_url), e)

        return payload

    @staticmethod
    def bucket_key_from_s3_uri(s3_path):
        """ Return bucket and key from s3 URL

        Parameters
        ----------
        s3_path: str, required
            s3 URL of data object ( image/video/text/audio etc )

        Returns
        ------
        bucket: str
            S3 Bucket of the passed URL
        key: str
            S3 Key of the passed URL
        """
        path_parts = s3_path.replace("s3://", "").split("/")
        bucket = path_parts.pop(0)
        key = "/".join(path_parts)

        return bucket, key
85 |
--------------------------------------------------------------------------------
/server/processing/cfn-template.json:
--------------------------------------------------------------------------------
1 | { "AWSTemplateFormatVersion": "2010-09-09",
2 | "Description": "AWS CloudFormation to deploy Lambdas",
3 | "Resources": {
4 | "ConsolidationLambdaSMGTExecutionRole": {
5 | "Type": "AWS::IAM::Role",
6 | "Properties": {
7 | "AssumeRolePolicyDocument": {
8 | "Version": "2012-10-17",
9 | "Statement": [
10 | {
11 | "Effect": "Allow",
12 | "Principal": {
13 | "Service": ["lambda.amazonaws.com"]
14 | },
15 | "Action": ["sts:AssumeRole"]
16 | }
17 | ]
18 | },
19 | "ManagedPolicyArns": ["arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"],
20 | "Path": "/"
21 | }
22 | },
23 | "PreLabelingLambdaSMGTExecutionRole": {
24 | "Type": "AWS::IAM::Role",
25 | "Properties": {
26 | "AssumeRolePolicyDocument": {
27 | "Version": "2012-10-17",
28 | "Statement": [
29 | {
30 | "Effect": "Allow",
31 | "Principal": {
32 | "Service": ["lambda.amazonaws.com"]
33 | },
34 | "Action": ["sts:AssumeRole"]
35 | }
36 | ]
37 | },
38 | "ManagedPolicyArns": ["arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole","arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"],
39 | "Path": "/"
40 | }
41 | },
42 | "PreLabelTaskLambda": {
43 | "Type": "AWS::Lambda::Function",
44 | "DependsOn": [
45 | "PreLabelingLambdaSMGTExecutionRole"
46 | ],
47 | "Properties": {
48 | "Code": {
49 | "S3Bucket": "smgtannotation",
50 | "S3Key": "coderepo/labeling_lambda.zip"
51 | },
52 | "Role": {
53 | "Fn::GetAtt": ["PreLabelingLambdaSMGTExecutionRole", "Arn"]
54 | },
55 | "FunctionName": "gt-prelabel-task-lambda",
56 | "Timeout": 60,
57 | "Handler": "sagemaker-gt-preprocess.lambda_handler",
58 | "Runtime": "python3.6",
59 | "MemorySize": 128
60 | }
61 | },
62 | "PostLabelTaskLambda": {
63 | "Type": "AWS::Lambda::Function",
64 | "DependsOn": [
65 | "ConsolidationLambdaSMGTExecutionRole"
66 | ],
67 | "Properties": {
68 | "Code": {
69 | "S3Bucket": "smgtannotation",
70 | "S3Key": "coderepo/labeling_lambda.zip"
71 | },
72 | "Role": {
73 | "Fn::GetAtt": ["ConsolidationLambdaSMGTExecutionRole", "Arn"]
74 | },
75 | "FunctionName": "gt-postlabel-task-lambda",
76 | "Timeout": 60,
77 | "Handler": "sagemaker-gt-postprocess.lambda_handler",
78 | "Runtime": "python3.6",
79 | "MemorySize": 128
80 | }
81 | }
82 | },
83 | "Outputs" : {
84 | "PostLabelingLambdaIAMRole" : {
85 | "Description": "Post Labeling Consolidation Lambda IAM Role",
86 | "Value" : { "Fn::GetAtt" : [ "ConsolidationLambdaSMGTExecutionRole", "Arn" ]}
87 | }
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/server/processing/sagemaker-gt-preprocess.py:
--------------------------------------------------------------------------------
1 | import json
2 | import base64
3 | from urllib.parse import urlparse
4 | import boto3
5 |
6 |
def lambda_handler(event, context):
    """Pre-processing lambda (PreHumanTaskLambda) for custom SageMaker Ground
    Truth labeling jobs. SageMaker invokes it once per item to be labeled, and
    its output is merged with the custom UI template.

    Parameters
    ----------
    event: dict, required
        {
            "version": "2018-10-16",
            "labelingJobArn": <arn>,
            "dataObject": {
                "source-ref": "s3://.../awesome.jpg",    # or inline "source"
                "metadata": {...},                        # optional
                "text-file-s3-uri": "s3://.../file.csv"   # optional
            }
        }
        Event doc: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-custom-templates-step3.html
    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    output: dict
        {
            "taskInput": {
                "taskObject": <source or source-ref>,
                "metadata": <metadata>,         # only when present in input
                "text": <text file contents>    # only when text-file-s3-uri present
            },
            "isHumanAnnotationRequired": "true"  # "false" when no task object
        }
    """

    # Event received
    print("Received event: " + json.dumps(event, indent=2))

    data_object = event['dataObject']

    # "source" (inline content) takes precedence over "source-ref" (S3 URI).
    source = data_object.get('source')
    source_ref = data_object.get('source-ref')
    metadata = data_object.get('metadata')
    text_file_s3_uri = data_object.get('text-file-s3-uri')

    # if source field present, take that otherwise take source-ref
    task_object = source if source is not None else source_ref

    # Build response object
    output = {
        "taskInput": {
            "taskObject": task_object
        },
        "isHumanAnnotationRequired": "true"
    }

    if metadata is not None:
        # Pass metadata through so it is preserved in the labeling output
        output['taskInput']['metadata'] = metadata

    if text_file_s3_uri is not None:
        print(text_file_s3_uri)
        # Inline the text content so the UI template can render it directly
        output['taskInput']['text'] = getText(text_file_s3_uri)

    print(output)
    # If neither source nor source-ref specified, mark the annotation failed
    if task_object is None:
        print(" Failed to pre-process {} !".format(event["labelingJobArn"]))
        output["isHumanAnnotationRequired"] = "false"

    return output
85 |
86 |
def getText(s3uri):
    """Download an S3 object and return its body decoded as UTF-8.

    :param s3uri: s3://bucket/key URI of the text file
    :return: object contents as a string
    :raises: re-raises any boto3 error after logging the failing location
    """
    o = urlparse(s3uri)
    bucket = o.netloc
    key = o.path.lstrip('/')
    # (the original also created an unused boto3.client('s3') here)
    s3 = boto3.resource('s3')
    try:
        obj = s3.Object(bucket, key)
        return obj.get()['Body'].read().decode('utf8')
    except Exception:
        # Log which object failed; the original printed "does not exist"
        # for every error type, which was misleading.
        print("Failed to read s3://{}/{}".format(bucket, key))
        raise
104 |
105 |
106 |
--------------------------------------------------------------------------------
/server/prep/detect_lines.py:
--------------------------------------------------------------------------------
1 | import webbrowser, os
2 | import json
3 | import boto3
4 | import io
5 | import time
6 | from io import BytesIO
7 | import sys
8 | from pprint import pprint
9 | from urlparse import urlparse
10 |
# Module-level Textract client used by get_table_csv_results; pinned to
# us-east-1 via an explicit endpoint URL.
client = boto3.client(
    service_name='textract',
    region_name='us-east-1',
    endpoint_url='https://textract.us-east-1.amazonaws.com',
)
17 |
18 |
19 |
def get_rows_columns_map(table_result, blocks_map):
    """Map a Textract TABLE block to {row_index: {col_index: cell_text}}."""
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] != 'CELL':
                continue
            # setdefault creates the row dict on first sight of that row index
            row = rows.setdefault(cell['RowIndex'], {})
            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows
36 |
37 |
def get_text(result, blocks_map):
    """Concatenate the text of all WORD children of a Textract block.

    Each word is followed by a single trailing space; blocks with no
    'Relationships' entry yield the empty string.
    """
    words = []
    for relationship in result.get('Relationships', []):
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                block = blocks_map[child_id]
                if block['BlockType'] == 'WORD':
                    words.append(block['Text'] + ' ')
    return ''.join(words)
48 |
49 |
def get_table_csv_results(bucket, key):
    """Run asynchronous Textract text detection on s3://bucket/key and return
    the text of every detected LINE block, separated by blank lines.

    Polls the Textract job every 15 seconds until it leaves IN_PROGRESS.

    :param bucket: S3 bucket holding the document image
    :param key: S3 key of the document image
    :return: concatenated line text, or " NO Table FOUND " when no LINE blocks
    :raises Exception: if the Textract job finishes in a failed state
    """
    response = client.start_document_text_detection(DocumentLocation={"S3Object": {
        "Bucket": bucket,
        "Name": key}})

    jobid = response['JobId']

    job_response = client.get_document_text_detection(JobId=jobid)

    while job_response['JobStatus'] == 'IN_PROGRESS':
        time.sleep(15)
        job_response = client.get_document_text_detection(JobId=jobid)

    if job_response['JobStatus'] == 'SUCCEEDED' or job_response['JobStatus'] == 'PARTIAL_SUCCESS':
        blocks = job_response['Blocks']
    else:
        # The original raised the undefined name "exception" (a NameError);
        # raise a real exception carrying the failing status instead.
        raise Exception("Textract job {} failed with status {}".format(
            jobid, job_response['JobStatus']))

    table_blocks = []
    blocks_map = {}
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "LINE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return " NO Table FOUND "

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv_2(table, blocks_map, index + 1)
        csv += '\n\n'

    return csv
88 |
89 |
def generate_table_csv(table_result, blocks_map, table_index):
    """Render one Textract TABLE block as comma-terminated rows under a
    "Table: Table_<n>" header."""
    rows = get_rows_columns_map(table_result, blocks_map)

    # Header line, e.g. "Table: Table_1"
    csv = 'Table: {0}\n\n'.format('Table_' + str(table_index))

    for row_index, cols in rows.items():
        for col_index, text in cols.items():
            csv += '{},'.format(text)
        csv += '\n'

    csv += '\n\n\n'
    return csv
106 |
def generate_table_csv_2(table_result, blocks_map, table_index):
    """Return the raw text of a Textract LINE block.

    The original computed a "Line: ..." header and then immediately
    overwrote it; that dead code is removed. ``blocks_map`` and
    ``table_index`` are unused but kept so the signature stays
    interchangeable with generate_table_csv.
    """
    return table_result['Text']
118 |
def main(args):
    """Extract text from every image under the S3 prefix args[1] via Textract
    and write one "<image-name>.csv" per image to the S3 prefix args[2]."""
    input_loc = args[1]
    output_loc = args[2]

    # Normalize away a single trailing slash on either prefix.
    if input_loc.endswith('/'):
        input_loc = input_loc[:-1]

    if output_loc.endswith('/'):
        output_loc = output_loc[:-1]

    input_url = urlparse(input_loc)
    output_url = urlparse(output_loc)

    bucket = input_url.netloc
    key = input_url.path[1:]

    print(key)

    s3 = boto3.client('s3')
    response = s3.list_objects(Bucket=bucket, Prefix=key)

    for content in response['Contents']:
        # Skip zero-byte prefix placeholder objects.
        if content['Size'] > 0:
            print(content['Key'])
            file_name = content['Key']
            csv_content = get_table_csv_results(bucket, file_name)

            csv_file = os.path.basename(file_name)
            output_file = '{}.csv'.format(csv_file)

            # str.encode works on both Python 2 and 3; the original
            # bytes(csv_content) raises TypeError on Python 3 because no
            # encoding is given.
            body = csv_content.encode('utf-8')
            resp = s3.put_object(Bucket=output_url.netloc,
                                 Key="{}/{}".format(output_url.path[1:], output_file),
                                 Body=body)
            # Throttle between Textract jobs.
            time.sleep(5)


if __name__ == "__main__":
    main(sys.argv)
--------------------------------------------------------------------------------
/web/src/App.js:
--------------------------------------------------------------------------------
1 | import React, { Component } from 'react'
2 | import { TokenAnnotator } from 'react-text-annotate'
3 |
4 | const TEXT = document.querySelector('#document-text').innerText.trim();
5 | const IMAGE_URL = document.querySelector('#document-image').innerText.trim();
6 | const METADATA = document.querySelector('#metadata').innerText.trim();
7 |
8 | const TAG_COLORS = {
9 | BACKGROUND: '#84d2ff',
10 | METHODS: '#00ffa2',
11 | RESULTS: '#FFD700',
12 | CONCLUSIONS: '#ADFF2F',
13 | OBJECTIVES : '#B0E0E6',
14 | LIMITATIONS : '#FFE4E1'
15 | }
16 |
17 | const Card = ({ children }) => (
18 |
26 | {children}
27 |
28 | )
29 |
30 | class App extends Component {
31 | constructor(props) {
32 | super(props);
33 |
34 | this.state = {
35 | value: [],
36 | tag: 'BACKGROUND',
37 | claimResult: false,
38 | notes: '',
39 | metadata: METADATA,
40 | numPages: 1,
41 | pageNumber: 1,
42 | };
43 | }
44 |
45 | handleChange = value => {
46 | this.setState({ value });
47 | }
48 |
49 | handleTagChange = e => {
50 | this.setState({ tag: e.target.value });
51 | }
52 |
53 | handleYesBtn = () => {
54 | this.setState({ claimResult: true });
55 | }
56 |
57 | handleNoBtn = () => {
58 | this.setState({ claimResult: false });
59 | }
60 |
61 | handleNotes = e => {
62 | this.setState({ notes: e.target.value });
63 | }
64 |
65 | render() {
66 | return (
67 |
68 |
69 |
70 |
71 |
Instructions
72 |
The task can be completed with blank, or saved and returned to when time is available to make more
73 | progress.
74 | If there is evidence in the record to support or deny abstract quality, highlight it with the
75 | cursor and select Yes or No .
76 | Add any notes you have for each task in the Notes free text area.
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 | BACKGROUND
92 | METHODS
93 | RESULTS
94 | CONCLUSIONS
95 | OBJECTIVES
96 | LIMITATIONS
97 |
98 |
99 |
100 |
104 | ({
114 | ...span,
115 | tag: this.state.tag,
116 | color: TAG_COLORS[this.state.tag],
117 | })}
118 | />
119 |
120 |
121 |
122 |
123 |
Notes:
124 |
126 |
127 |
128 |
129 |
Is this a good Abstract?
130 |
131 | Yes
132 |
133 |
134 |
135 | No
136 |
137 |
138 |
139 |
140 |
141 |
{JSON.stringify(this.state, null, 2)}
142 |
143 | )
144 | }
145 | }
146 |
147 | export default App
148 |
--------------------------------------------------------------------------------
/server/processing/sagemaker-gt-postprocess.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from s3_helper import S3Client
4 |
5 |
def lambda_handler(event, context):
    """Annotation consolidation lambda for custom SageMaker Ground Truth
    labeling jobs: takes all worker responses for each item to be labeled and
    outputs a consolidated annotation.

    Parameters
    ----------
    event: dict, required
        {
            "version": "2018-10-16",
            "labelingJobArn": <arn>,
            "labelCategories": [...],  # null when job was created in the console
            "labelAttributeName": <str>,
            "roleArn": <str>,
            "payload": {"s3Uri": <uri>},
            "outputConfig": "s3://<bucket>/<prefix>",  # optional
            "kmsKeyId": <str>                           # optional
        }
        Event doc: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-custom-templates-step3.html
    context: object, required
        Lambda Context runtime methods and attributes
        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    consolidated_output: list of dict
        [{"datasetObjectId": ..., "consolidatedAnnotation": {"content": {...}}}]
        Return doc: https://docs.aws.amazon.com/sagemaker/latest/dg/sms-custom-templates-step3.html
    """

    # Event received
    print("Received event: " + json.dumps(event, indent=2))

    labeling_job_arn = event["labelingJobArn"]
    label_attribute_name = event["labelAttributeName"]

    label_categories = None
    # The incoming event key is "labelCategories"; the original tested for
    # "label_categories", so this branch never executed.
    if "labelCategories" in event:
        label_categories = event["labelCategories"]
        # str() because labelCategories is a list (or None), not a string.
        print(" Label Categories are : " + str(label_categories))

    payload = event["payload"]
    role_arn = event["roleArn"]

    output_config = None  # Output s3 location. You can choose to write your annotation to this location
    if "outputConfig" in event:
        output_config = event["outputConfig"]

    # If you specified a KMS key in your labeling job, you can use the key to write
    # consolidated_output to s3 location specified in outputConfig.
    kms_key_id = None
    if "kmsKeyId" in event:
        kms_key_id = event["kmsKeyId"]

    # Create s3 client object
    s3_client = S3Client(role_arn, kms_key_id)

    # Perform consolidation
    return do_consolidation(labeling_job_arn, payload, label_attribute_name, s3_client)
106 |
107 |
def do_consolidation(labeling_job_arn, payload, label_attribute_name, s3_client):
    """
    Core Logic for consolidation

    :param labeling_job_arn: labeling job ARN
    :param payload: payload data for consolidation; either the list of data
        objects itself or {"s3Uri": <uri>} pointing at that list
    :param label_attribute_name: identifier for labels in output JSON
    :param s3_client: S3 helper class
    :return: list of consolidation responses, one per successfully processed
        data object
    """

    # Extract payload data
    if "s3Uri" in payload:
        s3_ref = payload["s3Uri"]
        payload = json.loads(s3_client.get_object_from_s3(s3_ref))
        print(payload)

    # Payload data contains a list of data objects.
    # Iterate over it to consolidate annotations for individual data object.
    consolidated_output = []
    success_count = 0  # Number of data objects that were successfully consolidated
    failure_count = 0  # Number of data objects that failed in consolidation

    for p in range(len(payload)):
        response = None
        try:
            dataset_object_id = payload[p]['datasetObjectId']
            log_prefix = "[{}] data object id [{}] :".format(labeling_job_arn, dataset_object_id)
            print("{} Consolidating annotations BEGIN ".format(log_prefix))

            annotations = payload[p]['annotations']
            print("{} Received Annotations from all workers {}".format(log_prefix, annotations))

            # Iterate over annotations. Log all annotation to your CloudWatch logs
            for i in range(len(annotations)):
                worker_id = annotations[i]["workerId"]
                annotation_content = annotations[i]['annotationData'].get('content')
                # Key is "s3Uri" (capital U) per the event contract; the
                # original read "s3uri", so S3-referenced annotations were
                # silently treated as inline content of None.
                annotation_s3_uri = annotations[i]['annotationData'].get('s3Uri')
                annotation = annotation_content if annotation_s3_uri is None else s3_client.get_object_from_s3(
                    annotation_s3_uri)
                annotation_from_single_worker = json.loads(annotation)

                print("{} Received Annotations from worker [{}] is [{}]"
                      .format(log_prefix, worker_id, annotation_from_single_worker))

            # Notice that, no consolidation is performed, worker responses are combined and appended to final output
            # You can put your consolidation logic here
            consolidated_annotation = {"annotationsFromAllWorkers": annotations}  # TODO : Add your consolidation logic

            # Build consolidation response object for an individual data object
            response = {
                "datasetObjectId": dataset_object_id,
                "consolidatedAnnotation": {
                    "content": {
                        label_attribute_name: consolidated_annotation
                    }
                }
            }

            success_count += 1
            print("{} Consolidating annotations END ".format(log_prefix))

            # Append individual data object response to the list of responses.
            if response is not None:
                consolidated_output.append(response)

        except Exception:
            failure_count += 1
            print(" Consolidation failed for dataobject {}".format(p))
            print(" Unexpected error: Consolidation failed." + str(sys.exc_info()[0]))

    print("Consolidation Complete. Success Count {} Failure Count {}".format(success_count, failure_count))

    print(" -- Consolidated Output -- ")
    print(consolidated_output)
    print(" ------------------------- ")
    return consolidated_output
185 |
--------------------------------------------------------------------------------