├── .gitignore ├── static ├── neptune-ui.png ├── overview.png ├── gremlin-vertex-edge.png ├── neptune-creation-part1.png ├── neptune-creation-part2.png ├── neptune-creation-part3.png ├── notebook-creation-part01.png ├── notebook-creation-part02.png ├── notebook-creation-part1.png ├── notebook-creation-part2.png ├── notebook-creation-part3.png ├── notebook-creation-part4.png ├── notebook-creation-part5.png ├── notebook-creation-part6.png └── notebook-creation-part7.png ├── CODE_OF_CONDUCT.md ├── LICENSE ├── CONTRIBUTING.md ├── notebooks ├── part4-cleanup.ipynb ├── part1-rekognition.ipynb ├── part2-transcribe-comprehend.ipynb └── part0-setup.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/* 2 | tmp/* 3 | notebooks/.ipynb_checkpoints/* 4 | -------------------------------------------------------------------------------- /static/neptune-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/neptune-ui.png -------------------------------------------------------------------------------- /static/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/overview.png -------------------------------------------------------------------------------- /static/gremlin-vertex-edge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/gremlin-vertex-edge.png -------------------------------------------------------------------------------- /static/neptune-creation-part1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/neptune-creation-part1.png -------------------------------------------------------------------------------- /static/neptune-creation-part2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/neptune-creation-part2.png -------------------------------------------------------------------------------- /static/neptune-creation-part3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/neptune-creation-part3.png -------------------------------------------------------------------------------- /static/notebook-creation-part01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part01.png -------------------------------------------------------------------------------- /static/notebook-creation-part02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part02.png -------------------------------------------------------------------------------- /static/notebook-creation-part1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part1.png -------------------------------------------------------------------------------- /static/notebook-creation-part2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part2.png -------------------------------------------------------------------------------- /static/notebook-creation-part3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part3.png -------------------------------------------------------------------------------- /static/notebook-creation-part4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part4.png -------------------------------------------------------------------------------- /static/notebook-creation-part5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part5.png -------------------------------------------------------------------------------- /static/notebook-creation-part6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part6.png -------------------------------------------------------------------------------- /static/notebook-creation-part7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-video-metadata-knowledge-graph-workshop/HEAD/static/notebook-creation-part7.png -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 
8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /notebooks/part4-cleanup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 4 - Resources clean up\n", 8 | "\n", 9 | "This part of the lab is meant to help you clean up the resources we've created during set up notably.\n", 10 | "\n", 11 | "These are the resources that will get deleted by executing this notebook:\n", 12 | "- SNS policy\n", 13 | "- SNS topic\n", 14 | "- S3 bucket policy\n", 15 | "- S3 policy\n", 16 | "- the created S3 bucket (not yours) including all its content\n", 17 | "- IAM role that was used by Rekognition notably\n", 18 | "\n", 19 | "IMPORTANT: Please note that the Amazon Neptune GraphDB and the notebook instance you're using to run this code will NOT be deleted by this notebook. Please delete them if you're not planning to use it in order to not incur additional costs." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import boto3\n", 29 | "s3 = boto3.client('s3')\n", 30 | "sns = boto3.client('sns')\n", 31 | "iam = boto3.client(\"iam\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "#load stored variable from lab0 notebook\n", 41 | "%store -r" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## SNS Policy deletion" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "#detaching policy from role first\n", 58 | "iam.detach_role_policy(\n", 59 | " RoleName=role_name,\n", 60 | " PolicyArn=sns_policy_arn\n", 61 | ")\n", 62 | "#deleting the policy\n", 63 | "iam.delete_policy(PolicyArn=sns_policy_arn)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## SNS Topic deletion" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "sns.delete_topic(TopicArn=sns_topic_arn)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## S3 Policy deletion" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "#detaching policy from role first\n", 96 | "iam.detach_role_policy(\n", 97 | " RoleName=role_name,\n", 98 | " PolicyArn=s3_policy_arn\n", 99 | ")\n", 100 | "#deleting the policy\n", 101 | "iam.delete_policy(PolicyArn=s3_policy_arn)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## S3 
bucket deletion" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "#deleting all files within the input folder in the bucket\n", 118 | "response = s3.list_objects_v2(Bucket=bucket)\n", 119 | "for object in response['Contents']:\n", 120 | " print('Deleting', object['Key'])\n", 121 | " s3.delete_object(Bucket=bucket, Key=object['Key'])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "#delete the bucket\n", 131 | "s3.delete_bucket(Bucket=bucket)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## Role deletion" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "#delete role\n", 148 | "iam.delete_role(RoleName=role_name)" 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "instance_type": "ml.t3.medium", 154 | "kernelspec": { 155 | "display_name": "Python 3 (Data Science)", 156 | "language": "python", 157 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:ap-southeast-2:452832661640:image/datascience-1.0" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 3 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.7.10" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 5 174 | } 175 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Video metadata extraction and knowledge graph 2 | 3 | This repository contains a series of 4 jupyter notebooks demonstrating how AWS AI Services like Amazon Rekognition, Amazon Transcribe and Amazon Comprehend can help you extract valuable metadata from your video assets and store that information in a Graph database like Amazon Neptune for maximum query performance and flexibility. 4 | At the end of the workshop you'll typically be able to search for a specific label or entity and return a list of 1min video segments related to your search across your videos. 5 | 6 | To extract metadata from a video, we'll use a the following AWS AI services: 7 | - Amazon Rekognition to cut the video in scenes and detect label from the video itself 8 | - Amazon Transcribe to convert audio into text 9 | - Amazon Comprehend to extract entities and topics from the transcribed text via Topic Modelling and Named Entity recognition. 10 | 11 | The metadata related to the video, segments, scenes, entities, labels will be stored in Amazon Neptune. 12 | Amazon Neptune is a fully managed low latency graph database service that will allow us to store metadata as nodes (aka vertices) and branches (aka edges) to represent relationships between the nodes. 
13 | https://aws.amazon.com/neptune/ 14 | 15 | The diagram below summarises the workflow: 16 | ![workflow overview](./static/overview.png?raw=true "workflow overview") 17 | 18 | Topics addressed within the different notebooks: 19 | 20 | Part 0: 21 | Create the environment (S3 bucket, IAM roles/polices, SNS topic, etc) and upload your sample video 22 | 23 | Part 1: 24 | Use Amazon Rekognition to detect scenes and labels from your video 25 | 26 | Part 2: 27 | Use Amazon Transcribe and Amazon Comprehend to respectively transcibe audio to text and extract metadata (topics, Named Entities) from transcripts. 28 | 29 | Part 3: 30 | Store all the previously extracted metadata in Amazon Neptune and query the graph. 31 | 32 | Part 4: 33 | Resources clean-up 34 | 35 | 36 | ## Getting started 37 | 38 | To run those notebooks you'll need to create a jupyter notebook instance in sagemaker. 39 | 40 | In the AWS console, first make sure you're in the right region and search for Sagemaker. Write down the region as you'll later need to create your Amazon Neptune database in the same region. 41 | 42 | ![notebook creation](./static/notebook-creation-part01.png?raw=true "notebook-creation-part01") 43 | 44 | In Amazon Sagemaker, click on "Notebook instances". 45 | 46 | ![notebook creation](./static/notebook-creation-part02.png?raw=true "notebook-creation-part02") 47 | 48 | In the "Create notebook instance wizard", enter the following: 49 | 50 | ![notebook creation](./static/notebook-creation-part1.png?raw=true "notebook-creation-part1") 51 | 52 | For Permissions, click on "create a new role". 53 | 54 | ![notebook creation](./static/notebook-creation-part2.png?raw=true "notebook-creation-part2") 55 | 56 | Then specify an existing S3 bucket where you will later upload the .mp4 video sample to be used in the notebooks. 57 | 58 | ![notebook creation](./static/notebook-creation-part3.png?raw=true "notebook-creation-part3") 59 | 60 | In Network, specify a VPC, subnet and a security group. I used the default ones on my end. Write down the VPC name as you'll need to deploy your Amazon Neptune DB in the same VPC later in notebook part0 and make sure your security group allows traffic between the two. 61 | 62 | ![notebook creation](./static/notebook-creation-part4.png?raw=true "notebook-creation-part4") 63 | 64 | You can specify the git repo in the Git repositories section or do that later within the notebook. 65 | Then hit the "Create notebook instance" button. 66 | 67 | ![notebook creation](./static/notebook-creation-part5.png?raw=true "notebook-creation-part5") 68 | 69 | Once your instance's status is "InService", click on "Open JupyterLab". 70 | 71 | ![notebook creation](./static/notebook-creation-part6.png?raw=true "notebook-creation-part6") 72 | 73 | Navigate in the aws-video-metadata-knowledge-graph-workshop folder, double click on the part0-setup.ipynb notebook and when prompted to select a kernel, choose "conda_python3". You'll need to repeat this operation when opening the other notebooks. 74 | 75 | ![notebook creation](./static/notebook-creation-part7.png?raw=true "notebook-creation-part7") 76 | 77 | 78 | ## Costs 79 | 80 | Please note that you might incur costs by running those notebooks. Most of those AI services have free tier but depending on how much you've already used or depending on the size of the video assets you're using, it might go over the limit. 
81 | 82 | Finally, if you're not planning to use those resources anymore at the end of the workshop, don't forget to shutdown/delete your Amazon Neptune instance, your Sagemaker notebook instances and run the part4-cleanup notebook to delete all the other resources created throughout the notebooks (S3 buckets, IAM roles, SNS topics, etc). 83 | 84 | Before proceeding, please check the related services pricing pages: 85 | 86 | https://aws.amazon.com/transcribe/pricing/ 87 | 88 | https://aws.amazon.com/comprehend/pricing/ 89 | 90 | https://aws.amazon.com/rekognition/pricing/ 91 | 92 | https://aws.amazon.com/neptune/pricing/ 93 | 94 | 95 | ## Security 96 | 97 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 98 | 99 | ## License 100 | 101 | This library is licensed under the MIT-0 License. See the LICENSE file. 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /notebooks/part1-rekognition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PART 1 - Video metadata extraction with Amazon Rekognition\n", 8 | "In the following section of the workshop, we're going to extract metadata and labels from the video as well as identify the different scenes from the video, all of that via computer vision methods and more precisely via Amazon Rekognition.
\n", 9 | "
\n", 10 | "Amazon Rekognition is an AWS services that makes it easy to add image and video analysis to your applications using proven, highly scalable, deep learning technology that requires no machine learning expertise to use. With Amazon Rekognition, you can identify objects, people, text, scenes, and activities in images and videos, as well as detect any inappropriate content. Amazon Rekognition also provides highly accurate facial analysis and facial search capabilities that you can use to detect, analyze, and compare faces for a wide variety of user verification, people counting, and public safety use cases.\n", 11 | "\n", 12 | "https://docs.aws.amazon.com/rekognition/latest/dg/what-is.html" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "#load stored variable from previous notebooks\n", 22 | "%store -r" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "!pip install boto3" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import boto3\n", 41 | "from botocore.exceptions import ClientError\n", 42 | "import os\n", 43 | "import time\n", 44 | "\n", 45 | "import logging\n", 46 | "import sys\n", 47 | "logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',\n", 48 | " level=logging.INFO, stream=sys.stdout)\n", 49 | "log = logging.getLogger('knowledge-graph-logger')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Scene detection job\n", 57 | "\n", 58 | "Let's start by detecting the different \"scenes\" or \"shots\" from the video. This is notably useful to detect fix/black screens." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "The below function starts a segment detection job with predefined settings for confidence and coverage notably. We're choosing to identify both \"SHOT\" (=scenes) and \"TECHNICAL_CUE\" (e.g. black/fix screen).
" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "rek = boto3.client('rekognition')\n", 75 | "\n", 76 | "#Method calling the Rekognition segment detection api\n", 77 | "def startSegmentDetection(s3_bucket, name):\n", 78 | " min_Technical_Cue_Confidence = 80.0\n", 79 | " min_Shot_Confidence = 80.0\n", 80 | " max_pixel_threshold = 0.1\n", 81 | " min_coverage_percentage = 60\n", 82 | "\n", 83 | " response = rek.start_segment_detection(\n", 84 | " Video={\"S3Object\": {\"Bucket\": s3_bucket, \"Name\": name}},\n", 85 | " NotificationChannel={\n", 86 | " \"RoleArn\": role_arn,\n", 87 | " \"SNSTopicArn\": sns_topic_arn,\n", 88 | " },\n", 89 | " SegmentTypes=[\"TECHNICAL_CUE\", \"SHOT\"],\n", 90 | " Filters={\n", 91 | " \"TechnicalCueFilter\": {\n", 92 | " \"MinSegmentConfidence\": min_Technical_Cue_Confidence,\n", 93 | " },\n", 94 | " \"ShotFilter\": {\"MinSegmentConfidence\": min_Shot_Confidence},\n", 95 | " }\n", 96 | " )\n", 97 | " return response\n", 98 | " \n", 99 | "segment_detection_job = startSegmentDetection(bucket, os.path.join(s3_video_input_path, video_file))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "segment_detection_job_id = segment_detection_job['JobId']\n", 109 | "print(f\"Job Id: {segment_detection_job_id}\")\n", 110 | "\n", 111 | "#Grab the segment detection response\n", 112 | "SegmentDetectionOutput = rek.get_segment_detection(\n", 113 | " JobId=segment_detection_job_id\n", 114 | ")\n", 115 | "\n", 116 | "#Determine the state. If the job is still processing we will wait a bit and check again\n", 117 | "while(SegmentDetectionOutput['JobStatus'] == 'IN_PROGRESS'):\n", 118 | " time.sleep(5)\n", 119 | " print('.', end='')\n", 120 | " \n", 121 | " SegmentDetectionOutput = rek.get_segment_detection(\n", 122 | " JobId=segment_detection_job_id)\n", 123 | " \n", 124 | "#Once the job is no longer in progress we will proceed\n", 125 | "print(SegmentDetectionOutput['JobStatus'])" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Let's have a look at the output from the job" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "SegmentDetectionOutput['VideoMetadata']" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "SegmentDetectionOutput['AudioMetadata']" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "#looking at some of the segments/scenes. note that they will either be of type TECHNICAL_CUE or SHOT \n", 160 | "SegmentDetectionOutput['Segments'][:5]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "Let's park segment detection for the moment. we'll use those metadata in the Neptune related notebook when storing our data into the graph." 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Label detection in the video\n", 175 | "We're now using a different functionality of Amazon Rekognition: Label detection.
\n", 176 | "By running this job on the video, Rekognition will automatically labels objects, concepts, scenes, and actions in your images, and provides an associated confidence score." 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "#method starting a individual rekognition labeling job\n", 186 | "def start_label_detection(s3_bucket, name, roleArn, sns_topic_arn):\n", 187 | " response = rek.start_label_detection(\n", 188 | " Video={\"S3Object\": {\"Bucket\": s3_bucket, \"Name\": name}},\n", 189 | " NotificationChannel={\n", 190 | " \"RoleArn\": roleArn,\n", 191 | " \"SNSTopicArn\": sns_topic_arn,\n", 192 | " }\n", 193 | " )\n", 194 | " return response" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "Launching the label detection job." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "#launching it on our test video.\n", 211 | "label_detection_response = start_label_detection(bucket, os.path.join(s3_video_input_path, video_file), role_arn, sns_topic_arn)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "Monitoring when the job is finished" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "label_detection_job_id = label_detection_response['JobId']\n", 228 | "print(f\"Job Id: {label_detection_job_id}\")\n", 229 | "\n", 230 | "#Grab the segment detection response\n", 231 | "LabelDetectionOutput = rek.get_label_detection(\n", 232 | " JobId=label_detection_job_id\n", 233 | ")\n", 234 | "\n", 235 | "#Determine the state. If the job is still processing we will wait a bit and check again\n", 236 | "while(LabelDetectionOutput['JobStatus'] == 'IN_PROGRESS'):\n", 237 | " time.sleep(5)\n", 238 | " print('.', end='')\n", 239 | " \n", 240 | " LabelDetectionOutput = rek.get_label_detection(\n", 241 | " JobId=label_detection_job_id)\n", 242 | " \n", 243 | "#Once the job is no longer in progress we will proceed\n", 244 | "print(LabelDetectionOutput['JobStatus'])" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Let's have a look at the output from the label detection job" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "LabelDetectionOutput['Labels'][:5]" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "Again, let's park the label detection for the moment and export/store the outputs of those 2 jobs so that we can use it in part 3 for when we create our Graph." 
268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "%store SegmentDetectionOutput\n", 277 | "%store LabelDetectionOutput" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "instance_type": "ml.t3.medium", 290 | "kernelspec": { 291 | "display_name": "Python 3 (Base Python)", 292 | "language": "python", 293 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:ap-southeast-2:452832661640:image/python-3.6" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.6.13" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 4 310 | } 311 | -------------------------------------------------------------------------------- /notebooks/part2-transcribe-comprehend.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PART 2 - Text and Metadata extraction from the audio of the video file\n", 8 | "\n", 9 | "In the following section of the workshop, we're going to:\n", 10 | "- Transcribe the file's audio into text using Amazon Transcribe\n", 11 | "- Prepare the transcript data in the format expected by Amazon Comprehend\n", 12 | "- Run a topic modelling job using Amazon Comprehend to extract topics\n", 13 | "- Run an NER (Named Entity Recognition) job using Amazon Comprehend to extract names and entities (e.g. countries, places, etc)\n", 14 | "
" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "All those metadata will then be used alongside with metadata extracted via computer vision with Rekognition to populate our knowlege graph in part 3.\n", 22 | "
" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "#load stored variable from previous notebooks\n", 32 | "%store -r" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#installing a useful library to manipulate json object\n", 42 | "!pip install jsonlines\n", 43 | "!pip install pandas\n", 44 | "!pip install boto3" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Transcribe the file's audio into text\n", 52 | "Amazon Transcribe uses machine learning to recognize speech in audio and video files and transcribe that speech into text. Practical use cases for Amazon Transcribe include transcriptions of customer-agent calls and closed captions for videos.\n", 53 | "\n", 54 | "https://docs.aws.amazon.com/transcribe/latest/dg/transcribe-whatis.html" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import boto3\n", 64 | "import os\n", 65 | "import random\n", 66 | "import time\n", 67 | "import urllib\n", 68 | "import json\n", 69 | "import csv\n", 70 | "import tarfile\n", 71 | "import pandas as pd\n", 72 | "import jsonlines" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "transcribe = boto3.client('transcribe')\n", 82 | "\n", 83 | "#creating a unique name for the job\n", 84 | "transcribe_job_name = \"transcribe_job_knowledge_graph\" + str(random.randint(0, 100000))\n", 85 | "\n", 86 | "#s3 path to your video file\n", 87 | "transcribe_job_uri = \"s3://\" + os.path.join(bucket, s3_video_input_path, video_file)\n", 88 | "\n", 89 | "#starting the transcription job\n", 90 | "transcription_job = transcribe.start_transcription_job(\n", 91 | " TranscriptionJobName=transcribe_job_name,\n", 92 | " Media={'MediaFileUri': transcribe_job_uri},\n", 93 | " MediaFormat='mp4',\n", 94 | " LanguageCode='en-US',\n", 95 | " OutputBucketName=bucket\n", 96 | ")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Monitoring the job's completion" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "print(transcribe_job_name)\n", 113 | "while True:\n", 114 | " status = transcribe.get_transcription_job(TranscriptionJobName=transcribe_job_name)\n", 115 | " if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:\n", 116 | " break\n", 117 | " print(\".\", end='')\n", 118 | " time.sleep(5)\n", 119 | "print(status['TranscriptionJob']['TranscriptionJobStatus'])" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Download the transcript file from the s3 bucket" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "s3 = boto3.Session().resource('s3')\n", 136 | "\n", 137 | "#retrieving the transcript file URI\n", 138 | "s3_transcript_file_url = status['TranscriptionJob']['Transcript']['TranscriptFileUri']\n", 139 | "\n", 140 | "S3_transcript_file_name = s3_transcript_file_url.split('/')[-1]\n", 141 | "\n", 142 | "#local path where to store the transcript file\n", 143 | "local_transcribe_file_path = 
os.path.join(tmp_local_folder, S3_transcript_file_name)\n", 144 | "\n", 145 | "#downloading locally the transcript file\n", 146 | "s3.Bucket(bucket).Object(S3_transcript_file_name).download_file(local_transcribe_file_path)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "#loading the transcript object in memory\n", 156 | "transcribe_file = open(local_transcribe_file_path)\n", 157 | "transcribe_json_data = json.load(transcribe_file)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Let's have a look at the output. below is the itemised version of the transcript, word by word for the 5 first words." 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "transcript_items = transcribe_json_data['results']['items']\n", 174 | "transcript_items[:5]" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## Formating the transcript to be consumed by Amazon Comprehend for the following 2 jobs.\n", 182 | "The documentation explains that we can format the input CSV file in 2 ways. Either we provide one document per file or a file containing one document per line. We're going to pick the latter option." 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "We have different ways of splitting that text into \"blocks\" of words. One logical way of doing it could be to do it sentence by sentence.
\n", 190 | "We're choosing here to segment our text transcript by chunk of 1 minute.
\n", 191 | "Reason being that later we're going to attach video/audio metadata to 1 minute video segments in order to have a fine grained level of information on our video.
" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "#setting the size of our segments to 1 minute\n", 201 | "segment_size_ms = 60000" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "The following function is using the timestamp from each item to break the whole transcript into 1min chunks." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "def prepare_transcribed_text_for_topic_modelling(transcript_items, segment_size_ms=60000):\n", 218 | "\n", 219 | " #initiatlising current segment with segment size\n", 220 | " current_segment_end = segment_size_ms\n", 221 | " sentence_list_per_segment = []\n", 222 | " buffer_sentence = []\n", 223 | " for item in transcript_items:\n", 224 | " \n", 225 | " #filter on pronunciation, ignoring punctuation for the moment\n", 226 | " type_ = item['type']\n", 227 | " if type_ == 'pronunciation':\n", 228 | " start = float(item['start_time']) * 1000\n", 229 | " end = float(item['end_time']) * 1000\n", 230 | " content = item['alternatives'][0]['content']\n", 231 | " \n", 232 | " # splitting text across the different segments\n", 233 | " if start <= current_segment_end :\n", 234 | " buffer_sentence.append(content)\n", 235 | " else:\n", 236 | " if (len(buffer_sentence) > 0):\n", 237 | " #appending \"\\r\\n\" at the end of each line - requirement from comprehend\n", 238 | " sentence_list_per_segment.append(' '.join(buffer_sentence))\n", 239 | " buffer_sentence = []\n", 240 | " current_segment_end += segment_size_ms\n", 241 | " \n", 242 | " #flush the buffer at the end\n", 243 | " if (len(buffer_sentence) > 0):\n", 244 | " sentence_list_per_segment.append(' '.join(buffer_sentence))\n", 245 | " \n", 246 | " return sentence_list_per_segment" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "#getting the transcript in the right format\n", 256 | "video_transcript = prepare_transcribed_text_for_topic_modelling(transcript_items, segment_size_ms)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "We're now writing the transcript in csv format in S3 to be consumed by Comprehend" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "#writing the transcript in csv format in S3 to be consumed by Comprehend\n", 273 | "def write_list_to_csv(local_file_path, rows, bucket, path):\n", 274 | " filename = local_file_path.split('/')[-1]\n", 275 | " #create file locally\n", 276 | " with open(local_file_path, 'w+') as f:\n", 277 | " write = csv.writer(f)\n", 278 | " for row in rows:\n", 279 | " write.writerow([row])\n", 280 | " #upload to S3\n", 281 | " boto3.resource('s3').Bucket(bucket).Object(os.path.join(path, filename)).upload_file(local_file_path)\n", 282 | " print(f\"{filename} uploaded to s3://{bucket}/{path}/{filename}\")" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "transcript_filename = 'video_transcript.csv'\n", 292 | "s3_comprehend_input_path = 'comprehend-input'\n", 293 | "\n", 294 | "write_list_to_csv(os.path.join(tmp_local_folder, transcript_filename), \n", 295 | " 
video_transcript, \n", 296 | " bucket, \n", 297 | " s3_comprehend_input_path)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "We're just checking the number of lines in the file we just created which should correspond to the duration of our video in minutes." 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "num_lines = sum(1 for line in open(os.path.join(tmp_local_folder, transcript_filename)))\n", 314 | "print(f'Number of lines in our file: {num_lines}')" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "# Comprehend - Topic detection\n", 322 | "We're now ready to launch the first job.\n", 323 | "\n", 324 | "You can use Amazon Comprehend to examine the content of a collection of documents to determine common themes. For example, you can give Amazon Comprehend a collection of news articles, and it will determine the subjects, such as sports, politics, or entertainment. The text in the documents doesn't need to be annotated.\n", 325 | "\n", 326 | "Amazon Comprehend uses a Latent Dirichlet Allocation-based learning model to determine the topics in a set of documents. It examines each document to determine the context and meaning of a word. The set of words that frequently belong to the same context across the entire document set make up a topic.\n", 327 | "\n", 328 | "https://docs.aws.amazon.com/comprehend/latest/dg/topic-modeling.html" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "comprehend = boto3.client('comprehend')\n", 338 | "\n", 339 | "s3_output_data_comprehend = os.path.join(\"s3://\", bucket, 'comprehend-tm-output')\n", 340 | "s3_input_data_comprehend = os.path.join(\"s3://\", bucket, s3_comprehend_input_path)\n", 341 | "\n", 342 | "#note that we're setting the number of topics to 15\n", 343 | "response = comprehend.start_topics_detection_job(\n", 344 | " InputDataConfig={\n", 345 | " 'S3Uri': s3_input_data_comprehend,\n", 346 | " 'InputFormat': 'ONE_DOC_PER_LINE'\n", 347 | " },\n", 348 | " OutputDataConfig={\n", 349 | " 'S3Uri': s3_output_data_comprehend,\n", 350 | " },\n", 351 | " DataAccessRoleArn=role_arn,\n", 352 | " JobName='comprehend_job_knowledge_graph_' + str(random.randint(0,100000)),\n", 353 | " NumberOfTopics=15\n", 354 | ")\n" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "Monitoring the progress of the job" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "while True:\n", 371 | " status = comprehend.describe_topics_detection_job(JobId=response['JobId'])\n", 372 | " if status['TopicsDetectionJobProperties']['JobStatus'] in ['COMPLETED', 'FAILED']:\n", 373 | " break\n", 374 | " print(\".\", end='')\n", 375 | " time.sleep(10)\n", 376 | "print(comprehend.describe_topics_detection_job(JobId=response['JobId'])['TopicsDetectionJobProperties']['JobStatus'])" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "After Amazon Comprehend processes your document collection, it returns a compressed archive containing two files, topic-terms.csv and doc-topics.csv. \n", 384 | "\n", 385 | "The first output file, topic-terms.csv, is a list of topics in the collection. 
For each topic, the list includes by default the top terms by topic according to their weight. \n", 386 | "\n", 387 | "The second file, doc-topics.csv, lists the documents associated with a topic and the proportion of the document that is concerned with the topic. If you specified ONE_DOC_PER_FILE, the document is identified by the file name. If you specified ONE_DOC_PER_LINE (like in our case), the document is identified by the file name and the 0-indexed line number within the file. " 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "### Download and extract the comprehend topic detection output" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "#function to extract a tar file\n", 404 | "def extract(tar_file, path):\n", 405 | " opened_tar = tarfile.open(tar_file)\n", 406 | " \n", 407 | " if tarfile.is_tarfile(tar_file):\n", 408 | " opened_tar.extractall(path)\n", 409 | " return path\n", 410 | " else:\n", 411 | " print(\"The tar file you entered is not a tar file\")\n", 412 | "\n", 413 | "#download\n", 414 | "def download_and_extract_comprehend_job_output(output_s3_uri, dl_path):\n", 415 | " s3_bucket = output_s3_uri.split('/')[2]\n", 416 | " s3_file_path = '/'.join(output_s3_uri.split('/', 3)[3:])\n", 417 | " local_file_path = os.path.join(dl_path, output_s3_uri.split('/')[-1])\n", 418 | "\n", 419 | " boto3.resource('s3').Bucket(s3_bucket).Object(s3_file_path).download_file(local_file_path)\n", 420 | " return extract(local_file_path, dl_path)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "topics_output_s3_uri = comprehend.describe_topics_detection_job(JobId=response['JobId'])['TopicsDetectionJobProperties']['OutputDataConfig']['S3Uri']\n", 430 | "\n", 431 | "job_comprehend_output_folder = download_and_extract_comprehend_job_output(topics_output_s3_uri, tmp_local_folder)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "Looking into the 2 output files and loading this into dataframes for later use." 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "topics_file = 'doc-topics.csv'\n", 448 | "topic_terms_file = 'topic-terms.csv'\n", 449 | "comprehend_topics_df = pd.read_csv(os.path.join(tmp_local_folder, topics_file))\n", 450 | "comprehend_terms_df = pd.read_csv(os.path.join(tmp_local_folder, topic_terms_file))" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "Displaying the 5 first documents and their topics" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "comprehend_topics_df.head(5)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "Displaying top 10 words for topic 1. This will give us an idea of what this topic is about. Remember that topic modelling is not outputing a specific label but instead an unlabeled topic or grouping of documents for which we have a list of prominent words and their weight/importance." 
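The next cell filters the terms for a single topic. Going one step further, the two outputs can also be joined so that every 1-minute transcript line carries the top terms of its dominant topics, which is handy when building the graph in part 3. The sketch below is not part of the original notebook and assumes the column names produced by Comprehend (docname/topic/proportion and topic/term/weight).

```python
# Sketch (not in the original notebook): attach each topic's top terms to the
# 1-minute transcript lines, using the two dataframes loaded above.
top_terms = (comprehend_terms_df.sort_values('weight', ascending=False)
             .groupby('topic')['term']
             .apply(lambda terms: ', '.join(terms.head(5)))
             .rename('top_terms'))

segment_topics = comprehend_topics_df.merge(top_terms.reset_index(), on='topic')

# With ONE_DOC_PER_LINE, docname is "<file>:<line>", so the line index maps to the minute
segment_topics['minute'] = segment_topics['docname'].str.split(':').str[-1].astype(int)
segment_topics.head()
```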
474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "comprehend_terms_df[comprehend_terms_df['topic'] == 1]" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Comprehend NER Named Entity Recognition\n", 490 | "\n", 491 | "We're now looking at extracting Named entities from the video's transcript, still using Amazon Comprehend.\n", 492 | "\n", 493 | "An entity is a textual reference to the unique name of a real-world object such as people, places, and commercial items, and to precise references to measures such as dates and quantities.\n", 494 | "\n", 495 | "https://docs.aws.amazon.com/comprehend/latest/dg/how-entities.html" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "response_NER = comprehend.start_entities_detection_job(\n", 505 | " InputDataConfig={\n", 506 | " 'S3Uri': s3_input_data_comprehend,\n", 507 | " 'InputFormat': 'ONE_DOC_PER_LINE'\n", 508 | " },\n", 509 | " OutputDataConfig={\n", 510 | " 'S3Uri': s3_output_data_comprehend,\n", 511 | " },\n", 512 | " LanguageCode='en',\n", 513 | " DataAccessRoleArn=role_arn,\n", 514 | " JobName='comprehend_job_knowledge_graph_NER' + str(random.randint(0,100000)),\n", 515 | ")" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "while True:\n", 525 | " status_NER = comprehend.describe_entities_detection_job(JobId=response_NER['JobId'])\n", 526 | " if status_NER['EntitiesDetectionJobProperties']['JobStatus'] in ['COMPLETED', 'FAILED']:\n", 527 | " break\n", 528 | " print(\".\", end='')\n", 529 | " time.sleep(10)\n", 530 | "print(comprehend.describe_entities_detection_job(JobId=response_NER['JobId'])['EntitiesDetectionJobProperties']['JobStatus'])" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "#retrieving the outputs of our NER job\n", 540 | "ner_output_s3_uri = comprehend.describe_entities_detection_job(JobId=response_NER['JobId'])['EntitiesDetectionJobProperties']['OutputDataConfig']['S3Uri']\n", 541 | "job_comprehend_output_folder = download_and_extract_comprehend_job_output(ner_output_s3_uri, tmp_local_folder)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "Let's look into the output of the NER job. As you can see we've got different types of entities: PERSON, DATE, QUANTITY, LOCATION, ORGANIZATION, OTHERS." 
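The next two cells load the raw NER output. In practice the entity list for a given line can be noisy, so a small optional helper like the one sketched below (not part of the original notebook) can keep only confident, de-duplicated entities per 1-minute segment before they go into the graph.

```python
# Optional sketch (not in the original notebook): keep only confident entities
# per transcript line and drop duplicates of the same (Type, Text) pair.
def confident_entities(entities, min_score=0.8):
    seen, kept = set(), []
    for ent in entities:
        key = (ent['Type'], ent['Text'].lower())
        if ent['Score'] >= min_score and key not in seen:
            seen.add(key)
            kept.append(ent)
    return kept

# Example once ner_job_data is loaded below:
# confident_entities(ner_job_data[0])
```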
549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "ner_job_data = []\n", 558 | "with jsonlines.open(os.path.join(tmp_local_folder, 'output')) as ner_json_reader:\n", 559 | " for obj in ner_json_reader:\n", 560 | " ner_job_data.append(obj['Entities'])" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "ner_job_data[0]" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": {}, 575 | "source": [ 576 | "Let's store that aside for it to be used in part 3 of the workshop" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "%store segment_size_ms\n", 586 | "%store comprehend_terms_df\n", 587 | "%store comprehend_topics_df\n", 588 | "%store ner_job_data" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [] 597 | } 598 | ], 599 | "metadata": { 600 | "instance_type": "ml.t3.medium", 601 | "kernelspec": { 602 | "display_name": "Python 3 (Base Python)", 603 | "language": "python", 604 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:ap-southeast-2:452832661640:image/python-3.6" 605 | }, 606 | "language_info": { 607 | "codemirror_mode": { 608 | "name": "ipython", 609 | "version": 3 610 | }, 611 | "file_extension": ".py", 612 | "mimetype": "text/x-python", 613 | "name": "python", 614 | "nbconvert_exporter": "python", 615 | "pygments_lexer": "ipython3", 616 | "version": "3.6.13" 617 | } 618 | }, 619 | "nbformat": 4, 620 | "nbformat_minor": 4 621 | } 622 | -------------------------------------------------------------------------------- /notebooks/part0-setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved.\n", 8 | "\n", 9 | "SPDX-License-Identifier: MIT-0" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Video metadata extraction and knowledge graph workshop" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Objectives:\n", 24 | "This repository contains a series of 4 Jupyter notebooks demonstrating how AWS AI services like Amazon Rekognition, Amazon Transcribe and Amazon Comprehend can help you extract valuable metadata from your video assets and store that information in a graph database like Amazon Neptune for maximum query performance and flexibility.\n", 25 | "At the end of the workshop you'll be able to search for a specific label or entity and get back a list of 1-minute video segments related to your search across your videos.\n", 26 | "\n", 27 | "To extract metadata from a video, we'll use the following AWS AI services:\n", 28 | "- Amazon Rekognition to cut the video into scenes and detect labels from the video itself\n", 29 | "- Amazon Transcribe to convert audio into text\n", 30 | "- Amazon Comprehend to extract entities and topics from the transcribed text via Topic Modelling and Named Entity Recognition.\n", 31 | "\n", 32 | "The metadata related to the video, segments, scenes, entities and labels will be stored in Amazon Neptune.\n", 33 | "Amazon Neptune is a fully managed, low-latency graph database service that lets us store metadata as nodes (aka vertices) and relationships (aka edges) between those nodes.\n", 34 | "https://aws.amazon.com/neptune/\n", 35 | "\n", 36 | "The diagram below summarises the workflow:\n", 37 | "\n", 38 | "![Overall workflow](../static/overview.png \"Overall workflow\")\n", 39 | "\n", 40 | "Topics addressed within the different notebooks:\n", 41 | "\n", 42 | "Part 0:
\n", 43 | "Create the environment (S3 bucket, IAM roles/polices, SNS topic, etc) and upload your sample video\n", 44 | "\n", 45 | "Part 1:
\n", 46 | "Use Amazon Rekognition to detect scenes and labels from your video\n", 47 | "\n", 48 | "Part 2:
\n", 49 | "Use Amazon Transcribe and Amazon Comprehend to respectively transcibe audio to text and extract metadata (topics, Named Entities) from transcripts.\n", 50 | "\n", 51 | "Part 3:
\n", 52 | "Store all the previously extracted metadata in Amazon Neptune and query the graph.\n", 53 | "\n", 54 | "Part 4:
\n", 55 | "Resources clean-up" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Costs\n", 63 | "Please note that you might incur costs by running those notebooks. Most of those AI services have free tier but depending on how much you've already used or depending on the size of the video assets you're using, it might go over the limit.\n", 64 | "\n", 65 | "Finally, if you're not planning to use those resources anymore at the end of the workshop, don't forget to shutdown/delete your Amazon Neptune instance, your Sagemaker studio notebook instances and run the part4-cleanup notebook to delete all the other resources created throughout the notebooks (S3 buckets, IAM roles, SNS topics, etc).\n", 66 | "\n", 67 | "Before proceeding, please check the related services pricing pages:\n", 68 | "\n", 69 | "https://aws.amazon.com/transcribe/pricing/\n", 70 | "\n", 71 | "https://aws.amazon.com/comprehend/pricing/\n", 72 | "\n", 73 | "https://aws.amazon.com/rekognition/pricing/\n", 74 | "\n", 75 | "https://aws.amazon.com/neptune/pricing/" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "# Part 0 - Environment setup - S3 Bucket creation, SNS topic and IAM role" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "In the steps below we're going to create the S3 bucket where we'll upload our video, the SNS topic that some AWS services will use to publish outcomes of the jobs as well as the required policies/roles for the various AWS services to access those objects.
\n", 90 | "\n", 91 | "Please note that you will need to provide an valid .mp4 video stored in a S3 bucket as input for this workshop. It is NOT included in the github repo assets. \n", 92 | "\n", 93 | "This video will be used for the different metadata extraction steps. We suggest you use ~5min editorial video or video trailer for which you have the required copyrights.\n", 94 | "\n", 95 | "The example we used to run the various jobs and generate the graphs is a video trailer from an Amazon Studios production." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "!pip install boto3\n", 105 | "!pip install sagemaker" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "import boto3\n", 115 | "import sagemaker\n", 116 | "import random\n", 117 | "import json\n", 118 | "import time\n", 119 | "import os\n", 120 | "import shutil\n", 121 | "import logging\n", 122 | "import sys\n", 123 | "logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',\n", 124 | " level=logging.INFO, stream=sys.stdout)\n", 125 | "log = logging.getLogger('knowledge-graph-logger')\n", 126 | "\n", 127 | "s3 = boto3.client('s3')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "IMPORTANT:
\n", 135 | "\n", 136 | "Make sure before you start executing this notebook that the execution role you've configured for your notebook or studio instance has the following permissions:\n", 137 | "- read/write permission to your S3 buckets\n", 138 | "- IAM permission to create the policy/role\n", 139 | "- SNS permission to create a SNS topic\n", 140 | "- permissions to invoke Amazon Rekognition, Amazon Comprehend, Amazon Transcribe APIs (e.g. AmazonRekognitionFullAccess, ComprehendFullAccess, AmazonTranscribeFullAccess)\n", 141 | "\n", 142 | "You'll get \"AuthorizationErrorException\" messages otherwise." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "iam = boto3.client(\"iam\")\n", 152 | "\n", 153 | "#get sagemaker execution role Arn\n", 154 | "sagemaker_role = sagemaker.get_execution_role()\n", 155 | "\n", 156 | "#get the role's name\n", 157 | "sagemaker_role_name = sagemaker_role.split('/')[-1]\n", 158 | "\n", 159 | "print(f'sagemaker role name:{sagemaker_role_name} \\n')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "The below cell will list all managed iam policies associated with your sagemaker execution role. Check that it has the required permission before proceeding. Note that this cell will not run if your sagemaker execution role doesn't have the required IAM rights." 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "#retrieve associated managed iam policies\n", 176 | "paginator = iam.list_attached_role_policies(RoleName=sagemaker_role_name)\n", 177 | "\n", 178 | "#listing\n", 179 | "for policy in paginator['AttachedPolicies']:\n", 180 | " print(policy)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "### SNS topic creation\n", 188 | "We're creating a simple topic that will later be used by Amazon Rekognition notably to publish the outcome/status of the video analysis jobs." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "sns = boto3.client('sns')\n", 198 | "\n", 199 | "def create_sns_topic(name):\n", 200 | " try:\n", 201 | " topic = sns.create_topic(Name=name)\n", 202 | " except:\n", 203 | " log.exception(\"Couldn't create topic %s.\", name)\n", 204 | " raise\n", 205 | " else:\n", 206 | " return topic['TopicArn']\n", 207 | " \n", 208 | "sns_topic_arn = create_sns_topic('knowledge-graph-lab-rek-sns-topic')\n", 209 | "\n", 210 | "print(sns_topic_arn)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "### S3 bucket creation" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "Amazon S3 bucket names are globally unique. To create a unique bucket name, we're appending your account ID and a random int at the end of the bucket name." 
227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "region = 'ap-southeast-2' #specify the region of your choice\n", 234 | "\n", 235 | "#retrieving your account ID\n", 236 | "account_id = boto3.client('sts').get_caller_identity().get('Account')\n", 237 | "\n", 238 | "#bucket name\n", 239 | "bucket = 'sagemaker-knowledge-graph-' + region + '-' + account_id + '-' + str(random.randint(0,100000))\n", 240 | "\n", 241 | "log.info(f'bucket name: {bucket}')\n", 242 | "\n", 243 | "#create the bucket\n", 244 | "s3.create_bucket(\n", 245 | "    Bucket=bucket,\n", 246 | "    CreateBucketConfiguration={'LocationConstraint': region}\n", 247 | "    )" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "The bucket is now created." 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "### Create the S3 bucket policy\n", 262 | "Amazon Rekognition, Transcribe and Comprehend need to be able to read from (and write results to) your S3 bucket, so we add a bucket policy that allows that." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "s3_bucket_policy = {\n", 272 | "    \"Version\": \"2012-10-17\",\n", 273 | "    \"Id\": \"KnowledgeGraphS3BucketAccessPolicy\",\n", 274 | "    \"Statement\": [\n", 275 | "        {\n", 276 | "            \"Sid\": \"KnowledgeGraphS3BucketAccessPolicy\",\n", 277 | "            \"Effect\": \"Allow\",\n", 278 | "            \"Principal\": {\n", 279 | "                \"Service\": [\"rekognition.amazonaws.com\",\n", 280 | "                            \"transcribe.amazonaws.com\",\n", 281 | "                            \"comprehend.amazonaws.com\"]\n", 282 | "            },\n", 283 | "            \"Action\": [\n", 284 | "                \"s3:GetObject\",\n", 285 | "                \"s3:ListBucket\",\n", 286 | "                \"s3:PutObject\"\n", 287 | "            ],\n", 288 | "            \"Resource\": [\n", 289 | "                \"arn:aws:s3:::{}\".format(bucket),\n", 290 | "                \"arn:aws:s3:::{}/*\".format(bucket)\n", 291 | "            ]\n", 292 | "        }\n", 293 | "    ]\n", 294 | "}\n", 295 | "\n", 296 | "s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(s3_bucket_policy));" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "### IAM Role creation" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "We create the role that Amazon Rekognition, Comprehend and Transcribe will assume to run their jobs."
311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "role_name = account_id+\"-knowledgeGraphLab\"\n", 320 | "\n", 321 | "assume_role_policy_document = {\n", 322 | "    \"Version\": \"2012-10-17\",\n", 323 | "    \"Statement\": [\n", 324 | "        {\n", 325 | "            \"Effect\": \"Allow\",\n", 326 | "            \"Principal\": {\n", 327 | "                \"Service\": [\"rekognition.amazonaws.com\",\n", 328 | "                            \"transcribe.amazonaws.com\",\n", 329 | "                            \"comprehend.amazonaws.com\"]\n", 330 | "            },\n", 331 | "            \"Action\": \"sts:AssumeRole\"\n", 332 | "        }\n", 333 | "    ]\n", 334 | "}\n", 335 | "\n", 336 | "try:\n", 337 | "    create_role_response = iam.create_role(\n", 338 | "        RoleName = role_name,\n", 339 | "        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)\n", 340 | "    );\n", 341 | "    \n", 342 | "except iam.exceptions.EntityAlreadyExistsException as e:\n", 343 | "    print('Warning: role already exists:', e)\n", 344 | "    create_role_response = iam.get_role(\n", 345 | "        RoleName = role_name\n", 346 | "    );\n", 347 | "\n", 348 | "role_arn = create_role_response[\"Role\"][\"Arn\"]\n", 349 | "\n", 350 | "# Pause to allow role to be fully consistent\n", 351 | "time.sleep(10)\n", 352 | "\n", 353 | "print('IAM Role: {}'.format(role_arn))" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "\n",
361 | "We create two policies, one for S3 and one for SNS, and attach them to the role we created above.\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "s3_policy = {\n", 371 | "    \"Version\": \"2012-10-17\",\n", 372 | "    \"Statement\": [\n", 373 | "        {\n", 374 | "            \"Effect\": \"Allow\",\n", 375 | "            \"Action\": [\n", 376 | "                \"s3:GetObject\",\n", 377 | "                \"s3:ListBucket\",\n", 378 | "                \"s3:PutObject\"\n", 379 | "            ],\n", 380 | "            \"Resource\": [\n", 381 | "                \"arn:aws:s3:::{}\".format(bucket),\n", 382 | "                \"arn:aws:s3:::{}/*\".format(bucket)\n", 383 | "            ]\n", 384 | "        }\n", 385 | "    ]\n", 386 | "}\n", 387 | "\n", 388 | "#creating the s3 policy\n", 389 | "s3_policy_response = iam.create_policy(\n", 390 | "    PolicyName='s3AccessForRekCompTrans',\n", 391 | "    PolicyDocument=json.dumps(s3_policy),\n", 392 | ")\n", 393 | "\n", 394 | "s3_policy_arn = s3_policy_response['Policy']['Arn']\n", 395 | "\n", 396 | "print(s3_policy_arn)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "#attaching the above policy to the role\n", 406 | "attach_s3_policy_response = iam.attach_role_policy(\n", 407 | "    RoleName = role_name,\n", 408 | "    PolicyArn = s3_policy_response['Policy']['Arn'])\n", 409 | "\n", 410 | "print('Response:{}'.format(attach_s3_policy_response['ResponseMetadata']['HTTPStatusCode']))" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "sns_policy = {\n", 420 | "    \"Version\": \"2012-10-17\",\n", 421 | "    \"Statement\": [\n", 422 | "        {\n", 423 | "            \"Action\": [\n", 424 | "                \"sns:*\"\n", 425 | "            ],\n", 426 | "            \"Effect\": \"Allow\",\n", 427 | "            \"Resource\": sns_topic_arn\n", 428 | "        }\n", 429 | "    ]\n", 430 | "}\n", 431 | "#creating the sns policy\n", 432 | "sns_policy_response = iam.create_policy(\n", 433 | "    PolicyName='snsAccessForRekognition-' + str(random.randint(0,1000)),\n", 434 | "    PolicyDocument=json.dumps(sns_policy),\n", 435 | ")\n", 436 | "\n", 437 | "sns_policy_arn = sns_policy_response['Policy']['Arn']\n", 438 | "\n", 439 | "print(sns_policy_arn)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "#attaching the SNS policy created above to the role\n", 449 | "attach_sns_policy_response = iam.attach_role_policy(\n", 450 | "    RoleName = role_name,\n", 451 | "    PolicyArn = sns_policy_arn)\n", 452 | "\n", 453 | "print('Response:{}'.format(attach_sns_policy_response['ResponseMetadata']['HTTPStatusCode']))" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "### Uploading the video to the newly created S3 bucket" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "Please specify below the S3 bucket where you've stored the video file you'll use to run the notebooks. Please keep in mind that it needs to be a valid .mp4 and that your SageMaker execution role must have access to that S3 bucket; you'll get an access denied exception otherwise." 468 | ] 469 | },
470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "#S3 URL where you have uploaded your video\n", 477 | "your_s3_original_video = 's3://< your s3 bucket>/< path to the .mp4 file>'\n", 478 | "\n", 479 | "#extracting the video name, file name and prefix\n", 480 | "your_s3_bucket = your_s3_original_video.split('/')[2]\n", 481 | "your_s3_prefix = '/'.join(your_s3_original_video.split('/')[3:])\n", 482 | "video_file = your_s3_original_video.split('/')[-1]\n", 483 | "video_name = video_file.split('.')[0]" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "Downloading the file from your S3 bucket to your notebook instance, then uploading it to the target S3 bucket for processing." 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "#creating a temporary folder on your instance to store the video locally.\n", 500 | "tmp_local_folder = './tmp'\n", 501 | "if not os.path.exists(tmp_local_folder):\n", 502 | "    #create folder\n", 503 | "    os.makedirs(tmp_local_folder)\n", 504 | "else:\n", 505 | "    #remove folder and files\n", 506 | "    shutil.rmtree(tmp_local_folder)\n", 507 | "    #wait for deletion to finish\n", 508 | "    while os.path.exists(tmp_local_folder): # check if it exists\n", 509 | "        pass\n", 510 | "    #create folder\n", 511 | "    os.makedirs(tmp_local_folder)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "#download the file locally\n", 521 | "s3.download_file(your_s3_bucket, your_s3_prefix, os.path.join(tmp_local_folder, video_file))\n", 522 | "\n", 523 | "#upload the video file to the target S3 bucket\n", 524 | "s3_video_input_path = 'input'\n", 525 | "s3.upload_file(os.path.join(tmp_local_folder, video_file), bucket, os.path.join(s3_video_input_path, video_file))" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Amazon Neptune\n", 533 | "\n", 534 | "For part 3 of the workshop, you will need to create a Neptune DB cluster.\n", 535 | "\n", 536 | "IMPORTANT: please make sure you create a brand new Neptune cluster for this workshop, as we'll be wiping its content during the exercises.\n", 537 | "\n", 538 | "The easiest way is to create your DB cluster via the console.\n", 539 | "\n", 540 | "Make sure you are in the same region where you previously created your Jupyter notebook instance.\n", 541 | "\n", 542 | "Engine options: at the time this workshop was developed, 1.0.5.1.R2 was the latest version.\n", 543 | "\n", 544 | "DB cluster identifier: specify a relevant name\n", 545 | "\n", 546 | "Templates: \"Development and Testing\"" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "![Amazon Neptune DB creation](../static/neptune-creation-part1.png \"Amazon Neptune DB creation\")" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "DB instance size: db.t3.medium\n", 561 | "\n", 562 | "Multi-AZ deployment: No\n", 563 | "\n", 564 | "Connectivity: make sure you choose the same VPC as the one you're using for your notebook instance. In my case I am using the default one.\n" 565 | ] 566 | },
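{ "cell_type": "markdown", "metadata": {}, "source": [ "If you prefer scripting to clicking through the console, the cell below is an optional, hedged sketch (not part of the original workshop) of how a similar cluster could be created with boto3. The identifiers are just examples, networking (VPC, subnet group, security groups) is left to your account defaults, and you still need to wait for the cluster to become \"Available\" before using it. Otherwise, simply continue with the console screenshots below." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Optional hedged sketch (illustrative only): create the Neptune cluster with boto3 instead of the console.\n", "#Example identifiers below; adjust them and your networking settings as needed.\n", "neptune = boto3.client('neptune', region_name=region)\n", "\n", "create_cluster_response = neptune.create_db_cluster(\n", "    DBClusterIdentifier='knowledge-graph-lab-cluster', #example name\n", "    Engine='neptune'\n", ")\n", "\n", "create_instance_response = neptune.create_db_instance(\n", "    DBInstanceIdentifier='knowledge-graph-lab-instance', #example name\n", "    DBInstanceClass='db.t3.medium',\n", "    Engine='neptune',\n", "    DBClusterIdentifier='knowledge-graph-lab-cluster'\n", ")\n", "\n", "print(create_cluster_response['DBCluster']['Status'])" ] },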
567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "![Amazon Neptune DB creation](../static/neptune-creation-part2.png \"Amazon Neptune DB creation\")" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "Notebook configuration: uncheck \"Create notebook\"; we are going to create a separate notebook in SageMaker.\n", 579 | "\n", 580 | "Leave the rest as default and click \"Create Database\"." 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "![Amazon Neptune DB creation](../static/neptune-creation-part3.png \"Amazon Neptune DB creation\")" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "Once your cluster's status is \"Available\", retrieve the endpoint URL and port and update the endpoint variable below." 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "![Amazon Neptune endpoint](../static/neptune-ui.png \"Amazon Neptune endpoint\")" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "your_neptune_endpoint_url = 'wss://<your neptune endpoint>:<port>/gremlin' #the default Neptune port is 8182" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "Storing some variables we'll use later in the different metadata extraction notebooks" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "%store tmp_local_folder\n", 627 | "%store bucket\n", 628 | "%store s3_video_input_path\n", 629 | "%store video_file\n", 630 | "%store video_name\n", 631 | "%store role_arn\n", 632 | "%store role_name\n", 633 | "%store sns_topic_arn\n", 634 | "%store s3_policy_arn\n", 635 | "%store sns_policy_arn\n", 636 | "%store your_neptune_endpoint_url" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": null, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [] 652 | } 653 | ], 654 | "metadata": { 655 | "instance_type": "ml.t3.medium", 656 | "kernelspec": { 657 | "display_name": "Python 3 (Base Python)", 658 | "language": "python", 659 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:ap-southeast-2:452832661640:image/python-3.6" 660 | }, 661 | "language_info": { 662 | "codemirror_mode": { 663 | "name": "ipython", 664 | "version": 3 665 | }, 666 | "file_extension": ".py", 667 | "mimetype": "text/x-python", 668 | "name": "python", 669 | "nbconvert_exporter": "python", 670 | "pygments_lexer": "ipython3", 671 | "version": "3.6.13" 672 | } 673 | }, 674 | "nbformat": 4, 675 | "nbformat_minor": 4 676 | } 677 | --------------------------------------------------------------------------------