├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── codeql-analysis.yml ├── .gitignore ├── .vscode └── settings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── api ├── __init__.py ├── __pycache__ │ └── __init__.cpython-38.pyc └── streamlit_experiments │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── covid.cpython-38.pyc │ ├── eda.cpython-38.pyc │ └── s3.cpython-38.pyc │ ├── covid.py │ ├── eda.py │ └── s3.py ├── cloud-experiments.png ├── docs ├── cloud-experiments-composition.png ├── what-are-cloud-experiments.md └── why-cloud-experiments.md └── experiments ├── data-apps ├── covid_insights │ ├── 494724_1196190_compressed_covid_19_data.csv.zip │ ├── README.md │ ├── __pycache__ │ │ └── covid.cpython-38.pyc │ ├── cfr.png │ ├── cov_dash.py │ ├── covid-app.png │ └── covid_app.py ├── exploratory_data_analysis │ ├── __pycache__ │ │ └── eda.cpython-38.pyc │ ├── census-income.csv │ └── eda_app.py ├── open_data_explorer │ ├── README.md │ ├── __pycache__ │ │ └── s3.cpython-38.pyc │ ├── list-nyc-tlc-filter.png │ ├── query-public-dataset.png │ ├── s3-app-start.png │ └── s3_app.py ├── rides │ └── uber_pickups.py └── wine │ ├── logs.log │ └── wine_app.py ├── guides └── flying-cars-with-glue-databrew │ ├── README.md │ ├── connect-dataset.png │ ├── corr-matrix.png │ ├── crawler-info.png │ ├── create-project.png │ ├── create-recipe.png │ ├── data-bins.png │ ├── data-lineage.png │ ├── data-schema.png │ ├── donut-charts.png │ ├── initiate-session.png │ ├── job-results.png │ ├── map.png │ ├── quicksight.png │ └── splash.png └── notebooks ├── ai-services ├── README.md └── using-ai-services-for-analyzing-public-data.ipynb ├── cloudstory-api ├── README.md ├── cloudstory-demo.ipynb └── cloudstory.py ├── comprehend-medical-ehr ├── LICENSE ├── README.md ├── comprehend-medical-citizen-health-record-analytics.ipynb ├── contributors.md └── images │ ├── Medical_Resume_Architecture.jpg │ ├── output_19_0.png │ ├── output_20_0.png │ ├── output_21_0.png │ └── output_22_0.png ├── covid ├── 2020-03-24-covid-india-stats.csv ├── README.md ├── covid.py ├── insights.ipynb └── output_5_0.png ├── exploring-data ├── README.md └── exploring-data-with-python-and-amazon-s3-select.ipynb ├── optimizing-data ├── README.md └── optimizing-data-for-analysis-with-amazon-athena-and-aws-glue.ipynb ├── video-analytics ├── README.md └── video-analytics.ipynb └── wine-pycaret ├── .ipynb_checkpoints └── wine_eda_model-checkpoint.ipynb ├── extra_tree_model.pkl ├── logs.log ├── wine_eda_model.ipynb └── winequality-red.csv /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 7 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. 
Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '32 12 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'javascript', 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 37 | # Learn more: 38 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 39 | 40 | steps: 41 | - name: Checkout repository 42 | uses: actions/checkout@v2 43 | 44 | # Initializes the CodeQL tools for scanning. 45 | - name: Initialize CodeQL 46 | uses: github/codeql-action/init@v1 47 | with: 48 | languages: ${{ matrix.language }} 49 | # If you wish to specify custom queries, you can do so here or in a config file. 50 | # By default, queries listed here will override any specified in a config file. 51 | # Prefix the list here with "+" to use these queries and those in the config file. 52 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 53 | 54 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 55 | # If this step fails, then you should remove it and run the build manually (see below) 56 | - name: Autobuild 57 | uses: github/codeql-action/autobuild@v1 58 | 59 | # ℹ️ Command-line programs to run using the OS shell. 60 | # 📚 https://git.io/JvXDl 61 | 62 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 63 | # and modify them (or add more) to build your code if your project 64 | # uses a compiled language 65 | 66 | #- run: | 67 | # make bootstrap 68 | # make release 69 | 70 | - name: Perform CodeQL Analysis 71 | uses: github/codeql-action/analyze@v1 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | node_modules/ 4 | build/ 5 | dist/ 6 | .vscode/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true 3 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws-samples/aws-open-data-analytics-notebooks/issues), or [recently closed](https://github.com/aws-samples/aws-open-data-analytics-notebooks/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/aws-open-data-analytics-notebooks/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 
55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/aws-open-data-analytics-notebooks/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Cloud Experiments 2 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cloud Experiments (This Repo is Archived) 2 | 3 | **Sample notebooks, starter apps, and low/no code guides for rapidly (within 60-minutes) building and running open innovation experiments on AWS Cloud** 4 | 5 | Cloud experiments follow step-by-step workflow for performing analytics, machine learning, AI, and data science on AWS cloud. We present guidance on using AWS Cloud programmatically or visually using the console, introduce relevant AWS services, explaining the code, reviewing the code outputs, evaluating alternative steps in our workflow, and ultimately designing an abstrated reusable API for rapidly deploying these experiments on AWS cloud. 6 | 7 | **Documentation:** [Why Cloud Experiments](https://github.com/aws-samples/cloud-experiments/tree/master/docs/why-cloud-experiments.md) | [What Are Cloud Experiments](https://github.com/aws-samples/cloud-experiments/tree/master/docs/what-are-cloud-experiments.md) 8 | 9 | **Cloud Experiments:** [Guides](https://github.com/aws-samples/cloud-experiments#guides) | [Exploratory Data Apps](https://github.com/aws-samples/cloud-experiments#exploratory-data-apps) | [Notebooks](https://github.com/aws-samples/cloud-experiments#notebooks) 10 | 11 | [![](cloud-experiments.png)](https://github.com/aws-samples/cloud-experiments) 12 | 13 | ## Guides 14 | 15 | All you need to run these experiments is access to an AWS Console from your web browser. 
16 | 17 | ### [Flying Cars with Glue DataBrew](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/guides/flying-cars-with-glue-databrew) 18 | 19 | Smarter cities will have smarter transportation, including multi-modal and eco-friendly options that balance commuter convenience with safety and social distancing. This 60-minute experiment uses open data for good and low/no code services provided by AWS to enable insights for business model innovation in a smart transport use case. The experiment is intended as a step-by-step guided co-innovation and design workshop along with an AWS specialist. If you are familiar with the prerequisites specified in the Cloud Experiment Guide (last section of this experiment), then feel free to make this experiment your own. 20 | 21 | [![](https://github.com/aws-samples/cloud-experiments/blob/master/experiments/guides/flying-cars-with-glue-databrew/splash.png)](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/guides/flying-cars-with-glue-databrew) 22 | 23 | ## Exploratory Data Apps 24 | 25 | We use [Streamlit](https://streamlit.io/) for many experiments in this repository. Streamlit is the fastest way to build and share data apps. Streamlit turns data scripts into shareable web apps in minutes. All in Python. All for free. No front‑end experience required. 26 | 27 | These three steps will set you up for running experiments on your laptop. 28 | 29 | Step 1: Set up an [AWS IAM user](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html#id_users_create_console) with programmatic access. 30 | 31 | Step 2: Install AWS Shell and configure IAM credentials. 32 | 33 | ``` 34 | pip install aws-shell 35 | aws-shell 36 | aws> configure 37 | ``` 38 | 39 | Step 3: Install Streamlit. Clone the repo. Add the repo path to PYTHONPATH so the API imports resolve. Run an app. 40 | 41 | ``` 42 | pip install streamlit 43 | git clone https://github.com/aws-samples/cloud-experiments 44 | export PYTHONPATH="$HOME/WhereYouClonedRepo/cloud-experiments" 45 | streamlit run cloud-experiments/experiments/data-apps/open_data_explorer/s3_app.py 46 | ``` 47 | 48 | 49 | ### [Open Data Explorer](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/data-apps/open_data_explorer) 50 | 51 | Apps and API for exploring open data sources, including the [AWS Registry of Open Data](https://registry.opendata.aws/), which lists datasets for genomics, satellite, transport, COVID, medical imaging, and other use cases in data for social good. 52 | 53 | [![](https://github.com/aws-samples/cloud-experiments/blob/master/experiments/data-apps/open_data_explorer/s3-app-start.png)](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/data-apps/open_data_explorer) 54 | 55 | ### [COVID EDA and Models](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/data-apps/covid_insights) 56 | 57 | Experiments for running exploratory data analysis (EDA) and models on COVID-related open datasets. This includes a Case Fatality Rate model on country data from Johns Hopkins. EDA techniques include growth factor analysis, cases growth rate, doubling rate, recovery and mortality rate, and country-wise analysis. 58 | 59 | [![](https://github.com/aws-samples/cloud-experiments/blob/master/experiments/data-apps/covid_insights/cfr.png)](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/data-apps/covid_insights) 60 | 61 | ## Notebooks 62 | 63 | You may want to run these notebooks using [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
Amazon SageMaker is a fully-managed service that covers the entire machine learning workflow to label and prepare your data, choose an algorithm, train the model, tune and optimize it for deployment, make predictions, and take action. 64 | 65 | ### [COVID Insights](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/notebooks/covid) 66 | 67 | This experiment provides a catalog of open datasets for deriving insights related to COVID-19 and helping open source and open data community to collaborate in fighting this global threat. The notebook provides (a) reusable API to speed up open data analytics related to COVID-19, customized for India however can be adopted for other countries, (b) sample usage of the API, (c) documentation of insights, and (d) catalog of open datasets referenced. 68 | 69 | ### [Comprehend Medical for Electronic Health Records](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/notebooks/comprehend-medical-ehr) 70 | 71 | Amazon Comprehend Medical is an API-level service which is HIPAA eligible and uses machine learning to extract medical information with high accuracy. The service eliminates the barriers to entry to access the biomedical knowledge stored in natural language text - from the research literature that entails biological processes and therapeutic mechanisms of action to the Electronic Medical Records that have the patients’ journeys through our healthcare systems documented. The service also helps us comb through that information and study relationships like symptoms, diagnosis, medication, dosage while redacting the Protected Health Information (PHI). This is an illustrative notebook that includes a step-by-step workflow for analyzing health data on the cloud. 72 | 73 | ### [Video Analytics](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/notebooks/video-analytics/) 74 | 75 | Analyzing video based content requires transforming from one media format (video or audio) to another format (text or numeric) while identifying relevant structure in the resulting format. This multi-media transformation requires machine learning based recognition. Analytics libraries can work on the transformed data to determine the required outcomes including visualizations and charts. The structured data in text or numeric format can also be reused as input to training new machine learning models. 76 | 77 | ### [Using AI Services for Analyzing Public Data](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/notebooks/ai-services/) 78 | 79 | So far we have been working with structured data in flat files as our data source. What if the source is images and unstructured text. AWS AI services provide vision, transcription, translation, personalization, and forecasting capabilities without the need for training and deploying machine learning models. AWS manages the machine learning complexity, you just focus on the problem at hand and send required inputs for analysis and receive output from these services within your applications. 80 | 81 | ### [Exploring data with Python and Amazon S3 Select](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/notebooks/exploring-data/) 82 | 83 | For this notebook let us start with a big open dataset. Big enough that we will struggle to open it in Excel on a laptop. Excel has around million rows limit. 
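As a rough, hedged sketch of the kind of S3 Select call this notebook builds on (assuming `boto3` is installed and your AWS credentials are configured as in the setup steps above; the bucket, key, and column names below are placeholders, not the actual dataset):

```
import boto3

s3 = boto3.client("s3")

# S3 Select runs SQL against a single object in place and returns only the matching
# bytes instead of the whole file. Bucket, key, and columns here are illustrative.
response = s3.select_object_content(
    Bucket="your-open-data-bucket",
    Key="trips/sample_trips.csv",
    ExpressionType="SQL",
    Expression="SELECT s.pickup_datetime, s.fare_amount FROM S3Object s LIMIT 10",
    InputSerialization={"CSV": {"FileHeaderInfo": "USE"}, "CompressionType": "NONE"},
    OutputSerialization={"CSV": {}},
)

# The response payload is an event stream; 'Records' events carry the filtered rows.
for event in response["Payload"]:
    if "Records" in event:
        print(event["Records"]["Payload"].decode("utf-8"))
```

Because only the filtered bytes come back over the wire, this pattern is what makes carving a manageable extract out of a very large source practical.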
We will setup AWS services to source from a 270GB data source, filter and store more than 8 million rows or 100 million data points into a flat file, extract schema from this file, transform this data, load into analytics tools, run Structured Query Language (SQL) on this data. 84 | 85 | ### [Optimizing data for analysis with Amazon Athena and AWS Glue](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/notebooks/optimizing-data/) 86 | 87 | We will continue our open data analytics workflow starting with the AWS Console then moving to using the notebook. Using AWS Glue we can automate creating a metadata catalog based on flat files stored on Amazon S3. Glue is a fully managed extract, transform, and load (ETL) service that makes it easy for customers to prepare and load their data for analytics. You can create and run an ETL job with a few clicks in the AWS Management Console. You simply point AWS Glue to your data stored on AWS, and AWS Glue discovers your data and stores the associated metadata (e.g. table definition and schema) in the AWS Glue Data Catalog. Once cataloged, your data is immediately searchable, queryable, and available for ETL. 88 | 89 | ### [Cloudstory API](https://github.com/aws-samples/cloud-experiments/tree/master/experiments/notebooks/cloudstory-api/) 90 | 91 | Cloudstory API Python module and demo of using the API. The cloudstory API is documented in the other notebooks listed here. 92 | 93 | ## License 94 | 95 | This library is licensed under the Apache 2.0 License. 96 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/api/__init__.py -------------------------------------------------------------------------------- /api/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/api/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /api/streamlit_experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/api/streamlit_experiments/__init__.py -------------------------------------------------------------------------------- /api/streamlit_experiments/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/api/streamlit_experiments/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /api/streamlit_experiments/__pycache__/covid.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/api/streamlit_experiments/__pycache__/covid.cpython-38.pyc -------------------------------------------------------------------------------- /api/streamlit_experiments/__pycache__/eda.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/api/streamlit_experiments/__pycache__/eda.cpython-38.pyc -------------------------------------------------------------------------------- /api/streamlit_experiments/__pycache__/s3.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/api/streamlit_experiments/__pycache__/s3.cpython-38.pyc -------------------------------------------------------------------------------- /api/streamlit_experiments/covid.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import plotly.graph_objects as go 7 | from plotly.subplots import make_subplots 8 | 9 | def growth_scatter(df): 10 | fig=go.Figure() 11 | fig.add_trace(go.Scatter(x=df.index, y=df["Confirmed"], 12 | mode='lines+markers', 13 | name='Confirmed Cases')) 14 | fig.add_trace(go.Scatter(x=df.index, y=df["Recovered"], 15 | mode='lines+markers', 16 | name='Recovered Cases')) 17 | fig.add_trace(go.Scatter(x=df.index, y=df["Deaths"], 18 | mode='lines+markers', 19 | name='Death Cases')) 20 | fig.update_layout(title="Growth of different types of cases", 21 | xaxis_title="Date",yaxis_title="Number of Cases",legend=dict(x=0,y=1,traceorder="normal")) 22 | 23 | st.write(fig) 24 | 25 | def weekly_increase(df): 26 | week_num=[] 27 | weekwise_confirmed=[] 28 | weekwise_recovered=[] 29 | weekwise_deaths=[] 30 | w=1 31 | for i in list(df["WeekOfYear"].unique()): 32 | weekwise_confirmed.append(df[df["WeekOfYear"]==i]["Confirmed"].iloc[-1]) 33 | weekwise_recovered.append(df[df["WeekOfYear"]==i]["Recovered"].iloc[-1]) 34 | weekwise_deaths.append(df[df["WeekOfYear"]==i]["Deaths"].iloc[-1]) 35 | week_num.append(w) 36 | w=w+1 37 | 38 | fig = plt.figure(figsize=(8,5)) 39 | plt.plot(week_num,weekwise_confirmed,linewidth=3) 40 | plt.plot(week_num,weekwise_recovered,linewidth=3) 41 | plt.plot(week_num,weekwise_deaths,linewidth=3) 42 | plt.ylabel("Number of Cases") 43 | plt.xlabel("Week Number") 44 | plt.title("Weekly progress of Different Types of Cases") 45 | # plt.xlabel 46 | st.pyplot(fig) 47 | 48 | fig, (ax1,ax2) = plt.subplots(1, 2,figsize=(15,5)) 49 | sns.barplot(x=week_num,y=pd.Series(weekwise_confirmed).diff().fillna(0),ax=ax1) 50 | sns.barplot(x=week_num,y=pd.Series(weekwise_deaths).diff().fillna(0),ax=ax2) 51 | ax1.set_xlabel("Week Number") 52 | ax2.set_xlabel("Week Number") 53 | ax1.set_ylabel("Number of Confirmed Cases") 54 | ax2.set_ylabel("Number of Death Cases") 55 | ax1.set_title("Weekly increase in Number of Confirmed Cases") 56 | ax2.set_title("Weekly increase in Number of Death Cases") 57 | 58 | st.pyplot(fig) 59 | 60 | def mortality(df): 61 | df["Mortality Rate"]=(df["Deaths"]/df["Confirmed"])*100 62 | df["Recovery Rate"]=(df["Recovered"]/df["Confirmed"])*100 63 | df["Active Cases"]=df["Confirmed"]-df["Recovered"]-df["Deaths"] 64 | df["Closed Cases"]=df["Recovered"]+df["Deaths"] 65 | 66 | st.write("Average Mortality Rate = ",f'{df["Mortality Rate"].mean():.2f}') 67 | st.write("Median Mortality Rate = ",f'{df["Mortality Rate"].median():.2f}') 68 | st.write("Average Recovery Rate = ",f'{df["Recovery Rate"].mean():.2f}') 69 | st.write("Median Recovery Rate = ",f'{df["Recovery Rate"].median():.2f}') 70 | 71 | #Plotting Mortality and 
Recovery Rate 72 | fig = make_subplots(rows=2, cols=1, 73 | subplot_titles=("Recovery Rate", "Mortatlity Rate")) 74 | fig.add_trace( 75 | go.Scatter(x=df.index, y=(df["Recovered"]/df["Confirmed"])*100,name="Recovery Rate"), 76 | row=1, col=1 77 | ) 78 | fig.add_trace( 79 | go.Scatter(x=df.index, y=(df["Deaths"]/df["Confirmed"])*100,name="Mortality Rate"), 80 | row=2, col=1 81 | ) 82 | fig.update_layout(height=1000,legend=dict(x=0,y=0.5,traceorder="normal")) 83 | fig.update_xaxes(title_text="Date", row=1, col=1) 84 | fig.update_yaxes(title_text="Recovery Rate", row=1, col=1) 85 | fig.update_xaxes(title_text="Date", row=1, col=2) 86 | fig.update_yaxes(title_text="Mortality Rate", row=1, col=2) 87 | 88 | st.write(fig) 89 | 90 | def growth_factor(df): 91 | daily_increase_confirm=[] 92 | daily_increase_recovered=[] 93 | daily_increase_deaths=[] 94 | for i in range(df.shape[0]-1): 95 | daily_increase_confirm.append(((df["Confirmed"].iloc[i+1]/df["Confirmed"].iloc[i]))) 96 | daily_increase_recovered.append(((df["Recovered"].iloc[i+1]/df["Recovered"].iloc[i]))) 97 | daily_increase_deaths.append(((df["Deaths"].iloc[i+1]/df["Deaths"].iloc[i]))) 98 | daily_increase_confirm.insert(0,1) 99 | daily_increase_recovered.insert(0,1) 100 | daily_increase_deaths.insert(0,1) 101 | 102 | fig = plt.figure(figsize=(15,7)) 103 | plt.plot(df.index,daily_increase_confirm,label="Growth Factor Confiremd Cases",linewidth=3) 104 | plt.plot(df.index,daily_increase_recovered,label="Growth Factor Recovered Cases",linewidth=3) 105 | plt.plot(df.index,daily_increase_deaths,label="Growth Factor Death Cases",linewidth=3) 106 | plt.xlabel("Timestamp") 107 | plt.ylabel("Growth Factor") 108 | plt.title("Growth Factor of different Types of Cases") 109 | plt.axhline(1,linestyle='--',color='black',label="Baseline") 110 | plt.xticks(rotation=90) 111 | plt.legend() 112 | 113 | st.pyplot(fig) 114 | 115 | def daily_increase(df): 116 | st.write("Average increase in number of Confirmed Cases every day: ",np.round(df["Confirmed"].diff().fillna(0).mean())) 117 | st.write("Average increase in number of Recovered Cases every day: ",np.round(df["Recovered"].diff().fillna(0).mean())) 118 | st.write("Average increase in number of Deaths Cases every day: ",np.round(df["Deaths"].diff().fillna(0).mean())) 119 | 120 | fig=go.Figure() 121 | fig.add_trace(go.Scatter(x=df.index, y=df["Confirmed"].diff().fillna(0),mode='lines+markers', 122 | name='Confirmed Cases')) 123 | fig.add_trace(go.Scatter(x=df.index, y=df["Recovered"].diff().fillna(0),mode='lines+markers', 124 | name='Recovered Cases')) 125 | fig.add_trace(go.Scatter(x=df.index, y=df["Deaths"].diff().fillna(0),mode='lines+markers', 126 | name='Death Cases')) 127 | fig.update_layout(title="Daily increase in different types of Cases", 128 | xaxis_title="Date",yaxis_title="Number of Cases",legend=dict(x=0,y=1,traceorder="normal")) 129 | st.write(fig) 130 | 131 | def double_days(df): 132 | c=1000 133 | double_days=[] 134 | C=[] 135 | while(1): 136 | double_days.append(df[df["Confirmed"]<=c].iloc[[-1]]["Days Since"][0]) 137 | C.append(c) 138 | c=c*2 139 | if(cFurther reading: [Jeff Bezos on Planning for the Future in Uncertain Times](https://www.inc.com/jason-aten/with-this-simple-3-word-question-jeff-bezos-explains-how-to-plan-for-future-despite-turbulent-times.html), Inc. Magazine | [Amazon Virtuous Cycle](https://www.youtube.com/watch?v=NJ1-ufRs57E): Fireside Chat with Jeff Wilke of Amazon 6 | 7 | In fact, there is another answer to the 10-years question. 
This time it is the word – transform – to make a marked change in the form, nature, or appearance of. You can use this verb in a technical context like Extract Transform Load (ETL), which is what AWS Glue does for data. It extracts data from a source in a certain format, like log files; transforms this data into the desired form; then loads it for intended outcomes, such as into Amazon QuickSight for insights like outlier detection. Transforms play an important role in many AWS services. Amazon Transcribe transforms speech to text while Amazon Polly does the reverse. AWS Glue DataBrew can apply 250 pre-built transforms to automate data preparation tasks for machine learning and analytics workloads. Amazon Sumerian can transform 3D polygons into virtual reality worlds. Transforms are everywhere. Business models also transform, from brick-and-mortar to online, from cash to digital payments, and from business-to-consumer to peer-to-peer. Applying transforms is synonymous with running experiments on business models, data, processes, and customer journeys. 8 | 9 | >We hypothesize that, when we perform technology transforms with speed, enabling business models to transform from an existing state to a new direction, we achieve high velocity innovation. 10 | 11 | Uber uses high velocity innovation to transform user location, city maps, and cab network into a real-time cab booking experience enabling a peer-to-peer business model. Amazon uses high velocity innovation to transform hundreds of millions of SKUs within its product catalog and hundreds of millions of user carts into real-time recommendations, enabling a personalised online shopping business model. The ultimate aspiration for high velocity innovation is to achieve real-time experience for the business and the customer. It is hard work; Amazon took 25 years of peculiar innovation culture to get there. Today, Amazon deploys changes to its software applications 194 million times in a year. As the Amazon business model is driven by these applications, it is not incorrect to say that the Amazon business model transforms six times every second! Now that is real-time. 12 | 13 | >We expand on our hypothesis. Customer context transforms to delight with a series of technology-enabled transforms on business models, equipment or devices, people skills and organisation, events, processes, and data, along the direction of the customer journey, every transform aiming to improve speed in the direction of achieving customer delight. This results in high velocity innovation for the organisation. 14 | 15 | Uber and Amazon are cited as case studies of disruptive innovation. There are three more types of innovation. Greg Satell, in his book Mapping Innovation and the Harvard Business Review article on the topic, defines four types of innovation and the problems they solve. Basic research or incremental innovation is suitable when the problem and the domain are not well defined. This type of innovation engages academic partnerships, research divisions, journals, and conferences. Disruptive innovation is applicable when the domain is well defined but the problem is not. This innovation technique engages the VC model, innovation labs, the 15%/20% rule to block time for creative pursuits or passion projects instead of the day job, and the lean launchpad. Breakthrough or radical innovation is best used when the problem is well defined but the domain is not. This technique requires mavericks, skunk works, and open innovation/prizes, according to Satell.
When both problem and domain are well defined we choose sustaining innovation technique which requires design thinking, roadmapping, R&D labs, and acquisitions to achieve solutions. Cloud Experiments can support disruptive innovation (using open innovation), radical innovation (with communities of practice), and sustaining innovation. 16 | 17 | Cloud Experiments tell exploratory innovation stories. Stories inspire call to action and can present a complex theory in a user friendly manner. Several design thinking methods tell stories to enable ideation. Now imagine bringing these stories to life with real-world datasets, embedded visualisations, and interactive apps. New York Times does a great job of combining data and journalism into interactive storytelling with their series on 2020 year in graphics. Guardian Data Journalism is another example of telling thought provoking stories with interactive data visualisations. Well known for its product innovation and design, Apple has been using visuals, animations, data, and narrative to explain technical specifications of their products for years. Microsoft Story Labs positions their BI products powering data journalism. The self-service aspect of interactive innovation stories is even more powerful. According to the Harvard Business School paper, titled The “IKEA Effect”: When labor leads to love , in a series of studies in which consumers assembled IKEA boxes, folded origami, and built sets of Legos, we demonstrate and investigate the boundary conditions for what we term the “IKEA effect” – the increase in valuation of self-made products. Participants saw their amateurish creations – of both utilitarian and hedonic products – as similar in value to the creations of experts, and expected others to share their opinions. Interactive innovation stories can have similar effect, either on self-service readers or co-creation workshop participants, in building and exploring solutions following step by step narrative, visuals, and interactivity. 18 | 19 | >Further reading: [2020 Year in Graphics](https://www.nytimes.com/interactive/2020/12/30/us/2020-year-in-graphics.html), New York Times | [Data Journalism](https://www.theguardian.com/media/data-journalism), Guardian | [Mac Pro Technical Specifications](https://www.apple.com/mac-pro/specs/), Apple | [The IKEA Effect](https://www.hbs.edu/ris/Publication%20Files/11-091.pdf), Harvard Business School 20 | 21 | When choosing topics and composition of Cloud Experiments, we can choose from domains which guide ideation and application of transforms or where transforms apply velocity. We can classify a range of intersecting domains for our experiments. These include methods for design and innovation like the Amazon Working Backwards and IDEO Design Thinking kit. These methods typically guide ideation and application of transforms. Cloud Experiments codify these domains into starter apps and API abstractions to enable emergent innovation use cases. 22 | 23 | ![](cloud-experiments-composition.png) 24 | 25 | These domains include innovation methods, principles, business models, and emerging technologies. Certain domains define principles, laws, or heuristics for design, user behaviour, and invention, like TRIZ, Cognitive Biases, and Laws of Usability. These principles determine how transforms impact velocity. Business models can be described using Business Model Canvas and Kaplan Norton Strategy Map. Business models are where you would apply transforms to achieve high velocity. 
Emerging technologies are always a domain of interest for innovation. Emerging technologies enable transforms. Industry trends represent a combination of multiple domains including emerging technologies and consumer behaviour. Country level innovation indicators and mission priorities may encompass multiple industries. Industry trends and country mission priorities provide the direction for velocity. The domain of complex systems address emergence over scale and self-organisation over time. While this classification provides a rich set of choices, let us break down some of these domains into components to get a sense of wide choice and the need to navigate the inherent complexity in high velocity innovation. -------------------------------------------------------------------------------- /experiments/data-apps/covid_insights/494724_1196190_compressed_covid_19_data.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/covid_insights/494724_1196190_compressed_covid_19_data.csv.zip -------------------------------------------------------------------------------- /experiments/data-apps/covid_insights/README.md: -------------------------------------------------------------------------------- 1 | # COVID Insights 2 | 3 | Experiments for running exploratory data analysis (EDA) and models on COVID related open datasets. This includes Case Fatality Rate model on country data from John Hopkins. EDA techniques include growth factor analysis, cases growth rate, doubling rate, recovery and mortality rate, and country-wise analysis. 4 | 5 | Start the first experiment using `streamlit run covid-insights/cov-dash.py` to run Case Fatality Rate model directly on latest country data updated from John Hopkins repository. 6 | 7 | ![](cfr.png) 8 | 9 | Second experiment runs using `streamlit run covid-insights/covid-app.py` on archived dataset from Kaggle. 10 | 11 | ![](covid-app.png) 12 | 13 | -------------------------------------------------------------------------------- /experiments/data-apps/covid_insights/__pycache__/covid.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/covid_insights/__pycache__/covid.cpython-38.pyc -------------------------------------------------------------------------------- /experiments/data-apps/covid_insights/cfr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/covid_insights/cfr.png -------------------------------------------------------------------------------- /experiments/data-apps/covid_insights/cov_dash.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/cwerner/covid19 2 | # 1. Source data directly from GitHub (JHU COVID) 3 | # 2. Configurable UI based on two variables - inhabitants and countries 4 | # 3. 
Use Altair (https://altair-viz.github.io/) declarative statistical visualization charts 5 | 6 | import datetime 7 | from functools import reduce 8 | import streamlit as st 9 | from streamlit import caching 10 | import pandas as pd 11 | import altair as alt 12 | import os 13 | 14 | # numbers for 2019 15 | inhabitants = {'India': 1352.6, 16 | 'US': 328.2, 17 | 'Brazil': 209.5, 18 | 'Russia': 144.5, 19 | 'United Kingdom': 67.1, 20 | 'China': 1392.7, 21 | 'Italy': 60.23} 22 | 23 | @st.cache 24 | def read_data(): 25 | BASEURL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series" 26 | url_confirmed = f"{BASEURL}/time_series_covid19_confirmed_global.csv" 27 | url_deaths = f"{BASEURL}/time_series_covid19_deaths_global.csv" 28 | url_recovered = f"{BASEURL}/time_series_covid19_recovered_global.csv" 29 | 30 | confirmed = pd.read_csv(url_confirmed, index_col=0) 31 | deaths = pd.read_csv(url_deaths, index_col=0) 32 | recovered = pd.read_csv(url_recovered, index_col=0) 33 | 34 | # sum over potentially duplicate rows (France and their territories) 35 | confirmed = confirmed.groupby("Country/Region").sum().reset_index() 36 | deaths = deaths.groupby("Country/Region").sum().reset_index() 37 | recovered = recovered.groupby("Country/Region").sum().reset_index() 38 | 39 | return (confirmed, deaths, recovered) 40 | 41 | def transform(df, collabel='confirmed'): 42 | dfm = pd.melt(df) 43 | dfm["date"] = pd.to_datetime(dfm.variable, infer_datetime_format=True) 44 | dfm = dfm.set_index("date") 45 | dfm = dfm[["value"]] 46 | dfm.columns = [collabel] 47 | return dfm 48 | 49 | def transform2(df, collabel='confirmed'): 50 | dfm = pd.melt(df, id_vars=["Country/Region"]) 51 | dfm["date"] = pd.to_datetime(dfm.variable, infer_datetime_format=True) 52 | dfm = dfm.set_index("date") 53 | dfm = dfm[["Country/Region","value"]] 54 | dfm.columns = ["country", collabel] 55 | return dfm 56 | 57 | def app(): 58 | st.title("🦠 Covid-19 Data Explorer") 59 | st.markdown("""\ 60 | This app illustrates the spread of COVID-19 in select countries over time. 61 | """) 62 | 63 | #st.error("⚠️ There is currently an issue in the datasource of JHU. Data for 03/13 is invalid and thus removed!") 64 | 65 | countries = ["India", "US", "Russia", "Brazil", "China", "Italy", "United Kingdom"] 66 | 67 | analysis = st.sidebar.selectbox("Choose Analysis", ["Overview", "By Country"]) 68 | 69 | if analysis == "Overview": 70 | 71 | st.header("COVID-19 cases and fatality rate") 72 | st.markdown("""\ 73 | These are the reported case numbers for a selection of countries""" 74 | f""" (currently only {', '.join(countries)}). """ 75 | """The case fatality rate (CFR) is calculated as: 76 | $$ 77 | CFR[\%] = \\frac{fatalities}{\\textit{all cases}} 78 | $$ 79 | 80 | ℹ️ You can select/ deselect countries and switch between linear and log scales. 
81 | """) 82 | 83 | confirmed, deaths, recovered = read_data() 84 | 85 | multiselection = st.multiselect("Select countries:", countries, default=countries) 86 | logscale = st.checkbox("Log scale", False) 87 | 88 | confirmed = confirmed[confirmed["Country/Region"].isin(multiselection)] 89 | confirmed = confirmed.drop(["Lat", "Long"],axis=1) 90 | confirmed = transform2(confirmed, collabel="confirmed") 91 | 92 | deaths = deaths[deaths["Country/Region"].isin(multiselection)] 93 | deaths = deaths.drop(["Lat", "Long"],axis=1) 94 | deaths = transform2(deaths, collabel="deaths") 95 | 96 | frate = confirmed[["country"]] 97 | frate["frate"] = (deaths.deaths / confirmed.confirmed)*100 98 | 99 | # saveguard for empty selection 100 | if len(multiselection) == 0: 101 | return 102 | 103 | SCALE = alt.Scale(type='linear') 104 | if logscale: 105 | confirmed["confirmed"] += 0.00001 106 | 107 | confirmed = confirmed[confirmed.index > '2020-02-16'] 108 | frate = frate[frate.index > '2020-02-16'] 109 | 110 | SCALE = alt.Scale(type='log', domain=[10, int(max(confirmed.confirmed))], clamp=True) 111 | 112 | 113 | c2 = alt.Chart(confirmed.reset_index()).properties(height=150).mark_line().encode( 114 | x=alt.X("date:T", title="Date"), 115 | y=alt.Y("confirmed:Q", title="Cases", scale=SCALE), 116 | color=alt.Color('country:N', title="Country") 117 | ) 118 | 119 | # case fatality rate... 120 | c3 = alt.Chart(frate.reset_index()).properties(height=100).mark_line().encode( 121 | x=alt.X("date:T", title="Date"), 122 | y=alt.Y("frate:Q", title="Fatality rate [%]", scale=alt.Scale(type='linear')), 123 | color=alt.Color('country:N', title="Country") 124 | ) 125 | 126 | per100k = confirmed.loc[[confirmed.index.max()]].copy() 127 | per100k.loc[:,'inhabitants'] = per100k.apply(lambda x: inhabitants[x['country']], axis=1) 128 | per100k.loc[:,'per100k'] = per100k.confirmed / (per100k.inhabitants * 1_000_000) * 100_000 129 | per100k = per100k.set_index("country") 130 | per100k = per100k.sort_values(ascending=False, by='per100k') 131 | per100k.loc[:,'per100k'] = per100k.per100k.round(2) 132 | 133 | c4 = alt.Chart(per100k.reset_index()).properties(width=75).mark_bar().encode( 134 | x=alt.X("per100k:Q", title="Cases per 100k inhabitants"), 135 | y=alt.Y("country:N", title="Countries", sort=None), 136 | color=alt.Color('country:N', title="Country"), 137 | tooltip=[alt.Tooltip('country:N', title='Country'), 138 | alt.Tooltip('per100k:Q', title='Cases per 100k'), 139 | alt.Tooltip('inhabitants:Q', title='Inhabitants [mio]')] 140 | ) 141 | 142 | st.altair_chart(alt.hconcat(c4, alt.vconcat(c2, c3)), use_container_width=True) 143 | 144 | st.markdown(f"""\ 145 |
146 | ⚠️ Please take the CFR with a grain of salt. The ratio is 147 | highly dependent on the total number of tests conducted in a country. In the early stages 148 | of the outbreak, often mainly severe cases with clear symptoms are detected. Thus mild cases 149 | are not recorded, which skews the CFR. 150 |

151 | 152 | """, unsafe_allow_html=True) 153 | 154 | 155 | elif analysis == "By Country": 156 | 157 | confirmed, deaths, recovered = read_data() 158 | 159 | st.header("Country statistics") 160 | st.markdown("""\ 161 | The reported number of active, recovered and deceased COVID-19 cases by country """ 162 | f""" (currently only {', '.join(countries)}). 163 | """ 164 | """ 165 | ℹ️ You can select countries and plot data as cummulative counts or new active cases per day. 166 | """) 167 | 168 | # selections 169 | selection = st.selectbox("Select country:", countries) 170 | cummulative = st.radio("Display type:", ["total", "new cases"]) 171 | #scaletransform = st.radio("Plot y-axis", ["linear", "pow"]) 172 | 173 | confirmed = confirmed[confirmed["Country/Region"] == selection].iloc[:,3:] 174 | confirmed = transform(confirmed, collabel="confirmed") 175 | 176 | deaths = deaths[deaths["Country/Region"] == selection].iloc[:,3:] 177 | deaths = transform(deaths, collabel="deaths") 178 | 179 | recovered = recovered[recovered["Country/Region"] == selection].iloc[:,3:] 180 | recovered = transform(recovered, collabel="recovered") 181 | 182 | 183 | df = reduce(lambda a,b: pd.merge(a,b, on='date'), [confirmed, recovered, deaths]) 184 | df["active"] = df.confirmed - (df.deaths + df.recovered) 185 | 186 | variables = ["recovered", "active", "deaths"] 187 | colors = ["steelblue", "orange", "black"] 188 | 189 | value_vars = variables 190 | SCALE = alt.Scale(domain=variables, range=colors) 191 | if cummulative == 'new cases': 192 | value_vars = ["new"] 193 | df["new"] = df.confirmed - df.shift(1).confirmed 194 | df["new"].loc[df.new < 0] = 0 195 | SCALE = alt.Scale(domain=["new"], range=["orange"]) 196 | 197 | dfm = pd.melt(df.reset_index(), id_vars=["date"], value_vars=value_vars) 198 | 199 | # introduce order col as altair does auto-sort on stacked elements 200 | dfm['order'] = dfm['variable'].replace( 201 | {val: i for i, val in enumerate(variables[::-1])} 202 | ) 203 | 204 | c = alt.Chart(dfm.reset_index()).mark_bar().properties(height=200).encode( 205 | x=alt.X("date:T", title="Date"), 206 | y=alt.Y("sum(value):Q", title="Cases", scale=alt.Scale(type='linear')), 207 | color=alt.Color('variable:N', title="Category", scale=SCALE), #, sort=alt.EncodingSortField('value', order='ascending')), 208 | order='order' 209 | ) 210 | 211 | if cummulative != 'new cases': 212 | st.altair_chart(c, use_container_width=True) 213 | else: 214 | # add smooth 7-day trend 215 | rm_7day = df[['new']].rolling('7D').mean().rename(columns={'new': 'value'}) 216 | c_7day = alt.Chart(rm_7day.reset_index()).properties(height=200).mark_line(strokeDash=[1,1], color='red').encode( 217 | x=alt.X("date:T", title="Date"), 218 | y=alt.Y("value:Q", title="Cases", scale=alt.Scale(type='linear')), 219 | ) 220 | st.altair_chart((c + c_7day), use_container_width=True) 221 | st.markdown(f"""\ 222 |
Daily reported new cases (incl. 7-day average).

223 | """, unsafe_allow_html=True) 224 | 225 | 226 | st.info("""\ 227 | 228 | by: [C. Werner](https://www.christianwerner.net) | source: [GitHub](https://www.github.com/cwerner/covid19) 229 | | data source: [Johns Hopkins Univerity (GitHub)](https://github.com/CSSEGISandData/COVID-19). 230 | """) 231 | 232 | 233 | # ---------------------- 234 | 235 | app() -------------------------------------------------------------------------------- /experiments/data-apps/covid_insights/covid-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/covid_insights/covid-app.png -------------------------------------------------------------------------------- /experiments/data-apps/covid_insights/covid_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import plotly.graph_objects as go 7 | from plotly.subplots import make_subplots 8 | from api.streamlit_experiments import covid as cov 9 | 10 | st.title('COVID Exploratory Data Analysis') 11 | 12 | # Data from https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset?select=covid_19_data.csv 13 | 14 | covid = pd.read_csv('494724_1196190_compressed_covid_19_data.csv.zip') 15 | 16 | # Dropping column as SNo is of no use, and 'Province/State' contains too many missing values 17 | covid.drop(['SNo'], 1, inplace=True) 18 | 19 | st.header('Dataset') 20 | st.write(covid) 21 | 22 | # Converting 'Observation Date' into Datetime format 23 | covid['ObservationDate']=pd.to_datetime(covid['ObservationDate']) 24 | 25 | # Grouping different types of cases as per the date 26 | datewise = covid.groupby(['ObservationDate']).agg({ 27 | 'Confirmed': 'sum', 28 | 'Recovered': 'sum', 29 | 'Deaths': 'sum' 30 | }) 31 | 32 | datewise['Days Since'] = datewise.index-datewise.index.min() 33 | datewise["WeekOfYear"]=datewise.index.weekofyear 34 | 35 | india_data=covid[covid["Country/Region"]=="India"] 36 | datewise_india=india_data.groupby(["ObservationDate"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'}) 37 | datewise_india['Days Since'] = datewise_india.index-datewise.index.min() 38 | datewise_india["WeekOfYear"]=datewise_india.index.weekofyear 39 | 40 | st.header('Global Analysis') 41 | 42 | st.line_chart(datewise[['Confirmed', 'Deaths', 'Recovered']]) 43 | 44 | st.subheader('Global Growth Factor') 45 | cov.growth_factor(datewise) 46 | 47 | st.subheader('India Growth Factor') 48 | cov.growth_factor(datewise_india) 49 | 50 | st.subheader('Global Weekly Growth of Cases') 51 | cov.weekly_increase(datewise) 52 | 53 | st.subheader('India Weekly Growth of Cases') 54 | cov.weekly_increase(datewise_india) 55 | 56 | st.subheader('Global Doubling Rate') 57 | cov.double_days(datewise) 58 | 59 | st.subheader('India Doubling Rate') 60 | cov.double_days(datewise_india) 61 | 62 | st.subheader('Daily Growth') 63 | cov.growth_scatter(datewise) 64 | 65 | st.subheader('Recovery and Mortality') 66 | cov.mortality(datewise) 67 | 68 | st.subheader('Daily Increases Stats') 69 | cov.daily_increase(datewise) 70 | 71 | st.header('Countrywise Analysis') 72 | 73 | #Calculating countrywise Mortality and Recovery Rate 74 | 
countrywise=covid[covid["ObservationDate"]==covid["ObservationDate"].max()].groupby(["Country/Region"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'}).sort_values(["Confirmed"],ascending=False) 75 | countrywise["Mortality"]=(countrywise["Deaths"]/countrywise["Confirmed"])*100 76 | countrywise["Recovery"]=(countrywise["Recovered"]/countrywise["Confirmed"])*100 77 | 78 | fig, (ax1, ax2) = plt.subplots(2, 1,figsize=(10,12)) 79 | top_15_confirmed=countrywise.sort_values(["Confirmed"],ascending=False).head(15) 80 | top_15_deaths=countrywise.sort_values(["Deaths"],ascending=False).head(15) 81 | sns.barplot(x=top_15_confirmed["Confirmed"],y=top_15_confirmed.index,ax=ax1) 82 | ax1.set_title("Top 15 countries as per Number of Confirmed Cases") 83 | sns.barplot(x=top_15_deaths["Deaths"],y=top_15_deaths.index,ax=ax2) 84 | ax2.set_title("Top 15 countries as per Number of Death Cases") 85 | 86 | st.pyplot(fig) 87 | 88 | st.header('India Analysis') 89 | 90 | st.line_chart(datewise_india[['Confirmed', 'Deaths', 'Recovered']]) 91 | 92 | st.write(datewise_india.iloc[-1]) 93 | st.write("Total Active Cases: ",datewise_india["Confirmed"].iloc[-1]-datewise_india["Recovered"].iloc[-1]-datewise_india["Deaths"].iloc[-1]) 94 | st.write("Total Closed Cases: ",datewise_india["Recovered"].iloc[-1]+datewise_india["Deaths"].iloc[-1]) 95 | 96 | st.subheader('India Growth Daily') 97 | cov.growth_scatter(datewise_india) 98 | 99 | st.subheader('India Daily Increase in Cases') 100 | cov.daily_increase(datewise_india) 101 | 102 | st.subheader('India Recovery and Mortality') 103 | cov.mortality(datewise_india) 104 | 105 | st.subheader('India Compared with Other Countries') 106 | 107 | Italy_data=covid[covid["Country/Region"]=="Italy"] 108 | US_data=covid[covid["Country/Region"]=="US"] 109 | spain_data=covid[covid["Country/Region"]=="Spain"] 110 | datewise_Italy=Italy_data.groupby(["ObservationDate"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'}) 111 | datewise_US=US_data.groupby(["ObservationDate"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'}) 112 | datewise_Spain=spain_data.groupby(["ObservationDate"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'}) 113 | 114 | max_ind=datewise_india["Confirmed"].max() 115 | fig = plt.figure(figsize=(12,6)) 116 | plt.plot(datewise_Italy[(datewise_Italy["Confirmed"]>0)&(datewise_Italy["Confirmed"]<=max_ind)]["Confirmed"],label="Confirmed Cases Italy",linewidth=3) 117 | plt.plot(datewise_US[(datewise_US["Confirmed"]>0)&(datewise_US["Confirmed"]<=max_ind)]["Confirmed"],label="Confirmed Cases USA",linewidth=3) 118 | plt.plot(datewise_Spain[(datewise_Spain["Confirmed"]>0)&(datewise_Spain["Confirmed"]<=max_ind)]["Confirmed"],label="Confirmed Cases Spain",linewidth=3) 119 | plt.plot(datewise_india[datewise_india["Confirmed"]>0]["Confirmed"],label="Confirmed Cases India",linewidth=3) 120 | plt.xlabel("Date") 121 | plt.ylabel("Number of Confirmed Cases") 122 | plt.title("Growth of Confirmed Cases") 123 | plt.legend() 124 | plt.xticks(rotation=90) 125 | 126 | st.write("It took",datewise_Italy[(datewise_Italy["Confirmed"]>0)&(datewise_Italy["Confirmed"]<=max_ind)].shape[0],"days in Italy to reach number of Confirmed Cases equivalent to India") 127 | st.write("It took",datewise_US[(datewise_US["Confirmed"]>0)&(datewise_US["Confirmed"]<=max_ind)].shape[0],"days in USA to reach number of Confirmed Cases equivalent to India") 128 | st.write("It took",datewise_Spain[(datewise_Spain["Confirmed"]>0)&(datewise_Spain["Confirmed"]<=max_ind)].shape[0],"days in Spain 
to reach number of Confirmed Cases equivalent to India") 129 | st.write("It took",datewise_india[datewise_india["Confirmed"]>0].shape[0],"days in India to reach",max_ind,"Confirmed Cases") 130 | 131 | st.pyplot(fig) 132 | 133 | -------------------------------------------------------------------------------- /experiments/data-apps/exploratory_data_analysis/__pycache__/eda.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/exploratory_data_analysis/__pycache__/eda.cpython-38.pyc -------------------------------------------------------------------------------- /experiments/data-apps/exploratory_data_analysis/eda_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | from api.streamlit_experiments import eda 4 | 5 | st.header('Exploratory Data Analysis App') 6 | 7 | st.subheader('Dataset') 8 | df = pd.read_csv('census-income.csv') 9 | st.write(df.head(20)) 10 | 11 | st.write(f'Rows, Columns: {df.shape}') 12 | 13 | st.subheader('Correlation') 14 | eda.correlate(df) 15 | 16 | -------------------------------------------------------------------------------- /experiments/data-apps/open_data_explorer/README.md: -------------------------------------------------------------------------------- 1 | # Open Data Explorer 2 | 3 | Apps and API for exploring open data sources including [AWS Registry of Open Data](https://registry.opendata.aws/) which lists datasets for genomics, satellite, transport, COVID, medical imaging, and other use cases in data for social good. 4 | 5 | ## Amazon S3 App 6 | 7 | This experiment provides a starter app and companion API `s3.py` which makes it easy to list and query public S3 buckets for open data. The app also demonstrates how to create and list S3 buckets in your own AWS account. 8 | 9 | ![](s3-app-start.png) 10 | 11 | ### List Bucket Contents 12 | 13 | Get started by running the app `streamlit run experiments/data-apps/open_data_explorer/s3_app.py` from the repository root folder. 14 | 15 | Try the List Bucket Contents feature by typing `nyc-tlc` as the S3 bucket name for accessing the public dataset. Enter `2021-05` to filter bucket contents with a matching string. This will show fewer files in the listing, while still determining the overall size of the public dataset in GB and the number of files. 16 | 17 | ![](list-nyc-tlc-filter.png) 18 | 19 | This feature is useful when exploring public datasets and identifying a relatively small sample dataset file for running your experiments, before committing to a larger dataset. This will save you cost and time in running the cloud experiments. 20 | 21 | ### Query CSV 22 | 23 | Next you can query a public dataset flat file (well-formed CSV) stored in the open data registry directly using SQL, without having to import the dataset into a database. This uses the AWS [S3 Select](https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-glacier-select-sql-reference-select.html) feature. 24 | 25 | ![](query-public-dataset.png) 26 | 27 | Use the same public dataset bucket `nyc-tlc` and copy-paste one of the files listed by the List Bucket Contents feature, like in our case the `trip data/green_tripdata_2020-05.csv` file. Next we add a SQL statement for the query `SELECT * FROM s3object s LIMIT 5` using the sample provided or writing our own.
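For readers curious what the app's Query CSV feature does under the hood, here is a minimal standalone sketch of the underlying S3 Select call made with boto3. The bucket, key, and SQL statement are the example values from above; the `select_csv` helper name is illustrative only and is not the repository's actual `s3.py` API.

```python
import io
import boto3
import pandas as pd

s3 = boto3.client('s3')

def select_csv(bucket, key, sql):
    # Run the SQL expression directly against the CSV object stored in S3
    response = s3.select_object_content(
        Bucket=bucket,
        Key=key,
        Expression=sql,
        ExpressionType='SQL',
        InputSerialization={'CSV': {'FileHeaderInfo': 'Use'}},
        OutputSerialization={'JSON': {}},
    )
    records = []
    for event in response['Payload']:
        if 'Records' in event:
            # Matching rows arrive as newline-delimited JSON payloads
            records.append(event['Records']['Payload'].decode('utf-8'))
        elif 'Stats' in event:
            stats = event['Stats']['Details']
            print(f"Scanned: {int(stats['BytesScanned'])/1024/1024:.2f}MB, "
                  f"Processed: {int(stats['BytesProcessed'])/1024/1024:.2f}MB, "
                  f"Returned: {int(stats['BytesReturned'])/1024/1024:.2f}MB")
    return pd.read_json(io.StringIO(''.join(records)), lines=True)

df = select_csv('nyc-tlc', 'trip data/green_tripdata_2020-05.csv',
                'SELECT * FROM s3object s LIMIT 5')
print(df)
```

Setting `FileHeaderInfo` to `Use` tells S3 Select to treat the first CSV row as column names, so the SQL expression can reference columns by name.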
The query results are displayed along with the size of data scanned, processed, and returned by the query. -------------------------------------------------------------------------------- /experiments/data-apps/open_data_explorer/__pycache__/s3.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/open_data_explorer/__pycache__/s3.cpython-38.pyc -------------------------------------------------------------------------------- /experiments/data-apps/open_data_explorer/list-nyc-tlc-filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/open_data_explorer/list-nyc-tlc-filter.png -------------------------------------------------------------------------------- /experiments/data-apps/open_data_explorer/query-public-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/open_data_explorer/query-public-dataset.png -------------------------------------------------------------------------------- /experiments/data-apps/open_data_explorer/s3-app-start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/data-apps/open_data_explorer/s3-app-start.png -------------------------------------------------------------------------------- /experiments/data-apps/open_data_explorer/s3_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from api.streamlit_experiments import s3 3 | 4 | st.header('Amazon S3 App') 5 | tabs = st.radio('Choose S3 action', 6 | ('List Bucket Contents', 'Query CSV', 'Search Own Buckets', 'Create Own Bucket')) 7 | 8 | if tabs == 'Search Own Buckets': 9 | s3.search_buckets() 10 | elif tabs == 'List Bucket Contents': 11 | s3.list_bucket_contents() 12 | elif tabs == 'Query CSV': 13 | s3.s3_select() 14 | else: 15 | s3.create_bucket() 16 | 17 | -------------------------------------------------------------------------------- /experiments/data-apps/rides/uber_pickups.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | 5 | st.title('Uber pickups in NYC') 6 | 7 | DATE_COLUMN = 'date/time' 8 | DATA_URL = ('https://s3-us-west-2.amazonaws.com/' 9 | 'streamlit-demo-data/uber-raw-data-sep14.csv.gz') 10 | 11 | @st.cache 12 | def load_data(nrows): 13 | data = pd.read_csv(DATA_URL, nrows=nrows) 14 | lowercase = lambda x: str(x).lower() 15 | data.rename(lowercase, axis='columns', inplace=True) 16 | data[DATE_COLUMN] = pd.to_datetime(data[DATE_COLUMN]) 17 | return data 18 | 19 | # Create a text element and let the reader know the data is loading. 20 | data_load_state = st.text('Loading data...') 21 | # Load 10,000 rows of data into the dataframe. 22 | data = load_data(10000) 23 | # Notify the reader that the data was successfully loaded. 24 | data_load_state.text("Done!
(using st.cache)") 25 | 26 | if st.checkbox('Show raw data'): 27 | st.subheader('Raw data') 28 | st.write(data) 29 | 30 | st.subheader('Number of pickups by hour') 31 | 32 | hist_values = np.histogram( 33 | data[DATE_COLUMN].dt.hour, bins=24, range=(0,24))[0] 34 | 35 | st.bar_chart(hist_values) 36 | 37 | hour_to_filter = st.slider('hour', 0, 23, 17) # min: 0h, max: 23h, default: 17h 38 | 39 | filtered_data = data[data[DATE_COLUMN].dt.hour == hour_to_filter] 40 | st.subheader(f'Map of all pickups at {hour_to_filter}:00') 41 | st.map(filtered_data) 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /experiments/data-apps/wine/wine_app.py: -------------------------------------------------------------------------------- 1 | from pycaret.classification import load_model, predict_model 2 | import streamlit as st 3 | import pandas as pd 4 | import numpy as np 5 | 6 | 7 | def predict_quality(model, df): 8 | 9 | predictions_data = predict_model(estimator = model, data = df) 10 | return predictions_data['Label'][0] 11 | 12 | model = load_model('../../notebooks/wine-pycaret/extra_tree_model') 13 | 14 | 15 | st.title('Wine Quality Classifier Web App') 16 | st.write('This is a web app to classify the quality of your wine based on\ 17 | several features that you can see in the sidebar. Please adjust the\ 18 | value of each feature. After that, click on the Predict button at the bottom to\ 19 | see the prediction of the classifier.') 20 | 21 | fixed_acidity = st.sidebar.slider(label = 'Fixed Acidity', min_value = 4.0, 22 | max_value = 16.0 , 23 | value = 10.0, 24 | step = 0.1) 25 | 26 | volatile_acidity = st.sidebar.slider(label = 'Volatile Acidity', min_value = 0.00, 27 | max_value = 2.00 , 28 | value = 1.00, 29 | step = 0.01) 30 | 31 | citric_acid = st.sidebar.slider(label = 'Citric Acid', min_value = 0.00, 32 | max_value = 1.00 , 33 | value = 0.50, 34 | step = 0.01) 35 | 36 | residual_sugar = st.sidebar.slider(label = 'Residual Sugar', min_value = 0.0, 37 | max_value = 16.0 , 38 | value = 8.0, 39 | step = 0.1) 40 | 41 | chlorides = st.sidebar.slider(label = 'Chlorides', min_value = 0.000, 42 | max_value = 1.000 , 43 | value = 0.500, 44 | step = 0.001) 45 | 46 | f_sulf_diox = st.sidebar.slider(label = 'Free Sulfur Dioxide', min_value = 1, 47 | max_value = 72, 48 | value = 36, 49 | step = 1) 50 | 51 | t_sulf_diox = st.sidebar.slider(label = 'Total Sulfur Dioxide', min_value = 6, 52 | max_value = 289 , 53 | value = 144, 54 | step = 1) 55 | 56 | density = st.sidebar.slider(label = 'Density', min_value = 0.0000, 57 | max_value = 2.0000 , 58 | value = 0.9900, 59 | step = 0.0001) 60 | 61 | ph = st.sidebar.slider(label = 'pH', min_value = 2.00, 62 | max_value = 5.00 , 63 | value = 3.00, 64 | step = 0.01) 65 | 66 | sulphates = st.sidebar.slider(label = 'Sulphates', min_value = 0.00, 67 | max_value = 2.00, 68 | value = 0.50, 69 | step = 0.01) 70 | 71 | alcohol = st.sidebar.slider(label = 'Alcohol', min_value = 8.0, 72 | max_value = 15.0, 73 | value = 10.5, 74 | step = 0.1) 75 | 76 | features = {'fixed acidity': fixed_acidity, 'volatile acidity': volatile_acidity, 77 | 'citric acid': citric_acid, 'residual sugar': residual_sugar, 78 | 'chlorides': chlorides, 'free sulfur dioxide': f_sulf_diox, 79 | 'total sulfur dioxide': t_sulf_diox, 'density': density, 80 | 'pH': ph, 'sulphates': sulphates, 'alcohol': alcohol 81 | } 82 | 83 | 84 | features_df = pd.DataFrame([features]) 85 | 86 | st.table(features_df) 87 | 88 | if st.button('Predict'): 89 | 90 | prediction 
= predict_quality(model, features_df) 91 | 92 | st.write(' Based on feature values, your wine quality is '+ str(prediction)) -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/README.md: -------------------------------------------------------------------------------- 1 | # Flying Cars with Glue DataBrew 2 | 3 | ![](splash.png) 4 | 5 | Smarter cities will have smarter transportation including multi-modal, eco-friendly, balancing commuter convenience with safety and social distancing. This 60 minute experiment uses open data for good and low/no code services provided by AWS to enable insights for business model innovation in smart transport use case. The experiment is intended as a step-by-step guided co-innovation and design workshop along with an AWS specialist. If you are familiar with the pre-requisites specified in the Cloud Experiment Guide (last section of this experiment) then feel free to make this experiment your own. 6 | 7 | Let’s say we want to develop a futuristic commuter service using flying cars. We just got a permit to launch in the busy city of New York. We want to start by helping the rush hour commuters and want to identify the busy locations, times, and routes. We want to start by transforming some taxi commuter data to generate insights for our new service. 8 | 9 | Data goes through many processes including acquiring data from third-party sources, generating data, transferring data securely from source to destination, storing data securely and cost effectively, preparing data for analytics and machine learning applications, analysing data, and visualising data. Transforms can be applied to any of these processes as well as to data itself. An example of data transform is converting from one form to another, like from speech audio files to text transcripts, depending on the use case. A process like data preparation using spreadsheets requiring significant manual intervention and subject matter expertise can transform with a purpose built data preparation tool like AWS Glue DataBrew which can automate and speed up several aspects of the process. In fact data and process transforms are intrinsically linked as specialised techniques and tooling evolve. 10 | 11 | Back to our flying cars experiment, we have just the data source that may help us deliver insights quickly. Let’s make our way to one of the largest open data registries on the planet at registry.opendata.aws and search for our dataset. Type `new york` in the search bar and you will find a link for New York City Taxi and Limousine Commission (TLC) Trip Record Data. 12 | 13 | ``` 14 | https://registry.opendata.aws/nyc-tlc-trip-records-pds/ 15 | ``` 16 | 17 | You can browse this dataset in a few ways. You can follow the documentation link to the nyc.gov website and download large (500MB to 1GB) CSV files and make your Excel angry. We will try a more fun and elegant way to get to this data. Let us make a note of where this dataset is stored on a public S3 location. Amazon Simple Storage Service (Amazon S3) is an object storage service. 18 | 19 | You will note a few things on the registry page for the dataset. First the AWS Region where this dataset is store is `us-east-1` and second is the S3 location of the dataset which is `s3://nyc-tlc` and we are ready to start. 20 | 21 | Let us use the AWS Console to create an S3 bucket (or folder, if you prefer) of our own. This is where we will save the results of our data transforms. 
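If you prefer code over the console for this step, here is a minimal boto3 sketch that peeks into the public `nyc-tlc` bucket anonymously and then creates a results bucket of your own in `us-east-1`. It assumes the bucket is still publicly listable as described in this guide; the bucket name below is only an example, since S3 bucket names must be globally unique.

```python
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# An anonymous (unsigned) client is enough to browse the public open data bucket
public_s3 = boto3.client('s3', region_name='us-east-1',
                         config=Config(signature_version=UNSIGNED))
# 'trip data/' is the prefix used for the taxi trip files in this bucket
listing = public_s3.list_objects_v2(Bucket='nyc-tlc', Prefix='trip data/', MaxKeys=10)
for item in listing.get('Contents', []):
    print(f"{item['Key']} ({item['Size']/1024/1024:.0f} MB)")

# Your own bucket for DataBrew and Glue job outputs (uses your AWS credentials)
s3 = boto3.client('s3', region_name='us-east-1')
s3.create_bucket(Bucket='my-flying-cars-results')  # us-east-1 needs no LocationConstraint
```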
First let us ensure we are in the same region as the one where the dataset is located. This will make the data transfers really fast. 22 | 23 | Now type `S3` in the AWS Console search bar and select the service. Press the `Create bucket` orange button and choose a unique name. We chose our bucket name as `high-velocity-innovation` and yes we were the first ones to do so in the whole wide world (smiles). 24 | 25 | Now we search for AWS Glue DataBrew to transform our taxi dataset. AWS Glue DataBrew is a visual data preparation tool that makes it easy for data analysts and data scientists to clean and normalize data to prepare it for analytics and machine learning. 26 | 27 | We start by clicking `Create project` orange button. We enter the name of our project and make sure a new recipe drop down is selected by default. 28 | 29 | ![](create-project.png) 30 | 31 | Scrolling down we select New dataset to import a new dataset from the open data registry. As we start typing the location of the dataset we will notice it lists the files contained within this location. Page through the files and pick a relatively large size file to observe the true capabilities of AWS. We select `yellow_tripdata_2015-05.csv` which is more than 2 GB in size. 32 | 33 | ![](connect-dataset.png) 34 | 35 | Scroll all the way to the bottom and select `Create a new role` in the `Role name` dropdown. We chose `hvi` suffix and clicked `Create project` orange button. 36 | 37 | Within a few seconds we are greeted by a cool animating progress indicator. AWS Glue DataBrew is loading our dataset and analysing it. 38 | 39 | ![](initiate-session.png) 40 | 41 | Within another few seconds we are ready to play with our dataset! What just happened? AWS Glue DataBrew just transformed our comma separated values dataset into a project with new metadata. As you scroll horizontally to see all the columns of the dataset, you will notice the analysis graphs and data on top of the grid telling you patterns found in the sample dataset, including most often found values and distribution of the data. Now click on Schema to change view from Grid and notice that DataBrew has figured out the correct datatypes by analysing the data. The source dataset did not contain any metadata about the schema. 42 | 43 | ![](data-schema.png) 44 | 45 | Now click on Profile tab and then click on the `Run data profile` orange button for more magic! Accept the default values for `Job name` and `Custom sample` while typing in the location of the S3 bucket you created earlier. Select the existing IAM role you created earlier. Click on the `Create and run job` orange button. This will take few tens of seconds to a couple of minutes depending on the size of your dataset and sample. Our 2 GB dataset took 5 minutes to complete the data profile job. Once the job in progress is done, you will see the following results. 46 | 47 | ![](job-results.png) 48 | 49 | The most interesting result is a correlation matrix which indicates relationships between variables in our data. Positive correlation means both variable values rise together and negative means as one increases, the other decreases. This tells us a lot about our dataset. The obvious ones are as `trip_distance` increases there is a strong positive correlation that `fare_amount` also increases. What is not so obvious is some correlation between `fare_amount` and `payment_type` which maybe indicating that passenger prefer cash for small amounts and cards to larger sums. 
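As a quick sanity check outside of DataBrew, you can reproduce a similar correlation matrix on a sample of the same trip file with pandas. This sketch assumes you have downloaded the trip data CSV used in this guide to your local machine; the column names follow the Yellow Taxi data dictionary.

```python
import pandas as pd

# Sample the first 100,000 rows of the trip file used in the DataBrew project
df = pd.read_csv('yellow_tripdata_2015-05.csv', nrows=100_000)

# Pairwise correlations among a few numeric columns of interest
print(df[['trip_distance', 'fare_amount', 'tip_amount', 'payment_type']].corr().round(2))
```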
50 | 51 | ![](corr-matrix.png) 52 | 53 | There are many other transformations you can apply to the dataset. As you apply more transforms (DataBrew calls these Recipes) you can track data lineage visually. 54 | 55 | ![](data-lineage.png) 56 | 57 | The dataset in its present form uses numerical data to represent categories like payment type. This is ideal for training data used in machine learning algorithms. However, for the purpose of manual analysis and more intuitive dashboard creation, we will transform these numerical labels to meaningful values. The data dictionaries are available at nyc.gov website. We will lookup the Yellow Taxi data dictionary. 58 | 59 | Let us find out why there is a correlation between payment type and fare amount. The data dictionary specifies payment types for each numerical value in our dataset. DataBrew helps us create transforms (Recipes) on our dataset to make these changes a breeze. Let us head back to our project. Click on the Recipe icon and then the `Add step` orange button. Use the `Find step` search to select `Create duplicate column` and then select source as `payment_type` and duplicate column name as `payment_type_name` before clicking `Apply` orange button. Not we click the column icon on top of the grid viewer and change type of the newly created column to string. Now we can add another step in our recipe to `Replace value or pattern` and change numerical values to matching strings from the data dictionary. So number 2 becomes Cash according to the data dictionary. Repeat this for all the distinct values found in the sample data. You may notice that your sample of first n rows may not have all the payment types. You can resample your data by clicking on the Sample link just below the title of your project on top left and selecting a larger random sample. Check the distribution of column values again and add more recipe steps as required. Note that it is possible that the particular trip data file you selected does not have all possible variations of values from the data dictionary. The transformed dataset delivers new insights that customers mostly pay by credit card (>60%) followed by cash (>37%). There are very few instances of No charge or Dispute in this dataset. You can resample another random set to see if these insights change. The recipe will reapply to the new sample within a few seconds after loading it in the viewer. The grayed out Publish icon will become active when recipe is applied to the new sample. Easy! You can also publish and version this recipe for future changes. We can continue applying the data dictionary to Vendor ID and Rate Code fields. 60 | 61 | ![](create-recipe.png) 62 | 63 | You can also transform existing data to create new data. Right click on the triple-dots icon next to column name for fare_amount and choose `Binning data` transform to create fare_amount_binning column which groups values into bins by high, medium, low fares. 64 | 65 | ![](data-bins.png) 66 | 67 | Once we are happy with these transforms on the random sample we can get ready to apply these on the entire 2 GB dataset. First we publish another version of the recipe. Next we click on the orange Recipes icon in the left menu bar. Select the recently published recipe. Now we click `Create job with this recipe` button. Choose the project we just created to apply the recipe to the entire attached dataset for this project. When specifying the Job output settings you can choose to compress the output to gzip format. 
This will save space on our local S3 and also is an acceptable format for the next step, which is to take this data into QuickSight for building dashboards. Select the IAM role you created earlier. Hit the `Create and run job` orange button and wait patiently for the job to complete. In our case, this job will apply all 18 steps transformation recipe to millions of rows of our 2 GB dataset. We grab a cuppa coffee and stretch our legs for a few minutes. Our job took 11 minutes to complete and we could click on the output to see a compressed gzip 560 MB file on S3. We are ready for the next step in building our Flying Car service of the future! 68 | 69 | We will now use AWS Glue to prepare our dataset for analytics. Once in Glue console, add a new database. We called ours `hvi` and then went on to select Tables under the Database menu and chose to Add tables using a crawler. The notion of database and tables in Glue refers to metadata. There is no physical database created for you to manage. The source data still sits in your S3 gzipped file. Configure the crawler with following settings and run it. 70 | 71 | ![](crawler-info.png) 72 | 73 | The crawler completes extracting the metadata within a few seconds and creates a new table pointing to the data on S3. Now make your way to Amazon Athena, an interactive query service that makes it easy to analyze data in Amazon S3 using standard SQL. Choose the database you just created from the dropdown on left sidebar, select the table created by the Glue crawler. You can click on the vertical-dots menu next to the table name and select `Preview table` menu. This creates a Structured Query Language (SQL) statement to return the first 10 rows of your data straight from S3 gzipped CSV file. Cool, you did not actually created a physical database which is normally required to run SQL. 74 | 75 | Next we will analyse and visualise our dataset using Amazon QuickSight - a scalable, serverless, embeddable, machine learning-powered business intelligence (BI) service built for the cloud. We search the AWS Console for QuickSight and launch the service in our browser. Click the `New analysis` blue button on top right. Next click `New dataset` and choose Athena as a data source. Select your data source and move the next step to create the analysis. As QuickSight loads the dataset for the first time (subsequent loads are much faster) you will notice there are more than 13 Million records of data in our original 2 GB dataset. To create your first visualisation let us analyse the hypothesis we made earlier on correlation observed between fare amount and payment type. Select `payment_type_name` field and `fare_amoung_binning` field to see the chart. As you change the cart type to stacked bar chart you can notice the correlation that cash payments are higher ratio to credit cards for low fare, lower for medium fare, and lowest for high fare. We just transformed insights from raw data. Can we build our Flying Car service now please! 76 | 77 | ![](quicksight.png) 78 | 79 | We should gain more insights on our data before creating our business plan. Add another visual clicking the plus icon on top left menu. Now select the pickup latitude and longitude fields. Select the Points on map chart, pan, and zoom into New York. Next select trip distance and vendor fields. This single visual tells us most revenue making pickup hotspots we need to serve and competing vendors for our Flying Car service! 80 | 81 | ![](map.png) 82 | 83 | What did we achieve so far? 
We started with 13 million records across 20 attributes or 260 million individual data points which are not humanly conceivable to analyse and find higher order patterns. We ended up with a geo-spatial drill-down visualisation which provided key insights for our problem space. We did not have to write a single line of code during the journey. We did not provision or configure any virtual servers on Cloud as we used serverless capabilities of Glue, Athena, and QuickSight. We did not have to setup or create a database. We could develop 18 data transforms and several visualisation transforms on our dataset using intuitive no/low code tools. All these capabilities gave us tremendous speed over doing this process using traditional development environments or spreadsheets. This results in high velocity decision making when innovating our business model. 84 | 85 | ![](donut-charts.png) 86 | 87 | Leaving this experiment with an assignment for you. Study the pie charts here in conjunction with the map visualisation. Do note that the legend colors for vendors are switched between map and pie charts. Answer this question - which vendor will you want to follow as best practice for your Flying Car service and why? Hope you take a high velocity decision! 88 | 89 | 90 | ## Cloud Experiment Guide 91 | 92 | Supporting reading material to prepare for this cloud experiment. 93 | 94 | **Resources used by this experiment** 95 | 96 | Registry of Open Data on AWS, from where we source the New York Taxi dataset, https://registry.opendata.aws/ 97 | 98 | Yellow Taxi data dictionary, where the New York Taxi data schema (column names) is defined, https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf 99 | 100 | **AWS Services used by this experiment** 101 | 102 | AWS Glue, https://aws.amazon.com/glue/ 103 | 104 | AWS Glue DataBrew, https://aws.amazon.com/glue/features/databrew/ 105 | 106 | Amazon Athena, https://aws.amazon.com/athena/ 107 | 108 | **Other References and Citation** 109 | 110 | The Fast Fourier Transform mini-story refers data point from book by Ian Stewart on 17 equations that changed the world, https://www.amazon.com/Pursuit-Equations-That-Changed-World/dp/0465085989/ 111 | 112 | Uber Business Model, explained in detail by FourWeekMBA, https://fourweekmba.com/uber-business-model/ 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/connect-dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/connect-dataset.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/corr-matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/corr-matrix.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/crawler-info.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/crawler-info.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/create-project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/create-project.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/create-recipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/create-recipe.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/data-bins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/data-bins.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/data-lineage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/data-lineage.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/data-schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/data-schema.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/donut-charts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/donut-charts.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/initiate-session.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/initiate-session.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/job-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/job-results.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/map.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/map.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/quicksight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/quicksight.png -------------------------------------------------------------------------------- /experiments/guides/flying-cars-with-glue-databrew/splash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/guides/flying-cars-with-glue-databrew/splash.png -------------------------------------------------------------------------------- /experiments/notebooks/cloudstory-api/README.md: -------------------------------------------------------------------------------- 1 | ## Cloudstory API and Demo 2 | Cloudstory API and demo notebook using the API. The cloudstory API is documented in the other notebooks available at [AWS Open Data Analytics Notebooks](https://github.com/aws-samples/aws-open-data-analytics-notebooks). -------------------------------------------------------------------------------- /experiments/notebooks/cloudstory-api/cloudstory.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import botocore 3 | import pandas as pd 4 | import numpy as np 5 | import io 6 | import json 7 | import time 8 | import logging 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | from IPython.display import display, Markdown, Image, HTML 12 | from wordcloud import WordCloud 13 | 14 | s3 = boto3.client('s3') 15 | s3_resource = boto3.resource('s3') 16 | glue = boto3.client('glue') 17 | athena = boto3.client('athena') 18 | rekognition = boto3.client('rekognition','us-east-1') 19 | comprehend = boto3.client('comprehend', 'us-east-1') 20 | 21 | # Function library from https://github.com/aws-samples/aws-open-data-analytics-notebooks/tree/master/exploring-data 22 | 23 | def create_bucket(bucket): 24 | try: 25 | s3.create_bucket(Bucket=bucket) 26 | except botocore.exceptions.ClientError as e: 27 | logging.error(e) 28 | return 'Bucket ' + bucket + ' could not be created.' 29 | return 'Created or already exists ' + bucket + ' bucket.' 
30 | 31 | def list_buckets(match=''): 32 | response = s3.list_buckets() 33 | if match: 34 | print(f'Existing buckets containing "{match}" string:') 35 | else: 36 | print('All existing buckets:') 37 | for bucket in response['Buckets']: 38 | if match: 39 | if match in bucket["Name"]: 40 | print(f' {bucket["Name"]}') 41 | 42 | def list_bucket_contents(bucket, match='', size_mb=0): 43 | bucket_resource = s3_resource.Bucket(bucket) 44 | total_size_gb = 0 45 | total_files = 0 46 | match_size_gb = 0 47 | match_files = 0 48 | for key in bucket_resource.objects.all(): 49 | key_size_mb = key.size/1024/1024 50 | total_size_gb += key_size_mb 51 | total_files += 1 52 | list_check = False 53 | if not match: 54 | list_check = True 55 | elif match in key.key: 56 | list_check = True 57 | if list_check and not size_mb: 58 | match_files += 1 59 | match_size_gb += key_size_mb 60 | print(f'{key.key} ({key_size_mb:3.0f}MB)') 61 | elif list_check and key_size_mb <= size_mb: 62 | match_files += 1 63 | match_size_gb += key_size_mb 64 | print(f'{key.key} ({key_size_mb:3.0f}MB)') 65 | 66 | if match: 67 | print(f'Matched file size is {match_size_gb/1024:3.1f}GB with {match_files} files') 68 | 69 | print(f'Bucket {bucket} total size is {total_size_gb/1024:3.1f}GB with {total_files} files') 70 | 71 | def preview_csv_dataset(bucket, key, rows=10): 72 | data_source = { 73 | 'Bucket': bucket, 74 | 'Key': key 75 | } 76 | # Generate the URL to get Key from Bucket 77 | url = s3.generate_presigned_url( 78 | ClientMethod = 'get_object', 79 | Params = data_source 80 | ) 81 | 82 | data = pd.read_csv(url, nrows=rows) 83 | return data 84 | 85 | def key_exists(bucket, key): 86 | try: 87 | s3_resource.Object(bucket, key).load() 88 | except botocore.exceptions.ClientError as e: 89 | if e.response['Error']['Code'] == "404": 90 | # The key does not exist. 91 | return(False) 92 | else: 93 | # Something else has gone wrong. 94 | raise 95 | else: 96 | # The key does exist. 
97 | return(True) 98 | 99 | def copy_among_buckets(from_bucket, from_key, to_bucket, to_key): 100 | if not key_exists(to_bucket, to_key): 101 | s3_resource.meta.client.copy({'Bucket': from_bucket, 'Key': from_key}, 102 | to_bucket, to_key) 103 | print(f'File {to_key} saved to S3 bucket {to_bucket}') 104 | else: 105 | print(f'File {to_key} already exists in S3 bucket {to_bucket}') 106 | 107 | def s3_select(bucket, key, statement): 108 | import io 109 | 110 | s3_select_results = s3.select_object_content( 111 | Bucket=bucket, 112 | Key=key, 113 | Expression=statement, 114 | ExpressionType='SQL', 115 | InputSerialization={'CSV': {"FileHeaderInfo": "Use"}}, 116 | OutputSerialization={'JSON': {}}, 117 | ) 118 | 119 | for event in s3_select_results['Payload']: 120 | if 'Records' in event: 121 | df = pd.read_json(io.StringIO(event['Records']['Payload'].decode('utf-8')), lines=True) 122 | elif 'Stats' in event: 123 | print(f"Scanned: {int(event['Stats']['Details']['BytesScanned'])/1024/1024:5.2f}MB") 124 | print(f"Processed: {int(event['Stats']['Details']['BytesProcessed'])/1024/1024:5.2f}MB") 125 | print(f"Returned: {int(event['Stats']['Details']['BytesReturned'])/1024/1024:5.2f}MB") 126 | return (df) 127 | 128 | 129 | # Function library from https://github.com/aws-samples/aws-open-data-analytics-notebooks/tree/master/optimizing-data 130 | 131 | def list_glue_databases(): 132 | glue_database = glue.get_databases() 133 | 134 | for db in glue_database['DatabaseList']: 135 | print(db['Name']) 136 | 137 | def list_glue_tables(database, verbose=True): 138 | glue_tables = glue.get_tables(DatabaseName=database) 139 | 140 | for table in glue_tables['TableList']: 141 | display(Markdown('**Table: ' + table['Name'] + '**')) 142 | display(Markdown('Location: ' + table['StorageDescriptor']['Location'])) 143 | created = table['CreatedBy'].split('/') 144 | display(Markdown('Created by: ' + created[-1])) 145 | if verbose and created[-1] == 'AWS Crawler': 146 | display(Markdown(f'Records: {int(table["Parameters"]["recordCount"]):,}')) 147 | display(Markdown(f'Average Record Size: {table["Parameters"]["averageRecordSize"]} Bytes')) 148 | display(Markdown(f'Dataset Size: {float(table["Parameters"]["sizeKey"])/1024/1024:3.0f} MB')) 149 | display(Markdown(f'Crawler: {table["Parameters"]["UPDATED_BY_CRAWLER"]}')) 150 | if verbose: 151 | df_columns = pd.DataFrame.from_dict(table["StorageDescriptor"]["Columns"]) 152 | display(df_columns[['Name', 'Type']]) 153 | display(Markdown('---')) 154 | 155 | def athena_query(query, bucket, folder): 156 | output = 's3://' + bucket + '/' + folder + '/' 157 | response = athena.start_query_execution(QueryString=query, 158 | ResultConfiguration={'OutputLocation': output}) 159 | qid = response['QueryExecutionId'] 160 | response = athena.get_query_execution(QueryExecutionId=qid) 161 | state = response['QueryExecution']['Status']['State'] 162 | while state == 'RUNNING': 163 | response = athena.get_query_execution(QueryExecutionId=qid) 164 | state = response['QueryExecution']['Status']['State'] 165 | key = folder + '/' + qid + '.csv' 166 | data_source = {'Bucket': bucket, 'Key': key} 167 | url = s3.generate_presigned_url(ClientMethod = 'get_object', Params = data_source) 168 | data = pd.read_csv(url) 169 | return data 170 | 171 | def heatmap(corr): 172 | sns.set(style="white") 173 | 174 | # Generate a mask for the upper triangle 175 | mask = np.zeros_like(corr, dtype=np.bool) 176 | mask[np.triu_indices_from(mask)] = True 177 | 178 | # Set up the matplotlib figure 179 | f, ax = 
plt.subplots(figsize=(11, 9)) 180 | 181 | # Generate a custom diverging colormap 182 | cmap = sns.diverging_palette(220, 10, as_cmap=True) 183 | 184 | # Draw the heatmap with the mask and correct aspect ratio 185 | sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True, fmt="3.2f", 186 | square=True, linewidths=.5, cbar_kws={"shrink": .5}) 187 | 188 | # Function library from https://github.com/aws-samples/aws-open-data-analytics-notebooks/tree/master/ai-services 189 | 190 | def show_image(bucket, key, img_width = 500): 191 | # [TODO] Load non-public images 192 | return Image(url='https://s3.amazonaws.com/' + bucket + '/' + key, width=img_width) 193 | 194 | def image_labels(bucket, key): 195 | image_object = {'S3Object':{'Bucket': bucket,'Name': key}} 196 | 197 | response = rekognition.detect_labels(Image=image_object) 198 | for label in response['Labels']: 199 | print('{} ({:.0f}%)'.format(label['Name'], label['Confidence'])) 200 | 201 | def image_label_count(bucket, key, match): 202 | image_object = {'S3Object':{'Bucket': bucket,'Name': key}} 203 | 204 | response = rekognition.detect_labels(Image=image_object) 205 | count = 0 206 | for label in response['Labels']: 207 | if match in label['Name']: 208 | for instance in label['Instances']: 209 | count += 1 210 | print(f'Found {match} {count} times.') 211 | 212 | def image_text(bucket, key, sort_column='', parents=True): 213 | response = rekognition.detect_text(Image={'S3Object':{'Bucket':bucket,'Name': key}}) 214 | df = pd.read_json(io.StringIO(json.dumps(response['TextDetections']))) 215 | df['Width'] = df['Geometry'].apply(lambda x: x['BoundingBox']['Width']) 216 | df['Height'] = df['Geometry'].apply(lambda x: x['BoundingBox']['Height']) 217 | df['Left'] = df['Geometry'].apply(lambda x: x['BoundingBox']['Left']) 218 | df['Top'] = df['Geometry'].apply(lambda x: x['BoundingBox']['Top']) 219 | df = df.drop(columns=['Geometry']) 220 | if sort_column: 221 | df = df.sort_values([sort_column]) 222 | if not parents: 223 | df = df[df['ParentId'] > 0] 224 | return df 225 | 226 | def detect_celebs(bucket, key, sort_column=''): 227 | image_object = {'S3Object':{'Bucket': bucket,'Name': key}} 228 | 229 | response = rekognition.recognize_celebrities(Image=image_object) 230 | df = pd.DataFrame(response['CelebrityFaces']) 231 | df['Width'] = df['Face'].apply(lambda x: x['BoundingBox']['Width']) 232 | df['Height'] = df['Face'].apply(lambda x: x['BoundingBox']['Height']) 233 | df['Left'] = df['Face'].apply(lambda x: x['BoundingBox']['Left']) 234 | df['Top'] = df['Face'].apply(lambda x: x['BoundingBox']['Top']) 235 | df = df.drop(columns=['Face']) 236 | if sort_column: 237 | df = df.sort_values([sort_column]) 238 | return(df) 239 | 240 | def comprehend_syntax(text): 241 | response = comprehend.detect_syntax(Text=text, LanguageCode='en') 242 | df = pd.read_json(io.StringIO(json.dumps(response['SyntaxTokens']))) 243 | df['Tag'] = df['PartOfSpeech'].apply(lambda x: x['Tag']) 244 | df['Score'] = df['PartOfSpeech'].apply(lambda x: x['Score']) 245 | df = df.drop(columns=['PartOfSpeech']) 246 | return df 247 | 248 | def comprehend_entities(text): 249 | response = comprehend.detect_entities(Text=text, LanguageCode='en') 250 | df = pd.read_json(io.StringIO(json.dumps(response['Entities']))) 251 | return df 252 | 253 | def comprehend_phrases(text): 254 | response = comprehend.detect_key_phrases(Text=text, LanguageCode='en') 255 | df = pd.read_json(io.StringIO(json.dumps(response['KeyPhrases']))) 256 | return df 257 | 258 | def 
comprehend_sentiment(text): 259 | response = comprehend.detect_sentiment(Text=text, LanguageCode='en') 260 | return response['SentimentScore'] 261 | 262 | def show_video(bucket, key, size=100, autoplay=False, controls=True): 263 | source = f'https://s3.amazonaws.com/{bucket}/{key}' 264 | html = ''' 265 |
266 | 269 |
270 | ''' 271 | html = html.format(size, 272 | ' controls' if controls else '', 273 | ' autoplay' if autoplay else '', 274 | source) 275 | return HTML(html) 276 | 277 | def video_labels_job(bucket, key): 278 | video = {'S3Object': {'Bucket': bucket, 'Name': key}} 279 | response_detect = rekognition.start_label_detection(Video = video) 280 | return response_detect['JobId'] 281 | 282 | 283 | def video_labels_result(jobId): 284 | display('In Progress...') 285 | response_label = rekognition.get_label_detection(JobId=jobId) 286 | while response_label['JobStatus'] == 'IN_PROGRESS': 287 | time.sleep(5) 288 | response_label = rekognition.get_label_detection(JobId=jobId) 289 | 290 | display('Getting Labels...') 291 | display(f"Video Duration (ms): {response_label['VideoMetadata']['DurationMillis']}") 292 | display(f"FrameRate: {int(response_label['VideoMetadata']['FrameRate'])}") 293 | 294 | labels = [] 295 | while response_label: 296 | labels.extend(response_label['Labels']) 297 | if 'NextToken' in response_label: 298 | response_label = rekognition.get_label_detection(JobId=jobId, NextToken=response_label['NextToken']) 299 | else: 300 | response_label = None 301 | 302 | display(f'Succeeded in detecting {len(labels)} labels.') 303 | 304 | df = pd.DataFrame(labels) 305 | df['LabelName'] = df['Label'].apply(lambda x: x['Name']) 306 | df['Score'] = df['Label'].apply(lambda x: round(float(x['Confidence']), 2)) 307 | df['Instances'] = df['Label'].apply(lambda x: len(x['Instances']) if x['Instances'] else 0) 308 | df['ParentsCount'] = df['Label'].apply(lambda x: len(x['Parents'])) 309 | df['Parents'] = df['Label'].apply(lambda x: ', '.join(map(lambda x : x['Name'], x['Parents']))) 310 | df = df.drop(columns=['Label']) 311 | return df 312 | 313 | def video_labels_text(df): 314 | si = io.StringIO() 315 | df['LabelName'].apply(lambda x: si.write(str(x + ' '))) 316 | s = si.getvalue() 317 | si.close() 318 | return s 319 | 320 | def video_labels_wordcloud(text): 321 | # take relative word frequencies into account, lower max_font_size 322 | wordcloud = WordCloud(width = 600, height = 300, background_color = 'black', max_words = len(text), 323 | max_font_size = 30, relative_scaling = .5, colormap = 'Spectral').generate(text) 324 | plt.figure(figsize = (20, 10)) 325 | plt.imshow(wordcloud, interpolation = 'bilinear') 326 | plt.axis("off") 327 | plt.tight_layout(pad = 0) 328 | plt.show() 329 | 330 | def video_labels_search(df, column, match): 331 | df_result = df[df[column].str.contains(match)] 332 | return df_result 333 | 334 | def video_label_stats(df, label): 335 | df_stats = video_labels_search(df, column='LabelName', match=label) 336 | print(f'Displaying stats on number of instances for label "{label}"') 337 | return df_stats.describe() 338 | 339 | def video_persons_job(bucket, key): 340 | video = {'S3Object': {'Bucket': bucket, 'Name': key}} 341 | response_detect = rekognition.start_person_tracking(Video = video) 342 | return response_detect['JobId'] 343 | 344 | def video_persons_result(jobId): 345 | display('In Progress...') 346 | response_person = rekognition.get_person_tracking(JobId=jobId) 347 | while response_person['JobStatus'] == 'IN_PROGRESS': 348 | time.sleep(5) 349 | response_label = rekognition.get_person_tracking(JobId=jobId) 350 | 351 | display('Getting Person Paths...') 352 | display(f"Video Codec: {response_person['VideoMetadata']['Codec']}") 353 | display(f"Video Duration (ms): {str(response_person['VideoMetadata']['DurationMillis'])}") 354 | display(f"Video Format: 
{response_person['VideoMetadata']['Format']}") 355 | display(f"Video FrameRate: {int(response_person['VideoMetadata']['FrameRate'])}") 356 | 357 | persons = [] 358 | while response_person: 359 | persons.extend(response_person['Persons']) 360 | if 'NextToken' in response_person: 361 | response_person = rekognition.get_person_tracking(JobId=jobId, NextToken=response_person['NextToken']) 362 | else: 363 | response_person = None 364 | 365 | display(f'Succeeded in detecting {len(persons)} person paths.') 366 | 367 | df = pd.DataFrame(persons) 368 | df['Left'] = df['Person'].apply(lambda x: round(x['BoundingBox']['Left'], 2) if 'BoundingBox' in x else '') 369 | df['Top'] = df['Person'].apply(lambda x: round(x['BoundingBox']['Top'], 2) if 'BoundingBox' in x else '') 370 | df['Height'] = df['Person'].apply(lambda x: round(x['BoundingBox']['Height'], 2) if 'BoundingBox' in x else '') 371 | df['Width'] = df['Person'].apply(lambda x: round(x['BoundingBox']['Width'], 2) if 'BoundingBox' in x else '') 372 | df['Index'] = df['Person'].apply(lambda x: x['Index']) 373 | df = df.drop(columns=['Person']) 374 | 375 | return df 376 | 377 | def video_person_path(df, person): 378 | df_result = df[df['Index'] == person] 379 | return df_result 380 | 381 | def video_person_timeframe(df, start, end): 382 | df_result = df[(df['Timestamp'] >= start) & (df['Timestamp'] <= end)] 383 | return df_result 384 | 385 | def video_persons_frequency(df): 386 | return df.groupby('Index')['Timestamp'].nunique() -------------------------------------------------------------------------------- /experiments/notebooks/comprehend-medical-ehr/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /experiments/notebooks/comprehend-medical-ehr/contributors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | The project was developed by - 3 | * [Pratyush Choudhury](https://www.linkedin.com/in/pratyushchoudhury/) 4 | * [Avanish Yadav](https://www.linkedin.com/in/avanish-yadav-237936171/) 5 | -------------------------------------------------------------------------------- /experiments/notebooks/comprehend-medical-ehr/images/Medical_Resume_Architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/notebooks/comprehend-medical-ehr/images/Medical_Resume_Architecture.jpg -------------------------------------------------------------------------------- /experiments/notebooks/comprehend-medical-ehr/images/output_19_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/notebooks/comprehend-medical-ehr/images/output_19_0.png -------------------------------------------------------------------------------- /experiments/notebooks/comprehend-medical-ehr/images/output_20_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/notebooks/comprehend-medical-ehr/images/output_20_0.png -------------------------------------------------------------------------------- /experiments/notebooks/comprehend-medical-ehr/images/output_21_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/notebooks/comprehend-medical-ehr/images/output_21_0.png -------------------------------------------------------------------------------- /experiments/notebooks/comprehend-medical-ehr/images/output_22_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/notebooks/comprehend-medical-ehr/images/output_22_0.png -------------------------------------------------------------------------------- /experiments/notebooks/covid/2020-03-24-covid-india-stats.csv: -------------------------------------------------------------------------------- 1 | State,Indian,Foreign,Discharged,Death,Confirmed,Active 2 | Andhra Pradesh,8,0,0,0,8,8 3 | Bihar,3,0,0,1,3,2 4 | Chhattisgarh,1,0,0,0,1,1 5 | Delhi,29,1,6,2,30,22 6 | Gujarat,32,1,0,1,33,32 7 | Haryana,14,14,11,0,28,17 8 | Himachal Pradesh,3,0,0,1,3,2 9 | Karnataka,37,0,3,1,37,33 10 | Kerala,87,8,4,0,95,91 11 | Madhya Pradesh,7,0,0,0,7,7 12 | Maharashtra,86,3,0,2,89,87 13 | Manipur,1,0,0,0,1,1 14 | Odisha,2,0,0,0,2,2 15 | Puducherry,1,0,0,0,1,1 16 | Punjab,29,0,0,1,29,28 17 | Rajasthan,30,2,3,0,32,29 18 | Tamil Nadu,13,2,1,0,15,14 19 | Telengana,25,10,1,0,35,34 20 | Chandigarh,7,0,0,0,7,7 21 | Jammu and Kashmir,4,0,0,0,4,4 22 | Ladakh,13,0,0,0,13,13 23 | Uttar Pradesh,32,1,11,0,33,22 24 | Uttarakhand,3,1,0,0,4,4 25 | West Bengal,9,0,0,1,9,8 26 | 
-------------------------------------------------------------------------------- /experiments/notebooks/covid/README.md: -------------------------------------------------------------------------------- 1 | # COVID-19 Insights for India 2 | 3 | This notebook provides a catalog of open datasets for deriving insights related to COVID-19 and helping open source and open data community to collaborate in fighting this global threat. The notebook provides (a) reusable API to speed up open data analytics related to COVID-19, customized for India however can be adopted for other countries, (b) sample usage of the API, (c) documentation of insights, and (d) catalog of open datasets referenced. 4 | 5 | The notebook is created by aggregating content from hundreds of global contributors, whome we have tried our best to acknowledge, if you note any missed ones, please inform us by creating an issue on this Github repository. The code, links, and datasets are provided on AS-IS basis under open source. This is the work of the individual author and contributors to this repository with no endorsements from any organizations including their own. 6 | 7 | 8 | ```python 9 | %matplotlib inline 10 | import covid as cv 11 | ``` 12 | 13 | 14 | ```python 15 | df = cv.get_today_stats(force=True) 16 | ``` 17 | 18 | Creating stats for today... 19 | Stats file for today saved: 2020-03-24-covid-india-stats.csv 20 | 21 | 22 | 23 | ```python 24 | cv.summary_stats(df) 25 | ``` 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 |
|            | Latest |
|------------|--------|
| Indian     | 476 |
| Foreign    | 43 |
| Discharged | 40 |
| Death      | 10 |
| Confirmed  | 519 |
| Active     | 469 |
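The same totals can be reproduced straight from the cached snapshot committed alongside this notebook — a minimal sketch (not part of the reusable API), assuming the 2020-03-24 CSV sits in the working directory:

```python
import pandas as pd

# Load the cached daily snapshot saved by cv.get_today_stats()
df_snapshot = pd.read_csv('2020-03-24-covid-india-stats.csv')

# Mirror cv.summary_stats(): drop the State column and sum the numeric columns
print(df_snapshot.drop(columns=['State']).sum())
```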
56 | 57 | 58 | 59 | 60 | ```python 61 | cv.display_stats(df) 62 | ``` 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 |
|    | State | Indian | Foreign | Discharged | Death | Confirmed | Active |
|----|-------|--------|---------|------------|-------|-----------|--------|
| 8  | Kerala | 87 | 8 | 4 | 0 | 95 | 91 |
| 10 | Maharashtra | 86 | 3 | 0 | 2 | 89 | 87 |
| 17 | Telengana | 25 | 10 | 1 | 0 | 35 | 34 |
| 7  | Karnataka | 37 | 0 | 3 | 1 | 37 | 33 |
| 4  | Gujarat | 32 | 1 | 0 | 1 | 33 | 32 |
| 15 | Rajasthan | 30 | 2 | 3 | 0 | 32 | 29 |
| 14 | Punjab | 29 | 0 | 0 | 1 | 29 | 28 |
| 21 | Uttar Pradesh | 32 | 1 | 11 | 0 | 33 | 22 |
| 3  | Delhi | 29 | 1 | 6 | 2 | 30 | 22 |
| 5  | Haryana | 14 | 14 | 11 | 0 | 28 | 17 |
| 16 | Tamil Nadu | 13 | 2 | 1 | 0 | 15 | 14 |
| 20 | Ladakh | 13 | 0 | 0 | 0 | 13 | 13 |
| 0  | Andhra Pradesh | 8 | 0 | 0 | 0 | 8 | 8 |
| 23 | West Bengal | 9 | 0 | 0 | 1 | 9 | 8 |
| 9  | Madhya Pradesh | 7 | 0 | 0 | 0 | 7 | 7 |
| 18 | Chandigarh | 7 | 0 | 0 | 0 | 7 | 7 |
| 19 | Jammu and Kashmir | 4 | 0 | 0 | 0 | 4 | 4 |
| 22 | Uttarakhand | 3 | 1 | 0 | 0 | 4 | 4 |
| 1  | Bihar | 3 | 0 | 0 | 1 | 3 | 2 |
| 6  | Himachal Pradesh | 3 | 0 | 0 | 1 | 3 | 2 |
| 12 | Odisha | 2 | 0 | 0 | 0 | 2 | 2 |
| 13 | Puducherry | 1 | 0 | 0 | 0 | 1 | 1 |
| 11 | Manipur | 1 | 0 | 0 | 0 | 1 | 1 |
| 2  | Chhattisgarh | 1 | 0 | 0 | 0 | 1 | 1 |
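For a quicker read of the table above, the most affected states can be charted from the same DataFrame — a minimal sketch (not part of ``covid.py``), assuming ``df`` is the frame returned by ``cv.get_today_stats()``:

```python
import matplotlib.pyplot as plt

# Top 10 states by active cases, using the same sort as cv.display_stats(df)
top10 = df.sort_values(by='Active', ascending=False).head(10)
top10.plot(x='State', y=['Active', 'Discharged', 'Death'], kind='bar', figsize=(12, 5))
plt.title('Top 10 states by active COVID-19 cases (2020-03-24)')
plt.tight_layout()
plt.show()
```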
309 | 310 | 311 | 312 | 313 | ```python 314 | cv.linear_regression(df) 315 | ``` 316 | 317 | 318 | ![](output_5_0.png) 319 | 320 | 321 | ### COVID-19 Open Datasets, Dashboards, and Apps 322 | 323 | 324 | #### India Stats 325 | 326 | 1. [Ministry of Health and Family Welfare - MOHFW](https://www.mohfw.gov.in/) publishes COVID India stats. This notebook pulls the stats from HTML table on site. 327 | 328 | 2. [India Affected People Dataset](http://portal.covid19india.org/) by covid19india.org 329 | 330 | 3. [Patient Travel History](https://api.covid19india.org/travel_history.json) by covid19india.org 331 | 332 | 333 | #### India Dashboards 334 | 335 | 1. Kiprosh [covidout.in dashboard](https://covidout.in/) provides MOHFW stats, daily and cummulative trends. 336 | 337 | 338 | #### India Apps 339 | 340 | 1. [COVID-19 India Cluster Graph Visualization](https://cluster.covid19india.org/) by covid19india.org 341 | 342 | 343 | #### India Hospitals, Testing Labs 344 | 345 | 1. [ICMR](https://icmr.nic.in/what-s-new) List of Government [Laboratories](https://icmr.nic.in/sites/default/files/upload_documents/Govt_Lab_COVID_19_Testing_V2.pdf) for COVID-19 Testing 346 | 347 | 2. Statewise Hospital Beds from [PIB](https://pib.gov.in/PressReleasePage.aspx?PRID=1539877) extracted to [CSV dataset](https://www.kaggle.com/sudalairajkumar/covid19-in-india#HospitalBedsIndia.csv) on Kaggle. 348 | 349 | 350 | #### Census, Demographics 351 | 352 | 1. India rural, urban population and area by states on [Wikipedia](https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_population) extracted to [CSV dataset](https://www.kaggle.com/sudalairajkumar/covid19-in-india#population_india_census2011.csv) on Kaggle. 353 | 354 | 2. [World Bank Indicators](https://data.humdata.org/dataset/world-bank-indicators-of-interest-to-the-covid-19-outbreak) of Interest to the COVID-19 Outbreak. 355 | 356 | 357 | #### Global Stats 358 | 359 | 1. [Geographic distribution of COVID-19 cases worldwide](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide) from European Centre for Disease Prevention and Control available as daily [Excel dataset](https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-03-22.xlsx) (2020-03-22). Replace yyyy-mm-dd suffix on file to get historical/current data. 360 | 361 | 2. Johns Hopkins University [Global Dashboard](https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6) and GitHub [datasets](https://github.com/CSSEGISandData/COVID-19). 362 | 363 | 3. Situational Awareness Dashboard from [World Health Organization](https://experience.arcgis.com/experience/685d0ace521648f8a5beeeee1b9125cd). 364 | 365 | 366 | #### Research 367 | 368 | 1. COVID-19 [Open Research Dataset](https://pages.semanticscholar.org/coronavirus-research) (CORD-19) from Allen Institute for AI. Contains over 44,000 scholarly articles, including over 29,000 with full text, about COVID-19 and the coronavirus family of viruses for use by the global research community. 369 | 370 | 2. NCBI [SARS-CoV-2 Genetic Sequences](https://www.ncbi.nlm.nih.gov/genbank/sars-cov-2-seqs/) 371 | 372 | 3. Nextstrain [Genomic epidemiology of novel coronavirus](https://nextstrain.org/ncov) 373 | 374 | 4. GISAID App for [Genomic epidemiology of hCoV-19](https://www.gisaid.org/epiflu-applications/next-hcov-19-app/) 375 | 376 | 377 | #### News Analysis 378 | 379 | 1. 
ACAPS COVID-19: [Government Measures Dataset](https://data.humdata.org/dataset/acaps-covid19-government-measures-dataset) 380 | 381 | #### Notebooks 382 | 383 | 1. Notebook from Parul Pandey on [Tracking India's Coronavirus Spread](https://www.kaggle.com/parulpandey/tracking-india-s-coronavirus-spread-wip/notebook) compares trends across India, Italy, Korea. 384 | 385 | 2. [COVID-19 Literature Clustering](https://www.kaggle.com/maksimeren/covid-19-literature-clustering) visualizes CORD-19 dataset of over 44,000 scholarly articles. 386 | 387 | 3. [Coronavirus (COVID-19) Visualization & Prediction](https://www.kaggle.com/therealcyberlord/coronavirus-covid-19-visualization-prediction) does timeseries predictive analysis of virus spread based on Johns Hopkins dataset. 388 | 389 | 390 | #### Meta Dataset Sources 391 | 392 | 1. [Registry of Open Data on AWS](https://registry.opendata.aws/) 393 | 394 | 2. [MyGov COVID-19 Solution Challenge / Resources](https://innovate.mygov.in/covid19/#tab6) 395 | 396 | 3. [Covidout Data Sources](https://covidout.in/sources) 397 | 398 | 4. [Kaggle COVID datasets](https://www.kaggle.com/search?q=covid+coronavirus+in%3Adatasets) 399 | 400 | 5. [HDX Datasets on COVID-19 Outbreak](https://data.humdata.org/event/covid-19) 401 | 402 | 6. [api.rootnet.in](https://api.rootnet.in/) multiple official and unofficial India specific datasets as JSON files 403 | 404 | -------------------------------------------------------------------------------- /experiments/notebooks/covid/covid.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import bs4 3 | import pandas as pd 4 | import numpy as np 5 | import seaborn as sns 6 | import matplotlib.pyplot as plt 7 | from datetime import datetime, date, timedelta 8 | import os.path 9 | from os import path 10 | 11 | def linear_regression(df): 12 | fig, ax = plt.subplots(1, 3, figsize=(15,5)) 13 | sns.regplot(x='Confirmed', y='Active', data=df, ax=ax[0]) 14 | sns.regplot(x='Confirmed', y='Discharged', data=df, ax=ax[1]) 15 | sns.regplot(x='Confirmed', y='Death', data=df, ax=ax[2]) 16 | fig.show() 17 | 18 | def highlight_max(s): 19 | is_max = s == s.max() 20 | return ['background-color: pink' if v else '' for v in is_max] 21 | 22 | def summary_stats(df): 23 | summary = df.drop(columns=['State']).sum() 24 | df2 = summary.to_frame() 25 | df2 = df2.rename(columns={0: 'Latest'}) 26 | return df2.style.apply(highlight_max,subset=['Latest']) 27 | 28 | def display_stats(df): 29 | df = df.sort_values(by=['Active'], ascending=False) 30 | return df.style.apply(highlight_max,subset=['Confirmed', 'Active', 'Discharged', 31 | 'Death','Indian','Foreign']) 32 | 33 | def get_today_stats(force = False): 34 | today_file = datetime.now().strftime('%Y-%m-%d') + '-covid-india-stats.csv' 35 | 36 | if path.exists(today_file) and not force: 37 | stats_df= pd.read_csv(today_file) 38 | print('Stats file exists: ' + today_file) 39 | else: 40 | print('Creating stats for today...') 41 | with urllib.request.urlopen('https://www.mohfw.gov.in/') as response: 42 | page = response.read() 43 | html = bs4.BeautifulSoup(page, 'lxml') 44 | 45 | df_cols = [] 46 | # stats page has multiple tables 47 | tables = html.findAll("table", {"class": "table-dark"}) 48 | for table in tables: 49 | # only stats table has rows (tr tag) 50 | if table.thead.tr.th.strong.string == 'S. 
No.': 51 | for th in table.thead.tr: 52 | if th.string: 53 | df_cols.append(th.string.strip()) 54 | else: 55 | df_cols.append(th.strong.text.strip()) 56 | 57 | while '' in df_cols: 58 | df_cols.remove('') 59 | 60 | stats_df = pd.DataFrame(columns = df_cols) 61 | 62 | i = 0 63 | for tr in table.tbody: 64 | df_row = [] 65 | df_data = [] 66 | for td in tr: 67 | if len(df_row) == len(df_cols): 68 | # print(df_row) 69 | stats_df.loc[i] = df_row 70 | i = i + 1 71 | df_row = [] 72 | if type(td) is bs4.element.Tag: 73 | df_row.append(td.string) 74 | 75 | stats_df = stats_df.drop(columns=['S. No.']) 76 | stats_df = stats_df.rename(columns={'Name of State / UT': 'State', 77 | 'Total Confirmed cases (Indian National)': 'Indian', 78 | 'Total Confirmed cases ( Foreign National )': 'Foreign', 79 | 'Cured/Discharged/Migrated': 'Discharged'}) 80 | stats_df['Indian'] = stats_df['Indian'].astype(int) 81 | stats_df['Foreign'] = stats_df['Foreign'].astype(int) 82 | stats_df['Discharged'] = stats_df['Discharged'].astype(int) 83 | stats_df['Death'] = stats_df['Death'].astype(int) 84 | stats_df['Confirmed'] = stats_df['Indian'] + stats_df['Foreign'] 85 | stats_df['Active'] = stats_df['Indian'] + stats_df['Foreign'] - stats_df['Discharged'] - stats_df['Death'] 86 | 87 | stats_df.to_csv(today_file, index=False) 88 | print('Stats file for today saved: ' + today_file) 89 | 90 | return stats_df -------------------------------------------------------------------------------- /experiments/notebooks/covid/output_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/notebooks/covid/output_5_0.png -------------------------------------------------------------------------------- /experiments/notebooks/optimizing-data/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Optimizing data for analysis with Amazon Athena and AWS Glue 3 | 4 | We will continue our open data analytics workflow starting with the AWS Console then moving to using the notebook. Using [AWS Glue](https://aws.amazon.com/glue/) we can automate creating a metadata catalog based on flat files stored on Amazon S3. Glue is a fully managed extract, transform, and load (ETL) service that makes it easy for customers to prepare and load their data for analytics. You can create and run an ETL job with a few clicks in the AWS Management Console. You simply point AWS Glue to your data stored on AWS, and AWS Glue discovers your data and stores the associated metadata (e.g. table definition and schema) in the AWS Glue Data Catalog. Once cataloged, your data is immediately searchable, queryable, and available for ETL. 5 | 6 | ### Glue Data Catalog 7 | 8 | We have sourced the open dataset from the [Registry of Open Data on AWS](https://registry.opendata.aws/). We also stored the data on S3. Now we are ready to extract, transform, and load the data for analytics. We will use AWS Glue service to do this. First step is to create a logical database entry in the data catalog. Note that we are not creating a physical database which requires resources. This is just a metadata placeholder for the flat file we copied into S3. 9 | 10 | > When creating the data catalog name try choosing a name without hyphens and few characters long. This will make SQL queries more readable and also avoid certain errors when running these queries. 
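The same logical database entry can also be created from code rather than the console — a minimal sketch with the boto3 Glue client, assuming the catalog name ``taxicatalog`` used throughout the rest of this guide:

```python
import boto3

glue = boto3.client('glue')

# Create a metadata-only database entry in the Glue Data Catalog;
# no physical database resources are provisioned by this call.
glue.create_database(
    DatabaseInput={
        'Name': 'taxicatalog',
        'Description': 'Catalog of taxi trips data stored on Amazon S3'
    }
)
```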
11 | 12 | ![Glue Data Catalog](https://s3.amazonaws.com/cloudstory/notebooks-media/glue-data-catalog.png) 13 | 14 | We can also setup the notebook for accessing AWS Glue service using the ``Boto3`` Python SDK. The ``pandas`` and ``IPython`` dependencies are imported for output formatting purposes only. We also import ``numpy`` a popular statistical analysis library. Charts and visualizations will be supported by ``seaborn`` and ``matplotlib`` libraries. To access the Glue service API we create a Glue client. 15 | 16 | 17 | ```python 18 | import boto3 19 | import pandas as pd 20 | import numpy as np 21 | from IPython.display import display, Markdown 22 | import seaborn as sns 23 | %matplotlib inline 24 | import matplotlib.pyplot as plt 25 | ``` 26 | 27 | 28 | ```python 29 | glue = boto3.client('glue') 30 | s3 = boto3.client('s3') 31 | ``` 32 | 33 | ### List Glue Databases 34 | We will recreate the AWS Console GUI experience using SDK calls by creating the ``list_glue_databases`` function. We simply get the data catalogs in one statement and iterate over the results in the next one. 35 | 36 | 37 | ```python 38 | def list_glue_databases(): 39 | glue_database = glue.get_databases() 40 | 41 | for db in glue_database['DatabaseList']: 42 | print(db['Name']) 43 | ``` 44 | 45 | 46 | ```python 47 | list_glue_databases() 48 | ``` 49 | 50 | default 51 | odoc 52 | sampledb 53 | taxicatalog 54 | 55 | 56 | ### Glue Crawler 57 | Next, we create a logical table using Glue crawler. This is again just table metadata definition while the actual data is still stored only in the flat file on S3. For this notebook we will define and run the default Glue Crawler to extract and load the metadata schema from our flat file. This requires selection of a data store which is S3 in this case, defining an IAM role for access from Glue to S3, selecting a schedule for the crawler to run repeatedly if required, and output destination of the crawler results. 58 | 59 | > Please ensure that the flat file is stored on S3 within its own folder and you point at the folder when picking the data source during crawler definition. If you point directly to a flat file when running the crawler, it may return zero results when querying using Amazon Athena. 60 | 61 | Glue will pick up folder name for the logical table name. Keeping our data source files in a folder has the added advantage of incremntally updating the folder with updates to the data with more files or updating the original file. Glue will pick up these changes based on crawler run schedule. 62 | 63 | ![Glue Crawler](https://s3.amazonaws.com/cloudstory/notebooks-media/glue-crawler.png) 64 | 65 | ### Glue Table Metadata 66 | This results in extraction of table metadata stored within our data catalog. The schema with data types is extracted and stored in Glue Data Catalog. Note that the default Glue Crawler understands well-formed CSV files with first row as comma-separated list of column names, and next set of rows representing ordered data records. The Glue Crawler automatically guesses data types based on the contents of the flat file. 67 | 68 | ![Table Metadata](https://s3.amazonaws.com/cloudstory/notebooks-media/table-metadata.png) 69 | 70 | ### Transform Data Using Athena 71 | 72 | Transforming big data in notebook environment is not viable. Instead we can use Amazon Athena for large data transforms and bring the results back into our notebook. 
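Behind the console view shown below, the transform is simply a DDL statement submitted to Athena. A minimal sketch of submitting such a statement from the notebook and polling until it completes — the output location is a placeholder, and the reusable ``athena_query`` function defined later in this guide follows the same pattern:

```python
import time
import boto3

athena = boto3.client('athena')

def run_athena_ddl(query, output_location):
    """Submit a DDL/CTAS statement to Athena and block until it finishes."""
    qid = athena.start_query_execution(
        QueryString=query,
        ResultConfiguration={'OutputLocation': output_location}
    )['QueryExecutionId']
    while True:
        state = athena.get_query_execution(QueryExecutionId=qid)['QueryExecution']['Status']['State']
        if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            return state
        time.sleep(2)

# Example (placeholder bucket): run_athena_ddl(create_table_sql, 's3://your-bucket/queries/')
```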
73 | 74 | ![Athena Transform Data](https://s3.amazonaws.com/cloudstory/notebooks-media/athena-transform-data.png) 75 | 76 | We will use following query to create a well formed table transformed from our original table. Note that we specify the output location so that Athena defined WorkGroup location is not used by default. We also specify the format as ``TEXTFILE`` otherwise default ``PARQUET`` format is used which may generate errors when sampling this data. 77 | 78 | ```SQL 79 | CREATE TABLE 80 | IF NOT EXISTS "taxicatalog"."many_trips_well_formed" 81 | WITH ( 82 | external_location = 's3://open-data-analytics-taxi-trips/many-trips-well-formed/', 83 | format = 'TEXTFILE', 84 | field_delimiter = ',' 85 | ) 86 | AS SELECT vendorid AS vendor, 87 | passenger_count AS passengers, 88 | trip_distance AS distance, 89 | ratecodeid AS rate, 90 | pulocationid AS pick_location, 91 | dolocationid AS drop_location, 92 | payment_type AS payment_type, 93 | fare_amount AS fare, 94 | extra AS extra_fare, 95 | mta_tax AS tax, 96 | tip_amount AS tip, 97 | tolls_amount AS toll, 98 | improvement_surcharge AS surcharge, 99 | total_amount AS total_fare, 100 | tpep_pickup_datetime AS pick_when, 101 | tpep_dropoff_datetime AS drop_when 102 | FROM "taxicatalog"."many_trips"; 103 | ``` 104 | 105 | 106 | ### List Glue Tables 107 | In the spirit of AWS Open Data Analytics API we will recreate the AWS Console feature which lists the tables and displays the metadata within one single reusable function. We get the list of table metadata stored within our data catalog by passing the ``database`` parameter. Next we iterate over each table object and display the name, source data file, number of records (estimate), average record size, data size in MB, and the name of the crawler used to extract the table metadata. We also display the list of column names and data types extracted as schema from the flat file stored on S3. 
108 | 109 | 110 | ```python 111 | def list_glue_tables(database, verbose=True): 112 | glue_tables = glue.get_tables(DatabaseName=database) 113 | 114 | for table in glue_tables['TableList']: 115 | display(Markdown('**Table: ' + table['Name'] + '**')) 116 | display(Markdown('Location: ' + table['StorageDescriptor']['Location'])) 117 | created = table['CreatedBy'].split('/') 118 | display(Markdown('Created by: ' + created[-1])) 119 | if verbose and created[-1] == 'AWS Crawler': 120 | display(Markdown(f'Records: {int(table["Parameters"]["recordCount"]):,}')) 121 | display(Markdown(f'Average Record Size: {table["Parameters"]["averageRecordSize"]} Bytes')) 122 | display(Markdown(f'Dataset Size: {float(table["Parameters"]["sizeKey"])/1024/1024:3.0f} MB')) 123 | display(Markdown(f'Crawler: {table["Parameters"]["UPDATED_BY_CRAWLER"]}')) 124 | if verbose: 125 | df_columns = pd.DataFrame.from_dict(table["StorageDescriptor"]["Columns"]) 126 | display(df_columns[['Name', 'Type']]) 127 | display(Markdown('---')) 128 | ``` 129 | 130 | 131 | ```python 132 | list_glue_tables('taxicatalog', verbose=False) 133 | ``` 134 | 135 | 136 | **Table: many_trips** 137 | 138 | 139 | 140 | Location: s3://open-data-analytics-taxi-trips/many-trips/ 141 | 142 | 143 | 144 | Created by: AWS-Crawler 145 | 146 | 147 | 148 | **Table: many_trips_well_formed** 149 | 150 | 151 | 152 | Location: s3://open-data-analytics-taxi-trips/many-trips-well-formed 153 | 154 | 155 | 156 | Created by: manav 157 | 158 | 159 | 160 | ```python 161 | athena = boto3.client('athena') 162 | ``` 163 | 164 | ### Athena Query 165 | Our next action is to bring the data created within Athena into the notebook environment using a ``pandas`` DataFrame. This can be done using the ``athena_query`` function which calls the Amazon Athena API to execute a query and store the output within a bucket and folder. This output is then read by a DataFrame which is returned by the function. 166 | 167 | 168 | ```python 169 | def athena_query(query, bucket, folder): 170 | output = 's3://' + bucket + '/' + folder + '/' 171 | response = athena.start_query_execution(QueryString=query, 172 | ResultConfiguration={'OutputLocation': output}) 173 | qid = response['QueryExecutionId'] 174 | response = athena.get_query_execution(QueryExecutionId=qid) 175 | state = response['QueryExecution']['Status']['State'] 176 | while state == 'RUNNING': 177 | response = athena.get_query_execution(QueryExecutionId=qid) 178 | state = response['QueryExecution']['Status']['State'] 179 | key = folder + '/' + qid + '.csv' 180 | data_source = {'Bucket': bucket, 'Key': key} 181 | url = s3.generate_presigned_url(ClientMethod = 'get_object', Params = data_source) 182 | data = pd.read_csv(url) 183 | return data 184 | ``` 185 | 186 | To explore the data within Athena we will query returning thousand random samples. 187 | 188 | 189 | ```python 190 | bucket = 'open-data-analytics-taxi-trips' 191 | folder = 'queries' 192 | query = 'SELECT * FROM "taxicatalog"."many_trips_well_formed" TABLESAMPLE BERNOULLI(100) LIMIT 1000;' 193 | 194 | df = athena_query(query, bucket, folder) 195 | df.head() 196 | ``` 197 | 198 | 199 | 200 | 201 |
202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 |
|   | vendor | passengers | distance | rate | pick_location | drop_location | payment_type | fare | extra_fare | tax | tip | toll | surcharge | total_fare | pick_when | drop_when |
|---|--------|------------|----------|------|---------------|---------------|--------------|------|------------|-----|-----|------|-----------|------------|-----------|-----------|
| 0 | 2 | 1 | 1.25 | 1 | 237 | 236 | 1 | 9.0 | 0.0 | 0.5 | 0.00 | 0.0 | 0.3 | 9.80 | 2018-06-06 10:43:34 | 2018-06-06 10:54:58 |
| 1 | 1 | 1 | 1.20 | 1 | 158 | 90 | 2 | 7.5 | 0.0 | 0.5 | 0.00 | 0.0 | 0.3 | 8.30 | 2018-06-06 10:06:22 | 2018-06-06 10:15:21 |
| 2 | 1 | 1 | 3.30 | 1 | 234 | 236 | 1 | 17.0 | 0.0 | 0.5 | 3.55 | 0.0 | 0.3 | 21.35 | 2018-06-06 10:17:20 | 2018-06-06 10:43:07 |
| 3 | 1 | 1 | 0.90 | 1 | 236 | 140 | 1 | 7.0 | 0.0 | 0.5 | 1.55 | 0.0 | 0.3 | 9.35 | 2018-06-06 10:48:28 | 2018-06-06 10:57:08 |
| 4 | 1 | 1 | 1.00 | 1 | 141 | 162 | 1 | 7.0 | 0.0 | 0.5 | 1.95 | 0.0 | 0.3 | 9.75 | 2018-06-06 10:59:28 | 2018-06-06 11:08:05 |
322 |
323 | 324 | 325 | 326 | Next we will determine statistical correlation between various features (columns) within the given set of samples (records). 327 | 328 | 329 | ```python 330 | corr = df.corr(method ='spearman') 331 | corr 332 | ``` 333 | 334 | 335 | 336 | 337 |
338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 |
|               | vendor | passengers | distance | rate | pick_location | drop_location | payment_type | fare | extra_fare | tax | tip | toll | surcharge | total_fare |
|---------------|--------|------------|----------|------|---------------|---------------|--------------|------|------------|-----|-----|------|-----------|------------|
| vendor        | 1.000000 | 0.283619 | 0.015401 | 0.055897 | -0.024097 | -0.005115 | 0.013634 | -0.014083 | NaN | -0.009308 | -0.024436 | 0.025840 | NaN | -0.015078 |
| passengers    | 0.283619 | 1.000000 | 0.033053 | 0.051624 | -0.021166 | 0.003783 | 0.020200 | 0.035106 | NaN | -0.022736 | 0.008936 | 0.003765 | NaN | 0.033289 |
| distance      | 0.015401 | 0.033053 | 1.000000 | 0.119010 | -0.119491 | -0.148011 | -0.068732 | 0.917127 | NaN | -0.080828 | 0.389773 | 0.401863 | NaN | 0.903529 |
| rate          | 0.055897 | 0.051624 | 0.119010 | 1.000000 | -0.042557 | -0.053956 | -0.007774 | 0.185992 | NaN | -0.501256 | 0.083778 | 0.246460 | NaN | 0.184445 |
| pick_location | -0.024097 | -0.021166 | -0.119491 | -0.042557 | 1.000000 | 0.150656 | -0.009998 | -0.129692 | NaN | 0.010869 | -0.028087 | -0.153488 | NaN | -0.127936 |
| drop_location | -0.005115 | 0.003783 | -0.148011 | -0.053956 | 0.150656 | 1.000000 | 0.003079 | -0.162211 | NaN | 0.090225 | -0.042135 | -0.087721 | NaN | -0.154017 |
| payment_type  | 0.013634 | 0.020200 | -0.068732 | -0.007774 | -0.009998 | 0.003079 | 1.000000 | -0.073051 | NaN | -0.015087 | -0.776507 | -0.068458 | NaN | -0.212893 |
| fare          | -0.014083 | 0.035106 | 0.917127 | 0.185992 | -0.129692 | -0.162211 | -0.073051 | 1.000000 | NaN | -0.091508 | 0.425216 | 0.395950 | NaN | 0.983444 |
| extra_fare    | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| tax           | -0.009308 | -0.022736 | -0.080828 | -0.501256 | 0.010869 | 0.090225 | -0.015087 | -0.091508 | NaN | 1.000000 | 0.012988 | -0.148891 | NaN | -0.089695 |
| tip           | -0.024436 | 0.008936 | 0.389773 | 0.083778 | -0.028087 | -0.042135 | -0.776507 | 0.425216 | NaN | 0.012988 | 1.000000 | 0.267483 | NaN | 0.555170 |
| toll          | 0.025840 | 0.003765 | 0.401863 | 0.246460 | -0.153488 | -0.087721 | -0.068458 | 0.395950 | NaN | -0.148891 | 0.267483 | 1.000000 | NaN | 0.403146 |
| surcharge     | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| total_fare    | -0.015078 | 0.033289 | 0.903529 | 0.184445 | -0.127936 | -0.154017 | -0.212893 | 0.983444 | NaN | -0.089695 | 0.555170 | 0.403146 | NaN | 1.000000 |
599 |
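The all-``NaN`` rows and columns above (``extra_fare`` and ``surcharge``) appear because those features are constant (single-valued) in this sample, so a rank correlation is undefined for them. A quick check before dropping such columns — a sketch, assuming ``df`` is the sampled DataFrame returned by ``athena_query``:

```python
# Columns with a single unique value carry no rank information,
# so their Spearman correlation with every other column is NaN.
constant_columns = [col for col in df.columns if df[col].nunique(dropna=False) <= 1]
print(constant_columns)
```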
600 | 601 | 602 | 603 | We can drop features which show ``NaN`` correlation. 604 | 605 | 606 | ```python 607 | df = df.drop(columns=['surcharge']) 608 | ``` 609 | 610 | 611 | ```python 612 | corr = df.corr(method ='spearman') 613 | ``` 614 | 615 | ### Heatmap 616 | Completing the data science workflow from sourcing big data, wrangling it using Amazon Athena to well formed schema, bringing adequate sample data from Athena to notebook environment, conducting exploratory data analysis, and finally visualizing the results. 617 | 618 | 619 | ```python 620 | def heatmap(corr): 621 | sns.set(style="white") 622 | 623 | # Generate a mask for the upper triangle 624 | mask = np.zeros_like(corr, dtype=np.bool) 625 | mask[np.triu_indices_from(mask)] = True 626 | 627 | # Set up the matplotlib figure 628 | f, ax = plt.subplots(figsize=(11, 9)) 629 | 630 | # Generate a custom diverging colormap 631 | cmap = sns.diverging_palette(220, 10, as_cmap=True) 632 | 633 | # Draw the heatmap with the mask and correct aspect ratio 634 | sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True, fmt="3.2f", 635 | square=True, linewidths=.5, cbar_kws={"shrink": .5}) 636 | ``` 637 | 638 | 639 | ```python 640 | heatmap(corr) 641 | ``` 642 | 643 | 644 | ![Seaborn Correlation Plot](https://s3.amazonaws.com/cloudstory/notebooks-media/seaborn-corr.png) 645 | 646 | -------------------------------------------------------------------------------- /experiments/notebooks/wine-pycaret/extra_tree_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/cloud-experiments/a81581dea9ad89b6120f57c2be260f889a0bed27/experiments/notebooks/wine-pycaret/extra_tree_model.pkl --------------------------------------------------------------------------------