├── README.md ├── diagram ├── ImmersionDay-ServerlessDataLake-RevPB3.xml └── README.md ├── lab ├── README.md ├── Serverless Data Lake Day Lab-PREP.docx ├── Serverless Data Lake Day Lab1-KDG-KFH-S3.docx ├── Serverless Data Lake Day Lab2-1to2-3-Glue-DataCatalogTransform.docx ├── Serverless Data Lake Day Lab2-4-OpenData-GDELT.docx └── Serverless Data Lake Day Lab3-Athena-QS-and-3PP.docx ├── presentation ├── AWSLoft-BuildingAServerlessDataLake-RevA-2019-03-07-c.pdf └── README.md └── scripts ├── ImmersionDay-ServerlessDataLake-RevPB3.xml ├── Lab 2.3 - Advanced Data Preparation with Developer Endpoints and Notebook - PA5-2019-12-02.ipynb ├── README.md ├── sdlimmersionlab-IAMuser-policy-PA2.json └── serverlessDataLakeImmersionIAMcf.json /README.md: -------------------------------------------------------------------------------- 1 | # aws-serverless-data-lake-workshop 2 | 3 | This workshop gives customers hands-on experience with the AWS services listed below. The Serverless Data Lake workshop helps customers build a cloud-native and future-proof serverless data lake architecture. It allows hands-on time with AWS big data and analytics services, including Amazon Kinesis services for streaming data ingestion and analytics, AWS Glue for ETL and Data Catalog management, and Amazon Athena for querying the data lake. 
4 | -------------------------------------------------------------------------------- /diagram/ImmersionDay-ServerlessDataLake-RevPB3.xml: -------------------------------------------------------------------------------- 1 | 7T1pk5pK178mVfd5q5Ki2fkICIqioLh/SSEgoGyyiPrr38ZlZgQmYyZOZjKJ91YGG2iasy99jl8w3t81Yz1yuqFpeV9QxNx9wRpfUBSQFAL/FCP70wiKgPOIHbvm+arHAc09WOfBy2WZa1rJ1YVpGHqpG10PGmEQWEZ6NabHcZhfX7YMveunRrptVQY0Q/eqoxPXTJ3TKI1Sj+Mty7Wdy5MByZzOLHRjbcdhFpyf9wXFlsfP6bSvX+Y6v2ji6GaYPxnChC8YH4dhejryd7zlFcC9gO10n/jM2Yd1x1aQ3nQDhZ9u2epeZl3WfFxZur9A4/g+VnEH+IJxemycEQaxhXGmnjgP55I0DtcWH3phfLwVQ44feGbpet5lPAgDeDtnx7rpwnWWhosJVT1NrTg4zgqJ52HmCzIgnLjqq57ffmvFqbV7MnR+9aYV+lYa7+El57P4BQ1nOv0KkPNA/gTrlzHnCcYpkjhT25nS7IfJH6END84Afwb4OPMXAx9cwx6lqqBHiRrQPyDtV0CP/sWAx5FryJNMDdHjNZAH9B0gj/3FkAdlcYPeSPMYfgfI3yDpoTqKikPXPypIzkl97wzq4i1dqCFZz7ULQKVh9GRU1heWp4aJm7phcXYRpmnowwu84gT3oBWfoOqsFzHu+DA2iU6KvECefvmydHcFrrnzehpOmhYWAFu8MyoaZoB+c6ENsHQhvcTfDPhEVDT1VId/ivEE/tV9/RAGX/U8+ZqkVmC4XjF6FPRiD57QrHjrGtZ3DT7SXbrGdyOM9t+/sxPtO++FmQmRRH+LAvsejIeX8U/XqZsq+u/Bd+AG9FuByRbW0zVzPHDbE2qoYTySPDLes3CyzCubqwqlJ0Co44HLWGx5eupury21Osicn6CGLlzJAxIgM30jruUfdT1HEmaxYZ1ve2o7lWbCQWUm8lvJLEj12LbSylxHZD28+m1y8wZbYRk+SrczTnj+F3n3yQkDotSKC/48z/Mw8IQywiz13ACSxsUyL0tjuDD++KkjI5KmAYc9L6wvww03hrOflhWEcUE3NSL7Is78nV04Kt8CK83DeJ18SyDTW/H3JA3jk6C7A3fT18yNYWiFtwFC1GhV5A6mJF5Vqx2IhcSFwg4RAn3hQS5GETaC4rNMNbUYe2B85BZKqSewH5FJGTVQPmPf1qclf7dOC/6uR1GVeESCJjD8eRK5i55GSpxNAHAZeYJPEv9Wg1CSKkuBV2GUqGBUg0iGREZ68EHcIoZHdnEU63nxwnFo8npaPW1tISCS6vh/bU3p/a86XqGPF7H/opVwzZmgjjReJsLaK+rIyPyWYFxmrK20Sjw4SpMGUSd5CIpFG3Sd8XEHgnowsfcX1VFV/CSoJSbsG408/eB3IC2yQlrN4ghF/hOG8v8+sICwjwt+B4lAU9cIBHU+E1qHQIa5A8KoW2XBbukfBX31zIN8QJ4VCKoebzIr/ScTfotMoEokhSPMe8qEqnkp9ZqCNqyg/gJe6E0VeIys2IVPKxB2HFIfv3O546aWFulGcX0OkXFNGEdgXnz7O8U2yJKLVcOnaF1s4x4u1sVDecqmQ2Ug/FkgJGnyRRACGn8jEIIKCNWBwgua9mcBkSaodwRiNbo5EFRlMDwpBt0/6t7jv3BkLGkjVpbmfxiZAgRlXmZ1BLwNiMkqiCvg+0E0BXkhmkIJLCkUFxVBE+08ZeHmhnYY6J7wOPrh4y0kWvaLS+C/NdxCoWV9WZromVgLxIG+f3JZVFyQPL/g8nMuC/7ZdT1S0mkFrw38k
DdEzH8PqUEKi/fTYs5vCEpcBmbHAQIDl4EncqHxJ0QEKwgEJZP9VgotOwkoWproXhSK1S/4WQrFfo6i6V+8/vLed+OAtw1diyLDYNinELZUOSINkNeK20pwG4By4vVu5FxZNPFjgquurXTHr5NcNfr2T7/X4oIB15YuBkrBz5ulJ1LWo7flUn6W3CoLfoHYyuvC7k1qNyR2qqR2k0D76LRD0kQ5zo6/knqeTHX/3BtVtfafib8tXQ8aPi+G4E456n9RtuOZN42y0UTZVnrPyDtVDeRWkP55d79gxHVk6UGMvLjlq2zEvAr2dXL2xIJJpAcXJhxYEVTEbrHhoyZYwknwH22fpJb/EEGHD346wQ18/S9P/455egAlUUnrYAhTk919u2w9XY1yfiE4NnWsQP9CNOoo6EWa+jhJOv30Hu+SuAco8426Ri6KYA9DT7BLkXVCn6rs4HkVfqv2AsRvP3ONdXJ8For8x0n/O6L6w6Jx87jcCioRhKPEt0YlRl8zKSCrmrtGWZB30BV0NUPeOJpsiOpGVoGgD4y4wrj8Hl3W+U77Z6iSiMUft9Q8xR9Vx4T0DxyJmzFIv2xp/dJ2R4ZBPokDj5Y3O+FYxZe6OWqEX5B34dpyocT9vLJLvucvdJxpnLmX4/xkqjdAUY2h85egqJKffDWGHmd6AwTVWCq/hiAaKf6rqh3m+PkDEFfepQ+ICoPcjDkUe3GuO+KyLlF49hvKjkThgV5hmdxk4eXE1+QYcWCLYAAd7R5PXmbRIusY4pL1vRU/cYNPs14/CQ4vnnVaoD2R/oiezvRWE86oWEVl48l3TfOof+u2JjzGWH6YoHx9aUfVVAVUDcneo5SNeVV27CYW/tNY9WspRPdqPi1PdEcmrUssvQmTcnpqOP+Y9AdMCmpKTt+OTauhXy7z1nBE8ouIIzxQosIjPNW3nQOPwSKJauM/7ESDd/ByEY9cQpijSJGRqL9KO4fPUKQ5ee6ax2oR1raCmjKC8wMnF6oK7acXl1b7YX1j9wjr79buCPKKkSIgRFFK85a+MZQAZXcLpW6vLcHuQIlV1/iZbNZ/LbcIvboGPFvEP/7tDP89O8OxindHvmfWiqnLnNwvlsLz56zVhzc3SjsBiIv39PNbGUFpIvB2bsFDT4ufKfy+R133MyK7trb7qvz7Uu19zJJj7OkrKh6LsHl3zCmDHOk07ZCFn542coSRDY+EfvE949kZ/MMPlvIXlFPgYXuOeEJ/PAg5C8z0kTvGBnPr4Cm+Mp/7NjqajCOnOZ+MpbXGiuiGdQdjzR8M2h67G4iTwdBr0H6Uay2NTbvd/m7LcrZ/4HvdZmp29r1eu78HvmFO6SJmzPUY6hBsEYvAoLrngIrFqh20DFUvys3V6XY711HElGZ9OzZsieyG0WImDRC243bZfiqJESuydph4oYz0O25f4AdgvJFYh2MGY0pZbina1wgjE/jtFB4aWO+AxpzDopi1bQcIow7WwbA5t5uK77U7mpC1QpJ1W0MLmbYcPOPsWdNTolUHmh1iB0wanLqC8oFrdIcboG+LmvrlFtsZE1nEJyAdTqaENVx32ll3s8ibUm86BoYs7YXVDE03HSecNa2FsOsOhe5enqeQpDhtn8xXfakH7Qxuu6I2BRTooHvoHvwV59jrfNiwcSVQqIict+Sd3+ZtoAXDtsd30UU3VOKNHm6U1WbGFBif9/tItxEMofwRoeQvRrprrTfI4axL1WEPBF3QBfzWcmbjDW01Wr3gwCzjJanu8d2a8aztNJq2QTdoa3N34WOqhZnA7DUMzCBUB5+IOyLJ57PJ3G9nid7cZXKOG6pLt7h83oQIFTW102s4gh+M9Um0mXVRQ5F3uiE4kgbttlgnx5v1HDUP0zFpBaHbbfB9kmy2/P0a9MbTSZ/sbDbr2MhaYmOpHvC1sS1Aso0EEzMwG6TL6fkVxC2UANwhZigGIZE0nnWIdqb7iTaVAbO2ZaGdLXxCWkn7kdtft1W+xx33Z
YrLaTymRtABEJltdyPxdk519l2PtINiwuVkC/+d+Z3iqTJjigNkKZsjr62N96jeE6DZnRDqirSg3BObosj1R25nn8jt5jKgaTnDKCZLVvJ+18OodOdAfkKXmb2Ft4uswEmLZp4wudpv9JHeYGRCInM7btbZr6S8C7beEk566OMDddq1Qe7grEVDg5ULQrK7GuPcdA1IsGgTVjpdZBP4at1Nh8xWayjTOEOx7SOk4BQNXN0iPbYPzMWEDszcaLO0OkSjGL5oFmfU3pS1tb02LX+1wFJCEZ3ZJOpkhSznesG8OcgVdx25+j6NjYDNBQ4VW/DUND/YrtRB03g6RCkx6EAyXXOD0Ujs8R1R24a5JDREju33G5ACCyeJZxPObXiENd4pg+EWW6StrDPyh/NIaLszHxMXBLfjIYn3iO4AMVvT/cKaOngCZdd8bzpUexSPsTZlYiam+xDHcFaWa7MN/jBbokK6msU9OmqGPJLKwzx2jIm4lfCsJeSI3nGlIEYpzhcd3BiyaoyQEKwtyAK+EHW0wOV6hZqwiIYcaqLuBPwOau7hyJOy3Z4MeZLq7JJ5Ux4VCiNqsHnXaAS6hnpzemj4PZZvu2Ho2kprQyabXaNDzeMNnmXBgehGCGnOIijo9abj9AeC0fJbXLLogEQbqqt+OOqOmk5MNsklboJl2ETh7Gv9sE3p3YYLbYUN8bZj073hZOy5zPLgq6w2THe4PRyD3qAP+YZcmNCyHEhDgVpnbV4T4r7hU9bUG8xlSMUiGu3xttFq7PAcYVljHK5W67YyhHJApNS0GQyVtkOfUDRL8bHL+ftw09mucEZdoCNvPhG9TDnxQ3PDOEMu300aDWTZ2tE8S3gztLvCFgTjU0SBaHq2oAphsx3uGWvb5DJGnQJv6ksEeejsNxMvMg6zrEnsKG0tpjk7U13W7osB2B92rrQYyMM15S9DpWHDV0XzdKojoLdaNnJaYfek0gqIbNGEYganrZ44hDzMqUprlYfihrFnAocdOsGweGvfSAazAEoXditM0aK3yTwJp5Dow40DqXsD4gXXTyZJkwZzIxxtt+1xu5BZfrZOfd6YLcn5ntf2rMZBk1eFJ5zG9pBiVNY3G3bYCbNDspA7INWnZ/JeQjVgbTsF+qSN7BJ+PGJJeTd2ZzyBLykFS/3RbjXzpbCtTcSVhPasubHjrBU/OYJ/KbiUeZyK8zvAyowG1tyQm00Y83mXD1N5Mo3ZBePIE2iEBep+5A0HrCEd5jSx7ilCY8OGRmfCbBckdShkvhkQpI+lwDSyYNCOxdMqp5vdIkUoYWdMd/HGRftgsxdaKWWhpMpyTaqxFnJ+TvMZsxoDbYQtJrtE7HGtdS/QkET2EjGnlENGyP1IDVaH9nIkqZ4tyk0n1GdbzMV9KSuEsrxfRwg17nIDR1qmeNZxgbPcgIazxmRnJ3Lr+X6mEcX7MhbjmvkwghJ6v3YkqBbVAOtIChTYKyh4RZyNBUrrtg8uNMy4FKXMtU+3Nn1iGTSnErIZy9bU6jsJzS/RpZOHY5pbdHnBLHDfStCFOh8gwlwR4r2DchEqgrE9m7Ji3tra09yhW4aNwdfXVweo49v8IaGUBT9JQjVujToxzY4xz5pPlZ4l6Vabpcws7xs9b7AKejhhhOP1nLRkbzsNR0pr3IppD58kk3jcQXurGa3gI3Ew2a1iTdYZPRlNskajtRL3yaYj94c7Qc8lz22II4YfyKxmdbGZKdpDy+7tKRMv1HbWwigzIeQl1Ug2YyXgffhaOSI3zTHYYlssThqDvCMFwygPVw7ObJyFbLUi0nCAsVj7ltoK+H6z5TbFvgYHjQzTe3hOIcpEbTZEcTqmTYHvcL3hCIizSdMB0sC0BmywslHlINPQxFHG09a221i2+hcyh+pB5Ga92dTDMwYHMw2K6gG913tWtmRxecIrs9yyJqntWcBaqr6hWxsWEZZLiRU32zZUp5LL4N1tm5dce9vD/X4Xs2dG3vDxEc2mmaR5/
Z2QmRzroKAna6ME8KuCQ8SDj0AloxAUydtSF1/h3T4dje2W53QnYjI3IhJqrUM8jArNPmUFrdcHmiqkbQiogxNudGinzAb7SSJoVEZkYxEaaL6pYJaCZ+oKvp6Zu40hOYMjGbpGhWbeyxQZ729lxT7QcUdbt+nE305lGxfxTHY7UNWslsEKlcWVqlJqcIDP9YrCRA4Tt7a3CZpyoayZPcLMM9FpNTb4xOAAyS7JvRG0KTEhlGEArQ9Ooi110oa3k4SFNYRuwilMpjfdEOktp/O9trMY2/Q7o7UMTiKC4ICIWcN2YYx6XckH7sxt56bYF5Gh2G15zLg/DzbIrD2WzN1WR9NVT/dNwC7odorYG39aKDKl0A5MsMiaByPiVNQOxu0mwfF9yZm11qEcyeMMXandHBRvH/vqDqeXPD4kFSVhG3st5KNNUqhhw4M6kCNy29/t/IloW1vTVPxh/9CBFmcLtPWWEjYKrGwBlZpgMoKHm0bX2bA2P3BaU04fd5GhDEahQ0+687F4wEYiPoa6p9WdTgBjHiZdOtGE4VEL+QYn+T194EUp0gwTdgYNTmu3KEIzXDJtrQYJtWp2m4NmkrnGRMsnKitJXpOVtpiN8/ZyRRvQ1oxmQHBaLCf4mJv3Jauj7KiEa4VYMwvzzl50oP3FNVcxsm+MkaL8U5zSG2DzDjqxM0l0oLMRbhKw7iA2b/dHTQ2I0NkgCgtBWbvzvGeT2eLoVnHtwYgQ4nXbhlYZdPOK/+8R6mCu9+KTWE2cg6469JcOFr/UEAv5Uc8cNtC9PXRf/4R2OfrDWj9Cawy0bi/lY43P1W67ezTRBK/aY/A3VoeBSlwRAu+V5WFP0PxQg1VuUningp0fLPv5ArHq6qgXbqHvcgt+3+IggFV3Ir4YvPsEXRuPGU49txIILPgVUEWoVdQTKBjdZP3Qk/FVG4RuF20YXsnVINXM/mV3zlXG8A5dfwB2Q7XIJ8T9B+rYCcpbI39vy06sbhNrabOAEltQEdUn8pGBZbuPJx+zdlnyFfJW+hV9MXX7aZL/lSrjr6Cm/AjUNfu+R/of1PbvLOFSKhR8YP4kMq3shEzw9yCTqHRS/q24xKsZtUfP4Vwo0LQCK9bTYmtGJa0u6/7ChBdxfGgHbhreklf/OG6Hd1r9e/gaJXfxsZXvi4U9zB02TgD811yNl7oCkMfPp3A1CAK7wtSru/7UNFkutWu+k5dBEGjpQdi5V/1zSyMqS/v5O84vczd3oaYP8M9Q6Kfp8k2AsmtGlizym+sjiHK1aHmm+23VuEW+fBw9cAk/Ld3YcsKkbDn8Ho1AlPZ0YvWNmuvqBPHynptX9c1Ef2AKiBfIVE2AhuVB6j6+CeQyS/eTP8sI+BjIL3X7xPDfjPzq1vuKHaidsPvxUflO7HttJ9QGj9+uShTUtML+K/UldulFWt2T/LMKExo5L011x82NNT13PzCrGWEchdArtL4fi7Qf7isxHtWgeIp/U8ZDS61Y8Kr/jNW0p8Xv4UbVNPmthrWCr2ps+YUc5f+uKpWSTnvIdf6WyEZN6+CWboaQ2lG+wMWkdTpQZFaFf46FJ4o8LI4fuuJ8VsSUfuAFv7RM/z2IeWvH7tP0M0Ar+ehXhh4wuhwReKPQA1bun8r8OO9Yvv7eQQTihnqPT5h3+hg5R4CQ5aQTxtA1VvGbpR3JfybV6zCHVfo/1/W8eTOziqyaVe0Gx39ehQyg1VrJz/9OpfxxWrN/dKVMM9f+Bv6YuvlZtcyUXBeAVKa6V6fgssV3/gnVZ1dWuh4gV9f/umImb4hWfNr2oXjZ6mFqcn91WzFAufHO63i9uheDDcLUKXorIKfieNYwIPDTTyxwkRKF0zW/l14vbu+Cgqpd+lzhel1n1wQr7MHj/7ZpeenXMLKCr09bMX+57uT6r9z9ncrdqcuvhL9LuftDC5AfxaeKzTjJ6TXCZREEgbT05SHoX2zTOQmFV0Wu7iEt8RJIEbrKqUwNp96j1TKg/qhEp
qv7NQ4DwfDF271tt9pyhIGqaXKKgRq6vwuWqsG+4wbKc4+XD4svo1hkflzku6Ct0tmPrDodeI13TtxDCV5qJv6x1o/FH17Z7lnT+f/tWOv5nbs3d/XC67p6VRTR/9XrmGfafSG9sHi4EW7Pv1LhFppKO7ZZ96zkSd5a1tfW8dv+E9uz19sJ6ZokGErVhGvuETxYsNMVTwxnSPQ9WY18Z68fvn+tp5sS+P+FXp8PvVZwX0Mhz29GuLSrvcTvfl+txzPU8C8Q/37UACcqV3e9Nz3U/W7EX0AP9y0I+iWieJKKeShZrlLFWxUEPUMVVTO+YiX0vo3d2HYDV683F35UKWTpf1xxyS/hGCcrOK4x8N8sq/AMkm8w+z9y6PcXue7a4fr68FOUT5mu7mfdKfoOPtczGKkLxn+mXUy/hDKAg+sdhWhtl9PfzUV/Z0XtxzCoAE6W5Cr0v99uYwP8GoeF7/yYbSviEt3QtIor/h8= -------------------------------------------------------------------------------- /diagram/README.md: -------------------------------------------------------------------------------- 1 | # Workshop Architecture Diagram # 2 | 3 | 4 | We've provided the Draw.io diagram depicted in the Lab guides. 5 | -------------------------------------------------------------------------------- /lab/README.md: -------------------------------------------------------------------------------- 1 | # Lab Guide # 2 | 3 | 4 | This lab guide is prepared to assist you ingest, store, transform, create insights on unstructured data using AWS serverless services. Most of the demos make use of AWS Console, however all the labs can be automated via Cloudformation templates, AWS CLI or AWS API. 
5 | -------------------------------------------------------------------------------- /lab/Serverless Data Lake Day Lab-PREP.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AWS-Big-Data-Projects/aws-serverless-data-lake-workshop/d385ed760c4ced2469be6d5ecac9210d86ee6cdf/lab/Serverless Data Lake Day Lab-PREP.docx -------------------------------------------------------------------------------- /lab/Serverless Data Lake Day Lab1-KDG-KFH-S3.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AWS-Big-Data-Projects/aws-serverless-data-lake-workshop/d385ed760c4ced2469be6d5ecac9210d86ee6cdf/lab/Serverless Data Lake Day Lab1-KDG-KFH-S3.docx -------------------------------------------------------------------------------- /lab/Serverless Data Lake Day Lab2-1to2-3-Glue-DataCatalogTransform.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AWS-Big-Data-Projects/aws-serverless-data-lake-workshop/d385ed760c4ced2469be6d5ecac9210d86ee6cdf/lab/Serverless Data Lake Day Lab2-1to2-3-Glue-DataCatalogTransform.docx -------------------------------------------------------------------------------- /lab/Serverless Data Lake Day Lab2-4-OpenData-GDELT.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AWS-Big-Data-Projects/aws-serverless-data-lake-workshop/d385ed760c4ced2469be6d5ecac9210d86ee6cdf/lab/Serverless Data Lake Day Lab2-4-OpenData-GDELT.docx -------------------------------------------------------------------------------- /lab/Serverless Data Lake Day Lab3-Athena-QS-and-3PP.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AWS-Big-Data-Projects/aws-serverless-data-lake-workshop/d385ed760c4ced2469be6d5ecac9210d86ee6cdf/lab/Serverless Data 
Lake Day Lab3-Athena-QS-and-3PP.docx -------------------------------------------------------------------------------- /presentation/AWSLoft-BuildingAServerlessDataLake-RevA-2019-03-07-c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AWS-Big-Data-Projects/aws-serverless-data-lake-workshop/d385ed760c4ced2469be6d5ecac9210d86ee6cdf/presentation/AWSLoft-BuildingAServerlessDataLake-RevA-2019-03-07-c.pdf -------------------------------------------------------------------------------- /presentation/README.md: -------------------------------------------------------------------------------- 1 | # Slides from the workshop Building Serverless Data Lakes on AWS # 2 | 3 | 4 | Download link for the higher resolution presentation deck: https://s3.amazonaws.com/hbapub/loft/AWSLoft-BuildingAServerlessDataLake-RevA-2019-03-07.pdf 5 | -------------------------------------------------------------------------------- /scripts/ImmersionDay-ServerlessDataLake-RevPB3.xml: -------------------------------------------------------------------------------- 1 | 
7T1pk5pK178mVfd5q5Ki2fkICIqioLh/SSEgoGyyiPrr38ZlZgQmYyZOZjKJ91YGG2iasy99jl8w3t81Yz1yuqFpeV9QxNx9wRpfUBSQFAL/FCP70wiKgPOIHbvm+arHAc09WOfBy2WZa1rJ1YVpGHqpG10PGmEQWEZ6NabHcZhfX7YMveunRrptVQY0Q/eqoxPXTJ3TKI1Sj+Mty7Wdy5MByZzOLHRjbcdhFpyf9wXFlsfP6bSvX+Y6v2ji6GaYPxnChC8YH4dhejryd7zlFcC9gO10n/jM2Yd1x1aQ3nQDhZ9u2epeZl3WfFxZur9A4/g+VnEH+IJxemycEQaxhXGmnjgP55I0DtcWH3phfLwVQ44feGbpet5lPAgDeDtnx7rpwnWWhosJVT1NrTg4zgqJ52HmCzIgnLjqq57ffmvFqbV7MnR+9aYV+lYa7+El57P4BQ1nOv0KkPNA/gTrlzHnCcYpkjhT25nS7IfJH6END84Afwb4OPMXAx9cwx6lqqBHiRrQPyDtV0CP/sWAx5FryJNMDdHjNZAH9B0gj/3FkAdlcYPeSPMYfgfI3yDpoTqKikPXPypIzkl97wzq4i1dqCFZz7ULQKVh9GRU1heWp4aJm7phcXYRpmnowwu84gT3oBWfoOqsFzHu+DA2iU6KvECefvmydHcFrrnzehpOmhYWAFu8MyoaZoB+c6ENsHQhvcTfDPhEVDT1VId/ivEE/tV9/RAGX/U8+ZqkVmC4XjF6FPRiD57QrHjrGtZ3DT7SXbrGdyOM9t+/sxPtO++FmQmRRH+LAvsejIeX8U/XqZsq+u/Bd+AG9FuByRbW0zVzPHDbE2qoYTySPDLes3CyzCubqwqlJ0Co44HLWGx5eupury21Osicn6CGLlzJAxIgM30jruUfdT1HEmaxYZ1ve2o7lWbCQWUm8lvJLEj12LbSylxHZD28+m1y8wZbYRk+SrczTnj+F3n3yQkDotSKC/48z/Mw8IQywiz13ACSxsUyL0tjuDD++KkjI5KmAYc9L6wvww03hrOflhWEcUE3NSL7Is78nV04Kt8CK83DeJ18SyDTW/H3JA3jk6C7A3fT18yNYWiFtwFC1GhV5A6mJF5Vqx2IhcSFwg4RAn3hQS5GETaC4rNMNbUYe2B85BZKqSewH5FJGTVQPmPf1qclf7dOC/6uR1GVeESCJjD8eRK5i55GSpxNAHAZeYJPEv9Wg1CSKkuBV2GUqGBUg0iGREZ68EHcIoZHdnEU63nxwnFo8npaPW1tISCS6vh/bU3p/a86XqGPF7H/opVwzZmgjjReJsLaK+rIyPyWYFxmrK20Sjw4SpMGUSd5CIpFG3Sd8XEHgnowsfcX1VFV/CSoJSbsG408/eB3IC2yQlrN4ghF/hOG8v8+sICwjwt+B4lAU9cIBHU+E1qHQIa5A8KoW2XBbukfBX31zIN8QJ4VCKoebzIr/ScTfotMoEokhSPMe8qEqnkp9ZqCNqyg/gJe6E0VeIys2IVPKxB2HFIfv3O546aWFulGcX0OkXFNGEdgXnz7O8U2yJKLVcOnaF1s4x4u1sVDecqmQ2Ug/FkgJGnyRRACGn8jEIIKCNWBwgua9mcBkSaodwRiNbo5EFRlMDwpBt0/6t7jv3BkLGkjVpbmfxiZAgRlXmZ1BLwNiMkqiCvg+0E0BXkhmkIJLCkUFxVBE+08ZeHmhnYY6J7wOPrh4y0kWvaLS+C/NdxCoWV9WZromVgLxIG+f3JZVFyQPL/g8nMuC/7ZdT1S0mkFrw38kDdEzH8PqUEKi/fTYs5vCEpcBmbHAQIDl4EncqHxJ0QEKwgEJZP9VgotOwkoWproXhSK1S/4WQrFfo6i6V+8/vLed+OAtw1diyLDYNinELZUOSINkNeK20pwG4By4vVu5FxZNPFjgquurXTHr5NcNfr2T7/X4oIB15YuBkrBz5ulJ1LWo7flUn6W3CoLfoHYyuvC7k1qNyR2qqR2k0D76LRD0kQ5zo6/
knqeTHX/3BtVtfafib8tXQ8aPi+G4E456n9RtuOZN42y0UTZVnrPyDtVDeRWkP55d79gxHVk6UGMvLjlq2zEvAr2dXL2xIJJpAcXJhxYEVTEbrHhoyZYwknwH22fpJb/EEGHD346wQ18/S9P/455egAlUUnrYAhTk919u2w9XY1yfiE4NnWsQP9CNOoo6EWa+jhJOv30Hu+SuAco8426Ri6KYA9DT7BLkXVCn6rs4HkVfqv2AsRvP3ONdXJ8For8x0n/O6L6w6Jx87jcCioRhKPEt0YlRl8zKSCrmrtGWZB30BV0NUPeOJpsiOpGVoGgD4y4wrj8Hl3W+U77Z6iSiMUft9Q8xR9Vx4T0DxyJmzFIv2xp/dJ2R4ZBPokDj5Y3O+FYxZe6OWqEX5B34dpyocT9vLJLvucvdJxpnLmX4/xkqjdAUY2h85egqJKffDWGHmd6AwTVWCq/hiAaKf6rqh3m+PkDEFfepQ+ICoPcjDkUe3GuO+KyLlF49hvKjkThgV5hmdxk4eXE1+QYcWCLYAAd7R5PXmbRIusY4pL1vRU/cYNPs14/CQ4vnnVaoD2R/oiezvRWE86oWEVl48l3TfOof+u2JjzGWH6YoHx9aUfVVAVUDcneo5SNeVV27CYW/tNY9WspRPdqPi1PdEcmrUssvQmTcnpqOP+Y9AdMCmpKTt+OTauhXy7z1nBE8ouIIzxQosIjPNW3nQOPwSKJauM/7ESDd/ByEY9cQpijSJGRqL9KO4fPUKQ5ee6ax2oR1raCmjKC8wMnF6oK7acXl1b7YX1j9wjr79buCPKKkSIgRFFK85a+MZQAZXcLpW6vLcHuQIlV1/iZbNZ/LbcIvboGPFvEP/7tDP89O8OxindHvmfWiqnLnNwvlsLz56zVhzc3SjsBiIv39PNbGUFpIvB2bsFDT4ufKfy+R133MyK7trb7qvz7Uu19zJJj7OkrKh6LsHl3zCmDHOk07ZCFn542coSRDY+EfvE949kZ/MMPlvIXlFPgYXuOeEJ/PAg5C8z0kTvGBnPr4Cm+Mp/7NjqajCOnOZ+MpbXGiuiGdQdjzR8M2h67G4iTwdBr0H6Uay2NTbvd/m7LcrZ/4HvdZmp29r1eu78HvmFO6SJmzPUY6hBsEYvAoLrngIrFqh20DFUvys3V6XY711HElGZ9OzZsieyG0WImDRC243bZfiqJESuydph4oYz0O25f4AdgvJFYh2MGY0pZbina1wgjE/jtFB4aWO+AxpzDopi1bQcIow7WwbA5t5uK77U7mpC1QpJ1W0MLmbYcPOPsWdNTolUHmh1iB0wanLqC8oFrdIcboG+LmvrlFtsZE1nEJyAdTqaENVx32ll3s8ibUm86BoYs7YXVDE03HSecNa2FsOsOhe5enqeQpDhtn8xXfakH7Qxuu6I2BRTooHvoHvwV59jrfNiwcSVQqIict+Sd3+ZtoAXDtsd30UU3VOKNHm6U1WbGFBif9/tItxEMofwRoeQvRrprrTfI4axL1WEPBF3QBfzWcmbjDW01Wr3gwCzjJanu8d2a8aztNJq2QTdoa3N34WOqhZnA7DUMzCBUB5+IOyLJ57PJ3G9nid7cZXKOG6pLt7h83oQIFTW102s4gh+M9Um0mXVRQ5F3uiE4kgbttlgnx5v1HDUP0zFpBaHbbfB9kmy2/P0a9MbTSZ/sbDbr2MhaYmOpHvC1sS1Aso0EEzMwG6TL6fkVxC2UANwhZigGIZE0nnWIdqb7iTaVAbO2ZaGdLXxCWkn7kdtft1W+xx33ZYrLaTymRtABEJltdyPxdk519l2PtINiwuVkC/+d+Z3iqTJjigNkKZsjr62N96jeE6DZnRDqirSg3BObosj1R25nn8jt5jKgaTnDKCZLVvJ+18OodOdAfkKXmb2Ft4uswEmLZp4wudpv9JHeYGRCInM7btbZr6S8C7beEk566OMDddq1Qe7grEVDg5ULQrK7GuPcdA1IsGgTVjpdZBP4at1Nh8xWayjT
OEOx7SOk4BQNXN0iPbYPzMWEDszcaLO0OkSjGL5oFmfU3pS1tb02LX+1wFJCEZ3ZJOpkhSznesG8OcgVdx25+j6NjYDNBQ4VW/DUND/YrtRB03g6RCkx6EAyXXOD0Ujs8R1R24a5JDREju33G5ACCyeJZxPObXiENd4pg+EWW6StrDPyh/NIaLszHxMXBLfjIYn3iO4AMVvT/cKaOngCZdd8bzpUexSPsTZlYiam+xDHcFaWa7MN/jBbokK6msU9OmqGPJLKwzx2jIm4lfCsJeSI3nGlIEYpzhcd3BiyaoyQEKwtyAK+EHW0wOV6hZqwiIYcaqLuBPwOau7hyJOy3Z4MeZLq7JJ5Ux4VCiNqsHnXaAS6hnpzemj4PZZvu2Ho2kprQyabXaNDzeMNnmXBgehGCGnOIijo9abj9AeC0fJbXLLogEQbqqt+OOqOmk5MNsklboJl2ETh7Gv9sE3p3YYLbYUN8bZj073hZOy5zPLgq6w2THe4PRyD3qAP+YZcmNCyHEhDgVpnbV4T4r7hU9bUG8xlSMUiGu3xttFq7PAcYVljHK5W67YyhHJApNS0GQyVtkOfUDRL8bHL+ftw09mucEZdoCNvPhG9TDnxQ3PDOEMu300aDWTZ2tE8S3gztLvCFgTjU0SBaHq2oAphsx3uGWvb5DJGnQJv6ksEeejsNxMvMg6zrEnsKG0tpjk7U13W7osB2B92rrQYyMM15S9DpWHDV0XzdKojoLdaNnJaYfek0gqIbNGEYganrZ44hDzMqUprlYfihrFnAocdOsGweGvfSAazAEoXditM0aK3yTwJp5Dow40DqXsD4gXXTyZJkwZzIxxtt+1xu5BZfrZOfd6YLcn5ntf2rMZBk1eFJ5zG9pBiVNY3G3bYCbNDspA7INWnZ/JeQjVgbTsF+qSN7BJ+PGJJeTd2ZzyBLykFS/3RbjXzpbCtTcSVhPasubHjrBU/OYJ/KbiUeZyK8zvAyowG1tyQm00Y83mXD1N5Mo3ZBePIE2iEBep+5A0HrCEd5jSx7ilCY8OGRmfCbBckdShkvhkQpI+lwDSyYNCOxdMqp5vdIkUoYWdMd/HGRftgsxdaKWWhpMpyTaqxFnJ+TvMZsxoDbYQtJrtE7HGtdS/QkET2EjGnlENGyP1IDVaH9nIkqZ4tyk0n1GdbzMV9KSuEsrxfRwg17nIDR1qmeNZxgbPcgIazxmRnJ3Lr+X6mEcX7MhbjmvkwghJ6v3YkqBbVAOtIChTYKyh4RZyNBUrrtg8uNMy4FKXMtU+3Nn1iGTSnErIZy9bU6jsJzS/RpZOHY5pbdHnBLHDfStCFOh8gwlwR4r2DchEqgrE9m7Ji3tra09yhW4aNwdfXVweo49v8IaGUBT9JQjVujToxzY4xz5pPlZ4l6Vabpcws7xs9b7AKejhhhOP1nLRkbzsNR0pr3IppD58kk3jcQXurGa3gI3Ew2a1iTdYZPRlNskajtRL3yaYj94c7Qc8lz22II4YfyKxmdbGZKdpDy+7tKRMv1HbWwigzIeQl1Ug2YyXgffhaOSI3zTHYYlssThqDvCMFwygPVw7ObJyFbLUi0nCAsVj7ltoK+H6z5TbFvgYHjQzTe3hOIcpEbTZEcTqmTYHvcL3hCIizSdMB0sC0BmywslHlINPQxFHG09a221i2+hcyh+pB5Ga92dTDMwYHMw2K6gG913tWtmRxecIrs9yyJqntWcBaqr6hWxsWEZZLiRU32zZUp5LL4N1tm5dce9vD/X4Xs2dG3vDxEc2mmaR5/Z2QmRzroKAna6ME8KuCQ8SDj0AloxAUydtSF1/h3T4dje2W53QnYjI3IhJqrUM8jArNPmUFrdcHmiqkbQiogxNudGinzAb7SSJoVEZkYxEaaL6pYJaCZ+oKvp6Zu40hOYMjGbpGhWbeyxQZ729lxT7QcUdbt+nE305lGxfxTHY7UNWslsEKlcWVqlJqcIDP9YrCRA4Tt7a3CZpyoayZPcLMM9FpNTb4
xOAAyS7JvRG0KTEhlGEArQ9Ooi110oa3k4SFNYRuwilMpjfdEOktp/O9trMY2/Q7o7UMTiKC4ICIWcN2YYx6XckH7sxt56bYF5Gh2G15zLg/DzbIrD2WzN1WR9NVT/dNwC7odorYG39aKDKl0A5MsMiaByPiVNQOxu0mwfF9yZm11qEcyeMMXandHBRvH/vqDqeXPD4kFSVhG3st5KNNUqhhw4M6kCNy29/t/IloW1vTVPxh/9CBFmcLtPWWEjYKrGwBlZpgMoKHm0bX2bA2P3BaU04fd5GhDEahQ0+687F4wEYiPoa6p9WdTgBjHiZdOtGE4VEL+QYn+T194EUp0gwTdgYNTmu3KEIzXDJtrQYJtWp2m4NmkrnGRMsnKitJXpOVtpiN8/ZyRRvQ1oxmQHBaLCf4mJv3Jauj7KiEa4VYMwvzzl50oP3FNVcxsm+MkaL8U5zSG2DzDjqxM0l0oLMRbhKw7iA2b/dHTQ2I0NkgCgtBWbvzvGeT2eLoVnHtwYgQ4nXbhlYZdPOK/+8R6mCu9+KTWE2cg6469JcOFr/UEAv5Uc8cNtC9PXRf/4R2OfrDWj9Cawy0bi/lY43P1W67ezTRBK/aY/A3VoeBSlwRAu+V5WFP0PxQg1VuUningp0fLPv5ArHq6qgXbqHvcgt+3+IggFV3Ir4YvPsEXRuPGU49txIILPgVUEWoVdQTKBjdZP3Qk/FVG4RuF20YXsnVINXM/mV3zlXG8A5dfwB2Q7XIJ8T9B+rYCcpbI39vy06sbhNrabOAEltQEdUn8pGBZbuPJx+zdlnyFfJW+hV9MXX7aZL/lSrjr6Cm/AjUNfu+R/of1PbvLOFSKhR8YP4kMq3shEzw9yCTqHRS/q24xKsZtUfP4Vwo0LQCK9bTYmtGJa0u6/7ChBdxfGgHbhreklf/OG6Hd1r9e/gaJXfxsZXvi4U9zB02TgD811yNl7oCkMfPp3A1CAK7wtSru/7UNFkutWu+k5dBEGjpQdi5V/1zSyMqS/v5O84vczd3oaYP8M9Q6Kfp8k2AsmtGlizym+sjiHK1aHmm+23VuEW+fBw9cAk/Ld3YcsKkbDn8Ho1AlPZ0YvWNmuvqBPHynptX9c1Ef2AKiBfIVE2AhuVB6j6+CeQyS/eTP8sI+BjIL3X7xPDfjPzq1vuKHaidsPvxUflO7HttJ9QGj9+uShTUtML+K/UldulFWt2T/LMKExo5L011x82NNT13PzCrGWEchdArtL4fi7Qf7isxHtWgeIp/U8ZDS61Y8Kr/jNW0p8Xv4UbVNPmthrWCr2ps+YUc5f+uKpWSTnvIdf6WyEZN6+CWboaQ2lG+wMWkdTpQZFaFf46FJ4o8LI4fuuJ8VsSUfuAFv7RM/z2IeWvH7tP0M0Ar+ehXhh4wuhwReKPQA1bun8r8OO9Yvv7eQQTihnqPT5h3+hg5R4CQ5aQTxtA1VvGbpR3JfybV6zCHVfo/1/W8eTOziqyaVe0Gx39ehQyg1VrJz/9OpfxxWrN/dKVMM9f+Bv6YuvlZtcyUXBeAVKa6V6fgssV3/gnVZ1dWuh4gV9f/umImb4hWfNr2oXjZ6mFqcn91WzFAufHO63i9uheDDcLUKXorIKfieNYwIPDTTyxwkRKF0zW/l14vbu+Cgqpd+lzhel1n1wQr7MHj/7ZpeenXMLKCr09bMX+57uT6r9z9ncrdqcuvhL9LuftDC5AfxaeKzTjJ6TXCZREEgbT05SHoX2zTOQmFV0Wu7iEt8RJIEbrKqUwNp96j1TKg/qhEpqv7NQ4DwfDF271tt9pyhIGqaXKKgRq6vwuWqsG+4wbKc4+XD4svo1hkflzku6Ct0tmPrDodeI13TtxDCV5qJv6x1o/FH17Z7lnT+f/tWOv5nbs3d/XC67p6VRTR/9XrmGfafSG9sHi4EW7Pv1LhFppKO7ZZ96zkSd5a1tfW8dv+E9uz19sJ6ZokGErVhGvuETxYsNMVTwxnSPQ9WY18Z68fvn+tp5sS
+P+FXp8PvVZwX0Mhz29GuLSrvcTvfl+txzPU8C8Q/37UACcqV3e9Nz3U/W7EX0AP9y0I+iWieJKKeShZrlLFWxUEPUMVVTO+YiX0vo3d2HYDV683F35UKWTpf1xxyS/hGCcrOK4x8N8sq/AMkm8w+z9y6PcXue7a4fr68FOUT5mu7mfdKfoOPtczGKkLxn+mXUy/hDKAg+sdhWhtl9PfzUV/Z0XtxzCoAE6W5Cr0v99uYwP8GoeF7/yYbSviEt3QtIor/h8= -------------------------------------------------------------------------------- /scripts/Lab 2.3 - Advanced Data Preparation with Developer Endpoints and Notebook - PA5-2019-12-02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Serverless Data Lake Immersion\n", 8 | "## Lab 2.3 - Advanced Data Preparation with Developer Endpoints and Notebook\n", 9 | "`(Revision History:\n", 10 | "PA5, 2019-10-19, @akirmak: updated Section 7.2 based on feedback from identified by @rmichaud and @werberm and the solution proposed by @greenste. PA4 excluded.\n", 11 | "PA4, 2019-10-19, @akirmak: Advanced Spark ETL logic added as bonus\n", 12 | "PA3, 2019-05-09, @akirmak: updated based on feedback from @hohenber\n", 13 | "PA2, 2018-12-13, @akirmak \n", 14 | "PA1, 2018-12-07`\n", 15 | "\n", 16 | "This example shows how to do joins and filters with transforms on DynamicFrames.\n", 17 | "\n", 18 | "For purposes of our Immersion Day, we are assuming that you have done the previous Lab assignments (Create Firehose delivery stream, ingest simulated product catalogue data to S3, crawled this data and put the results into a database called `-tame-bda-immersion-gdb` and a table called `raw` in your Data Catalog, as described in the lab guide.\n", 19 | "\n", 20 | "### 2. Getting started\n", 21 | "\n", 22 | "DataFrames APIs support elaborate methods for slicing-and-dicing the data. It includes operations such as \"selecting\" rows, columns, and cells by name or by number, filtering out rows, etc. 
Statistical data is usually very messy and contains lots of missing and incorrect values and range violations. So a critically important feature of DataFrames is the explicit management of missing data.\n", 23 | "\n", 24 | "We will write a script that:\n", 25 | "\n", 26 | "1. Queries data\n", 27 | "2. Reformats data\n", 28 | "3. Repartitions the data\n", 29 | "\n", 30 | "Begin by running some boilerplate to import the AWS Glue libraries we'll need and set up a single `GlueContext`.\n", 31 | "Then, start a Spark application and create a dynamic frame from our data in S3. \n", 32 | "\n", 33 | "Some concepts:\n", 34 | "\n", 35 | "- Spark provides a unified platform for writing big data applications, ranging from simple data loading and SQL queries to machine learning and streaming computation over the same engine and with a consistent set of APIs.\n", 36 | "- Spark handles loading data from Amazon S3. \n", 37 | "- You control your Spark Application through a driver process called the SparkSession.\n", 38 | "- A Spark DataFrame is the most common Structured API and simply represents a table of data with rows and columns. (Not to be confused with R and Python DataFrames. Those (with some exceptions) exist on one machine rather than multiple machines)\n", 39 | "- Schema is the list that defines the columns and types within those columns.\n", 40 | "\n", 41 | "**Important** Before running the next step, update the *initials* variable with your initials (e.g. `fs` for Frank Sinatra, which makes the database name `fs-tame-bda-immersion-gdb`)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Starting Spark application\n" 54 | ] 55 | }, 56 | { 57 | "data": { 58 | "text/html": [ 59 | "\n", 60 | "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
0application_1575272987821_0001pysparkidleLinkLink
" 61 | ], 62 | "text/plain": [ 63 | "" 64 | ] 65 | }, 66 | "metadata": {}, 67 | "output_type": "display_data" 68 | }, 69 | { 70 | "data": { 71 | "application/vnd.jupyter.widget-view+json": { 72 | "model_id": "", 73 | "version_major": 2, 74 | "version_minor": 0 75 | }, 76 | "text/plain": [ 77 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 78 | ] 79 | }, 80 | "metadata": {}, 81 | "output_type": "display_data" 82 | }, 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "SparkSession available as 'spark'.\n" 88 | ] 89 | }, 90 | { 91 | "data": { 92 | "application/vnd.jupyter.widget-view+json": { 93 | "model_id": "", 94 | "version_major": 2, 95 | "version_minor": 0 96 | }, 97 | "text/plain": [ 98 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 99 | ] 100 | }, 101 | "metadata": {}, 102 | "output_type": "display_data" 103 | } 104 | ], 105 | "source": [ 106 | "import sys\n", 107 | "from awsglue.transforms import *\n", 108 | "from awsglue.utils import getResolvedOptions\n", 109 | "from pyspark.context import SparkContext\n", 110 | "from awsglue.context import GlueContext\n", 111 | "from awsglue.job import Job\n", 112 | "\n", 113 | "glueContext = GlueContext(SparkContext.getOrCreate())\n", 114 | "\n", 115 | "initials = \"ADD_YOUR_INITIALS_HERE\" # <-- Add your initials here!\n", 116 | "\n", 117 | "spark = glueContext.spark_session\n", 118 | "\n", 119 | "datasource0 = glueContext.create_dynamic_frame.from_catalog(\n", 120 | " database = initials + \"-tame-bda-immersion-gdb\", \n", 121 | " table_name = \"raw\", \n", 122 | " transformation_ctx = \"datasource0\")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### 3. Schema of the Dataset\n", 130 | "Next, you can easily examine the schemas that the crawler recorded in the Data Catalog. 
For example, to see the schema of the `raw` table, run the following code:\n", 131 | "\n", 132 | "Note: To have a look at the schema, i.e. the structure of the DataFrame, we'll use the printSchema method. This will give us the different columns in our DataFrame, along with the data type and the nullable conditions for that particular column\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 3, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "application/vnd.jupyter.widget-view+json": { 143 | "model_id": "", 144 | "version_major": 2, 145 | "version_minor": 0 146 | }, 147 | "text/plain": [ 148 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 149 | ] 150 | }, 151 | "metadata": {}, 152 | "output_type": "display_data" 153 | }, 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "Count: 45000\n", 159 | "root\n", 160 | " |-- productName: string (nullable = true)\n", 161 | " |-- product: string (nullable = true)\n", 162 | " |-- department: string (nullable = true)\n", 163 | " |-- color: string (nullable = true)\n", 164 | " |-- imageUrl: string (nullable = true)\n", 165 | " |-- dateSoldSince: string (nullable = true)\n", 166 | " |-- dateSoldUntil: string (nullable = true)\n", 167 | " |-- price: integer (nullable = true)\n", 168 | " |-- campaign: string (nullable = true)\n", 169 | " |-- year: string (nullable = true)\n", 170 | " |-- month: string (nullable = true)\n", 171 | " |-- day: string (nullable = true)\n", 172 | " |-- hour: string (nullable = true)\n", 173 | "\n", 174 | "+--------------------+--------+----------+------+--------------------+--------------------+--------------------+-----+-----------+----+-----+---+----+\n", 175 | "| productName| product|department| color| imageUrl| dateSoldSince| dateSoldUntil|price| campaign|year|month|day|hour|\n", 176 | 
"+--------------------+--------+----------+------+--------------------+--------------------+--------------------+-----+-----------+----+-----+---+----+\n", 177 | "|Refined Frozen To...| Tuna| Music| tan|http://lorempixel...|Tue Mar 13 2018 2...|Sat Feb 02 2019 1...| 144|BlackFriday|2018| 11| 15| 20|\n", 178 | "| Small Plastic Pizza|Sausages| Games|purple|http://lorempixel...|Wed Jan 24 2018 1...|Sun Dec 16 2018 0...| 63| 10Percent|2018| 11| 15| 20|\n", 179 | "|Gorgeous Frozen Ball| Towels| Games| blue|http://lorempixel...|Mon Feb 19 2018 1...|Sat Apr 06 2019 2...| 33|BlackFriday|2018| 11| 15| 20|\n", 180 | "|Unbranded Wooden ...|Sausages| Books|orange|http://lorempixel...|Sun Jun 17 2018 0...|Tue Aug 20 2019 0...| 123|BlackFriday|2018| 11| 15| 20|\n", 181 | "|Handcrafted Cotto...| Chicken| Jewelery| azure|http://lorempixel...|Wed Jul 11 2018 1...|Sun Aug 18 2019 1...| 110| 10Percent|2018| 11| 15| 20|\n", 182 | "+--------------------+--------+----------+------+--------------------+--------------------+--------------------+-----+-----------+----+-----+---+----+\n", 183 | "only showing top 5 rows" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "print (\"Count: \", datasource0.count())\n", 189 | "\n", 190 | "df = datasource0.toDF()\n", 191 | "\n", 192 | "df.printSchema()\n", 193 | "\n", 194 | "df.show(5)\n" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### 4. 
Selecting Multiple Columns & Filtering Data\n", 202 | "We can filter our data based on multiple conditions " 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 4, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "application/vnd.jupyter.widget-view+json": { 213 | "model_id": "", 214 | "version_major": 2, 215 | "version_minor": 0 216 | }, 217 | "text/plain": [ 218 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 219 | ] 220 | }, 221 | "metadata": {}, 222 | "output_type": "display_data" 223 | }, 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "+--------------------+--------+----------+-----+-----------+\n", 229 | "| productName| product|department|price| campaign|\n", 230 | "+--------------------+--------+----------+-----+-----------+\n", 231 | "|Refined Frozen To...| Tuna| Music| 144|BlackFriday|\n", 232 | "|Gorgeous Frozen Ball| Towels| Games| 33|BlackFriday|\n", 233 | "|Unbranded Wooden ...|Sausages| Books| 123|BlackFriday|\n", 234 | "|Handmade Rubber S...| Bike| Books| 116|BlackFriday|\n", 235 | "|Awesome Rubber Shoes| Bacon|Automotive| 119|BlackFriday|\n", 236 | "| Generic Cotton Bike| Tuna| Shoes| 109|BlackFriday|\n", 237 | "| Refined Frozen Tuna| Hat| Sports| 37|BlackFriday|\n", 238 | "| Sleek Frozen Pants|Computer| Music| 58|BlackFriday|\n", 239 | "|Handcrafted Rubbe...| Ball| Outdoors| 45|BlackFriday|\n", 240 | "|Practical Soft Co...| Towels| Health| 52|BlackFriday|\n", 241 | "+--------------------+--------+----------+-----+-----------+" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "df.filter((df.campaign=='BlackFriday')).select('productName','product', 'department', 'price','campaign').limit(10).show()\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "### 5. 
Perform transformations on data\n", 254 | "\n", 255 | "You can easily transform data.\n", 256 | "\n", 257 | "Let's only keep the fields that we want and rename `imageUrl` to `thumbnailImageUrl`. The dataset is small enough that we can look at the whole thing. The `toDF()` converts a DynamicFrame to a Spark DataFrame, so we can apply the\n", 258 | "transforms in SparkSQL." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 5, 264 | "metadata": { 265 | "scrolled": true 266 | }, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "application/vnd.jupyter.widget-view+json": { 271 | "model_id": "", 272 | "version_major": 2, 273 | "version_minor": 0 274 | }, 275 | "text/plain": [ 276 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 277 | ] 278 | }, 279 | "metadata": {}, 280 | "output_type": "display_data" 281 | }, 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "root\n", 287 | " |-- productName: string (nullable = true)\n", 288 | " |-- product: string (nullable = true)\n", 289 | " |-- department: string (nullable = true)\n", 290 | " |-- dateSoldSince: string (nullable = true)\n", 291 | " |-- dateSoldUntil: string (nullable = true)\n", 292 | " |-- price: integer (nullable = true)\n", 293 | " |-- year: string (nullable = true)\n", 294 | " |-- month: string (nullable = true)\n", 295 | " |-- day: string (nullable = true)\n", 296 | " |-- thumbnailImageUrl: string (nullable = true)\n", 297 | " |-- campaignType: string (nullable = true)\n", 298 | "\n", 299 | "+--------------------+--------+----------+--------------------+--------------------+-----+----+-----+---+--------------------+------------+\n", 300 | "| productName| product|department| dateSoldSince| dateSoldUntil|price|year|month|day| thumbnailImageUrl|campaignType|\n", 301 | 
"+--------------------+--------+----------+--------------------+--------------------+-----+----+-----+---+--------------------+------------+\n", 302 | "|Refined Frozen To...| Tuna| Music|Tue Mar 13 2018 2...|Sat Feb 02 2019 1...| 144|2018| 11| 15|http://lorempixel...| BlackFriday|\n", 303 | "| Small Plastic Pizza|Sausages| Games|Wed Jan 24 2018 1...|Sun Dec 16 2018 0...| 63|2018| 11| 15|http://lorempixel...| 10Percent|\n", 304 | "|Gorgeous Frozen Ball| Towels| Games|Mon Feb 19 2018 1...|Sat Apr 06 2019 2...| 33|2018| 11| 15|http://lorempixel...| BlackFriday|\n", 305 | "|Unbranded Wooden ...|Sausages| Books|Sun Jun 17 2018 0...|Tue Aug 20 2019 0...| 123|2018| 11| 15|http://lorempixel...| BlackFriday|\n", 306 | "|Handcrafted Cotto...| Chicken| Jewelery|Wed Jul 11 2018 1...|Sun Aug 18 2019 1...| 110|2018| 11| 15|http://lorempixel...| 10Percent|\n", 307 | "|Awesome Cotton Chips| Chair| Clothing|Fri Feb 23 2018 1...|Mon Jul 15 2019 0...| 50|2018| 11| 15|http://lorempixel...| NONE|\n", 308 | "|Ergonomic Metal S...| Salad| Clothing|Mon Jun 18 2018 0...|Tue Mar 19 2019 1...| 43|2018| 11| 15|http://lorempixel...| 10Percent|\n", 309 | "|Handmade Rubber S...| Bike| Books|Fri Oct 12 2018 1...|Sun Aug 18 2019 0...| 116|2018| 11| 15|http://lorempixel...| BlackFriday|\n", 310 | "|Awesome Rubber Shoes| Bacon|Automotive|Wed Jan 24 2018 2...|Wed Aug 07 2019 0...| 119|2018| 11| 15|http://lorempixel...| BlackFriday|\n", 311 | "|Handmade Wooden C...| Bike| Computers|Sun Nov 11 2018 0...|Tue Jun 11 2019 2...| 149|2018| 11| 15|http://lorempixel...| NONE|\n", 312 | "+--------------------+--------+----------+--------------------+--------------------+-----+----+-----+---+--------------------+------------+\n", 313 | "only showing top 10 rows" 314 | ] 315 | } 316 | ], 317 | "source": [ 318 | "dsTransformed = datasource0.drop_fields(['color','hour']).rename_field('imageUrl', 'thumbnailImageUrl').rename_field('campaign', 'campaignType')\n", 319 | "dfTransformed = dsTransformed.toDF()\n", 320 | 
"\n", 321 | "dfTransformed.printSchema()\n", 322 | "\n", 323 | "dfTransformed.show(10)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "### 6. Export transformed data to S3\n", 331 | "\n", 332 | "Let's export the transformed dataset in the previous section to S3. Convert to Parquet format. The following call writes the table across multiple files to support fast parallel reads when doing analysis later:" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 6, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "application/vnd.jupyter.widget-view+json": { 343 | "model_id": "", 344 | "version_major": 2, 345 | "version_minor": 0 346 | }, 347 | "text/plain": [ 348 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 349 | ] 350 | }, 351 | "metadata": {}, 352 | "output_type": "display_data" 353 | }, 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "glueContext.write_dynamic_frame.from_options(frame = dsTransformed,\n", 364 | " connection_type = \"s3\",\n", 365 | " connection_options = {\"path\": \"s3://\" + initials + \"-tame-bda-immersion/output-etl-nb-jobs\"},\n", 366 | " format = \"parquet\")" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "When execution is finished, go the the S3 folder, and verify that the files are written. 
For instance: the folder should look something like:\n", 374 | "\n", 375 | "`\n", 376 | "2018-12-07 22:42:56 87705 part-00000-3944ffa1-8917-42f0-93f2-bef5b3c63cca-c000.snappy.parquet\n", 377 | "2018-12-07 22:41:55 87572 part-00000-48a202cd-86eb-4109-b3e6-f7f2bef549ef-c000.snappy.parquet\n", 378 | "2018-11-21 01:32:34 87572 part-00000-7f23bfb7-7a9f-4eee-bd00-4cf7ab085f57-c000.snappy.parquet\n", 379 | "2018-12-07 22:42:56 88180 part-00001-3944ffa1-8917-42f0-93f2-bef5b3c63cca-c000.snappy.parquet\n", 380 | "2018-12-07 22:41:55 88180 part-00001-48a202cd-86eb-4109-b3e6-f7f2bef549ef-c000.snappy.parquet\n", 381 | "2018-11-21 01:32:34 88180 part-00001-7f23bfb7-7a9f-4eee-bd00-4cf7ab085f57-c000.snappy.parquet\n", 382 | "2018-12-07 22:42:56 87545 part-00002-3944ffa1-8917-42f0-93f2-bef5b3c63cca-c000.snappy.parquet\n", 383 | "2018-12-07 22:41:55 87851 part-00002-48a202cd-86eb-4109-b3e6-f7f2bef549ef-c000.snappy.parquet\n", 384 | "2018-11-21 01:32:34 87545 part-00002-7f23bfb7-7a9f-4eee-bd00-4cf7ab085f57-c000.snappy.parquet`" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "### 7. Repartition Data\n", 392 | "**Important:** Before running the cell below, make sure you are using the correct S3 path.\n", 393 | "\n", 394 | "\n", 395 | "In the previous example, the data was exported to multiple S3 objects in parquet format. Since the data is small, let's combine them in a single partition.\n", 396 | "\n", 397 | "#### 7.1 Combine into a Single Partition\n", 398 | "To put all the history data into a single file, we need to convert it to a data frame, repartition it, and\n", 399 | "write it out." 
400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 9, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "application/vnd.jupyter.widget-view+json": { 410 | "model_id": "", 411 | "version_major": 2, 412 | "version_minor": 0 413 | }, 414 | "text/plain": [ 415 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 416 | ] 417 | }, 418 | "metadata": {}, 419 | "output_type": "display_data" 420 | } 421 | ], 422 | "source": [ 423 | "dfSinglePartition = dfTransformed.repartition(1)\n", 424 | "dfSinglePartition.write.parquet('s3://' + initials + '-tame-bda-immersion/output-etl-nb-jobs/singlePartition')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "When execution is finished, go to the S3 folder, and verify that the files are written. For instance, the folder should look something like:\n", 432 | "\n", 433 | "`2018-12-07 22:55:13 1435146 part-00000-95ad4fb6-d178-47ad-8072-d60d8d8e71fd-c000.snappy.parquet`" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "#### 7.2 Repartition Based on a Field\n", 441 | "\n", 442 | "Or, if you want to separate it by the `department`:\n", 443 | "\n", 444 | "**Update 2-Dec-2019:** If you get an error that the spark job is aborted, try the command with \"s3a://\" instead of \"s3://\". 
More details are here: https://issues.apache.org/jira/browse/HADOOP-10400 " 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 10, 450 | "metadata": {}, 451 | "outputs": [ 452 | { 453 | "data": { 454 | "application/vnd.jupyter.widget-view+json": { 455 | "model_id": "", 456 | "version_major": 2, 457 | "version_minor": 0 458 | }, 459 | "text/plain": [ 460 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 461 | ] 462 | }, 463 | "metadata": {}, 464 | "output_type": "display_data" 465 | } 466 | ], 467 | "source": [ 468 | "dfTransformed.write.parquet(\n", 469 | " 's3://' + initials + '-tame-bda-immersion/output-etl-nb-jobs/byDepartment', \n", 470 | " partitionBy=['department'])" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "**Note:**\n", 478 | "Many other types of transformations could be done, such as joining tables. AWS Glue makes it easy to write the data to relational databases like Redshift even with semi-structured data. It offers a transform, relationalize(), that flattens DynamicFrames no matter how complex the objects in the frame may be." 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "### 8. Putting it together\n", 486 | "Great! The final table that we'd like to use for analysis is now in S3, the storage layer of our Data Lake, in a compact, efficient format for analytics that we can run SQL over in AWS Glue, Athena, or Redshift Spectrum.\n", 487 | " \n", 488 | "Note that many other types of transformations could be done (e.g. JOIN operations). We leave it to your imagination :) \n" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "### 9. Congratulations! \n", 496 | "You've finished this lab. 
\n", 497 | "\n", 498 | "**Very Important:** SageMaker Notebooks run on EC2, and therefore you will be billed by the second until you terminate the SageMaker notebook instance. Save your work first by downloading it to your local computer. \n", 499 | "\n", 500 | "### 10. Cleaning up resources \n", 501 | "\n", 502 | "Please \n", 503 | " 1. Download this notebook to your computer by selecting ` File -> Download as -> Notebook (.ipynb)`. \n", 504 | " 1. Terminate this instance. Remember that you can always recreate it from the `AWS Glue Console` by selecting the terminated instance and `Cloning` its configuration.\n", 505 | " \n", 506 | " Thank you.\n" 507 | ] 508 | } 509 | ], 510 | "metadata": { 511 | "kernelspec": { 512 | "display_name": "Sparkmagic (PySpark)", 513 | "language": "", 514 | "name": "pysparkkernel" 515 | }, 516 | "language_info": { 517 | "codemirror_mode": { 518 | "name": "python", 519 | "version": 2 520 | }, 521 | "mimetype": "text/x-python", 522 | "name": "pyspark", 523 | "pygments_lexer": "python2" 524 | } 525 | }, 526 | "nbformat": 4, 527 | "nbformat_minor": 2 528 | } 529 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Scripts used in the Lab # 2 | 3 | We prepared a CloudFormation template that creates the IAM roles and policies you’d need to run the labs. These roles authorize the AWS services used in the labs (AWS Glue, Kinesis Data Firehose, and SageMaker notebooks) to act on your behalf. 4 | 5 | We also prepared a Jupyter Notebook to demonstrate iterative development for data transformations. 
6 | 7 | The installation instructions for the Kinesis Data Generator tool can be found here: https://awslabs.github.io/amazon-kinesis-data-generator/web/help.html 8 | -------------------------------------------------------------------------------- /scripts/sdlimmersionlab-IAMuser-policy-PA2.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": [ 6 | "logs:CreateLogStream", 7 | "logs:DescribeLogStreams", 8 | "s3:ListBucket", 9 | "logs:PutLogEvents" 10 | ], 11 | "Resource": [ 12 | "arn:aws:s3:::aws-glue-jes-prod-eu-west-1-assets", 13 | "arn:aws:logs:eu-west-1:503254810580:log-group:/aws/sagemaker/*", 14 | "arn:aws:logs:eu-west-1:503254810580:log-group:/aws/sagemaker/*:log-stream:aws-glue-*" 15 | ], 16 | "Effect": "Allow", 17 | "Sid": "VisualEditor0" 18 | }, 19 | { 20 | "Action": "s3:GetObject", 21 | "Resource": "arn:aws:s3:::aws-glue-jes-prod-eu-west-1-assets*", 22 | "Effect": "Allow", 23 | "Sid": "VisualEditor1" 24 | }, 25 | { 26 | "Action": [ 27 | "s3:PutAnalyticsConfiguration", 28 | "s3:GetObjectVersionTagging", 29 | "s3:CreateBucket", 30 | "s3:ReplicateObject", 31 | "s3:GetObjectAcl", 32 | "s3:DeleteBucketWebsite", 33 | "s3:PutLifecycleConfiguration", 34 | "s3:GetObjectVersionAcl", 35 | "s3:PutObjectTagging", 36 | "s3:DeleteObject", 37 | "s3:DeleteObjectTagging", 38 | "s3:GetBucketPolicyStatus", 39 | "s3:GetBucketWebsite", 40 | "s3:PutReplicationConfiguration", 41 | "s3:DeleteObjectVersionTagging", 42 | "s3:GetBucketNotification", 43 | "s3:PutBucketCORS", 44 | "s3:GetReplicationConfiguration", 45 | "s3:ListMultipartUploadParts", 46 | "s3:GetObject", 47 | "s3:PutBucketNotification", 48 | "s3:PutObject", 49 | "s3:PutBucketLogging", 50 | "s3:GetAnalyticsConfiguration", 51 | "s3:GetObjectVersionForReplication", 52 | "s3:GetLifecycleConfiguration", 53 | "s3:ListBucketByTags", 54 | "s3:GetBucketTagging", 55 | "s3:GetInventoryConfiguration", 56 | 
"s3:PutAccelerateConfiguration", 57 | "s3:DeleteObjectVersion", 58 | "s3:GetBucketLogging", 59 | "s3:ListBucketVersions", 60 | "s3:ReplicateTags", 61 | "s3:RestoreObject", 62 | "s3:GetAccelerateConfiguration", 63 | "s3:ListBucket", 64 | "s3:GetBucketPolicy", 65 | "s3:PutEncryptionConfiguration", 66 | "s3:GetEncryptionConfiguration", 67 | "s3:GetObjectVersionTorrent", 68 | "s3:AbortMultipartUpload", 69 | "s3:GetBucketRequestPayment", 70 | "s3:PutBucketTagging", 71 | "s3:GetObjectTagging", 72 | "s3:GetMetricsConfiguration", 73 | "s3:DeleteBucket", 74 | "s3:PutBucketVersioning", 75 | "s3:GetBucketPublicAccessBlock", 76 | "s3:ListBucketMultipartUploads", 77 | "s3:PutMetricsConfiguration", 78 | "s3:PutObjectVersionTagging", 79 | "s3:GetBucketVersioning", 80 | "s3:GetBucketAcl", 81 | "s3:PutInventoryConfiguration", 82 | "s3:GetObjectTorrent", 83 | "s3:PutBucketRequestPayment", 84 | "s3:PutBucketWebsite", 85 | "s3:GetBucketCORS", 86 | "s3:GetBucketLocation", 87 | "s3:GetObjectVersion", 88 | "s3:ReplicateDelete" 89 | ], 90 | "Resource": "arn:aws:s3:::aws-athena-query-results-503254810580eu-west-1*", 91 | "Effect": "Allow", 92 | "Sid": "VisualEditor2" 93 | }, 94 | { 95 | "Action": [ 96 | "s3:GetAccountPublicAccessBlock", 97 | "s3:ListAllMyBuckets", 98 | "s3:HeadBucket" 99 | ], 100 | "Resource": "*", 101 | "Effect": "Allow", 102 | "Sid": "VisualEditor3" 103 | }, 104 | { 105 | "Action": "logs:CreateLogGroup", 106 | "Resource": [ 107 | "arn:aws:logs:eu-west-1:503254810580:log-group:/aws/sagemaker/*", 108 | "arn:aws:logs:eu-west-1:503254810580:log-group:/aws/sagemaker/*:log-stream:aws-glue-*" 109 | ], 110 | "Effect": "Allow", 111 | "Sid": "VisualEditor4" 112 | }, 113 | { 114 | "Action": [ 115 | "glue:GetDevEndpoints", 116 | "glue:UpdateDevEndpoint", 117 | "glue:GetDevEndpoint" 118 | ], 119 | "Resource": "arn:aws:glue:eu-west-1:503254810580:devEndpoint/gj-tame-bda-kdg-raw2parquet-devEndpoint*", 120 | "Effect": "Allow", 121 | "Sid": "VisualEditor5" 122 | } 123 | ] 124 | } 
-------------------------------------------------------------------------------- /scripts/serverlessDataLakeImmersionIAMcf.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Description": "PA17 2019-05-09 - @akirmak - RevHist: PA17: Added missing permission to SageMakerNotebook role reported by @hohenber. PA16-2018-12-13: sagemaker notebook role type fixed. PA15:-(parameters added for AcctId and S3 bucket's name initials)", 4 | "Parameters": { 5 | "yourInitials": { 6 | "Description": "Your Initials to be used in the s3-bucket created. All in small letters pls. e.g. It shall be 'fs' for Frank Sinatra", 7 | "Type": "String", 8 | "MinLength": "2", 9 | "MaxLength": "5" 10 | } 11 | }, 12 | "Resources": { 13 | "tameGlueRoleSlessDataLakeImmersion": { 14 | "Type": "AWS::IAM::Role", 15 | "Properties": { 16 | "AssumeRolePolicyDocument": { 17 | "Version": "2012-10-17", 18 | "Statement": [ 19 | { 20 | "Effect": "Allow", 21 | "Principal": { 22 | "Service": [ 23 | "glue.amazonaws.com" 24 | ] 25 | }, 26 | "Action": [ 27 | "sts:AssumeRole" 28 | ] 29 | } 30 | ] 31 | }, 32 | "ManagedPolicyArns": [ 33 | "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole" 34 | ] 35 | }, 36 | "Metadata": { 37 | "AWS::CloudFormation::Designer": { 38 | "id": "38df71f9-6e6c-4cb3-bc01-40d0988453b1" 39 | } 40 | } 41 | }, 42 | "tameGluePolicySlessDataLakeImmersion": { 43 | "Type": "AWS::IAM::Policy", 44 | "Properties": { 45 | "PolicyName": "AWSGlueServicePolicyServerlessDataLakeImmersion", 46 | "PolicyDocument": { 47 | "Version": "2012-10-17", 48 | "Statement": [ 49 | { 50 | "Sid": "VisualEditor0", 51 | "Effect": "Allow", 52 | "Action": [ 53 | "s3:PutObject", 54 | "s3:GetObject", 55 | "s3:DeleteObject" 56 | ], 57 | "Resource": { 58 | "Fn::Join": [ 59 | "", 60 | [ 61 | "arn:aws:s3:::", 62 | { 63 | "Ref": "yourInitials" 64 | }, 65 | "-tame-bda-immersion/*" 66 | ] 67 | ] 68 | } 69 | }, 70 | { 71 | "Effect": 
"Allow", 72 | "Action": [ 73 | "s3:GetObject", 74 | "s3:PutObject" 75 | ], 76 | "Resource": { 77 | "Fn::Join": [ 78 | "", 79 | [ 80 | "arn:aws:s3:::", 81 | { 82 | "Ref": "yourInitials" 83 | }, 84 | "-tame-bda-immersion/compressed-parquet*" 85 | ] 86 | ] 87 | } 88 | } 89 | ] 90 | }, 91 | "Roles": [ 92 | { 93 | "Ref": "tameGlueRoleSlessDataLakeImmersion" 94 | } 95 | ] 96 | }, 97 | "Metadata": { 98 | "AWS::CloudFormation::Designer": { 99 | "id": "ed799633-3378-4ef8-8cbe-37b6c7ad5181" 100 | } 101 | } 102 | }, 103 | "tameFHoseRoleSlessDataLakeImmersion": { 104 | "Type": "AWS::IAM::Role", 105 | "Properties": { 106 | "AssumeRolePolicyDocument": { 107 | "Version": "2012-10-17", 108 | "Statement": [ 109 | { 110 | "Sid": "", 111 | "Effect": "Allow", 112 | "Principal": { 113 | "Service": "firehose.amazonaws.com" 114 | }, 115 | "Action": "sts:AssumeRole", 116 | "Condition": { 117 | "StringEquals": { 118 | "sts:ExternalId": { 119 | "Ref": "AWS::AccountId" 120 | } 121 | } 122 | } 123 | } 124 | ] 125 | }, 126 | "ManagedPolicyArns": [ 127 | "arn:aws:iam::aws:policy/CloudWatchLogsFullAccess" 128 | ] 129 | }, 130 | "Metadata": { 131 | "AWS::CloudFormation::Designer": { 132 | "id": "13e1ef2d-884d-4875-8b55-27a4843db116" 133 | } 134 | } 135 | }, 136 | "tameFHosePolicySlessDataLakeImmersion": { 137 | "Type": "AWS::IAM::Policy", 138 | "Properties": { 139 | "PolicyName": "FirehosePolicyServerlessDataLakeImmersion", 140 | "PolicyDocument": { 141 | "Version": "2012-10-17", 142 | "Statement": [ 143 | { 144 | "Sid": "", 145 | "Effect": "Allow", 146 | "Action": [ 147 | "glue:GetTableVersions" 148 | ], 149 | "Resource": "*" 150 | }, 151 | { 152 | "Sid": "", 153 | "Effect": "Allow", 154 | "Action": [ 155 | "s3:AbortMultipartUpload", 156 | "s3:GetBucketLocation", 157 | "s3:GetObject", 158 | "s3:ListBucket", 159 | "s3:ListBucketMultipartUploads", 160 | "s3:PutObject" 161 | ], 162 | "Resource": [ 163 | { 164 | "Fn::Join": [ 165 | "", 166 | [ 167 | "arn:aws:s3:::", 168 | { 169 | "Ref": 
"yourInitials" 170 | }, 171 | "-tame-bda-immersion" 172 | ] 173 | ] 174 | }, 175 | { 176 | "Fn::Join": [ 177 | "", 178 | [ 179 | "arn:aws:s3:::", 180 | { 181 | "Ref": "yourInitials" 182 | }, 183 | "-tame-bda-immersion/*" 184 | ] 185 | ] 186 | }, 187 | "arn:aws:s3:::%FIREHOSE_BUCKET_NAME%", 188 | "arn:aws:s3:::%FIREHOSE_BUCKET_NAME%/*" 189 | ] 190 | }, 191 | { 192 | "Sid": "", 193 | "Effect": "Allow", 194 | "Action": [ 195 | "lambda:InvokeFunction", 196 | "lambda:GetFunctionConfiguration" 197 | ], 198 | "Resource": { 199 | "Fn::Join": [ 200 | "", 201 | [ 202 | "arn:aws:lambda:", 203 | { 204 | "Ref": "AWS::Region" 205 | }, 206 | ":", 207 | { 208 | "Ref": "AWS::AccountId" 209 | }, 210 | ":function:%FIREHOSE_DEFAULT_FUNCTION%:%FIREHOSE_DEFAULT_VERSION%" 211 | ] 212 | ] 213 | } 214 | }, 215 | { 216 | "Sid": "", 217 | "Effect": "Allow", 218 | "Action": [ 219 | "logs:PutLogEvents" 220 | ], 221 | "Resource": [ 222 | { 223 | "Fn::Join": [ 224 | "", 225 | [ 226 | "arn:aws:logs:", 227 | { 228 | "Ref": "AWS::Region" 229 | }, 230 | ":", 231 | { 232 | "Ref": "AWS::AccountId" 233 | }, 234 | ":log-group:/aws/kinesisfirehose/tamebda-rta-kinesisfh-prodcat:log-stream:*" 235 | ] 236 | ] 237 | } 238 | ] 239 | }, 240 | { 241 | "Sid": "", 242 | "Effect": "Allow", 243 | "Action": [ 244 | "kinesis:DescribeStream", 245 | "kinesis:GetShardIterator", 246 | "kinesis:GetRecords" 247 | ], 248 | "Resource": { 249 | "Fn::Join": [ 250 | "", 251 | [ 252 | "arn:aws:kinesis:", 253 | { 254 | "Ref": "AWS::Region" 255 | }, 256 | ":", 257 | { 258 | "Ref": "AWS::AccountId" 259 | }, 260 | ":stream/%FIREHOSE_STREAM_NAME%" 261 | ] 262 | ] 263 | } 264 | }, 265 | { 266 | "Effect": "Allow", 267 | "Action": [ 268 | "kms:Decrypt" 269 | ], 270 | "Resource": [ 271 | "arn:aws:kms:region:accountid:key/%SSE_KEY_ARN%" 272 | ], 273 | "Condition": { 274 | "StringEquals": { 275 | "kms:ViaService": "kinesis.%REGION_NAME%.amazonaws.com" 276 | }, 277 | "StringLike": { 278 | "kms:EncryptionContext:aws:kinesis:arn": { 279 | 
"Fn::Join": [ 280 | "", 281 | [ 282 | "arn:aws:kinesis:%REGION_NAME%:", 283 | { 284 | "Ref": "AWS::AccountId" 285 | }, 286 | ":stream/%FIREHOSE_STREAM_NAME%" 287 | ] 288 | ] 289 | } 290 | } 291 | } 292 | } 293 | ] 294 | }, 295 | "Roles": [ 296 | { 297 | "Ref": "tameFHoseRoleSlessDataLakeImmersion" 298 | } 299 | ] 300 | }, 301 | "Metadata": { 302 | "AWS::CloudFormation::Designer": { 303 | "id": "e517d929-5247-426f-9c9e-2c8b7c9a37c6" 304 | } 305 | } 306 | }, 307 | "tameSageMakerNBookRoleSlessDataLake": { 308 | "Type": "AWS::IAM::Role", 309 | "Properties": { 310 | "AssumeRolePolicyDocument": { 311 | "Version": "2012-10-17", 312 | "Statement": [ 313 | { 314 | "Effect": "Allow", 315 | "Principal": { 316 | "Service": "sagemaker.amazonaws.com" 317 | }, 318 | "Action": "sts:AssumeRole" 319 | } 320 | ] 321 | }, 322 | "ManagedPolicyArns": [ 323 | "arn:aws:iam::aws:policy/AmazonS3FullAccess", 324 | "arn:aws:iam::aws:policy/AmazonAthenaFullAccess", 325 | "arn:aws:iam::aws:policy/AWSGlueConsoleSageMakerNotebookFullAccess", 326 | "arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess" 327 | ] 328 | }, 329 | "Metadata": { 330 | "AWS::CloudFormation::Designer": { 331 | "id": "c599e0d5-d036-4fa1-9503-59cebc8349d1" 332 | } 333 | } 334 | }, 335 | "tameSageMakerNBookPolicySlessDataLake": { 336 | "Type": "AWS::IAM::Policy", 337 | "Properties": { 338 | "PolicyName": "SageMakerNotebookPolicyServerlessDataLake", 339 | "PolicyDocument": { 340 | "Version": "2012-10-17", 341 | "Statement": [ 342 | { 343 | "Sid": "VisualEditor0", 344 | "Effect": "Allow", 345 | "Action": [ 346 | "logs:CreateLogStream", 347 | "logs:DescribeLogStreams", 348 | "s3:ListBucket", 349 | "logs:PutLogEvents" 350 | ], 351 | "Resource": [ 352 | { 353 | "Fn::Join": [ 354 | "", 355 | [ 356 | "arn:aws:s3:::aws-glue-jes-prod-", 357 | { 358 | "Ref": "AWS::Region" 359 | }, 360 | "-assets" 361 | ] 362 | ] 363 | }, 364 | { 365 | "Fn::Join": [ 366 | "", 367 | [ 368 | "arn:aws:logs:", 369 | { 370 | "Ref": "AWS::Region" 371 | }, 372 | 
":", 373 | { 374 | "Ref": "AWS::AccountId" 375 | }, 376 | ":log-group:/aws/sagemaker/*" 377 | ] 378 | ] 379 | }, 380 | { 381 | "Fn::Join": [ 382 | "", 383 | [ 384 | "arn:aws:logs:", 385 | { 386 | "Ref": "AWS::Region" 387 | }, 388 | ":", 389 | { 390 | "Ref": "AWS::AccountId" 391 | }, 392 | ":log-group:/aws/sagemaker/*:log-stream:aws-glue-*" 393 | ] 394 | ] 395 | } 396 | ] 397 | }, 398 | { 399 | "Sid": "VisualEditor1", 400 | "Effect": "Allow", 401 | "Action": "s3:GetObject", 402 | "Resource": { 403 | "Fn::Join": [ 404 | "", 405 | [ 406 | "arn:aws:s3:::aws-glue-jes-prod-", 407 | { 408 | "Ref": "AWS::Region" 409 | }, 410 | "-assets*" 411 | ] 412 | ] 413 | } 414 | }, 415 | { 416 | "Sid": "VisualEditor2", 417 | "Effect": "Allow", 418 | "Action": [ 419 | "s3:PutAnalyticsConfiguration", 420 | "s3:GetObjectVersionTagging", 421 | "s3:CreateBucket", 422 | "s3:ReplicateObject", 423 | "s3:GetObjectAcl", 424 | "s3:DeleteBucketWebsite", 425 | "s3:PutLifecycleConfiguration", 426 | "s3:GetObjectVersionAcl", 427 | "s3:PutObjectTagging", 428 | "s3:DeleteObject", 429 | "s3:DeleteObjectTagging", 430 | "s3:GetBucketPolicyStatus", 431 | "s3:GetBucketWebsite", 432 | "s3:PutReplicationConfiguration", 433 | "s3:DeleteObjectVersionTagging", 434 | "s3:GetBucketNotification", 435 | "s3:PutBucketCORS", 436 | "s3:GetReplicationConfiguration", 437 | "s3:ListMultipartUploadParts", 438 | "s3:GetObject", 439 | "s3:PutBucketNotification", 440 | "s3:PutObject", 441 | "s3:PutBucketLogging", 442 | "s3:GetAnalyticsConfiguration", 443 | "s3:GetObjectVersionForReplication", 444 | "s3:GetLifecycleConfiguration", 445 | "s3:ListBucketByTags", 446 | "s3:GetBucketTagging", 447 | "s3:GetInventoryConfiguration", 448 | "s3:PutAccelerateConfiguration", 449 | "s3:DeleteObjectVersion", 450 | "s3:GetBucketLogging", 451 | "s3:ListBucketVersions", 452 | "s3:ReplicateTags", 453 | "s3:RestoreObject", 454 | "s3:GetAccelerateConfiguration", 455 | "s3:ListBucket", 456 | "s3:GetBucketPolicy", 457 | 
"s3:PutEncryptionConfiguration", 458 | "s3:GetEncryptionConfiguration", 459 | "s3:GetObjectVersionTorrent", 460 | "s3:AbortMultipartUpload", 461 | "s3:GetBucketRequestPayment", 462 | "s3:PutBucketTagging", 463 | "s3:GetObjectTagging", 464 | "s3:GetMetricsConfiguration", 465 | "s3:DeleteBucket", 466 | "s3:PutBucketVersioning", 467 | "s3:GetBucketPublicAccessBlock", 468 | "s3:ListBucketMultipartUploads", 469 | "s3:PutMetricsConfiguration", 470 | "s3:PutObjectVersionTagging", 471 | "s3:GetBucketVersioning", 472 | "s3:GetBucketAcl", 473 | "s3:PutInventoryConfiguration", 474 | "s3:GetObjectTorrent", 475 | "s3:PutBucketRequestPayment", 476 | "s3:PutBucketWebsite", 477 | "s3:GetBucketCORS", 478 | "s3:GetBucketLocation", 479 | "s3:GetObjectVersion", 480 | "s3:ReplicateDelete" 481 | ], 482 | "Resource": { 483 | "Fn::Join": [ 484 | "", 485 | [ 486 | "arn:aws:s3:::aws-athena-query-results-", 487 | { 488 | "Ref": "AWS::AccountId" 489 | }, 490 | { 491 | "Ref": "AWS::Region" 492 | }, 493 | "*" 494 | ] 495 | ] 496 | } 497 | }, 498 | { 499 | "Sid": "VisualEditor3", 500 | "Effect": "Allow", 501 | "Action": [ 502 | "s3:GetAccountPublicAccessBlock", 503 | "s3:ListAllMyBuckets", 504 | "s3:HeadBucket" 505 | ], 506 | "Resource": "*" 507 | }, 508 | { 509 | "Sid": "VisualEditor4", 510 | "Effect": "Allow", 511 | "Action": "logs:CreateLogGroup", 512 | "Resource": [ 513 | { 514 | "Fn::Join": [ 515 | "", 516 | [ 517 | "arn:aws:logs:", 518 | { 519 | "Ref": "AWS::Region" 520 | }, 521 | ":", 522 | { 523 | "Ref": "AWS::AccountId" 524 | }, 525 | ":log-group:/aws/sagemaker/*" 526 | ] 527 | ] 528 | }, 529 | { 530 | "Fn::Join": [ 531 | "", 532 | [ 533 | "arn:aws:logs:", 534 | { 535 | "Ref": "AWS::Region" 536 | }, 537 | ":", 538 | { 539 | "Ref": "AWS::AccountId" 540 | }, 541 | ":log-group:/aws/sagemaker/*:log-stream:aws-glue-*" 542 | ] 543 | ] 544 | } 545 | ] 546 | }, 547 | { 548 | "Sid": "VisualEditor5", 549 | "Effect": "Allow", 550 | "Action": [ 551 | "glue:GetDevEndpoints", 552 | 
"glue:UpdateDevEndpoint", 553 | "glue:GetDevEndpoint" 554 | ], 555 | "Resource": { 556 | "Fn::Join": [ 557 | "", 558 | [ 559 | "arn:aws:glue:", 560 | { 561 | "Ref": "AWS::Region" 562 | }, 563 | ":", 564 | { 565 | "Ref": "AWS::AccountId" 566 | }, 567 | ":devEndpoint/*" 568 | ] 569 | ] 570 | } 571 | } 572 | ] 573 | }, 574 | "Roles": [ 575 | { 576 | "Ref": "tameSageMakerNBookRoleSlessDataLake" 577 | } 578 | ] 579 | }, 580 | "Metadata": { 581 | "AWS::CloudFormation::Designer": { 582 | "id": "e058c21b-ec9d-4936-95eb-2c492465d87b" 583 | } 584 | } 585 | } 586 | } 587 | } 588 | --------------------------------------------------------------------------------