├── .gitignore ├── LICENSE.txt ├── README ├── common.conf.sample ├── copy_to_s3.hql ├── impala.conf ├── impala.txt ├── ingest.scala ├── schema.hql └── spark.conf /.gitignore: -------------------------------------------------------------------------------- 1 | *.h2.db 2 | common.conf 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | 2 | Strata 2016 NYC - Hadoop in the Cloud Tutorial 3 | ============================================== 4 | 5 | Introduction 6 | ------------ 7 | 8 | This repository contains everything needed to run through a two step pipeline 9 | that will ingest data via Spark and make it available for interactive queries 10 | via Impala. 
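At a glance, a full pass looks roughly like the sketch below. Each command is
covered in detail in the sections that follow; the terminate-remote call is not
part of the tutorial steps and is shown only as the natural teardown for a
transient cluster, assuming your Cloudera Director client version supports it:

    $ cloudera-director bootstrap-remote spark.conf --lp.remote.username=admin
    # ... run ingest.scala and copy the tables to S3 (see "Data ingest via Spark") ...
    $ cloudera-director terminate-remote spark.conf --lp.remote.username=admin
    $ cloudera-director bootstrap-remote impala.conf --lp.remote.username=admin
    # ... the Impala cluster stays up for interactive queries (see below) ...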
11 | 
12 | The Spark step is expected to be a transient cluster running only for a couple
13 | of hours each day (potentially triggered via cron).
14 | 
15 | The Impala cluster is expected to be long-running and elastic, serving multiple
16 | users via Hue or the JDBC interface.
17 | 
18 | Common Settings
19 | ---------------
20 | 
21 | Start by going through the AWS Quickstart flow:
22 | 
23 | https://aws.amazon.com/quickstart/
24 | http://docs.aws.amazon.com/quickstart/latest/cloudera/welcome.html
25 | 
26 | Relevant identifiers will be available as CloudFormation outputs.
27 | 
28 | Go to the AWS console and make note of the EC2 instance called ClusterLauncher. This is your Director server instance.
29 | 
30 | SSH to the created Director server instance.
31 | $ ssh ec2-user@<director-server-ip> -i <your-private-key.pem>
32 | 
33 | Download the conf files from this GitHub repo to your Director server instance.
34 | Modify common.conf.sample by providing details specific to your AWS account.
35 | Use the information from the CloudFormation output. Use the ClusterLauncher security group, and not the NAT security group.
36 | Alternatively, look at aws.sample.conf to see values that should go into common.conf.
37 | Save this file as common.conf.
38 | 
39 | Run validation for both configuration files to ensure everything is
40 | configured properly:
41 | 
42 | $ cloudera-director validate spark.conf
43 | $ cloudera-director validate impala.conf
44 | 
45 | Create a tunnel to Director from your local machine:
46 | 
47 | $ ssh -C -L 7189:localhost:7189 ec2-user@<director-server-ip>
48 | # Use your browser to go to http://localhost:7189/
49 | 
50 | Data ingest via Spark
51 | ---------------------
52 | 
53 | Ask Director to set up the Spark cluster for ETL:
54 | 
55 | $ cloudera-director bootstrap-remote spark.conf --lp.remote.username=admin
56 | # Director will ask for the admin password
57 | 
58 | Progress information is also available in the Director UI.
59 | 
60 | Establish a tunnel to Cloudera Manager:
61 | 
62 | $ ssh -i cloudera.pem -CN -L 7180:<cm-private-ip>:7180 ec2-user@<director-server-ip>
63 | 
64 | SSH into the master node and open the Spark shell:
65 | 
66 | $ sudo -u hdfs -i bash
67 | 
68 | $ curl -o ingest.scala https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/ingest.scala
69 | $ spark-shell -i ingest.scala
70 | 
71 | $ curl -o schema.hql https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/schema.hql
72 | $ curl -o copy_to_s3.hql https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/copy_to_s3.hql
73 | 
74 | Modify the schema.hql file to point to a new S3 bucket created via the AWS console.
75 | 
76 | $ hive -f schema.hql
77 | $ hive -f copy_to_s3.hql
78 | 
79 | SQL via Impala on ingested data
80 | -------------------------------
81 | 
82 | Ask Director to set up the Impala cluster for interactive queries:
83 | 
84 | $ cloudera-director bootstrap-remote impala.conf --lp.remote.username=admin
85 | # Director will ask for the admin password
86 | 
87 | Establish a tunnel to Cloudera Manager:
88 | 
89 | $ ssh -i cloudera.pem -CN -L 7180:<cm-private-ip>:7180 ec2-user@<director-server-ip>
90 | 
91 | SSH into the master node and register the schema in the Hive metastore (shared with Impala):
92 | 
93 | $ sudo -u hdfs -i bash
94 | 
95 | $ curl -o schema.hql https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/schema.hql
96 | $ hive -f schema.hql
97 | 
98 | Start the impala-shell from a worker node. Identify the Impala worker node from the Cloudera Director UI. 
99 | $ impala-shell -i 100 | # Run some interesting queries reading from S3 101 | 102 | -------------------------------------------------------------------------------- /common.conf.sample: -------------------------------------------------------------------------------- 1 | # 2 | # (c) Copyright 2015 Cloudera, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # 18 | # Common configurations for tutorial clusters 19 | # 20 | 21 | environmentName: Strata Demo Environment 22 | deploymentName: Strata Demo Deployment 23 | 24 | # RHEL 7.2 in us-west-2 25 | image: ami-775e4f16 26 | 27 | iamProfileName: REPLACE-ME 28 | 29 | network { 30 | region: us-west-2 31 | subnetId: subnet-REPLACE-ME 32 | securityGroupsIds: sg-REPLACE-ME 33 | } 34 | 35 | tags { 36 | owner: ${?USER} 37 | } 38 | 39 | secrets { 40 | privateKey: "/home/ec2-user/cloudera-aws-quickstart-09-25-2016.pem" 41 | } 42 | -------------------------------------------------------------------------------- /copy_to_s3.hql: -------------------------------------------------------------------------------- 1 | 2 | INSERT OVERWRITE TABLE age SELECT * FROM local_age; 3 | 4 | INSERT OVERWRITE TABLE experience SELECT * FROM local_experience; 5 | 6 | INSERT OVERWRITE TABLE players SELECT * FROM local_players; 7 | -------------------------------------------------------------------------------- /impala.conf: -------------------------------------------------------------------------------- 1 | # 2 | # (c) Copyright 2015 Cloudera, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include file("common.conf") 18 | 19 | # 20 | # Cluster name 21 | # 22 | 23 | name: C5-Impala 24 | 25 | # 26 | # Cloud provider configuration (credentials, region or zone and optional default image) 27 | # 28 | 29 | provider { 30 | type: aws 31 | 32 | region: ${network.region} 33 | subnetId: ${network.subnetId} 34 | securityGroupsIds: ${network.securityGroupsIds} 35 | 36 | instanceNamePrefix: strata-2016-impala 37 | 38 | rootVolumeSizeGB: 128 # matching the size of the pre-extracted AMI 39 | rootVolumeType: gp2 # OR standard (for EBS magnetic) 40 | 41 | iamProfileName: ${iamProfileName} 42 | associatePublicIpAddresses: false 43 | } 44 | 45 | # 46 | # SSH credentials to use to connect to the instances 47 | # 48 | 49 | ssh { 50 | username: ec2-user # for RHEL image 51 | privateKey: ${secrets.privateKey} 52 | } 53 | 54 | # 55 | # A list of instance types to use for group of nodes or management services 56 | # 57 | 58 | instances { 59 | m4x { 60 | type: m4.xlarge 61 | image: ${image} 62 | tags: ${tags} 63 | } 64 | } 65 | 66 | # 67 | # Configuration for Cloudera Manager. Cloudera Director can use an existing instance 68 | # or bootstrap everything from scratch for a new cluster 69 | # 70 | 71 | cloudera-manager { 72 | instance: ${instances.m4x} { 73 | instanceNamePrefix: strata-2016-cm 74 | } 75 | 76 | configs { 77 | CLOUDERA_MANAGER { 78 | custom_banner_html: "Managed by Cloudera Director" 79 | } 80 | } 81 | 82 | repository: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/5.8.2/" 83 | repositoryKeyUrl: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/RPM-GPG-KEY-cloudera" 84 | 85 | # 86 | # Automatically activate 60-Day Cloudera Enterprise Trial 87 | # 88 | 89 | enableEnterpriseTrial: true 90 | } 91 | 92 | # 93 | # Cluster description 94 | # 95 | 96 | cluster { 97 | 98 | products { 99 | CDH: 5.8 100 | } 101 | 102 | parcelRepositories: ["http://archive.cloudera.com/cdh5/parcels/5.8.2/"] 103 | 104 | services: [HDFS, YARN, HIVE, IMPALA] 105 | 106 | masters { 107 | count: 1 108 | 109 | instance: ${instances.m4x} 110 | 111 | roles { 112 | HDFS: [NAMENODE, SECONDARYNAMENODE] 113 | YARN: [RESOURCEMANAGER, JOBHISTORY] 114 | HIVE: [HIVESERVER2, HIVEMETASTORE] 115 | IMPALA: [CATALOGSERVER, STATESTORE] 116 | } 117 | } 118 | 119 | workers { 120 | count: 1 121 | 122 | instance: ${instances.m4x} 123 | 124 | roles { 125 | HDFS: [DATANODE] 126 | YARN: [NODEMANAGER] 127 | IMPALA: [IMPALAD] 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /impala.txt: -------------------------------------------------------------------------------- 1 | Programmatic Data Analysis with Impala 2 | ====================================== 3 | 4 | Now that we have all the data, it’s time to start analyzing it with Impala. 5 | 6 | Our table has a lot of information in it—raw stats, z-scores, normalized 7 | z-scores—so we can immediately start asking and answering some good questions. 8 | 9 | Let’s check in on the MVP race for this year. 
10 | 
11 | > select name, zTot from Players where year=2016 order by zTot desc limit 10;
12 | 
13 | [Stephen Curry,19.766248304312754]
14 | [Kevin Durant,15.323017389251323]
15 | [Anthony Davis,13.186429940875069]
16 | [Kawhi Leonard,13.18181904233336]
17 | [James Harden,12.622408009920706]
18 | [Russell Westbrook,12.26014043592826]
19 | [Kyle Lowry,11.634357073733122]
20 | [Paul Millsap,11.28903998833887]
21 | [Chris Paul,10.843486407063033]
22 | [Jimmy Butler,10.475908301410975]
23 | 
24 | 
25 | Basketball fans should not be surprised to see Stephen Curry topping the list,
26 | as he's having a historic year to date. The rest of the list contains players
27 | who are also having great years, so this is a good sanity check that our
28 | z-score is a useful statistic. Let's check out the normalized scores as well to
29 | see if they tell a different story:
30 | 
31 | 
32 | > select name, nTot from Players where year=2016 order by nTot desc limit 10;
33 | 
34 | [Stephen Curry,3.911865399387443]
35 | [Kevin Durant,2.8729484855957916]
36 | [Kawhi Leonard,2.7288580160780636]
37 | [Anthony Davis,2.599364621997217]
38 | [Russell Westbrook,2.4555072670169955]
39 | [Paul Millsap,2.3037685595490816]
40 | [LeBron James,2.1939606667616527]
41 | [Kyle Lowry,2.151090172022115]
42 | [Chris Paul,2.1331243771597586]
43 | [James Harden,2.0788749389912686]
44 | 
45 | 
46 | We see similar players in both lists, but the ordering is slightly different.
47 | Anthony Davis and Kawhi Leonard are neck and neck in z-score, but Leonard has a
48 | slight edge in normalized z-score, which suggests that he contributes more
49 | significantly across a wider spectrum of stats than Davis does, or that Davis's
50 | value is concentrated in the few stats he dominates. Curry has a commanding lead in
51 | both zTot and nTot—29% ahead of the second-place player in z-score and 39% in
52 | normalized z-score. Let's have a look at Curry's full numbers from this year.
53 | 
54 | 
55 | > select * from Players where year=2016 and name='Stephen Curry';
56 | 
57 | [6,Stephen Curry,2016,27,PG,GSW,42,42,33.9,10.0,19.5,0.51,4.9,10.8,0.451,5.1,8.7,0.583,0.635,5.3,5.9,0.911,0.8,4.6,5.4,6.6,2.1,0.1,3.4,2.0,30.1,3.2822803060371077,3.4693236141930193,6.056007802613955,0.7550168635135219,2.695650422425284,3.076598009775487,-0.6534242946356899,-2.7596520748161213,3.8444476552061824,19.766248304312747,0.8113439563367003,0.42770109585695865,1.0,0.15840862722238425,0.4898747874478281,0.7843137254901962,-0.08887924083607253,-0.6708975521305531,1.0,3.911865399387443]
58 | 
59 | We see that his largest contributor to zTot is z3P at 6.05, meaning his three-point
60 | game is contributing a lot to his value. Indeed, he is the league leader in this
61 | regard.
62 | 
63 | 
64 | > select name, 3p, z3p from Players where year=2016 order by z3p desc limit 10;
65 | 
66 | [Stephen Curry,4.9,6.056007802613957]
67 | [Klay Thompson,3.2,3.6363158210435227]
68 | [Damian Lillard,3.1,3.493980998598203]
69 | [Paul George,2.9,3.2093113537075637]
70 | [Kyle Lowry,2.7,2.9246417088169245]
71 | [J.J. Redick,2.7,2.9246417088169245]
72 | [James Harden,2.7,2.9246417088169245]
73 | [Eric Gordon,2.5,2.6399720639262854]
74 | [Wesley Matthews,2.5,2.6399720639262854]
75 | [C.J. McCollum,2.5,2.6399720639262854]
76 | 
77 | Curry has nearly double the value of the second-place player, teammate Klay
78 | Thompson. At just shy of five three-pointers made a game, Curry is arguably
79 | having one of the best shooting seasons of all time. Or is he? 
Z-scores can
80 | help us here by telling us how much better a player is relative to the rest
81 | of the league. We'll run two queries to demonstrate:
82 | 
83 | 
84 | > select name, 3p, z3p from Players order by 3p desc limit 10;
85 | 
86 | [Stephen Curry,4.9,6.056007802613957]
87 | [Stephen Curry,3.6,4.395853442084636]
88 | [Stephen Curry,3.5,4.372814802572277]
89 | [Ray Allen,3.4,4.735522316886375]
90 | [Ray Allen,3.3,4.863432467545102]
91 | [Dennis Scott,3.3,4.229610850304963]
92 | [George McCloud,3.3,4.229610850304963]
93 | [Stephen Curry,3.3,3.8893664999911772]
94 | [Klay Thompson,3.2,3.6363158210435227]
95 | [Klay Thompson,3.1,3.651613974257506]
96 | 
97 | 
98 | > select name, 3p, z3p from Players order by z3p desc limit 10;
99 | 
100 | [Joe Hassett,1.3,10.220836345844305]
101 | [Mike Dunleavy,1.1,8.683932683360899]
102 | [Darrell Griffith,1.1,8.683932683360899]
103 | [Mike Dunleavy,0.8,7.496775843134757]
104 | [Joe Hassett,1.0,7.456044031025608]
105 | [Brian Taylor,1.2,6.94629576951599]
106 | [Darrell Griffith,1.2,6.9433935103235775]
107 | [Michael Adams,2.5,6.679409664703833]
108 | [Don Buse,0.9,6.671197290917649]
109 | [Danny Ainge,1.8,6.282718341161911]
110 | 
111 | The first query shows that Curry makes, on average, more three-pointers per game
112 | than anyone in the history of the NBA. The second query shows that, relative to
113 | the rest of the league that year, Joe Hassett's 1981 season was the best
114 | three-point shooting season in NBA history. Curry doesn't even rank in the top 10. (He
115 | barely misses it, coming in at 12th.) Although Hassett only made 1.3 three-pointers
116 | a game in 1981, the three-point shot was new and few players were
117 | taking it. Three-pointers are common in today's game, to the point where there
118 | is talk of moving the line back. The proof is always in the numbers (which
119 | you should be able to compute yourself now):
120 | 
121 | In 1981, Hassett's 1.3 three-pointers a game was such a commanding margin
122 | over the league average of .04 that it produces a higher z-score than Curry's
123 | 4.9 in 2016. This example illustrates why it's important to keep in mind that
124 | z-scores measure relative value within a year: we're not claiming that Hassett
125 | is a better three-point shooter than Stephen Curry, just that his performance
126 | in 1981 was a larger outlier than Curry's in 2016. 
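If you want to check those league numbers yourself, a quick sketch against the
same players table (reusing the column names from the queries above; stddev is
Impala's sample standard deviation aggregate) is:

> select year, avg(3p) as league_avg, stddev(3p) as league_stdev
  from Players where year in (1981, 2016) group by year;

Plugging the aggregates into z = (player_3p - league_avg) / league_stdev shows
the effect directly: Hassett's 1.3 sits many more standard deviations above the
1981 average of .04 than Curry's 4.9 sits above the 2016 average.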
127 | 
128 | 
129 | This is based on Jordan Volz's excellent Basketball Stats tutorial for Spark:
130 | http://blog.cloudera.com/blog/2016/06/how-to-analyze-fantasy-sports-using-apache-spark-and-sql/
131 | 
-------------------------------------------------------------------------------- /ingest.scala: --------------------------------------------------------------------------------
1 | 
2 | //********************
3 | // Configure S3 access
4 | //********************
5 | val hadoopConf = sc.hadoopConfiguration
6 | 
7 | // Add your keys if you are not using IAM roles
8 | // hadoopConf.set("fs.s3a.access.key", "REPLACE-ME")
9 | // hadoopConf.set("fs.s3a.secret.key", "REPLACE-ME")
10 | 
11 | //********************
12 | //Classes, Helper Functions + Variables
13 | //********************
14 | import org.apache.spark.util.StatCounter
15 | import org.apache.spark.sql.Row
16 | import org.apache.spark.sql.types._
17 | import org.apache.spark.mllib.linalg.{Vector, Vectors}
18 | import scala.collection.mutable.ListBuffer
19 | 
20 | //helper function to compute a normalized value
21 | def statNormalize(stat:Double, max:Double, min:Double)={
22 | val newmax=math.max(math.abs(max),math.abs(min))
23 | stat/newmax
24 | }
25 | 
26 | //Holds initial bball stats + weighted stats + normalized stats
27 | @serializable case class BballData (year: Int, name: String, position: String, age:Int,
28 | team: String, gp: Int, gs: Int, mp: Double,stats: Array[Double], statsZ:Array[Double]=Array[Double](),
29 | valueZ:Double=0,statsN:Array[Double]=Array[Double](),valueN:Double=0,experience:Double=0)
30 | 
31 | //parse a stat line into a BballData object
32 | def bbParse(input: String,bStats: scala.collection.Map[String,Double]=Map.empty,zStats: scala.collection.Map[String,Double]=Map.empty)={
33 | val line=input.replace(",,",",0,")
34 | val pieces=line.substring(1,line.length-1).split(",")
35 | val year=pieces(0).toInt
36 | val name=pieces(2)
37 | val position=pieces(3)
38 | val age=pieces(4).toInt
39 | val team=pieces(5)
40 | val gp=pieces(6).toInt
41 | val gs=pieces(7).toInt
42 | val mp=pieces(8).toDouble
43 | val stats=pieces.slice(9,31).map(x=>x.toDouble)
44 | var statsZ:Array[Double]=Array.empty
45 | var valueZ:Double=Double.NaN
46 | var statsN:Array[Double]=Array.empty
47 | var valueN:Double=Double.NaN
48 | 
49 | if (!bStats.isEmpty){
50 | val fg=(stats(2)-bStats.apply(year.toString+"_FG%_avg"))*stats(1)
51 | val tp=(stats(3)-bStats.apply(year.toString+"_3P_avg"))/bStats.apply(year.toString+"_3P_stdev")
52 | val ft=(stats(12)-bStats.apply(year.toString+"_FT%_avg"))*stats(11)
53 | val trb=(stats(15)-bStats.apply(year.toString+"_TRB_avg"))/bStats.apply(year.toString+"_TRB_stdev")
54 | val ast=(stats(16)-bStats.apply(year.toString+"_AST_avg"))/bStats.apply(year.toString+"_AST_stdev")
55 | val stl=(stats(17)-bStats.apply(year.toString+"_STL_avg"))/bStats.apply(year.toString+"_STL_stdev")
56 | val blk=(stats(18)-bStats.apply(year.toString+"_BLK_avg"))/bStats.apply(year.toString+"_BLK_stdev")
57 | val tov=(stats(19)-bStats.apply(year.toString+"_TOV_avg"))/bStats.apply(year.toString+"_TOV_stdev")*(-1)
58 | val pts=(stats(21)-bStats.apply(year.toString+"_PTS_avg"))/bStats.apply(year.toString+"_PTS_stdev")
59 | statsZ=Array(fg,ft,tp,trb,ast,stl,blk,tov,pts)
60 | valueZ = statsZ.reduce(_+_)
61 | 
62 | if (!zStats.isEmpty){
63 | val zfg=(fg-zStats.apply(year.toString+"_FG_avg"))/zStats.apply(year.toString+"_FG_stdev")
64 | val zft=(ft-zStats.apply(year.toString+"_FT_avg"))/zStats.apply(year.toString+"_FT_stdev")
65 | val 
fgN=statNormalize(zfg,(zStats.apply(year.toString+"_FG_max")-zStats.apply(year.toString+"_FG_avg"))/zStats.apply(year.toString+"_FG_stdev"),(zStats.apply(year.toString+"_FG_min")-zStats.apply(year.toString+"_FG_avg"))/zStats.apply(year.toString+"_FG_stdev")) 66 | val ftN=statNormalize(zft,(zStats.apply(year.toString+"_FT_max")-zStats.apply(year.toString+"_FT_avg"))/zStats.apply(year.toString+"_FT_stdev"),(zStats.apply(year.toString+"_FT_min")-zStats.apply(year.toString+"_FT_avg"))/zStats.apply(year.toString+"_FT_stdev")) 67 | val tpN=statNormalize(tp,zStats.apply(year.toString+"_3P_max"),zStats.apply(year.toString+"_3P_min")) 68 | val trbN=statNormalize(trb,zStats.apply(year.toString+"_TRB_max"),zStats.apply(year.toString+"_TRB_min")) 69 | val astN=statNormalize(ast,zStats.apply(year.toString+"_AST_max"),zStats.apply(year.toString+"_AST_min")) 70 | val stlN=statNormalize(stl,zStats.apply(year.toString+"_STL_max"),zStats.apply(year.toString+"_STL_min")) 71 | val blkN=statNormalize(blk,zStats.apply(year.toString+"_BLK_max"),zStats.apply(year.toString+"_BLK_min")) 72 | val tovN=statNormalize(tov,zStats.apply(year.toString+"_TOV_max"),zStats.apply(year.toString+"_TOV_min")) 73 | val ptsN=statNormalize(pts,zStats.apply(year.toString+"_PTS_max"),zStats.apply(year.toString+"_PTS_min")) 74 | statsZ=Array(zfg,zft,tp,trb,ast,stl,blk,tov,pts) 75 | valueZ = statsZ.reduce(_+_) 76 | statsN=Array(fgN,ftN,tpN,trbN,astN,stlN,blkN,tovN,ptsN) 77 | valueN=statsN.reduce(_+_) 78 | } 79 | } 80 | BballData(year, name, position, age, team, gp, gs, mp, stats,statsZ,valueZ,statsN,valueN) 81 | } 82 | 83 | //stat counter class -- need printStats method to print out the stats. Useful for transformations 84 | class BballStatCounter extends Serializable { 85 | val stats: StatCounter = new StatCounter() 86 | var missing: Long = 0 87 | 88 | def add(x: Double): BballStatCounter = { 89 | if (x.isNaN) { 90 | missing += 1 91 | } else { 92 | stats.merge(x) 93 | } 94 | this 95 | } 96 | 97 | def merge(other: BballStatCounter): BballStatCounter = { 98 | stats.merge(other.stats) 99 | missing += other.missing 100 | this 101 | } 102 | 103 | def printStats(delim: String): String= { 104 | stats.count + delim + stats.mean + delim + stats.stdev + delim + stats.max + delim + stats.min 105 | } 106 | 107 | override def toString: String = { 108 | "stats: " + stats.toString + " NaN: " + missing 109 | } 110 | } 111 | 112 | object BballStatCounter extends Serializable { 113 | def apply(x: Double) = new BballStatCounter().add(x) 114 | } 115 | 116 | //process raw data into zScores and nScores 117 | def processStats(stats0:org.apache.spark.rdd.RDD[String],txtStat:Array[String],bStats: scala.collection.Map[String,Double]=Map.empty,zStats: scala.collection.Map[String,Double]=Map.empty)={ 118 | //parse stats 119 | val stats1=stats0.map(x=>bbParse(x,bStats,zStats)) 120 | 121 | //group by year 122 | val stats2={if(bStats.isEmpty){ 123 | stats1.keyBy(x=>x.year).map(x=>(x._1,x._2.stats)).groupByKey() 124 | }else{ 125 | stats1.keyBy(x=>x.year).map(x=>(x._1,x._2.statsZ)).groupByKey() 126 | } 127 | } 128 | 129 | //map each stat to StatCounter 130 | val stats3=stats2.map{case (x,y)=>(x,y.map(a=>a.map(b=>BballStatCounter(b))))} 131 | 132 | //merge all stats together 133 | val stats4=stats3.map{case (x,y)=>(x,y.reduce((a,b)=>a.zip(b).map{ case (c,d)=>c.merge(d)}))} 134 | 135 | //combine stats with label and pull label out 136 | val stats5=stats4.map{case (x,y)=>(x,txtStat.zip(y))}.map{x=>(x._2.map{case (y,z)=>(x._1,y,z)})} 137 | 138 | //separate each stat onto 
its own line and print out the Stats to a String 139 | val stats6=stats5.flatMap(x=>x.map(y=>(y._1,y._2,y._3.printStats(",")))) 140 | 141 | //turn stat tuple into key-value pairs with corresponding agg stat 142 | val stats7=stats6.flatMap{case(a,b,c)=>{ 143 | val pieces=c.split(",") 144 | val count=pieces(0) 145 | val mean=pieces(1) 146 | val stdev=pieces(2) 147 | val max=pieces(3) 148 | val min=pieces(4) 149 | Array((a+"_"+b+"_"+"count",count.toDouble),(a+"_"+b+"_"+"avg",mean.toDouble),(a+"_"+b+"_"+"stdev",stdev.toDouble),(a+"_"+b+"_"+"max",max.toDouble),(a+"_"+b+"_"+"min",min.toDouble)) 150 | } 151 | } 152 | stats7 153 | } 154 | 155 | //process stats for age or experience 156 | def processStatsAgeOrExperience(stats0:org.apache.spark.rdd.RDD[(Int, Array[Double])], label:String)={ 157 | 158 | 159 | //group elements by age 160 | val stats1=stats0.groupByKey() 161 | 162 | //turn values into StatCounter objects 163 | val stats2=stats1.map{case(x,y)=>(x,y.map(z=>z.map(a=>BballStatCounter(a))))} 164 | 165 | //Reduce rows by merging StatCounter objects 166 | val stats3=stats2.map{case (x,y)=>(x,y.reduce((a,b)=>a.zip(b).map{case(c,d)=>c.merge(d)}))} 167 | 168 | //turn data into RDD[Row] object for dataframe 169 | val stats4=stats3.map(x=>Array(Array(x._1.toDouble),x._2.flatMap(y=>y.printStats(",").split(",")).map(y=>y.toDouble)).flatMap(y=>y)).map(x=>Row(x(0).toInt,x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8),x(9),x(10),x(11),x(12),x(13),x(14),x(15),x(16),x(17),x(18),x(19),x(20))) 170 | 171 | //create schema for age table 172 | val schema =StructType( 173 | StructField(label, IntegerType, true) :: 174 | StructField("valueZ_count", DoubleType, true) :: 175 | StructField("valueZ_mean", DoubleType, true) :: 176 | StructField("valueZ_stdev", DoubleType, true) :: 177 | StructField("valueZ_max", DoubleType, true) :: 178 | StructField("valueZ_min", DoubleType, true) :: 179 | StructField("valueN_count", DoubleType, true) :: 180 | StructField("valueN_mean", DoubleType, true) :: 181 | StructField("valueN_stdev", DoubleType, true) :: 182 | StructField("valueN_max", DoubleType, true) :: 183 | StructField("valueN_min", DoubleType, true) :: 184 | StructField("deltaZ_count", DoubleType, true) :: 185 | StructField("deltaZ_mean", DoubleType, true) :: 186 | StructField("deltaZ_stdev", DoubleType, true) :: 187 | StructField("deltaZ_max", DoubleType, true) :: 188 | StructField("deltaZ_min", DoubleType, true) :: 189 | StructField("deltaN_count", DoubleType, true) :: 190 | StructField("deltaN_mean", DoubleType, true) :: 191 | StructField("deltaN_stdev", DoubleType, true) :: 192 | StructField("deltaN_max", DoubleType, true) :: 193 | StructField("deltaN_min", DoubleType, true) :: Nil 194 | ) 195 | 196 | //create data frame 197 | sqlContext.createDataFrame(stats4,schema) 198 | } 199 | 200 | //******************** 201 | //Processing + Transformations 202 | //******************** 203 | 204 | 205 | //******************** 206 | //Compute Aggregate Stats Per Year 207 | //******************** 208 | 209 | //read in all stats 210 | // Old, read from HDFS: val stats=sc.textFile("/tmp/BasketballStatsWithYear/*/*").repartition(sc.defaultParallelism) 211 | val stats=sc.textFile("s3a://cloudera-cloud-demo/datasets/BasketballStatsWithYear/*/*").repartition(sc.defaultParallelism) 212 | 213 | //filter out junk rows, clean up data entry errors as well 214 | val filteredStats=stats.filter(x => !x.contains("FG%")).filter(x => x.contains(",")).map(x=>x.replace("*","").replace(",,",",0,")) 215 | filteredStats.cache() 216 | 217 | //process stats 
and save as map 218 | val txtStat=Array("FG","FGA","FG%","3P","3PA","3P%","2P","2PA","2P%","eFG%","FT","FTA","FT%","ORB","DRB","TRB","AST","STL","BLK","TOV","PF","PTS") 219 | val aggStats=processStats(filteredStats,txtStat).collectAsMap 220 | 221 | //collect rdd into map and broadcast 222 | val broadcastStats=sc.broadcast(aggStats) 223 | 224 | 225 | //******************** 226 | //Compute Z-Score Stats Per Year 227 | //******************** 228 | 229 | //parse stats, now tracking weights 230 | val txtStatZ=Array("FG","FT","3P","TRB","AST","STL","BLK","TOV","PTS") 231 | val zStats=processStats(filteredStats,txtStatZ,broadcastStats.value).collectAsMap 232 | 233 | //collect rdd into map and broadcast 234 | val zBroadcastStats=sc.broadcast(zStats) 235 | 236 | 237 | //******************** 238 | //Compute Normalized Stats Per Year 239 | //******************** 240 | 241 | //parse stats, now normalizing 242 | val nStats=filteredStats.map(x=>bbParse(x,broadcastStats.value,zBroadcastStats.value)) 243 | 244 | //map RDD to RDD[Row] so that we can turn it into a dataframe 245 | val nPlayer = nStats.map(x => Row.fromSeq(Array(x.name,x.year,x.age,x.position,x.team,x.gp,x.gs,x.mp) ++ x.stats ++ x.statsZ ++ Array(x.valueZ) ++ x.statsN ++ Array(x.valueN))) 246 | 247 | //create schema for the data frame 248 | val schemaN =StructType( 249 | StructField("name", StringType, true) :: 250 | StructField("year", IntegerType, true) :: 251 | StructField("age", IntegerType, true) :: 252 | StructField("position", StringType, true) :: 253 | StructField("team", StringType, true) :: 254 | StructField("gp", IntegerType, true) :: 255 | StructField("gs", IntegerType, true) :: 256 | StructField("mp", DoubleType, true) :: 257 | StructField("FG", DoubleType, true) :: 258 | StructField("FGA", DoubleType, true) :: 259 | StructField("FGP", DoubleType, true) :: 260 | StructField("3P", DoubleType, true) :: 261 | StructField("3PA", DoubleType, true) :: 262 | StructField("3PP", DoubleType, true) :: 263 | StructField("2P", DoubleType, true) :: 264 | StructField("2PA", DoubleType, true) :: 265 | StructField("2PP", DoubleType, true) :: 266 | StructField("eFG", DoubleType, true) :: 267 | StructField("FT", DoubleType, true) :: 268 | StructField("FTA", DoubleType, true) :: 269 | StructField("FTP", DoubleType, true) :: 270 | StructField("ORB", DoubleType, true) :: 271 | StructField("DRB", DoubleType, true) :: 272 | StructField("TRB", DoubleType, true) :: 273 | StructField("AST", DoubleType, true) :: 274 | StructField("STL", DoubleType, true) :: 275 | StructField("BLK", DoubleType, true) :: 276 | StructField("TOV", DoubleType, true) :: 277 | StructField("PF", DoubleType, true) :: 278 | StructField("PTS", DoubleType, true) :: 279 | StructField("zFG", DoubleType, true) :: 280 | StructField("zFT", DoubleType, true) :: 281 | StructField("z3P", DoubleType, true) :: 282 | StructField("zTRB", DoubleType, true) :: 283 | StructField("zAST", DoubleType, true) :: 284 | StructField("zSTL", DoubleType, true) :: 285 | StructField("zBLK", DoubleType, true) :: 286 | StructField("zTOV", DoubleType, true) :: 287 | StructField("zPTS", DoubleType, true) :: 288 | StructField("zTOT", DoubleType, true) :: 289 | StructField("nFG", DoubleType, true) :: 290 | StructField("nFT", DoubleType, true) :: 291 | StructField("n3P", DoubleType, true) :: 292 | StructField("nTRB", DoubleType, true) :: 293 | StructField("nAST", DoubleType, true) :: 294 | StructField("nSTL", DoubleType, true) :: 295 | StructField("nBLK", DoubleType, true) :: 296 | StructField("nTOV", DoubleType, 
true) ::
297 | StructField("nPTS", DoubleType, true) ::
298 | StructField("nTOT", DoubleType, true) :: Nil
299 | )
300 | 
301 | //create data frame
302 | val dfPlayersT=sqlContext.createDataFrame(nPlayer,schemaN)
303 | 
304 | //save all stats as a temp table
305 | dfPlayersT.registerTempTable("tPlayers")
306 | 
307 | //calculate exp and zdiff, ndiff
308 | val dfPlayers=sqlContext.sql("select age-min_age as exp,tPlayers.* from tPlayers join (select name,min(age)as min_age from tPlayers group by name) as t1 on tPlayers.name=t1.name order by tPlayers.name, exp ")
309 | 
310 | //save as table
311 | dfPlayers.saveAsTable("Local_Players")
312 | //filteredStats.unpersist()
313 | 
314 | //********************
315 | //ANALYSIS
316 | //********************
317 | 
318 | 
319 | //group data by player name
320 | val pStats=dfPlayers.sort(dfPlayers("name"),dfPlayers("exp") asc).map(x=>(x.getString(1),(x.getDouble(50),x.getDouble(40),x.getInt(2),x.getInt(3),Array(x.getDouble(31),x.getDouble(32),x.getDouble(33),x.getDouble(34),x.getDouble(35),x.getDouble(36),x.getDouble(37),x.getDouble(38),x.getDouble(39)),x.getInt(0)))).groupByKey()
321 | pStats.cache
322 | 
323 | //for each player, go through all the years and calculate the change in valueZ and valueN, save into two lists
324 | //one for age, one for experience
325 | //exclude players who played in 1980 from experience, as we only have partial data for them
326 | val excludeNames=dfPlayers.filter(dfPlayers("year")===1980).select(dfPlayers("name")).map(x=>x.mkString).toArray.mkString(",")
327 | 
328 | val pStats1=pStats.map{ case(name,stats) =>
329 | var last = 0
330 | var deltaZ = 0.0
331 | var deltaN = 0.0
332 | var valueZ = 0.0
333 | var valueN = 0.0
334 | var exp = 0
335 | val aList = ListBuffer[(Int,Array[Double])]()
336 | val eList = ListBuffer[(Int,Array[Double])]()
337 | stats.foreach( z => {
338 | if (last>0){
339 | deltaN = z._1 - valueN
340 | deltaZ = z._2 - valueZ
341 | }else{
342 | deltaN = Double.NaN
343 | deltaZ = Double.NaN
344 | }
345 | valueN = z._1
346 | valueZ = z._2
347 | last = z._4
348 | aList += ((last, Array(valueZ,valueN,deltaZ,deltaN)))
349 | if (!excludeNames.contains(name)){
350 | exp = z._6
351 | eList += ((exp, Array(valueZ,valueN,deltaZ,deltaN)))
352 | }
353 | })
354 | (aList,eList)
355 | }
356 | 
357 | pStats1.cache
358 | 
359 | 
360 | //********************
361 | //compute age stats
362 | //********************
363 | 
364 | //extract out the age list
365 | val pStats2=pStats1.flatMap{case(x,y)=>x}
366 | 
367 | //create age data frame
368 | val dfAge=processStatsAgeOrExperience(pStats2, "age")
369 | 
370 | //save as table
371 | dfAge.saveAsTable("Local_Age")
372 | 
373 | //extract out the experience list
374 | val pStats3=pStats1.flatMap{case(x,y)=>y}
375 | 
376 | //create experience dataframe
377 | val dfExperience=processStatsAgeOrExperience(pStats3,"Experience")
378 | 
379 | //save as table
380 | dfExperience.saveAsTable("Local_Experience")
381 | 
382 | pStats1.unpersist()
383 | 
-------------------------------------------------------------------------------- /schema.hql: --------------------------------------------------------------------------------
1 | 
2 | CREATE EXTERNAL TABLE `age`(
3 | `age` int COMMENT '',
4 | `valuez_count` double COMMENT '',
5 | `valuez_mean` double COMMENT '',
6 | `valuez_stdev` double COMMENT '',
7 | `valuez_max` double COMMENT '',
8 | `valuez_min` double COMMENT '',
9 | `valuen_count` double COMMENT '',
10 | `valuen_mean` double COMMENT '',
11 | `valuen_stdev` double COMMENT '',
12 | `valuen_max` double 
COMMENT '', 13 | `valuen_min` double COMMENT '', 14 | `deltaz_count` double COMMENT '', 15 | `deltaz_mean` double COMMENT '', 16 | `deltaz_stdev` double COMMENT '', 17 | `deltaz_max` double COMMENT '', 18 | `deltaz_min` double COMMENT '', 19 | `deltan_count` double COMMENT '', 20 | `deltan_mean` double COMMENT '', 21 | `deltan_stdev` double COMMENT '', 22 | `deltan_max` double COMMENT '', 23 | `deltan_min` double COMMENT '') 24 | ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' 25 | STORED AS 26 | INPUTFORMAT "parquet.hive.DeprecatedParquetInputFormat" 27 | OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat" 28 | LOCATION 's3a://strata-2016-asavu-tutorial/strata-2016-tables/age'; 29 | 30 | CREATE EXTERNAL TABLE `experience`( 31 | `experience` int COMMENT '', 32 | `valuez_count` double COMMENT '', 33 | `valuez_mean` double COMMENT '', 34 | `valuez_stdev` double COMMENT '', 35 | `valuez_max` double COMMENT '', 36 | `valuez_min` double COMMENT '', 37 | `valuen_count` double COMMENT '', 38 | `valuen_mean` double COMMENT '', 39 | `valuen_stdev` double COMMENT '', 40 | `valuen_max` double COMMENT '', 41 | `valuen_min` double COMMENT '', 42 | `deltaz_count` double COMMENT '', 43 | `deltaz_mean` double COMMENT '', 44 | `deltaz_stdev` double COMMENT '', 45 | `deltaz_max` double COMMENT '', 46 | `deltaz_min` double COMMENT '', 47 | `deltan_count` double COMMENT '', 48 | `deltan_mean` double COMMENT '', 49 | `deltan_stdev` double COMMENT '', 50 | `deltan_max` double COMMENT '', 51 | `deltan_min` double COMMENT '') 52 | ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' 53 | STORED AS 54 | INPUTFORMAT "parquet.hive.DeprecatedParquetInputFormat" 55 | OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat" 56 | LOCATION 's3a://strata-2016-asavu-tutorial/strata-2016-tables/experience'; 57 | 58 | CREATE EXTERNAL TABLE `players`( 59 | `exp` int COMMENT '', 60 | `name` string COMMENT '', 61 | `year` int COMMENT '', 62 | `age` int COMMENT '', 63 | `position` string COMMENT '', 64 | `team` string COMMENT '', 65 | `gp` int COMMENT '', 66 | `gs` int COMMENT '', 67 | `mp` double COMMENT '', 68 | `fg` double COMMENT '', 69 | `fga` double COMMENT '', 70 | `fgp` double COMMENT '', 71 | `3p` double COMMENT '', 72 | `3pa` double COMMENT '', 73 | `3pp` double COMMENT '', 74 | `2p` double COMMENT '', 75 | `2pa` double COMMENT '', 76 | `2pp` double COMMENT '', 77 | `efg` double COMMENT '', 78 | `ft` double COMMENT '', 79 | `fta` double COMMENT '', 80 | `ftp` double COMMENT '', 81 | `orb` double COMMENT '', 82 | `drb` double COMMENT '', 83 | `trb` double COMMENT '', 84 | `ast` double COMMENT '', 85 | `stl` double COMMENT '', 86 | `blk` double COMMENT '', 87 | `tov` double COMMENT '', 88 | `pf` double COMMENT '', 89 | `pts` double COMMENT '', 90 | `zfg` double COMMENT '', 91 | `zft` double COMMENT '', 92 | `z3p` double COMMENT '', 93 | `ztrb` double COMMENT '', 94 | `zast` double COMMENT '', 95 | `zstl` double COMMENT '', 96 | `zblk` double COMMENT '', 97 | `ztov` double COMMENT '', 98 | `zpts` double COMMENT '', 99 | `ztot` double COMMENT '', 100 | `nfg` double COMMENT '', 101 | `nft` double COMMENT '', 102 | `n3p` double COMMENT '', 103 | `ntrb` double COMMENT '', 104 | `nast` double COMMENT '', 105 | `nstl` double COMMENT '', 106 | `nblk` double COMMENT '', 107 | `ntov` double COMMENT '', 108 | `npts` double COMMENT '', 109 | `ntot` double COMMENT '') 110 | ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' 111 | STORED AS 112 | INPUTFORMAT "parquet.hive.DeprecatedParquetInputFormat" 113 | 
OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat" 114 | LOCATION 's3a://strata-2016-asavu-tutorial/strata-2016-tables/players'; 115 | -------------------------------------------------------------------------------- /spark.conf: -------------------------------------------------------------------------------- 1 | # 2 | # (c) Copyright 2015 Cloudera, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include file("common.conf") 18 | 19 | # 20 | # Cluster name 21 | # 22 | 23 | name: C5-Spark-Ingest 24 | 25 | # 26 | # Cloud provider configuration (credentials, region or zone and optional default image) 27 | # 28 | 29 | provider { 30 | type: aws 31 | 32 | region: ${network.region} 33 | subnetId: ${network.subnetId} 34 | securityGroupsIds: ${network.securityGroupsIds} 35 | 36 | instanceNamePrefix: strata-2016-spark 37 | 38 | rootVolumeSizeGB: 128 # matching the size of the pre-extracted AMI 39 | rootVolumeType: gp2 # OR standard (for EBS magnetic) 40 | 41 | iamProfileName: ${iamProfileName} 42 | associatePublicIpAddresses: false 43 | } 44 | 45 | # 46 | # SSH credentials to use to connect to the instances 47 | # 48 | 49 | ssh { 50 | username: ec2-user # for RHEL image 51 | privateKey: ${secrets.privateKey} 52 | } 53 | 54 | # 55 | # A list of instance types to use for group of nodes or management services 56 | # 57 | 58 | instances { 59 | m4x { 60 | type: m4.xlarge 61 | image: ${image} 62 | tags: ${tags} 63 | } 64 | } 65 | 66 | # 67 | # Configuration for Cloudera Manager. Cloudera Director can use an existing instance 68 | # or bootstrap everything from scratch for a new cluster 69 | # 70 | 71 | cloudera-manager { 72 | instance: ${instances.m4x} { 73 | instanceNamePrefix: strata-2016-cm 74 | } 75 | 76 | configs { 77 | CLOUDERA_MANAGER { 78 | custom_banner_html: "Managed by Cloudera Director" 79 | } 80 | } 81 | 82 | repository: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/5.8.2/" 83 | repositoryKeyUrl: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/RPM-GPG-KEY-cloudera" 84 | 85 | # 86 | # Automatically activate 60-Day Cloudera Enterprise Trial 87 | # 88 | 89 | enableEnterpriseTrial: true 90 | } 91 | 92 | # 93 | # Cluster description 94 | # 95 | 96 | cluster { 97 | 98 | products { 99 | CDH: 5.8 100 | } 101 | 102 | parcelRepositories: ["http://archive.cloudera.com/cdh5/parcels/5.8.2/"] 103 | 104 | services: [HDFS, YARN, HIVE, SPARK_ON_YARN] 105 | 106 | masters { 107 | count: 1 108 | 109 | instance: ${instances.m4x} 110 | 111 | roles { 112 | HDFS: [NAMENODE, SECONDARYNAMENODE] 113 | YARN: [RESOURCEMANAGER, JOBHISTORY] 114 | SPARK_ON_YARN: [SPARK_YARN_HISTORY_SERVER] 115 | HIVE: [HIVESERVER2, HIVEMETASTORE] 116 | } 117 | } 118 | 119 | workers { 120 | count: 1 121 | 122 | instance: ${instances.m4x} 123 | 124 | roles { 125 | HDFS: [DATANODE] 126 | YARN: [NODEMANAGER] 127 | } 128 | } 129 | } 130 | --------------------------------------------------------------------------------