├── .gitignore ├── LICENSE.txt ├── README ├── common.conf.sample ├── copy_to_s3.hql ├── impala.conf ├── impala.txt ├── ingest.scala ├── schema.hql └── spark.conf /.gitignore: -------------------------------------------------------------------------------- 1 | *.h2.db 2 | common.conf 3 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | 2 | Strata 2016 NYC - Hadoop in the Cloud Tutorial 3 | ============================================== 4 | 5 | Introduction 6 | ------------ 7 | 8 | This repository contains everything needed to run through a two step pipeline 9 | that will ingest data via Spark and make it available for interactive queries 10 | via Impala. 
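At a glance, a full pass looks roughly like the sketch below. Each command is
covered in detail in the sections that follow; the terminate-remote call is not
part of the tutorial steps and is shown only as the natural teardown for a
transient cluster, assuming your Cloudera Director client version supports it:

    $ cloudera-director bootstrap-remote spark.conf --lp.remote.username=admin
    # ... run ingest.scala and copy the tables to S3 (see "Data ingest via Spark") ...
    $ cloudera-director terminate-remote spark.conf --lp.remote.username=admin
    $ cloudera-director bootstrap-remote impala.conf --lp.remote.username=admin
    # ... the Impala cluster stays up for interactive queries (see below) ...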
11 | 
12 | The Spark step is expected to be a transient cluster running only for a couple
13 | of hours each day (potentially triggered via cron).
14 | 
15 | The Impala cluster is expected to be long-running and elastic, serving multiple
16 | users via Hue or the JDBC interface.
17 | 
18 | Common Settings
19 | ---------------
20 | 
21 | Start by going through the AWS Quickstart flow:
22 | 
23 | https://aws.amazon.com/quickstart/
24 | http://docs.aws.amazon.com/quickstart/latest/cloudera/welcome.html
25 | 
26 | Relevant identifiers will be available as CloudFormation outputs.
27 | 
28 | Go to the AWS console and make note of the EC2 instance called ClusterLauncher. This is your Director server instance.
29 | 
30 | SSH to the created Director server instance.
31 | $ ssh ec2-user@<director-server-ip> -i <your-private-key.pem>
32 | 
33 | Download the conf files from this GitHub repo to your Director server instance.
34 | Modify common.conf.sample by providing details specific to your AWS account.
35 | Use the information from the CloudFormation output. Use the ClusterLauncher security group, and not the NAT security group.
36 | Alternatively, look at aws.sample.conf to see values that should go into common.conf.
37 | Save this file as common.conf.
38 | 
39 | Run validation for both configuration files to ensure everything is
40 | configured properly:
41 | 
42 | $ cloudera-director validate spark.conf
43 | $ cloudera-director validate impala.conf
44 | 
45 | Create a tunnel to Director from your local machine:
46 | 
47 | $ ssh -C -L 7189:localhost:7189 ec2-user@<director-server-ip>
48 | # Use your browser to go to http://localhost:7189/
49 | 
50 | Data ingest via Spark
51 | ---------------------
52 | 
53 | Ask Director to set up the Spark cluster for ETL:
54 | 
55 | $ cloudera-director bootstrap-remote spark.conf --lp.remote.username=admin
56 | # Director will ask for the admin password
57 | 
58 | Progress information is also available in the Director UI.
59 | 
60 | Establish a tunnel to Cloudera Manager:
61 | 
62 | $ ssh -i cloudera.pem -CN -L 7180:<cm-private-ip>:7180 ec2-user@<director-server-ip>
63 | 
64 | SSH into the master node and open the Spark shell:
65 | 
66 | $ sudo -u hdfs -i bash
67 | 
68 | $ curl -o ingest.scala https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/ingest.scala
69 | $ spark-shell -i ingest.scala
70 | 
71 | $ curl -o schema.hql https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/schema.hql
72 | $ curl -o copy_to_s3.hql https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/copy_to_s3.hql
73 | 
74 | Modify the schema.hql file to point to a new S3 bucket created via the AWS console.
75 | 
76 | $ hive -f schema.hql
77 | $ hive -f copy_to_s3.hql
78 | 
79 | SQL via Impala on ingested data
80 | -------------------------------
81 | 
82 | Ask Director to set up the Impala cluster for interactive queries:
83 | 
84 | $ cloudera-director bootstrap-remote impala.conf --lp.remote.username=admin
85 | # Director will ask for the admin password
86 | 
87 | Establish a tunnel to Cloudera Manager:
88 | 
89 | $ ssh -i cloudera.pem -CN -L 7180:<cm-private-ip>:7180 ec2-user@<director-server-ip>
90 | 
91 | SSH into the master node and register the schema in the Hive metastore (shared with Impala):
92 | 
93 | $ sudo -u hdfs -i bash
94 | 
95 | $ curl -o schema.hql https://raw.githubusercontent.com/cloudera/strata-tutorial-2016-nyc/master/schema.hql
96 | $ hive -f schema.hql
97 | 
98 | Start the impala-shell from a worker node. Identify the Impala worker node from the Cloudera Director UI. 
99 | $ impala-shell -i 100 | # Run some interesting queries reading from S3 101 | 102 | -------------------------------------------------------------------------------- /common.conf.sample: -------------------------------------------------------------------------------- 1 | # 2 | # (c) Copyright 2015 Cloudera, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | # 18 | # Common configurations for tutorial clusters 19 | # 20 | 21 | environmentName: Strata Demo Environment 22 | deploymentName: Strata Demo Deployment 23 | 24 | # RHEL 7.2 in us-west-2 25 | image: ami-775e4f16 26 | 27 | iamProfileName: REPLACE-ME 28 | 29 | network { 30 | region: us-west-2 31 | subnetId: subnet-REPLACE-ME 32 | securityGroupsIds: sg-REPLACE-ME 33 | } 34 | 35 | tags { 36 | owner: ${?USER} 37 | } 38 | 39 | secrets { 40 | privateKey: "/home/ec2-user/cloudera-aws-quickstart-09-25-2016.pem" 41 | } 42 | -------------------------------------------------------------------------------- /copy_to_s3.hql: -------------------------------------------------------------------------------- 1 | 2 | INSERT OVERWRITE TABLE age SELECT * FROM local_age; 3 | 4 | INSERT OVERWRITE TABLE experience SELECT * FROM local_experience; 5 | 6 | INSERT OVERWRITE TABLE players SELECT * FROM local_players; 7 | -------------------------------------------------------------------------------- /impala.conf: -------------------------------------------------------------------------------- 1 | # 2 | # (c) Copyright 2015 Cloudera, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | include file("common.conf") 18 | 19 | # 20 | # Cluster name 21 | # 22 | 23 | name: C5-Impala 24 | 25 | # 26 | # Cloud provider configuration (credentials, region or zone and optional default image) 27 | # 28 | 29 | provider { 30 | type: aws 31 | 32 | region: ${network.region} 33 | subnetId: ${network.subnetId} 34 | securityGroupsIds: ${network.securityGroupsIds} 35 | 36 | instanceNamePrefix: strata-2016-impala 37 | 38 | rootVolumeSizeGB: 128 # matching the size of the pre-extracted AMI 39 | rootVolumeType: gp2 # OR standard (for EBS magnetic) 40 | 41 | iamProfileName: ${iamProfileName} 42 | associatePublicIpAddresses: false 43 | } 44 | 45 | # 46 | # SSH credentials to use to connect to the instances 47 | # 48 | 49 | ssh { 50 | username: ec2-user # for RHEL image 51 | privateKey: ${secrets.privateKey} 52 | } 53 | 54 | # 55 | # A list of instance types to use for group of nodes or management services 56 | # 57 | 58 | instances { 59 | m4x { 60 | type: m4.xlarge 61 | image: ${image} 62 | tags: ${tags} 63 | } 64 | } 65 | 66 | # 67 | # Configuration for Cloudera Manager. Cloudera Director can use an existing instance 68 | # or bootstrap everything from scratch for a new cluster 69 | # 70 | 71 | cloudera-manager { 72 | instance: ${instances.m4x} { 73 | instanceNamePrefix: strata-2016-cm 74 | } 75 | 76 | configs { 77 | CLOUDERA_MANAGER { 78 | custom_banner_html: "Managed by Cloudera Director" 79 | } 80 | } 81 | 82 | repository: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/5.8.2/" 83 | repositoryKeyUrl: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/RPM-GPG-KEY-cloudera" 84 | 85 | # 86 | # Automatically activate 60-Day Cloudera Enterprise Trial 87 | # 88 | 89 | enableEnterpriseTrial: true 90 | } 91 | 92 | # 93 | # Cluster description 94 | # 95 | 96 | cluster { 97 | 98 | products { 99 | CDH: 5.8 100 | } 101 | 102 | parcelRepositories: ["http://archive.cloudera.com/cdh5/parcels/5.8.2/"] 103 | 104 | services: [HDFS, YARN, HIVE, IMPALA] 105 | 106 | masters { 107 | count: 1 108 | 109 | instance: ${instances.m4x} 110 | 111 | roles { 112 | HDFS: [NAMENODE, SECONDARYNAMENODE] 113 | YARN: [RESOURCEMANAGER, JOBHISTORY] 114 | HIVE: [HIVESERVER2, HIVEMETASTORE] 115 | IMPALA: [CATALOGSERVER, STATESTORE] 116 | } 117 | } 118 | 119 | workers { 120 | count: 1 121 | 122 | instance: ${instances.m4x} 123 | 124 | roles { 125 | HDFS: [DATANODE] 126 | YARN: [NODEMANAGER] 127 | IMPALA: [IMPALAD] 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /impala.txt: -------------------------------------------------------------------------------- 1 | Programmatic Data Analysis with Impala 2 | ====================================== 3 | 4 | Now that we have all the data, it’s time to start analyzing it with Impala. 5 | 6 | Our table has a lot of information in it—raw stats, z-scores, normalized 7 | z-scores—so we can immediately start asking and answering some good questions. 8 | 9 | Let’s check in on the MVP race for this year. 
10 | 
11 | > select name, zTot from Players where year=2016 order by zTot desc limit 10;
12 | 
13 | [Stephen Curry,19.766248304312754]
14 | [Kevin Durant,15.323017389251323]
15 | [Anthony Davis,13.186429940875069]
16 | [Kawhi Leonard,13.18181904233336]
17 | [James Harden,12.622408009920706]
18 | [Russell Westbrook,12.26014043592826]
19 | [Kyle Lowry,11.634357073733122]
20 | [Paul Millsap,11.28903998833887]
21 | [Chris Paul,10.843486407063033]
22 | [Jimmy Butler,10.475908301410975]
23 | 
24 | 
25 | Basketball fans should not be surprised to see Stephen Curry topping the list,
26 | as he's having a historic year to date. The rest of the list contains players
27 | who are also having great years, so this is a good sanity check that our
28 | z-score is a useful statistic. Let's check out the normalized scores as well to
29 | see if they tell a different story:
30 | 
31 | 
32 | > select name, nTot from Players where year=2016 order by nTot desc limit 10;
33 | 
34 | [Stephen Curry,3.911865399387443]
35 | [Kevin Durant,2.8729484855957916]
36 | [Kawhi Leonard,2.7288580160780636]
37 | [Anthony Davis,2.599364621997217]
38 | [Russell Westbrook,2.4555072670169955]
39 | [Paul Millsap,2.3037685595490816]
40 | [LeBron James,2.1939606667616527]
41 | [Kyle Lowry,2.151090172022115]
42 | [Chris Paul,2.1331243771597586]
43 | [James Harden,2.0788749389912686]
44 | 
45 | 
46 | We see similar players in both lists, but the ordering is slightly different.
47 | Anthony Davis and Kawhi Leonard are neck and neck in z-score, but Leonard has a
48 | slight edge in normalized z-score, which suggests that he contributes more
49 | significantly across a wider spectrum of stats than Davis does, or that Davis's
50 | value is concentrated in the few stats he dominates. Curry has a commanding lead in
51 | both zTot and nTot—29% ahead of the second-place player in z-score and 39% in
52 | normalized z-score. Let's have a look at Curry's full numbers from this year.
53 | 
54 | 
55 | > select * from Players where year=2016 and name='Stephen Curry';
56 | 
57 | [6,Stephen Curry,2016,27,PG,GSW,42,42,33.9,10.0,19.5,0.51,4.9,10.8,0.451,5.1,8.7,0.583,0.635,5.3,5.9,0.911,0.8,4.6,5.4,6.6,2.1,0.1,3.4,2.0,30.1,3.2822803060371077,3.4693236141930193,6.056007802613955,0.7550168635135219,2.695650422425284,3.076598009775487,-0.6534242946356899,-2.7596520748161213,3.8444476552061824,19.766248304312747,0.8113439563367003,0.42770109585695865,1.0,0.15840862722238425,0.4898747874478281,0.7843137254901962,-0.08887924083607253,-0.6708975521305531,1.0,3.911865399387443]
58 | 
59 | We see that his largest contributor to zTot is z3P at 6.05, meaning his three-point
60 | game is contributing a lot to his value. Indeed, he is the league leader in this
61 | regard.
62 | 
63 | 
64 | > select name, 3p, z3p from Players where year=2016 order by z3p desc limit 10;
65 | 
66 | [Stephen Curry,4.9,6.056007802613957]
67 | [Klay Thompson,3.2,3.6363158210435227]
68 | [Damian Lillard,3.1,3.493980998598203]
69 | [Paul George,2.9,3.2093113537075637]
70 | [Kyle Lowry,2.7,2.9246417088169245]
71 | [J.J. Redick,2.7,2.9246417088169245]
72 | [James Harden,2.7,2.9246417088169245]
73 | [Eric Gordon,2.5,2.6399720639262854]
74 | [Wesley Matthews,2.5,2.6399720639262854]
75 | [C.J. McCollum,2.5,2.6399720639262854]
76 | 
77 | Curry has nearly double the value of the second-place player, teammate Klay
78 | Thompson. At just shy of five three-pointers made a game, Curry is arguably
79 | having one of the best shooting seasons of all time. Or is he? 
Z-scores can
80 | help us here by telling us how much better a player is relative to the rest
81 | of the league. We'll run two queries to demonstrate:
82 | 
83 | 
84 | > select name, 3p, z3p from Players order by 3p desc limit 10;
85 | 
86 | [Stephen Curry,4.9,6.056007802613957]
87 | [Stephen Curry,3.6,4.395853442084636]
88 | [Stephen Curry,3.5,4.372814802572277]
89 | [Ray Allen,3.4,4.735522316886375]
90 | [Ray Allen,3.3,4.863432467545102]
91 | [Dennis Scott,3.3,4.229610850304963]
92 | [George McCloud,3.3,4.229610850304963]
93 | [Stephen Curry,3.3,3.8893664999911772]
94 | [Klay Thompson,3.2,3.6363158210435227]
95 | [Klay Thompson,3.1,3.651613974257506]
96 | 
97 | 
98 | > select name, 3p, z3p from Players order by z3p desc limit 10;
99 | 
100 | [Joe Hassett,1.3,10.220836345844305]
101 | [Mike Dunleavy,1.1,8.683932683360899]
102 | [Darrell Griffith,1.1,8.683932683360899]
103 | [Mike Dunleavy,0.8,7.496775843134757]
104 | [Joe Hassett,1.0,7.456044031025608]
105 | [Brian Taylor,1.2,6.94629576951599]
106 | [Darrell Griffith,1.2,6.9433935103235775]
107 | [Michael Adams,2.5,6.679409664703833]
108 | [Don Buse,0.9,6.671197290917649]
109 | [Danny Ainge,1.8,6.282718341161911]
110 | 
111 | The first query shows that Curry makes, on average, more three-pointers per game
112 | than anyone in the history of the NBA. The second query shows that, relative to
113 | the rest of the league that year, Joe Hassett's 1981 season was the best
114 | three-point shooting season in NBA history. Curry doesn't even rank in the top 10. (He
115 | barely misses it, coming in at 12th.) Although Hassett only made 1.3 three-pointers
116 | a game in 1981, the three-point shot was new and few players were
117 | taking it. Three-pointers are common in today's game, to the point where there
118 | is talk of moving the line back. The proof is always in the numbers (which
119 | you should be able to compute yourself now):
120 | 
121 | In 1981, Hassett's 1.3 three-pointers a game was such a commanding margin
122 | over the league average of .04 that it produces a higher z-score than Curry's
123 | 4.9 in 2016. This example illustrates why it's important to keep in mind that
124 | z-scores measure relative value within a year: we're not claiming that Hassett
125 | is a better three-point shooter than Stephen Curry, just that his performance
126 | in 1981 was a larger outlier than Curry's in 2016. 
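If you want to check those league numbers yourself, a quick sketch against the
same players table (reusing the column names from the queries above; stddev is
Impala's sample standard deviation aggregate) is:

> select year, avg(3p) as league_avg, stddev(3p) as league_stdev
  from Players where year in (1981, 2016) group by year;

Plugging the aggregates into z = (player_3p - league_avg) / league_stdev shows
the effect directly: Hassett's 1.3 sits many more standard deviations above the
1981 average of .04 than Curry's 4.9 sits above the 2016 average.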
127 | 
128 | 
129 | This is based on Jordan Volz's excellent Basketball Stats tutorial for Spark:
130 | http://blog.cloudera.com/blog/2016/06/how-to-analyze-fantasy-sports-using-apache-spark-and-sql/
131 | 
-------------------------------------------------------------------------------- /ingest.scala: --------------------------------------------------------------------------------
1 | 
2 | //********************
3 | // Configure S3 access
4 | //********************
5 | val hadoopConf = sc.hadoopConfiguration
6 | 
7 | // Add your keys if you are not using IAM roles
8 | // hadoopConf.set("fs.s3a.access.key", "REPLACE-ME")
9 | // hadoopConf.set("fs.s3a.secret.key", "REPLACE-ME")
10 | 
11 | //********************
12 | //Classes, Helper Functions + Variables
13 | //********************
14 | import org.apache.spark.util.StatCounter
15 | import org.apache.spark.sql.Row
16 | import org.apache.spark.sql.types._
17 | import org.apache.spark.mllib.linalg.{Vector, Vectors}
18 | import scala.collection.mutable.ListBuffer
19 | 
20 | //helper function to compute a normalized value
21 | def statNormalize(stat:Double, max:Double, min:Double)={
22 | val newmax=math.max(math.abs(max),math.abs(min))
23 | stat/newmax
24 | }
25 | 
26 | //Holds initial bball stats + weighted stats + normalized stats
27 | @serializable case class BballData (year: Int, name: String, position: String, age:Int,
28 | team: String, gp: Int, gs: Int, mp: Double,stats: Array[Double], statsZ:Array[Double]=Array[Double](),
29 | valueZ:Double=0,statsN:Array[Double]=Array[Double](),valueN:Double=0,experience:Double=0)
30 | 
31 | //parse a stat line into a BballData object
32 | def bbParse(input: String,bStats: scala.collection.Map[String,Double]=Map.empty,zStats: scala.collection.Map[String,Double]=Map.empty)={
33 | val line=input.replace(",,",",0,")
34 | val pieces=line.substring(1,line.length-1).split(",")
35 | val year=pieces(0).toInt
36 | val name=pieces(2)
37 | val position=pieces(3)
38 | val age=pieces(4).toInt
39 | val team=pieces(5)
40 | val gp=pieces(6).toInt
41 | val gs=pieces(7).toInt
42 | val mp=pieces(8).toDouble
43 | val stats=pieces.slice(9,31).map(x=>x.toDouble)
44 | var statsZ:Array[Double]=Array.empty
45 | var valueZ:Double=Double.NaN
46 | var statsN:Array[Double]=Array.empty
47 | var valueN:Double=Double.NaN
48 | 
49 | if (!bStats.isEmpty){
50 | val fg=(stats(2)-bStats.apply(year.toString+"_FG%_avg"))*stats(1)
51 | val tp=(stats(3)-bStats.apply(year.toString+"_3P_avg"))/bStats.apply(year.toString+"_3P_stdev")
52 | val ft=(stats(12)-bStats.apply(year.toString+"_FT%_avg"))*stats(11)
53 | val trb=(stats(15)-bStats.apply(year.toString+"_TRB_avg"))/bStats.apply(year.toString+"_TRB_stdev")
54 | val ast=(stats(16)-bStats.apply(year.toString+"_AST_avg"))/bStats.apply(year.toString+"_AST_stdev")
55 | val stl=(stats(17)-bStats.apply(year.toString+"_STL_avg"))/bStats.apply(year.toString+"_STL_stdev")
56 | val blk=(stats(18)-bStats.apply(year.toString+"_BLK_avg"))/bStats.apply(year.toString+"_BLK_stdev")
57 | val tov=(stats(19)-bStats.apply(year.toString+"_TOV_avg"))/bStats.apply(year.toString+"_TOV_stdev")*(-1)
58 | val pts=(stats(21)-bStats.apply(year.toString+"_PTS_avg"))/bStats.apply(year.toString+"_PTS_stdev")
59 | statsZ=Array(fg,ft,tp,trb,ast,stl,blk,tov,pts)
60 | valueZ = statsZ.reduce(_+_)
61 | 
62 | if (!zStats.isEmpty){
63 | val zfg=(fg-zStats.apply(year.toString+"_FG_avg"))/zStats.apply(year.toString+"_FG_stdev")
64 | val zft=(ft-zStats.apply(year.toString+"_FT_avg"))/zStats.apply(year.toString+"_FT_stdev")
65 | val 
fgN=statNormalize(zfg,(zStats.apply(year.toString+"_FG_max")-zStats.apply(year.toString+"_FG_avg"))/zStats.apply(year.toString+"_FG_stdev"),(zStats.apply(year.toString+"_FG_min")-zStats.apply(year.toString+"_FG_avg"))/zStats.apply(year.toString+"_FG_stdev")) 66 | val ftN=statNormalize(zft,(zStats.apply(year.toString+"_FT_max")-zStats.apply(year.toString+"_FT_avg"))/zStats.apply(year.toString+"_FT_stdev"),(zStats.apply(year.toString+"_FT_min")-zStats.apply(year.toString+"_FT_avg"))/zStats.apply(year.toString+"_FT_stdev")) 67 | val tpN=statNormalize(tp,zStats.apply(year.toString+"_3P_max"),zStats.apply(year.toString+"_3P_min")) 68 | val trbN=statNormalize(trb,zStats.apply(year.toString+"_TRB_max"),zStats.apply(year.toString+"_TRB_min")) 69 | val astN=statNormalize(ast,zStats.apply(year.toString+"_AST_max"),zStats.apply(year.toString+"_AST_min")) 70 | val stlN=statNormalize(stl,zStats.apply(year.toString+"_STL_max"),zStats.apply(year.toString+"_STL_min")) 71 | val blkN=statNormalize(blk,zStats.apply(year.toString+"_BLK_max"),zStats.apply(year.toString+"_BLK_min")) 72 | val tovN=statNormalize(tov,zStats.apply(year.toString+"_TOV_max"),zStats.apply(year.toString+"_TOV_min")) 73 | val ptsN=statNormalize(pts,zStats.apply(year.toString+"_PTS_max"),zStats.apply(year.toString+"_PTS_min")) 74 | statsZ=Array(zfg,zft,tp,trb,ast,stl,blk,tov,pts) 75 | valueZ = statsZ.reduce(_+_) 76 | statsN=Array(fgN,ftN,tpN,trbN,astN,stlN,blkN,tovN,ptsN) 77 | valueN=statsN.reduce(_+_) 78 | } 79 | } 80 | BballData(year, name, position, age, team, gp, gs, mp, stats,statsZ,valueZ,statsN,valueN) 81 | } 82 | 83 | //stat counter class -- need printStats method to print out the stats. Useful for transformations 84 | class BballStatCounter extends Serializable { 85 | val stats: StatCounter = new StatCounter() 86 | var missing: Long = 0 87 | 88 | def add(x: Double): BballStatCounter = { 89 | if (x.isNaN) { 90 | missing += 1 91 | } else { 92 | stats.merge(x) 93 | } 94 | this 95 | } 96 | 97 | def merge(other: BballStatCounter): BballStatCounter = { 98 | stats.merge(other.stats) 99 | missing += other.missing 100 | this 101 | } 102 | 103 | def printStats(delim: String): String= { 104 | stats.count + delim + stats.mean + delim + stats.stdev + delim + stats.max + delim + stats.min 105 | } 106 | 107 | override def toString: String = { 108 | "stats: " + stats.toString + " NaN: " + missing 109 | } 110 | } 111 | 112 | object BballStatCounter extends Serializable { 113 | def apply(x: Double) = new BballStatCounter().add(x) 114 | } 115 | 116 | //process raw data into zScores and nScores 117 | def processStats(stats0:org.apache.spark.rdd.RDD[String],txtStat:Array[String],bStats: scala.collection.Map[String,Double]=Map.empty,zStats: scala.collection.Map[String,Double]=Map.empty)={ 118 | //parse stats 119 | val stats1=stats0.map(x=>bbParse(x,bStats,zStats)) 120 | 121 | //group by year 122 | val stats2={if(bStats.isEmpty){ 123 | stats1.keyBy(x=>x.year).map(x=>(x._1,x._2.stats)).groupByKey() 124 | }else{ 125 | stats1.keyBy(x=>x.year).map(x=>(x._1,x._2.statsZ)).groupByKey() 126 | } 127 | } 128 | 129 | //map each stat to StatCounter 130 | val stats3=stats2.map{case (x,y)=>(x,y.map(a=>a.map(b=>BballStatCounter(b))))} 131 | 132 | //merge all stats together 133 | val stats4=stats3.map{case (x,y)=>(x,y.reduce((a,b)=>a.zip(b).map{ case (c,d)=>c.merge(d)}))} 134 | 135 | //combine stats with label and pull label out 136 | val stats5=stats4.map{case (x,y)=>(x,txtStat.zip(y))}.map{x=>(x._2.map{case (y,z)=>(x._1,y,z)})} 137 | 138 | //separate each stat onto 
its own line and print out the Stats to a String 139 | val stats6=stats5.flatMap(x=>x.map(y=>(y._1,y._2,y._3.printStats(",")))) 140 | 141 | //turn stat tuple into key-value pairs with corresponding agg stat 142 | val stats7=stats6.flatMap{case(a,b,c)=>{ 143 | val pieces=c.split(",") 144 | val count=pieces(0) 145 | val mean=pieces(1) 146 | val stdev=pieces(2) 147 | val max=pieces(3) 148 | val min=pieces(4) 149 | Array((a+"_"+b+"_"+"count",count.toDouble),(a+"_"+b+"_"+"avg",mean.toDouble),(a+"_"+b+"_"+"stdev",stdev.toDouble),(a+"_"+b+"_"+"max",max.toDouble),(a+"_"+b+"_"+"min",min.toDouble)) 150 | } 151 | } 152 | stats7 153 | } 154 | 155 | //process stats for age or experience 156 | def processStatsAgeOrExperience(stats0:org.apache.spark.rdd.RDD[(Int, Array[Double])], label:String)={ 157 | 158 | 159 | //group elements by age 160 | val stats1=stats0.groupByKey() 161 | 162 | //turn values into StatCounter objects 163 | val stats2=stats1.map{case(x,y)=>(x,y.map(z=>z.map(a=>BballStatCounter(a))))} 164 | 165 | //Reduce rows by merging StatCounter objects 166 | val stats3=stats2.map{case (x,y)=>(x,y.reduce((a,b)=>a.zip(b).map{case(c,d)=>c.merge(d)}))} 167 | 168 | //turn data into RDD[Row] object for dataframe 169 | val stats4=stats3.map(x=>Array(Array(x._1.toDouble),x._2.flatMap(y=>y.printStats(",").split(",")).map(y=>y.toDouble)).flatMap(y=>y)).map(x=>Row(x(0).toInt,x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8),x(9),x(10),x(11),x(12),x(13),x(14),x(15),x(16),x(17),x(18),x(19),x(20))) 170 | 171 | //create schema for age table 172 | val schema =StructType( 173 | StructField(label, IntegerType, true) :: 174 | StructField("valueZ_count", DoubleType, true) :: 175 | StructField("valueZ_mean", DoubleType, true) :: 176 | StructField("valueZ_stdev", DoubleType, true) :: 177 | StructField("valueZ_max", DoubleType, true) :: 178 | StructField("valueZ_min", DoubleType, true) :: 179 | StructField("valueN_count", DoubleType, true) :: 180 | StructField("valueN_mean", DoubleType, true) :: 181 | StructField("valueN_stdev", DoubleType, true) :: 182 | StructField("valueN_max", DoubleType, true) :: 183 | StructField("valueN_min", DoubleType, true) :: 184 | StructField("deltaZ_count", DoubleType, true) :: 185 | StructField("deltaZ_mean", DoubleType, true) :: 186 | StructField("deltaZ_stdev", DoubleType, true) :: 187 | StructField("deltaZ_max", DoubleType, true) :: 188 | StructField("deltaZ_min", DoubleType, true) :: 189 | StructField("deltaN_count", DoubleType, true) :: 190 | StructField("deltaN_mean", DoubleType, true) :: 191 | StructField("deltaN_stdev", DoubleType, true) :: 192 | StructField("deltaN_max", DoubleType, true) :: 193 | StructField("deltaN_min", DoubleType, true) :: Nil 194 | ) 195 | 196 | //create data frame 197 | sqlContext.createDataFrame(stats4,schema) 198 | } 199 | 200 | //******************** 201 | //Processing + Transformations 202 | //******************** 203 | 204 | 205 | //******************** 206 | //Compute Aggregate Stats Per Year 207 | //******************** 208 | 209 | //read in all stats 210 | // Old, read from HDFS: val stats=sc.textFile("/tmp/BasketballStatsWithYear/*/*").repartition(sc.defaultParallelism) 211 | val stats=sc.textFile("s3a://cloudera-cloud-demo/datasets/BasketballStatsWithYear/*/*").repartition(sc.defaultParallelism) 212 | 213 | //filter out junk rows, clean up data entry errors as well 214 | val filteredStats=stats.filter(x => !x.contains("FG%")).filter(x => x.contains(",")).map(x=>x.replace("*","").replace(",,",",0,")) 215 | filteredStats.cache() 216 | 217 | //process stats 
and save as map 218 | val txtStat=Array("FG","FGA","FG%","3P","3PA","3P%","2P","2PA","2P%","eFG%","FT","FTA","FT%","ORB","DRB","TRB","AST","STL","BLK","TOV","PF","PTS") 219 | val aggStats=processStats(filteredStats,txtStat).collectAsMap 220 | 221 | //collect rdd into map and broadcast 222 | val broadcastStats=sc.broadcast(aggStats) 223 | 224 | 225 | //******************** 226 | //Compute Z-Score Stats Per Year 227 | //******************** 228 | 229 | //parse stats, now tracking weights 230 | val txtStatZ=Array("FG","FT","3P","TRB","AST","STL","BLK","TOV","PTS") 231 | val zStats=processStats(filteredStats,txtStatZ,broadcastStats.value).collectAsMap 232 | 233 | //collect rdd into map and broadcast 234 | val zBroadcastStats=sc.broadcast(zStats) 235 | 236 | 237 | //******************** 238 | //Compute Normalized Stats Per Year 239 | //******************** 240 | 241 | //parse stats, now normalizing 242 | val nStats=filteredStats.map(x=>bbParse(x,broadcastStats.value,zBroadcastStats.value)) 243 | 244 | //map RDD to RDD[Row] so that we can turn it into a dataframe 245 | val nPlayer = nStats.map(x => Row.fromSeq(Array(x.name,x.year,x.age,x.position,x.team,x.gp,x.gs,x.mp) ++ x.stats ++ x.statsZ ++ Array(x.valueZ) ++ x.statsN ++ Array(x.valueN))) 246 | 247 | //create schema for the data frame 248 | val schemaN =StructType( 249 | StructField("name", StringType, true) :: 250 | StructField("year", IntegerType, true) :: 251 | StructField("age", IntegerType, true) :: 252 | StructField("position", StringType, true) :: 253 | StructField("team", StringType, true) :: 254 | StructField("gp", IntegerType, true) :: 255 | StructField("gs", IntegerType, true) :: 256 | StructField("mp", DoubleType, true) :: 257 | StructField("FG", DoubleType, true) :: 258 | StructField("FGA", DoubleType, true) :: 259 | StructField("FGP", DoubleType, true) :: 260 | StructField("3P", DoubleType, true) :: 261 | StructField("3PA", DoubleType, true) :: 262 | StructField("3PP", DoubleType, true) :: 263 | StructField("2P", DoubleType, true) :: 264 | StructField("2PA", DoubleType, true) :: 265 | StructField("2PP", DoubleType, true) :: 266 | StructField("eFG", DoubleType, true) :: 267 | StructField("FT", DoubleType, true) :: 268 | StructField("FTA", DoubleType, true) :: 269 | StructField("FTP", DoubleType, true) :: 270 | StructField("ORB", DoubleType, true) :: 271 | StructField("DRB", DoubleType, true) :: 272 | StructField("TRB", DoubleType, true) :: 273 | StructField("AST", DoubleType, true) :: 274 | StructField("STL", DoubleType, true) :: 275 | StructField("BLK", DoubleType, true) :: 276 | StructField("TOV", DoubleType, true) :: 277 | StructField("PF", DoubleType, true) :: 278 | StructField("PTS", DoubleType, true) :: 279 | StructField("zFG", DoubleType, true) :: 280 | StructField("zFT", DoubleType, true) :: 281 | StructField("z3P", DoubleType, true) :: 282 | StructField("zTRB", DoubleType, true) :: 283 | StructField("zAST", DoubleType, true) :: 284 | StructField("zSTL", DoubleType, true) :: 285 | StructField("zBLK", DoubleType, true) :: 286 | StructField("zTOV", DoubleType, true) :: 287 | StructField("zPTS", DoubleType, true) :: 288 | StructField("zTOT", DoubleType, true) :: 289 | StructField("nFG", DoubleType, true) :: 290 | StructField("nFT", DoubleType, true) :: 291 | StructField("n3P", DoubleType, true) :: 292 | StructField("nTRB", DoubleType, true) :: 293 | StructField("nAST", DoubleType, true) :: 294 | StructField("nSTL", DoubleType, true) :: 295 | StructField("nBLK", DoubleType, true) :: 296 | StructField("nTOV", DoubleType, 
true) ::
297 | StructField("nPTS", DoubleType, true) ::
298 | StructField("nTOT", DoubleType, true) :: Nil
299 | )
300 | 
301 | //create data frame
302 | val dfPlayersT=sqlContext.createDataFrame(nPlayer,schemaN)
303 | 
304 | //save all stats as a temp table
305 | dfPlayersT.registerTempTable("tPlayers")
306 | 
307 | //calculate exp and zdiff, ndiff
308 | val dfPlayers=sqlContext.sql("select age-min_age as exp,tPlayers.* from tPlayers join (select name,min(age)as min_age from tPlayers group by name) as t1 on tPlayers.name=t1.name order by tPlayers.name, exp ")
309 | 
310 | //save as table
311 | dfPlayers.saveAsTable("Local_Players")
312 | //filteredStats.unpersist()
313 | 
314 | //********************
315 | //ANALYSIS
316 | //********************
317 | 
318 | 
319 | //group data by player name
320 | val pStats=dfPlayers.sort(dfPlayers("name"),dfPlayers("exp") asc).map(x=>(x.getString(1),(x.getDouble(50),x.getDouble(40),x.getInt(2),x.getInt(3),Array(x.getDouble(31),x.getDouble(32),x.getDouble(33),x.getDouble(34),x.getDouble(35),x.getDouble(36),x.getDouble(37),x.getDouble(38),x.getDouble(39)),x.getInt(0)))).groupByKey()
321 | pStats.cache
322 | 
323 | //for each player, go through all the years and calculate the change in valueZ and valueN, save into two lists
324 | //one for age, one for experience
325 | //exclude players who played in 1980 from experience, as we only have partial data for them
326 | val excludeNames=dfPlayers.filter(dfPlayers("year")===1980).select(dfPlayers("name")).map(x=>x.mkString).toArray.mkString(",")
327 | 
328 | val pStats1=pStats.map{ case(name,stats) =>
329 | var last = 0
330 | var deltaZ = 0.0
331 | var deltaN = 0.0
332 | var valueZ = 0.0
333 | var valueN = 0.0
334 | var exp = 0
335 | val aList = ListBuffer[(Int,Array[Double])]()
336 | val eList = ListBuffer[(Int,Array[Double])]()
337 | stats.foreach( z => {
338 | if (last>0){
339 | deltaN = z._1 - valueN
340 | deltaZ = z._2 - valueZ
341 | }else{
342 | deltaN = Double.NaN
343 | deltaZ = Double.NaN
344 | }
345 | valueN = z._1
346 | valueZ = z._2
347 | last = z._4
348 | aList += ((last, Array(valueZ,valueN,deltaZ,deltaN)))
349 | if (!excludeNames.contains(name)){
350 | exp = z._6
351 | eList += ((exp, Array(valueZ,valueN,deltaZ,deltaN)))
352 | }
353 | })
354 | (aList,eList)
355 | }
356 | 
357 | pStats1.cache
358 | 
359 | 
360 | //********************
361 | //compute age stats
362 | //********************
363 | 
364 | //extract out the age list
365 | val pStats2=pStats1.flatMap{case(x,y)=>x}
366 | 
367 | //create age data frame
368 | val dfAge=processStatsAgeOrExperience(pStats2, "age")
369 | 
370 | //save as table
371 | dfAge.saveAsTable("Local_Age")
372 | 
373 | //extract out the experience list
374 | val pStats3=pStats1.flatMap{case(x,y)=>y}
375 | 
376 | //create experience dataframe
377 | val dfExperience=processStatsAgeOrExperience(pStats3,"Experience")
378 | 
379 | //save as table
380 | dfExperience.saveAsTable("Local_Experience")
381 | 
382 | pStats1.unpersist()
383 | 
-------------------------------------------------------------------------------- /schema.hql: --------------------------------------------------------------------------------
1 | 
2 | CREATE EXTERNAL TABLE `age`(
3 | `age` int COMMENT '',
4 | `valuez_count` double COMMENT '',
5 | `valuez_mean` double COMMENT '',
6 | `valuez_stdev` double COMMENT '',
7 | `valuez_max` double COMMENT '',
8 | `valuez_min` double COMMENT '',
9 | `valuen_count` double COMMENT '',
10 | `valuen_mean` double COMMENT '',
11 | `valuen_stdev` double COMMENT '',
12 | `valuen_max` double 
COMMENT '', 13 | `valuen_min` double COMMENT '', 14 | `deltaz_count` double COMMENT '', 15 | `deltaz_mean` double COMMENT '', 16 | `deltaz_stdev` double COMMENT '', 17 | `deltaz_max` double COMMENT '', 18 | `deltaz_min` double COMMENT '', 19 | `deltan_count` double COMMENT '', 20 | `deltan_mean` double COMMENT '', 21 | `deltan_stdev` double COMMENT '', 22 | `deltan_max` double COMMENT '', 23 | `deltan_min` double COMMENT '') 24 | ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' 25 | STORED AS 26 | INPUTFORMAT "parquet.hive.DeprecatedParquetInputFormat" 27 | OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat" 28 | LOCATION 's3a://strata-2016-asavu-tutorial/strata-2016-tables/age'; 29 | 30 | CREATE EXTERNAL TABLE `experience`( 31 | `experience` int COMMENT '', 32 | `valuez_count` double COMMENT '', 33 | `valuez_mean` double COMMENT '', 34 | `valuez_stdev` double COMMENT '', 35 | `valuez_max` double COMMENT '', 36 | `valuez_min` double COMMENT '', 37 | `valuen_count` double COMMENT '', 38 | `valuen_mean` double COMMENT '', 39 | `valuen_stdev` double COMMENT '', 40 | `valuen_max` double COMMENT '', 41 | `valuen_min` double COMMENT '', 42 | `deltaz_count` double COMMENT '', 43 | `deltaz_mean` double COMMENT '', 44 | `deltaz_stdev` double COMMENT '', 45 | `deltaz_max` double COMMENT '', 46 | `deltaz_min` double COMMENT '', 47 | `deltan_count` double COMMENT '', 48 | `deltan_mean` double COMMENT '', 49 | `deltan_stdev` double COMMENT '', 50 | `deltan_max` double COMMENT '', 51 | `deltan_min` double COMMENT '') 52 | ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' 53 | STORED AS 54 | INPUTFORMAT "parquet.hive.DeprecatedParquetInputFormat" 55 | OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat" 56 | LOCATION 's3a://strata-2016-asavu-tutorial/strata-2016-tables/experience'; 57 | 58 | CREATE EXTERNAL TABLE `players`( 59 | `exp` int COMMENT '', 60 | `name` string COMMENT '', 61 | `year` int COMMENT '', 62 | `age` int COMMENT '', 63 | `position` string COMMENT '', 64 | `team` string COMMENT '', 65 | `gp` int COMMENT '', 66 | `gs` int COMMENT '', 67 | `mp` double COMMENT '', 68 | `fg` double COMMENT '', 69 | `fga` double COMMENT '', 70 | `fgp` double COMMENT '', 71 | `3p` double COMMENT '', 72 | `3pa` double COMMENT '', 73 | `3pp` double COMMENT '', 74 | `2p` double COMMENT '', 75 | `2pa` double COMMENT '', 76 | `2pp` double COMMENT '', 77 | `efg` double COMMENT '', 78 | `ft` double COMMENT '', 79 | `fta` double COMMENT '', 80 | `ftp` double COMMENT '', 81 | `orb` double COMMENT '', 82 | `drb` double COMMENT '', 83 | `trb` double COMMENT '', 84 | `ast` double COMMENT '', 85 | `stl` double COMMENT '', 86 | `blk` double COMMENT '', 87 | `tov` double COMMENT '', 88 | `pf` double COMMENT '', 89 | `pts` double COMMENT '', 90 | `zfg` double COMMENT '', 91 | `zft` double COMMENT '', 92 | `z3p` double COMMENT '', 93 | `ztrb` double COMMENT '', 94 | `zast` double COMMENT '', 95 | `zstl` double COMMENT '', 96 | `zblk` double COMMENT '', 97 | `ztov` double COMMENT '', 98 | `zpts` double COMMENT '', 99 | `ztot` double COMMENT '', 100 | `nfg` double COMMENT '', 101 | `nft` double COMMENT '', 102 | `n3p` double COMMENT '', 103 | `ntrb` double COMMENT '', 104 | `nast` double COMMENT '', 105 | `nstl` double COMMENT '', 106 | `nblk` double COMMENT '', 107 | `ntov` double COMMENT '', 108 | `npts` double COMMENT '', 109 | `ntot` double COMMENT '') 110 | ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' 111 | STORED AS 112 | INPUTFORMAT "parquet.hive.DeprecatedParquetInputFormat" 113 | 
OUTPUTFORMAT "parquet.hive.DeprecatedParquetOutputFormat" 114 | LOCATION 's3a://strata-2016-asavu-tutorial/strata-2016-tables/players'; 115 | -------------------------------------------------------------------------------- /spark.conf: -------------------------------------------------------------------------------- 1 | # 2 | # (c) Copyright 2015 Cloudera, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | include file("common.conf") 18 | 19 | # 20 | # Cluster name 21 | # 22 | 23 | name: C5-Spark-Ingest 24 | 25 | # 26 | # Cloud provider configuration (credentials, region or zone and optional default image) 27 | # 28 | 29 | provider { 30 | type: aws 31 | 32 | region: ${network.region} 33 | subnetId: ${network.subnetId} 34 | securityGroupsIds: ${network.securityGroupsIds} 35 | 36 | instanceNamePrefix: strata-2016-spark 37 | 38 | rootVolumeSizeGB: 128 # matching the size of the pre-extracted AMI 39 | rootVolumeType: gp2 # OR standard (for EBS magnetic) 40 | 41 | iamProfileName: ${iamProfileName} 42 | associatePublicIpAddresses: false 43 | } 44 | 45 | # 46 | # SSH credentials to use to connect to the instances 47 | # 48 | 49 | ssh { 50 | username: ec2-user # for RHEL image 51 | privateKey: ${secrets.privateKey} 52 | } 53 | 54 | # 55 | # A list of instance types to use for group of nodes or management services 56 | # 57 | 58 | instances { 59 | m4x { 60 | type: m4.xlarge 61 | image: ${image} 62 | tags: ${tags} 63 | } 64 | } 65 | 66 | # 67 | # Configuration for Cloudera Manager. Cloudera Director can use an existing instance 68 | # or bootstrap everything from scratch for a new cluster 69 | # 70 | 71 | cloudera-manager { 72 | instance: ${instances.m4x} { 73 | instanceNamePrefix: strata-2016-cm 74 | } 75 | 76 | configs { 77 | CLOUDERA_MANAGER { 78 | custom_banner_html: "Managed by Cloudera Director" 79 | } 80 | } 81 | 82 | repository: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/5.8.2/" 83 | repositoryKeyUrl: "http://archive.cloudera.com/cm5/redhat/7/x86_64/cm/RPM-GPG-KEY-cloudera" 84 | 85 | # 86 | # Automatically activate 60-Day Cloudera Enterprise Trial 87 | # 88 | 89 | enableEnterpriseTrial: true 90 | } 91 | 92 | # 93 | # Cluster description 94 | # 95 | 96 | cluster { 97 | 98 | products { 99 | CDH: 5.8 100 | } 101 | 102 | parcelRepositories: ["http://archive.cloudera.com/cdh5/parcels/5.8.2/"] 103 | 104 | services: [HDFS, YARN, HIVE, SPARK_ON_YARN] 105 | 106 | masters { 107 | count: 1 108 | 109 | instance: ${instances.m4x} 110 | 111 | roles { 112 | HDFS: [NAMENODE, SECONDARYNAMENODE] 113 | YARN: [RESOURCEMANAGER, JOBHISTORY] 114 | SPARK_ON_YARN: [SPARK_YARN_HISTORY_SERVER] 115 | HIVE: [HIVESERVER2, HIVEMETASTORE] 116 | } 117 | } 118 | 119 | workers { 120 | count: 1 121 | 122 | instance: ${instances.m4x} 123 | 124 | roles { 125 | HDFS: [DATANODE] 126 | YARN: [NODEMANAGER] 127 | } 128 | } 129 | } 130 | --------------------------------------------------------------------------------