├── .gitattributes
├── .gitignore
├── README.txt
├── data
│   ├── sensordata.csv
│   ├── sensormaint.csv
│   └── sensorvendor.csv
├── dependency-reduced-pom.xml
├── pom.xml
├── scripts
│   ├── create_ext_table.hql
│   ├── create_join_view.hql
│   ├── create_maint_table.hql
│   └── create_pump_table.hql
└── src
    └── main
        └── scala
            └── examples
                ├── HBaseReadRowWriteStats.scala
                ├── HBaseReadWrite.scala
                └── HBaseSensorStream.scala
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 |
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 | /target/
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 |
2 | Create an HBase table to write to:
3 | Launch the HBase shell:
4 | $hbase shell
5 |
6 | create '/user/user01/sensor', {NAME=>'data'}, {NAME=>'alert'}, {NAME=>'stats'}
7 |
8 | Commands to run the labs:
9 |
10 | Step 1: Compile the project: select the project -> Run As -> Maven Install
11 |
12 | Step 2: Use scp to copy sparkstreamhbaseapp-1.0.jar to the MapR sandbox or cluster
13 |
14 | To run the streaming:
15 |
16 | Step 3: Start the streaming app:
17 |
18 | /opt/mapr/spark/spark-1.5.2/bin/spark-submit --driver-class-path `hbase classpath` --class examples.HBaseSensorStream sparkstreamhbaseapp-1.0.jar
19 |
20 | Step 4: Copy the streaming data file to the stream directory:
21 | cp sensordata.csv /user/user01/stream/.
22 |
23 | Step 5: You can scan the data written to the table; however, the double values are stored in binary and are not readable from the shell.
24 | Launch the HBase shell and scan the data and alert column families:
25 | $hbase shell
26 | scan '/user/user01/sensor', {COLUMNS=>['data'], LIMIT => 10}
27 | scan '/user/user01/sensor', {COLUMNS=>['alert'], LIMIT => 10 }
28 |
29 | Step 6: Launch one of the programs below to read data and calculate daily statistics.
30 | Calculate stats for one column:
31 | /opt/mapr/spark/spark-1.5.2/bin/spark-submit --driver-class-path `hbase classpath` --class examples.HBaseReadWrite sparkstreamhbaseapp-1.0.jar
32 | Calculate stats for the whole row:
33 | /opt/mapr/spark/spark-1.5.2/bin/spark-submit --driver-class-path `hbase classpath` --class examples.HBaseReadRowWriteStats sparkstreamhbaseapp-1.0.jar
34 |
35 | Launch the HBase shell and scan for the statistics:
36 | scan '/user/user01/sensor', {COLUMNS=>['stats']}
37 |
38 |
39 |
--------------------------------------------------------------------------------
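
Note on Step 5: the sensor values are written with Bytes.toBytes(Double), which is why the HBase shell shows them as raw 8-byte values. The sketch below is illustrative only (it is not part of the lab jar; the StatsReader name and the choice of the stats:psimax qualifier are assumptions) and shows how such values can be read back and decoded in Scala with the same TableInputFormat pattern used in HBaseReadWrite.scala:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical helper, not part of the lab: decode stats values stored as binary doubles.
object StatsReader {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StatsReader"))
    val conf = HBaseConfiguration.create()
    conf.set(TableInputFormat.INPUT_TABLE, "/user/user01/sensor")
    // restrict the scan to one stats qualifier so result.value returns that cell
    conf.set(TableInputFormat.SCAN_COLUMNS, "stats:psimax")
    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    // Bytes.toDouble undoes the Bytes.toBytes(Double) encoding used by the writers
    hBaseRDD.map { case (_, result) =>
      (Bytes.toString(result.getRow), Bytes.toDouble(result.value))
    }.take(5).foreach(println)
    sc.stop()
  }
}
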
/data/sensormaint.csv:
--------------------------------------------------------------------------------
1 | COHUTTA,3/15/11,J.Thomas,Install
2 | COHUTTA,2/20/12,J.Thomas,Inspection
3 | COHUTTA,1/13/13,J.Thomas,Inspection
4 | COHUTTA,6/15/13,J.Thomas,Tighten Mounts
5 | COHUTTA,2/27/14,J.Thomas,Inspection
6 | COHUTTA,3/6/14,E. Simmons,Adjust bearing alignment
7 | NANTAHALLA,3/15/11,J.Thomas,Install
8 | NANTAHALLA,2/19/12,J.Thomas,Inspection
9 | NANTAHALLA,1/12/13,J.Thomas,Inspection
10 | NANTAHALLA,6/14/13,J.Thomas,Tighten Mounts
11 | NANTAHALLA,2/26/14,J.Thomas,Inspection
12 | NANTAHALLA,3/3/14,E. Simmons,Adjust bearing alignment
13 | NANTAHALLA,3/13/14,E. Simmons,Shutdown Failure
14 | THERMALITO,9/26/09,W.Stevens,Install
15 | THERMALITO,11/22/09,W.Stevens,Tighten Mounts
16 | THERMALITO,6/10/10,T. LaBou,Inspection
17 | THERMALITO,1/7/11,T. LaBou,Inspection
18 | THERMALITO,9/26/11,W.Stevens,Inspection
19 | THERMALITO,10/2/11,T. LaBou,Bearing Seal
20 | THERMALITO,11/5/11,D.Pitre,Inspect
21 | THERMALITO,5/22/12,D.Pitre,Inspect
22 | THERMALITO,12/15/12,D.Pitre,Inspect
23 | THERMALITO,6/16/13,T. LaBou,Vane clearance adjust
24 | THERMALITO,7/11/13,W.Stevens,Inspect
25 | THERMALITO,2/5/14,D.Pitre,Inspect
26 | BUTTE,10/2/09,W.Stevens,Install
27 | BUTTE,10/5/09,W.Stevens,Inspect
28 | BUTTE,11/22/09,W.Stevens,Tighten Mounts
29 | BUTTE,6/10/10,T. LaBou,Inspect
30 | BUTTE,1/7/11,T. LaBou,Inspect
31 | BUTTE,9/26/11,W.Stevens,Inspect
32 | BUTTE,10/2/11,T. LaBou,Bearing Seal
33 | BUTTE,11/5/11,D.Pitre,Inspect
34 | BUTTE,5/22/12,D.Pitre,Inspect
35 | BUTTE,12/15/12,D.Pitre,Inspect
36 | BUTTE,6/16/13,T. LaBou,Vane clearance adjust
37 | BUTTE,7/11/13,W.Stevens,Inspect
38 | BUTTE,2/5/14,D.Pitre,Inspect
39 | CARGO,10/2/09,T. LaBou,Install
40 | CARGO,10/5/09,W.Stevens,Inspect
41 | CARGO,11/22/09,T. LaBou,Tighten Mounts
42 | CARGO,6/10/10,T. LaBou,Inspect
43 | CARGO,1/7/11,T. LaBou,Inspect
44 | CARGO,9/26/11,W.Stevens,Inspect
45 | CARGO,10/2/11,T. LaBou,Bearing Seal
46 | CARGO,11/5/11,D.Pitre,Inspect
47 | CARGO,5/22/12,D.Pitre,Inspect
48 | CARGO,12/15/12,D.Pitre,Inspect
49 | CARGO,6/18/13,T. LaBou,Vane clearance adjust
50 | CARGO,7/11/13,W.Stevens,Inspect
51 | CARGO,2/5/14,D.Pitre,Inspect
52 | CARGO,3/13/14,D.Pitre,Tighten Mounts
53 | LAGNAPPE,10/2/09,T. LaBou,Install
54 | LAGNAPPE,10/5/09,W.Stevens,Inspect
55 | LAGNAPPE,11/24/09,W.Stevens,Tighten Mounts
56 | LAGNAPPE,6/10/10,T. LaBou,Inspect
57 | LAGNAPPE,1/7/11,T. LaBou,Inspect
58 | LAGNAPPE,9/30/11,W.Stevens,Inspect
59 | LAGNAPPE,10/3/11,T. LaBou,Bearing Seal
60 | LAGNAPPE,11/5/11,D.Pitre,Inspect
61 | LAGNAPPE,5/22/12,D.Pitre,Inspect
62 | LAGNAPPE,12/15/12,W.Stevens,Inspect
63 | LAGNAPPE,6/18/13,T. LaBou,Vane clearance adjust
64 | LAGNAPPE,7/11/13,W.Stevens,Inspect
65 | LAGNAPPE,2/5/14,D.Pitre,Inspect
66 | LAGNAPPE,3/14/14,D.Pitre,Shutdown Main Feed Line Failure
67 | CHER,11/5/09,D.Pitre,Install
68 | CHER,11/23/09,W.Stevens,Inspect
69 | CHER,11/24/09,W.Stevens,Tighten Mounts
70 | CHER,6/10/10,T. LaBou,Inspect
71 | CHER,1/7/11,T. LaBou,Inspect
72 | CHER,9/30/11,W.Stevens,Inspect
73 | CHER,10/3/11,W.Stevens,Bearing Seal
74 | CHER,11/5/11,D.Pitre,Inspect
75 | CHER,5/22/12,W.Stevens,Inspect
76 | CHER,12/15/12,W.Stevens,Inspect
77 | CHER,6/22/13,T. LaBou,Vane clearance adjust
78 | CHER,7/11/13,W.Stevens,Inspect
79 | CHER,2/5/14,D.Pitre,Inspect
80 | CHER,3/15/14,T. LaBou,Tighten Mounts
81 | ANDOUILLE,11/25/09,W.Stevens,Install
82 | ANDOUILLE,11/27/09,W.Stevens,Inspect
83 | ANDOUILLE,12/2/09,W.Stevens,Tighten Mounts
84 | ANDOUILLE,6/10/10,T. LaBou,Inspect
85 | ANDOUILLE,1/7/11,T. LaBou,Inspect
86 | ANDOUILLE,9/30/11,W.Stevens,Inspect
87 | ANDOUILLE,10/7/11,D.Pitre,Bearing Seal
88 | ANDOUILLE,11/5/11,D.Pitre,Inspect
89 | ANDOUILLE,5/22/12,W.Stevens,Inspect
90 | ANDOUILLE,12/15/12,W.Stevens,Inspect
91 | ANDOUILLE,6/25/13,T. LaBou,Vane clearance adjust
92 | ANDOUILLE,7/11/13,W.Stevens,Inspect
93 | ANDOUILLE,2/5/14,D.Pitre,Inspect
94 | ANDOUILLE,3/12/14,T. LaBou,Tighten Mounts
95 | MOJO,6/20/11,N.Boudreau,Install
96 | MOJO,7/11/11,N.Boudreau,Tighten Mounts
97 | MOJO,1/22/13,N.Boudreau,Inspect
98 | MOJO,7/15/13,N.Boudreau,Inspect
99 | MOJO,12/15/13,N.Boudreau,Inspect
100 | MOJO,6/7/13,M.Dugas,Inspect
101 | MOJO,1/11/14,N.Boudreau,Inspect
102 | MOJO,3/14/14,M.Dugas,Inspect
103 | BBKING,7/15/11,M.Dugas,Install
104 | BBKING,7/11/11,N.Boudreau,Tighten Mounts
105 | BBKING,1/24/13,N.Boudreau,Inspect
106 | BBKING,7/22/13,N.Boudreau,Inspect
107 | BBKING,12/16/13,N.Boudreau,Inspect
108 | BBKING,12/18/13,M.Dugas,Shutdown Failure
109 | BBKING,12/19/13,M.Dugas,Replace Front Motor Bearing
110 | BBKING,12/20/13,N.Boudreau,Inspect
111 | BBKING,6/12/13,M.Dugas,Inspect
112 | BBKING,1/14/14,N.Boudreau,Inspect
113 | BBKING,3/14/14,M.Dugas,Inspect
114 |
--------------------------------------------------------------------------------
/data/sensorvendor.csv:
--------------------------------------------------------------------------------
1 | COHUTTA,HYDROPUMP,11/27/10,3/15/11,HYDROCAM,29.687276,-91.162492
2 | NANTAHALLA,HYDROPUMP,11/27/10,3/15/11,HYDROCAM,29.687128,-91.162499
3 | THERMALITO,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.687276,-91.162492
4 | BUTTE,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.686929,-91.1625
5 | CARGO,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683147,-91.145448
6 | LAGNAPPE,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683124,-91.145471
7 | CHER,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683167,-91.145427
8 | ANDOUILLE,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683187,-91.145408
9 | MOJO,HYDROPUMP,4/25/11,6/20/11,XYLO,29.66975,-91.13223
10 | BBKING,HYDROPUMP,4/25/11,6/20/11,XYLO,29.669723,-91.132278
11 |
--------------------------------------------------------------------------------
/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | mapr
5 | sparkdataframesapp
6 | sparkdataframesapp
7 | 1.0
8 | 2015
9 |
10 | src/main/scala
11 | src/test/scala
12 |
13 |
14 | net.alchim31.maven
15 | scala-maven-plugin
16 | 3.2.0
17 |
18 |
19 |
20 | compile
21 | testCompile
22 |
23 |
24 |
25 | -dependencyfile
26 | ${project.build.directory}/.scala_dependencies
27 |
28 |
29 |
30 |
31 |
32 |
33 | maven-surefire-plugin
34 | 2.18.1
35 |
36 | false
37 | true
38 |
39 | **/*Test.*
40 | **/*Suite.*
41 |
42 |
43 |
44 |
45 | maven-shade-plugin
46 | 2.3
47 |
48 |
49 | package
50 |
51 | shade
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 | scala-tools.org
61 | Scala-tools Maven2 Repository
62 | http://scala-tools.org/repo-releases
63 |
64 |
65 |
66 |
67 | false
68 |
69 | mapr-releases
70 | http://repository.mapr.com/maven/
71 |
72 |
73 |
74 |
75 | org.apache.spark
76 | spark-core_2.10
77 | 1.3.1
78 | provided
79 |
80 |
81 | chill_2.10
82 | com.twitter
83 |
84 |
85 | chill-java
86 | com.twitter
87 |
88 |
89 | hadoop-client
90 | org.apache.hadoop
91 |
92 |
93 | spark-network-common_2.10
94 | org.apache.spark
95 |
96 |
97 | spark-network-shuffle_2.10
98 | org.apache.spark
99 |
100 |
101 | jets3t
102 | net.java.dev.jets3t
103 |
104 |
105 | curator-recipes
106 | org.apache.curator
107 |
108 |
109 | javax.servlet
110 | org.eclipse.jetty.orbit
111 |
112 |
113 | commons-lang3
114 | org.apache.commons
115 |
116 |
117 | commons-math3
118 | org.apache.commons
119 |
120 |
121 | jsr305
122 | com.google.code.findbugs
123 |
124 |
125 | slf4j-api
126 | org.slf4j
127 |
128 |
129 | jul-to-slf4j
130 | org.slf4j
131 |
132 |
133 | jcl-over-slf4j
134 | org.slf4j
135 |
136 |
137 | log4j
138 | log4j
139 |
140 |
141 | slf4j-log4j12
142 | org.slf4j
143 |
144 |
145 | compress-lzf
146 | com.ning
147 |
148 |
149 | snappy-java
150 | org.xerial.snappy
151 |
152 |
153 | lz4
154 | net.jpountz.lz4
155 |
156 |
157 | RoaringBitmap
158 | org.roaringbitmap
159 |
160 |
161 | commons-net
162 | commons-net
163 |
164 |
165 | akka-remote_2.10
166 | org.spark-project.akka
167 |
168 |
169 | akka-slf4j_2.10
170 | org.spark-project.akka
171 |
172 |
173 | json4s-jackson_2.10
174 | org.json4s
175 |
176 |
177 | mesos
178 | org.apache.mesos
179 |
180 |
181 | netty-all
182 | io.netty
183 |
184 |
185 | stream
186 | com.clearspring.analytics
187 |
188 |
189 | metrics-core
190 | io.dropwizard.metrics
191 |
192 |
193 | metrics-jvm
194 | io.dropwizard.metrics
195 |
196 |
197 | metrics-json
198 | io.dropwizard.metrics
199 |
200 |
201 | metrics-graphite
202 | io.dropwizard.metrics
203 |
204 |
205 | jackson-databind
206 | com.fasterxml.jackson.core
207 |
208 |
209 | jackson-module-scala_2.10
210 | com.fasterxml.jackson.module
211 |
212 |
213 | ivy
214 | org.apache.ivy
215 |
216 |
217 | oro
218 | oro
219 |
220 |
221 | tachyon-client
222 | org.tachyonproject
223 |
224 |
225 | pyrolite
226 | org.spark-project
227 |
228 |
229 | py4j
230 | net.sf.py4j
231 |
232 |
233 | unused
234 | org.spark-project.spark
235 |
236 |
237 |
238 |
239 | org.apache.spark
240 | spark-sql_2.10
241 | 1.3.1
242 | provided
243 |
244 |
245 | spark-catalyst_2.10
246 | org.apache.spark
247 |
248 |
249 | parquet-column
250 | com.twitter
251 |
252 |
253 | parquet-hadoop
254 | com.twitter
255 |
256 |
257 | jodd-core
258 | org.jodd
259 |
260 |
261 | jackson-databind
262 | com.fasterxml.jackson.core
263 |
264 |
265 | unused
266 | org.spark-project.spark
267 |
268 |
269 |
270 |
271 | junit
272 | junit
273 | 4.11
274 | test
275 |
276 |
277 | hamcrest-core
278 | org.hamcrest
279 |
280 |
281 |
282 |
283 | org.specs2
284 | specs2_2.10
285 | 1.13
286 | test
287 |
288 |
289 | scalaz-core_2.10
290 | org.specs2
291 |
292 |
293 | scalaz-concurrent_2.10
294 | org.specs2
295 |
296 |
297 |
298 |
299 | org.scalatest
300 | scalatest_2.10
301 | 2.0.M6-SNAP8
302 | test
303 |
304 |
305 |
306 | 2.10.4
307 | UTF-8
308 | 1.7
309 | 1.3.1
310 | 2.10
311 | 1.7
312 |
313 |
314 |
315 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | mapr
5 | sparkstreamhbaseapp
6 | 1.0
7 | sparkstreamhbaseapp
8 |
9 | 2015
10 |
11 |
12 | 1.7
13 | 1.7
14 | UTF-8
15 | 2.10
16 | 2.10.4
17 | 1.5.2
18 | 1.1.1-mapr-1602-m7-5.1.0
19 |
20 |
21 |
22 |
23 | scala-tools.org
24 | Scala-tools Maven2 Repository
25 | http://scala-tools.org/repo-releases
26 |
27 |
28 |
29 | mapr-releases
30 | http://repository.mapr.com/maven/
31 |
32 | false
33 |
34 |
35 | true
36 |
37 |
38 |
39 | maven2-repository.dev.java.net
40 | Java.net Repository for Maven
41 | http://download.java.net/maven/2/
42 | default
43 |
44 |
45 |
46 |
47 | org.scala-lang
48 | scala-library
49 | ${scala.version}
50 |
51 |
52 | org.apache.spark
53 | spark-core_${scala.tools.version}
54 | ${spark.version}
55 | provided
56 |
57 |
58 | org.apache.spark
59 | spark-sql_${scala.tools.version}
60 | ${spark.version}
61 |
62 |
63 | org.apache.spark
64 | spark-streaming_${scala.tools.version}
65 | ${spark.version}
66 |
67 |
68 | org.apache.hbase
69 | hbase-server
70 | ${mapr.version}
71 |
72 |
73 |
74 |
75 | src/main/scala
76 | src/test/scala
77 |
78 |
79 | org.scala-tools
80 | maven-scala-plugin
81 | 2.15.2
82 |
83 |
84 |
85 | compile
86 |
87 |
88 |
89 | -dependencyfile
90 | ${project.build.directory}/.scala_dependencies
91 |
92 |
93 |
94 |
95 |
96 |
97 | org.apache.maven.plugins
98 | maven-compiler-plugin
99 | 2.3.1
100 |
101 | 1.7
102 | 1.7
103 | true
104 | true
105 |
106 |
107 |
108 | org.apache.maven.plugins
109 | maven-eclipse-plugin
110 | 2.8
111 |
112 |
113 |
114 |
115 |
--------------------------------------------------------------------------------
/scripts/create_ext_table.hql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE sensor
2 | (key STRING, resID STRING, date STRING,
3 | hz FLOAT,
4 | disp FLOAT,
5 | flo INT,
6 | sedPPM FLOAT,
7 | psi INT,
8 | chlPPM FLOAT)
9 | STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
10 | WITH SERDEPROPERTIES (
11 | "hbase.columns.mapping" =
12 | ":key,cf1:resID,cf1:date,cf1:hz,cf1:disp,
13 | cf1:flo,cf1:sedPPM,cf1:psi,cf1:chlPPM"
14 | )
15 |
16 | TBLPROPERTIES("hbase.table.name" = "/user/user01/sensor");
17 |
--------------------------------------------------------------------------------
/scripts/create_join_view.hql:
--------------------------------------------------------------------------------
1 | create view pumpview as
2 | select s.date, s.hz, s.disp, s.flo, s.sedPPM, s.psi, s.chlPPM,
3 | p.resourceid, p.type, p.purchasedate, p.dateinservice, p.vendor, p.longitude, p.latitude
4 | from sensor s
5 | join pump_info p
6 | on (s.resid = p.resourceid);
7 |
--------------------------------------------------------------------------------
/scripts/create_maint_table.hql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE maint_table
2 |
3 | (resourceid STRING, eventDate STRING,
4 | technician STRING, description STRING)
5 |
6 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
7 |
8 |
9 | STORED AS TEXTFILE LOCATION "/user/user01/sensormaint.csv";
10 |
--------------------------------------------------------------------------------
/scripts/create_pump_table.hql:
--------------------------------------------------------------------------------
1 | CREATE EXTERNAL TABLE pump_info
2 |
3 | (resourceid STRING, type STRING, purchasedate STRING,
4 | dateinservice STRING, vendor STRING, longitude FLOAT, latitude FLOAT)
5 |
6 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ","
7 |
8 |
9 | STORED AS TEXTFILE LOCATION "/user/user01/sensorvendor.csv";
10 |
--------------------------------------------------------------------------------
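
The Hive scripts above define the sensor external table over HBase, the pump_info and maint_table external tables over the CSV files, and the pumpview join view. As a sketch only (assuming Hive support is available to Spark, for example via the spark-hive dependency, and that an existing SparkContext sc is in scope; neither is configured by the pom above), the joined view could then be queried from Spark like this:

import org.apache.spark.sql.hive.HiveContext

// query the pumpview view created by create_join_view.hql
val hiveContext = new HiveContext(sc)
hiveContext.sql("SELECT resourceid, vendor, psi FROM pumpview LIMIT 10").show()
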
/src/main/scala/examples/HBaseReadRowWriteStats.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * This example reads rows of time series sensor data,
3 | * calculates daily statistics for every sensor column,
4 | * and then writes these statistics to the stats column family.
5 | *
6 | * You can specify specific columns to return. More info:
7 | * http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
8 | */
9 |
10 | package examples
11 |
12 | import scala.reflect.runtime.universe
13 |
14 | import org.apache.hadoop.hbase.HBaseConfiguration
15 | import org.apache.hadoop.hbase.client.Put
16 | import org.apache.hadoop.hbase.client.Result
17 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
18 | import org.apache.hadoop.hbase.mapred.TableOutputFormat
19 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
20 | import org.apache.hadoop.hbase.util.Bytes
21 | import org.apache.hadoop.mapred.JobConf
22 | import org.apache.spark.SparkConf
23 | import org.apache.spark.SparkContext
24 | import org.apache.spark.rdd.PairRDDFunctions
25 | import org.apache.spark.sql.Row
26 | import org.apache.spark.sql.functions.avg
27 | import org.apache.hadoop.mapreduce.Job
28 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
29 | import org.apache.hadoop.fs.Path
30 |
31 | object HBaseReadRowWriteStats {
32 |
33 | case class SensorRow(rowkey: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double)
34 |
35 | object SensorRow extends Serializable{
36 | def parseSensorRow(result: Result): SensorRow = {
37 | val rowkey = Bytes.toString(result.getRow())
38 | // remove time from rowKey, stats row key is for day
39 | val p0 = rowkey.split(" ")(0)
40 | val p1 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("hz")))
41 | val p2 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("disp")))
42 | val p3 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("flo")))
43 | val p4 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("sedPPM")))
44 | val p5 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("psi")))
45 | val p6 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("chlPPM")))
46 | SensorRow(p0, p1, p2, p3, p4, p5, p6)
47 | }
48 | }
49 |
50 | case class SensorStatsRow(rowkey: String,
51 | maxhz: Double, minhz: Double, avghz: Double,
52 | maxdisp: Double, mindisp: Double, avgdisp: Double,
53 | maxflo: Double, minflo: Double, avgflo: Double,
54 | maxsedPPM: Double, minsedPPM: Double, avgsedPPM: Double,
55 | maxpsi: Double, minpsi: Double, avgpsi: Double,
56 | maxchlPPM: Double, minchlPPM: Double, avgchlPPM: Double)
57 |
58 | object SensorStatsRow {
59 | def convertToPutStats(row: SensorStatsRow): (ImmutableBytesWritable, Put) = {
60 | val p = new Put(Bytes.toBytes(row.rowkey))
61 | // add columns with data values to put
62 | p.add(cfStatsBytes, Bytes.toBytes("hzmax"), Bytes.toBytes(row.maxhz))
63 | p.add(cfStatsBytes, Bytes.toBytes("hzmin"), Bytes.toBytes(row.minhz))
64 | p.add(cfStatsBytes, Bytes.toBytes("hzavg"), Bytes.toBytes(row.avghz))
65 | p.add(cfStatsBytes, Bytes.toBytes("dispmax"), Bytes.toBytes(row.maxdisp))
66 | p.add(cfStatsBytes, Bytes.toBytes("dispmin"), Bytes.toBytes(row.mindisp))
67 | p.add(cfStatsBytes, Bytes.toBytes("dispavg"), Bytes.toBytes(row.avgdisp))
68 | p.add(cfStatsBytes, Bytes.toBytes("flomax"), Bytes.toBytes(row.maxflo))
69 | p.add(cfStatsBytes, Bytes.toBytes("flomin"), Bytes.toBytes(row.minflo))
70 | p.add(cfStatsBytes, Bytes.toBytes("floavg"), Bytes.toBytes(row.avgflo))
71 | p.add(cfStatsBytes, Bytes.toBytes("sedPPMmax"), Bytes.toBytes(row.maxsedPPM))
72 | p.add(cfStatsBytes, Bytes.toBytes("sedPPMmin"), Bytes.toBytes(row.minsedPPM))
73 | p.add(cfStatsBytes, Bytes.toBytes("sedPPMavg"), Bytes.toBytes(row.avgsedPPM))
74 | p.add(cfStatsBytes, Bytes.toBytes("psimax"), Bytes.toBytes(row.maxpsi))
75 | p.add(cfStatsBytes, Bytes.toBytes("psimin"), Bytes.toBytes(row.minpsi))
76 | p.add(cfStatsBytes, Bytes.toBytes("psiavg"), Bytes.toBytes(row.avgpsi))
77 | p.add(cfStatsBytes, Bytes.toBytes("chlPPMmax"), Bytes.toBytes(row.maxchlPPM))
78 | p.add(cfStatsBytes, Bytes.toBytes("chlPPMmin"), Bytes.toBytes(row.minchlPPM))
79 | p.add(cfStatsBytes, Bytes.toBytes("chlPPMavg"), Bytes.toBytes(row.avgchlPPM))
80 | (new ImmutableBytesWritable, p)
81 | }
82 | }
83 |
84 | final val tableName = "/user/user01/sensor"
85 | final val cfData = "data"
86 | final val cfDataBytes = Bytes.toBytes(cfData)
87 | final val cfStats = "stats"
88 | final val cfStatsBytes = Bytes.toBytes(cfStats)
89 |
90 | def main(args: Array[String]) {
91 | val sparkConf = new SparkConf().setAppName("HBaseTest")
92 | val sc = new SparkContext(sparkConf)
93 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
94 | import sqlContext.implicits._
95 |
96 | val conf = HBaseConfiguration.create()
97 |
98 | conf.set(TableInputFormat.INPUT_TABLE, tableName)
99 | // scan data column family
100 | conf.set(TableInputFormat.SCAN_COLUMNS, "data")
101 |
102 | // Load an RDD of (ImmutableBytesWritable, Result) tuples from the table
103 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
104 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
105 | classOf[org.apache.hadoop.hbase.client.Result])
106 |
107 | hBaseRDD.count()
108 |
109 | // transform (ImmutableBytesWritable, Result) tuples into an RDD of Results
110 | val resultRDD = hBaseRDD.map(tuple => tuple._2)
111 | resultRDD.count()
112 | // transform RDD of Results into an RDD of SensorRow objects
113 | val sensorRDD = resultRDD.map(SensorRow.parseSensorRow)
114 | // change RDD of SensorRow objects to a DataFrame
115 | val sensorDF = sensorRDD.toDF()
116 | // Return the schema of this DataFrame
117 | sensorDF.printSchema()
118 | // Display the top 20 rows of DataFrame
119 | sensorDF.show()
120 | // group by the rowkey (sensorid_date), get the average psi
121 | sensorDF.groupBy("rowkey").agg(avg(sensorDF("psi"))).take(5).foreach(println)
122 | // register the DataFrame as a temp table
123 | sensorDF.registerTempTable("SensorRow")
124 |
125 | // group by the rowkey (sensorid_date), get average, max, min for all columns
126 | val sensorStatDF = sqlContext.sql("SELECT rowkey,MAX(hz) as maxhz, min(hz) as minhz, avg(hz) as avghz, MAX(disp) as maxdisp, min(disp) as mindisp, avg(disp) as avgdisp, MAX(flo) as maxflo, min(flo) as minflo, avg(flo) as avgflo,MAX(sedPPM) as maxsedPPM, min(sedPPM) as minsedPPM, avg(sedPPM) as avgsedPPM, MAX(psi) as maxpsi, min(psi) as minpsi, avg(psi) as avgpsi,MAX(chlPPM) as maxchlPPM, min(chlPPM) as minchlPPM, avg(chlPPM) as avgchlPPM FROM SensorRow GROUP BY rowkey")
127 | sensorStatDF.printSchema()
128 | sensorStatDF.take(5).foreach(println)
129 |
130 | // map the query result row to the SensorStatsRow object
131 | val sensorStatsRowRDD = sensorStatDF.map {
132 | case Row(rowkey: String,
133 | maxhz: Double, minhz: Double, avghz: Double, maxdisp: Double, mindisp: Double, avgdisp: Double,
134 | maxflo: Double, minflo: Double, avgflo: Double, maxsedPPM: Double, minsedPPM: Double, avgsedPPM: Double,
135 | maxpsi: Double, minpsi: Double, avgpsi: Double, maxchlPPM: Double, minchlPPM: Double, avgchlPPM: Double) =>
136 | SensorStatsRow(rowkey: String,
137 | maxhz: Double, minhz: Double, avghz: Double, maxdisp: Double, mindisp: Double, avgdisp: Double,
138 | maxflo: Double, minflo: Double, avgflo: Double, maxsedPPM: Double, minsedPPM: Double, avgsedPPM: Double,
139 | maxpsi: Double, minpsi: Double, avgpsi: Double, maxchlPPM: Double, minchlPPM: Double, avgchlPPM: Double)
140 | }
141 |
142 | sensorStatsRowRDD.take(5).foreach(println)
143 |
144 | // set JobConfiguration variables for writing to HBase
145 | val jobConfig: JobConf = new JobConf(conf, this.getClass)
146 | jobConfig.set("mapreduce.output.fileoutputformat.outputdir", "/user/user01/out")
147 | // set the HBase output table
148 | jobConfig.setOutputFormat(classOf[TableOutputFormat])
149 | jobConfig.set(TableOutputFormat.OUTPUT_TABLE, tableName)
150 | // convert the SensorStatsRow objects into HBase put objects and write to HBase
151 | sensorStatsRowRDD.map {
152 | case sensorStatsRow => SensorStatsRow.convertToPutStats(sensorStatsRow)
153 | }.saveAsHadoopDataset(jobConfig)
154 | }
155 |
156 | }
157 |
--------------------------------------------------------------------------------
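
As an aside, the per-day statistics computed by the SQL string in HBaseReadRowWriteStats.scala can also be expressed with the DataFrame API. The fragment below is illustrative only and assumes the sensorDF DataFrame built in that program (two of the six sensor columns are shown; the rest follow the same pattern):

import org.apache.spark.sql.functions.{avg, max, min}

// same grouping key (sensorid_date) and aggregates as the SQL version
val sensorStatDF2 = sensorDF.groupBy("rowkey").agg(
  max("hz").as("maxhz"), min("hz").as("minhz"), avg("hz").as("avghz"),
  max("psi").as("maxpsi"), min("psi").as("minpsi"), avg("psi").as("avgpsi"))
sensorStatDF2.show()
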
/src/main/scala/examples/HBaseReadWrite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * This example reads a range of time series sensor data,
3 | * calculates the statistics for the psi column,
4 | * and then writes these statistics to the stats column family.
5 | *
6 | */
7 |
8 | package examples
9 |
10 | import org.apache.hadoop.hbase.HBaseConfiguration
11 | import org.apache.hadoop.hbase.client.Put
12 | import org.apache.hadoop.hbase.client.Result
13 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
14 | import org.apache.hadoop.hbase.mapred.TableOutputFormat
15 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
16 | import org.apache.hadoop.hbase.util.Bytes
17 | import org.apache.hadoop.mapred.JobConf
18 | import org.apache.spark.SparkConf
19 | import org.apache.spark.SparkContext
20 | import org.apache.spark.rdd.PairRDDFunctions
21 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
22 | import org.apache.spark.util.StatCounter
23 |
24 | object HBaseReadWrite extends Serializable{
25 |
26 | final val tableName = "/user/user01/sensor"
27 | final val cfDataBytes = Bytes.toBytes("data")
28 | final val cfStatsBytes = Bytes.toBytes("stats")
29 |
30 | def main(args: Array[String]) {
31 | val sparkConf = new SparkConf().setAppName("HBaseTest")
32 | val sc = new SparkContext(sparkConf)
33 |
34 | val conf = HBaseConfiguration.create()
35 | conf.set(TableInputFormat.INPUT_TABLE, tableName)
36 | conf.set(TableInputFormat.SCAN_ROW_START, "COHUTTA_3/10/14")
37 | conf.set(TableInputFormat.SCAN_ROW_STOP, "COHUTTA_3/11/14")
38 | // specify specific column to return
39 | conf.set(TableInputFormat.SCAN_COLUMNS, "data:psi")
40 |
41 | // Load an RDD of (ImmutableBytesWritable, Result) tuples from the table
42 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
43 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
44 | classOf[org.apache.hadoop.hbase.client.Result])
45 |
46 | hBaseRDD.count()
47 |
48 | // transform (ImmutableBytesWritable, Result) tuples into an RDD of Results
49 | val resultRDD = hBaseRDD.map(tuple => tuple._2)
50 | resultRDD.count()
51 | // transform into an RDD of (RowKey, ColumnValue) pairs; the RowKey has the time removed
52 | val keyValueRDD = resultRDD.map(result => (Bytes.toString(result.getRow()).split(" ")(0), Bytes.toDouble(result.value)))
53 | keyValueRDD.take(3).foreach(kv => println(kv))
54 |
55 | // group by rowkey, get statistics for the column value
56 | val keyStatsRDD = keyValueRDD.groupByKey().mapValues(list => StatCounter(list))
57 | keyStatsRDD.take(5).foreach(println)
58 |
59 | // set JobConfiguration variables for writing to HBase
60 | val jobConfig: JobConf = new JobConf(conf, this.getClass)
61 | jobConfig.set("mapreduce.output.fileoutputformat.outputdir", "/user/user01/out")
62 | jobConfig.setOutputFormat(classOf[TableOutputFormat])
63 | jobConfig.set(TableOutputFormat.OUTPUT_TABLE, tableName)
64 | // convert rowkey, psi stats to put and write to hbase table stats column family
65 | keyStatsRDD.map { case (k, v) => convertToPut(k, v) }.saveAsHadoopDataset(jobConfig)
66 |
67 | }
68 | // convert rowkey, stats to put
69 | def convertToPut(key: String, stats: StatCounter): (ImmutableBytesWritable, Put) = {
70 | val p = new Put(Bytes.toBytes(key))
71 | // add columns with data values to put
72 | p.add(cfStatsBytes, Bytes.toBytes("psimax"), Bytes.toBytes(stats.max))
73 | p.add(cfStatsBytes, Bytes.toBytes("psimin"), Bytes.toBytes(stats.min))
74 | p.add(cfStatsBytes, Bytes.toBytes("psimean"), Bytes.toBytes(stats.mean))
75 | (new ImmutableBytesWritable, p)
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
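
For reference, StatCounter (used above to aggregate the psi values per row key) can be tried on its own; the snippet below is illustrative only and uses made-up values:

import org.apache.spark.util.StatCounter

// StatCounter accumulates count, mean, min, max and variance over doubles
val stats = StatCounter(Seq(1.2, 4.5, 2.7))
println(s"max=${stats.max} min=${stats.min} mean=${stats.mean}")
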
/src/main/scala/examples/HBaseSensorStream.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * This example reads streaming sensor CSV data, parses each line into a Sensor object, writes
3 | * the sensor values to the data column family, and writes low-psi alerts to the alert column family.
4 | */
5 |
6 | package examples
7 |
8 | import org.apache.hadoop.hbase.HBaseConfiguration
9 | import org.apache.hadoop.hbase.client.Put
10 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
11 | import org.apache.hadoop.hbase.mapred.TableOutputFormat
12 | import org.apache.hadoop.hbase.util.Bytes
13 | import org.apache.hadoop.mapred.JobConf
14 | import org.apache.spark.SparkConf
15 |
16 | import org.apache.spark.streaming.Seconds
17 | import org.apache.spark.streaming.StreamingContext
18 |
19 | object HBaseSensorStream extends Serializable {
20 | final val tableName = "/user/user01/sensor"
21 | final val cfDataBytes = Bytes.toBytes("data")
22 | final val cfAlertBytes = Bytes.toBytes("alert")
23 | final val colHzBytes = Bytes.toBytes("hz")
24 | final val colDispBytes = Bytes.toBytes("disp")
25 | final val colFloBytes = Bytes.toBytes("flo")
26 | final val colSedBytes = Bytes.toBytes("sedPPM")
27 | final val colPsiBytes = Bytes.toBytes("psi")
28 | final val colChlBytes = Bytes.toBytes("chlPPM")
29 |
30 | // schema for sensor data
31 | case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double)
32 |
33 | object Sensor extends Serializable{
34 | // function to parse line of sensor data into Sensor class
35 | def parseSensor(str: String): Sensor = {
36 | val p = str.split(",")
37 | Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble)
38 | }
39 | // Convert a row of sensor object data to an HBase put object
40 | def convertToPut(sensor: Sensor): (ImmutableBytesWritable, Put) = {
41 | val dateTime = sensor.date + " " + sensor.time
42 | // create a composite row key: sensorid_date time
43 | val rowkey = sensor.resid + "_" + dateTime
44 | val put = new Put(Bytes.toBytes(rowkey))
45 | // add to column family data, column data values to put object
46 | put.add(cfDataBytes, colHzBytes, Bytes.toBytes(sensor.hz))
47 | put.add(cfDataBytes, colDispBytes, Bytes.toBytes(sensor.disp))
48 | put.add(cfDataBytes, colFloBytes, Bytes.toBytes(sensor.flo))
49 | put.add(cfDataBytes, colSedBytes, Bytes.toBytes(sensor.sedPPM))
50 | put.add(cfDataBytes, colPsiBytes, Bytes.toBytes(sensor.psi))
51 | put.add(cfDataBytes, colChlBytes, Bytes.toBytes(sensor.chlPPM))
52 | return (new ImmutableBytesWritable(Bytes.toBytes(rowkey)), put)
53 | }
54 | // convert psi alert to an HBase put object
55 | def convertToPutAlert(sensor: Sensor): (ImmutableBytesWritable, Put) = {
56 | val dateTime = sensor.date + " " + sensor.time
57 | // create a composite row key: sensorid_date time
58 | val key = sensor.resid + "_" + dateTime
59 | val p = new Put(Bytes.toBytes(key))
60 | // add to column family alert, column psi data value to put object
61 | p.add(cfAlertBytes, colPsiBytes, Bytes.toBytes(sensor.psi))
62 | return (new ImmutableBytesWritable(Bytes.toBytes(key)), p)
63 | }
64 | }
65 |
66 | def main(args: Array[String]): Unit = {
67 | // set up HBase Table configuration
68 | val conf = HBaseConfiguration.create()
69 | conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
70 | val jobConfig: JobConf = new JobConf(conf, this.getClass)
71 | jobConfig.set("mapreduce.output.fileoutputformat.outputdir", "/user/user01/out")
72 | jobConfig.setOutputFormat(classOf[TableOutputFormat])
73 | jobConfig.set(TableOutputFormat.OUTPUT_TABLE, tableName)
74 |
75 | val sparkConf = new SparkConf().setAppName("HBaseStream")
76 | // create a StreamingContext, the main entry point for all streaming functionality
77 | val ssc = new StreamingContext(sparkConf, Seconds(2))
78 |
79 | // parse the lines of data into sensor objects
80 | val sensorDStream = ssc.textFileStream("/user/user01/stream").map(Sensor.parseSensor)
81 | sensorDStream.print()
82 |
83 | sensorDStream.foreachRDD { rdd =>
84 | // filter sensor data for low psi
85 | val alertRDD = rdd.filter(sensor => sensor.psi < 5.0)
86 | alertRDD.take(1).foreach(println)
87 | // convert sensor data to put object and write to HBase table column family data
88 | rdd.map(Sensor.convertToPut).
89 | saveAsHadoopDataset(jobConfig)
90 | // convert alert data to put object and write to HBase table column family alert
91 | alertRDD.map(Sensor.convertToPutAlert).
92 | saveAsHadoopDataset(jobConfig)
93 | }
94 | // Start the computation
95 | ssc.start()
96 | // Wait for the computation to terminate
97 | ssc.awaitTermination()
98 |
99 | }
100 |
101 | }
--------------------------------------------------------------------------------
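
The stream files read by HBaseSensorStream are expected to be CSV lines in the field order of the Sensor case class: resid,date,time,hz,disp,flo,sedPPM,psi,chlPPM. A quick local check of the parser (the sample line below is hypothetical, not taken from sensordata.csv):

// hypothetical sample line in the field order expected by Sensor.parseSensor
val line = "COHUTTA,3/10/14,1:01,10.37,0.75,10.37,0.86,1.29,0.05"
val s = examples.HBaseSensorStream.Sensor.parseSensor(line)
// a psi below 5.0 would also produce an alert row in the streaming job
println(s.resid + " psi=" + s.psi)
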