├── .gitattributes ├── .gitignore ├── README.txt ├── data ├── sensordata.csv ├── sensormaint.csv └── sensorvendor.csv ├── dependency-reduced-pom.xml ├── pom.xml ├── scripts ├── create_ext_table.hql ├── create_join_view.hql ├── create_maint_table.hql └── create_pump_table.hql └── src └── main └── scala └── examples ├── HBaseReadRowWriteStats.scala ├── HBaseReadWrite.scala └── HBaseSensorStream.scala /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | /target/ -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | 2 | Create an HBase table to write to: 3 | launch the HBase shell 4 | $hbase shell 5 | 6 | create '/user/user01/sensor', {NAME=>'data'}, {NAME=>'alert'}, {NAME=>'stats'} 7 | 8 | Commands to run the labs: 9 | 10 | Step 1: Compile the project: select the project -> Run As -> Maven Install 11 | 12 | Step 2: Use scp to copy sparkstreamhbaseapp-1.0.jar to the MapR sandbox or cluster 13 | 14 | To run the streaming application: 15 | 16 | Step 3: Start the streaming app 17 | 18 | /opt/mapr/spark/spark-1.5.2/bin/spark-submit --driver-class-path `hbase classpath` --class examples.HBaseSensorStream sparkstreamhbaseapp-1.0.jar 19 | 20 | Step 4: Copy the streaming data file to the stream directory 21 | cp sensordata.csv /user/user01/stream/. 
22 | 23 | Step 5: you can scan the data written to the table, however the values in binary double are not readable from the shell 24 | launch the hbase shell, scan the data column family and the alert column family 25 | $hbase shell 26 | scan '/user/user01/sensor', {COLUMNS=>['data'], LIMIT => 10} 27 | scan '/user/user01/sensor', {COLUMNS=>['alert'], LIMIT => 10 } 28 | 29 | Step 6: launch one of the programs below to read data and calculate daily statistics 30 | calculate stats for one column 31 | /opt/mapr/spark/spark-1.5.2/bin/spark-submit --driver-class-path `hbase classpath` --class examples.HBaseReadWrite sparkstreamhbaseapp-1.0.jar 32 | calculate stats for whole row 33 | /opt/mapr/spark/spark-1.5.2/bin/spark-submit --driver-class-path `hbase classpath` --class examples.HBaseReadRowWriteStats sparkstreamhbaseapp-1.0.jar 34 | 35 | launch the shell and scan for statistics 36 | scan '/user/user01/sensor', {COLUMNS=>['stats']} 37 | 38 | 39 | -------------------------------------------------------------------------------- /data/sensormaint.csv: -------------------------------------------------------------------------------- 1 | COHUTTA,3/15/11,J.Thomas,Install 2 | COHUTTA,2/20/12,J.Thomas,Inspection 3 | COHUTTA,1/13/13,J.Thomas,Inspection 4 | COHUTTA,6/15/13,J.Thomas,Tighten Mounts 5 | COHUTTA,2/27/14,J.Thomas,Inspection 6 | COHUTTA,3/6/14,E. Simmons,Adjust bearing alignment 7 | NANTAHALLA,3/15/11,J.Thomas,Install 8 | NANTAHALLA,2/19/12,J.Thomas,Inspection 9 | NANTAHALLA,1/12/13,J.Thomas,Inspection 10 | NANTAHALLA,6/14/13,J.Thomas,Tighten Mounts 11 | NANTAHALLA,2/26/14,J.Thomas,Inspection 12 | NANTAHALLA,3/3/14,E. Simmons,Adjust bearing alignment 13 | NANTAHALLA,3/13/14,E. Simmons,Shutdown Failure 14 | THERMALITO,9/26/09,W.Stevens,Install 15 | THERMALITO,11/22/09,W.Stevens,Tighten Mounts 16 | THERMALITO,6/10/10,T. LaBou,Inspection 17 | THERMALITO,1/7/11,T. LaBou,Inspection 18 | THERMALITO,9/26/11,W.Stevens,Inspection 19 | THERMALITO,10/2/11,T. LaBou,Bearing Seal 20 | THERMALITO,11/5/11,D.Pitre,Inspect 21 | THERMALITO,5/22/12,D.Pitre,Inspect 22 | THERMALITO,12/15/12,D.Pitre,Inspect 23 | THERMALITO,6/16/13,T. LaBou,Vane clearance adjust 24 | THERMALITO,7/11/13,W.Stevens,Inspect 25 | THERMALITO,2/5/14,D.Pitre,Inspect 26 | BUTTE,10/2/09,W.Stevens,Install 27 | BUTTE,10/5/09,W.Stevens,Inspect 28 | BUTTE,11/22/09,W.Stevens,Tighten Mounts 29 | BUTTE,6/10/10,T. LaBou,Inspect 30 | BUTTE,1/7/11,T. LaBou,Inspect 31 | BUTTE,9/26/11,W.Stevens,Inspect 32 | BUTTE,10/2/11,T. LaBou,Bearing Seal 33 | BUTTE,11/5/11,D.Pitre,Inspect 34 | BUTTE,5/22/12,D.Pitre,Inspect 35 | BUTTE,12/15/12,D.Pitre,Inspect 36 | BUTTE,6/16/13,T. LaBou,Vane clearance adjust 37 | BUTTE,7/11/13,W.Stevens,Inspect 38 | BUTTE,2/5/14,D.Pitre,Inspect 39 | CARGO,10/2/09,T. LaBou,Install 40 | CARGO,10/5/09,W.Stevens,Inspect 41 | CARGO,11/22/09,T. LaBou,Tighten Mounts 42 | CARGO,6/10/10,T. LaBou,Inspect 43 | CARGO,1/7/11,T. LaBou,Inspect 44 | CARGO,9/26/11,W.Stevens,Inspect 45 | CARGO,10/2/11,T. LaBou,Bearing Seal 46 | CARGO,11/5/11,D.Pitre,Inspect 47 | CARGO,5/22/12,D.Pitre,Inspect 48 | CARGO,12/15/12,D.Pitre,Inspect 49 | CARGO,6/18/13,T. LaBou,Vane clearance adjust 50 | CARGO,7/11/13,W.Stevens,Inspect 51 | CARGO,2/5/14,D.Pitre,Inspect 52 | CARGO,3/13/14,D.Pitre,Tighten Mounts 53 | LAGNAPPE,10/2/09,T. LaBou,Install 54 | LAGNAPPE,10/5/09,W.Stevens,Inspect 55 | LAGNAPPE,11/24/09,W.Stevens,Tighten Mounts 56 | LAGNAPPE,6/10/10,T. LaBou,Inspect 57 | LAGNAPPE,1/7/11,T. LaBou,Inspect 58 | LAGNAPPE,9/30/11,W.Stevens,Inspect 59 | LAGNAPPE,10/3/11,T. 
LaBou,Bearing Seal 60 | LAGNAPPE,11/5/11,D.Pitre,Inspect 61 | LAGNAPPE,5/22/12,D.Pitre,Inspect 62 | LAGNAPPE,12/15/12,W.Stevens,Inspect 63 | LAGNAPPE,6/18/13,T. LaBou,Vane clearance adjust 64 | LAGNAPPE,7/11/13,W.Stevens,Inspect 65 | LAGNAPPE,2/5/14,D.Pitre,Inspect 66 | LAGNAPPE,3/14/14,D.Pitre,Shutdown Main Feed Line Failure 67 | CHER,11/5/09,D.Pitre,Install 68 | CHER,11/23/09,W.Stevens,Inspect 69 | CHER,11/24/09,W.Stevens,Tighten Mounts 70 | CHER,6/10/10,T. LaBou,Inspect 71 | CHER,1/7/11,T. LaBou,Inspect 72 | CHER,9/30/11,W.Stevens,Inspect 73 | CHER,10/3/11,W.Stevens,Bearing Seal 74 | CHER,11/5/11,D.Pitre,Inspect 75 | CHER,5/22/12,W.Stevens,Inspect 76 | CHER,12/15/12,W.Stevens,Inspect 77 | CHER,6/22/13,T. LaBou,Vane clearance adjust 78 | CHER,7/11/13,W.Stevens,Inspect 79 | CHER,2/5/14,D.Pitre,Inspect 80 | CHER,3/15/14,T. LaBou,Tighten Mounts 81 | ANDOUILLE,11/25/09,W.Stevens,Install 82 | ANDOUILLE,11/27/09,W.Stevens,Inspect 83 | ANDOUILLE,12/2/09,W.Stevens,Tighten Mounts 84 | ANDOUILLE,6/10/10,T. LaBou,Inspect 85 | ANDOUILLE,1/7/11,T. LaBou,Inspect 86 | ANDOUILLE,9/30/11,W.Stevens,Inspect 87 | ANDOUILLE,10/7/11,D.Pitre,Bearing Seal 88 | ANDOUILLE,11/5/11,D.Pitre,Inspect 89 | ANDOUILLE,5/22/12,W.Stevens,Inspect 90 | ANDOUILLE,12/15/12,W.Stevens,Inspect 91 | ANDOUILLE,6/25/13,T. LaBou,Vane clearance adjust 92 | ANDOUILLE,7/11/13,W.Stevens,Inspect 93 | ANDOUILLE,2/5/14,D.Pitre,Inspect 94 | ANDOUILLE,3/12/14,T. LaBou,Tighten Mounts 95 | MOJO,6/20/11,N.Boudreau,Install 96 | MOJO,7/11/11,N.Boudreau,Tighten Mounts 97 | MOJO,1/22/13,N.Boudreau,Inspect 98 | MOJO,7/15/13,N.Boudreau,Inspect 99 | MOJO,12/15/13,N.Boudreau,Inspect 100 | MOJO,6/7/13,M.Dugas,Inspect 101 | MOJO,1/11/14,N.Boudreau,Inspect 102 | MOJO,3/14/14,M.Dugas,Inspect 103 | BBKING,7/15/11,M.Dugas,Install 104 | BBKING,7/11/11,N.Boudreau,Tighten Mounts 105 | BBKING,1/24/13,N.Boudreau,Inspect 106 | BBKING,7/22/13,N.Boudreau,Inspect 107 | BBKING,12/16/13,N.Boudreau,Inspect 108 | BBKING,12/18/13,M.Dugas,Shutdown Failure 109 | BBKING,12/19/13,M.Dugas,Replace Front Motor Bearing 110 | BBKING,12/20/13,N.Boudreau,Inspect 111 | BBKING,6/12/13,M.Dugas,Inspect 112 | BBKING,1/14/14,N.Boudreau,Inspect 113 | BBKING,3/14/14,M.Dugas,Inspect 114 | -------------------------------------------------------------------------------- /data/sensorvendor.csv: -------------------------------------------------------------------------------- 1 | COHUTTA,HYDROPUMP,11/27/10,3/15/11,HYDROCAM,29.687276,-91.162492 2 | NANTAHALLA,HYDROPUMP,11/27/10,3/15/11,HYDROCAM,29.687128,-91.162499 3 | THERMALITO,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.687276,-91.162492 4 | BUTTE,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.686929,-91.1625 5 | CARGO,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683147,-91.145448 6 | LAGNAPPE,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683124,-91.145471 7 | CHER,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683167,-91.145427 8 | ANDOUILLE,HYDROPUMP,5/25/08,9/26/09,GENPUMP,29.683187,-91.145408 9 | MOJO,HYDROPUMP,4/25/11,6/20/11,XYLO,29.66975,-91.13223 10 | BBKING,HYDROPUMP,4/25/11,6/20/11,XYLO,29.669723,-91.132278 11 | -------------------------------------------------------------------------------- /dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | mapr 5 | sparkdataframesapp 6 | sparkdataframesapp 7 | 1.0 8 | 2015 9 | 10 | src/main/scala 11 | src/test/scala 12 | 13 | 14 | net.alchim31.maven 15 | scala-maven-plugin 16 | 3.2.0 17 | 18 | 19 | 20 | compile 21 | testCompile 22 | 23 | 24 | 25 | -dependencyfile 
26 | ${project.build.directory}/.scala_dependencies 27 | 28 | 29 | 30 | 31 | 32 | 33 | maven-surefire-plugin 34 | 2.18.1 35 | 36 | false 37 | true 38 | 39 | **/*Test.* 40 | **/*Suite.* 41 | 42 | 43 | 44 | 45 | maven-shade-plugin 46 | 2.3 47 | 48 | 49 | package 50 | 51 | shade 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | scala-tools.org 61 | Scala-tools Maven2 Repository 62 | http://scala-tools.org/repo-releases 63 | 64 | 65 | 66 | 67 | false 68 | 69 | mapr-releases 70 | http://repository.mapr.com/maven/ 71 | 72 | 73 | 74 | 75 | org.apache.spark 76 | spark-core_2.10 77 | 1.3.1 78 | provided 79 | 80 | 81 | chill_2.10 82 | com.twitter 83 | 84 | 85 | chill-java 86 | com.twitter 87 | 88 | 89 | hadoop-client 90 | org.apache.hadoop 91 | 92 | 93 | spark-network-common_2.10 94 | org.apache.spark 95 | 96 | 97 | spark-network-shuffle_2.10 98 | org.apache.spark 99 | 100 | 101 | jets3t 102 | net.java.dev.jets3t 103 | 104 | 105 | curator-recipes 106 | org.apache.curator 107 | 108 | 109 | javax.servlet 110 | org.eclipse.jetty.orbit 111 | 112 | 113 | commons-lang3 114 | org.apache.commons 115 | 116 | 117 | commons-math3 118 | org.apache.commons 119 | 120 | 121 | jsr305 122 | com.google.code.findbugs 123 | 124 | 125 | slf4j-api 126 | org.slf4j 127 | 128 | 129 | jul-to-slf4j 130 | org.slf4j 131 | 132 | 133 | jcl-over-slf4j 134 | org.slf4j 135 | 136 | 137 | log4j 138 | log4j 139 | 140 | 141 | slf4j-log4j12 142 | org.slf4j 143 | 144 | 145 | compress-lzf 146 | com.ning 147 | 148 | 149 | snappy-java 150 | org.xerial.snappy 151 | 152 | 153 | lz4 154 | net.jpountz.lz4 155 | 156 | 157 | RoaringBitmap 158 | org.roaringbitmap 159 | 160 | 161 | commons-net 162 | commons-net 163 | 164 | 165 | akka-remote_2.10 166 | org.spark-project.akka 167 | 168 | 169 | akka-slf4j_2.10 170 | org.spark-project.akka 171 | 172 | 173 | json4s-jackson_2.10 174 | org.json4s 175 | 176 | 177 | mesos 178 | org.apache.mesos 179 | 180 | 181 | netty-all 182 | io.netty 183 | 184 | 185 | stream 186 | com.clearspring.analytics 187 | 188 | 189 | metrics-core 190 | io.dropwizard.metrics 191 | 192 | 193 | metrics-jvm 194 | io.dropwizard.metrics 195 | 196 | 197 | metrics-json 198 | io.dropwizard.metrics 199 | 200 | 201 | metrics-graphite 202 | io.dropwizard.metrics 203 | 204 | 205 | jackson-databind 206 | com.fasterxml.jackson.core 207 | 208 | 209 | jackson-module-scala_2.10 210 | com.fasterxml.jackson.module 211 | 212 | 213 | ivy 214 | org.apache.ivy 215 | 216 | 217 | oro 218 | oro 219 | 220 | 221 | tachyon-client 222 | org.tachyonproject 223 | 224 | 225 | pyrolite 226 | org.spark-project 227 | 228 | 229 | py4j 230 | net.sf.py4j 231 | 232 | 233 | unused 234 | org.spark-project.spark 235 | 236 | 237 | 238 | 239 | org.apache.spark 240 | spark-sql_2.10 241 | 1.3.1 242 | provided 243 | 244 | 245 | spark-catalyst_2.10 246 | org.apache.spark 247 | 248 | 249 | parquet-column 250 | com.twitter 251 | 252 | 253 | parquet-hadoop 254 | com.twitter 255 | 256 | 257 | jodd-core 258 | org.jodd 259 | 260 | 261 | jackson-databind 262 | com.fasterxml.jackson.core 263 | 264 | 265 | unused 266 | org.spark-project.spark 267 | 268 | 269 | 270 | 271 | junit 272 | junit 273 | 4.11 274 | test 275 | 276 | 277 | hamcrest-core 278 | org.hamcrest 279 | 280 | 281 | 282 | 283 | org.specs2 284 | specs2_2.10 285 | 1.13 286 | test 287 | 288 | 289 | scalaz-core_2.10 290 | org.specs2 291 | 292 | 293 | scalaz-concurrent_2.10 294 | org.specs2 295 | 296 | 297 | 298 | 299 | org.scalatest 300 | scalatest_2.10 301 | 2.0.M6-SNAP8 302 | test 303 | 304 | 305 | 306 | 2.10.4 307 | UTF-8 308 | 1.7 
309 | 1.3.1 310 | 2.10 311 | 1.7 312 | 313 | 314 | 315 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | mapr 5 | sparkstreamhbaseapp 6 | 1.0 7 | sparkstreamhbaseapp 8 | 9 | 2015 10 | 11 | 12 | 1.7 13 | 1.7 14 | UTF-8 15 | 2.10 16 | 2.10.4 17 | 1.5.2 18 | 1.1.1-mapr-1602-m7-5.1.0 19 | 20 | 21 | 22 | 23 | scala-tools.org 24 | Scala-tools Maven2 Repository 25 | http://scala-tools.org/repo-releases 26 | 27 | 28 | 29 | mapr-releases 30 | http://repository.mapr.com/maven/ 31 | 32 | false 33 | 34 | 35 | true 36 | 37 | 38 | 39 | maven2-repository.dev.java.net 40 | Java.net Repository for Maven 41 | http://download.java.net/maven/2/ 42 | default 43 | 44 | 45 | 46 | 47 | org.scala-lang 48 | scala-library 49 | ${scala.version} 50 | 51 | 52 | org.apache.spark 53 | spark-core_${scala.tools.version} 54 | ${spark.version} 55 | provided 56 | 57 | 58 | org.apache.spark 59 | spark-sql_${scala.tools.version} 60 | ${spark.version} 61 | 62 | 63 | org.apache.spark 64 | spark-streaming_${scala.tools.version} 65 | ${spark.version} 66 | 67 | 68 | org.apache.hbase 69 | hbase-server 70 | ${mapr.version} 71 | 72 | 73 | 74 | 75 | src/main/scala 76 | src/test/scala 77 | 78 | 79 | org.scala-tools 80 | maven-scala-plugin 81 | 2.15.2 82 | 83 | 84 | 85 | compile 86 | 87 | 88 | 89 | -dependencyfile 90 | ${project.build.directory}/.scala_dependencies 91 | 92 | 93 | 94 | 95 | 96 | 97 | org.apache.maven.plugins 98 | maven-compiler-plugin 99 | 2.3.1 100 | 101 | 1.7 102 | 1.7 103 | true 104 | true 105 | 106 | 107 | 108 | org.apache.maven.plugins 109 | maven-eclipse-plugin 110 | 2.8 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /scripts/create_ext_table.hql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE sensor 2 | (key STRING, resID STRING, date STRING, 3 | hz FLOAT, 4 | disp FLOAT, 5 | flo INT, 6 | sedPPM FLOAT, 7 | psi INT, 8 | chlPPM FLOAT) 9 | STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' 10 | WITH SERDEPROPERTIES ( 11 | "hbase.columns.mapping" = 12 | ":key,cf1:resID,cf1:date,cf1:hz,cf1:disp, 13 | cf1:flo,cf1:sedPPM,cf1:psi,cf1:chlPPM" 14 | ) 15 | 16 | TBLPROPERTIES("hbase.table.name" = "/user/user01/sensor"); 17 | -------------------------------------------------------------------------------- /scripts/create_join_view.hql: -------------------------------------------------------------------------------- 1 | create view pumpview as 2 | select s.date, s.hz, s.disp, s.flo, s.sedPPM, s.psi, s.chlPPM, 3 | p.resourceid, p.type, p.purchasedate, p.dateinservice, p.vendor, p.longitude, p.latitude 4 | from sensor s 5 | join pump_info p 6 | on (s.resid = p.resourceid); 7 | -------------------------------------------------------------------------------- /scripts/create_maint_table.hql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE maint_table 2 | 3 | (resourceid STRING, eventDate STRING, 4 | technician STRING, description STRING) 5 | 6 | ROW FORMAT DELIMITED FIELDS TERMINATED BY "," 7 | 8 | 9 | STORED AS TEXTFILE LOCATION "/user/user01/sensormaint.csv"; 10 | -------------------------------------------------------------------------------- /scripts/create_pump_table.hql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE pump_info 2 | 3 | 
(resourceid STRING, type STRING, purchasedate STRING, 4 | dateinservice STRING, vendor STRING, longitude FLOAT, latitude FLOAT) 5 | 6 | ROW FORMAT DELIMITED FIELDS TERMINATED BY "," 7 | 8 | 9 | STORED AS TEXTFILE LOCATION "/user/user01/sensorvendor.csv"; 10 | -------------------------------------------------------------------------------- /src/main/scala/examples/HBaseReadRowWriteStats.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * This example reads rows of time series sensor data, 3 | * calculates daily statistics for each column, 4 | * and then writes these statistics to the stats column family 5 | * 6 | * You can specify which columns to return; more info: 7 | * http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html 8 | */ 9 | 10 | package examples 11 | 12 | import scala.reflect.runtime.universe 13 | 14 | import org.apache.hadoop.hbase.HBaseConfiguration 15 | import org.apache.hadoop.hbase.client.Put 16 | import org.apache.hadoop.hbase.client.Result 17 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 18 | import org.apache.hadoop.hbase.mapred.TableOutputFormat 19 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 20 | import org.apache.hadoop.hbase.util.Bytes 21 | import org.apache.hadoop.mapred.JobConf 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.SparkContext 24 | import org.apache.spark.rdd.PairRDDFunctions 25 | import org.apache.spark.sql.Row 26 | import org.apache.spark.sql.functions.avg 27 | import org.apache.hadoop.mapreduce.Job 28 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat 29 | import org.apache.hadoop.fs.Path 30 | 31 | object HBaseReadRowWriteStats { 32 | 33 | case class SensorRow(rowkey: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double) 34 | 35 | object SensorRow extends Serializable{ 36 | def parseSensorRow(result: Result): SensorRow = { 37 | val rowkey = Bytes.toString(result.getRow()) 38 | // remove time from rowKey, stats row key is for day 39 | val p0 = rowkey.split(" ")(0) 40 | val p1 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("hz"))) 41 | val p2 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("disp"))) 42 | val p3 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("flo"))) 43 | val p4 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("sedPPM"))) 44 | val p5 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("psi"))) 45 | val p6 = Bytes.toDouble(result.getValue(cfDataBytes, Bytes.toBytes("chlPPM"))) 46 | SensorRow(p0, p1, p2, p3, p4, p5, p6) 47 | } 48 | } 49 | 50 | case class SensorStatsRow(rowkey: String, 51 | maxhz: Double, minhz: Double, avghz: Double, 52 | maxdisp: Double, mindisp: Double, avgdisp: Double, 53 | maxflo: Double, minflo: Double, avgflo: Double, 54 | maxsedPPM: Double, minsedPPM: Double, avgsedPPM: Double, 55 | maxpsi: Double, minpsi: Double, avgpsi: Double, 56 | maxchlPPM: Double, minchlPPM: Double, avgchlPPM: Double) 57 | 58 | object SensorStatsRow { 59 | def convertToPutStats(row: SensorStatsRow): (ImmutableBytesWritable, Put) = { 60 | val p = new Put(Bytes.toBytes(row.rowkey)) 61 | // add columns with data values to put 62 | p.add(cfStatsBytes, Bytes.toBytes("hzmax"), Bytes.toBytes(row.maxhz)) 63 | p.add(cfStatsBytes, Bytes.toBytes("hzmin"), Bytes.toBytes(row.minhz)) 64 | p.add(cfStatsBytes, Bytes.toBytes("hzavg"), Bytes.toBytes(row.avghz)) 65 | p.add(cfStatsBytes, Bytes.toBytes("dispmax"), 
Bytes.toBytes(row.maxdisp)) 66 | p.add(cfStatsBytes, Bytes.toBytes("dispmin"), Bytes.toBytes(row.mindisp)) 67 | p.add(cfStatsBytes, Bytes.toBytes("dispavg"), Bytes.toBytes(row.avgdisp)) 68 | p.add(cfStatsBytes, Bytes.toBytes("flomax"), Bytes.toBytes(row.maxflo)) 69 | p.add(cfStatsBytes, Bytes.toBytes("flomin"), Bytes.toBytes(row.minflo)) 70 | p.add(cfStatsBytes, Bytes.toBytes("floavg"), Bytes.toBytes(row.avgflo)) 71 | p.add(cfStatsBytes, Bytes.toBytes("sedPPMmax"), Bytes.toBytes(row.maxsedPPM)) 72 | p.add(cfStatsBytes, Bytes.toBytes("sedPPMmin"), Bytes.toBytes(row.minsedPPM)) 73 | p.add(cfStatsBytes, Bytes.toBytes("sedPPMavg"), Bytes.toBytes(row.avgsedPPM)) 74 | p.add(cfStatsBytes, Bytes.toBytes("psimax"), Bytes.toBytes(row.maxpsi)) 75 | p.add(cfStatsBytes, Bytes.toBytes("psimin"), Bytes.toBytes(row.minpsi)) 76 | p.add(cfStatsBytes, Bytes.toBytes("psiavg"), Bytes.toBytes(row.avgpsi)) 77 | p.add(cfStatsBytes, Bytes.toBytes("chlPPMmax"), Bytes.toBytes(row.maxchlPPM)) 78 | p.add(cfStatsBytes, Bytes.toBytes("chlPPMmin"), Bytes.toBytes(row.minchlPPM)) 79 | p.add(cfStatsBytes, Bytes.toBytes("chlPPMavg"), Bytes.toBytes(row.avgchlPPM)) 80 | (new ImmutableBytesWritable, p) 81 | } 82 | } 83 | 84 | final val tableName = "/user/user01/sensor" 85 | final val cfData = "data" 86 | final val cfDataBytes = Bytes.toBytes(cfData) 87 | final val cfStats = "stats" 88 | final val cfStatsBytes = Bytes.toBytes(cfStats) 89 | 90 | def main(args: Array[String]) { 91 | val sparkConf = new SparkConf().setAppName("HBaseTest") 92 | val sc = new SparkContext(sparkConf) 93 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 94 | import sqlContext.implicits._ 95 | 96 | val conf = HBaseConfiguration.create() 97 | 98 | conf.set(TableInputFormat.INPUT_TABLE, tableName) 99 | // scan data column family 100 | conf.set(TableInputFormat.SCAN_COLUMNS, "data") 101 | 102 | // Load an RDD of rowkey, result(ImmutableBytesWritable, Result) tuples from the table 103 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], 104 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 105 | classOf[org.apache.hadoop.hbase.client.Result]) 106 | 107 | hBaseRDD.count() 108 | 109 | // transform (ImmutableBytesWritable, Result) tuples into an RDD of Results 110 | val resultRDD = hBaseRDD.map(tuple => tuple._2) 111 | resultRDD.count() 112 | // transform RDD of Results into an RDD of SensorRow objects 113 | val sensorRDD = resultRDD.map(SensorRow.parseSensorRow) 114 | // change RDD of SensorRow objects to a DataFrame 115 | val sensorDF = sensorRDD.toDF() 116 | // Return the schema of this DataFrame 117 | sensorDF.printSchema() 118 | // Display the top 20 rows of DataFrame 119 | sensorDF.show() 120 | // group by the rowkey (sensorid_date) get average psi 121 | sensorDF.groupBy("rowkey").agg(avg(sensorDF("psi"))).take(5).foreach(println) 122 | // register the DataFrame as a temp table 123 | sensorDF.registerTempTable("SensorRow") 124 | 125 | // group by the rowkey (sensorid_date) get average, max , min for all columns 126 | val sensorStatDF = sqlContext.sql("SELECT rowkey,MAX(hz) as maxhz, min(hz) as minhz, avg(hz) as avghz, MAX(disp) as maxdisp, min(disp) as mindisp, avg(disp) as avgdisp, MAX(flo) as maxflo, min(flo) as minflo, avg(flo) as avgflo,MAX(sedPPM) as maxsedPPM, min(sedPPM) as minsedPPM, avg(sedPPM) as avgsedPPM, MAX(psi) as maxpsi, min(psi) as minpsi, avg(psi) as avgpsi,MAX(chlPPM) as maxchlPPM, min(chlPPM) as minchlPPM, avg(chlPPM) as avgchlPPM FROM SensorRow GROUP BY rowkey") 127 | sensorStatDF.printSchema() 128 
| sensorStatDF.take(5).foreach(println) 129 | 130 | // map the query result row to the SensorStatsRow object 131 | val sensorStatsRowRDD = sensorStatDF.map { 132 | case Row(rowkey: String, 133 | maxhz: Double, minhz: Double, avghz: Double, maxdisp: Double, mindisp: Double, avgdisp: Double, 134 | maxflo: Double, minflo: Double, avgflo: Double, maxsedPPM: Double, minsedPPM: Double, avgsedPPM: Double, 135 | maxpsi: Double, minpsi: Double, avgpsi: Double, maxchlPPM: Double, minchlPPM: Double, avgchlPPM: Double) => 136 | SensorStatsRow(rowkey: String, 137 | maxhz: Double, minhz: Double, avghz: Double, maxdisp: Double, mindisp: Double, avgdisp: Double, 138 | maxflo: Double, minflo: Double, avgflo: Double, maxsedPPM: Double, minsedPPM: Double, avgsedPPM: Double, 139 | maxpsi: Double, minpsi: Double, avgpsi: Double, maxchlPPM: Double, minchlPPM: Double, avgchlPPM: Double) 140 | } 141 | 142 | sensorStatsRowRDD.take(5).foreach(println) 143 | 144 | // set JobConfiguration variables for writing to HBase 145 | val jobConfig: JobConf = new JobConf(conf, this.getClass) 146 | jobConfig.set("mapreduce.output.fileoutputformat.outputdir", "/user/user01/out") 147 | // set the HBase output table 148 | jobConfig.setOutputFormat(classOf[TableOutputFormat]) 149 | jobConfig.set(TableOutputFormat.OUTPUT_TABLE, tableName) 150 | // convert the SensorStatsRow objects into HBase put objects and write to HBase 151 | sensorStatsRowRDD.map { 152 | case sensorStatsRow => SensorStatsRow.convertToPutStats(sensorStatsRow) 153 | }.saveAsHadoopDataset(jobConfig) 154 | } 155 | 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/examples/HBaseReadWrite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * This example reads a row range of time series sensor data for one sensor and day, 3 | * calculates statistics for the psi column, 4 | * and then writes these statistics to the stats column family 5 | * 6 | */ 7 | 8 | package examples 9 | 10 | import org.apache.hadoop.hbase.HBaseConfiguration 11 | import org.apache.hadoop.hbase.client.Put 12 | import org.apache.hadoop.hbase.client.Result 13 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 14 | import org.apache.hadoop.hbase.mapred.TableOutputFormat 15 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 16 | import org.apache.hadoop.hbase.util.Bytes 17 | import org.apache.hadoop.mapred.JobConf 18 | import org.apache.spark.SparkConf 19 | import org.apache.spark.SparkContext 20 | import org.apache.spark.rdd.PairRDDFunctions 21 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 22 | import org.apache.spark.util.StatCounter 23 | 24 | object HBaseReadWrite extends Serializable{ 25 | 26 | final val tableName = "/user/user01/sensor" 27 | final val cfDataBytes = Bytes.toBytes("data") 28 | final val cfStatsBytes = Bytes.toBytes("stats") 29 | 30 | def main(args: Array[String]) { 31 | val sparkConf = new SparkConf().setAppName("HBaseTest") 32 | val sc = new SparkContext(sparkConf) 33 | 34 | val conf = HBaseConfiguration.create() 35 | conf.set(TableInputFormat.INPUT_TABLE, tableName) 36 | conf.set(TableInputFormat.SCAN_ROW_START, "COHUTTA_3/10/14") 37 | conf.set(TableInputFormat.SCAN_ROW_STOP, "COHUTTA_3/11/14") 38 | // specify a specific column to return 39 | conf.set(TableInputFormat.SCAN_COLUMNS, "data:psi") 40 | 41 | // Load an RDD of (ImmutableBytesWritable, Result) tuples from the table 42 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], 43 | 
classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 44 | classOf[org.apache.hadoop.hbase.client.Result]) 45 | 46 | hBaseRDD.count() 47 | 48 | // transform (ImmutableBytesWritable, Result) tuples into an RDD of Result’s 49 | val resultRDD = hBaseRDD.map(tuple => tuple._2) 50 | resultRDD.count() 51 | // transform into an RDD of (RowKey, ColumnValue)s the RowKey has the time removed 52 | val keyValueRDD = resultRDD.map(result => (Bytes.toString(result.getRow()).split(" ")(0), Bytes.toDouble(result.value))) 53 | keyValueRDD.take(3).foreach(kv => println(kv)) 54 | 55 | // group by rowkey , get statistics for column value 56 | val keyStatsRDD = keyValueRDD.groupByKey().mapValues(list => StatCounter(list)) 57 | keyStatsRDD.take(5).foreach(println) 58 | 59 | // set JobConfiguration variables for writing to HBase 60 | val jobConfig: JobConf = new JobConf(conf, this.getClass) 61 | jobConfig.set("mapreduce.output.fileoutputformat.outputdir", "/user/user01/out") 62 | jobConfig.setOutputFormat(classOf[TableOutputFormat]) 63 | jobConfig.set(TableOutputFormat.OUTPUT_TABLE, tableName) 64 | // convert rowkey, psi stats to put and write to hbase table stats column family 65 | keyStatsRDD.map { case (k, v) => convertToPut(k, v) }.saveAsHadoopDataset(jobConfig) 66 | 67 | } 68 | // convert rowkey, stats to put 69 | def convertToPut(key: String, stats: StatCounter): (ImmutableBytesWritable, Put) = { 70 | val p = new Put(Bytes.toBytes(key)) 71 | // add columns with data values to put 72 | p.add(cfStatsBytes, Bytes.toBytes("psimax"), Bytes.toBytes(stats.max)) 73 | p.add(cfStatsBytes, Bytes.toBytes("psimin"), Bytes.toBytes(stats.min)) 74 | p.add(cfStatsBytes, Bytes.toBytes("psimean"), Bytes.toBytes(stats.mean)) 75 | (new ImmutableBytesWritable, p) 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/scala/examples/HBaseSensorStream.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * 4 | */ 5 | 6 | package examples 7 | 8 | import org.apache.hadoop.hbase.HBaseConfiguration 9 | import org.apache.hadoop.hbase.client.Put 10 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 11 | import org.apache.hadoop.hbase.mapred.TableOutputFormat 12 | import org.apache.hadoop.hbase.util.Bytes 13 | import org.apache.hadoop.mapred.JobConf 14 | import org.apache.spark.SparkConf 15 | 16 | import org.apache.spark.streaming.Seconds 17 | import org.apache.spark.streaming.StreamingContext 18 | 19 | object HBaseSensorStream extends Serializable { 20 | final val tableName = "/user/user01/sensor" 21 | final val cfDataBytes = Bytes.toBytes("data") 22 | final val cfAlertBytes = Bytes.toBytes("alert") 23 | final val colHzBytes = Bytes.toBytes("hz") 24 | final val colDispBytes = Bytes.toBytes("disp") 25 | final val colFloBytes = Bytes.toBytes("flo") 26 | final val colSedBytes = Bytes.toBytes("sedPPM") 27 | final val colPsiBytes = Bytes.toBytes("psi") 28 | final val colChlBytes = Bytes.toBytes("chlPPM") 29 | 30 | // schema for sensor data 31 | case class Sensor(resid: String, date: String, time: String, hz: Double, disp: Double, flo: Double, sedPPM: Double, psi: Double, chlPPM: Double) 32 | 33 | object Sensor extends Serializable{ 34 | // function to parse line of sensor data into Sensor class 35 | def parseSensor(str: String): Sensor = { 36 | val p = str.split(",") 37 | Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble) 38 | } 39 | // Convert a row of 
sensor object data to an HBase put object 40 | def convertToPut(sensor: Sensor): (ImmutableBytesWritable, Put) = { 41 | val dateTime = sensor.date + " " + sensor.time 42 | // create a composite row key: sensorid_date time 43 | val rowkey = sensor.resid + "_" + dateTime 44 | val put = new Put(Bytes.toBytes(rowkey)) 45 | // add to column family data, column data values to put object 46 | put.add(cfDataBytes, colHzBytes, Bytes.toBytes(sensor.hz)) 47 | put.add(cfDataBytes, colDispBytes, Bytes.toBytes(sensor.disp)) 48 | put.add(cfDataBytes, colFloBytes, Bytes.toBytes(sensor.flo)) 49 | put.add(cfDataBytes, colSedBytes, Bytes.toBytes(sensor.sedPPM)) 50 | put.add(cfDataBytes, colPsiBytes, Bytes.toBytes(sensor.psi)) 51 | put.add(cfDataBytes, colChlBytes, Bytes.toBytes(sensor.chlPPM)) 52 | return (new ImmutableBytesWritable(Bytes.toBytes(rowkey)), put) 53 | } 54 | // convert psi alert to an HBase put object 55 | def convertToPutAlert(sensor: Sensor): (ImmutableBytesWritable, Put) = { 56 | val dateTime = sensor.date + " " + sensor.time 57 | // create a composite row key: sensorid_date time 58 | val key = sensor.resid + "_" + dateTime 59 | val p = new Put(Bytes.toBytes(key)) 60 | // add to column family alert, column psi data value to put object 61 | p.add(cfAlertBytes, colPsiBytes, Bytes.toBytes(sensor.psi)) 62 | return (new ImmutableBytesWritable(Bytes.toBytes(key)), p) 63 | } 64 | } 65 | 66 | def main(args: Array[String]): Unit = { 67 | // set up HBase Table configuration 68 | val conf = HBaseConfiguration.create() 69 | conf.set(TableOutputFormat.OUTPUT_TABLE, tableName) 70 | val jobConfig: JobConf = new JobConf(conf, this.getClass) 71 | jobConfig.set("mapreduce.output.fileoutputformat.outputdir", "/user/user01/out") 72 | jobConfig.setOutputFormat(classOf[TableOutputFormat]) 73 | jobConfig.set(TableOutputFormat.OUTPUT_TABLE, tableName) 74 | 75 | val sparkConf = new SparkConf().setAppName("HBaseStream") 76 | // create a StreamingContext, the main entry point for all streaming functionality 77 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 78 | 79 | // parse the lines of data into sensor objects 80 | val sensorDStream = ssc.textFileStream("/user/user01/stream").map(Sensor.parseSensor) 81 | sensorDStream.print() 82 | 83 | sensorDStream.foreachRDD { rdd => 84 | // filter sensor data for low psi 85 | val alertRDD = rdd.filter(sensor => sensor.psi < 5.0) 86 | alertRDD.take(1).foreach(println) 87 | // convert sensor data to put object and write to HBase table column family data 88 | rdd.map(Sensor.convertToPut). 89 | saveAsHadoopDataset(jobConfig) 90 | // convert alert data to put object and write to HBase table column family alert 91 | alertRDD.map(Sensor.convertToPutAlert). 92 | saveAsHadoopDataset(jobConfig) 93 | } 94 | // Start the computation 95 | ssc.start() 96 | // Wait for the computation to terminate 97 | ssc.awaitTermination() 98 | 99 | } 100 | 101 | } --------------------------------------------------------------------------------
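
Note: the sensor readings and statistics above are stored as binary doubles (Bytes.toBytes on a Double), which is why the README points out that scanned values are not human-readable from the HBase shell. Below is a minimal sketch, not a file in this repository, of how those cells could be read back and decoded with the same HBase client API used in the project; the object name ReadPsiValues and the COHUTTA row-key range are illustrative assumptions.

package examples

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HTable, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

// Illustrative only: decode the binary-double psi values written by HBaseSensorStream
object ReadPsiValues {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    val table = new HTable(conf, "/user/user01/sensor")
    // row keys have the form <resid>_<date> <time>, so a date-prefixed range selects one sensor-day
    val scan = new Scan(Bytes.toBytes("COHUTTA_3/10/14"), Bytes.toBytes("COHUTTA_3/11/14"))
    scan.addColumn(Bytes.toBytes("data"), Bytes.toBytes("psi"))
    val scanner = table.getScanner(scan)
    try {
      for (result <- scanner.asScala) {
        val psiBytes = result.getValue(Bytes.toBytes("data"), Bytes.toBytes("psi"))
        // Bytes.toDouble reverses the Bytes.toBytes(Double) encoding used on write
        if (psiBytes != null)
          println(Bytes.toString(result.getRow) + " psi=" + Bytes.toDouble(psiBytes))
      }
    } finally {
      scanner.close()
      table.close()
    }
  }
}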