├── Graph1.png
├── Graph2.png
├── LICENSE
├── Readme.md
├── SparkStreamingSessionNotes.txt
├── createHiveTable.hql
├── hbaseScript.txt
├── pom.xml
└── src
    └── main
        └── scala
            └── com
                └── cloudera
                    └── sa
                        └── example
                            └── sparkstreaming
                                └── sessionization
                                    ├── SessionDataFileHDFSWriter.scala
                                    ├── SessionDataFileWriter.scala
                                    ├── SessionDataGenerator.scala
                                    ├── SessionDataSocketSender.scala
                                    └── SessionizeData.scala


/Graph1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tmalaska/SparkStreaming.Sessionization/3fbf3717ca11fc6478e636c79e67a106f9c20b82/Graph1.png


--------------------------------------------------------------------------------
/Graph2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tmalaska/SparkStreaming.Sessionization/3fbf3717ca11fc6478e636c79e67a106f9c20b82/Graph2.png


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | Spark Stream Sessionization
 2 | ------------------------------
 3 | 
 4 | ### Problem
 5 | This is an example of how to use Spark Streaming to sessionize web log data by IP address.  
 6 | This means that we are sessionizing in near real time (NRT) and landing the results on HDFS.
 7 | 
 8 | Along with that, we are going to send stats about the sessionization to HBase so 
 9 | that we can query them with Impala to get an NRT picture of stats such as:
10 | 
11 | - Number of events
12 | - Number of active sessions
13 | - Average session time
14 | - Number of new sessions
15 | - Number of dead sessions
16 | 
17 | This will give us graphs like the following
18 | 
19 | ![alt tag](https://raw.githubusercontent.com/tmalaska/SparkStreaming.Sessionization/master/Graph1.png)
20 | 
21 | ![alt tag](https://raw.githubusercontent.com/tmalaska/SparkStreaming.Sessionization/master/Graph2.png)
22 | 
23 | ### How to use
24 | 
25 | 1: Set up HBase table.  Just go to the HBase shell and use the following command
26 | 
27 | create 'stats', 's'
28 | 
29 | 2: Create the following table in Hive using the createHiveTable.hql file
30 | 
31 | hive -f createHiveTable.hql
32 | 
33 | 3: Create the following directories:
34 | 
35 |  - /user/root/ss/checkpoint
36 |  - /user/root/ss/input
37 |  - /user/root/ss/results
38 |  - /user/root/ss/tmp
39 | 
40 | 4: Start a generator.  I only have two generators now: HDFS file and socket, with the HDFS file being tested more.  
41 | But this code can be made to support any Spark Streaming Receiver.  Here is how I started my generator:
42 | 
43 | hadoop jar SparkStreamingSessionization.jar com.cloudera.sa.example.sparkstreaming.sessionization.SessionDataFileHDFSWriter /user/root/ss/tmp /user/root/ss/input 120 10000 9990
44 | 
45 | 5: Then start the Spark Streaming process in Yarn with the following cmd
46 | 
47 | spark-submit --jars /opt/cloudera/parcels/CDH/lib/zookeeper/zookeeper-3.4.5-cdh5.1.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/guava-12.0.1.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/protobuf-java-2.5.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar --class com.cloudera.sa.example.sparkstreaming.sessionization.SessionizeData --master yarn --deploy-mode client --executor-memory 512M --num-executors 4 --driver-java-options -Dspark.executor.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* SparkStreamingSessionization.jar newFile hdfs://10.20.194.242/user/root/ss/results stats s hdfs://10.20.194.242/user/root/ss/checkpoint hdfs://10.20.194.242/user/root/ss/input
48 | 
49 | 6: Then I go to hue and I use the following Impala query:
50 | invalidate metadata;
51 | 
52 | select * from hbasetable limit 30;
53 | 
54 | 7: Then I used the graphing functionality in Hue to show the graphs that I included in this project.
55 | 
56 | 
57 |  


--------------------------------------------------------------------------------
/SparkStreamingSessionNotes.txt:
--------------------------------------------------------------------------------
 1 | 10236 Marion Park Dr, Kansas City, MO 64137, USA
 2 | 
 3 | 433 California Street, Suite 1100. San Francisco, CA 94104
 4 | 
 5 | java -cp SparkStreamingSessionization.jar com.cloudera.sa.example.sparkstreaming.sessionization.SessionDataFileWriter 1000000 weblog.txt
 6 | 
 7 | java -cp SparkStreamingSessionization.jar com.cloudera.sa.example.sparkstreaming.sessionization.SessionDataSocketSender 127.0.0.1 42424 1000000
 8 | 
 9 | java -cp SparkStreamingSessionization.jar com.cloudera.sa.example.sparkstreaming.sessionization.SessionDataFileHDFSWriter /user/root/ss/tmp /user/ss/input 20 10000 20000
10 | 
11 | 
12 | HBase Shell
13 | create 'stats', 's'
14 | 
15 | spark-submit --jars /opt/cloudera/parcels/CDH/lib/zookeeper/zookeeper-3.4.5-cdh5.1.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/guava-12.0.1.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/protobuf-java-2.5.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar --class com.cloudera.sa.example.sparkstreaming.sessionization.SessionizeData --master yarn --deploy-mode client --executor-memory 512M --num-executors 4 --driver-java-options -Dspark.executor.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* SparkStreamingSessionization.jar file hdfs://tedmalaska-exp-d-1.ent.cloudera.com/user/root/sessionization/results stats s hdfs://tedmalaska-exp-d-1.ent.cloudera.com/user/root/sessionization/checkpoint hdfs://tedmalaska-exp-d-1.ent.cloudera.com/user/root/sessionization/input/weblog.txt
16 | 
17 | spark-submit --jars /opt/cloudera/parcels/CDH/lib/zookeeper/zookeeper-3.4.5-cdh5.1.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/guava-12.0.1.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/protobuf-java-2.5.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar --class com.cloudera.sa.example.sparkstreaming.sessionization.SessionizeData --master spark://tedmalaska-exp-d-1.ent.cloudera.com:7077 --deploy-mode client --executor-memory 512M --num-executors 4 --driver-java-options -Dspark.executor.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* SparkStreamingSessionization.jar file hdfs://10.20.194.242/user/root/sessionization/results stats s hdfs://10.20.194.242/user/root/sessionization/checkpoint hdfs://10.20.194.242/user/root/sessionization/input/
18 | 
19 | //play
20 | spark-submit --jars /opt/cloudera/parcels/CDH/lib/zookeeper/zookeeper-3.4.5-cdh5.1.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/guava-12.0.1.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/protobuf-java-2.5.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar --class com.cloudera.sa.example.sparkstreaming.sessionization.SessionizeData --master spark://tedmalaska-exp-d-1.ent.cloudera.com:7077 --deploy-mode client --executor-memory 512M --num-executors 4 --driver-java-options -Dspark.executor.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* SparkStreamingSessionization.jar file hdfs://10.20.194.242/user/root/sessionization/results stats s hdfs://10.20.194.242/user/root/sessionization/checkpoint hdfs://10.20.194.242/user/root/
21 | 
22 | spark-submit --jars /opt/cloudera/parcels/CDH/lib/zookeeper/zookeeper-3.4.5-cdh5.1.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/guava-12.0.1.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/protobuf-java-2.5.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar --class com.cloudera.sa.example.sparkstreaming.sessionization.SessionizeData --master spark://tedmalaska-exp-d-1.ent.cloudera.com:7077 --deploy-mode client --executor-memory 512M --num-executors 4 --driver-java-options -Dspark.executor.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* SparkStreamingSessionization.jar hostPort hdfs://tedmalaska-exp-d-1.ent.cloudera.com/user/root/sessionization/results stats s hdfs://tedmalaska-exp-d-1.ent.cloudera.com/user/root/sessionization/checkpoint 127.0.0.1 42424
23 | 
24 | 
25 | //File Test
26 | hadoop jar SparkStreamingSessionization.jar com.cloudera.sa.example.sparkstreaming.sessionization.SessionDataFileHDFSWriter /user/root/ss/tmp /user/root/ss/input 40 10000 9990
27 | 
28 | spark-submit --jars /opt/cloudera/parcels/CDH/lib/zookeeper/zookeeper-3.4.5-cdh5.1.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/guava-12.0.1.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/protobuf-java-2.5.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar --class com.cloudera.sa.example.sparkstreaming.sessionization.SessionizeData --master spark://tedmalaska-exp-d-1.ent.cloudera.com:7077 --deploy-mode client --executor-memory 512M --num-executors 4 --driver-java-options -Dspark.executor.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* SparkStreamingSessionization.jar newFile hdfs://10.20.194.242/user/root/ss/results stats s hdfs://10.20.194.242/user/root/ss/checkpoint hdfs://10.20.194.242/user/root/ss/input
29 | 
30 | spark-submit --jars /opt/cloudera/parcels/CDH/lib/zookeeper/zookeeper-3.4.5-cdh5.1.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/guava-12.0.1.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/protobuf-java-2.5.0.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop2-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar,/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar,/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar --class com.cloudera.sa.example.sparkstreaming.sessionization.SessionizeData --master yarn --deploy-mode client --executor-memory 512M --num-executors 4 --driver-java-options -Dspark.executor.extraClassPath=/opt/cloudera/parcels/CDH/lib/hbase/lib/* SparkStreamingSessionization.jar newFile hdfs://10.20.194.242/user/root/ss/results stats s hdfs://10.20.194.242/user/root/ss/checkpoint hdfs://10.20.194.242/user/root/ss/input
31 | 
32 | //create table
33 | CREATE EXTERNAL TABLE hbaseTable (
34 |   id string,
35 |   ONE_TO_TEN_MINUTE_COUNT string,
36 |   OVER_TEN_MINUTES_COUNT string,
37 |   TOTAL_SESSION_COUNTS string,
38 |   TOTAL_SESSION_TIME string,
39 |   UNDER_A_MINUTE_COUNT string,
40 |   NEW_SESSION_COUNTS string,
41 |   EVENT_COUNTS string,
42 |   DEAD_SESSION_COUNTS string)
43 | STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
44 | WITH SERDEPROPERTIES (
45 |   "hbase.columns.mapping" =
46 |   ":key,s:ONE_TO_TEN_MINUTE_COUNT,s:OVER_TEN_MINUTES_COUNT,s:TOTAL_SESSION_COUNTS,
47 |   s:TOTAL_SESSION_TIME,s:UNDER_A_MINUTE_COUNT,s:NEW_SESSION_COUNTS,s:EVENT_COUNTS,
48 |   s:DEAD_SESSION_COUNTS"
49 | )
50 | TBLPROPERTIES("hbase.table.name" = "stats");
51 | 
52 | 
53 |  SparkStreamingSessionization.jar  


--------------------------------------------------------------------------------
/createHiveTable.hql:
--------------------------------------------------------------------------------
 1 | -- Hive external table exposing the HBase table stats (column family s).
 2 | -- The id column maps to the HBase row key and every other column maps to the
 3 | -- qualifier of the same name in column family s.
 4 | -- The hbase.columns.mapping value is kept on one line with no whitespace
 5 | -- between entries because the Hive HBase integration documentation warns that
 6 | -- whitespace there is interpreted as part of the column name.
 7 | CREATE EXTERNAL TABLE hbaseTable (
 8 |   id string,
 9 |   ONE_TO_TEN_MINUTE_COUNT string,
10 |   OVER_TEN_MINUTES_COUNT string,
11 |   TOTAL_SESSION_COUNTS string,
12 |   TOTAL_SESSION_TIME string,
13 |   UNDER_A_MINUTE_COUNT string,
14 |   NEW_SESSION_COUNTS string,
15 |   EVENT_COUNTS string,
16 |   DEAD_SESSION_COUNTS string,
17 |   TOTAL_SESSION_EVENT_COUNTS string)
18 | STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
19 | WITH SERDEPROPERTIES (
20 |   "hbase.columns.mapping" = ":key,s:ONE_TO_TEN_MINUTE_COUNT,s:OVER_TEN_MINUTES_COUNT,s:TOTAL_SESSION_COUNTS,s:TOTAL_SESSION_TIME,s:UNDER_A_MINUTE_COUNT,s:NEW_SESSION_COUNTS,s:EVENT_COUNTS,s:DEAD_SESSION_COUNTS,s:TOTAL_SESSION_EVENT_COUNTS"
21 |   )
22 | TBLPROPERTIES("hbase.table.name" = "stats");


--------------------------------------------------------------------------------
/hbaseScript.txt:
--------------------------------------------------------------------------------
1 | create 'stats', 's'


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 | 	<modelVersion>4.0.0</modelVersion>
  4 | 
  5 | 	<groupId>com.cloudera.sa</groupId>
  6 | 	<artifactId>example.sparkstreaming.sessionization</artifactId>
  7 | 	<version>0.0.1-SNAPSHOT</version>
  8 | 	<packaging>jar</packaging>
  9 | 
 10 | 	<name>example.sparkstreaming.sessionization</name>
 11 | 	<url>http://maven.apache.org</url>
 12 | 
 13 | 	<properties>
 14 | 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 15 | 	</properties>
 16 | 
 17 | 	<dependencies>
 18 | 
 19 | 		<dependency>
 20 | 			<groupId>com.cloudera.sa</groupId>
 21 | 			<artifactId>spark.hbase</artifactId>
 22 | 			<version>0.0.1-SNAPSHOT</version>
 23 | 		</dependency>
 24 | 		<dependency>
 25 | 			<groupId>org.scala-lang</groupId>
 26 | 			<artifactId>scala-compiler</artifactId>
 27 | 			<version>2.10.4</version>
 28 | 		</dependency>
 29 | 		<dependency>
 30 | 			<groupId>org.scalatest</groupId>
 31 | 			<artifactId>scalatest_2.10</artifactId>
 32 | 			<version>2.1.5</version>
 33 | 		</dependency>
 34 | 		<dependency>
 35 | 			<groupId>org.apache.spark</groupId>
 36 | 			<artifactId>spark-core_2.10</artifactId>
 37 | 			<version>1.0.0-cdh5.1.0</version>
 38 | 		</dependency>
 39 | 		<dependency>
 40 | 			<groupId>org.apache.spark</groupId>
 41 | 			<artifactId>spark-streaming_2.10</artifactId>
 42 | 			<version>1.0.0-cdh5.1.0</version>
 43 | 			<type>test-jar</type>
 44 | 			<classifier>tests</classifier>
 45 | 			<scope>test</scope>
 46 | 		</dependency>
 47 | 		<dependency>
 48 | 			<groupId>org.apache.spark</groupId>
 49 | 			<artifactId>spark-streaming_2.10</artifactId>
 50 | 			<version>1.0.0-cdh5.1.0</version>
 51 | 		</dependency>
 52 | 		<dependency>
 53 | 			<groupId>org.apache.hbase</groupId>
 54 | 			<artifactId>hbase-client</artifactId>
 55 | 			<version>0.98.1-cdh5.1.0</version>
 56 | 		</dependency>
 57 | 		<dependency>
 58 | 			<groupId>org.apache.hbase</groupId>
 59 | 			<artifactId>hbase-client</artifactId>
 60 | 			<version>0.98.1-cdh5.1.0</version>
 61 | 			<type>test-jar</type>
 62 | 			<classifier>tests</classifier>
 63 | 			<scope>test</scope>
 64 | 		</dependency>
 65 | 		<dependency>
 66 | 			<groupId>org.apache.hbase</groupId>
 67 | 			<artifactId>hbase-server</artifactId>
 68 | 			<version>0.98.1-cdh5.1.0</version>
 69 | 		</dependency>
 70 | 		<dependency>
 71 | 			<groupId>org.apache.hbase</groupId>
 72 | 			<artifactId>hbase-server</artifactId>
 73 | 			<version>0.98.1-cdh5.1.0</version>
 74 | 			<type>test-jar</type>
 75 | 			<classifier>tests</classifier>
 76 | 		</dependency>
 77 | 		<dependency>
 78 | 			<groupId>org.apache.hbase</groupId>
 79 | 			<artifactId>hbase-protocol</artifactId>
 80 | 			<version>0.98.1-cdh5.1.0</version>
 81 | 		</dependency>
 82 | 		<dependency>
 83 | 			<groupId>org.apache.hbase</groupId>
 84 | 			<artifactId>hbase-hadoop2-compat</artifactId>
 85 | 			<version>0.98.1-cdh5.1.0</version>
 86 | 			<!-- <scope>runtime</scope> -->
 87 | 		</dependency>
 88 | 		<dependency>
 89 | 			<groupId>org.apache.hbase</groupId>
 90 | 			<artifactId>hbase-hadoop2-compat</artifactId>
 91 | 			<version>0.98.1-cdh5.1.0</version>
 92 | 			<type>test-jar</type>
 93 | 			<classifier>tests</classifier>
 94 | 			<scope>test</scope>
 95 | 		</dependency>
 96 | 		<dependency>
 97 | 			<groupId>org.apache.hbase</groupId>
 98 | 			<artifactId>hbase-common</artifactId>
 99 | 			<version>0.98.1-cdh5.1.0</version>
100 | 		</dependency>
101 | 		<dependency>
102 | 			<groupId>org.apache.hbase</groupId>
103 | 			<artifactId>hbase-common</artifactId>
104 | 			<version>0.98.1-cdh5.1.0</version>
105 | 			<type>test-jar</type>
106 | 			<classifier>tests</classifier>
107 | 			<scope>test</scope>
108 | 		</dependency>
109 | 		<dependency>
110 | 			<groupId>org.apache.hbase</groupId>
111 | 			<artifactId>hbase-hadoop-compat</artifactId>
112 | 			<version>0.98.1-cdh5.1.0</version>
113 | 			<scope>test</scope>
114 | 		</dependency>
115 | 		<dependency>
116 | 			<groupId>org.apache.hbase</groupId>
117 | 			<artifactId>hbase-hadoop-compat</artifactId>
118 | 			<version>0.98.1-cdh5.1.0</version>
119 | 			<type>test-jar</type>
120 | 			<classifier>tests</classifier>
121 | 			<scope>test</scope>
122 | 		</dependency>
123 | 	</dependencies>
124 | 	<repositories>
125 | 		<repository>
126 | 			<id>maven-hadoop</id>
127 | 			<name>Hadoop Releases</name>
128 | 			<url>https://repository.cloudera.com/content/repositories/releases/</url>
129 | 		</repository>
130 | 	</repositories>
131 | 
132 | 	<build>
133 | 		<outputDirectory>target/scala/classes</outputDirectory>
134 | 		<testOutputDirectory>target/scala/test-classes</testOutputDirectory>
135 | 		<plugins>
136 | 			<plugin>
137 | 				<groupId>org.apache.maven.plugins</groupId>
138 | 				<artifactId>maven-surefire-plugin</artifactId>
139 | 				<version>2.17</version>
140 | 				<dependencies>
141 | 					<dependency>
142 | 						<groupId>org.apache.maven.surefire</groupId>
143 | 						<artifactId>surefire-junit47</artifactId>
144 | 						<version>2.17</version>
145 | 					</dependency>
146 | 				</dependencies>
147 | 			</plugin>
148 | 			<plugin>
149 | 				<groupId>org.scalatest</groupId>
150 | 				<artifactId>scalatest-maven-plugin</artifactId>
151 | 				<version>1.0</version>
152 | 				<configuration>
153 | 					<junitxml>.</junitxml>
154 | 				</configuration>
155 | 				<executions>
156 | 					<execution>
157 | 						<id>test</id>
158 | 						<goals>
159 | 							<goal>test</goal>
160 | 						</goals>
161 | 					</execution>
162 | 				</executions>
163 | 			</plugin>
164 | 
165 | 			<plugin>
166 | 				<groupId>org.scala-tools</groupId>
167 | 				<artifactId>maven-scala-plugin</artifactId>
168 | 				<executions>
169 | 					<execution>
170 | 						<id>compile</id>
171 | 						<goals>
172 | 							<goal>compile</goal>
173 | 						</goals>
174 | 						<phase>compile</phase>
175 | 					</execution>
176 | 					<execution>
177 | 						<id>test-compile</id>
178 | 						<goals>
179 | 							<goal>testCompile</goal>
180 | 						</goals>
181 | 						<phase>test-compile</phase>
182 | 					</execution>
183 | 					<execution>
184 | 						<phase>process-resources</phase>
185 | 						<goals>
186 | 							<goal>compile</goal>
187 | 						</goals>
188 | 					</execution>
189 | 				</executions>
190 | 			</plugin>
191 | 			<plugin>
192 | 				<groupId>org.apache.maven.plugins</groupId>
193 | 				<artifactId>maven-shade-plugin</artifactId>
194 | 				<version>2.2</version>
195 | 				<configuration>
196 | 					<shadedArtifactAttached>false</shadedArtifactAttached>
197 | 					<outputFile>target/SparkStreamingSessionization.jar</outputFile>
198 | 					<artifactSet>
199 | 						<includes>
200 | 							<include>*:*</include>
201 | 						</includes>
202 | 					</artifactSet>
203 | 					<filters>
204 | 						<filter>
205 | 							<artifact>*:*</artifact>
206 | 							<excludes>
207 | 								<exclude>META-INF/*.SF</exclude>
208 | 								<exclude>META-INF/*.DSA</exclude>
209 | 								<exclude>META-INF/*.RSA</exclude>
210 | 							</excludes>
211 | 						</filter>
212 | 					</filters>
213 | 				</configuration>
214 | 				<executions>
215 | 					<execution>
216 | 						<phase>package</phase>
217 | 						<goals>
218 | 							<goal>shade</goal>
219 | 						</goals>
220 | 						<configuration>
221 | 							<transformers>
222 | 								<transformer
223 | 									implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
224 | 								<transformer
225 | 									implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
226 | 									<resource>reference.conf</resource>
227 | 								</transformer>
228 | 							</transformers>
229 | 						</configuration>
230 | 					</execution>
231 | 				</executions>
232 | 			</plugin>
233 | 		</plugins>
234 | 	</build>
235 | 
236 | </project>
237 | 


--------------------------------------------------------------------------------
/src/main/scala/com/cloudera/sa/example/sparkstreaming/sessionization/SessionDataFileHDFSWriter.scala:
--------------------------------------------------------------------------------
 1 | package com.cloudera.sa.example.sparkstreaming.sessionization
 2 | 
 3 | import java.io.BufferedWriter
 4 | import java.io.FileWriter
 5 | import org.apache.hadoop.fs.FileSystem
 6 | import org.apache.hadoop.conf.Configuration
 7 | import java.io.OutputStreamWriter
 8 | import org.apache.hadoop.fs.Path
 9 | import java.util.Random
10 | 
11 | /**
12 |  * Command line tool that generates synthetic web log files on HDFS.
13 |  *
14 |  * Every file is written under the temp directory first and is renamed into
15 |  * the destination directory only after it has been closed.
16 |  */
17 | object SessionDataFileHDFSWriter {
18 | 
19 |   val eol = System.getProperty("line.separator")
20 | 
21 |   /**
22 |    * Entry point.
23 |    *
24 |    * @param args {tempDir} {distDir} {numberOfFiles} {numberOfEventsPerFile}
25 |    *             {waitBetweenFiles}; the wait is handed to Thread.sleep, so
26 |    *             it is expressed in milliseconds
27 |    */
28 |   def main(args: Array[String]): Unit = {
29 |     //All five positional arguments are read below, so anything shorter is
30 |     //a usage error rather than an ArrayIndexOutOfBoundsException
31 |     if (args.length < 5) {
32 |       println("SessionDataFileHDFSWriter {tempDir} {distDir} {numberOfFiles} {numberOfEventsPerFile} {waitBetweenFiles}")
33 |       return
34 |     }
35 |     val conf = new Configuration
36 |     conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))
37 |     conf.addResource(new Path("/etc/hadoop/conf/mapred-site.xml"))
38 |     conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"))
39 | 
40 |     //Use the configuration populated above.  A fresh Configuration would
41 |     //not contain the site files that were just added to conf
42 |     val fs = FileSystem.get(conf)
43 |     val rootTempDir = args(0)
44 |     val rootDistDir = args(1)
45 |     val files = args(2).toInt
46 |     val loops = args(3).toInt
47 |     val waitBetweenFiles = args(4).toInt
48 |     val r = new Random
49 |     for (f <- 1 to files) {
50 |       val rootName = "/weblog." + System.currentTimeMillis()
51 |       val tmpPath = new Path(rootTempDir + rootName + ".tmp")
52 |       val writer = new BufferedWriter(new OutputStreamWriter(fs.create(tmpPath), "UTF-8"))
53 | 
54 |       print(f + ": [")
55 | 
56 |       //Between loops and 2 * loops - 1 events per file.  Random.nextInt
57 |       //rejects a non positive bound, so only randomize a positive count
58 |       val randomLoops = if (loops > 0) loops + r.nextInt(loops) else 0
59 | 
60 |       try {
61 |         for (i <- 1 to randomLoops) {
62 |           writer.write(SessionDataGenerator.getNextEvent + eol)
63 |           //One progress dot per hundred events
64 |           if (i % 100 == 0) {
65 |             print(".")
66 |           }
67 |         }
68 |         println("]")
69 |       } finally {
70 |         //Release the HDFS output stream even when writing fails
71 |         writer.close()
72 |       }
73 | 
74 |       val distPath = new Path(rootDistDir + rootName + ".dat")
75 | 
76 |       //rename reports failure through its return value, not an exception
77 |       if (!fs.rename(tmpPath, distPath)) {
78 |         System.err.println("Unable to move " + tmpPath + " to " + distPath)
79 |       }
80 |       Thread.sleep(waitBetweenFiles)
81 |     }
82 |     println("Done")
83 |   }
84 | }


--------------------------------------------------------------------------------
/src/main/scala/com/cloudera/sa/example/sparkstreaming/sessionization/SessionDataFileWriter.scala:
--------------------------------------------------------------------------------
 1 | package com.cloudera.sa.example.sparkstreaming.sessionization
 2 | 
 3 | import java.io.BufferedWriter
 4 | import java.io.FileWriter
 5 | 
 6 | /**
 7 |  * Command line tool that writes generated web log events to a local file.
 8 |  */
 9 | object SessionDataFileWriter {
10 | 
11 |   val eol = System.getProperty("line.separator")
12 | 
13 |   /**
14 |    * Entry point.
15 |    *
16 |    * @param args {numberOfRecords} {outputFile}
17 |    */
18 |   def main(args: Array[String]): Unit = {
19 |     //Both positional arguments are read below
20 |     if (args.length < 2) {
21 |       println("SessionDataFileWriter {numberOfRecords} {outputFile} ")
22 |       return
23 |     }
24 | 
25 |     //Parse the count before opening the file so that a malformed number
26 |     //does not leave an empty, never closed output file behind
27 |     val loops = args(0).toInt
28 |     val writer = new BufferedWriter(new FileWriter(args(1)))
29 | 
30 |     try {
31 |       for (i <- 1 to loops) {
32 |         writer.write(SessionDataGenerator.getNextEvent + eol)
33 |       }
34 |     } finally {
35 |       //Close, and thereby flush, even when generating or writing fails
36 |       writer.close()
37 |     }
38 |   }
39 | }


--------------------------------------------------------------------------------
/src/main/scala/com/cloudera/sa/example/sparkstreaming/sessionization/SessionDataGenerator.scala:
--------------------------------------------------------------------------------
 1 | package com.cloudera.sa.example.sparkstreaming.sessionization
 2 | 
 3 | import scala.util.Random
 4 | import java.util.ArrayList
 5 | import java.text.SimpleDateFormat
 6 | import java.util.Date
 7 | 
 8 | 
 9 | /**
10 |  * Produces synthetic access log lines for a population of users whose
11 |  * active subset is re-drawn at the end of every interval.
12 |  */
13 | object SessionDataGenerator {
14 | 
15 |   val r = new Random
16 | 
17 |   val numberOfUsers = 100000
18 | 
19 |   val intervalLength = 1000000
20 | 
21 |   //A user is active during an interval when the absolute value of a
22 |   //gaussian draw is below this threshold
23 |   val userbeingActivePercentage = 0.15
24 | 
25 |   val webSites = List("support.html","about.html","foo.html", "bar.html", "home.html", "search.html", "list.html", "help.html", "bar.html", "foo.html")
26 | 
27 |   var activeUserList = new ArrayList[Int]()
28 | 
29 |   var counter = 0
30 |   var currentIntervalLength = nextIntervalLength
31 |   val dateFormat = new SimpleDateFormat("dd/MMM/yyyy HH:mm:ss Z")
32 | 
33 |   /**
34 |    * Draws the number of events in the next interval.
35 |    *
36 |    * The raw gaussian draw can be negative or truncate to zero.  A negative
37 |    * length behaves like its absolute value in the modulo test below, but a
38 |    * zero would make that test divide by zero, so the result is at least 1.
39 |    */
40 |   private def nextIntervalLength: Int =
41 |     math.max(1, math.abs((intervalLength * r.nextGaussian).toInt))
42 | 
43 |   /**
44 |    * Builds the next log line for a randomly chosen active user.
45 |    *
46 |    * @return one access log line without a trailing line separator
47 |    */
48 |   def getNextEvent: String = {
49 |     if (counter == 0 || counter % currentIntervalLength == 0) {
50 |       //We are at the end of an interval: pick a new length and re-draw
51 |       //the set of active users
52 |       currentIntervalLength = nextIntervalLength
53 | 
54 |       activeUserList = new ArrayList[Int]()
55 | 
56 |       for (i <- 1 to numberOfUsers) {
57 |         if (Math.abs(r.nextGaussian) < userbeingActivePercentage) {
58 |           activeUserList.add(i)
59 |         }
60 |       }
61 | 
62 |       //Random.nextInt(0) throws, so never leave the list empty
63 |       if (activeUserList.isEmpty) {
64 |         activeUserList.add(1 + r.nextInt(numberOfUsers))
65 |       }
66 |     }
67 | 
68 |     counter += 1
69 | 
70 |     val user = activeUserList.get(r.nextInt(activeUserList.size))
71 | 
72 |     //Spread the user id over three octets so that each stays within 0..255.
73 |     //Ids below 65536 keep the 66.249.x.y form; higher ids continue in
74 |     //66.250.x.y and so on (valid up to 458751 users)
75 |     val ipPart2 = 249 + user / 65536
76 |     val ipPart3 = (user / 256) % 256
77 |     val ipPart4 = user % 256
78 | 
79 |     val page = webSites(r.nextInt(webSites.size))
80 | 
81 |     "66." + ipPart2 + "." + ipPart3 + "." + ipPart4 + " - - [" + dateFormat.format(new Date()) + "] \"GET /" + page + " HTTP/1.1\" 200 11179 \"-\" \"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\""
82 |   }
83 | }


--------------------------------------------------------------------------------
/src/main/scala/com/cloudera/sa/example/sparkstreaming/sessionization/SessionDataSocketSender.scala:
--------------------------------------------------------------------------------
 1 | package com.cloudera.sa.example.sparkstreaming.sessionization
 2 | 
 3 | import java.net.Socket
 4 | import java.io.OutputStreamWriter
 5 | 
 6 | /**
 7 |  * Command line tool that sends generated web log events over a TCP socket.
 8 |  */
 9 | object SessionDataSocketSender {
10 | 
11 |   val eol = System.getProperty("line.separator")
12 | 
13 |   /**
14 |    * Entry point.
15 |    *
16 |    * @param args {host} {port} {loops} {waitTime}; waitTime is the pause
17 |    *             after each event, in milliseconds
18 |    */
19 |   def main(args: Array[String]): Unit = {
20 |     //All four positional arguments are read below
21 |     if (args.length < 4) {
22 |       println("SessionDataSocketSender {host} {port} {loops} {waitTime}")
23 |       return
24 |     }
25 | 
26 |     val host = args(0)
27 |     val port = args(1).toInt
28 |     val loops = args(2).toInt
29 |     val waitTime = args(3).toInt
30 | 
31 |     val socket = new Socket(host, port)
32 | 
33 |     try {
34 |       val writer = new OutputStreamWriter(socket.getOutputStream(), "UTF-8")
35 | 
36 |       for (i <- 1 to loops) {
37 |         writer.write(SessionDataGenerator.getNextEvent + eol)
38 |         //Push the event out now instead of leaving it in the writer's
39 |         //buffer until close
40 |         writer.flush()
41 |         //Object.wait requires owning the monitor and would throw
42 |         //IllegalMonitorStateException here; a plain pause is Thread.sleep
43 |         Thread.sleep(waitTime)
44 |       }
45 | 
46 |       writer.close()
47 |     } finally {
48 |       //Closing an already closed socket is a no-op, so this is safe after
49 |       //writer.close() and releases the connection on failure
50 |       socket.close()
51 |     }
52 |   }
53 | }


--------------------------------------------------------------------------------
/src/main/scala/com/cloudera/sa/example/sparkstreaming/sessionization/SessionizeData.scala:
--------------------------------------------------------------------------------
  1 | package com.cloudera.sa.example.sparkstreaming.sessionization
  2 | 
  3 | import org.apache.spark.SparkConf
  4 | import org.apache.spark.SparkContext
  5 | import org.apache.spark.streaming._
  6 | import org.apache.spark.streaming.dstream.PairDStreamFunctions
  7 | import org.apache.spark.streaming.StreamingContext._
  8 | import org.apache.spark.streaming.Seconds
  9 | import org.apache.spark.streaming.dstream.DStream
 10 | import org.apache.hadoop.io.LongWritable
 11 | import java.text.SimpleDateFormat
 12 | import org.apache.hadoop.hbase.HBaseConfiguration
 13 | import org.apache.spark.hbase.HBaseContext
 14 | import org.apache.hadoop.fs.Path
 15 | import org.apache.hadoop.hbase.client.Put
 16 | import org.apache.hadoop.hbase.util.Bytes
 17 | import org.apache.spark.streaming.dstream.FileInputDStream
 18 | import org.apache.hadoop.io.Text
 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
 20 | import scala.collection.immutable.HashMap
 21 | import java.util.Date
 22 | 
 23 | /**
 24 |  * Spark Streaming job that sessionizes web log events by IP address.
 25 |  *
 26 |  * For every micro batch it maintains a stateful stream of sessions, writes
 27 |  * one row of session statistics to HBase and saves each event, prefixed
 28 |  * with the start time of its session, as text files under the output
 29 |  * directory.
 30 |  */
 31 | object SessionizeData {
 32 | 
 33 |   val OUTPUT_ARG = 1
 34 |   val HTABLE_ARG = 2
 35 |   val HFAMILY_ARG = 3
 36 |   val CHECKPOINT_DIR_ARG = 4
 37 |   val FIXED_ARGS = 5
 38 | 
 39 |   //Milliseconds without an event after which a session counts as over
 40 |   val SESSION_TIMEOUT = (60000 * 0.5).toInt
 41 | 
 42 |   //Length of one micro batch in seconds
 43 |   private val BATCH_SECONDS = 10
 44 |   //One micro batch plus one second of slack, in milliseconds.  Used by
 45 |   //every "did this happen during the last micro batch" check
 46 |   private val BATCH_WINDOW_MS = BATCH_SECONDS * 1000 + 1000
 47 | 
 48 |   val TOTAL_SESSION_TIME = "TOTAL_SESSION_TIME"
 49 |   val UNDER_A_MINUTE_COUNT = "UNDER_A_MINUTE_COUNT"
 50 |   val ONE_TO_TEN_MINUTE_COUNT = "ONE_TO_TEN_MINUTE_COUNT"
 51 |   val OVER_TEN_MINUTES_COUNT = "OVER_TEN_MINUTES_COUNT"
 52 |   val NEW_SESSION_COUNTS = "NEW_SESSION_COUNTS"
 53 |   val TOTAL_SESSION_COUNTS = "TOTAL_SESSION_COUNTS"
 54 |   val EVENT_COUNTS = "EVENT_COUNTS"
 55 |   val DEAD_SESSION_COUNTS = "DEAD_SESSION_COUNTS"
 56 |   val REVISTE_COUNT = "REVISTE_COUNT"
 57 |   val TOTAL_SESSION_EVENT_COUNTS = "TOTAL_SESSION_EVENT_COUNTS"
 58 | 
 59 |   private val DATE_PATTERN = "dd/MMM/yyyy HH:mm:ss Z"
 60 | 
 61 |   val dateFormat = new SimpleDateFormat(DATE_PATTERN)
 62 | 
 63 |   //SimpleDateFormat is not thread safe and the closures below run in
 64 |   //several task threads of the same JVM, so they use one formatter per
 65 |   //thread instead of the shared dateFormat above
 66 |   private val threadDateFormat = new ThreadLocal[SimpleDateFormat] {
 67 |     override def initialValue(): SimpleDateFormat = new SimpleDateFormat(DATE_PATTERN)
 68 |   }
 69 | 
 70 |   //Prints the command line help.  Only the source types that main
 71 |   //actually accepts are listed
 72 |   private def printUsage(): Unit = {
 73 |     println("SessionizeData {sourceType} {outputDir} {source information}")
 74 |     println("SessionizeData newFile {outputDir} {table} {family}  {hdfs checkpoint directory} {source file}")
 75 |     println("SessionizeData socket {outputDir} {table} {family}  {hdfs checkpoint directory} {host} {port}")
 76 |   }
 77 | 
 78 |   /**
 79 |    * Entry point.
 80 |    *
 81 |    * @param args {sourceType} {outputDir} {table} {family} {checkpointDir}
 82 |    *             followed by {host} {port} for "socket" or by {directory}
 83 |    *             for "newFile"
 84 |    * @throws RuntimeException when sourceType is neither socket nor newFile
 85 |    */
 86 |   def main(args: Array[String]): Unit = {
 87 |     //socket needs host and port after the fixed arguments, every other
 88 |     //source type needs one extra argument
 89 |     val requiredArgs =
 90 |       if (args.length > 0 && args(0).equals("socket")) FIXED_ARGS + 2
 91 |       else FIXED_ARGS + 1
 92 |     if (args.length < requiredArgs) {
 93 |       printUsage()
 94 |       return
 95 |     }
 96 | 
 97 |     val outputDir = args(OUTPUT_ARG)
 98 |     val hTableName = args(HTABLE_ARG)
 99 |     val hFamily = args(HFAMILY_ARG)
100 |     val checkpointDir = args(CHECKPOINT_DIR_ARG)
101 | 
102 |     //This is just creating a Spark Config object.  Nothing much happens
103 |     //here besides setting the app name and the cleaner ttl.
104 |     val sparkConf = new SparkConf().
105 |       setAppName("SessionizeData " + args(0)).
106 |       set("spark.cleaner.ttl", "120000")
107 | 
108 |     //These two lines get us our SparkContext and our StreamingContext.
109 |     //These objects have all the root functionality we need to get started.
110 |     val sc = new SparkContext(sparkConf)
111 |     val ssc = new StreamingContext(sc, Seconds(BATCH_SECONDS))
112 | 
113 |     //Here we are loading our HBase Configuration object.  It carries the
114 |     //information needed to connect to the HBase cluster.
115 |     val conf = HBaseConfiguration.create()
116 |     conf.addResource(new Path("/etc/hbase/conf/core-site.xml"))
117 |     conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"))
118 | 
119 |     //HBaseContext is from the SparkOnHBase project which can be found at
120 |     // https://github.com/tmalaska/SparkOnHBase
121 |     val hbaseContext = new HBaseContext(sc, conf)
122 | 
123 |     //The root DStream of raw log lines, populated either from a socket
124 |     //or from new files that show up in a directory
125 |     val lines: DStream[String] =
126 |       if (args(0).equals("socket")) {
127 |         val host = args(FIXED_ARGS)
128 |         val port = args(FIXED_ARGS + 1)
129 | 
130 |         println("host:" + host)
131 |         println("port:" + Integer.parseInt(port))
132 | 
133 |         //Simple example of how you set up a receiver from a Socket Stream
134 |         ssc.socketTextStream(host, port.toInt)
135 |       } else if (args(0).equals("newFile")) {
136 | 
137 |         val directory = args(FIXED_ARGS)
138 |         println("directory:" + directory)
139 | 
140 |         //Simple example of how you set up a receiver from a HDFS folder
141 |         ssc.fileStream[LongWritable, Text, TextInputFormat](directory, (t: Path) => true, true).map(_._2.toString)
142 |       } else {
143 |         throw new RuntimeException("bad input type")
144 |       }
145 | 
146 |     val ipKeyLines = lines.map[(String, (Long, Long, String))](eventRecord => {
147 |       //Get the time and ip address out of the original event
148 |       val time = threadDateFormat.get().parse(
149 |         eventRecord.substring(eventRecord.indexOf('[') + 1, eventRecord.indexOf(']'))).
150 |         getTime()
151 |       val ipAddress = eventRecord.substring(0, eventRecord.indexOf(' '))
152 | 
153 |       //We return the time twice because the first is used as the start
154 |       //time and the second as the end time
155 |       (ipAddress, (time, time, eventRecord))
156 |     })
157 | 
158 |     val latestSessionInfo = ipKeyLines.
159 |       map[(String, (Long, Long, Long))](a => {
160 |         //transform to (ipAddress, (time, time, counter))
161 |         (a._1, (a._2._1, a._2._2, 1))
162 |       }).
163 |       reduceByKey((a, b) => {
164 |         //transform to (ipAddress, (lowestStartTime, MaxFinishTime, sumOfCounter))
165 |         (Math.min(a._1, b._1), Math.max(a._2, b._2), a._3 + b._3)
166 |       }).
167 |       updateStateByKey(updateStatbyOfSessions)
168 | 
169 |     //remove old sessions
170 |     val onlyActiveSessions = latestSessionInfo.filter(t => System.currentTimeMillis() - t._2._2 < SESSION_TIMEOUT)
171 |     val totals = onlyActiveSessions.mapPartitions[(Long, Long, Long, Long)](it =>
172 |       {
173 |         var totalSessionTime: Long = 0
174 |         var underAMinuteCount: Long = 0
175 |         var oneToTenMinuteCount: Long = 0
176 |         var overTenMinutesCount: Long = 0
177 | 
178 |         it.foreach(a => {
179 |           val time = a._2._2 - a._2._1
180 |           totalSessionTime += time
181 |           if (time < 60000) underAMinuteCount += 1
182 |           else if (time < 600000) oneToTenMinuteCount += 1
183 |           else overTenMinutesCount += 1
184 |         })
185 | 
186 |         Iterator((totalSessionTime, underAMinuteCount, oneToTenMinuteCount, overTenMinutesCount))
187 |       }, true).reduce((a, b) => {
188 |         //totalSessionTime, underAMinuteCount, oneToTenMinuteCount, overTenMinutesCount
189 |       (a._1 + b._1, a._2 + b._2, a._3 + b._3, a._4 + b._4)
190 |     }).map[HashMap[String, Long]](t => HashMap(
191 |         (TOTAL_SESSION_TIME, t._1),
192 |         (UNDER_A_MINUTE_COUNT, t._2),
193 |         (ONE_TO_TEN_MINUTE_COUNT, t._3),
194 |         (OVER_TEN_MINUTES_COUNT, t._4)))
195 | 
196 |     val newSessionCount = onlyActiveSessions.filter(t => {
197 |         //is the session newer than the last micro batch
198 |         //and is the boolean saying this is a new session true
199 |         (System.currentTimeMillis() - t._2._2 < BATCH_WINDOW_MS && t._2._4)
200 |       }).
201 |       count.
202 |       map[HashMap[String, Long]](t => HashMap((NEW_SESSION_COUNTS, t)))
203 | 
204 |     val totalSessionCount = onlyActiveSessions.
205 |       count.
206 |       map[HashMap[String, Long]](t => HashMap((TOTAL_SESSION_COUNTS, t)))
207 | 
208 |     //Sum of the event counters of all active sessions.  The reduce already
209 |     //yields the sum as a single element; counting that stream afterwards
210 |     //would report the number of elements (1) instead of the sum
211 |     val totalSessionEventCount = onlyActiveSessions.map(a => a._2._3).reduce((a, b) => a + b).
212 |       map[HashMap[String, Long]](t => HashMap((TOTAL_SESSION_EVENT_COUNTS, t)))
213 | 
214 |     val totalEventsCount = ipKeyLines.count.map[HashMap[String, Long]](t => HashMap((EVENT_COUNTS, t)))
215 | 
216 |     //Sessions that crossed the timeout during the last micro batch
217 |     val deadSessionsCount = latestSessionInfo.filter(t => {
218 |       val gapTime = System.currentTimeMillis() - t._2._2
219 |       gapTime > SESSION_TIMEOUT && gapTime < SESSION_TIMEOUT + BATCH_WINDOW_MS
220 |     }).count.map[HashMap[String, Long]](t => HashMap((DEAD_SESSION_COUNTS, t)))
221 | 
222 |     //Merge the single element count streams into one map per micro batch
223 |     val allCounts = newSessionCount.
224 |       union(totalSessionCount).
225 |       union(totals).
226 |       union(totalEventsCount).
227 |       union(deadSessionsCount).
228 |       union(totalSessionEventCount).
229 |       reduce((a, b) => b ++ a)
230 | 
231 |     hbaseContext.streamBulkPut[HashMap[String, Long]](
232 |       allCounts, //The input DStream
233 |       hTableName, //The name of the table we want to put to
234 |       (t) => {
235 |         //Here we are converting our input record into a put
236 |         //The rowKey is C for Count and a backward counting time so the
237 |         //newest counts show up first in HBase's sorted order
238 |         val put = new Put(Bytes.toBytes("C." + (Long.MaxValue - System.currentTimeMillis())))
239 |         //We are iterating through the HashMap to make all the columns with their counts
240 |         t.foreach(kv => put.add(Bytes.toBytes(hFamily), Bytes.toBytes(kv._1), Bytes.toBytes(kv._2.toString)))
241 |         put
242 |       },
243 |       false)
244 | 
245 |     //Persist to HDFS
246 |     ipKeyLines.join(onlyActiveSessions).
247 |       map(t => {
248 |         //Session root start time | Event message
249 |         threadDateFormat.get().format(new Date(t._2._2._1)) + "\t" + t._2._1._3
250 |       }).
251 |       saveAsTextFiles(outputDir + "/session", "txt")
252 | 
253 |     ssc.checkpoint(checkpointDir)
254 | 
255 |     ssc.start()
256 |     ssc.awaitTermination()
257 |   }
258 | 
259 |   /**
260 |    * State update function handed to updateStateByKey, keyed by ipAddress.
261 |    *
262 |    * The goal is a stateful stream holding the current sessions: new
263 |    * sessions are added, sessions that are still going are extended and
264 |    * sessions that stayed silent for longer than the session timeout plus
265 |    * one batch window are removed.
266 |    *
267 |    * @param a the reduced events of this micro batch for the key as
268 |    *          (sessionStartTime, sessionFinishTime, countOfEvents); because
269 |    *          of the preceding reduceByKey this holds at most one element
270 |    * @param b the state kept from the previous micro batch as
271 |    *          (sessionStartTime, sessionFinishTime, countOfEvents, isNewSession)
272 |    * @return the new state for the key, or None to drop the key
273 |    */
274 |   def updateStatbyOfSessions(
275 |       //(sessionStartTime, sessionFinishTime, countOfEvents)
276 |       a: Seq[(Long, Long, Long)],
277 |       //(sessionStartTime, sessionFinishTime, countOfEvents, isNewSession)
278 |       b: Option[(Long, Long, Long, Boolean)]
279 |     ): Option[(Long, Long, Long, Boolean)] = {
280 | 
281 |     (a.lastOption, b) match {
282 |       case (None, None) =>
283 |         //Neither new events nor previous state: nothing to keep
284 |         None
285 | 
286 |       case (None, Some(session)) =>
287 |         if (System.currentTimeMillis() - session._2 > SESSION_TIMEOUT + BATCH_WINDOW_MS) {
288 |           //No event for longer than the session timeout plus a batch
289 |           //window, so it is safe to remove the key from the state
290 |           None
291 |         } else {
292 |           //Still within the timeout: keep it, but it is no longer new
293 |           Some((session._1, session._2, session._3, false))
294 |         }
295 | 
296 |       case (Some(events), None) =>
297 |         //There was no value in the stateful DStream, so add it with a
298 |         //true for being a new session
299 |         Some((events._1, events._2, events._3, true))
300 | 
301 |       case (Some(events), Some(session)) =>
302 |         if (events._1 - session._2 < SESSION_TIMEOUT) {
303 |           //The session from the stateful DStream has not timed out, so
304 |           //extend it
305 |           Some((
306 |               Math.min(events._1, session._1), //newStartTime
307 |               Math.max(events._2, session._2), //newFinishTime
308 |               session._3 + events._3, //newSumOfEvents
309 |               false //This is not a new session
310 |             ))
311 |         } else {
312 |           //Otherwise replace the old session with a new one that starts
313 |           //counting from the events of this micro batch only
314 |           Some((
315 |               events._1, //newStartTime
316 |               events._2, //newFinishTime
317 |               events._3, //newSumOfEvents
318 |               true //new session
319 |             ))
320 |         }
321 |     }
322 |   }
323 | }


--------------------------------------------------------------------------------