├── LICENSE ├── README.md ├── _config.yml ├── data ├── ml-1m │ ├── README │ ├── movies.dat │ ├── ratings.dat │ └── users.dat └── weblog │ ├── apache.access.log │ └── file_to_stream_utils.sh ├── pom.xml └── src └── main └── scala ├── SparkTest.scala └── org ├── spark ├── App.scala └── movie │ ├── MovieUser.scala │ ├── PopularMovie.scala │ └── TopKMovie.scala ├── sparkSQL ├── ApacheAccessLog │ ├── ApacheAccessLog.scala │ └── LogAnalyzerSQL.scala └── SensorLog │ ├── RedisClient.scala │ ├── SensorRow.scala │ └── SensorStatistics.scala └── sparkStreaming ├── kafka_sparkStreaming_mysql ├── DruidConnectionPool.java ├── KafkaEventProducer.scala ├── UserClickCountAnalytics.scala └── UserClickCountByWindowAnalytics.scala ├── kafka_sparkStreaming_offsetToZK ├── KafkaEventProducer.scala ├── UserClickCountAnalytics.scala └── ZkKafkaOffsetManager.scala ├── kafka_sparkStreaming_redis ├── KafkaEventProducer.scala ├── RedisClient.scala ├── UserClickCountAnalytics.scala └── UserClickCountByWindowAnalytics.scala └── sparkStreamingExactltyOnce ├── KafkaEventProducer.scala └── SparkStreamingExactlyOnce.scala /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | ## Welcome to Higmin/SparkObject
2 | # 1. Spark offline batch processing => movie-audience analysis
3 | ### (Data source: https://grouplens.org/datasets/movielens )
4 | # 2. Spark SQL analysis => web access-log analysis (source: log file) and industrial sensor-data analysis (source: HBase)
5 | # 3. Spark Streaming => real-time user-click statistics (source: Kafka)
6 | Spark movie-audience analysis and real-time user-click statistics (Kafka + SparkStreaming + Redis).
7 | spark, version 2.4.4
8 | spark-streaming_2.12, version 2.4.4
9 | spark-streaming-kafka-0-10_2.12, version 2.4.4
10 | spark-sql_2.12, version 2.4.4
11 |
12 | ### Running locally:
13 | 1. Install Spark 2.4.4 locally, plus Redis (only needed if results go to Redis) and MySQL (only needed if results go to MySQL).
14 | => MySQL setup: user root, password root. Create a database named test,
15 | => create table streaming [uid (varchar 255); clickCount (varchar 255)],
16 | => create table streaming_ostype [os_type (varchar 255); clickCount (varchar 255)].
17 | 2. Download the sample code.
18 | 3. The Kafka + SparkStreaming + Redis and Kafka + SparkStreaming + MySQL examples are run the same way:
19 | 4. start the Kafka producer that simulates the event stream, then
20 | 5. start the Spark Streaming job.
21 | Results can be inspected in Redis or MySQL.
22 |
23 | ## 1. Spark
24 | Task 1: find the users who watched "Sixteen Candles", their gender, and how many times they watched it.
25 | Task 2: find the 10 movies most watched by viewers aged 20-30.
26 | Task 3: find the three most popular movies (the three with the highest average rating).
27 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/spark/movie
28 |
29 | ## 2. Spark SQL analysis
30 | Spark SQL is Spark's module for processing structured data. It provides the DataFrame abstraction and serves as a distributed SQL query engine.
31 | Spark SQL is Hive-compatible; execution-plan generation and optimization are handled by Catalyst. Thanks to Scala features such as pattern matching, writing plan-optimization rules on top of Catalyst is considerably more concise than doing the same in Hive.
32 | About DataFrame and Dataset: both are distributed datasets, but unlike an RDD they carry schema information, much like a table. Dataset was introduced in Spark 1.6 to combine RDD-style strong typing and lambda functions with the Spark SQL optimized execution engine. Since Spark 2.0, a DataFrame is simply a Dataset of Row:
33 | ``
34 | type DataFrame = Dataset[Row]
35 | ``
36 | #### 2.1 A Spark SQL example that analyzes a web-server access log:
37 | ##### See: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkSQL
38 |
39 | #### 2.2 There is also an example that reads data from HBase, converts it to a DataFrame, and analyzes it with Spark SQL:
40 | ##### See: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkSQL/SensorLog
41 | The HBase table is structured as follows (the columns of a single record are listed below):
42 |
43 | rowKey | columnFamily | column | value
44 | --------|------------------|-----------------------|-----------------
45 | 2314035476751 | info | AndroidBoard::availableStorage | 12379
46 | 2314035476751 | info | AndroidBoard::cpu_usage | 4.32
47 | 2314035476751 | info | AndroidBoard::current | 1.3
48 | 2314035476751 | info | AndroidBoard::storage | 12661
49 | 2314035476751 | info | AndroidBoard::voltage | 11.8
50 | 2314035476751 | info | AxialFanSpeed | 56.666666666666664
51 | 2314035476751 | info | BackLight::state | true
52 | 2314035476751 | info | BlEnable::state | true
53 | 2314035476751 | info | CrossFlowFanSpeed::value | 40
54 | 2314035476751 | info | Decibel::value | 40
55 | 2314035476751 | info | DoorState::state | false
56 | 2314035476751 | info | GPS::latitude | 39.950565
57 | 2314035476751 | info | GPS::longitude | 116.500711
58 | 2314035476751 | info | Humidity::value | 40
59 | 2314035476751 | info | Level::value | 80
60 | 2314035476751 | info | PowerState::state | false
61 | 2314035476751 | info | PowerState::value | 0.8
62 | 2314035476751 | info | Temperature | 29.0
63 | 2314035476751 | info | Time | 1576745304132
64 | 2314035476751 | info | TotalPower::value | 900
65 |
66 | ## 3. Kafka + SparkStreaming + Redis
67 | Kafka + SparkStreaming + Redis: a simulated Kafka producer continuously writes user-behavior events as JSON.
68 | Kafka + SparkStreaming + Redis: a Redis client implemented in Scala.
69 | Kafka + SparkStreaming + Redis: real-time click counts per user, accumulated by grouping on the user id.
70 | Kafka + SparkStreaming + Redis: every 5 seconds, count the clicks received from each terminal type over the past 10 seconds (see the sketch after this section).
71 | Notes:
72 | * 1. Window computations in Spark Streaming require a checkpoint directory.
73 | * 2. Both the window length and the slide interval must be integer multiples of the micro-batch interval, otherwise the job fails.
74 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis
75 |
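To make the two notes above concrete, here is a minimal, self-contained sketch of the windowed computation this section describes: every 5 seconds, sum the clicks from each terminal type over the last 10 seconds. It is an illustration only, not the project code (the real jobs live in the linked packages); the broker address, topic name, and checkpoint path are placeholders.

```scala
import net.sf.json.JSONObject
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WindowedClickCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("WindowedClickCountSketch")
    val ssc  = new StreamingContext(conf, Seconds(5))   // 5-second micro-batches
    ssc.checkpoint("data/checkpoint/sketch")            // note 1: windows with an inverse reduce need a checkpoint

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",          // placeholder broker list
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "window_sketch_group",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Array("user_events"), kafkaParams))

    // Every 5 s, sum click_count per os_type over the past 10 s.
    // Note 2: both the window (10 s) and the slide (5 s) are multiples of the batch interval (5 s).
    stream.map(record => JSONObject.fromObject(record.value()))
      .map(event => (event.getString("os_type"), event.getInt("click_count")))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(5))
      .print()                                          // the real jobs write to Redis or MySQL instead

    ssc.start()
    ssc.awaitTermination()
  }
}
```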
76 | ## 4. Kafka + SparkStreaming + MySQL
77 | Kafka + SparkStreaming + mysql: a simulated Kafka producer continuously writes user-behavior events as JSON.
78 | Kafka + SparkStreaming + mysql: a MySQL connection pool implemented in Java (this example uses Alibaba's open-source Druid pool).
79 | Kafka + SparkStreaming + mysql: real-time click counts per user, accumulated by grouping on the user id.
80 | Kafka + SparkStreaming + mysql: every 5 seconds, count the clicks received from each terminal type over the past 10 seconds.
81 | Notes:
82 | * 1. Window computations in Spark Streaming require a checkpoint directory.
83 | * 2. Both the window length and the slide interval must be integer multiples of the micro-batch interval, otherwise the job fails.
84 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql
85 |
86 | ## 5. Exactly-once semantics in Spark Streaming
87 | Exactly-once semantics are one of the hard problems in stream processing.
88 | Guaranteeing that every record is processed exactly once — with nothing lost even when a server or the network fails — requires support not only from the streaming framework itself but also from the upstream message system and the downstream data store; in addition, the processing logic must follow certain rules before the pipeline is truly exactly-once.
89 |
90 | Stream processing offers three semantics: at-most-once, at-least-once, and exactly-once. A typical Spark Streaming application has three stages: receiving data, transforming/aggregating it, and writing the results. Each stage needs its own measures to achieve the desired semantics.
91 | For receiving data, it mostly depends on the upstream source. Reading files from a fault-tolerant file system such as HDFS directly gives exactly-once. If the upstream message system supports acknowledgements (e.g. RabbitMQ), it can be combined with Spark's Write Ahead Log to achieve at-least-once. With unreliable receivers (e.g. socketTextStream), data can be lost when a Worker or Driver node fails, so the semantics are undefined. Kafka is offset-based, and its Direct API can provide exactly-once.
92 | When transforming or aggregating data with Spark RDDs we get exactly-once for free, because an RDD is a fault-tolerant, immutable, and deterministic data structure: as long as the source data is available and the processing has no side effects, recomputation always yields the same result.
93 |
94 | Exactly-once is the strongest of the three semantics, so it adds extra overhead to an application, and it does not yet combine well with window operations; whether to use it is therefore up to the developer, and in many cases occasional loss or duplication simply does not matter. Still, knowing how an exactly-once pipeline is built is worthwhile and helps in learning Spark Streaming. The output-stage pattern is sketched below.
95 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/sparkStreamingExactltyOnce
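The linked example implements this end to end. Purely as an illustration of the output stage, the sketch below stores one batch's aggregated counts and the Kafka offset ranges it consumed in a single MySQL transaction; on restart the job resumes from the offsets in `kafka_offsets`, so a batch is either fully applied or not applied at all. The table names, columns, and connection string are hypothetical, not the project's actual schema.

```scala
import java.sql.DriverManager

import org.apache.spark.streaming.kafka010.OffsetRange

object ExactlyOnceOutputSketch {

  /** Write one batch's results and its offsets atomically (placeholder schema). */
  def saveBatch(counts: Array[(String, Int)], offsets: Array[OffsetRange]): Unit = {
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "root")
    try {
      conn.setAutoCommit(false) // results and offsets commit together, or not at all

      val upsert = conn.prepareStatement(
        "INSERT INTO click_totals(uid, clickCount) VALUES (?, ?) " +
          "ON DUPLICATE KEY UPDATE clickCount = clickCount + VALUES(clickCount)")
      counts.foreach { case (uid, n) =>
        upsert.setString(1, uid); upsert.setInt(2, n); upsert.addBatch()
      }
      upsert.executeBatch()

      val saveOffsets = conn.prepareStatement(
        "REPLACE INTO kafka_offsets(topic, part, untilOffset) VALUES (?, ?, ?)")
      offsets.foreach { range =>
        saveOffsets.setString(1, range.topic)
        saveOffsets.setInt(2, range.partition)
        saveOffsets.setLong(3, range.untilOffset)
        saveOffsets.addBatch()
      }
      saveOffsets.executeBatch()

      conn.commit()
    } catch {
      case e: Exception => conn.rollback(); throw e
    } finally {
      conn.close()
    }
  }
}
```

Inside a streaming job this would be called from `foreachRDD`, after extracting `rdd.asInstanceOf[HasOffsetRanges].offsetRanges` and aggregating the batch; on startup the saved offsets are read back and handed to the direct stream as its starting positions.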
96 |
97 | ## 6. Managing Kafka offsets manually with Spark Streaming
98 | To cope with failures that may crash the Streaming application, we generally manage Kafka offsets ourselves instead of letting the consumer auto-commit them, i.e. enable.auto.commit is set to false. Only with properly managed offsets can the whole streaming system get as close as possible to exactly-once semantics.
99 | Offsets can be managed in several ways, but the general workflow is:
100 |
101 | 1. When the Direct DStream is initialized, supply an offset for every partition of every topic so that the stream starts reading from that position
102 | (these are the offsets saved in step 4).
103 | 2. Read and process the messages.
104 |
105 | 3. Store the processing results.
106 |
107 | In the referenced figure, storing and committing offsets is drawn with a dashed circle simply to emphasize that users may take additional steps to satisfy stricter semantics, such as idempotent writes or storing the offsets atomically together with the results.
108 |
109 | 4. Finally, save the offsets to an external persistent store such as HBase, Kafka, HDFS, or ZooKeeper.
110 | ##### Reference: https://blog.csdn.net/rlnLo2pNEfx9c/article/details/79988218
111 | #### 6.1 Storing offsets in Kafka itself (note: commitAsync() comes from the Spark Streaming kafka-0-10 integration; the Spark documentation warns that it is still an experimental API and may change.)
112 | ```scala
113 | stream.foreachRDD { rdd =>
114 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
115 |
116 | // some time later, after outputs have completed
117 | stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
118 | }
119 | ```
120 | #### 6.2 Storing offsets in ZooKeeper or another external store
121 | ##### See: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK (a sketch of the read/save pattern follows below)
122 |
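The linked package contains the full implementation (ZkKafkaOffsetManager). The sketch below only outlines the read/save pattern, assuming the `kafka.utils.ZkUtils` API that this project already imports elsewhere; the ZooKeeper address, group id, and node layout are placeholders.

```scala
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.apache.kafka.common.TopicPartition

/** Illustrative sketch: one ZooKeeper node per partition under the consumer group's offset path. */
class ZkOffsetStoreSketch(zkServers: String, group: String) {

  // session timeout, connection timeout, ZK security disabled
  private val zkUtils = ZkUtils(zkServers, 30000, 30000, false)

  /** Read previously saved offsets; an empty map means "no saved state yet". */
  def readOffsets(topic: String, partitions: Seq[Int]): Map[TopicPartition, Long] = {
    val offsetDir = new ZKGroupTopicDirs(group, topic).consumerOffsetDir
    partitions.flatMap { partition =>
      val (offsetOpt, _) = zkUtils.readDataMaybeNull(s"$offsetDir/$partition")
      offsetOpt.map(offset => new TopicPartition(topic, partition) -> offset.toLong)
    }.toMap
  }

  /** Save each partition's untilOffset, but only after the batch's results have been written. */
  def saveOffsets(topic: String, offsets: Map[Int, Long]): Unit = {
    val offsetDir = new ZKGroupTopicDirs(group, topic).consumerOffsetDir
    offsets.foreach { case (partition, offset) =>
      zkUtils.updatePersistentPath(s"$offsetDir/$partition", offset.toString)
    }
  }
}
```

The map returned by `readOffsets` can be passed as the starting offsets when creating the direct stream (the overload of `ConsumerStrategies.Subscribe` that takes an offsets map), which is what step 1 of the workflow above refers to.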
123 | #### 6.3 Why not use Spark Streaming's checkpoint?
124 | Spark Streaming's checkpoint mechanism is certainly the easiest option: the checkpoint data lives in HDFS, and if the Streaming application dies it can recover quickly.
125 | However, if the Streaming program's code changes, re-packaging and restarting it causes deserialization errors. The first checkpoint serializes the whole jar so the job can be restored on restart; after re-packaging, the old and new code no longer match, so the job either fails or silently keeps running the old logic.
126 | The only way around this is to delete the checkpoint files on HDFS, but that also throws away the saved Kafka offsets, which defeats the purpose.
127 |
128 |
-------------------------------------------------------------------------------- /_config.yml: --------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
-------------------------------------------------------------------------------- /data/ml-1m/README: --------------------------------------------------------------------------------
1 | SUMMARY
2 | ================================================================================
3 |
4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
5 | made by 6,040 MovieLens users who joined MovieLens in 2000.
6 |
7 | USAGE LICENSE
8 | ================================================================================
9 |
10 | Neither the University of Minnesota nor any of the researchers
11 | involved can guarantee the correctness of the data, its suitability
12 | for any particular purpose, or the validity of results based on the
13 | use of the data set. The data set may be used for any research
14 | purposes under the following conditions:
15 |
16 | * The user may not state or imply any endorsement from the
17 | University of Minnesota or the GroupLens Research Group.
18 |
19 | * The user must acknowledge the use of the data set in
20 | publications resulting from the use of the data set
21 | (see below for citation information).
22 |
23 | * The user may not redistribute the data without separate
24 | permission.
25 |
26 | * The user may not use this information for any commercial or
27 | revenue-bearing purposes without first obtaining permission
28 | from a faculty member of the GroupLens Research Project at the
29 | University of Minnesota.
30 |
31 | If you have any further questions or comments, please contact GroupLens
32 | .
33 |
34 | CITATION
35 | ================================================================================
36 |
37 | To acknowledge use of the dataset in publications, please cite the following
38 | paper:
39 |
40 | F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
41 | and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
42 | Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872
43 |
44 |
45 | ACKNOWLEDGEMENTS
46 | ================================================================================
47 |
48 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
49 | set.
50 |
51 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
52 | ================================================================================
53 |
54 | The GroupLens Research Project is a research group in the Department of
55 | Computer Science and Engineering at the University of Minnesota. Members of
56 | the GroupLens Research Project are involved in many research projects related
57 | to the fields of information filtering, collaborative filtering, and
58 | recommender systems. The project is led by professors John Riedl and Joseph
59 | Konstan. The project began to explore automated collaborative filtering in
60 | 1992, but is most well known for its world wide trial of an automated
61 | collaborative filtering system for Usenet news in 1996.
Since then the project 62 | has expanded its scope to research overall information filtering solutions, 63 | integrating in content-based methods as well as improving current collaborative 64 | filtering technology. 65 | 66 | Further information on the GroupLens Research project, including research 67 | publications, can be found at the following web site: 68 | 69 | http://www.grouplens.org/ 70 | 71 | GroupLens Research currently operates a movie recommender based on 72 | collaborative filtering: 73 | 74 | http://www.movielens.org/ 75 | 76 | RATINGS FILE DESCRIPTION 77 | ================================================================================ 78 | 79 | All ratings are contained in the file "ratings.dat" and are in the 80 | following format: 81 | 82 | UserID::MovieID::Rating::Timestamp 83 | 84 | - UserIDs range between 1 and 6040 85 | - MovieIDs range between 1 and 3952 86 | - Ratings are made on a 5-star scale (whole-star ratings only) 87 | - Timestamp is represented in seconds since the epoch as returned by time(2) 88 | - Each user has at least 20 ratings 89 | 90 | USERS FILE DESCRIPTION 91 | ================================================================================ 92 | 93 | User information is in the file "users.dat" and is in the following 94 | format: 95 | 96 | UserID::Gender::Age::Occupation::Zip-code 97 | 98 | All demographic information is provided voluntarily by the users and is 99 | not checked for accuracy. Only users who have provided some demographic 100 | information are included in this data set. 101 | 102 | - Gender is denoted by a "M" for male and "F" for female 103 | - Age is chosen from the following ranges: 104 | 105 | * 1: "Under 18" 106 | * 18: "18-24" 107 | * 25: "25-34" 108 | * 35: "35-44" 109 | * 45: "45-49" 110 | * 50: "50-55" 111 | * 56: "56+" 112 | 113 | - Occupation is chosen from the following choices: 114 | 115 | * 0: "other" or not specified 116 | * 1: "academic/educator" 117 | * 2: "artist" 118 | * 3: "clerical/admin" 119 | * 4: "college/grad student" 120 | * 5: "customer service" 121 | * 6: "doctor/health care" 122 | * 7: "executive/managerial" 123 | * 8: "farmer" 124 | * 9: "homemaker" 125 | * 10: "K-12 student" 126 | * 11: "lawyer" 127 | * 12: "programmer" 128 | * 13: "retired" 129 | * 14: "sales/marketing" 130 | * 15: "scientist" 131 | * 16: "self-employed" 132 | * 17: "technician/engineer" 133 | * 18: "tradesman/craftsman" 134 | * 19: "unemployed" 135 | * 20: "writer" 136 | 137 | MOVIES FILE DESCRIPTION 138 | ================================================================================ 139 | 140 | Movie information is in the file "movies.dat" and is in the following 141 | format: 142 | 143 | MovieID::Title::Genres 144 | 145 | - Titles are identical to titles provided by the IMDB (including 146 | year of release) 147 | - Genres are pipe-separated and are selected from the following genres: 148 | 149 | * Action 150 | * Adventure 151 | * Animation 152 | * Children's 153 | * Comedy 154 | * Crime 155 | * Documentary 156 | * Drama 157 | * Fantasy 158 | * Film-Noir 159 | * Horror 160 | * Musical 161 | * Mystery 162 | * Romance 163 | * Sci-Fi 164 | * Thriller 165 | * War 166 | * Western 167 | 168 | - Some MovieIDs do not correspond to a movie due to accidental duplicate 169 | entries and/or test entries 170 | - Movies are mostly entered by hand, so errors and inconsistencies may exist 171 | -------------------------------------------------------------------------------- /data/ml-1m/movies.dat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Higmin/Spark-Learning/a5fe1a9db2c86d32fc096fd7f98faffae5c466f8/data/ml-1m/movies.dat -------------------------------------------------------------------------------- /data/weblog/file_to_stream_utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -o nounset 4 | set -o errexit 5 | 6 | test $# -eq 1 || ( echo "Incorrect number of arguments" ; exit 1 ) 7 | 8 | file="$1" 9 | 10 | network_port=9999 11 | lines_in_batch=100 12 | interval_sec=10 13 | 14 | n_lines=$(cat apache.access.log | wc -l) 15 | cursor=1 16 | while test $cursor -le $n_lines 17 | do 18 | tail -n +$cursor $file | head -$lines_in_batch | nc -l $network_port 19 | cursor=$(($cursor + $lines_in_batch)) 20 | sleep $interval_sec 21 | done 22 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.spark 4 | sparkCalculation 5 | 1.0-SNAPSHOT 6 | 2008 7 | 8 | 1.8 9 | 2.12.2 10 | 2.4.4 11 | 0.10.2.2 12 | 3.8.2 13 | 1.2.5 14 | 2.5.2 15 | 2.2 16 | 2.4 17 | 5.1.39 18 | 1.0.28 19 | 2.1.2 20 | 21 | 22 | 23 | 24 | nexus-aliyun 25 | Nexus aliyun 26 | http://maven.aliyun.com/nexus/content/groups/public 27 | 28 | 29 | 30 | 31 | 32 | org.scala-lang 33 | scala-library 34 | ${scala.version} 35 | 36 | 37 | 38 | junit 39 | junit 40 | ${junit.version} 41 | test 42 | 43 | 44 | 45 | org.specs 46 | specs 47 | ${specs.version} 48 | test 49 | 50 | 51 | org.apache.spark 52 | spark-core_2.12 53 | ${spark.version} 54 | 55 | 56 | 57 | org.apache.spark 58 | spark-sql_2.12 59 | ${spark.version} 60 | 61 | 62 | 63 | org.apache.spark 64 | spark-streaming_2.12 65 | ${spark.version} 66 | 67 | 68 | 69 | 70 | org.apache.spark 71 | spark-streaming-kafka-0-10_2.12 72 | ${spark.version} 73 | 74 | 75 | 76 | org.apache.kafka 77 | kafka_2.12 78 | ${kafka.version} 79 | 80 | 81 | 82 | net.sf.json-lib 83 | json-lib 84 | ${json-lib.version} 85 | jdk15 86 | 87 | 88 | 89 | redis.clients 90 | jedis 91 | ${jedis.version} 92 | 93 | 94 | 95 | org.apache.commons 96 | commons-pool2 97 | ${commons-pool2.version} 98 | 99 | 100 | 101 | 102 | mysql 103 | mysql-connector-java 104 | runtime 105 | ${mysql-connector.version} 106 | 107 | 108 | 109 | com.alibaba 110 | druid 111 | ${druid.version} 112 | 113 | 114 | 115 | org.apache.hbase 116 | hbase-mapreduce 117 | ${hbase-mapreduce.version} 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | maven-assembly-plugin 126 | 2.3 127 | 128 | dist 129 | true 130 | 131 | jar-with-dependencies 132 | 133 | 134 | 135 | 136 | make-assembly 137 | package 138 | 139 | single 140 | 141 | 142 | 143 | 144 | 145 | 146 | maven-compiler-plugin 147 | 148 | 1.7 149 | 1.7 150 | 151 | 152 | 153 | 154 | net.alchim31.maven 155 | scala-maven-plugin 156 | 3.2.2 157 | 158 | 159 | scala-compile-first 160 | process-resources 161 | 162 | compile 163 | 164 | 165 | 166 | 167 | ${scala.version} 168 | incremental 169 | true 170 | 171 | -unchecked 172 | -deprecation 173 | -feature 174 | 175 | 176 | -Xms1024m 177 | -Xmx1024m 178 | 179 | 180 | -source 181 | ${java.version} 182 | -target 183 | ${java.version} 184 | -Xlint:all,-serial,-path 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /src/main/scala/SparkTest.scala: -------------------------------------------------------------------------------- 1 | 
import org.apache.spark._ 2 | 3 | /** 4 | * 用于测试本地spark开发环境 5 | */ 6 | object SparkTest { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val conf = new SparkConf().setMaster("local").setAppName("test") 10 | val sc = new SparkContext(conf) 11 | val rdd = sc.parallelize(Seq(1,2,3,4)).filter(_==1).take(1) 12 | rdd.foreach(println(_)) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/App.scala: -------------------------------------------------------------------------------- 1 | package org.spark 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | object App { 8 | def main(args: Array[String]): Unit = { 9 | println( "Hello World!" ) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/movie/MovieUser.scala: -------------------------------------------------------------------------------- 1 | package org.spark.movie 2 | 3 | import org.apache.spark._ 4 | 5 | /** 6 | * 封装一个对象,因为这里最后我们通过main函数来启动,所以没必要建一个类 7 | * 数据格式: 8 | * 电影评分数据集:ratings.dat UserID::MovieID::Rating::Timestamp 9 | * 用户信息数据集:users.dat UserID::Gender::Age::Occupation::Zip-code 10 | * 电影信息数据集:movies.dat MovieID::Title::Genres 11 | * 12 | * 任务一:统计看过 “Sixteen Candles” 的用户、性别和观看次数 13 | */ 14 | object MovieUser { 15 | def main(args: Array[String]): Unit = { 16 | val master = if (args.length > 0) args(0).toString else "local" 17 | val datapath = if (args.length > 1) args(1).toString else "data/ml-1m" 18 | 19 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 20 | val conf = new SparkConf().setMaster(master).setAppName("MovieUser") 21 | val sc = new SparkContext(conf) 22 | 23 | // 数据读入:读取数据文件转换为RDD 24 | val usersRdd = sc.textFile(datapath + "/users.dat") 25 | val ratingsRdd = sc.textFile(datapath + "/ratings.dat") 26 | val moviesRdd = sc.textFile(datapath + "/movies.dat") 27 | 28 | // 抽取数据的属性,过滤符合条件的电影 29 | // RDD => users格式 :[UserID,(Gender,Age)] 30 | val users = usersRdd.map(_.split("::")) 31 | .map(x => {(x(0),(x(1),x(2)))}) 32 | // RDD => rating格式 :[(UserID,MovieID)] 33 | val rating = ratingsRdd.map(_.split("::")) 34 | .map(x => (x(0),x(1))) 35 | .filter(x => x._2.equals("2144")) 36 | // join 两个数据集 37 | // RDD => userRating格式 :[UserID,(MovieID,(Gender,Age))] => key相同,value合并 示例:(4425,(2144,(M,35))) 38 | val userRating = rating.join(users) 39 | // userRating.take(1) 40 | // .foreach(println(_)) // 打印一条记录,测试使用,方便开发过程中查看格式 41 | // 统计分析 42 | val userDistribution = userRating.map(x => {(x._2._2,1)}).reduceByKey(_ + _) 43 | .foreach(println(_)) 44 | 45 | sc.stop() 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/movie/PopularMovie.scala: -------------------------------------------------------------------------------- 1 | package org.spark.movie 2 | 3 | import org.apache.spark._ 4 | 5 | import scala.collection.immutable.HashSet 6 | 7 | /** 8 | * 数据格式: 9 | * 电影评分数据集:ratings.dat UserID::MovieID::Rating::Timestamp 10 | * 用户信息数据集:users.dat UserID::Gender::Age::Occupation::Zip-code 11 | * 电影信息数据集:movies.dat MovieID::Title::Genres 12 | * 13 | * 任务二:统计年龄段在20-30的年轻人,最喜欢看哪10部电影 14 | */ 15 | object PopularMovie { 16 | def main(args: Array[String]): Unit = { 17 | val master = if (args.length > 0) args(0).toString else "local" 18 | val datapath = if (args.length > 1) args(1).toString else "data/ml-1m" 19 | 20 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 21 | val conf = new SparkConf().setMaster(master).setAppName("PopularMovie") 22 | val 
sc = new SparkContext(conf) 23 | 24 | // 数据读入:读取数据文件转换为RDD 25 | val usersRdd = sc.textFile(datapath + "/users.dat") 26 | val ratingsRdd = sc.textFile(datapath + "/ratings.dat") 27 | val moviesRdd = sc.textFile(datapath + "/movies.dat") 28 | 29 | // 抽取数据和过滤 users.dat UserID::Gender::Age::Occupation::Zip-code 30 | val users = usersRdd.map(_.split("::")) 31 | .map(x => {(x(0),x(2))}) // (UserID,Age) 32 | .filter(x => x._2.toInt >= 20 && x._2.toInt <= 30) 33 | .map(_._1) 34 | .collect() 35 | val userSet = HashSet() ++ users 36 | val broadcastUserSet = sc.broadcast(userSet) 37 | val movies = moviesRdd.map(_.split("::")) 38 | .map(x => {(x(0),x(1))}) // (MovieID,Title) 39 | 40 | // 聚合和排序 movies.dat MovieID::Title::Genres 41 | val topMovies = ratingsRdd.map(_.split("::")) 42 | .map(x => {(x(0),x(1))}) // (UserID,MovieID) 43 | .filter(x => {broadcastUserSet.value.contains(x._1)}) // (UserID,MovieID) 44 | .map(x => {(x._2,1)}) // (MovieID,1) 45 | .reduceByKey(_+_) // (MovieID,N) 46 | .join(movies) // (MovieID,(N,Title)) 47 | .map(x => {(x._2._1,x._2._2)}) // (N,Title) 48 | .sortByKey(false) // 逆序排列 49 | .map(x => {(x._2,x._1)}) // (Title,N) 50 | .take(10) // 获取前十条数据 51 | .foreach(println(_)) 52 | 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/movie/TopKMovie.scala: -------------------------------------------------------------------------------- 1 | package org.spark.movie 2 | 3 | import org.apache.spark._ 4 | 5 | /** 6 | * 7 | * 数据格式: 8 | * 电影评分数据集:ratings.dat UserID::MovieID::Rating::Timestamp 9 | * 用户信息数据集:users.dat UserID::Gender::Age::Occupation::Zip-code 10 | * 电影信息数据集:movies.dat MovieID::Title::Genres 11 | * 任务三:最受欢迎的前三部电影(平均评分最高的三部电影) 12 | */ 13 | object TopKMovie { 14 | 15 | def main(args: Array[String]): Unit = { 16 | val master = if (args.length > 0) args(0).toString else "local" 17 | val datapath = if (args.length > 1) args(1).toString else "data/ml-1m" 18 | 19 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 20 | val conf = new SparkConf().setMaster(master).setAppName("TopKMovie") 21 | val sc = new SparkContext(conf) 22 | 23 | // 数据读入:读取数据文件转换为RDD 24 | val usersRdd = sc.textFile(datapath + "/users.dat") 25 | val ratingsRdd = sc.textFile(datapath + "/ratings.dat") 26 | val moviesRdd = sc.textFile(datapath + "/movies.dat") 27 | 28 | // 数据抽取 29 | val movies = moviesRdd.map(_.split("::")) 30 | .map(x => { 31 | (x(0), x(1)) 32 | }) //(MovieID,Title) 33 | 34 | val ratings = ratingsRdd.map(_.split("::")) 35 | .map(x => { 36 | (x(1), x(2)) 37 | }) // (MovieID,Rating) 38 | .join(movies) 39 | .map(x => { 40 | (x._2._2, x._2._1) 41 | }) // (Title,Rating) 42 | // 数据分析 43 | val topKScoreMostMovies = ratings.map(x => { 44 | (x._1, (x._2.toInt, 1)) 45 | }) // (Title,(Rating,1)) 46 | .reduceByKey((v1, v2) => { 47 | (v1._1 + v2._1, v1._2 + v2._2) 48 | }) // (Title,(RatingScoreSum,N)) 49 | .map(x => { 50 | (x._2._1.toFloat / x._2._2.toFloat, x._1) 51 | }) // (RatingScoreAvg,Title) 52 | .sortByKey(false) 53 | .take(3) 54 | .foreach(println(_)) 55 | 56 | sc.stop() 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/ApacheAccessLog/ApacheAccessLog.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.ApacheAccessLog 2 | 3 | /** 4 | * case class 用于接受日志对应字段 5 | * 6 | * @param ipAddress 7 | * @param clientIdentd 8 | * @param userId 9 | * @param dateTime 10 | * @param method 11 | * @param endpoint 
12 | * @param protocol 13 | * @param responseCode 14 | * @param contentSize 15 | */ 16 | case class ApacheAccessLog(ipAddress: String, 17 | clientIdentd: String, 18 | userId: String, 19 | dateTime: String, 20 | method: String, 21 | endpoint: String, 22 | protocol: String, 23 | responseCode: Int, 24 | contentSize: Long){ 25 | } 26 | 27 | /** 28 | * 通过正则表达式匹配相应log中的对应字段 29 | */ 30 | object ApacheAccessLog { 31 | // 64.242.88.10 - - [07/Mar/2004:16:05:49 -0800] "GET /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables HTTP/1.1" 401 12846 32 | val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s+\-\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r 33 | 34 | def parseLogLine(log: String): ApacheAccessLog = { 35 | log match { 36 | case PATTERN(ipAddress, clientIdentd, userId, dateTime, method, endpoint, protocol, responseCode, contentSize) 37 | => ApacheAccessLog(ipAddress, clientIdentd, userId, dateTime, method, endpoint, protocol, responseCode.toInt, contentSize.toLong) 38 | case _ => throw new RuntimeException(s"""Cannot parse log line: $log""") 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/ApacheAccessLog/LogAnalyzerSQL.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.ApacheAccessLog 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Spark SQL 统计分析web日志内容 7 | */ 8 | object LogAnalyzerSQL { 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession.builder() 11 | .appName("Log Analyzer") 12 | .master("local") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | 17 | val accessLogs = spark 18 | .read 19 | .textFile("data/weblog/apache.access.log") 20 | .map(ApacheAccessLog.parseLogLine).toDF() 21 | 22 | accessLogs.createOrReplaceTempView("logs") 23 | 24 | // 统计分析内容大小-全部内容大小,日志条数,最小内容大小,最大内容大小 25 | // val contentSizeStats: Row = spark.sql("SELECT SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize) FROM logs").first() 26 | // val sum = contentSizeStats.getLong(0) 27 | // val count = contentSizeStats.getLong(1) 28 | // val min = contentSizeStats.getLong(2) 29 | // val max = contentSizeStats.getLong(3) 30 | // println("sum %s, count %s, min %s, max %s".format(sum, count, min, max)) 31 | // println("avg %s", sum / count) 32 | // spark.close() 33 | 34 | // 统计每种返回码的数量. 
35 | // val responseCodeToCount = spark.sql("SELECT responseCode, COUNT(*) FROM logs GROUP BY responseCode LIMIT 100") 36 | // .map(row => (row.getInt(0), row.getLong(1))) 37 | // .collect() 38 | // responseCodeToCount.foreach(print(_)) 39 | 40 | // 统计哪个IP地址访问服务器超过10次 41 | // val ipAddresses = spark.sql("SELECT ipAddress, COUNT(*) AS total FROM logs GROUP BY ipAddress HAVING total > 10 LIMIT 100") 42 | // .map(row => row.getString(0)) 43 | // .collect() 44 | // ipAddresses.foreach(println(_)) 45 | 46 | // 查询访问量最大的访问目的地址 47 | val topEndpoints = spark.sql("SELECT endpoint, COUNT(*) AS total FROM logs GROUP BY endpoint ORDER BY total DESC LIMIT 10") 48 | .map(row => (row.getString(0), row.getLong(1))) 49 | .collect() 50 | topEndpoints.foreach(println(_)) 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/SensorLog/RedisClient.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.SensorLog 2 | 3 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig 4 | import redis.clients.jedis.JedisPool 5 | 6 | /** 7 | * Redis 客户端 8 | */ 9 | object RedisClient extends Serializable { 10 | val redisHost = "127.0.0.1" 11 | val redisPort = 6379 12 | val redisTimeout = 30000 13 | val redisPassword = "root" 14 | lazy val pool = new JedisPool(new GenericObjectPoolConfig(), redisHost, redisPort, redisTimeout, redisPassword) 15 | 16 | lazy val hook = new Thread { 17 | override def run = { 18 | println("Execute hook thread: " + this) 19 | pool.destroy() 20 | } 21 | } 22 | sys.addShutdownHook(hook.run) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/SensorLog/SensorRow.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.SensorLog 2 | 3 | import org.apache.hadoop.hbase.client.Result 4 | import org.apache.hadoop.hbase.util.Bytes 5 | 6 | case class SensorRow( 7 | androidBoardAvailableStorage: String, androidBoardCpu_usage: String, androidBoardCurrent: String, 8 | androidBoardStorage: String, androidBoardVoltage: String, axialFanSpeed: String, 9 | backLightState: String, blEnableState: String, crossFlowFanSpeedValue: String, 10 | decibelValue: String, doorStateState: String, spsLatitude: String, 11 | gpsLongitude: String, humidityValue: String, levelValue: String, 12 | powerStateState: String, powerStateValue: String, temperature: String, 13 | totalPowerValue: String, time: Long) 14 | 15 | object SensorRow extends Serializable { 16 | 17 | /** 18 | * 列族 “info” 19 | * @param result 20 | * @return 21 | */ 22 | def parseSensorRow(result: Result): SensorRow = { 23 | val p0 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::availableStorage".getBytes)) 24 | val p1 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::cpu_usage".getBytes)) 25 | val p2 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::current".getBytes)) 26 | val p3 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::storage".getBytes)) 27 | val p4 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::voltage".getBytes)) 28 | val p5 = Bytes.toString(result.getValue("info".getBytes, "AxialFanSpeed".getBytes)) 29 | val p6 = Bytes.toString(result.getValue("info".getBytes, "BackLight::state".getBytes)) 30 | val p7 = Bytes.toString(result.getValue("info".getBytes, "BlEnable::state".getBytes)) 31 | val p8 = 
Bytes.toString(result.getValue("info".getBytes, "CrossFlowFanSpeed::value".getBytes)) 32 | val p9 = Bytes.toString(result.getValue("info".getBytes, "Decibel::value".getBytes)) 33 | val p10 = Bytes.toString(result.getValue("info".getBytes, "DoorState::state".getBytes)) 34 | val p11 = Bytes.toString(result.getValue("info".getBytes, "GPS::latitude".getBytes)) 35 | val p12 = Bytes.toString(result.getValue("info".getBytes, "GPS::longitude".getBytes)) 36 | val p13 = Bytes.toString(result.getValue("info".getBytes, "Humidity::value".getBytes)) 37 | val p14 = Bytes.toString(result.getValue("info".getBytes, "Level::value".getBytes)) 38 | val p15 = Bytes.toString(result.getValue("info".getBytes, "PowerState::state".getBytes)) 39 | val p16 = Bytes.toString(result.getValue("info".getBytes, "PowerState::value".getBytes)) 40 | val p17 = Bytes.toString(result.getValue("info".getBytes, "Temperature".getBytes)) 41 | val p18 = Bytes.toString(result.getValue("info".getBytes, "TotalPower::value".getBytes)) 42 | val p19 = Bytes.toString(result.getValue("info".getBytes, "Time".getBytes)) 43 | SensorRow(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19.toLong) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/SensorLog/SensorStatistics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.SensorLog 2 | 3 | import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneId} 4 | import java.util.concurrent.Executors 5 | 6 | import net.sf.json.JSONObject 7 | import org.apache.hadoop.hbase.HBaseConfiguration 8 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 9 | import org.apache.spark.sql.{SQLContext, SparkSession} 10 | import org.apache.spark.{SparkConf, SparkContext} 11 | 12 | /** 13 | * 读取 Hbase 中的传感器数据 14 | * 计算 平均值 15 | */ 16 | object SensorStatistics { 17 | def main(args: Array[String]): Unit = { 18 | // 参数 19 | val master = if (args.length > 0) args(0).toString else "local" 20 | val zkHost = if (args.length > 1) args(1).toString else "192.168.183.150,192.168.183.151,192.168.183.152" 21 | val zkPort = if (args.length > 2) args(2).toString else "2181" 22 | val tableName = if (args.length > 2) args(2).toString else "sensors" 23 | 24 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 25 | val conf = new SparkConf().setMaster(master).setAppName("SensorStatistics") 26 | val sc = new SparkContext(conf) 27 | 28 | // Hbase 配置 29 | val hbaseConf = HBaseConfiguration.create() 30 | hbaseConf.set("hbase.zookeeper.quorum", zkHost) 31 | hbaseConf.set("hbase.master", zkPort) 32 | hbaseConf.set("hbase.master", "192.168.183.150:16010"); // 例如: 191.168.9.9:16010 , 这里由于 ambari 的端口不一样所以和原生的端口不一样这个 要注意 33 | hbaseConf.set(TableInputFormat.INPUT_TABLE, "sensors") 34 | val executor = Executors.newCachedThreadPool() 35 | 36 | // 从Hbase数据源获取数据 37 | val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf, 38 | classOf[TableInputFormat], 39 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 40 | classOf[org.apache.hadoop.hbase.client.Result]) 41 | 42 | hbaseRDD.count() 43 | 44 | val sqlContext = new SQLContext(sc) 45 | import sqlContext.implicits._ 46 | val spark = SparkSession.builder() 47 | .appName("SensorStatistics") 48 | .master("local") 49 | .getOrCreate() 50 | 51 | // 将查询结果映射到 case class 52 | val resultRDD = hbaseRDD.map(tuple => tuple._2) 53 | // 将映射数据集转换为 DataFrame 以便 后期使用SQL开发查询统计 54 | val sensorRDD = resultRDD.map(SensorRow.parseSensorRow).toDF() 55 | 56 
| sensorRDD.show() // 打印表,一般开发的时候用 57 | // 创建视图 命名 即表名 58 | sensorRDD.createOrReplaceTempView("sensors") 59 | 60 | val sensorStatDF = spark.sql("SELECT avg(temperature) FROM sensors where time > 1576080000000 and time < 1576771200000") 61 | .map(row => { 62 | row.getDouble(0) 63 | }) 64 | .collect() 65 | sensorStatDF.foreach(println(_)) 66 | 67 | // 零点的时间戳 (预留) 68 | val today_start = LocalDateTime.of(LocalDate.now, LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 69 | val past_oneWeek_start = LocalDateTime.of(LocalDate.now().minusDays(7), LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 70 | val past_oneMonth_start = LocalDateTime.of(LocalDate.now().minusMonths(1), LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 71 | val past_threeMonth_start = LocalDateTime.of(LocalDate.now().minusMonths(3), LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 72 | 73 | 74 | // TODO redis 存储 => 查询结果 存储再redis 75 | val dbIndex = 2 76 | val clickHashKey = "devMonitorCalculation" 77 | 78 | sc.stop() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/DruidConnectionPool.java: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql; 2 | 3 | import com.alibaba.druid.pool.DruidDataSourceFactory; 4 | import javax.sql.DataSource; 5 | import java.util.Properties; 6 | 7 | /** 8 | * @author Higmin 9 | * @date 2019/11/28 9:10 10 | **/ 11 | public class DruidConnectionPool { 12 | 13 | public DataSource dataSource; 14 | private Properties pro = new Properties(); 15 | 16 | private DruidConnectionPool() { 17 | try { 18 | init(); 19 | dataSource = DruidDataSourceFactory.createDataSource(pro); 20 | } catch (Exception e) { 21 | e.printStackTrace(); 22 | } 23 | } 24 | 25 | private static class Holder { 26 | private static DruidConnectionPool instance = new DruidConnectionPool(); 27 | } 28 | 29 | public static DruidConnectionPool getInstance() { 30 | return Holder.instance; 31 | } 32 | 33 | private void init() { 34 | // 数据源配置 35 | pro.setProperty("driverClassName", "com.mysql.jdbc.Driver"); 36 | pro.setProperty("url", "jdbc:mysql://localhost:3306/test?characterEncoding=utf8&useSSL=true"); 37 | pro.setProperty("username", "root"); 38 | pro.setProperty("password", "root"); 39 | // 连接池配置 40 | pro.setProperty("initialSize", "20"); // 初始化连接大小 41 | pro.setProperty("minIdle", "20"); // 最小连接池数量 42 | pro.setProperty("maxActive", "100"); // 最大连接池数量 43 | pro.setProperty("maxWait", "60000"); // 获取连接时最大等待时间,单位毫秒 44 | pro.setProperty("timeBetweenEvictionRunsMillis", "60000"); // 配置间隔多久才进行一次检测,检测需要关闭的空闲连接,单位是毫秒 45 | pro.setProperty("minEvictableIdleTimeMillis", "300000"); // 配置一个连接在池中最小生存的时间,单位是毫秒 46 | pro.setProperty("validationQuery", "SELECT 1 FROM DUAL"); // 测试连接 47 | pro.setProperty("testWhileIdle", "true"); // 申请连接的时候检测,建议配置为true,不影响性能,并且保证安全性 48 | pro.setProperty("testOnBorrow", "false"); // 获取连接时执行检测,建议关闭,影响性能 49 | pro.setProperty("testOnReturn", "false"); // 归还连接时执行检测,建议关闭,影响性能 50 | pro.setProperty("poolPreparedStatements", "false"); // 是否开启PSCache,PSCache对支持游标的数据库性能提升巨大,oracle建议开启,mysql下建议关闭 51 | pro.setProperty("maxOpenPreparedStatements", "20"); // 开启poolPreparedStatements后生效 52 | // pro.setProperty("filters", "stat,wall,slf4j"); // 配置扩展插件,常用的插件有=>stat:监控统计 log4j:日志 wall:防御sql注入 53 | pro.setProperty("connectionProperties", 
"druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000"); // 通过connectProperties属性来打开mergeSql功能;慢SQL记录 54 | pro.setProperty("asyncInit", "true"); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/KafkaEventProducer.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql 2 | 3 | import java.util.Properties 4 | 5 | import net.sf.json.JSONObject 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * 模拟 Kafka 生产者 实时写入用户行为的事件数据,数据是JSON格式 12 | */ 13 | object KafkaEventProducer { 14 | 15 | private val users = Array( 16 | "df354f90-5acd-4c55-a3e2-adc045f628c3", "e20f8e06-7717-4236-87f0-484a82f00b52", 17 | "293901ca-9a58-4ef9-8c01-fa3c766ca236", "2b175ac2-f1a6-4fcc-a437-d2f01828b493", 18 | "27e51fd9-2be9-405c-b81a-b34e2f6379dd", "f3f2c74d-5fe0-4cce-8ce1-a2bdd5ad82b8", 19 | "ef062789-6214-493d-8aad-4b15f91ec5d3", "569e4b06-9301-4a9d-842c-1e6aa9b4f39b", 20 | "7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d") 21 | 22 | private val sites = Array( 23 | "Android","IOS","PC" 24 | ) 25 | 26 | private val random = new Random() 27 | 28 | def getUserID():String = { 29 | val userPointer = random.nextInt(10) 30 | users(userPointer) 31 | } 32 | 33 | def getSite():String = { 34 | val sitePointer = random.nextInt(3) 35 | sites(sitePointer) 36 | } 37 | 38 | def click() : Double = { 39 | random.nextInt(10) 40 | } 41 | 42 | def main(args: Array[String]): Unit = { 43 | val topics = "user_events_mysql" 44 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 45 | val props = new Properties() 46 | props.put("bootstrap.servers",brokers) 47 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 48 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 49 | props.put("request.required.acks", "1") 50 | 51 | // val kafkaConfig = new ProducerConfig(props) 52 | val producer = new KafkaProducer[String,String](props) 53 | while (true) { 54 | val event = new JSONObject() 55 | event 56 | .accumulate("uid", getUserID()) // 用户id 57 | .accumulate("event_time", System.currentTimeMillis.toString) // 点击时间 58 | .accumulate("os_type", getSite()) // 终端类型 59 | .accumulate("click_count", click()) // 点击次数 60 | 61 | // produce event message 62 | producer.send(new ProducerRecord[String,String](topics,event.toString())) 63 | println("Message sent: " + event.toString) 64 | 65 | Thread.sleep(200) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/UserClickCountAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql 2 | 3 | import net.sf.json.JSONObject 4 | import org.apache.kafka.common.serialization.StringDeserializer 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 7 | import org.apache.spark.streaming.kafka010.KafkaUtils 8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | 11 | /** 12 | * 读取kafka中的数据,结果存在mysql中 13 | * 实现实时统计每个用户的点击次数,它是按照用户分组进行累加次数,逻辑比较简单 14 | * 
关键是在实现过程中要注意一些问题,如对象序列化等 15 | */ 16 | object UserClickCountAnalytics { 17 | def main(args: Array[String]): Unit = { 18 | // 创建 SparkConf 和 StreamingContext 19 | val master = if (args.length > 0) args(0) else "local[1]" 20 | val conf = new SparkConf().setMaster(master).setAppName("UserClickCountAnalytics") 21 | val ssc = new StreamingContext(conf, Seconds(1)) // 按5S来划分一个微批处理 22 | 23 | // kafka 配置:消费Kafka 中,topic为 user_events的消息 24 | val topics = Array("user_events_mysql") 25 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 26 | // 读取kafka数据 27 | val kafkaParams = Map[String,Object]( 28 | "bootstrap.servers" -> brokers, 29 | "key.deserializer" -> classOf[StringDeserializer], 30 | "value.deserializer" -> classOf[StringDeserializer], 31 | "group.id" -> "UserClickCountAnalytics_group", 32 | "auto.offset.reset" -> "latest", 33 | "enable.auto.commit" -> (false: java.lang.Boolean) 34 | ) 35 | // redis 存储 36 | val dbIndex = 2 37 | val clickHashKey = "app::user:click" 38 | 39 | // 获取日志数据 40 | val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)) 41 | val events = kafkaStream.flatMap( 42 | line => { 43 | val data = JSONObject.fromObject(line.value()) 44 | Some(data) 45 | }) 46 | 47 | // 统计用户点击次数 根据uid 统计 click_count 48 | val userClicks = events.map(x => {(x.getString("uid"),x.getInt("click_count"))}) // 计算每个微批处理的统计结果 49 | .reduceByKey(_+_) 50 | userClicks.foreachRDD(rdd => { 51 | rdd.foreachPartition(partitionOfRecords => { 52 | partitionOfRecords.foreach(pair => { 53 | // 创建连接池 54 | val dataSource = DruidConnectionPool.getInstance().dataSource 55 | val conn = dataSource.getConnection 56 | val uid = pair._1 57 | val clickCount = pair._2 58 | val sql_isExist = "SELECT * from streaming where uid = '" + uid + "'" 59 | val sql_insert = "insert into streaming(uid,clickCount) values('" + uid + "'," + clickCount + ")" 60 | val ps = conn.prepareStatement(sql_isExist) 61 | val resultSet = ps.executeQuery() 62 | if (resultSet.next()) { 63 | val count = resultSet.getString(2).toInt + clickCount.toInt 64 | val sql_update = "update streaming set clickCount ='" + count + "' where uid = '" + uid + "'" 65 | val ps = conn.prepareStatement(sql_update) 66 | ps.executeUpdate() 67 | resultSet.close() 68 | } else { 69 | val ps = conn.prepareStatement(sql_insert) 70 | ps.executeUpdate() 71 | } 72 | ps.close() 73 | conn.close() 74 | }) 75 | }) 76 | }) 77 | ssc.start() 78 | ssc.awaitTermination() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/UserClickCountByWindowAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql 2 | 3 | import net.sf.json.JSONObject 4 | import org.apache.kafka.common.serialization.StringDeserializer 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 7 | import org.apache.spark.streaming.kafka010.KafkaUtils 8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | 11 | /** 12 | * 每5秒 统计 过去10秒 每种终端 收到的点击量 13 | * 14 | * 注意: 15 | * 1. 使用 窗口计算需要设置检查点 checkpoint 16 | * 2. 窗口滑动长度和窗口长度一定要是SparkStreaming微批处理时间的整数倍,不然会报错. 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/UserClickCountByWindowAnalytics.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_mysql
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
7 | import org.apache.spark.streaming.kafka010.KafkaUtils
8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
9 | import org.apache.spark.streaming.{Seconds, StreamingContext}
10 | 
11 | /**
12 |  * Every 5 seconds, count the clicks received by each device type over the past 10 seconds.
13 |  *
14 |  * Notes:
15 |  * 1. Window computations that use an inverse reduce function require a checkpoint directory.
16 |  * 2. Both the window length and the slide interval must be integer multiples of the Spark Streaming batch interval, otherwise an error is thrown.
17 |  */
18 | object UserClickCountByWindowAnalytics {
19 |   def main(args: Array[String]): Unit = {
20 |     // Create SparkConf and StreamingContext
21 |     val master = if (args.length > 0) args(0) else "local[1]"
22 |     // Checkpoint directory
23 |     val checkpointDir = if (args.length > 1) args(1) else "data/checkpoint/mysql/UserClickCountByWindowAnalytics"
24 |     val conf = new SparkConf().setMaster(master).setAppName("UserClickCountByWindowAnalytics")
25 |     val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
26 |     // Enable checkpointing
27 |     ssc.checkpoint(checkpointDir)
28 | 
29 |     // Kafka configuration: consume messages from the user_events_mysql topic
30 |     val topics = Array("user_events_mysql")
31 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
32 |     // Kafka consumer parameters
33 |     val kafkaParams = Map[String,Object](
34 |       "bootstrap.servers" -> brokers,
35 |       "key.deserializer" -> classOf[StringDeserializer],
36 |       "value.deserializer" -> classOf[StringDeserializer],
37 |       "group.id" -> "UserClickCountByWindowAnalytics_group",
38 |       "auto.offset.reset" -> "latest",
39 |       "enable.auto.commit" -> (false: java.lang.Boolean)
40 |     )
41 |     // Redis storage settings (left over from the Redis variant; unused in this MySQL job)
42 |     val dbIndex = 2
43 |     val clickHashKey = "app::os_type:click"
44 | 
45 |     // Create the direct Kafka stream
46 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
47 |     val events = kafkaStream.flatMap(
48 |       line => {
49 |         val data = JSONObject.fromObject(line.value())
50 |         Some(data)
51 |       })
52 | 
53 |     // Every 5 seconds, count clicks per os_type over the past 10 seconds
54 |     val userClicks = events.map(x => {(x.getString("os_type"),x.getInt("click_count"))})
55 |       .reduceByKeyAndWindow(_+_,_-_,Seconds(10),Seconds(5)) // add new values, subtract expired values, 10-second window, 5-second slide
56 |     // userClicks.foreachRDD(rdd =>{rdd.foreach(println(_))}) // for inspecting the data format during testing
57 |     userClicks.foreachRDD(rdd => {
58 |       rdd.foreachPartition(partitionOfRecords => {
59 |         partitionOfRecords.foreach(pair => {
60 |           // Borrow a connection from the Druid connection pool
61 |           val dataSource = DruidConnectionPool.getInstance().dataSource
62 |           val conn = dataSource.getConnection
63 |           val os_type = pair._1
64 |           val clickCount = pair._2
65 |           val sql_isExist = "SELECT * from streaming_ostype where os_type = '" + os_type + "'"
66 |           val sql_insert = "insert into streaming_ostype(os_type,clickCount) values('" + os_type + "'," + clickCount + ")"
67 |           val ps = conn.prepareStatement(sql_isExist)
68 |           val resultSet = ps.executeQuery()
69 |           if (resultSet.next()) {
70 |             val count = resultSet.getString(2).toInt + clickCount
71 |             val sql_update = "update streaming_ostype set clickCount ='" + count + "' where os_type = '" + os_type + "'"
72 |             val updateStmt = conn.prepareStatement(sql_update)
73 |             updateStmt.executeUpdate(); updateStmt.close()
74 |           }
75 |           else {
76 |             val insertStmt = conn.prepareStatement(sql_insert)
77 |             insertStmt.executeUpdate(); insertStmt.close()
78 |           }
79 |           resultSet.close(); ps.close()
80 |           conn.close()
81 |         })
82 |       })
83 |     })
84 |     ssc.start()
85 |     ssc.awaitTermination()
86 |   }
87 | 
88 | }
89 | 
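As the class comment notes, it is the inverse reduce function (_-_) that forces checkpointing. When recomputing the whole window is acceptable, the simpler reduceByKeyAndWindow overload from the same Spark Streaming API drops that requirement; a minimal sketch reusing the events stream defined above:

// Recompute the full 10-second window on every 5-second slide; no inverse function, no checkpoint required.
val windowedClicks = events
  .map(x => (x.getString("os_type"), x.getInt("click_count")))
  .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))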
"e20f8e06-7717-4236-87f0-484a82f00b52", 15 | "293901ca-9a58-4ef9-8c01-fa3c766ca236", "2b175ac2-f1a6-4fcc-a437-d2f01828b493", 16 | "27e51fd9-2be9-405c-b81a-b34e2f6379dd", "f3f2c74d-5fe0-4cce-8ce1-a2bdd5ad82b8", 17 | "ef062789-6214-493d-8aad-4b15f91ec5d3", "569e4b06-9301-4a9d-842c-1e6aa9b4f39b", 18 | "7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d") 19 | 20 | private val sites = Array( 21 | "Android","IOS","PC" 22 | ) 23 | 24 | private val random = new Random() 25 | 26 | def getUserID():String = { 27 | val userPointer = random.nextInt(10) 28 | users(userPointer) 29 | } 30 | 31 | def getSite():String = { 32 | val sitePointer = random.nextInt(3) 33 | sites(sitePointer) 34 | } 35 | 36 | def click() : Double = { 37 | random.nextInt(10) 38 | } 39 | 40 | def main(args: Array[String]): Unit = { 41 | val topics = "user_events_zk" 42 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 43 | val props = new Properties() 44 | props.put("bootstrap.servers",brokers) 45 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 46 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 47 | props.put("request.required.acks", "1") 48 | 49 | val producer = new KafkaProducer[String,String](props) 50 | while (true) { 51 | val event = new JSONObject() 52 | event 53 | .accumulate("uid", getUserID()) // 用户id 54 | .accumulate("event_time", System.currentTimeMillis.toString) // 点击时间 55 | .accumulate("os_type", getSite()) // 终端类型 56 | .accumulate("click_count", click()) // 点击次数 57 | 58 | // produce event message 59 | producer.send(new ProducerRecord[String,String](topics,event.toString())) 60 | println("Message sent: " + event.toString) 61 | 62 | Thread.sleep(200) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK/UserClickCountAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_offsetToZK 2 | 3 | import java.lang 4 | import kafka.utils.{ZKGroupTopicDirs, ZkUtils} 5 | import net.sf.json.JSONObject 6 | import org.apache.kafka.clients.consumer.ConsumerRecord 7 | import org.apache.kafka.common.TopicPartition 8 | import org.apache.kafka.common.serialization.StringDeserializer 9 | import org.apache.spark.{SparkConf, TaskContext} 10 | import org.apache.spark.streaming.dstream.InputDStream 11 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 12 | import org.apache.spark.streaming.{Seconds, StreamingContext} 13 | import org.apache.spark.streaming.kafka010._ 14 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 15 | import org.sparkStreaming.kafka_sparkStreaming_mysql.DruidConnectionPool 16 | 17 | /** 18 | * 读取kafka中的数据,结果存在mysql中 19 | * 实现实时统计每个用户的点击次数,它是按照用户分组进行累加次数,逻辑比较简单 20 | * 关键是在实现过程中要注意一些问题,如对象序列化等 21 | * 22 | * 手动管理 Kafka 偏移量 存储在 ZK 当中 23 | */ 24 | object UserClickCountAnalytics { 25 | 26 | def main(args: Array[String]): Unit = { 27 | // 创建 SparkConf 和 StreamingContext 28 | val master = if (args.length > 0) args(0) else "local[*]" 29 | val conf = new SparkConf().setMaster(master).setAppName("text") 30 | val ssc = new StreamingContext(conf, Seconds(1)) // 按5S来划分一个微批处理 31 | 32 | // kafka 配置:消费Kafka 中,topic为 user_events的消息 33 | val topicStr = "user_events_zk" 34 | val topics = Array(topicStr) 35 | val brokers = 
"192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 36 | // 读取kafka数据 37 | val kafkaParams = Map[String, Object]( 38 | "bootstrap.servers" -> brokers, 39 | "key.deserializer" -> classOf[StringDeserializer], 40 | "value.deserializer" -> classOf[StringDeserializer], 41 | "group.id" -> "offsetToZk_test_group", 42 | "auto.offset.reset" -> "latest", 43 | "enable.auto.commit" -> (false: lang.Boolean) 44 | ) 45 | 46 | var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null 47 | // ZK 相关 48 | val zk_host = "192.168.183.150:2181,192.168.183.151:2181,192.168.183.152:2181" 49 | val zkClient = ZkUtils.createZkClient(zk_host, 60000, 60000) 50 | 51 | //创建一个 ZKGroupTopicDirs 对象 52 | val topicDirs = new ZKGroupTopicDirs("offsetToZk_test_group", topicStr) 53 | //获取 zookeeper 中的路径,这里会变成 /consumers/test_spark_streaming_group/offsets/topic_name 54 | val zkTopicPath = s"${topicDirs.consumerOffsetDir}" 55 | //查询该路径下是否字节点(默认有字节点为我们自己保存不同 partition 时生成的) 56 | val children = zkClient.countChildren(zkTopicPath) 57 | //如果 zookeeper 中有保存 offset,我们会利用这个 offset 作为 kafkaStream 的起始位置 58 | var fromOffsets: Map[TopicPartition, Long] = Map() 59 | 60 | if (children > 0) { // 在有记录的情况下 => 从节点获取存储的offset 61 | fromOffsets = new ZkKafkaOffsetManager(zk_host).readOffsets(topics, "offsetToZk_test_group") 62 | kafkaStream = KafkaUtils.createDirectStream(ssc, PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, fromOffsets)) 63 | } else { // 如果ZK不存在此路径 => 创建该节点及其父节点 64 | kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)) 65 | } 66 | kafkaStream.foreachRDD { rdd => 67 | // 数据处理 =====> 开始 68 | val events = rdd.flatMap( 69 | line => { 70 | val data = JSONObject.fromObject(line.value()) 71 | Some(data) 72 | }) 73 | // 统计用户点击次数 根据uid 统计 click_count 74 | val userClicks = events.map(x => { 75 | (x.getString("uid"), x.getInt("click_count")) 76 | }) // 计算每个微批处理的统计结果 77 | .reduceByKey(_ + _) 78 | userClicks.foreachPartition { iter => 79 | iter.foreach(pair => { 80 | // 创建连接池 81 | val dataSource = DruidConnectionPool.getInstance().dataSource 82 | val conn = dataSource.getConnection 83 | val uid = pair._1 84 | val clickCount = pair._2 85 | val sql_isExist = "SELECT * from streaming where uid = '" + uid + "'" 86 | val sql_insert = "insert into streaming(uid,clickCount) values('" + uid + "'," + clickCount + ")" 87 | val ps = conn.prepareStatement(sql_isExist) 88 | val resultSet = ps.executeQuery() 89 | if (resultSet.next()) { 90 | val count = resultSet.getString(2).toInt + clickCount.toInt 91 | val sql_update = "update streaming set clickCount ='" + count + "' where uid = '" + uid + "'" 92 | val ps = conn.prepareStatement(sql_update) 93 | ps.executeUpdate() 94 | resultSet.close() 95 | } else { 96 | val ps = conn.prepareStatement(sql_insert) 97 | ps.executeUpdate() 98 | } 99 | ps.close() 100 | conn.close() 101 | }) 102 | } 103 | // 数据处理 =====> 结束 =====> 数据处理完毕之后,获取偏移量offset,并保存在 ZK 中 104 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 105 | new ZkKafkaOffsetManager(zk_host).saveOffsets(offsetRanges, "offsetToZk_test_group") 106 | } 107 | ssc.start() 108 | ssc.awaitTermination() 109 | } 110 | } -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK/ZkKafkaOffsetManager.scala: -------------------------------------------------------------------------------- 1 | package 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK/ZkKafkaOffsetManager.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_offsetToZK
2 | 
3 | import kafka.utils.{ZkUtils, ZKGroupTopicDirs}
4 | import org.apache.kafka.common.TopicPartition
5 | import org.apache.spark.streaming.kafka010.OffsetRange
6 | import org.slf4j.LoggerFactory
7 | 
8 | import scala.collection.mutable
9 | 
10 | /**
11 |  * Utility class for manually managing Kafka offsets in ZooKeeper from Spark Streaming.
12 |  * @param zkUrl ZooKeeper connection string
13 |  */
14 | class ZkKafkaOffsetManager(zkUrl: String) {
15 |   private val logger = LoggerFactory.getLogger(classOf[ZkKafkaOffsetManager])
16 | 
17 |   private val zkClientAndConn = ZkUtils.createZkClientAndConnection(zkUrl, 60000, 60000)
18 |   private val zkUtils = new ZkUtils(zkClientAndConn._1, zkClientAndConn._2, false)
19 | 
20 |   /**
21 |    * Read offsets from ZooKeeper.
22 |    * @param topics
23 |    * @param groupId
24 |    * @return
25 |    */
26 |   def readOffsets(topics: Seq[String], groupId: String): Map[TopicPartition, Long] = {
27 |     val offsets = mutable.HashMap.empty[TopicPartition, Long]
28 |     val partitionsForTopics = zkUtils.getPartitionsForTopics(topics)
29 | 
30 |     // /consumers/<groupId>/offsets/<topic>/<partition>
31 |     partitionsForTopics.foreach(partitions => {
32 |       val topic = partitions._1
33 |       val groupTopicDirs = new ZKGroupTopicDirs(groupId, topic)
34 | 
35 |       partitions._2.foreach(partition => {
36 |         val path = groupTopicDirs.consumerOffsetDir + "/" + partition
37 |         try {
38 |           val data = zkUtils.readData(path)
39 |           if (data != null) {
40 |             offsets.put(new TopicPartition(topic, partition), data._1.toLong)
41 |             logger.info(
42 |               "Read offset - topic={}, partition={}, offset={}, path={}",
43 |               Seq[AnyRef](topic, partition.toString, data._1, path): _*
44 |             )
45 |           }
46 |         } catch {
47 |           case ex: Exception =>
48 |             offsets.put(new TopicPartition(topic, partition), 0L)
49 |             logger.info(
50 |               "Read offset - not exist: {}, topic={}, partition={}, path={}",
51 |               Seq[AnyRef](ex.getMessage, topic, partition.toString, path): _*
52 |             )
53 |         }
54 |       })
55 |     })
56 | 
57 |     offsets.toMap
58 |   }
59 | 
60 |   /**
61 |    * Save offsets to ZooKeeper.
62 |    * @param offsetRanges
63 |    * @param groupId
64 |    */
65 |   def saveOffsets(offsetRanges: Seq[OffsetRange], groupId: String): Unit = {
66 |     offsetRanges.foreach(range => {
67 |       val groupTopicDirs = new ZKGroupTopicDirs(groupId, range.topic)
68 |       val path = groupTopicDirs.consumerOffsetDir + "/" + range.partition
69 |       zkUtils.updatePersistentPath(path, range.untilOffset.toString)
70 |       logger.info(
71 |         "Save offset - topic={}, partition={}, offset={}, path={}",
72 |         Seq[AnyRef](range.topic, range.partition.toString, range.untilOffset.toString, path): _*
73 |       )
74 |     })
75 |   }
76 | }
77 | 
"7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d") 21 | 22 | private val sites = Array( 23 | "Android","IOS","PC" 24 | ) 25 | 26 | private val random = new Random() 27 | 28 | def getUserID():String = { 29 | val userPointer = random.nextInt(10) 30 | users(userPointer) 31 | } 32 | 33 | def getSite():String = { 34 | val sitePointer = random.nextInt(3) 35 | sites(sitePointer) 36 | } 37 | 38 | def click() : Double = { 39 | random.nextInt(10) 40 | } 41 | 42 | def main(args: Array[String]): Unit = { 43 | val topics = "user_events_redis" 44 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 45 | val props = new Properties() 46 | props.put("bootstrap.servers",brokers) 47 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 48 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 49 | props.put("request.required.acks", "1") 50 | 51 | // val kafkaConfig = new ProducerConfig(props) 52 | val producer = new KafkaProducer[String,String](props) 53 | while (true) { 54 | val event = new JSONObject() 55 | event 56 | .accumulate("uid", getUserID()) // 用户id 57 | .accumulate("event_time", System.currentTimeMillis.toString) // 点击时间 58 | .accumulate("os_type", getSite()) // 终端类型 59 | .accumulate("click_count", click()) // 点击次数 60 | 61 | // produce event message 62 | producer.send(new ProducerRecord[String,String](topics,event.toString())) 63 | println("Message sent: " + event.toString) 64 | 65 | Thread.sleep(200) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/RedisClient.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_redis 2 | 3 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig 4 | import redis.clients.jedis.JedisPool 5 | 6 | /** 7 | * Redis 客户端 8 | */ 9 | object RedisClient extends Serializable { 10 | val redisHost = "127.0.0.1" 11 | val redisPort = 6379 12 | val redisTimeout = 30000 13 | val redisPassword = "root" 14 | lazy val pool = new JedisPool(new GenericObjectPoolConfig(), redisHost, redisPort, redisTimeout, redisPassword) 15 | 16 | lazy val hook = new Thread { 17 | override def run = { 18 | println("Execute hook thread: " + this) 19 | pool.destroy() 20 | } 21 | } 22 | sys.addShutdownHook(hook.run) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/UserClickCountAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_redis 2 | 3 | import net.sf.json.JSONObject 4 | import org.apache.kafka.common.serialization.StringDeserializer 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 7 | import org.apache.spark.streaming.kafka010.KafkaUtils 8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | 11 | /** 12 | * 读取kafka中的数据,结果存在redis中 13 | * 实现实时统计每个用户的点击次数,它是按照用户分组进行累加次数,逻辑比较简单 14 | * 关键是在实现过程中要注意一些问题,如对象序列化等 15 | */ 16 | object UserClickCountAnalytics { 17 | def main(args: Array[String]): Unit = { 18 | // 创建 SparkConf 和 StreamingContext 19 | val master = if (args.length > 0) args(0) else "local[1]" 20 | val 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/UserClickCountAnalytics.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_redis
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
7 | import org.apache.spark.streaming.kafka010.KafkaUtils
8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
9 | import org.apache.spark.streaming.{Seconds, StreamingContext}
10 | 
11 | /**
12 |  * Reads data from Kafka and stores the results in Redis.
13 |  * Computes each user's click count in real time by grouping on uid and summing; the logic itself is simple.
14 |  * The key point is to watch out for issues such as object serialization along the way.
15 |  */
16 | object UserClickCountAnalytics {
17 |   def main(args: Array[String]): Unit = {
18 |     // Create SparkConf and StreamingContext
19 |     val master = if (args.length > 0) args(0) else "local[1]"
20 |     val conf = new SparkConf().setMaster(master).setAppName("UserClickCountAnalytics")
21 |     val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
22 | 
23 |     // Kafka configuration: consume messages from the user_events_redis topic
24 |     val topics = Array("user_events_redis")
25 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
26 |     // Kafka consumer parameters
27 |     val kafkaParams = Map[String,Object](
28 |       "bootstrap.servers" -> brokers,
29 |       "key.deserializer" -> classOf[StringDeserializer],
30 |       "value.deserializer" -> classOf[StringDeserializer],
31 |       "group.id" -> "UserClickCountAnalytics_group",
32 |       "auto.offset.reset" -> "latest",
33 |       "enable.auto.commit" -> (false: java.lang.Boolean)
34 |     )
35 |     // Redis storage settings
36 |     val dbIndex = 2
37 |     val clickHashKey = "app::user:click"
38 | 
39 |     // Create the direct Kafka stream
40 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
41 |     val events = kafkaStream.flatMap(
42 |       line => {
43 |         val data = JSONObject.fromObject(line.value())
44 |         Some(data)
45 |       })
46 | 
47 |     // Count clicks per user: sum click_count by uid (the running total is accumulated in Redis)
48 |     val userClicks = events.map(x => {(x.getString("uid"),x.getInt("click_count"))}) // per-micro-batch aggregation
49 |       .reduceByKey(_+_)
50 |     userClicks.foreachRDD(rdd => {
51 |       rdd.foreachPartition(partitionOfRecords => {
52 |         partitionOfRecords.foreach(pair => {
53 |           val jedis = RedisClient.pool.getResource
54 |           jedis.select(dbIndex)
55 |           val uid = pair._1
56 |           val clickCount = pair._2
57 |           jedis.hincrBy(clickHashKey, uid, clickCount) // increment field uid in hash clickHashKey by clickCount (accumulates each micro-batch's per-uid result)
58 |           RedisClient.pool.returnResource(jedis)
59 |         })
60 |       })
61 |     })
62 |     ssc.start()
63 |     ssc.awaitTermination()
64 |   }
65 | }
66 | 
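A quick way to verify the running totals written above is to read the whole hash back from Redis with the same pool and keys; a minimal sketch:

import scala.collection.JavaConverters._

// Dump the accumulated per-user totals from db 2 (same dbIndex and clickHashKey as above).
val jedis = RedisClient.pool.getResource
try {
  jedis.select(2)
  jedis.hgetAll("app::user:click").asScala.foreach { case (uid, count) =>
    println(s"$uid -> $count")
  }
} finally {
  jedis.close()
}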
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/UserClickCountByWindowAnalytics.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_redis
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
7 | import org.apache.spark.streaming.kafka010.KafkaUtils
8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
9 | import org.apache.spark.streaming.{Seconds, StreamingContext}
10 | 
11 | /**
12 |  * Every 5 seconds, count the clicks received by each device type over the past 10 seconds.
13 |  *
14 |  * Notes:
15 |  * 1. Window computations that use an inverse reduce function require a checkpoint directory.
16 |  * 2. Both the window length and the slide interval must be integer multiples of the Spark Streaming batch interval, otherwise an error is thrown.
17 |  */
18 | object UserClickCountByWindowAnalytics {
19 |   def main(args: Array[String]): Unit = {
20 |     // Create SparkConf and StreamingContext
21 |     val master = if (args.length > 0) args(0) else "local[1]"
22 |     // Checkpoint directory
23 |     val checkpointDir = if (args.length > 1) args(1) else "data/checkpoint/redis/UserClickCountByWindowAnalytics"
24 |     val conf = new SparkConf().setMaster(master).setAppName("UserClickCountByWindowAnalytics")
25 |     val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
26 |     // Enable checkpointing
27 |     ssc.checkpoint(checkpointDir)
28 | 
29 |     // Kafka configuration: consume messages from the user_events_redis topic
30 |     val topics = Array("user_events_redis")
31 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
32 |     // Kafka consumer parameters
33 |     val kafkaParams = Map[String,Object](
34 |       "bootstrap.servers" -> brokers,
35 |       "key.deserializer" -> classOf[StringDeserializer],
36 |       "value.deserializer" -> classOf[StringDeserializer],
37 |       "group.id" -> "UserClickCountByWindowAnalytics_group",
38 |       "auto.offset.reset" -> "latest",
39 |       "enable.auto.commit" -> (false: java.lang.Boolean)
40 |     )
41 |     // Redis storage settings
42 |     val dbIndex = 2
43 |     val clickHashKey = "app::os_type:click"
44 | 
45 |     // Create the direct Kafka stream
46 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
47 |     val events = kafkaStream.flatMap(
48 |       line => {
49 |         val data = JSONObject.fromObject(line.value())
50 |         Some(data)
51 |       })
52 | 
53 |     // Every 5 seconds, count clicks per os_type over the past 10 seconds
54 |     val userClicks = events.map(x => {(x.getString("os_type"),x.getInt("click_count"))})
55 |       .reduceByKeyAndWindow(_+_,_-_,Seconds(10),Seconds(5)) // add new values, subtract expired values, 10-second window, 5-second slide
56 |     // userClicks.foreachRDD(rdd =>{rdd.foreach(println(_))}) // for inspecting the data format during testing
57 |     userClicks.foreachRDD(rdd => {
58 |       rdd.foreachPartition(partitionOfRecords => {
59 |         partitionOfRecords.foreach(pair => {
60 |           val jedis = RedisClient.pool.getResource
61 |           jedis.select(dbIndex)
62 |           val os_type = pair._1
63 |           val clickCount = pair._2
64 |           jedis.lpush(os_type,String.valueOf(clickCount)) // push this window's count onto a per-os_type list
65 |           RedisClient.pool.returnResource(jedis)
66 |         })
67 |       })
68 |     })
69 |     ssc.start()
70 |     ssc.awaitTermination()
71 |   }
72 | 
73 | }
74 | 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/sparkStreamingExactltyOnce/KafkaEventProducer.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.sparkStreamingExactltyOnce
2 | 
3 | import java.util.Properties
4 | 
5 | import net.sf.json.JSONObject
6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
7 | 
8 | import scala.util.Random
9 | 
10 | /**
11 |  * Simulated Kafka producer that continuously writes user-behavior events as JSON messages.
12 |  */
13 | object KafkaEventProducer {
14 | 
15 |   private val users = Array(
16 |     "df354f90-5acd-4c55-a3e2-adc045f628c3", "e20f8e06-7717-4236-87f0-484a82f00b52",
17 |     "293901ca-9a58-4ef9-8c01-fa3c766ca236", "2b175ac2-f1a6-4fcc-a437-d2f01828b493",
18 |     "27e51fd9-2be9-405c-b81a-b34e2f6379dd", "f3f2c74d-5fe0-4cce-8ce1-a2bdd5ad82b8",
19 |     "ef062789-6214-493d-8aad-4b15f91ec5d3", "569e4b06-9301-4a9d-842c-1e6aa9b4f39b",
20 |     "7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d")
21 | 
22 |   private val sites = Array(
23 |     "Android","IOS","PC"
24 |   )
25 | 
26 |   private val random = new Random()
27 | 
28 |   def getUserID(): String = {
29 |     val userPointer = random.nextInt(10)
30 |     users(userPointer)
31 |   }
32 | 
33 |   def getSite(): String = {
34 |     val sitePointer = random.nextInt(3)
35 |     sites(sitePointer)
36 |   }
37 | 
38 |   def click(): Double = {
39 |     random.nextInt(10)
40 |   }
41 | 
42 |   def main(args: Array[String]): Unit = {
43 |     val topics = "user_events_ExactltyOnce"
44 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
45 |     val props = new Properties()
46 |     props.put("bootstrap.servers",brokers)
47 |     props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
48 |     props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
49 |     props.put("acks", "1") // "acks" is the config name for the new producer API ("request.required.acks" was the legacy name)
50 | 
51 |     // val kafkaConfig = new ProducerConfig(props)
52 |     val producer = new KafkaProducer[String,String](props)
53 |     while (true) {
54 |       val event = new JSONObject()
55 |       event
56 |         .accumulate("uid", getUserID()) // user id
57 |         .accumulate("event_time", System.currentTimeMillis.toString) // click timestamp (ms)
58 |         .accumulate("os_type", getSite()) // device type
59 |         .accumulate("click_count", click()) // click count
60 | 
61 |       // produce event message
62 |       producer.send(new ProducerRecord[String,String](topics,event.toString()))
63 |       println("Message sent: " + event.toString)
64 | 
65 |       Thread.sleep(200)
66 |     }
67 |   }
68 | }
69 | 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/sparkStreamingExactltyOnce/SparkStreamingExactlyOnce.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.sparkStreamingExactltyOnce
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
8 | import org.apache.spark.streaming.kafka010.KafkaUtils
9 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
10 | import org.sparkStreaming.kafka_sparkStreaming_mysql.DruidConnectionPool
11 | 
12 | /**
13 |  * How to achieve exactly-once semantics in Spark Streaming.
14 |  *
15 |  * Exactly-once semantics is one of the hard problems in real-time computing.
16 |  * Guaranteeing that every record is processed exactly once, with nothing lost even when servers or the network fail,
17 |  * requires not only support from the streaming framework itself, but also cooperation from the upstream messaging system and the downstream data store.
18 |  * In addition, the processing logic has to follow certain rules before exactly-once is truly achieved.
19 |  */
20 | object SparkStreamingExactlyOnce {
21 | 
22 |   def main(args: Array[String]): Unit = {
23 |     // Create SparkConf and StreamingContext
24 |     val master = if (args.length > 0) args(0) else "local[1]"
25 |     // Checkpoint directory
26 |     val checkpointDir = if (args.length > 1) args(1) else "data/checkpoint/exactlyOnce/SparkStreamingExactlyOnce"
27 |     // Create SparkConf
28 |     val conf = new SparkConf().setMaster(master).setAppName("SparkStreamingExactlyOnce")
29 | 
30 |     // Kafka configuration: consume messages from the user_events_ExactltyOnce topic
31 |     val brokers = if (args.length > 2) args(2) else "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
32 |     val topicNames = if (args.length > 3) args(3) else "user_events_ExactltyOnce"
33 | 
34 |     def createSSC(): StreamingContext = {
35 |       val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
36 |       kafkaTest(ssc,brokers,topicNames) // set up the Spark transformations and actions
37 |       ssc.checkpoint(checkpointDir)
38 |       ssc
39 |     }
40 | 
41 |     // On restart, recover the context from the checkpoint instead of rebuilding it
42 |     val ssc = StreamingContext.getOrCreate(checkpointDir,createSSC)
43 | 
44 |     ssc.start()
45 |     ssc.awaitTermination()
46 |   }
47 | 
48 |   /**
49 |    * Transformations and actions for consuming the Kafka data.
50 |    * @param ssc
51 |    * @param brokers
52 |    * @param topicNames
53 |    */
54 |   def kafkaTest(ssc: StreamingContext, brokers: String, topicNames: String): Unit = {
55 | 
56 |     val topics = Array(topicNames)
57 |     // Kafka parameters
58 |     val kafkaParams = Map[String,Object](
59 |       "bootstrap.servers" -> brokers,
60 |       "key.deserializer" -> classOf[StringDeserializer],
61 |       "value.deserializer" -> classOf[StringDeserializer],
62 |       "group.id" -> "SparkStreamingExactlyOnce_group",
63 |       "auto.offset.reset" -> "latest",
64 |       "enable.auto.commit" -> (false: java.lang.Boolean)
65 |     )
66 |     // Create the stream; the direct API (KafkaUtils.createDirectStream) is the basis for exactly-once semantics.
67 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
68 |     val events = kafkaStream.flatMap(
69 |       line => {
70 |         val data = JSONObject.fromObject(line.value())
71 |         Some(data)
72 |       })
73 | 
74 |     // Count clicks per user: sum click_count by uid (the running total is accumulated in MySQL)
75 |     val userClicks = events.map(x => {(x.getString("uid"),x.getInt("click_count"))}) // per-micro-batch aggregation
76 |       .reduceByKey(_+_)
77 |     userClicks.foreachRDD(rdd => {
78 |       rdd.foreachPartition(partitionOfRecords => {
79 |         partitionOfRecords.foreach(pair => {
80 |           // Borrow a connection from the Druid connection pool
81 |           val dataSource = DruidConnectionPool.getInstance().dataSource
82 |           val conn = dataSource.getConnection
83 |           val uid = pair._1
84 |           val clickCount = pair._2
85 |           val sql_isExist = "SELECT * from streaming where uid = '" + uid + "'"
86 |           val sql_insert = "insert into streaming(uid,clickCount) values('" + uid + "'," + clickCount + ")"
87 |           val stmt = conn.createStatement(); val resultSet = stmt.executeQuery(sql_isExist)
88 |           if (resultSet.next()) {
89 |             val count = resultSet.getString(2).toInt + clickCount
90 |             val sql_update = "update streaming set clickCount ='" + count + "' where uid = '" + uid + "'"
91 |             stmt.executeUpdate(sql_update)
92 |           }
93 |           else stmt.executeUpdate(sql_insert)
94 |           stmt.close(); conn.close()
95 |         })
96 |       })
97 |     })
98 |   }
99 | }
100 | 
--------------------------------------------------------------------------------
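The job above combines the direct API with checkpointing, which by itself still delivers at-least-once output to MySQL. A common way to close the gap is to write the results and the processed Kafka offsets in the same database transaction, so a replayed batch can be detected and skipped. A minimal sketch under that assumption (the stream_offsets table and the writeAtomically helper are hypothetical and not part of this repo; a full solution would also compare the stored offset before reapplying a batch):

import java.sql.Connection
import org.apache.spark.streaming.kafka010.OffsetRange

// Write one partition's aggregates and its Kafka offset range in a single MySQL transaction,
// so the results and the offsets become visible together or not at all.
def writeAtomically(conn: Connection, counts: Iterator[(String, Int)], range: OffsetRange): Unit = {
  conn.setAutoCommit(false)
  try {
    val upsert = conn.prepareStatement(
      "INSERT INTO streaming(uid, clickCount) VALUES(?, ?) " +
      "ON DUPLICATE KEY UPDATE clickCount = clickCount + VALUES(clickCount)")
    counts.foreach { case (uid, delta) =>
      upsert.setString(1, uid); upsert.setInt(2, delta); upsert.executeUpdate()
    }
    upsert.close()
    val saveOffset = conn.prepareStatement(
      "REPLACE INTO stream_offsets(topic, `partition`, untilOffset) VALUES(?, ?, ?)")
    saveOffset.setString(1, range.topic)
    saveOffset.setInt(2, range.partition)
    saveOffset.setLong(3, range.untilOffset)
    saveOffset.executeUpdate(); saveOffset.close()
    conn.commit() // results and offsets committed as one unit
  } catch {
    case e: Exception => conn.rollback(); throw e
  } finally {
    conn.setAutoCommit(true)
    conn.close()
  }
}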