├── LICENSE ├── README.md ├── _config.yml ├── data ├── ml-1m │ ├── README │ ├── movies.dat │ ├── ratings.dat │ └── users.dat └── weblog │ ├── apache.access.log │ └── file_to_stream_utils.sh ├── pom.xml └── src └── main └── scala ├── SparkTest.scala └── org ├── spark ├── App.scala └── movie │ ├── MovieUser.scala │ ├── PopularMovie.scala │ └── TopKMovie.scala ├── sparkSQL ├── ApacheAccessLog │ ├── ApacheAccessLog.scala │ └── LogAnalyzerSQL.scala └── SensorLog │ ├── RedisClient.scala │ ├── SensorRow.scala │ └── SensorStatistics.scala └── sparkStreaming ├── kafka_sparkStreaming_mysql ├── DruidConnectionPool.java ├── KafkaEventProducer.scala ├── UserClickCountAnalytics.scala └── UserClickCountByWindowAnalytics.scala ├── kafka_sparkStreaming_offsetToZK ├── KafkaEventProducer.scala ├── UserClickCountAnalytics.scala └── ZkKafkaOffsetManager.scala ├── kafka_sparkStreaming_redis ├── KafkaEventProducer.scala ├── RedisClient.scala ├── UserClickCountAnalytics.scala └── UserClickCountByWindowAnalytics.scala └── sparkStreamingExactltyOnce ├── KafkaEventProducer.scala └── SparkStreamingExactlyOnce.scala /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | ## Welcome to Higmin/SparkObject
2 | # 1. Spark offline batch processing => movie-audience analysis
3 | ### (Data source: https://grouplens.org/datasets/movielens )
4 | # 2. Spark SQL analysis => web access-log analysis (source: log file) and industrial sensor-data analysis (source: HBase)
5 | # 3. Spark Streaming => real-time user-click statistics (source: Kafka)
6 | Spark movie-audience analysis and real-time user-click statistics (Kafka + SparkStreaming + Redis).
7 | spark, version 2.4.4
8 | spark-streaming_2.12, version 2.4.4
9 | spark-streaming-kafka-0-10_2.12, version 2.4.4
10 | spark-sql_2.12, version 2.4.4
11 |
12 | ### Running locally:
13 | 1. Install Spark 2.4.4 locally, plus Redis (only needed if results go to Redis) and MySQL (only needed if results go to MySQL).
14 | => MySQL setup: user root, password root. Create a database named test,
15 | => create table streaming [uid (varchar 255); clickCount (varchar 255)],
16 | => create table streaming_ostype [os_type (varchar 255); clickCount (varchar 255)].
17 | 2. Download the sample code.
18 | 3. The Kafka + SparkStreaming + Redis and Kafka + SparkStreaming + MySQL examples are run the same way:
19 | 4. start the Kafka producer that simulates the event stream, then
20 | 5. start the Spark Streaming job.
21 | Results can be inspected in Redis or MySQL.
22 |
23 | ## 1. Spark
24 | Task 1: find the users who watched "Sixteen Candles", their gender, and how many times they watched it.
25 | Task 2: find the 10 movies most watched by viewers aged 20-30.
26 | Task 3: find the three most popular movies (the three with the highest average rating).
27 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/spark/movie
28 |
29 | ## 2. Spark SQL analysis
30 | Spark SQL is Spark's module for processing structured data. It provides the DataFrame abstraction and serves as a distributed SQL query engine.
31 | Spark SQL is Hive-compatible; execution-plan generation and optimization are handled by Catalyst. Thanks to Scala features such as pattern matching, writing plan-optimization rules on top of Catalyst is considerably more concise than doing the same in Hive.
32 | About DataFrame and Dataset: both are distributed datasets, but unlike an RDD they carry schema information, much like a table. Dataset was introduced in Spark 1.6 to combine RDD-style strong typing and lambda functions with the Spark SQL optimized execution engine. Since Spark 2.0, a DataFrame is simply a Dataset of Row:
33 | ``
34 | type DataFrame = Dataset[Row]
35 | ``
36 | #### 2.1 A Spark SQL example that analyzes a web-server access log:
37 | ##### See: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkSQL
38 |
39 | #### 2.2 There is also an example that reads data from HBase, converts it to a DataFrame, and analyzes it with Spark SQL:
40 | ##### See: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkSQL/SensorLog
41 | The HBase table is structured as follows (the columns of a single record are listed below):
42 |
43 | rowKey | columnFamily | column | value
44 | --------|------------------|-----------------------|-----------------
45 | 2314035476751 | info | AndroidBoard::availableStorage | 12379
46 | 2314035476751 | info | AndroidBoard::cpu_usage | 4.32
47 | 2314035476751 | info | AndroidBoard::current | 1.3
48 | 2314035476751 | info | AndroidBoard::storage | 12661
49 | 2314035476751 | info | AndroidBoard::voltage | 11.8
50 | 2314035476751 | info | AxialFanSpeed | 56.666666666666664
51 | 2314035476751 | info | BackLight::state | true
52 | 2314035476751 | info | BlEnable::state | true
53 | 2314035476751 | info | CrossFlowFanSpeed::value | 40
54 | 2314035476751 | info | Decibel::value | 40
55 | 2314035476751 | info | DoorState::state | false
56 | 2314035476751 | info | GPS::latitude | 39.950565
57 | 2314035476751 | info | GPS::longitude | 116.500711
58 | 2314035476751 | info | Humidity::value | 40
59 | 2314035476751 | info | Level::value | 80
60 | 2314035476751 | info | PowerState::state | false
61 | 2314035476751 | info | PowerState::value | 0.8
62 | 2314035476751 | info | Temperature | 29.0
63 | 2314035476751 | info | Time | 1576745304132
64 | 2314035476751 | info | TotalPower::value | 900
65 |
66 | ## 3. Kafka + SparkStreaming + Redis
67 | Kafka + SparkStreaming + Redis: a simulated Kafka producer continuously writes user-behavior events as JSON.
68 | Kafka + SparkStreaming + Redis: a Redis client implemented in Scala.
69 | Kafka + SparkStreaming + Redis: real-time click counts per user, accumulated by grouping on the user id.
70 | Kafka + SparkStreaming + Redis: every 5 seconds, count the clicks received from each terminal type over the past 10 seconds (see the sketch after this section).
71 | Notes:
72 | * 1. Window computations in Spark Streaming require a checkpoint directory.
73 | * 2. Both the window length and the slide interval must be integer multiples of the micro-batch interval, otherwise the job fails.
74 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis
75 |
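To make the two notes above concrete, here is a minimal, self-contained sketch of the windowed computation this section describes: every 5 seconds, sum the clicks from each terminal type over the last 10 seconds. It is an illustration only, not the project code (the real jobs live in the linked packages); the broker address, topic name, and checkpoint path are placeholders.

```scala
import net.sf.json.JSONObject
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WindowedClickCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("WindowedClickCountSketch")
    val ssc  = new StreamingContext(conf, Seconds(5))   // 5-second micro-batches
    ssc.checkpoint("data/checkpoint/sketch")            // note 1: windows with an inverse reduce need a checkpoint

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",          // placeholder broker list
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "window_sketch_group",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](Array("user_events"), kafkaParams))

    // Every 5 s, sum click_count per os_type over the past 10 s.
    // Note 2: both the window (10 s) and the slide (5 s) are multiples of the batch interval (5 s).
    stream.map(record => JSONObject.fromObject(record.value()))
      .map(event => (event.getString("os_type"), event.getInt("click_count")))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(5))
      .print()                                          // the real jobs write to Redis or MySQL instead

    ssc.start()
    ssc.awaitTermination()
  }
}
```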
76 | ## 4. Kafka + SparkStreaming + MySQL
77 | Kafka + SparkStreaming + mysql: a simulated Kafka producer continuously writes user-behavior events as JSON.
78 | Kafka + SparkStreaming + mysql: a MySQL connection pool implemented in Java (this example uses Alibaba's open-source Druid pool).
79 | Kafka + SparkStreaming + mysql: real-time click counts per user, accumulated by grouping on the user id.
80 | Kafka + SparkStreaming + mysql: every 5 seconds, count the clicks received from each terminal type over the past 10 seconds.
81 | Notes:
82 | * 1. Window computations in Spark Streaming require a checkpoint directory.
83 | * 2. Both the window length and the slide interval must be integer multiples of the micro-batch interval, otherwise the job fails.
84 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql
85 |
86 | ## 5. Exactly-once semantics in Spark Streaming
87 | Exactly-once semantics are one of the hard problems in stream processing.
88 | Guaranteeing that every record is processed exactly once — with nothing lost even when a server or the network fails — requires support not only from the streaming framework itself but also from the upstream message system and the downstream data store; in addition, the processing logic must follow certain rules before the pipeline is truly exactly-once.
89 |
90 | Stream processing offers three semantics: at-most-once, at-least-once, and exactly-once. A typical Spark Streaming application has three stages: receiving data, transforming/aggregating it, and writing the results. Each stage needs its own measures to achieve the desired semantics.
91 | For receiving data, it mostly depends on the upstream source. Reading files from a fault-tolerant file system such as HDFS directly gives exactly-once. If the upstream message system supports acknowledgements (e.g. RabbitMQ), it can be combined with Spark's Write Ahead Log to achieve at-least-once. With unreliable receivers (e.g. socketTextStream), data can be lost when a Worker or Driver node fails, so the semantics are undefined. Kafka is offset-based, and its Direct API can provide exactly-once.
92 | When transforming or aggregating data with Spark RDDs we get exactly-once for free, because an RDD is a fault-tolerant, immutable, and deterministic data structure: as long as the source data is available and the processing has no side effects, recomputation always yields the same result.
93 |
94 | Exactly-once is the strongest of the three semantics, so it adds extra overhead to an application, and it does not yet combine well with window operations; whether to use it is therefore up to the developer, and in many cases occasional loss or duplication simply does not matter. Still, knowing how an exactly-once pipeline is built is worthwhile and helps in learning Spark Streaming. The output-stage pattern is sketched below.
95 | ##### Code: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/sparkStreamingExactltyOnce
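The linked example implements this end to end. Purely as an illustration of the output stage, the sketch below stores one batch's aggregated counts and the Kafka offset ranges it consumed in a single MySQL transaction; on restart the job resumes from the offsets in `kafka_offsets`, so a batch is either fully applied or not applied at all. The table names, columns, and connection string are hypothetical, not the project's actual schema.

```scala
import java.sql.DriverManager

import org.apache.spark.streaming.kafka010.OffsetRange

object ExactlyOnceOutputSketch {

  /** Write one batch's results and its offsets atomically (placeholder schema). */
  def saveBatch(counts: Array[(String, Int)], offsets: Array[OffsetRange]): Unit = {
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/test", "root", "root")
    try {
      conn.setAutoCommit(false) // results and offsets commit together, or not at all

      val upsert = conn.prepareStatement(
        "INSERT INTO click_totals(uid, clickCount) VALUES (?, ?) " +
          "ON DUPLICATE KEY UPDATE clickCount = clickCount + VALUES(clickCount)")
      counts.foreach { case (uid, n) =>
        upsert.setString(1, uid); upsert.setInt(2, n); upsert.addBatch()
      }
      upsert.executeBatch()

      val saveOffsets = conn.prepareStatement(
        "REPLACE INTO kafka_offsets(topic, part, untilOffset) VALUES (?, ?, ?)")
      offsets.foreach { range =>
        saveOffsets.setString(1, range.topic)
        saveOffsets.setInt(2, range.partition)
        saveOffsets.setLong(3, range.untilOffset)
        saveOffsets.addBatch()
      }
      saveOffsets.executeBatch()

      conn.commit()
    } catch {
      case e: Exception => conn.rollback(); throw e
    } finally {
      conn.close()
    }
  }
}
```

Inside a streaming job this would be called from `foreachRDD`, after extracting `rdd.asInstanceOf[HasOffsetRanges].offsetRanges` and aggregating the batch; on startup the saved offsets are read back and handed to the direct stream as its starting positions.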
96 |
97 | ## 6. Managing Kafka offsets manually with Spark Streaming
98 | To cope with failures that may crash the Streaming application, we generally manage Kafka offsets ourselves instead of letting the consumer auto-commit them, i.e. enable.auto.commit is set to false. Only with properly managed offsets can the whole streaming system get as close as possible to exactly-once semantics.
99 | Offsets can be managed in several ways, but the general workflow is:
100 |
101 | 1. When the Direct DStream is initialized, supply an offset for every partition of every topic so that the stream starts reading from that position
102 | (these are the offsets saved in step 4).
103 | 2. Read and process the messages.
104 |
105 | 3. Store the processing results.
106 |
107 | In the referenced figure, storing and committing offsets is drawn with a dashed circle simply to emphasize that users may take additional steps to satisfy stricter semantics, such as idempotent writes or storing the offsets atomically together with the results.
108 |
109 | 4. Finally, save the offsets to an external persistent store such as HBase, Kafka, HDFS, or ZooKeeper.
110 | ##### Reference: https://blog.csdn.net/rlnLo2pNEfx9c/article/details/79988218
111 | #### 6.1 Storing offsets in Kafka itself (note: commitAsync() comes from the Spark Streaming kafka-0-10 integration; the Spark documentation warns that it is still an experimental API and may change.)
112 | ```scala
113 | stream.foreachRDD { rdd =>
114 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
115 |
116 | // some time later, after outputs have completed
117 | stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
118 | }
119 | ```
120 | #### 6.2 Storing offsets in ZooKeeper or another external store
121 | ##### See: https://github.com/Higmin/SparkObject/tree/master/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK (a sketch of the read/save pattern follows below)
122 |
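The linked package contains the full implementation (ZkKafkaOffsetManager). The sketch below only outlines the read/save pattern, assuming the `kafka.utils.ZkUtils` API that this project already imports elsewhere; the ZooKeeper address, group id, and node layout are placeholders.

```scala
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.apache.kafka.common.TopicPartition

/** Illustrative sketch: one ZooKeeper node per partition under the consumer group's offset path. */
class ZkOffsetStoreSketch(zkServers: String, group: String) {

  // session timeout, connection timeout, ZK security disabled
  private val zkUtils = ZkUtils(zkServers, 30000, 30000, false)

  /** Read previously saved offsets; an empty map means "no saved state yet". */
  def readOffsets(topic: String, partitions: Seq[Int]): Map[TopicPartition, Long] = {
    val offsetDir = new ZKGroupTopicDirs(group, topic).consumerOffsetDir
    partitions.flatMap { partition =>
      val (offsetOpt, _) = zkUtils.readDataMaybeNull(s"$offsetDir/$partition")
      offsetOpt.map(offset => new TopicPartition(topic, partition) -> offset.toLong)
    }.toMap
  }

  /** Save each partition's untilOffset, but only after the batch's results have been written. */
  def saveOffsets(topic: String, offsets: Map[Int, Long]): Unit = {
    val offsetDir = new ZKGroupTopicDirs(group, topic).consumerOffsetDir
    offsets.foreach { case (partition, offset) =>
      zkUtils.updatePersistentPath(s"$offsetDir/$partition", offset.toString)
    }
  }
}
```

The map returned by `readOffsets` can be passed as the starting offsets when creating the direct stream (the overload of `ConsumerStrategies.Subscribe` that takes an offsets map), which is what step 1 of the workflow above refers to.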
123 | #### 6.3 Why not use Spark Streaming's checkpoint?
124 | Spark Streaming's checkpoint mechanism is certainly the easiest option: the checkpoint data lives in HDFS, and if the Streaming application dies it can recover quickly.
125 | However, if the Streaming program's code changes, re-packaging and restarting it causes deserialization errors. The first checkpoint serializes the whole jar so the job can be restored on restart; after re-packaging, the old and new code no longer match, so the job either fails or silently keeps running the old logic.
126 | The only way around this is to delete the checkpoint files on HDFS, but that also throws away the saved Kafka offsets, which defeats the purpose.
127 |
128 |
-------------------------------------------------------------------------------- /_config.yml: --------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
-------------------------------------------------------------------------------- /data/ml-1m/README: --------------------------------------------------------------------------------
1 | SUMMARY
2 | ================================================================================
3 |
4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
5 | made by 6,040 MovieLens users who joined MovieLens in 2000.
6 |
7 | USAGE LICENSE
8 | ================================================================================
9 |
10 | Neither the University of Minnesota nor any of the researchers
11 | involved can guarantee the correctness of the data, its suitability
12 | for any particular purpose, or the validity of results based on the
13 | use of the data set. The data set may be used for any research
14 | purposes under the following conditions:
15 |
16 | * The user may not state or imply any endorsement from the
17 | University of Minnesota or the GroupLens Research Group.
18 |
19 | * The user must acknowledge the use of the data set in
20 | publications resulting from the use of the data set
21 | (see below for citation information).
22 |
23 | * The user may not redistribute the data without separate
24 | permission.
25 |
26 | * The user may not use this information for any commercial or
27 | revenue-bearing purposes without first obtaining permission
28 | from a faculty member of the GroupLens Research Project at the
29 | University of Minnesota.
30 |
31 | If you have any further questions or comments, please contact GroupLens
32 | .
33 |
34 | CITATION
35 | ================================================================================
36 |
37 | To acknowledge use of the dataset in publications, please cite the following
38 | paper:
39 |
40 | F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
41 | and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
42 | Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872
43 |
44 |
45 | ACKNOWLEDGEMENTS
46 | ================================================================================
47 |
48 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
49 | set.
50 |
51 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
52 | ================================================================================
53 |
54 | The GroupLens Research Project is a research group in the Department of
55 | Computer Science and Engineering at the University of Minnesota. Members of
56 | the GroupLens Research Project are involved in many research projects related
57 | to the fields of information filtering, collaborative filtering, and
58 | recommender systems. The project is led by professors John Riedl and Joseph
59 | Konstan. The project began to explore automated collaborative filtering in
60 | 1992, but is most well known for its world wide trial of an automated
61 | collaborative filtering system for Usenet news in 1996.
Since then the project 62 | has expanded its scope to research overall information filtering solutions, 63 | integrating in content-based methods as well as improving current collaborative 64 | filtering technology. 65 | 66 | Further information on the GroupLens Research project, including research 67 | publications, can be found at the following web site: 68 | 69 | http://www.grouplens.org/ 70 | 71 | GroupLens Research currently operates a movie recommender based on 72 | collaborative filtering: 73 | 74 | http://www.movielens.org/ 75 | 76 | RATINGS FILE DESCRIPTION 77 | ================================================================================ 78 | 79 | All ratings are contained in the file "ratings.dat" and are in the 80 | following format: 81 | 82 | UserID::MovieID::Rating::Timestamp 83 | 84 | - UserIDs range between 1 and 6040 85 | - MovieIDs range between 1 and 3952 86 | - Ratings are made on a 5-star scale (whole-star ratings only) 87 | - Timestamp is represented in seconds since the epoch as returned by time(2) 88 | - Each user has at least 20 ratings 89 | 90 | USERS FILE DESCRIPTION 91 | ================================================================================ 92 | 93 | User information is in the file "users.dat" and is in the following 94 | format: 95 | 96 | UserID::Gender::Age::Occupation::Zip-code 97 | 98 | All demographic information is provided voluntarily by the users and is 99 | not checked for accuracy. Only users who have provided some demographic 100 | information are included in this data set. 101 | 102 | - Gender is denoted by a "M" for male and "F" for female 103 | - Age is chosen from the following ranges: 104 | 105 | * 1: "Under 18" 106 | * 18: "18-24" 107 | * 25: "25-34" 108 | * 35: "35-44" 109 | * 45: "45-49" 110 | * 50: "50-55" 111 | * 56: "56+" 112 | 113 | - Occupation is chosen from the following choices: 114 | 115 | * 0: "other" or not specified 116 | * 1: "academic/educator" 117 | * 2: "artist" 118 | * 3: "clerical/admin" 119 | * 4: "college/grad student" 120 | * 5: "customer service" 121 | * 6: "doctor/health care" 122 | * 7: "executive/managerial" 123 | * 8: "farmer" 124 | * 9: "homemaker" 125 | * 10: "K-12 student" 126 | * 11: "lawyer" 127 | * 12: "programmer" 128 | * 13: "retired" 129 | * 14: "sales/marketing" 130 | * 15: "scientist" 131 | * 16: "self-employed" 132 | * 17: "technician/engineer" 133 | * 18: "tradesman/craftsman" 134 | * 19: "unemployed" 135 | * 20: "writer" 136 | 137 | MOVIES FILE DESCRIPTION 138 | ================================================================================ 139 | 140 | Movie information is in the file "movies.dat" and is in the following 141 | format: 142 | 143 | MovieID::Title::Genres 144 | 145 | - Titles are identical to titles provided by the IMDB (including 146 | year of release) 147 | - Genres are pipe-separated and are selected from the following genres: 148 | 149 | * Action 150 | * Adventure 151 | * Animation 152 | * Children's 153 | * Comedy 154 | * Crime 155 | * Documentary 156 | * Drama 157 | * Fantasy 158 | * Film-Noir 159 | * Horror 160 | * Musical 161 | * Mystery 162 | * Romance 163 | * Sci-Fi 164 | * Thriller 165 | * War 166 | * Western 167 | 168 | - Some MovieIDs do not correspond to a movie due to accidental duplicate 169 | entries and/or test entries 170 | - Movies are mostly entered by hand, so errors and inconsistencies may exist 171 | -------------------------------------------------------------------------------- /data/ml-1m/movies.dat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Higmin/Spark-Learning/a5fe1a9db2c86d32fc096fd7f98faffae5c466f8/data/ml-1m/movies.dat -------------------------------------------------------------------------------- /data/weblog/file_to_stream_utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -o nounset 4 | set -o errexit 5 | 6 | test $# -eq 1 || ( echo "Incorrect number of arguments" ; exit 1 ) 7 | 8 | file="$1" 9 | 10 | network_port=9999 11 | lines_in_batch=100 12 | interval_sec=10 13 | 14 | n_lines=$(cat apache.access.log | wc -l) 15 | cursor=1 16 | while test $cursor -le $n_lines 17 | do 18 | tail -n +$cursor $file | head -$lines_in_batch | nc -l $network_port 19 | cursor=$(($cursor + $lines_in_batch)) 20 | sleep $interval_sec 21 | done 22 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.spark 4 | sparkCalculation 5 | 1.0-SNAPSHOT 6 | 2008 7 | 8 | 1.8 9 | 2.12.2 10 | 2.4.4 11 | 0.10.2.2 12 | 3.8.2 13 | 1.2.5 14 | 2.5.2 15 | 2.2 16 | 2.4 17 | 5.1.39 18 | 1.0.28 19 | 2.1.2 20 | 21 | 22 | 23 | 24 | nexus-aliyun 25 | Nexus aliyun 26 | http://maven.aliyun.com/nexus/content/groups/public 27 | 28 | 29 | 30 | 31 | 32 | org.scala-lang 33 | scala-library 34 | ${scala.version} 35 | 36 | 37 | 38 | junit 39 | junit 40 | ${junit.version} 41 | test 42 | 43 | 44 | 45 | org.specs 46 | specs 47 | ${specs.version} 48 | test 49 | 50 | 51 | org.apache.spark 52 | spark-core_2.12 53 | ${spark.version} 54 | 55 | 56 | 57 | org.apache.spark 58 | spark-sql_2.12 59 | ${spark.version} 60 | 61 | 62 | 63 | org.apache.spark 64 | spark-streaming_2.12 65 | ${spark.version} 66 | 67 | 68 | 69 | 70 | org.apache.spark 71 | spark-streaming-kafka-0-10_2.12 72 | ${spark.version} 73 | 74 | 75 | 76 | org.apache.kafka 77 | kafka_2.12 78 | ${kafka.version} 79 | 80 | 81 | 82 | net.sf.json-lib 83 | json-lib 84 | ${json-lib.version} 85 | jdk15 86 | 87 | 88 | 89 | redis.clients 90 | jedis 91 | ${jedis.version} 92 | 93 | 94 | 95 | org.apache.commons 96 | commons-pool2 97 | ${commons-pool2.version} 98 | 99 | 100 | 101 | 102 | mysql 103 | mysql-connector-java 104 | runtime 105 | ${mysql-connector.version} 106 | 107 | 108 | 109 | com.alibaba 110 | druid 111 | ${druid.version} 112 | 113 | 114 | 115 | org.apache.hbase 116 | hbase-mapreduce 117 | ${hbase-mapreduce.version} 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | maven-assembly-plugin 126 | 2.3 127 | 128 | dist 129 | true 130 | 131 | jar-with-dependencies 132 | 133 | 134 | 135 | 136 | make-assembly 137 | package 138 | 139 | single 140 | 141 | 142 | 143 | 144 | 145 | 146 | maven-compiler-plugin 147 | 148 | 1.7 149 | 1.7 150 | 151 | 152 | 153 | 154 | net.alchim31.maven 155 | scala-maven-plugin 156 | 3.2.2 157 | 158 | 159 | scala-compile-first 160 | process-resources 161 | 162 | compile 163 | 164 | 165 | 166 | 167 | ${scala.version} 168 | incremental 169 | true 170 | 171 | -unchecked 172 | -deprecation 173 | -feature 174 | 175 | 176 | -Xms1024m 177 | -Xmx1024m 178 | 179 | 180 | -source 181 | ${java.version} 182 | -target 183 | ${java.version} 184 | -Xlint:all,-serial,-path 185 | 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /src/main/scala/SparkTest.scala: -------------------------------------------------------------------------------- 1 | 
import org.apache.spark._ 2 | 3 | /** 4 | * 用于测试本地spark开发环境 5 | */ 6 | object SparkTest { 7 | 8 | def main(args: Array[String]): Unit = { 9 | val conf = new SparkConf().setMaster("local").setAppName("test") 10 | val sc = new SparkContext(conf) 11 | val rdd = sc.parallelize(Seq(1,2,3,4)).filter(_==1).take(1) 12 | rdd.foreach(println(_)) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/App.scala: -------------------------------------------------------------------------------- 1 | package org.spark 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | object App { 8 | def main(args: Array[String]): Unit = { 9 | println( "Hello World!" ) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/movie/MovieUser.scala: -------------------------------------------------------------------------------- 1 | package org.spark.movie 2 | 3 | import org.apache.spark._ 4 | 5 | /** 6 | * 封装一个对象,因为这里最后我们通过main函数来启动,所以没必要建一个类 7 | * 数据格式: 8 | * 电影评分数据集:ratings.dat UserID::MovieID::Rating::Timestamp 9 | * 用户信息数据集:users.dat UserID::Gender::Age::Occupation::Zip-code 10 | * 电影信息数据集:movies.dat MovieID::Title::Genres 11 | * 12 | * 任务一:统计看过 “Sixteen Candles” 的用户、性别和观看次数 13 | */ 14 | object MovieUser { 15 | def main(args: Array[String]): Unit = { 16 | val master = if (args.length > 0) args(0).toString else "local" 17 | val datapath = if (args.length > 1) args(1).toString else "data/ml-1m" 18 | 19 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 20 | val conf = new SparkConf().setMaster(master).setAppName("MovieUser") 21 | val sc = new SparkContext(conf) 22 | 23 | // 数据读入:读取数据文件转换为RDD 24 | val usersRdd = sc.textFile(datapath + "/users.dat") 25 | val ratingsRdd = sc.textFile(datapath + "/ratings.dat") 26 | val moviesRdd = sc.textFile(datapath + "/movies.dat") 27 | 28 | // 抽取数据的属性,过滤符合条件的电影 29 | // RDD => users格式 :[UserID,(Gender,Age)] 30 | val users = usersRdd.map(_.split("::")) 31 | .map(x => {(x(0),(x(1),x(2)))}) 32 | // RDD => rating格式 :[(UserID,MovieID)] 33 | val rating = ratingsRdd.map(_.split("::")) 34 | .map(x => (x(0),x(1))) 35 | .filter(x => x._2.equals("2144")) 36 | // join 两个数据集 37 | // RDD => userRating格式 :[UserID,(MovieID,(Gender,Age))] => key相同,value合并 示例:(4425,(2144,(M,35))) 38 | val userRating = rating.join(users) 39 | // userRating.take(1) 40 | // .foreach(println(_)) // 打印一条记录,测试使用,方便开发过程中查看格式 41 | // 统计分析 42 | val userDistribution = userRating.map(x => {(x._2._2,1)}).reduceByKey(_ + _) 43 | .foreach(println(_)) 44 | 45 | sc.stop() 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/movie/PopularMovie.scala: -------------------------------------------------------------------------------- 1 | package org.spark.movie 2 | 3 | import org.apache.spark._ 4 | 5 | import scala.collection.immutable.HashSet 6 | 7 | /** 8 | * 数据格式: 9 | * 电影评分数据集:ratings.dat UserID::MovieID::Rating::Timestamp 10 | * 用户信息数据集:users.dat UserID::Gender::Age::Occupation::Zip-code 11 | * 电影信息数据集:movies.dat MovieID::Title::Genres 12 | * 13 | * 任务二:统计年龄段在20-30的年轻人,最喜欢看哪10部电影 14 | */ 15 | object PopularMovie { 16 | def main(args: Array[String]): Unit = { 17 | val master = if (args.length > 0) args(0).toString else "local" 18 | val datapath = if (args.length > 1) args(1).toString else "data/ml-1m" 19 | 20 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 21 | val conf = new SparkConf().setMaster(master).setAppName("PopularMovie") 22 | val 
sc = new SparkContext(conf) 23 | 24 | // 数据读入:读取数据文件转换为RDD 25 | val usersRdd = sc.textFile(datapath + "/users.dat") 26 | val ratingsRdd = sc.textFile(datapath + "/ratings.dat") 27 | val moviesRdd = sc.textFile(datapath + "/movies.dat") 28 | 29 | // 抽取数据和过滤 users.dat UserID::Gender::Age::Occupation::Zip-code 30 | val users = usersRdd.map(_.split("::")) 31 | .map(x => {(x(0),x(2))}) // (UserID,Age) 32 | .filter(x => x._2.toInt >= 20 && x._2.toInt <= 30) 33 | .map(_._1) 34 | .collect() 35 | val userSet = HashSet() ++ users 36 | val broadcastUserSet = sc.broadcast(userSet) 37 | val movies = moviesRdd.map(_.split("::")) 38 | .map(x => {(x(0),x(1))}) // (MovieID,Title) 39 | 40 | // 聚合和排序 movies.dat MovieID::Title::Genres 41 | val topMovies = ratingsRdd.map(_.split("::")) 42 | .map(x => {(x(0),x(1))}) // (UserID,MovieID) 43 | .filter(x => {broadcastUserSet.value.contains(x._1)}) // (UserID,MovieID) 44 | .map(x => {(x._2,1)}) // (MovieID,1) 45 | .reduceByKey(_+_) // (MovieID,N) 46 | .join(movies) // (MovieID,(N,Title)) 47 | .map(x => {(x._2._1,x._2._2)}) // (N,Title) 48 | .sortByKey(false) // 逆序排列 49 | .map(x => {(x._2,x._1)}) // (Title,N) 50 | .take(10) // 获取前十条数据 51 | .foreach(println(_)) 52 | 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/spark/movie/TopKMovie.scala: -------------------------------------------------------------------------------- 1 | package org.spark.movie 2 | 3 | import org.apache.spark._ 4 | 5 | /** 6 | * 7 | * 数据格式: 8 | * 电影评分数据集:ratings.dat UserID::MovieID::Rating::Timestamp 9 | * 用户信息数据集:users.dat UserID::Gender::Age::Occupation::Zip-code 10 | * 电影信息数据集:movies.dat MovieID::Title::Genres 11 | * 任务三:最受欢迎的前三部电影(平均评分最高的三部电影) 12 | */ 13 | object TopKMovie { 14 | 15 | def main(args: Array[String]): Unit = { 16 | val master = if (args.length > 0) args(0).toString else "local" 17 | val datapath = if (args.length > 1) args(1).toString else "data/ml-1m" 18 | 19 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 20 | val conf = new SparkConf().setMaster(master).setAppName("TopKMovie") 21 | val sc = new SparkContext(conf) 22 | 23 | // 数据读入:读取数据文件转换为RDD 24 | val usersRdd = sc.textFile(datapath + "/users.dat") 25 | val ratingsRdd = sc.textFile(datapath + "/ratings.dat") 26 | val moviesRdd = sc.textFile(datapath + "/movies.dat") 27 | 28 | // 数据抽取 29 | val movies = moviesRdd.map(_.split("::")) 30 | .map(x => { 31 | (x(0), x(1)) 32 | }) //(MovieID,Title) 33 | 34 | val ratings = ratingsRdd.map(_.split("::")) 35 | .map(x => { 36 | (x(1), x(2)) 37 | }) // (MovieID,Rating) 38 | .join(movies) 39 | .map(x => { 40 | (x._2._2, x._2._1) 41 | }) // (Title,Rating) 42 | // 数据分析 43 | val topKScoreMostMovies = ratings.map(x => { 44 | (x._1, (x._2.toInt, 1)) 45 | }) // (Title,(Rating,1)) 46 | .reduceByKey((v1, v2) => { 47 | (v1._1 + v2._1, v1._2 + v2._2) 48 | }) // (Title,(RatingScoreSum,N)) 49 | .map(x => { 50 | (x._2._1.toFloat / x._2._2.toFloat, x._1) 51 | }) // (RatingScoreAvg,Title) 52 | .sortByKey(false) 53 | .take(3) 54 | .foreach(println(_)) 55 | 56 | sc.stop() 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/ApacheAccessLog/ApacheAccessLog.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.ApacheAccessLog 2 | 3 | /** 4 | * case class 用于接受日志对应字段 5 | * 6 | * @param ipAddress 7 | * @param clientIdentd 8 | * @param userId 9 | * @param dateTime 10 | * @param method 11 | * @param endpoint 
12 | * @param protocol 13 | * @param responseCode 14 | * @param contentSize 15 | */ 16 | case class ApacheAccessLog(ipAddress: String, 17 | clientIdentd: String, 18 | userId: String, 19 | dateTime: String, 20 | method: String, 21 | endpoint: String, 22 | protocol: String, 23 | responseCode: Int, 24 | contentSize: Long){ 25 | } 26 | 27 | /** 28 | * 通过正则表达式匹配相应log中的对应字段 29 | */ 30 | object ApacheAccessLog { 31 | // 64.242.88.10 - - [07/Mar/2004:16:05:49 -0800] "GET /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables HTTP/1.1" 401 12846 32 | val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s+\-\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r 33 | 34 | def parseLogLine(log: String): ApacheAccessLog = { 35 | log match { 36 | case PATTERN(ipAddress, clientIdentd, userId, dateTime, method, endpoint, protocol, responseCode, contentSize) 37 | => ApacheAccessLog(ipAddress, clientIdentd, userId, dateTime, method, endpoint, protocol, responseCode.toInt, contentSize.toLong) 38 | case _ => throw new RuntimeException(s"""Cannot parse log line: $log""") 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/ApacheAccessLog/LogAnalyzerSQL.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.ApacheAccessLog 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Spark SQL 统计分析web日志内容 7 | */ 8 | object LogAnalyzerSQL { 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession.builder() 11 | .appName("Log Analyzer") 12 | .master("local") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | 17 | val accessLogs = spark 18 | .read 19 | .textFile("data/weblog/apache.access.log") 20 | .map(ApacheAccessLog.parseLogLine).toDF() 21 | 22 | accessLogs.createOrReplaceTempView("logs") 23 | 24 | // 统计分析内容大小-全部内容大小,日志条数,最小内容大小,最大内容大小 25 | // val contentSizeStats: Row = spark.sql("SELECT SUM(contentSize), COUNT(*), MIN(contentSize), MAX(contentSize) FROM logs").first() 26 | // val sum = contentSizeStats.getLong(0) 27 | // val count = contentSizeStats.getLong(1) 28 | // val min = contentSizeStats.getLong(2) 29 | // val max = contentSizeStats.getLong(3) 30 | // println("sum %s, count %s, min %s, max %s".format(sum, count, min, max)) 31 | // println("avg %s", sum / count) 32 | // spark.close() 33 | 34 | // 统计每种返回码的数量. 
35 | // val responseCodeToCount = spark.sql("SELECT responseCode, COUNT(*) FROM logs GROUP BY responseCode LIMIT 100") 36 | // .map(row => (row.getInt(0), row.getLong(1))) 37 | // .collect() 38 | // responseCodeToCount.foreach(print(_)) 39 | 40 | // 统计哪个IP地址访问服务器超过10次 41 | // val ipAddresses = spark.sql("SELECT ipAddress, COUNT(*) AS total FROM logs GROUP BY ipAddress HAVING total > 10 LIMIT 100") 42 | // .map(row => row.getString(0)) 43 | // .collect() 44 | // ipAddresses.foreach(println(_)) 45 | 46 | // 查询访问量最大的访问目的地址 47 | val topEndpoints = spark.sql("SELECT endpoint, COUNT(*) AS total FROM logs GROUP BY endpoint ORDER BY total DESC LIMIT 10") 48 | .map(row => (row.getString(0), row.getLong(1))) 49 | .collect() 50 | topEndpoints.foreach(println(_)) 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/SensorLog/RedisClient.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.SensorLog 2 | 3 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig 4 | import redis.clients.jedis.JedisPool 5 | 6 | /** 7 | * Redis 客户端 8 | */ 9 | object RedisClient extends Serializable { 10 | val redisHost = "127.0.0.1" 11 | val redisPort = 6379 12 | val redisTimeout = 30000 13 | val redisPassword = "root" 14 | lazy val pool = new JedisPool(new GenericObjectPoolConfig(), redisHost, redisPort, redisTimeout, redisPassword) 15 | 16 | lazy val hook = new Thread { 17 | override def run = { 18 | println("Execute hook thread: " + this) 19 | pool.destroy() 20 | } 21 | } 22 | sys.addShutdownHook(hook.run) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/SensorLog/SensorRow.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.SensorLog 2 | 3 | import org.apache.hadoop.hbase.client.Result 4 | import org.apache.hadoop.hbase.util.Bytes 5 | 6 | case class SensorRow( 7 | androidBoardAvailableStorage: String, androidBoardCpu_usage: String, androidBoardCurrent: String, 8 | androidBoardStorage: String, androidBoardVoltage: String, axialFanSpeed: String, 9 | backLightState: String, blEnableState: String, crossFlowFanSpeedValue: String, 10 | decibelValue: String, doorStateState: String, spsLatitude: String, 11 | gpsLongitude: String, humidityValue: String, levelValue: String, 12 | powerStateState: String, powerStateValue: String, temperature: String, 13 | totalPowerValue: String, time: Long) 14 | 15 | object SensorRow extends Serializable { 16 | 17 | /** 18 | * 列族 “info” 19 | * @param result 20 | * @return 21 | */ 22 | def parseSensorRow(result: Result): SensorRow = { 23 | val p0 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::availableStorage".getBytes)) 24 | val p1 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::cpu_usage".getBytes)) 25 | val p2 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::current".getBytes)) 26 | val p3 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::storage".getBytes)) 27 | val p4 = Bytes.toString(result.getValue("info".getBytes, "AndroidBoard::voltage".getBytes)) 28 | val p5 = Bytes.toString(result.getValue("info".getBytes, "AxialFanSpeed".getBytes)) 29 | val p6 = Bytes.toString(result.getValue("info".getBytes, "BackLight::state".getBytes)) 30 | val p7 = Bytes.toString(result.getValue("info".getBytes, "BlEnable::state".getBytes)) 31 | val p8 = 
Bytes.toString(result.getValue("info".getBytes, "CrossFlowFanSpeed::value".getBytes)) 32 | val p9 = Bytes.toString(result.getValue("info".getBytes, "Decibel::value".getBytes)) 33 | val p10 = Bytes.toString(result.getValue("info".getBytes, "DoorState::state".getBytes)) 34 | val p11 = Bytes.toString(result.getValue("info".getBytes, "GPS::latitude".getBytes)) 35 | val p12 = Bytes.toString(result.getValue("info".getBytes, "GPS::longitude".getBytes)) 36 | val p13 = Bytes.toString(result.getValue("info".getBytes, "Humidity::value".getBytes)) 37 | val p14 = Bytes.toString(result.getValue("info".getBytes, "Level::value".getBytes)) 38 | val p15 = Bytes.toString(result.getValue("info".getBytes, "PowerState::state".getBytes)) 39 | val p16 = Bytes.toString(result.getValue("info".getBytes, "PowerState::value".getBytes)) 40 | val p17 = Bytes.toString(result.getValue("info".getBytes, "Temperature".getBytes)) 41 | val p18 = Bytes.toString(result.getValue("info".getBytes, "TotalPower::value".getBytes)) 42 | val p19 = Bytes.toString(result.getValue("info".getBytes, "Time".getBytes)) 43 | SensorRow(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19.toLong) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkSQL/SensorLog/SensorStatistics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkSQL.SensorLog 2 | 3 | import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneId} 4 | import java.util.concurrent.Executors 5 | 6 | import net.sf.json.JSONObject 7 | import org.apache.hadoop.hbase.HBaseConfiguration 8 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 9 | import org.apache.spark.sql.{SQLContext, SparkSession} 10 | import org.apache.spark.{SparkConf, SparkContext} 11 | 12 | /** 13 | * 读取 Hbase 中的传感器数据 14 | * 计算 平均值 15 | */ 16 | object SensorStatistics { 17 | def main(args: Array[String]): Unit = { 18 | // 参数 19 | val master = if (args.length > 0) args(0).toString else "local" 20 | val zkHost = if (args.length > 1) args(1).toString else "192.168.183.150,192.168.183.151,192.168.183.152" 21 | val zkPort = if (args.length > 2) args(2).toString else "2181" 22 | val tableName = if (args.length > 2) args(2).toString else "sensors" 23 | 24 | // 一般写Spark程序,我们需要建立sparkConf和sparkContext 25 | val conf = new SparkConf().setMaster(master).setAppName("SensorStatistics") 26 | val sc = new SparkContext(conf) 27 | 28 | // Hbase 配置 29 | val hbaseConf = HBaseConfiguration.create() 30 | hbaseConf.set("hbase.zookeeper.quorum", zkHost) 31 | hbaseConf.set("hbase.master", zkPort) 32 | hbaseConf.set("hbase.master", "192.168.183.150:16010"); // 例如: 191.168.9.9:16010 , 这里由于 ambari 的端口不一样所以和原生的端口不一样这个 要注意 33 | hbaseConf.set(TableInputFormat.INPUT_TABLE, "sensors") 34 | val executor = Executors.newCachedThreadPool() 35 | 36 | // 从Hbase数据源获取数据 37 | val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf, 38 | classOf[TableInputFormat], 39 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 40 | classOf[org.apache.hadoop.hbase.client.Result]) 41 | 42 | hbaseRDD.count() 43 | 44 | val sqlContext = new SQLContext(sc) 45 | import sqlContext.implicits._ 46 | val spark = SparkSession.builder() 47 | .appName("SensorStatistics") 48 | .master("local") 49 | .getOrCreate() 50 | 51 | // 将查询结果映射到 case class 52 | val resultRDD = hbaseRDD.map(tuple => tuple._2) 53 | // 将映射数据集转换为 DataFrame 以便 后期使用SQL开发查询统计 54 | val sensorRDD = resultRDD.map(SensorRow.parseSensorRow).toDF() 55 | 56 
| sensorRDD.show() // 打印表,一般开发的时候用 57 | // 创建视图 命名 即表名 58 | sensorRDD.createOrReplaceTempView("sensors") 59 | 60 | val sensorStatDF = spark.sql("SELECT avg(temperature) FROM sensors where time > 1576080000000 and time < 1576771200000") 61 | .map(row => { 62 | row.getDouble(0) 63 | }) 64 | .collect() 65 | sensorStatDF.foreach(println(_)) 66 | 67 | // 零点的时间戳 (预留) 68 | val today_start = LocalDateTime.of(LocalDate.now, LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 69 | val past_oneWeek_start = LocalDateTime.of(LocalDate.now().minusDays(7), LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 70 | val past_oneMonth_start = LocalDateTime.of(LocalDate.now().minusMonths(1), LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 71 | val past_threeMonth_start = LocalDateTime.of(LocalDate.now().minusMonths(3), LocalTime.MIN).atZone(ZoneId.systemDefault()).toInstant.toEpochMilli 72 | 73 | 74 | // TODO redis 存储 => 查询结果 存储再redis 75 | val dbIndex = 2 76 | val clickHashKey = "devMonitorCalculation" 77 | 78 | sc.stop() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/DruidConnectionPool.java: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql; 2 | 3 | import com.alibaba.druid.pool.DruidDataSourceFactory; 4 | import javax.sql.DataSource; 5 | import java.util.Properties; 6 | 7 | /** 8 | * @author Higmin 9 | * @date 2019/11/28 9:10 10 | **/ 11 | public class DruidConnectionPool { 12 | 13 | public DataSource dataSource; 14 | private Properties pro = new Properties(); 15 | 16 | private DruidConnectionPool() { 17 | try { 18 | init(); 19 | dataSource = DruidDataSourceFactory.createDataSource(pro); 20 | } catch (Exception e) { 21 | e.printStackTrace(); 22 | } 23 | } 24 | 25 | private static class Holder { 26 | private static DruidConnectionPool instance = new DruidConnectionPool(); 27 | } 28 | 29 | public static DruidConnectionPool getInstance() { 30 | return Holder.instance; 31 | } 32 | 33 | private void init() { 34 | // 数据源配置 35 | pro.setProperty("driverClassName", "com.mysql.jdbc.Driver"); 36 | pro.setProperty("url", "jdbc:mysql://localhost:3306/test?characterEncoding=utf8&useSSL=true"); 37 | pro.setProperty("username", "root"); 38 | pro.setProperty("password", "root"); 39 | // 连接池配置 40 | pro.setProperty("initialSize", "20"); // 初始化连接大小 41 | pro.setProperty("minIdle", "20"); // 最小连接池数量 42 | pro.setProperty("maxActive", "100"); // 最大连接池数量 43 | pro.setProperty("maxWait", "60000"); // 获取连接时最大等待时间,单位毫秒 44 | pro.setProperty("timeBetweenEvictionRunsMillis", "60000"); // 配置间隔多久才进行一次检测,检测需要关闭的空闲连接,单位是毫秒 45 | pro.setProperty("minEvictableIdleTimeMillis", "300000"); // 配置一个连接在池中最小生存的时间,单位是毫秒 46 | pro.setProperty("validationQuery", "SELECT 1 FROM DUAL"); // 测试连接 47 | pro.setProperty("testWhileIdle", "true"); // 申请连接的时候检测,建议配置为true,不影响性能,并且保证安全性 48 | pro.setProperty("testOnBorrow", "false"); // 获取连接时执行检测,建议关闭,影响性能 49 | pro.setProperty("testOnReturn", "false"); // 归还连接时执行检测,建议关闭,影响性能 50 | pro.setProperty("poolPreparedStatements", "false"); // 是否开启PSCache,PSCache对支持游标的数据库性能提升巨大,oracle建议开启,mysql下建议关闭 51 | pro.setProperty("maxOpenPreparedStatements", "20"); // 开启poolPreparedStatements后生效 52 | // pro.setProperty("filters", "stat,wall,slf4j"); // 配置扩展插件,常用的插件有=>stat:监控统计 log4j:日志 wall:防御sql注入 53 | pro.setProperty("connectionProperties", 
"druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000"); // 通过connectProperties属性来打开mergeSql功能;慢SQL记录 54 | pro.setProperty("asyncInit", "true"); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/KafkaEventProducer.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql 2 | 3 | import java.util.Properties 4 | 5 | import net.sf.json.JSONObject 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * 模拟 Kafka 生产者 实时写入用户行为的事件数据,数据是JSON格式 12 | */ 13 | object KafkaEventProducer { 14 | 15 | private val users = Array( 16 | "df354f90-5acd-4c55-a3e2-adc045f628c3", "e20f8e06-7717-4236-87f0-484a82f00b52", 17 | "293901ca-9a58-4ef9-8c01-fa3c766ca236", "2b175ac2-f1a6-4fcc-a437-d2f01828b493", 18 | "27e51fd9-2be9-405c-b81a-b34e2f6379dd", "f3f2c74d-5fe0-4cce-8ce1-a2bdd5ad82b8", 19 | "ef062789-6214-493d-8aad-4b15f91ec5d3", "569e4b06-9301-4a9d-842c-1e6aa9b4f39b", 20 | "7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d") 21 | 22 | private val sites = Array( 23 | "Android","IOS","PC" 24 | ) 25 | 26 | private val random = new Random() 27 | 28 | def getUserID():String = { 29 | val userPointer = random.nextInt(10) 30 | users(userPointer) 31 | } 32 | 33 | def getSite():String = { 34 | val sitePointer = random.nextInt(3) 35 | sites(sitePointer) 36 | } 37 | 38 | def click() : Double = { 39 | random.nextInt(10) 40 | } 41 | 42 | def main(args: Array[String]): Unit = { 43 | val topics = "user_events_mysql" 44 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 45 | val props = new Properties() 46 | props.put("bootstrap.servers",brokers) 47 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 48 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 49 | props.put("request.required.acks", "1") 50 | 51 | // val kafkaConfig = new ProducerConfig(props) 52 | val producer = new KafkaProducer[String,String](props) 53 | while (true) { 54 | val event = new JSONObject() 55 | event 56 | .accumulate("uid", getUserID()) // 用户id 57 | .accumulate("event_time", System.currentTimeMillis.toString) // 点击时间 58 | .accumulate("os_type", getSite()) // 终端类型 59 | .accumulate("click_count", click()) // 点击次数 60 | 61 | // produce event message 62 | producer.send(new ProducerRecord[String,String](topics,event.toString())) 63 | println("Message sent: " + event.toString) 64 | 65 | Thread.sleep(200) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/UserClickCountAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql 2 | 3 | import net.sf.json.JSONObject 4 | import org.apache.kafka.common.serialization.StringDeserializer 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 7 | import org.apache.spark.streaming.kafka010.KafkaUtils 8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | 11 | /** 12 | * 读取kafka中的数据,结果存在mysql中 13 | * 实现实时统计每个用户的点击次数,它是按照用户分组进行累加次数,逻辑比较简单 14 | * 
关键是在实现过程中要注意一些问题,如对象序列化等 15 | */ 16 | object UserClickCountAnalytics { 17 | def main(args: Array[String]): Unit = { 18 | // 创建 SparkConf 和 StreamingContext 19 | val master = if (args.length > 0) args(0) else "local[1]" 20 | val conf = new SparkConf().setMaster(master).setAppName("UserClickCountAnalytics") 21 | val ssc = new StreamingContext(conf, Seconds(1)) // 按5S来划分一个微批处理 22 | 23 | // kafka 配置:消费Kafka 中,topic为 user_events的消息 24 | val topics = Array("user_events_mysql") 25 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 26 | // 读取kafka数据 27 | val kafkaParams = Map[String,Object]( 28 | "bootstrap.servers" -> brokers, 29 | "key.deserializer" -> classOf[StringDeserializer], 30 | "value.deserializer" -> classOf[StringDeserializer], 31 | "group.id" -> "UserClickCountAnalytics_group", 32 | "auto.offset.reset" -> "latest", 33 | "enable.auto.commit" -> (false: java.lang.Boolean) 34 | ) 35 | // redis 存储 36 | val dbIndex = 2 37 | val clickHashKey = "app::user:click" 38 | 39 | // 获取日志数据 40 | val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)) 41 | val events = kafkaStream.flatMap( 42 | line => { 43 | val data = JSONObject.fromObject(line.value()) 44 | Some(data) 45 | }) 46 | 47 | // 统计用户点击次数 根据uid 统计 click_count 48 | val userClicks = events.map(x => {(x.getString("uid"),x.getInt("click_count"))}) // 计算每个微批处理的统计结果 49 | .reduceByKey(_+_) 50 | userClicks.foreachRDD(rdd => { 51 | rdd.foreachPartition(partitionOfRecords => { 52 | partitionOfRecords.foreach(pair => { 53 | // 创建连接池 54 | val dataSource = DruidConnectionPool.getInstance().dataSource 55 | val conn = dataSource.getConnection 56 | val uid = pair._1 57 | val clickCount = pair._2 58 | val sql_isExist = "SELECT * from streaming where uid = '" + uid + "'" 59 | val sql_insert = "insert into streaming(uid,clickCount) values('" + uid + "'," + clickCount + ")" 60 | val ps = conn.prepareStatement(sql_isExist) 61 | val resultSet = ps.executeQuery() 62 | if (resultSet.next()) { 63 | val count = resultSet.getString(2).toInt + clickCount.toInt 64 | val sql_update = "update streaming set clickCount ='" + count + "' where uid = '" + uid + "'" 65 | val ps = conn.prepareStatement(sql_update) 66 | ps.executeUpdate() 67 | resultSet.close() 68 | } else { 69 | val ps = conn.prepareStatement(sql_insert) 70 | ps.executeUpdate() 71 | } 72 | ps.close() 73 | conn.close() 74 | }) 75 | }) 76 | }) 77 | ssc.start() 78 | ssc.awaitTermination() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/UserClickCountByWindowAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_mysql 2 | 3 | import net.sf.json.JSONObject 4 | import org.apache.kafka.common.serialization.StringDeserializer 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 7 | import org.apache.spark.streaming.kafka010.KafkaUtils 8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | 11 | /** 12 | * 每5秒 统计 过去10秒 每种终端 收到的点击量 13 | * 14 | * 注意: 15 | * 1. 使用 窗口计算需要设置检查点 checkpoint 16 | * 2. 窗口滑动长度和窗口长度一定要是SparkStreaming微批处理时间的整数倍,不然会报错. 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_mysql/UserClickCountByWindowAnalytics.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_mysql
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
7 | import org.apache.spark.streaming.kafka010.KafkaUtils
8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
9 | import org.apache.spark.streaming.{Seconds, StreamingContext}
10 | 
11 | /**
12 |  * Every 5 seconds, count the clicks received by each device type over the past 10 seconds.
13 |  *
14 |  * Notes:
15 |  * 1. Window computations that use an inverse reduce function require a checkpoint directory.
16 |  * 2. Both the window length and the slide interval must be integer multiples of the Spark Streaming batch interval, otherwise an error is thrown.
17 |  */
18 | object UserClickCountByWindowAnalytics {
19 |   def main(args: Array[String]): Unit = {
20 |     // Create SparkConf and StreamingContext
21 |     val master = if (args.length > 0) args(0) else "local[1]"
22 |     // Checkpoint directory
23 |     val checkpointDir = if (args.length > 1) args(1) else "data/checkpoint/mysql/UserClickCountByWindowAnalytics"
24 |     val conf = new SparkConf().setMaster(master).setAppName("UserClickCountByWindowAnalytics")
25 |     val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
26 |     // Enable checkpointing
27 |     ssc.checkpoint(checkpointDir)
28 | 
29 |     // Kafka configuration: consume messages from the user_events_mysql topic
30 |     val topics = Array("user_events_mysql")
31 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
32 |     // Kafka consumer parameters
33 |     val kafkaParams = Map[String,Object](
34 |       "bootstrap.servers" -> brokers,
35 |       "key.deserializer" -> classOf[StringDeserializer],
36 |       "value.deserializer" -> classOf[StringDeserializer],
37 |       "group.id" -> "UserClickCountByWindowAnalytics_group",
38 |       "auto.offset.reset" -> "latest",
39 |       "enable.auto.commit" -> (false: java.lang.Boolean)
40 |     )
41 |     // Redis storage settings (left over from the Redis variant; unused in this MySQL job)
42 |     val dbIndex = 2
43 |     val clickHashKey = "app::os_type:click"
44 | 
45 |     // Create the direct Kafka stream
46 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
47 |     val events = kafkaStream.flatMap(
48 |       line => {
49 |         val data = JSONObject.fromObject(line.value())
50 |         Some(data)
51 |       })
52 | 
53 |     // Every 5 seconds, count clicks per os_type over the past 10 seconds
54 |     val userClicks = events.map(x => {(x.getString("os_type"),x.getInt("click_count"))})
55 |       .reduceByKeyAndWindow(_+_,_-_,Seconds(10),Seconds(5)) // add new values, subtract expired values, 10-second window, 5-second slide
56 |     // userClicks.foreachRDD(rdd =>{rdd.foreach(println(_))}) // for inspecting the data format during testing
57 |     userClicks.foreachRDD(rdd => {
58 |       rdd.foreachPartition(partitionOfRecords => {
59 |         partitionOfRecords.foreach(pair => {
60 |           // Borrow a connection from the Druid connection pool
61 |           val dataSource = DruidConnectionPool.getInstance().dataSource
62 |           val conn = dataSource.getConnection
63 |           val os_type = pair._1
64 |           val clickCount = pair._2
65 |           val sql_isExist = "SELECT * from streaming_ostype where os_type = '" + os_type + "'"
66 |           val sql_insert = "insert into streaming_ostype(os_type,clickCount) values('" + os_type + "'," + clickCount + ")"
67 |           val ps = conn.prepareStatement(sql_isExist)
68 |           val resultSet = ps.executeQuery()
69 |           if (resultSet.next()) {
70 |             val count = resultSet.getString(2).toInt + clickCount
71 |             val sql_update = "update streaming_ostype set clickCount ='" + count + "' where os_type = '" + os_type + "'"
72 |             val updateStmt = conn.prepareStatement(sql_update)
73 |             updateStmt.executeUpdate(); updateStmt.close()
74 |           }
75 |           else {
76 |             val insertStmt = conn.prepareStatement(sql_insert)
77 |             insertStmt.executeUpdate(); insertStmt.close()
78 |           }
79 |           resultSet.close(); ps.close()
80 |           conn.close()
81 |         })
82 |       })
83 |     })
84 |     ssc.start()
85 |     ssc.awaitTermination()
86 |   }
87 | 
88 | }
89 | 
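As the class comment notes, it is the inverse reduce function (_-_) that forces checkpointing. When recomputing the whole window is acceptable, the simpler reduceByKeyAndWindow overload from the same Spark Streaming API drops that requirement; a minimal sketch reusing the events stream defined above:

// Recompute the full 10-second window on every 5-second slide; no inverse function, no checkpoint required.
val windowedClicks = events
  .map(x => (x.getString("os_type"), x.getInt("click_count")))
  .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))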
"e20f8e06-7717-4236-87f0-484a82f00b52", 15 | "293901ca-9a58-4ef9-8c01-fa3c766ca236", "2b175ac2-f1a6-4fcc-a437-d2f01828b493", 16 | "27e51fd9-2be9-405c-b81a-b34e2f6379dd", "f3f2c74d-5fe0-4cce-8ce1-a2bdd5ad82b8", 17 | "ef062789-6214-493d-8aad-4b15f91ec5d3", "569e4b06-9301-4a9d-842c-1e6aa9b4f39b", 18 | "7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d") 19 | 20 | private val sites = Array( 21 | "Android","IOS","PC" 22 | ) 23 | 24 | private val random = new Random() 25 | 26 | def getUserID():String = { 27 | val userPointer = random.nextInt(10) 28 | users(userPointer) 29 | } 30 | 31 | def getSite():String = { 32 | val sitePointer = random.nextInt(3) 33 | sites(sitePointer) 34 | } 35 | 36 | def click() : Double = { 37 | random.nextInt(10) 38 | } 39 | 40 | def main(args: Array[String]): Unit = { 41 | val topics = "user_events_zk" 42 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 43 | val props = new Properties() 44 | props.put("bootstrap.servers",brokers) 45 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 46 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 47 | props.put("request.required.acks", "1") 48 | 49 | val producer = new KafkaProducer[String,String](props) 50 | while (true) { 51 | val event = new JSONObject() 52 | event 53 | .accumulate("uid", getUserID()) // 用户id 54 | .accumulate("event_time", System.currentTimeMillis.toString) // 点击时间 55 | .accumulate("os_type", getSite()) // 终端类型 56 | .accumulate("click_count", click()) // 点击次数 57 | 58 | // produce event message 59 | producer.send(new ProducerRecord[String,String](topics,event.toString())) 60 | println("Message sent: " + event.toString) 61 | 62 | Thread.sleep(200) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK/UserClickCountAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_offsetToZK 2 | 3 | import java.lang 4 | import kafka.utils.{ZKGroupTopicDirs, ZkUtils} 5 | import net.sf.json.JSONObject 6 | import org.apache.kafka.clients.consumer.ConsumerRecord 7 | import org.apache.kafka.common.TopicPartition 8 | import org.apache.kafka.common.serialization.StringDeserializer 9 | import org.apache.spark.{SparkConf, TaskContext} 10 | import org.apache.spark.streaming.dstream.InputDStream 11 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 12 | import org.apache.spark.streaming.{Seconds, StreamingContext} 13 | import org.apache.spark.streaming.kafka010._ 14 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 15 | import org.sparkStreaming.kafka_sparkStreaming_mysql.DruidConnectionPool 16 | 17 | /** 18 | * 读取kafka中的数据,结果存在mysql中 19 | * 实现实时统计每个用户的点击次数,它是按照用户分组进行累加次数,逻辑比较简单 20 | * 关键是在实现过程中要注意一些问题,如对象序列化等 21 | * 22 | * 手动管理 Kafka 偏移量 存储在 ZK 当中 23 | */ 24 | object UserClickCountAnalytics { 25 | 26 | def main(args: Array[String]): Unit = { 27 | // 创建 SparkConf 和 StreamingContext 28 | val master = if (args.length > 0) args(0) else "local[*]" 29 | val conf = new SparkConf().setMaster(master).setAppName("text") 30 | val ssc = new StreamingContext(conf, Seconds(1)) // 按5S来划分一个微批处理 31 | 32 | // kafka 配置:消费Kafka 中,topic为 user_events的消息 33 | val topicStr = "user_events_zk" 34 | val topics = Array(topicStr) 35 | val brokers = 
"192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 36 | // 读取kafka数据 37 | val kafkaParams = Map[String, Object]( 38 | "bootstrap.servers" -> brokers, 39 | "key.deserializer" -> classOf[StringDeserializer], 40 | "value.deserializer" -> classOf[StringDeserializer], 41 | "group.id" -> "offsetToZk_test_group", 42 | "auto.offset.reset" -> "latest", 43 | "enable.auto.commit" -> (false: lang.Boolean) 44 | ) 45 | 46 | var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null 47 | // ZK 相关 48 | val zk_host = "192.168.183.150:2181,192.168.183.151:2181,192.168.183.152:2181" 49 | val zkClient = ZkUtils.createZkClient(zk_host, 60000, 60000) 50 | 51 | //创建一个 ZKGroupTopicDirs 对象 52 | val topicDirs = new ZKGroupTopicDirs("offsetToZk_test_group", topicStr) 53 | //获取 zookeeper 中的路径,这里会变成 /consumers/test_spark_streaming_group/offsets/topic_name 54 | val zkTopicPath = s"${topicDirs.consumerOffsetDir}" 55 | //查询该路径下是否字节点(默认有字节点为我们自己保存不同 partition 时生成的) 56 | val children = zkClient.countChildren(zkTopicPath) 57 | //如果 zookeeper 中有保存 offset,我们会利用这个 offset 作为 kafkaStream 的起始位置 58 | var fromOffsets: Map[TopicPartition, Long] = Map() 59 | 60 | if (children > 0) { // 在有记录的情况下 => 从节点获取存储的offset 61 | fromOffsets = new ZkKafkaOffsetManager(zk_host).readOffsets(topics, "offsetToZk_test_group") 62 | kafkaStream = KafkaUtils.createDirectStream(ssc, PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, fromOffsets)) 63 | } else { // 如果ZK不存在此路径 => 创建该节点及其父节点 64 | kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)) 65 | } 66 | kafkaStream.foreachRDD { rdd => 67 | // 数据处理 =====> 开始 68 | val events = rdd.flatMap( 69 | line => { 70 | val data = JSONObject.fromObject(line.value()) 71 | Some(data) 72 | }) 73 | // 统计用户点击次数 根据uid 统计 click_count 74 | val userClicks = events.map(x => { 75 | (x.getString("uid"), x.getInt("click_count")) 76 | }) // 计算每个微批处理的统计结果 77 | .reduceByKey(_ + _) 78 | userClicks.foreachPartition { iter => 79 | iter.foreach(pair => { 80 | // 创建连接池 81 | val dataSource = DruidConnectionPool.getInstance().dataSource 82 | val conn = dataSource.getConnection 83 | val uid = pair._1 84 | val clickCount = pair._2 85 | val sql_isExist = "SELECT * from streaming where uid = '" + uid + "'" 86 | val sql_insert = "insert into streaming(uid,clickCount) values('" + uid + "'," + clickCount + ")" 87 | val ps = conn.prepareStatement(sql_isExist) 88 | val resultSet = ps.executeQuery() 89 | if (resultSet.next()) { 90 | val count = resultSet.getString(2).toInt + clickCount.toInt 91 | val sql_update = "update streaming set clickCount ='" + count + "' where uid = '" + uid + "'" 92 | val ps = conn.prepareStatement(sql_update) 93 | ps.executeUpdate() 94 | resultSet.close() 95 | } else { 96 | val ps = conn.prepareStatement(sql_insert) 97 | ps.executeUpdate() 98 | } 99 | ps.close() 100 | conn.close() 101 | }) 102 | } 103 | // 数据处理 =====> 结束 =====> 数据处理完毕之后,获取偏移量offset,并保存在 ZK 中 104 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 105 | new ZkKafkaOffsetManager(zk_host).saveOffsets(offsetRanges, "offsetToZk_test_group") 106 | } 107 | ssc.start() 108 | ssc.awaitTermination() 109 | } 110 | } -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK/ZkKafkaOffsetManager.scala: -------------------------------------------------------------------------------- 1 | package 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_offsetToZK/ZkKafkaOffsetManager.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_offsetToZK
2 | 
3 | import kafka.utils.{ZkUtils, ZKGroupTopicDirs}
4 | import org.apache.kafka.common.TopicPartition
5 | import org.apache.spark.streaming.kafka010.OffsetRange
6 | import org.slf4j.LoggerFactory
7 | 
8 | import scala.collection.mutable
9 | 
10 | /**
11 |  * Utility class for manually managing Kafka offsets in ZooKeeper from Spark Streaming.
12 |  * @param zkUrl ZooKeeper connection string
13 |  */
14 | class ZkKafkaOffsetManager(zkUrl: String) {
15 |   private val logger = LoggerFactory.getLogger(classOf[ZkKafkaOffsetManager])
16 | 
17 |   private val zkClientAndConn = ZkUtils.createZkClientAndConnection(zkUrl, 60000, 60000)
18 |   private val zkUtils = new ZkUtils(zkClientAndConn._1, zkClientAndConn._2, false)
19 | 
20 |   /**
21 |    * Read offsets from ZooKeeper.
22 |    * @param topics
23 |    * @param groupId
24 |    * @return
25 |    */
26 |   def readOffsets(topics: Seq[String], groupId: String): Map[TopicPartition, Long] = {
27 |     val offsets = mutable.HashMap.empty[TopicPartition, Long]
28 |     val partitionsForTopics = zkUtils.getPartitionsForTopics(topics)
29 | 
30 |     // /consumers/<groupId>/offsets/<topic>/<partition>
31 |     partitionsForTopics.foreach(partitions => {
32 |       val topic = partitions._1
33 |       val groupTopicDirs = new ZKGroupTopicDirs(groupId, topic)
34 | 
35 |       partitions._2.foreach(partition => {
36 |         val path = groupTopicDirs.consumerOffsetDir + "/" + partition
37 |         try {
38 |           val data = zkUtils.readData(path)
39 |           if (data != null) {
40 |             offsets.put(new TopicPartition(topic, partition), data._1.toLong)
41 |             logger.info(
42 |               "Read offset - topic={}, partition={}, offset={}, path={}",
43 |               Seq[AnyRef](topic, partition.toString, data._1, path): _*
44 |             )
45 |           }
46 |         } catch {
47 |           case ex: Exception =>
48 |             offsets.put(new TopicPartition(topic, partition), 0L)
49 |             logger.info(
50 |               "Read offset - not exist: {}, topic={}, partition={}, path={}",
51 |               Seq[AnyRef](ex.getMessage, topic, partition.toString, path): _*
52 |             )
53 |         }
54 |       })
55 |     })
56 | 
57 |     offsets.toMap
58 |   }
59 | 
60 |   /**
61 |    * Save offsets to ZooKeeper.
62 |    * @param offsetRanges
63 |    * @param groupId
64 |    */
65 |   def saveOffsets(offsetRanges: Seq[OffsetRange], groupId: String): Unit = {
66 |     offsetRanges.foreach(range => {
67 |       val groupTopicDirs = new ZKGroupTopicDirs(groupId, range.topic)
68 |       val path = groupTopicDirs.consumerOffsetDir + "/" + range.partition
69 |       zkUtils.updatePersistentPath(path, range.untilOffset.toString)
70 |       logger.info(
71 |         "Save offset - topic={}, partition={}, offset={}, path={}",
72 |         Seq[AnyRef](range.topic, range.partition.toString, range.untilOffset.toString, path): _*
73 |       )
74 |     })
75 |   }
76 | }
77 | 
"7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d") 21 | 22 | private val sites = Array( 23 | "Android","IOS","PC" 24 | ) 25 | 26 | private val random = new Random() 27 | 28 | def getUserID():String = { 29 | val userPointer = random.nextInt(10) 30 | users(userPointer) 31 | } 32 | 33 | def getSite():String = { 34 | val sitePointer = random.nextInt(3) 35 | sites(sitePointer) 36 | } 37 | 38 | def click() : Double = { 39 | random.nextInt(10) 40 | } 41 | 42 | def main(args: Array[String]): Unit = { 43 | val topics = "user_events_redis" 44 | val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092" 45 | val props = new Properties() 46 | props.put("bootstrap.servers",brokers) 47 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 48 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 49 | props.put("request.required.acks", "1") 50 | 51 | // val kafkaConfig = new ProducerConfig(props) 52 | val producer = new KafkaProducer[String,String](props) 53 | while (true) { 54 | val event = new JSONObject() 55 | event 56 | .accumulate("uid", getUserID()) // 用户id 57 | .accumulate("event_time", System.currentTimeMillis.toString) // 点击时间 58 | .accumulate("os_type", getSite()) // 终端类型 59 | .accumulate("click_count", click()) // 点击次数 60 | 61 | // produce event message 62 | producer.send(new ProducerRecord[String,String](topics,event.toString())) 63 | println("Message sent: " + event.toString) 64 | 65 | Thread.sleep(200) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/RedisClient.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_redis 2 | 3 | import org.apache.commons.pool2.impl.GenericObjectPoolConfig 4 | import redis.clients.jedis.JedisPool 5 | 6 | /** 7 | * Redis 客户端 8 | */ 9 | object RedisClient extends Serializable { 10 | val redisHost = "127.0.0.1" 11 | val redisPort = 6379 12 | val redisTimeout = 30000 13 | val redisPassword = "root" 14 | lazy val pool = new JedisPool(new GenericObjectPoolConfig(), redisHost, redisPort, redisTimeout, redisPassword) 15 | 16 | lazy val hook = new Thread { 17 | override def run = { 18 | println("Execute hook thread: " + this) 19 | pool.destroy() 20 | } 21 | } 22 | sys.addShutdownHook(hook.run) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/UserClickCountAnalytics.scala: -------------------------------------------------------------------------------- 1 | package org.sparkStreaming.kafka_sparkStreaming_redis 2 | 3 | import net.sf.json.JSONObject 4 | import org.apache.kafka.common.serialization.StringDeserializer 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 7 | import org.apache.spark.streaming.kafka010.KafkaUtils 8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | 11 | /** 12 | * 读取kafka中的数据,结果存在redis中 13 | * 实现实时统计每个用户的点击次数,它是按照用户分组进行累加次数,逻辑比较简单 14 | * 关键是在实现过程中要注意一些问题,如对象序列化等 15 | */ 16 | object UserClickCountAnalytics { 17 | def main(args: Array[String]): Unit = { 18 | // 创建 SparkConf 和 StreamingContext 19 | val master = if (args.length > 0) args(0) else "local[1]" 20 | val 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/UserClickCountAnalytics.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_redis
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
7 | import org.apache.spark.streaming.kafka010.KafkaUtils
8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
9 | import org.apache.spark.streaming.{Seconds, StreamingContext}
10 | 
11 | /**
12 |  * Reads data from Kafka and stores the results in Redis.
13 |  * Computes each user's click count in real time by grouping on uid and summing; the logic itself is simple.
14 |  * The key point is to watch out for issues such as object serialization along the way.
15 |  */
16 | object UserClickCountAnalytics {
17 |   def main(args: Array[String]): Unit = {
18 |     // Create SparkConf and StreamingContext
19 |     val master = if (args.length > 0) args(0) else "local[1]"
20 |     val conf = new SparkConf().setMaster(master).setAppName("UserClickCountAnalytics")
21 |     val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
22 | 
23 |     // Kafka configuration: consume messages from the user_events_redis topic
24 |     val topics = Array("user_events_redis")
25 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
26 |     // Kafka consumer parameters
27 |     val kafkaParams = Map[String,Object](
28 |       "bootstrap.servers" -> brokers,
29 |       "key.deserializer" -> classOf[StringDeserializer],
30 |       "value.deserializer" -> classOf[StringDeserializer],
31 |       "group.id" -> "UserClickCountAnalytics_group",
32 |       "auto.offset.reset" -> "latest",
33 |       "enable.auto.commit" -> (false: java.lang.Boolean)
34 |     )
35 |     // Redis storage settings
36 |     val dbIndex = 2
37 |     val clickHashKey = "app::user:click"
38 | 
39 |     // Create the direct Kafka stream
40 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
41 |     val events = kafkaStream.flatMap(
42 |       line => {
43 |         val data = JSONObject.fromObject(line.value())
44 |         Some(data)
45 |       })
46 | 
47 |     // Count clicks per user: sum click_count by uid (the running total is accumulated in Redis)
48 |     val userClicks = events.map(x => {(x.getString("uid"),x.getInt("click_count"))}) // per-micro-batch aggregation
49 |       .reduceByKey(_+_)
50 |     userClicks.foreachRDD(rdd => {
51 |       rdd.foreachPartition(partitionOfRecords => {
52 |         partitionOfRecords.foreach(pair => {
53 |           val jedis = RedisClient.pool.getResource
54 |           jedis.select(dbIndex)
55 |           val uid = pair._1
56 |           val clickCount = pair._2
57 |           jedis.hincrBy(clickHashKey, uid, clickCount) // increment field uid in hash clickHashKey by clickCount (accumulates each micro-batch's per-uid result)
58 |           RedisClient.pool.returnResource(jedis)
59 |         })
60 |       })
61 |     })
62 |     ssc.start()
63 |     ssc.awaitTermination()
64 |   }
65 | }
66 | 
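A quick way to verify the running totals written above is to read the whole hash back from Redis with the same pool and keys; a minimal sketch:

import scala.collection.JavaConverters._

// Dump the accumulated per-user totals from db 2 (same dbIndex and clickHashKey as above).
val jedis = RedisClient.pool.getResource
try {
  jedis.select(2)
  jedis.hgetAll("app::user:click").asScala.foreach { case (uid, count) =>
    println(s"$uid -> $count")
  }
} finally {
  jedis.close()
}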
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/kafka_sparkStreaming_redis/UserClickCountByWindowAnalytics.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.kafka_sparkStreaming_redis
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
7 | import org.apache.spark.streaming.kafka010.KafkaUtils
8 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
9 | import org.apache.spark.streaming.{Seconds, StreamingContext}
10 | 
11 | /**
12 |  * Every 5 seconds, count the clicks received by each device type over the past 10 seconds.
13 |  *
14 |  * Notes:
15 |  * 1. Window computations that use an inverse reduce function require a checkpoint directory.
16 |  * 2. Both the window length and the slide interval must be integer multiples of the Spark Streaming batch interval, otherwise an error is thrown.
17 |  */
18 | object UserClickCountByWindowAnalytics {
19 |   def main(args: Array[String]): Unit = {
20 |     // Create SparkConf and StreamingContext
21 |     val master = if (args.length > 0) args(0) else "local[1]"
22 |     // Checkpoint directory
23 |     val checkpointDir = if (args.length > 1) args(1) else "data/checkpoint/redis/UserClickCountByWindowAnalytics"
24 |     val conf = new SparkConf().setMaster(master).setAppName("UserClickCountByWindowAnalytics")
25 |     val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
26 |     // Enable checkpointing
27 |     ssc.checkpoint(checkpointDir)
28 | 
29 |     // Kafka configuration: consume messages from the user_events_redis topic
30 |     val topics = Array("user_events_redis")
31 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
32 |     // Kafka consumer parameters
33 |     val kafkaParams = Map[String,Object](
34 |       "bootstrap.servers" -> brokers,
35 |       "key.deserializer" -> classOf[StringDeserializer],
36 |       "value.deserializer" -> classOf[StringDeserializer],
37 |       "group.id" -> "UserClickCountByWindowAnalytics_group",
38 |       "auto.offset.reset" -> "latest",
39 |       "enable.auto.commit" -> (false: java.lang.Boolean)
40 |     )
41 |     // Redis storage settings
42 |     val dbIndex = 2
43 |     val clickHashKey = "app::os_type:click"
44 | 
45 |     // Create the direct Kafka stream
46 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
47 |     val events = kafkaStream.flatMap(
48 |       line => {
49 |         val data = JSONObject.fromObject(line.value())
50 |         Some(data)
51 |       })
52 | 
53 |     // Every 5 seconds, count clicks per os_type over the past 10 seconds
54 |     val userClicks = events.map(x => {(x.getString("os_type"),x.getInt("click_count"))})
55 |       .reduceByKeyAndWindow(_+_,_-_,Seconds(10),Seconds(5)) // add new values, subtract expired values, 10-second window, 5-second slide
56 |     // userClicks.foreachRDD(rdd =>{rdd.foreach(println(_))}) // for inspecting the data format during testing
57 |     userClicks.foreachRDD(rdd => {
58 |       rdd.foreachPartition(partitionOfRecords => {
59 |         partitionOfRecords.foreach(pair => {
60 |           val jedis = RedisClient.pool.getResource
61 |           jedis.select(dbIndex)
62 |           val os_type = pair._1
63 |           val clickCount = pair._2
64 |           jedis.lpush(os_type,String.valueOf(clickCount)) // push this window's count onto a per-os_type list
65 |           RedisClient.pool.returnResource(jedis)
66 |         })
67 |       })
68 |     })
69 |     ssc.start()
70 |     ssc.awaitTermination()
71 |   }
72 | 
73 | }
74 | 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/sparkStreamingExactltyOnce/KafkaEventProducer.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.sparkStreamingExactltyOnce
2 | 
3 | import java.util.Properties
4 | 
5 | import net.sf.json.JSONObject
6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
7 | 
8 | import scala.util.Random
9 | 
10 | /**
11 |  * Simulated Kafka producer that continuously writes user-behavior events as JSON messages.
12 |  */
13 | object KafkaEventProducer {
14 | 
15 |   private val users = Array(
16 |     "df354f90-5acd-4c55-a3e2-adc045f628c3", "e20f8e06-7717-4236-87f0-484a82f00b52",
17 |     "293901ca-9a58-4ef9-8c01-fa3c766ca236", "2b175ac2-f1a6-4fcc-a437-d2f01828b493",
18 |     "27e51fd9-2be9-405c-b81a-b34e2f6379dd", "f3f2c74d-5fe0-4cce-8ce1-a2bdd5ad82b8",
19 |     "ef062789-6214-493d-8aad-4b15f91ec5d3", "569e4b06-9301-4a9d-842c-1e6aa9b4f39b",
20 |     "7637be73-6bd8-4170-890f-6352b21b8ce0", "06321173-8abb-40a8-af66-3dec3ff1ce5d")
21 | 
22 |   private val sites = Array(
23 |     "Android","IOS","PC"
24 |   )
25 | 
26 |   private val random = new Random()
27 | 
28 |   def getUserID(): String = {
29 |     val userPointer = random.nextInt(10)
30 |     users(userPointer)
31 |   }
32 | 
33 |   def getSite(): String = {
34 |     val sitePointer = random.nextInt(3)
35 |     sites(sitePointer)
36 |   }
37 | 
38 |   def click(): Double = {
39 |     random.nextInt(10)
40 |   }
41 | 
42 |   def main(args: Array[String]): Unit = {
43 |     val topics = "user_events_ExactltyOnce"
44 |     val brokers = "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
45 |     val props = new Properties()
46 |     props.put("bootstrap.servers",brokers)
47 |     props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
48 |     props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
49 |     props.put("acks", "1") // "acks" is the config name for the new producer API ("request.required.acks" was the legacy name)
50 | 
51 |     // val kafkaConfig = new ProducerConfig(props)
52 |     val producer = new KafkaProducer[String,String](props)
53 |     while (true) {
54 |       val event = new JSONObject()
55 |       event
56 |         .accumulate("uid", getUserID()) // user id
57 |         .accumulate("event_time", System.currentTimeMillis.toString) // click timestamp (ms)
58 |         .accumulate("os_type", getSite()) // device type
59 |         .accumulate("click_count", click()) // click count
60 | 
61 |       // produce event message
62 |       producer.send(new ProducerRecord[String,String](topics,event.toString()))
63 |       println("Message sent: " + event.toString)
64 | 
65 |       Thread.sleep(200)
66 |     }
67 |   }
68 | }
69 | 
--------------------------------------------------------------------------------
/src/main/scala/org/sparkStreaming/sparkStreamingExactltyOnce/SparkStreamingExactlyOnce.scala:
--------------------------------------------------------------------------------
1 | package org.sparkStreaming.sparkStreamingExactltyOnce
2 | 
3 | import net.sf.json.JSONObject
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
8 | import org.apache.spark.streaming.kafka010.KafkaUtils
9 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
10 | import org.sparkStreaming.kafka_sparkStreaming_mysql.DruidConnectionPool
11 | 
12 | /**
13 |  * How to achieve exactly-once semantics in Spark Streaming.
14 |  *
15 |  * Exactly-once semantics is one of the hard problems in real-time computing.
16 |  * Guaranteeing that every record is processed exactly once, with nothing lost even when servers or the network fail,
17 |  * requires not only support from the streaming framework itself, but also cooperation from the upstream messaging system and the downstream data store.
18 |  * In addition, the processing logic has to follow certain rules before exactly-once is truly achieved.
19 |  */
20 | object SparkStreamingExactlyOnce {
21 | 
22 |   def main(args: Array[String]): Unit = {
23 |     // Create SparkConf and StreamingContext
24 |     val master = if (args.length > 0) args(0) else "local[1]"
25 |     // Checkpoint directory
26 |     val checkpointDir = if (args.length > 1) args(1) else "data/checkpoint/exactlyOnce/SparkStreamingExactlyOnce"
27 |     // Create SparkConf
28 |     val conf = new SparkConf().setMaster(master).setAppName("SparkStreamingExactlyOnce")
29 | 
30 |     // Kafka configuration: consume messages from the user_events_ExactltyOnce topic
31 |     val brokers = if (args.length > 2) args(2) else "192.168.183.150:9092,192.168.183.151:9092,192.168.183.152:9092"
32 |     val topicNames = if (args.length > 3) args(3) else "user_events_ExactltyOnce"
33 | 
34 |     def createSSC(): StreamingContext = {
35 |       val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
36 |       kafkaTest(ssc,brokers,topicNames) // set up the Spark transformations and actions
37 |       ssc.checkpoint(checkpointDir)
38 |       ssc
39 |     }
40 | 
41 |     // On restart, recover the context from the checkpoint instead of rebuilding it
42 |     val ssc = StreamingContext.getOrCreate(checkpointDir,createSSC)
43 | 
44 |     ssc.start()
45 |     ssc.awaitTermination()
46 |   }
47 | 
48 |   /**
49 |    * Transformations and actions for consuming the Kafka data.
50 |    * @param ssc
51 |    * @param brokers
52 |    * @param topicNames
53 |    */
54 |   def kafkaTest(ssc: StreamingContext, brokers: String, topicNames: String): Unit = {
55 | 
56 |     val topics = Array(topicNames)
57 |     // Kafka parameters
58 |     val kafkaParams = Map[String,Object](
59 |       "bootstrap.servers" -> brokers,
60 |       "key.deserializer" -> classOf[StringDeserializer],
61 |       "value.deserializer" -> classOf[StringDeserializer],
62 |       "group.id" -> "SparkStreamingExactlyOnce_group",
63 |       "auto.offset.reset" -> "latest",
64 |       "enable.auto.commit" -> (false: java.lang.Boolean)
65 |     )
66 |     // Create the stream; the direct API (KafkaUtils.createDirectStream) is the basis for exactly-once semantics.
67 |     val kafkaStream = KafkaUtils.createDirectStream[String,String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
68 |     val events = kafkaStream.flatMap(
69 |       line => {
70 |         val data = JSONObject.fromObject(line.value())
71 |         Some(data)
72 |       })
73 | 
74 |     // Count clicks per user: sum click_count by uid (the running total is accumulated in MySQL)
75 |     val userClicks = events.map(x => {(x.getString("uid"),x.getInt("click_count"))}) // per-micro-batch aggregation
76 |       .reduceByKey(_+_)
77 |     userClicks.foreachRDD(rdd => {
78 |       rdd.foreachPartition(partitionOfRecords => {
79 |         partitionOfRecords.foreach(pair => {
80 |           // Borrow a connection from the Druid connection pool
81 |           val dataSource = DruidConnectionPool.getInstance().dataSource
82 |           val conn = dataSource.getConnection
83 |           val uid = pair._1
84 |           val clickCount = pair._2
85 |           val sql_isExist = "SELECT * from streaming where uid = '" + uid + "'"
86 |           val sql_insert = "insert into streaming(uid,clickCount) values('" + uid + "'," + clickCount + ")"
87 |           val stmt = conn.createStatement(); val resultSet = stmt.executeQuery(sql_isExist)
88 |           if (resultSet.next()) {
89 |             val count = resultSet.getString(2).toInt + clickCount
90 |             val sql_update = "update streaming set clickCount ='" + count + "' where uid = '" + uid + "'"
91 |             stmt.executeUpdate(sql_update)
92 |           }
93 |           else stmt.executeUpdate(sql_insert)
94 |           stmt.close(); conn.close()
95 |         })
96 |       })
97 |     })
98 |   }
99 | }
100 | 
--------------------------------------------------------------------------------
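The job above combines the direct API with checkpointing, which by itself still delivers at-least-once output to MySQL. A common way to close the gap is to write the results and the processed Kafka offsets in the same database transaction, so a replayed batch can be detected and skipped. A minimal sketch under that assumption (the stream_offsets table and the writeAtomically helper are hypothetical and not part of this repo; a full solution would also compare the stored offset before reapplying a batch):

import java.sql.Connection
import org.apache.spark.streaming.kafka010.OffsetRange

// Write one partition's aggregates and its Kafka offset range in a single MySQL transaction,
// so the results and the offsets become visible together or not at all.
def writeAtomically(conn: Connection, counts: Iterator[(String, Int)], range: OffsetRange): Unit = {
  conn.setAutoCommit(false)
  try {
    val upsert = conn.prepareStatement(
      "INSERT INTO streaming(uid, clickCount) VALUES(?, ?) " +
      "ON DUPLICATE KEY UPDATE clickCount = clickCount + VALUES(clickCount)")
    counts.foreach { case (uid, delta) =>
      upsert.setString(1, uid); upsert.setInt(2, delta); upsert.executeUpdate()
    }
    upsert.close()
    val saveOffset = conn.prepareStatement(
      "REPLACE INTO stream_offsets(topic, `partition`, untilOffset) VALUES(?, ?, ?)")
    saveOffset.setString(1, range.topic)
    saveOffset.setInt(2, range.partition)
    saveOffset.setLong(3, range.untilOffset)
    saveOffset.executeUpdate(); saveOffset.close()
    conn.commit() // results and offsets committed as one unit
  } catch {
    case e: Exception => conn.rollback(); throw e
  } finally {
    conn.setAutoCommit(true)
    conn.close()
  }
}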