├── README.md ├── pom.xml └── src └── main ├── resources ├── advUrlCount.log ├── bs_log │ ├── 19735E1C66.log │ ├── DDE7970F68.log │ └── E549D940E0.log └── ip.txt └── scala ├── com └── zxl │ ├── spark1_6 │ ├── dataframe │ │ └── SQLDemo.scala │ ├── elastic │ │ └── ElasticSpark.scala │ ├── flume │ │ └── FlumePushWordCount.scala │ ├── jedis │ │ └── JedisConnectionPool.scala │ ├── kafka │ │ ├── DirectKafkaWordCount.scala │ │ ├── KafkaWordCount.scala │ │ └── LoggerLevels.scala │ ├── my_partitioner │ │ └── UrlCountPartition.scala │ ├── my_sort │ │ └── CustomSort.scala │ ├── mysql │ │ └── JdbcRDDDemo.scala │ ├── simple │ │ ├── AdvUrlCount.scala │ │ ├── IpDemo.scala │ │ ├── UserLocation.scala │ │ └── WordCount.scala │ └── streaming │ │ ├── LoggerLevels.scala │ │ ├── StateFulWordCount.scala │ │ ├── StreamingWordCount.scala │ │ └── WindowOpts.scala │ └── spark2_2 │ ├── dataset │ ├── actions.scala │ ├── basicAction.scala │ └── createDataSet.scala │ ├── kafka │ ├── StreamingKafka10.scala │ └── StreamingKafka8.scala │ ├── streaming │ └── StreamingToMysql.scala │ └── structured │ ├── JDBCSink.scala │ ├── MySqlPool.scala │ └── StructuredStreamingKafka.scala └── org └── apache └── spark └── streaming └── kafka └── KafkaManager.scala /README.md: -------------------------------------------------------------------------------- 1 | # Spark-Example 2 | com.zxl.spark2_2.kafka 3 | 4 | StreamingKafka8: 5 | 6 | SparkStreaming从kafka中读取数据 7 | 8 | kafka版本0.8 9 | 10 | 采取直连方式 11 | 12 | StreamingKafka10: 13 | 14 | SparkStreaming从kafka中读取数据 15 | 16 | kafka版本0.10 17 | 18 | 采取直连方式 19 | 20 | com.zxl.spark2_2.streaming 21 | 22 | StreamingToMysql: 23 | 24 | SparkStreaming读取数据,存储到Mysql中 25 | 26 | com.zxl.spark2_2.structured 27 | 28 | JDBCSink: 29 | 30 | 处理从StructuredStreaming中向mysql中写入数据 31 | 32 | MySqlPool: 33 | 34 | 从mysql连接池中获取连接 35 | 36 | StructuredStreamingKafka: 37 | 38 | 结构化流从kafka中读取数据存储到关系型数据库mysql 39 | 40 | 目前结构化流对kafka的要求版本0.10及以上 41 | 42 | com.zxl.spark2_2.dataset 43 | 44 | createDataSet: 45 | 46 | DataSet创建的多种方式 47 | 48 | basicAction: 49 | 50 | DataSet的基本操作 51 | 52 | actions: 53 | 54 | DataSet的Action操作 55 | 1.map操作,flatMap操作 56 | 2.filter操作,where操作 57 | 3.去重操作 58 | 4.加法/减法操作 59 | 5.select操作 60 | 6.排序操作 61 | 7.分割抽样操作 62 | 8.列操作 63 | 9.join操作 64 | 10.分组聚合操作 65 | 66 | com.zxl.spark1_6.dataframe 67 | 68 | SQLDemo: 69 | 70 | 从hdfs中读取数据,转化为DataFrame,执行简单操作 71 | 72 | com.zxl.spark1_6.elastic 73 | 74 | ElasticSpark: 75 | 76 | Elasticsearch是一个基于Lucene的实时地分布式搜索和分析引擎。 77 | 78 | 设计用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便。 79 | 80 | com.zxl.spark1_6.flume 81 | 82 | FlumePushWordCount: 83 | 84 | flume向spark发送数据 85 | 86 | 添加三个jar包 87 | 88 | - commons-lang3-3.3.2.jar 89 | 90 | - scala-library-2.10.5.jar 91 | 92 | - spark-streaming-flume-sink_2.10-1.6.1.jar 93 | 94 | 打成jar包上传到集群中运行 95 | 96 | 集群命令如下: 97 | 98 | bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.flume.FlumePushWordCount /jar/____.jar 192.168.13.131 8888 99 | 100 | com.zxl.spark1_6.jedis 101 | 102 | JedisConnectionPool: 103 | 104 | 获得Jedis连接,进行简单操作 105 | 106 | com.zxl.spark1_6.kafka 107 | 108 | DirectKafkaWordCount: 109 | 110 | Spark Streaming维护偏移量相关的信息,实现零数据丢失,保证不重复消费 111 | 112 | 采用直连的方式有一个缺点,就是不再向zookeeper中更新offset信息。 113 | 114 | 因此,在采用直连的方式消费kafka中的数据的时候,大体思路是首先获取保存在zookeeper中的偏移量信息, 115 | 116 | 根据偏移量信息去创建stream,消费数据后再把当前的偏移量写入zookeeper中 117 | 118 | 在2.0以前的版本中KafkaManager这个类是private权限,需要把它拷贝到项目里使用。 119 | org.apache.spark.streaming.kafka 120 | 121 | KafkaWordCount: 122 | 123 | 从集群中的kafka读取数据操作 124 | 125 | 运行时参数: 126 | 127 | node1:2181,node2:2181,node3:2181 
g1 test 2 128 | 129 | 其中g1为组名,此处随意写,test为topic名,kafka中的topic名要一致 130 | 131 | 集群命令(需先启动完成): 132 | 133 | 1.启动kafak 134 | 135 | bin/kafka-server-start.sh config/server.properties > /dev/null 2>&1 & 136 | 137 | 2.创建topic 138 | 139 | bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 3 --partitions 3 --topic test 140 | 141 | 3.向topic中添加数据 142 | 143 | bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test 144 | 145 | com.zxl.spark1_6.my_partitioner 146 | 147 | UrlCountPartition: 148 | 149 | 自定义分区 150 | 151 | 数据格式(时间点 url地址),例如: 152 | 20160321101954 http://net.zxl.cn/net/video.shtml 153 | 154 | 处理成数据(k, v) 155 | 156 | 对于数据(k, v) 157 | 158 | 重写自己的 partitioner 159 | 160 | com.zxl.spark1_6.my_sort 161 | 162 | CustomSort:自定义排序 163 | 164 | com.zxl.spark1_6.mysql 165 | 166 | JdbcRDDDemo:简单连接数据库操作 167 | 168 | com.zxl.spark1_6.simple 169 | 170 | AdvUrlCount: 171 | 172 | 读取文本内容,根据指定的学科, 取出点击量前三的 173 | 174 | 文本内容为某广告链接点击量,格式为:(时间点 某学科url链接) 175 | 176 | 举例:(20160321101957 http://net.zxl.cn/net/course.shtml) 177 | 178 | IpDemo: 179 | 180 | 数据格式如下: 181 | (1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302) 182 | 183 | 根据ip地址转换为数字,从数据集中找出详细信息. 184 | 185 | 为了简化查找速率,采用二分查找. 186 | 187 | UserLocation: 188 | 189 | 根据日志统计出每个用户在站点所呆时间最长的前2个的信息 190 | 191 | 日志内容格式为(手机号,时间点,基站站点,事件类型),事件类型为1时是进入基站,0是出基站。 192 | 193 | 1, 先根据"手机号_站点"为唯一标识, 算一次进站出站的时间, 返回(手机号_站点, 时间间隔) 194 | 195 | 2, 以"手机号_站点"为key, 统计每个站点的时间总和, ("手机号_站点", 时间总和) 196 | 197 | 3, ("手机号_站点", 时间总和) --> (手机号, 站点, 时间总和) 198 | 199 | 4, (手机号, 站点, 时间总和) --> groupBy().mapValues(以时间排序,取出前2个) --> (手机->((m,s,t)(m,s,t))) 200 | 201 | WordCount: 202 | 203 | 简单WordCount实现 204 | 205 | 集群上执行示例,指定相关配置 206 | 207 | bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.simple.WordCount --executor-memory 512m --total-executor-cores 2 /opt/soft/jar/hello-spark-1.0.jar hdfs://node1:9000/wc hdfs://node1:9000/out 208 | 209 | com.zxl.spark1_6.streaming 210 | 211 | LoggerLevels: 212 | 213 | 设置打印的log的级别 214 | 215 | StateFulWordCount: 216 | 217 | Spark Streaming累加器操作(updateStateByKey) 218 | 219 | StreamingWordCount: 220 | 221 | 通过SparkStreaming简单实现WordCount 222 | 223 | WindowOpts: 224 | 225 | SparkStreaming窗口函数的实现 226 | 227 | org.apache.spark.streaming.kafka 228 | 229 | KafkaManager: 230 | 231 | SparkStreaming直连kafka获取数据,自己编写偏移量offset,用于spark2.0以前 232 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.zxl 8 | spark-example 9 | 1.0 10 | 11 | 12 | 1.8 13 | 1.8 14 | UTF-8 15 | 2.11.8 16 | 2.2.0 17 | 2.6.4 18 | 19 | 20 | 21 | 22 | org.scala-lang 23 | scala-library 24 | ${scala.version} 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-core_2.11 30 | ${spark.version} 31 | 32 | 33 | 34 | org.apache.hadoop 35 | hadoop-client 36 | ${hadoop.version} 37 | 38 | 39 | 40 | mysql 41 | mysql-connector-java 42 | 5.1.32 43 | 44 | 45 | 46 | org.apache.spark 47 | spark-sql_2.11 48 | ${spark.version} 49 | 50 | 51 | 52 | org.apache.spark 53 | spark-hive_2.11 54 | ${spark.version} 55 | 56 | 57 | org.apache.hive 58 | hive-jdbc 59 | ${spark.version} 60 | 61 | 62 | 63 | org.apache.spark 64 | spark-streaming_2.11 65 | ${spark.version} 66 | 67 | 68 | 69 | org.apache.spark 70 | spark-streaming-flume_2.11 71 | ${spark.version} 72 | 73 | 74 | 75 | org.apache.spark 76 | spark-streaming-kafka-0-8_2.11 77 | ${spark.version} 78 | 79 | 80 | 81 | org.apache.spark 82 | 
spark-streaming-kafka-0-10_2.11 83 | ${spark.version} 84 | 85 | 86 | 87 | org.apache.spark 88 | spark-sql-kafka-0-10_2.11 89 | ${spark.version} 90 | 91 | 92 | 93 | org.apache.spark 94 | spark-graphx_2.11 95 | ${spark.version} 96 | 97 | 98 | 99 | org.apache.spark 100 | spark-mllib_2.11 101 | ${spark.version} 102 | 103 | 104 | 105 | org.scalanlp 106 | breeze_2.11 107 | 0.12 108 | 109 | 110 | 111 | redis.clients 112 | jedis 113 | 2.8.1 114 | 115 | 116 | 117 | org.elasticsearch 118 | elasticsearch 119 | 2.3.1 120 | 121 | 122 | 123 | org.elasticsearch 124 | elasticsearch-spark_2.11 125 | 2.3.0 126 | 127 | 128 | 129 | mysql 130 | mysql-connector-java 131 | 5.1.35 132 | 133 | 134 | 135 | 136 | src/main/scala 137 | 138 | 139 | net.alchim31.maven 140 | scala-maven-plugin 141 | 3.2.2 142 | 143 | 144 | 145 | compile 146 | testCompile 147 | 148 | 149 | 150 | -make:transitive 151 | -dependencyfile 152 | ${project.build.directory}/.scala_dependencies 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | org.apache.maven.plugins 161 | maven-shade-plugin 162 | 2.4.3 163 | 164 | 165 | package 166 | 167 | shade 168 | 169 | 170 | 171 | 172 | *:* 173 | 174 | META-INF/*.SF 175 | META-INF/*.DSA 176 | META-INF/*.RSA 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /src/main/resources/bs_log/19735E1C66.log: -------------------------------------------------------------------------------- 1 | 18688888888,20160327082400,16030401EAFB68F1E3CDF819735E1C66,1 2 | 18611132889,20160327082500,16030401EAFB68F1E3CDF819735E1C66,1 3 | 18688888888,20160327170000,16030401EAFB68F1E3CDF819735E1C66,0 4 | 18611132889,20160327180000,16030401EAFB68F1E3CDF819735E1C66,0 5 | -------------------------------------------------------------------------------- /src/main/resources/bs_log/DDE7970F68.log: -------------------------------------------------------------------------------- 1 | 18611132889,20160327075000,9F36407EAD0629FC166F14DDE7970F68,1 2 | 18688888888,20160327075100,9F36407EAD0629FC166F14DDE7970F68,1 3 | 18611132889,20160327081000,9F36407EAD0629FC166F14DDE7970F68,0 4 | 18688888888,20160327081300,9F36407EAD0629FC166F14DDE7970F68,0 5 | 18688888888,20160327175000,9F36407EAD0629FC166F14DDE7970F68,1 6 | 18611132889,20160327182000,9F36407EAD0629FC166F14DDE7970F68,1 7 | 18688888888,20160327220000,9F36407EAD0629FC166F14DDE7970F68,0 8 | 18611132889,20160327230000,9F36407EAD0629FC166F14DDE7970F68,0 9 | -------------------------------------------------------------------------------- /src/main/resources/bs_log/E549D940E0.log: -------------------------------------------------------------------------------- 1 | 18611132889,20160327081100,CC0710CC94ECC657A8561DE549D940E0,1 2 | 18688888888,20160327081200,CC0710CC94ECC657A8561DE549D940E0,1 3 | 18688888888,20160327081900,CC0710CC94ECC657A8561DE549D940E0,0 4 | 18611132889,20160327082000,CC0710CC94ECC657A8561DE549D940E0,0 5 | 18688888888,20160327171000,CC0710CC94ECC657A8561DE549D940E0,1 6 | 18688888888,20160327171600,CC0710CC94ECC657A8561DE549D940E0,0 7 | 18611132889,20160327180500,CC0710CC94ECC657A8561DE549D940E0,1 8 | 18611132889,20160327181500,CC0710CC94ECC657A8561DE549D940E0,0 9 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/dataframe/SQLDemo.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.dataframe 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import 
org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * 从hdfs中读取数据,转化为DataFrame,执行简单操作 8 | * Created by ZXL on 2017/10/23. 9 | */ 10 | object SQLDemo { 11 | 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setAppName("SQLDemo")//.setMaster("local") 14 | val sc = new SparkContext(conf) 15 | val sqlContext = new SQLContext(sc) 16 | // 设置可以读取集群中的hdfs中文件 17 | System.setProperty("user.name", "root") 18 | 19 | val personRdd = sc.textFile("hdfs://node1:9000/person.txt").map(line =>{ 20 | val fields = line.split(",") 21 | Person(fields(0).toLong, fields(1), fields(2).toInt) 22 | }) 23 | 24 | import sqlContext.implicits._ 25 | // 转为DataFrame 26 | val personDf = personRdd.toDF 27 | 28 | personDf.show() 29 | 30 | personDf.registerTempTable("person") 31 | 32 | sqlContext.sql("select * from person where age >= 20 order by age desc limit 2").show() 33 | 34 | sc.stop() 35 | 36 | } 37 | 38 | case class Person(id: Long, name: String, age: Int) 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/elastic/ElasticSpark.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.elastic 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.elasticsearch.spark._ 5 | 6 | /** 7 | * Elasticsearch是一个基于Lucene的实时地分布式搜索和分析引擎。 8 | * 设计用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便。 9 | * 10 | * Created by ZXL on 2017/10/23. 11 | */ 12 | object ElasticSpark { 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setAppName("ElasticSpark").setMaster("local") 16 | conf.set("es.nodes", "192.168.13.131,192.168.13.132,192.168.13.133") 17 | conf.set("es.port", "9200") 18 | conf.set("es.index.auto.create", "true") 19 | val sc = new SparkContext(conf) 20 | //val query: String = "{\"query\":{\"match_all\":{}}}" 21 | val start = 1463998397 22 | val end = 1463998399 23 | // val query: String = 24 | // s"""{ 25 | // "query": {"match_all": {}}, 26 | // "filter": { 27 | // "bool": { 28 | // "must": { 29 | // "range": { 30 | // "access.time": { 31 | // "gte": "$start", 32 | // "lte": "$end" 33 | // } 34 | // } 35 | // } 36 | // } 37 | // } 38 | // }""" 39 | 40 | val tp = "1" 41 | val query: String = s"""{ 42 | "query": {"match_all": {}}, 43 | "filter" : { 44 | "bool": { 45 | "must": [ 46 | {"term" : {"access.type" : $tp}}, 47 | { 48 | "range": { 49 | "access.time": { 50 | "gte": "$start", 51 | "lte": "$end" 52 | } 53 | } 54 | } 55 | ] 56 | } 57 | } 58 | }""" 59 | val rdd1 = sc.esRDD("accesslogs", query) 60 | 61 | println(rdd1.collect().toBuffer) 62 | println(rdd1.collect().size) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/flume/FlumePushWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.flume 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.flume.FlumeUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * flume向spark发送数据 9 | * 10 | * 添加三个jar包 11 | * - commons-lang3-3.3.2.jar 12 | * - scala-library-2.10.5.jar 13 | * - spark-streaming-flume-sink_2.10-1.6.1.jar 14 | * 15 | * 打成jar包上传到集群中运行 16 | * 集群命令如下: 17 | * bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.flume.FlumePushWordCount 18 | * /jar/____.jar 192.168.13.131 8888 19 | * 20 | * Created by ZXL on 2017/10/23. 
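 *
 * Note on the Flume side (an assumption about the agent setup, not shown in this repo):
 * because this job uses the push-based receiver (FlumeUtils.createStream), the Flume agent
 * needs an avro sink whose hostname and port match the two program arguments
 * (192.168.13.131 and 8888 in the spark-submit example above), and the Spark job must be
 * running before the agent starts so the receiver is listening when Flume connects.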
21 | */ 22 | object FlumePushWordCount { 23 | 24 | def main(args: Array[String]) { 25 | val host = args(0) 26 | val port = args(1).toInt 27 | val conf = new SparkConf().setAppName("FlumeWordCount")//.setMaster("local[2]") 28 | val ssc = new StreamingContext(conf, Seconds(5)) 29 | //推送方式: flume向spark发送数据 30 | val flumeStream = FlumeUtils.createStream(ssc, host, port) 31 | //flume中的数据通过event.getBody()才能拿到真正的内容 32 | val words = flumeStream.flatMap(x => new String(x.event.getBody().array()).split(" ")).map((_, 1)) 33 | 34 | val results = words.reduceByKey(_ + _) 35 | results.print() 36 | ssc.start() 37 | ssc.awaitTermination() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/jedis/JedisConnectionPool.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.jedis 2 | 3 | import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig} 4 | 5 | /** 6 | * 获得Jedis连接,进行简单操作 7 | * Created by ZXL on 2016/5/24. 8 | */ 9 | object JedisConnectionPool{ 10 | 11 | val config = new JedisPoolConfig() 12 | //最大连接数, 13 | config.setMaxTotal(10) 14 | //最大空闲连接数, 15 | config.setMaxIdle(5) 16 | //当调用borrow Object方法时,是否进行有效性检查 --> 17 | config.setTestOnBorrow(true) 18 | val pool = new JedisPool(config, "172.16.0.101", 6379) 19 | 20 | def getConnection(): Jedis = { 21 | pool.getResource 22 | 23 | } 24 | 25 | def main(args: Array[String]) { 26 | val conn = JedisConnectionPool.getConnection() 27 | val r = conn.keys("*") 28 | println(r) 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/kafka/DirectKafkaWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.kafka 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.streaming.kafka.KafkaManager 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | 10 | /** 11 | * Spark Streaming维护偏移量相关的信息,实现零数据丢失,保证不重复消费 12 | * 采用直连的方式有一个缺点,就是不再向zookeeper中更新offset信息。 13 | * 因此,在采用直连的方式消费kafka中的数据的时候,大体思路是首先获取保存在zookeeper中的偏移量信息, 14 | * 根据偏移量信息去创建stream,消费数据后再把当前的偏移量写入zookeeper中 15 | * 16 | * 在2.0以前的版本中KafkaManager这个类是private权限的,需要把它拷贝到项目里使用。 17 | * org.apache.spark.streaming.kafka 18 | * 19 | * Created by ZXL on 2017/11/1. 
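 *
 * Example invocation arguments (hypothetical hosts; topic and group names follow the README):
 * {{{
 *   // brokers = "node1:9092,node2:9092"   topics = "test"   groupId = "g1"
 * }}}
 * Since offsets are written to zookeeper only after a batch has been processed, a failure
 * between processing and the offset update can replay that batch, so downstream writes
 * should ideally be idempotent.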
20 | */ 21 | object DirectKafkaWordCount { 22 | 23 | /* def dealLine(line: String): String = { 24 | val list = line.split(',').toList 25 | // val list = AnalysisUtil.dealString(line, ',', '"')// 把dealString函数当做split即可 26 | list.get(0).substring(0, 10) + "-" + list.get(26) 27 | }*/ 28 | 29 | def processRdd(rdd: RDD[(String, String)]): Unit = { 30 | val lines = rdd.map(_._2) 31 | val words = lines.map(_.split(" ")) 32 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 33 | wordCounts.foreach(println) 34 | } 35 | 36 | def main(args: Array[String]) { 37 | if (args.length < 3) { 38 | System.err.println( 39 | s""" 40 | |Usage: DirectKafkaWordCount 41 | | is a list of one or more Kafka brokers 42 | | is a list of one or more kafka topics to consume from 43 | | is a consume group 44 | | 45 | """.stripMargin) 46 | System.exit(1) 47 | } 48 | 49 | Logger.getLogger("org").setLevel(Level.WARN) 50 | 51 | val Array(brokers, topics, groupId) = args 52 | 53 | // Create context with 2 second batch interval 54 | val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount") 55 | sparkConf.setMaster("local[*]") 56 | sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "5") 57 | sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 58 | 59 | val ssc = new StreamingContext(sparkConf, Seconds(2)) 60 | 61 | // Create direct kafka stream with brokers and topics 62 | val topicsSet = topics.split(",").toSet 63 | val kafkaParams = Map[String, String]( 64 | "metadata.broker.list" -> brokers, 65 | "group.id" -> groupId, 66 | "auto.offset.reset" -> "smallest" 67 | ) 68 | 69 | val km = new KafkaManager(kafkaParams) 70 | 71 | val messages = km.createDirectStream[String, String, StringDecoder, StringDecoder]( 72 | ssc, kafkaParams, topicsSet) 73 | 74 | messages.foreachRDD(rdd => { 75 | if (!rdd.isEmpty()) { 76 | // 先处理消息 77 | processRdd(rdd) 78 | // 再更新offsets 79 | km.updateZKOffsets(rdd) 80 | } 81 | }) 82 | 83 | ssc.start() 84 | ssc.awaitTermination() 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/kafka/KafkaWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.kafka 2 | 3 | import org.apache.spark.storage.StorageLevel 4 | import org.apache.spark.streaming.kafka.KafkaUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | import org.apache.spark.{HashPartitioner, SparkConf} 7 | 8 | /** 9 | * 从集群中的kafka读取数据操作 10 | * 11 | * 运行时参数: 12 | * node1:2181,node2:2181,node3:2181 g1 test 2 13 | * 其中g1为组名,此处随意写,test为topic名,kafka中的topic名要一致 14 | * 15 | * 集群命令(需先启动完成): 16 | * 1.启动kafak 17 | * bin/kafka-server-start.sh config/server.properties > /dev/null 2>&1 & 18 | * 2.创建topic 19 | * bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 3 --partitions 3 --topic test 20 | * 3.向topic中添加数据 21 | * bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test 22 | * 23 | * Created by ZXL on 2017/11/1. 
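 *
 * Unlike DirectKafkaWordCount, this job uses the receiver-based API (KafkaUtils.createStream),
 * so the consumer group's offsets are tracked in zookeeper. For end-to-end durability the
 * receiver would additionally need the write-ahead log enabled (a sketch, not set in the
 * code below):
 * {{{
 *   sparkConf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
 * }}}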
24 | */ 25 | object KafkaWordCount { 26 | 27 | val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => { 28 | //iter.flatMap(it => Some(it._2.sum + it._3.getOrElse(0)).map(x => (it._1, x))) 29 | iter.flatMap{case(x, y, z) => Some(y.sum + z.getOrElse(0)).map(i => (x, i))} 30 | } 31 | 32 | def main(args: Array[String]) { 33 | 34 | LoggerLevels.setStreamingLogLevels() 35 | val Array(zkQuorum, group, topics, numThreads) = args 36 | val sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]") 37 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 38 | ssc.checkpoint("D:\\test\\spark\\checkpoint2") 39 | // 线程执行个数 40 | val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap 41 | val data = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_AND_DISK_SER) 42 | // 返回(K, V),_._2返回的是值,值得输入是按空格分开 43 | val words = data.map(_._2).flatMap(_.split(" ")) 44 | val wordCounts = words.map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true) 45 | wordCounts.print() 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/kafka/LoggerLevels.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.kafka 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.internal.Logging 5 | 6 | object LoggerLevels extends Logging { 7 | 8 | def setStreamingLogLevels() { 9 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 10 | if (!log4jInitialized) { 11 | logInfo("Setting log level to [WARN] for streaming example." + 12 | " To override add a custom log4j.properties to the classpath.") 13 | Logger.getRootLogger.setLevel(Level.WARN) 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/my_partitioner/UrlCountPartition.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.my_partitioner 2 | 3 | import java.net.URL 4 | 5 | import org.apache.spark.{Partitioner, SparkContext, SparkConf} 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 自定义分区 10 | * 数据格式(时间点 url地址),例如: 11 | * 20160321101954 http://net.zxl.cn/net/video.shtml 12 | * 处理成数据(k, v) 13 | * 对于数据(k, v) 14 | * 重写自己的 partitioner 15 | * Created by ZXL on 2017/10/20. 
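 *
 * Worked example of the keying step for one input line (fields separated by a tab):
 * {{{
 *   // input:  20160321101954 <tab> http://net.zxl.cn/net/video.shtml
 *   // after split + reduceByKey + host extraction:
 *   //   ("net.zxl.cn", ("http://net.zxl.cn/net/video.shtml", <count>))
 * }}}
 * HostPartitioner then routes all records of one host to one partition, so saveAsTextFile
 * produces one part file per host, each holding that host's top-2 urls.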
16 | */ 17 | object UrlCountPartition { 18 | 19 | def main(args: Array[String]) { 20 | 21 | val conf = new SparkConf().setAppName("UrlCountPartition").setMaster("local[2]") 22 | val sc = new SparkContext(conf) 23 | 24 | // rdd1将数据切分,元组中放的是(URL, 1) 25 | val rdd1 = sc.textFile("D://test//spark//adv_url_count.log").map(line => { 26 | val f = line.split("\t") 27 | (f(1), 1) 28 | }) 29 | 30 | val rdd2 = rdd1.reduceByKey(_ + _) 31 | 32 | // (URL, n) 33 | val rdd3 = rdd2.map(t => { 34 | val url = t._1 35 | val host = new URL(url).getHost 36 | // host返回的是如 php.zxl.cn 37 | (host, (url, t._2)) 38 | }) 39 | 40 | // 得到结果为 ArrayBuffer(net.zxl.cn, java.zxl.cn, php.zxl.cn) 41 | val ints = rdd3.map(_._1).distinct().collect() 42 | // rdd3.repartition(3).saveAsTextFile("D://test//spark//out//out1") 43 | // println(ints.toBuffer) 44 | 45 | val hostPartitioner = new HostPartitioner(ints) 46 | // 取出每个 partitioner 中的信息 47 | val rdd4 = rdd3.partitionBy(hostPartitioner).mapPartitions(it => { 48 | it.toList.sortBy(_._2._2).reverse.take(2).iterator 49 | }) 50 | 51 | rdd4.saveAsTextFile("D://test//spark//out//out3") 52 | 53 | sc.stop() 54 | } 55 | } 56 | 57 | class HostPartitioner(ins: Array[String]) extends Partitioner { 58 | 59 | val parMap = new mutable.HashMap[String, Int]() 60 | var count = 0 61 | for(i <- ins) { 62 | parMap += (i -> count) 63 | count += 1 64 | } 65 | 66 | override def numPartitions: Int = ins.length 67 | 68 | override def getPartition(key: Any): Int = { 69 | // 根据 key 值获得分区 70 | parMap.getOrElse(key.toString, 0) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/my_sort/CustomSort.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.my_sort 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | // 第二种方式 6 | object OrderContext { 7 | 8 | /** 9 | * 第一种形式 10 | 11 | implicit object GirlOrdering extends Ordering[Girl] { 12 | override def compare(x: Girl, y: Girl): Int = { 13 | if(x.faceValue > y.faceValue) 1 14 | else if(x.faceValue == y.faceValue) { 15 | if(x.age > y.age) -1 else 1 16 | } else -1 17 | } 18 | } 19 | */ 20 | 21 | /** 22 | * 第二种形式 23 | */ 24 | implicit val girlOrdering = new Ordering[Girl] { 25 | override def compare(x: Girl, y: Girl): Int = { 26 | if(x.faceValue > y.faceValue) 1 27 | else if(x.faceValue == y.faceValue) { 28 | if(x.age > y.age) -1 else 1 29 | } else -1 30 | } 31 | } 32 | } 33 | 34 | /** 35 | * Created by ZXL on 2017/10/21. 
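 *
 * Expected result for the sample data in main: sortBy(..., false) with the implicit
 * Ordering above sorts by face value descending and, for equal face values, puts the
 * smaller age first, giving:
 * {{{
 *   ArrayBuffer((lll,95,22,3), (xxx,90,27,2), (zzz,90,28,1))
 * }}}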
36 | * 自定义排序 37 | */ 38 | object CustomSort { 39 | 40 | def main(args: Array[String]) { 41 | val conf = new SparkConf().setAppName("CustomSort").setMaster("local[2]") 42 | val sc = new SparkContext(conf) 43 | val rdd1 = sc.parallelize(List(("zzz", 90, 28, 1), ("xxx", 90, 27, 2), ("lll", 95, 22, 3))) 44 | import OrderContext._ 45 | val rdd2 = rdd1.sortBy(x => Girl(x._2, x._3), false) 46 | println(rdd2.collect().toBuffer) 47 | sc.stop() 48 | } 49 | } 50 | 51 | /** 52 | * 第一种方式 53 | * @param faceValue 54 | * @param age 55 | 56 | case class Girl(val faceValue: Int, val age: Int) extends Ordered[Girl] with Serializable { 57 | override def compare(that: Girl): Int = { 58 | if(this.faceValue == that.faceValue) { 59 | that.age - this.age 60 | } else { 61 | this.faceValue - that.faceValue 62 | } 63 | } 64 | } 65 | */ 66 | 67 | // 第二种方式 68 | case class Girl(faceValue: Int, age: Int) extends Serializable 69 | 70 | 71 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/mysql/JdbcRDDDemo.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.mysql 2 | 3 | import java.sql.DriverManager 4 | 5 | import org.apache.spark.rdd.JdbcRDD 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | /** 9 | * 简单连接数据库操作 10 | * Created by ZXL on 2017/10/22. 11 | */ 12 | object JdbcRDDDemo { 13 | 14 | def main(args: Array[String]) { 15 | val conf = new SparkConf().setAppName("JdbcRDDDemo").setMaster("local[2]") 16 | val sc = new SparkContext(conf) 17 | val connection = () => { 18 | Class.forName("com.mysql.jdbc.Driver").newInstance() 19 | DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?useUnicode=true&characterEncoding=utf-8", "root", "1234") 20 | } 21 | val jdbcRDD = new JdbcRDD( 22 | sc, 23 | connection, 24 | "SELECT * FROM ta where id >= ? AND id <= ?", 25 | // 1,4分别为两个占位符赋值,2表示两个任务一起读取数据 26 | 1, 4, 2, 27 | // 返回的内容 28 | r => { 29 | val id = r.getInt(1) 30 | val code = r.getString(2) 31 | (id, code) 32 | } 33 | ) 34 | val data = jdbcRDD.collect() 35 | println(data.toBuffer) 36 | sc.stop() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/AdvUrlCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.simple 2 | 3 | import java.net.URL 4 | 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * 读取文本内容,根据指定的学科, 取出点击量前三的 9 | * 文本内容为某广告链接点击量,格式为: 10 | * (时间点 某学科url链接) 11 | * 举例:(20160321101957 http://net.zxl.cn/net/course.shtml) 12 | * Created by ZXL on 2017/10/16. 
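 *
 * For each subject host in arr (java.zxl.cn, php.zxl.cn, net.zxl.cn) the job prints the
 * top-3 (host, url, clicks) triples sorted by clicks descending; shape of one printed line
 * (the counts depend on the log file):
 * {{{
 *   ArrayBuffer((net.zxl.cn,http://net.zxl.cn/net/course.shtml,<n1>), (net.zxl.cn,...,<n2>), (net.zxl.cn,...,<n3>))
 * }}}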
13 | */ 14 | object AdvUrlCount { 15 | 16 | def main(args: Array[String]) { 17 | 18 | // 从数据库中加载规则 19 | val arr = Array("java.zxl.cn", "php.zxl.cn", "net.zxl.cn") 20 | 21 | val conf = new SparkConf().setAppName("AdvUrlCount").setMaster("local[2]") 22 | val sc = new SparkContext(conf) 23 | 24 | // rdd1将数据切分,元组中放的是(URL, 1) 25 | val rdd1 = sc.textFile("D://test//spark//advUrlCount.log").map(line => { 26 | val f = line.split("\t") 27 | (f(1), 1) 28 | }) 29 | val rdd2 = rdd1.reduceByKey(_ + _) 30 | 31 | val rdd3 = rdd2.map(t => { 32 | val url = t._1 33 | val host = new URL(url).getHost 34 | (host, url, t._2) 35 | }) 36 | 37 | // println(rdd3.collect().toBuffer) 38 | 39 | // val rddjava = rdd3.filter(_._1 == "java.zxl.cn") 40 | // val sortdjava = rddjava.sortBy(_._3, false).take(3) 41 | // val rddphp = rdd3.filter(_._1 == "php.zxl.cn") 42 | 43 | for (ins <- arr) { 44 | val rdd = rdd3.filter(_._1 == ins) 45 | val result= rdd.sortBy(_._3, false).take(3) 46 | //通过JDBC向数据库中存储数据 47 | //id,学院,URL,次数, 访问日期 48 | println(result.toBuffer) 49 | } 50 | 51 | //println(sortdjava.toBuffer) 52 | sc.stop() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/IpDemo.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.simple 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.io.Source 5 | 6 | /** 7 | * 数据格式如下: 8 | * (1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302) 9 | * 根据ip地址转换为数字,从数据集中找出详细信息. 10 | * 为了简化查找速率,采用二分查找. 11 | * Created by ZXL on 2017/10/22. 12 | */ 13 | object IpDemo { 14 | 15 | // ip地址转换为数字 16 | // 如 100.101.102.103,从100开始向左移动8位 17 | def ip2Long(ip: String): Long = { 18 | val fragments = ip.split("[.]") 19 | var ipNum = 0L 20 | for (i <- 0 until fragments.length) { 21 | // | 二进制OR运算符 22 | // ipNum向左移动8位,相当于乘以256(即2^8) 23 | ipNum = fragments(i).toLong | ipNum << 8L 24 | } 25 | ipNum 26 | } 27 | 28 | // 从文件中读取数据 29 | def readData(path: String) = { 30 | 31 | val lines = new ArrayBuffer[String]() 32 | 33 | /** 34 | * java读取文件方式 35 | * val br = new BufferedReader(new InputStreamReader(new FileInputStream(path))) 36 | * var s: String = null 37 | * var flag = true 38 | * while (flag) { 39 | * s = br.readLine() 40 | * if (s != null) 41 | * lines += s 42 | * else 43 | * flag = false 44 | * } 45 | * lines 46 | */ 47 | 48 | val content = Source.fromFile(path) 49 | for (line <- content.getLines()) { 50 | lines += line 51 | } 52 | lines 53 | } 54 | 55 | // 二分查找ip的下标地址,ip地址已经转为十进制 56 | def binarySearch(lines: ArrayBuffer[String], ip: Long): Int = { 57 | var low = 0 58 | var high = lines.length - 1 59 | while (low <= high) { 60 | val middle = (low + high) / 2 61 | if ((ip >= lines(middle).split("\\|")(2).toLong) && (ip <= lines(middle).split("\\|")(3).toLong)) 62 | return middle 63 | if (ip < lines(middle).split("\\|")(2).toLong) 64 | high = middle - 1 65 | else 66 | low = middle + 1 67 | } 68 | -1 69 | } 70 | 71 | def main(args: Array[String]) { 72 | val ip = "120.55.185.61" 73 | val ipNum = ip2Long(ip) 74 | println(ipNum) 75 | val lines = readData("d://test//spark//ip//ip.txt") 76 | val index = binarySearch(lines, ipNum) 77 | print(lines(index)) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/UserLocation.scala: -------------------------------------------------------------------------------- 1 | package 
com.zxl.spark1_6.simple 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * 根据日志统计出每个用户在站点所呆时间最长的前2个的信息 7 | * 日志内容格式为(手机号,时间点,基站站点,事件类型),事件类型为1时是进入基站,0是出基站。 8 | * 1, 先根据"手机号_站点"为唯一标识, 算一次进站出站的时间, 返回(手机号_站点, 时间间隔) 9 | * 2, 以"手机号_站点"为key, 统计每个站点的时间总和, ("手机号_站点", 时间总和) 10 | * 3, ("手机号_站点", 时间总和) --> (手机号, 站点, 时间总和) 11 | * 4, (手机号, 站点, 时间总和) --> groupBy().mapValues(以时间排序,取出前2个) --> (手机->((m,s,t)(m,s,t))) 12 | * Created by ZXL on 2017/10/15. 13 | */ 14 | object UserLocation { 15 | 16 | def main(args: Array[String]) { 17 | val conf = new SparkConf().setAppName("UserLocation").setMaster("local[2]") 18 | val sc = new SparkContext(conf) 19 | //sc.textFile("D://test//spark//bs_log").map(_.split(",")).map(x => (x(0), x(1), x(2), x(3))) 20 | val mbt = sc.textFile("D://test//spark//bs_log").map( line => { 21 | val fields = line.split(",") 22 | val eventType = fields(3) 23 | val time = fields(1) 24 | val timeLong = if(eventType == "1") -time.toLong else time.toLong 25 | (fields(0) + "_" + fields(2), timeLong) 26 | }) 27 | //println(mbt.collect().toBuffer) 28 | //(18611132889_9F36407EAD0629FC166F14DDE7970F68,54000) 29 | val rdd1 = mbt.groupBy(_._1).mapValues(_.foldLeft(0L)(_ + _._2)) 30 | val rdd2 = rdd1.map( t => { 31 | val mobile_bs = t._1 32 | val mobile = mobile_bs.split("_")(0) 33 | val lac = mobile_bs.split("_")(1) 34 | val time = t._2 35 | (mobile, lac, time) 36 | }) 37 | val rdd3 = rdd2.groupBy(_._1) 38 | //ArrayBuffer((18688888888,List((18688888888,16030401EAFB68F1E3CDF819735E1C66,87600), (18688888888,9F36407EAD0629FC166F14DDE7970F68,51200))), (18611132889,List((18611132889,16030401EAFB68F1E3CDF819735E1C66,97500), (18611132889,9F36407EAD0629FC166F14DDE7970F68,54000)))) 39 | val rdd4 = rdd3.mapValues(it => { 40 | it.toList.sortBy(_._3).reverse.take(2) 41 | }) 42 | println(rdd4.collect().toBuffer) 43 | sc.stop() 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/simple/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.simple 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * 简单WordCount实现 7 | * Created by ZXL on 2017/10/12. 
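 *
 * A minimal local-mode variant for quick testing (an assumption, not part of this job:
 * add a local master and pass local file paths instead of HDFS URIs):
 * {{{
 *   val conf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
 * }}}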
8 | * 9 | * 集群上执行示例,指定相关配置 10 | * bin/spark-submit --master spark://node1:7077 --class com.zxl.spark1_6.simple.WordCount --executor-memory 512m 11 | * --total-executor-cores 2 /opt/soft/jar/hello-spark-1.0.jar hdfs://node1:9000/wc hdfs://node1:9000/out 12 | */ 13 | object WordCount { 14 | 15 | def main(args: Array[String]) { 16 | // 非常重要,是通向Spark集群的入口 17 | val conf = new SparkConf().setAppName("WordCount") 18 | val sc = new SparkContext(conf) 19 | 20 | // reduceByKey(_+_, 1)指定partition的个数为1,即生成一个输出文件 21 | sc.textFile(args(0)).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_+_).sortBy(_._2, false).saveAsTextFile(args(1)) 22 | sc.stop() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/LoggerLevels.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.internal.Logging 5 | 6 | /** 7 | * 设置打印的log的级别 8 | */ 9 | object LoggerLevels extends Logging { 10 | 11 | def setStreamingLogLevels() { 12 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 13 | if (!log4jInitialized) { 14 | logInfo("Setting log level to [WARN] for streaming example." + 15 | " To override add a custom log4j.properties to the classpath.") 16 | Logger.getRootLogger.setLevel(Level.WARN) 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/StateFulWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.spark.streaming.{Seconds, StreamingContext} 4 | import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} 5 | 6 | /** 7 | * Spark Streaming累加器操作(updateStateByKey) 8 | * Created by ZXL on 2017/11/1. 
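 *
 * Worked example of updateFunc for one key: if the current batch contributes Seq(1, 1) for
 * "hello" and the previous state is Some(3), the new state is 1 + 1 + 3 = 5.
 * To feed the socket source, start a sender on the configured host first, e.g. nc -lk 8888
 * on 192.168.13.131.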
9 | */ 10 | object StateFulWordCount { 11 | 12 | // Seq这个批次某个单词的次数 13 | // Option[Int]:以前的结果 14 | // 分好组的数据 15 | // updateFunc: (Iterator[(K, Seq[V], Option[S])]) => Iterator[(K, S)] 16 | val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => { 17 | // 下面几种操作结果一致 18 | //iter.flatMap(it => Some(it._2.sum + it._3.getOrElse(0)).map(x => (it._1, x))) 19 | //iter.map(t => (t._1, t._2.sum + t._3.getOrElse(0))) 20 | //iter.map{case(x, y, z) => Some(y.sum + z.getOrElse(0)).map(m => (x, m))} 21 | iter.map{case(word, current_count, history_count) => (word, current_count.sum + history_count.getOrElse(0))} 22 | } 23 | 24 | def main(args: Array[String]) { 25 | LoggerLevels.setStreamingLogLevels() 26 | // StreamingContext 27 | val conf = new SparkConf().setAppName("StateFulWordCount").setMaster("local[2]") 28 | val sc = new SparkContext(conf) 29 | // updateStateByKey必须设置setCheckpointDir 30 | sc.setCheckpointDir("D:\\test\\spark\\checkpoint") 31 | val ssc = new StreamingContext(sc, Seconds(5)) 32 | 33 | val ds = ssc.socketTextStream("192.168.13.131", 8888) 34 | 35 | // DStream是一个特殊的RDD 36 | // hello tom hello jerry 37 | val result = ds.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(sc.defaultParallelism), true) 38 | result.print() 39 | ssc.start() 40 | ssc.awaitTermination() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/StreamingWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.spark.streaming.{Seconds, StreamingContext} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * 通过SparkStreaming简单实现WordCount 8 | * Created by ZXL on 2017/10/31. 9 | */ 10 | object StreamingWordCount { 11 | 12 | def main(args: Array[String]) { 13 | // 设置log level 14 | LoggerLevels.setStreamingLogLevels() 15 | 16 | // StreamingContext 17 | val conf = new SparkConf().setAppName("StreamingWordCount").setMaster("local[2]") 18 | val sc = new SparkContext(conf) 19 | val ssc = new StreamingContext(sc, Seconds(5)) 20 | 21 | // 接收数据,使用nc绑定ip和端口发送数据 22 | val ds = ssc.socketTextStream("192.168.13.131", 8888) 23 | 24 | // DStream是一个特殊的RDD 25 | // hello tom hello jerry 26 | val result = ds.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_+_) 27 | 28 | // 打印结果 29 | result.print() 30 | ssc.start() 31 | ssc.awaitTermination() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark1_6/streaming/WindowOpts.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark1_6.streaming 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext} 5 | 6 | /** 7 | * SparkStreaming窗口函数的实现 8 | * Created by ZXL on 2017/11/2. 
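 *
 * Note: the batch interval is Milliseconds(5000), and both the window length (Seconds(15))
 * and the slide interval (Seconds(10)) must be integer multiples of it, otherwise the window
 * operation is rejected when the StreamingContext starts. Each output therefore covers the
 * last 15 seconds of data and is emitted every 10 seconds.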
9 | */ 10 | object WindowOpts { 11 | 12 | def main(args: Array[String]) { 13 | LoggerLevels.setStreamingLogLevels() 14 | val conf = new SparkConf().setAppName("WindowOpts").setMaster("local[2]") 15 | val ssc = new StreamingContext(conf, Milliseconds(5000)) 16 | val lines = ssc.socketTextStream("192.168.13.131", 9999) 17 | val pairs = lines.flatMap(_.split(" ")).map((_, 1)) 18 | // Seconds(15):窗口的宽度,Seconds(10):移动窗口的间隔 19 | val windowedWordCounts = pairs.reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(15), Seconds(10)) 20 | windowedWordCounts.print() 21 | // Map((hello, 5), (jerry, 2), (kitty, 3)) 22 | val a = windowedWordCounts.map(_._2).reduce(_+_) 23 | a.foreachRDD(rdd => { 24 | println(rdd.take(0)) 25 | }) 26 | a.print() 27 | 28 | // windowedWordCounts.map(t => (t._1, t._2.toDouble / a.toD)) 29 | ssc.start() 30 | ssc.awaitTermination() 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/dataset/actions.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.dataset 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | 6 | /** 7 | * DataSet的操作 8 | * Created by ZXL on 2018/1/28. 9 | */ 10 | object actions { 11 | 12 | // 构建Spark对象 13 | val spark = SparkSession.builder() 14 | .master("local[2]") 15 | .appName("createDataSet") 16 | .enableHiveSupport() 17 | .getOrCreate() 18 | 19 | // 导入操作需要的隐式函数 20 | import spark.implicits._ 21 | 22 | // 1.map操作,flatMap操作 23 | val seq1 = Seq(Peoples(21, "zxl,wr,hy"), Peoples(20, "cc,hw,lwq")) 24 | val ds1 = spark.createDataset(seq1) 25 | val ds2 = ds1.map{ x => (x.age + 1, x.names)}.show() 26 | val ds3 = ds1.flatMap{ x => 27 | val a = x.age 28 | val s = x.names.split(",").map{ x => (a, x)} 29 | s 30 | }.show() 31 | 32 | // 2.filter操作,where操作 33 | val seq2 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165)) 34 | val ds4 = spark.createDataset(seq2) 35 | ds4.filter("age >= 20 and height >= 170").show() 36 | ds4.filter($"age" >= 20 && $"height" >= 170).show() 37 | ds4.filter{x => x.age > 20 && x.height >= 170}.show() 38 | ds4.where("age >= 20 and height >= 170").show() 39 | ds4.where($"age" >= 20 && $"height" >= 170).show() 40 | 41 | // 3.去重操作 42 | ds4.distinct().show() 43 | ds4.dropDuplicates("age").show() 44 | ds4.dropDuplicates("age", "height").show() 45 | ds4.dropDuplicates(Seq("age", "height")).show() 46 | ds4.dropDuplicates(Array("age", "height")).show() 47 | 48 | // 4.加法/减法操作 49 | val seq3 = Seq(Person("zxl2", 29, 170), Person("wx2", 30, 165), Person("cc2", 30, 165)) 50 | val ds5 = spark.createDataset(seq3) 51 | ds4.except(ds5).show() 52 | ds4.union(ds5).show() 53 | ds4.intersect(ds5).show() 54 | 55 | // 5.select操作 56 | ds5.select("name", "age").show() 57 | ds5.select(expr("height + 1").as[Int]).show() 58 | 59 | // 6.排序操作 60 | ds5.sort("age").show() 61 | ds5.sort($"age".desc, $"height".desc).show() 62 | ds5.orderBy("age").show() 63 | ds5.orderBy($"age".desc, $"height".desc).show() 64 | 65 | // 7.分割抽样操作 66 | val ds6 = ds4.union(ds5) 67 | val rands = ds6.randomSplit(Array(0.3, 0.7)) 68 | rands(0).count() 69 | rands(1).count() 70 | rands(0).show() 71 | rands(1).show() 72 | val ds7 = ds6.sample(false, 0.5) 73 | ds7.count() 74 | ds7.show() 75 | 76 | // 8.列操作 77 | val ds8 = ds6.drop("height") 78 | ds8.columns 79 | ds8.show() 80 | val ds9 = ds6.withColumn("add2", $"age" + 2) // 对数据集增加列 81 | ds9.columns 82 | ds9.show() 83 | val ds10 = 
ds9.withColumnRenamed("add2", "age_new") 84 | ds10.columns 85 | ds10.show() 86 | ds6.withColumn("add_col", lit(1)).show() 87 | 88 | // 9.join操作 89 | val seq4 = Seq(Score("zxl", 85), Score("wr", 90), Score("hy", 95)) 90 | val ds11 = spark.createDataset(seq4) 91 | val ds12 = ds5.join(ds11, Seq("name"), "inner") 92 | ds12.show() 93 | val ds13 = ds5.join(ds11, Seq("name"), "left") 94 | ds13.show() 95 | 96 | // 10.分组聚合操作 97 | val ds14 = ds4.union(ds5).groupBy("height").agg(avg("age")).as("avg_agg") 98 | ds14.show() 99 | } 100 | 101 | case class Score(name: String, score: Int) -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/dataset/basicAction.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.dataset 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.storage.StorageLevel._ 6 | 7 | /** 8 | * DataSet的基本操作 9 | * Created by ZXL on 2018/1/28. 10 | */ 11 | object basicAction { 12 | 13 | // 构建Spark对象 14 | val spark = SparkSession.builder() 15 | .master("local[2]") 16 | .appName("createDataSet") 17 | .enableHiveSupport() 18 | .getOrCreate() 19 | 20 | // 导入操作需要的隐式函数 21 | import spark.implicits._ 22 | 23 | // 1.DataSet存储类型 24 | val seq1 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165)) 25 | val ds1 = spark.createDataset(seq1) 26 | ds1.show() 27 | ds1.checkpoint() 28 | ds1.cache() 29 | ds1.persist(MEMORY_ONLY) 30 | ds1.count() 31 | ds1.show() 32 | ds1.unpersist(true) // 将DataSet删除 33 | 34 | // 2.获取数据集 35 | val c1 = ds1.collect() 36 | val c2 = ds1.collectAsList() 37 | val h1 = ds1.head() 38 | val h2 = ds1.head(3) 39 | val f1 = ds1.first() 40 | val t1 = ds1.take(2) 41 | val t2 = ds1.takeAsList(2) 42 | 43 | // 3.统计数据集 44 | ds1.count() 45 | ds1.describe().show() 46 | ds1.describe("age").show() 47 | ds1.describe("age", "height").show() 48 | 49 | // 4.聚集 50 | ds1.reduce((f1, f2) => Person("sum", (f1.age + f2.age), (f1.height + f2.height))) 51 | 52 | // 5.DataSet结构属性 53 | ds1.columns 54 | ds1.dtypes 55 | ds1.explain() // 返回执行物理计划 56 | 57 | // 6.DataSet rdd数据互转 58 | val rdd1 = ds1.rdd 59 | val ds2 = rdd1.toDS() 60 | ds2.show() 61 | val df2 = rdd1.toDF() 62 | df2.show() 63 | 64 | // 7.DataSet 保存文件 65 | ds1.select("name", "age", "height").write.format("csv").save("hdfs://node1:9000/test2.csv") 66 | // 读取保存的文件 67 | val schema2 = StructType( 68 | StructField("name", StringType, false) :: 69 | StructField("age", IntegerType, false) :: 70 | StructField("name", IntegerType, true) :: Nil) 71 | val out = spark.read. 72 | options(Map(("delimiter", ","), ("header", "false"))). 73 | schema(schema2).csv("hdf2://node:9000/test2.csv") 74 | out.show(10) 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/dataset/createDataSet.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.dataset 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | 7 | /** 8 | * DataSet创建的多种方式 9 | * Created by ZXL on 2018/1/28. 
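 *
 * A small sketch for the Row-RDD route (step 4 below) with a schema naming all three columns
 * of the sample tuples; the height field here is an assumption added for illustration:
 * {{{
 *   val fullSchema = StructType(
 *     StructField("name", StringType, false) ::
 *     StructField("age", IntegerType, false) ::
 *     StructField("height", IntegerType, true) :: Nil)
 *   val df = spark.createDataFrame(rdd1, fullSchema)
 * }}}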
10 | */ 11 | object createDataSet { 12 | 13 | // 构建Spark对象 14 | val spark = SparkSession.builder() 15 | .master("local[2]") 16 | .appName("createDataSet") 17 | .enableHiveSupport() 18 | .getOrCreate() 19 | 20 | // 导入操作需要的隐式函数 21 | import spark.implicits._ 22 | 23 | // 设置检查点 24 | spark.sparkContext.setCheckpointDir("hdfs://node1:9000/user/spark_checkpoint") 25 | 26 | // 1.产生序列dataset 27 | val numDS = spark.range(5, 100, 5) 28 | numDS.orderBy(desc("id")).show(5) 29 | numDS.describe().show() 30 | 31 | // 2.集合转成DataSet 32 | val seq1 = Seq(Person("zxl", 29, 170), Person("wx", 30, 165), Person("cc", 30, 165)) 33 | val ds1 = spark.createDataset(seq1) 34 | ds1.show() 35 | 36 | // 3.集合转成DataFrame 37 | val df1 = spark.createDataFrame(seq1).withColumnRenamed("_1", "name").withColumnRenamed("_2", "age") 38 | df1.orderBy(desc("age")).show(10) 39 | 40 | // 4.rdd转成DataFrame 41 | val array1 = Array(("zxl", 29, 170), ("wx", 30, 165), ("cc", 30, 165)) 42 | val rdd1 = spark.sparkContext.parallelize(array1, 3).map(f => Row(f._1, f._2, f._3)) 43 | val schema = StructType( 44 | StructField("name", StringType, false) :: 45 | StructField("age", IntegerType, true) :: Nil) 46 | val rddToDataFrame = spark.createDataFrame(rdd1, schema) 47 | rddToDataFrame.orderBy(desc("name")).show(false) 48 | 49 | // 5.rdd转成DataSet/DataFrame 50 | val rdd2 = spark.sparkContext.parallelize(array1, 3).map(f => Person(f._1, f._2, f._3)) 51 | val ds2 = rdd2.toDS() 52 | val df2 = rdd2.toDF() 53 | ds2.orderBy(desc("name")).show(10) 54 | df2.orderBy(desc("name")).show(10) 55 | 56 | // 6.rdd转成DataSet 57 | val ds3 = spark.createDataset(rdd2) 58 | ds3.show(10) 59 | 60 | // 7.读取文件 61 | val df4 = spark.read.csv("hdf2://node:9000/test.csv") 62 | df4.show() 63 | 64 | // 8.读取文件,详细参数 65 | val schema2 = StructType( 66 | StructField("name", StringType, false) :: 67 | StructField("age", IntegerType, false) :: 68 | StructField("name", IntegerType, true) :: Nil) 69 | val df7 = spark.read. 70 | options(Map(("delimiter", ","), ("header", "false"))). 71 | schema(schema2).csv("hdf2://node:9000/test.csv") 72 | } 73 | 74 | case class Person(name: String, age: Int, height: Int) 75 | case class Peoples(age: Int, names: String) -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/kafka/StreamingKafka10.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.kafka 2 | 3 | import org.apache.kafka.common.serialization.StringDeserializer 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 6 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 7 | import org.apache.spark.streaming.kafka010._ 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | 10 | /** 11 | * SparkStreaming从kafka中读取数据 12 | * kafka版本0.10 13 | * 采取直连方式 14 | * 15 | * Created by ZXL on 2017/10/15. 
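 *
 * With "enable.auto.commit" set to false the job never commits offsets back to Kafka; a
 * common follow-up (a sketch, not part of the code below) is to commit them after each batch:
 * {{{
 *   stream.foreachRDD { rdd =>
 *     val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
 *     // ... process rdd ...
 *     stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
 *   }
 * }}}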
16 | */ 17 | object StreamingKafka10 { 18 | 19 | def main(args: Array[String]): Unit = { 20 | 21 | val spark = SparkSession.builder() 22 | .master("local[2]") 23 | .appName("streaming").getOrCreate() 24 | 25 | val sc =spark.sparkContext 26 | val ssc = new StreamingContext(sc, Seconds(5)) 27 | val kafkaParams = Map[String, Object]( 28 | "bootstrap.servers" -> "node2:9092", 29 | "key.deserializer" -> classOf[StringDeserializer], 30 | "value.deserializer" -> classOf[StringDeserializer], 31 | "group.id" -> "0001", 32 | "auto.offset.reset" -> "latest", 33 | "enable.auto.commit" -> (false: java.lang.Boolean) 34 | ) 35 | val topics = Array("weblogs") 36 | val stream = KafkaUtils.createDirectStream[String, String]( 37 | ssc, 38 | PreferConsistent, 39 | Subscribe[String, String](topics, kafkaParams) 40 | ) 41 | 42 | val lines = stream.map(x => x.value()) 43 | val words = lines.flatMap(_.split(" ")) 44 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 45 | wordCounts.print() 46 | 47 | ssc.start() 48 | ssc.awaitTermination() 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/kafka/StreamingKafka8.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.kafka 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.streaming.kafka.KafkaUtils 6 | import org.apache.spark.streaming.{Seconds, StreamingContext} 7 | 8 | /** 9 | * SparkStreaming从kafka中读取数据 10 | * kafka版本0.8 11 | * 采取直连方式 12 | * 13 | * Created by ZXL on 2017/10/15. 14 | */ 15 | object StreamingKafka8 { 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | val spark = SparkSession.builder() 20 | .master("local[2]") 21 | .appName("streaming").getOrCreate() 22 | 23 | val sc =spark.sparkContext 24 | val ssc = new StreamingContext(sc, Seconds(5)) 25 | 26 | // Create direct kafka stream with brokers and topics 27 | val topicsSet =Set("weblogs") 28 | val kafkaParams = Map[String, String]("metadata.broker.list" -> "node1:9092") 29 | val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 30 | ssc, kafkaParams, topicsSet) 31 | 32 | val lines = kafkaStream.map(x => x._2) 33 | val words = lines.flatMap(_.split(" ")) 34 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) 35 | wordCounts.print() 36 | 37 | ssc.start() 38 | ssc.awaitTermination() 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/streaming/StreamingToMysql.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.streaming 2 | 3 | import java.sql.DriverManager 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.streaming.{Seconds, StreamingContext} 7 | 8 | /** 9 | * SparkStreaming读取数据,存储到Mysql中 10 | * 11 | * Created by ZXL on 2017/10/23. 
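 *
 * The insert below builds SQL by string concatenation; a parameterized sketch of the same
 * write to the webCount table, which avoids quoting problems:
 * {{{
 *   val ps = conn.prepareStatement("insert into webCount(titleName, count) values (?, ?)")
 *   ps.setString(1, row._1)
 *   ps.setInt(2, row._2)
 *   ps.executeUpdate()
 * }}}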
12 | */ 13 | object StreamingToMysql { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val spark = SparkSession.builder() 18 | .master("local[2]") 19 | .appName("streaming").getOrCreate() 20 | 21 | val sc =spark.sparkContext 22 | val ssc = new StreamingContext(sc, Seconds(5)) 23 | val lines = ssc.socketTextStream("node2", 9999) 24 | val words = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _) 25 | 26 | words.foreachRDD(rdd => rdd.foreachPartition(line => { 27 | Class.forName("com.mysql.jdbc.Driver") 28 | val conn = DriverManager 29 | .getConnection("jdbc:mysql://node3:3306/test","root","1234") 30 | try{ 31 | for(row <- line){ 32 | val sql = "insert into webCount(titleName,count)values('"+row._1+"',"+row._2+")" 33 | conn.prepareStatement(sql).executeUpdate() 34 | } 35 | }finally { 36 | conn.close() 37 | } 38 | })) 39 | 40 | //words.print() 41 | ssc.start() 42 | ssc.awaitTermination() 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/structured/JDBCSink.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.structured 2 | 3 | import java.sql._ 4 | 5 | import org.apache.spark.sql.{ForeachWriter, Row} 6 | 7 | /** 8 | * 处理从StructuredStreaming中向mysql中写入数据 9 | * 10 | * Created by ZXL on 2017/10/15. 11 | */ 12 | class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] { 13 | 14 | var statement: Statement = _ 15 | var resultSet: ResultSet = _ 16 | var connection: Connection = _ 17 | 18 | override def open(partitionId: Long, version: Long): Boolean = { 19 | connection = new MySqlPool(url, username, password).getJdbcConn() 20 | statement = connection.createStatement() 21 | return true 22 | } 23 | 24 | override def process(value: Row): Unit = { 25 | 26 | val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "") 27 | val count = value.getAs[Long]("count") 28 | 29 | val querySql = "select 1 from webCount " + 30 | "where titleName = '" + titleName + "'" 31 | 32 | val updateSql = "update webCount set " + 33 | "count = " + count + " where titleName = '" + titleName + "'" 34 | 35 | val insertSql = "insert into webCount(titleName,count)" + 36 | "values('" + titleName + "'," + count + ")" 37 | 38 | try { 39 | 40 | //查看连接是否成功 41 | var resultSet = statement.executeQuery(querySql) 42 | if (resultSet.next()) { 43 | statement.executeUpdate(updateSql) 44 | } else { 45 | statement.execute(insertSql) 46 | } 47 | } catch { 48 | case ex: SQLException => { 49 | println("SQLException") 50 | } 51 | case ex: Exception => { 52 | println("Exception") 53 | } 54 | case ex: RuntimeException => { 55 | println("RuntimeException") 56 | } 57 | case ex: Throwable => { 58 | println("Throwable") 59 | } 60 | } 61 | } 62 | 63 | override def close(errorOrNull: Throwable): Unit = { 64 | // if(resultSet.wasNull()){ 65 | // resultSet.close() 66 | // } 67 | if (statement == null) { 68 | statement.close() 69 | } 70 | if (connection == null) { 71 | connection.close() 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/structured/MySqlPool.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.structured 2 | 3 | import java.sql.{Connection, DriverManager} 4 | import java.util 5 | 6 | /** 7 | * 从mysql连接池中获取连接 8 | * 9 | * Created by ZXL on 2017/10/15. 
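 *
 * Typical usage (the URL and credentials mirror those used in StructuredStreamingKafka):
 * {{{
 *   val pool = new MySqlPool("jdbc:mysql://node3:3306/test", "root", "1234")
 *   val conn = pool.getJdbcConn()
 *   try {
 *     // ... use conn ...
 *   } finally {
 *     pool.releaseConn(conn)
 *   }
 * }}}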
10 | */ 11 | class MySqlPool(url: String, user: String, pwd: String) extends Serializable { 12 | //连接池连接总数 13 | private val max = 3 14 | 15 | //每次产生连接数 16 | private val connectionNum = 1 17 | 18 | //当前连接池已产生的连接数 19 | private var conNum = 0 20 | 21 | private val pool = new util.LinkedList[Connection]() //连接池 22 | 23 | //获取连接 24 | def getJdbcConn(): Connection = { 25 | //同步代码块,AnyRef为所有引用类型的基类,AnyVal为所有值类型的基类 26 | AnyRef.synchronized({ 27 | if (pool.isEmpty) { 28 | //加载驱动 29 | preGetConn() 30 | for (i <- 1 to connectionNum) { 31 | val conn = DriverManager.getConnection(url, user, pwd) 32 | pool.push(conn) 33 | conNum += 1 34 | } 35 | } 36 | pool.poll() 37 | }) 38 | } 39 | 40 | //释放连接 41 | def releaseConn(conn: Connection): Unit = { 42 | pool.push(conn) 43 | } 44 | 45 | //加载驱动 46 | private def preGetConn(): Unit = { 47 | //控制加载 48 | if (conNum < max && !pool.isEmpty) { 49 | println("Jdbc Pool has no connection now, please wait a moments!") 50 | Thread.sleep(2000) 51 | preGetConn() 52 | } else { 53 | Class.forName("com.mysql.jdbc.Driver") 54 | } 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/com/zxl/spark2_2/structured/StructuredStreamingKafka.scala: -------------------------------------------------------------------------------- 1 | package com.zxl.spark2_2.structured 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.ProcessingTime 5 | 6 | /** 7 | * 结构化流从kafka中读取数据存储到关系型数据库mysql 8 | * 目前结构化流对kafka的要求版本0.10及以上 9 | * 10 | * Created by ZXL on 2017/10/15. 11 | */ 12 | object StructuredStreamingKafka { 13 | 14 | case class Weblog(datatime:String, 15 | userid:String, 16 | searchname:String, 17 | retorder:String, 18 | cliorder:String, 19 | cliurl:String) 20 | 21 | def main(args: Array[String]): Unit = { 22 | 23 | val spark = SparkSession.builder() 24 | .master("local[2]") 25 | .appName("streaming").getOrCreate() 26 | 27 | val df = spark 28 | .readStream 29 | .format("kafka") 30 | .option("kafka.bootstrap.servers", "node1:9092") 31 | .option("subscribe", "weblogs") 32 | .load() 33 | 34 | import spark.implicits._ 35 | val lines = df.selectExpr("CAST(value AS STRING)").as[String] 36 | val weblog = lines.map(_.split(",")) 37 | .map(x => Weblog(x(0), x(1), x(2),x(3),x(4),x(5))) 38 | val titleCount = weblog 39 | .groupBy("searchname").count().toDF("titleName","count") 40 | 41 | val url ="jdbc:mysql://node3:3306/test" 42 | val username="root" 43 | val password="1234" 44 | 45 | val writer = new JDBCSink(url,username,password) 46 | val query = titleCount.writeStream 47 | .foreach(writer) 48 | .outputMode("update") 49 | .trigger(ProcessingTime("5 seconds")) 50 | .start() 51 | query.awaitTermination() 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/streaming/kafka/KafkaManager.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streaming.kafka 2 | 3 | import kafka.common.TopicAndPartition 4 | import kafka.message.MessageAndMetadata 5 | import kafka.serializer.Decoder 6 | import org.apache.spark.SparkException 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.streaming.StreamingContext 9 | import org.apache.spark.streaming.dstream.InputDStream 10 | import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset 11 | 12 | import scala.reflect.ClassTag 13 | 14 | /** 15 | * 自己管理offset 16 | */ 17 | class KafkaManager(val kafkaParams: 
Map[String, String]) extends Serializable { 18 | 19 | private val kc = new KafkaCluster(kafkaParams) 20 | 21 | /** 22 | * 创建数据流 23 | */ 24 | def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag]( 25 | ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(K, V)] = { 26 | val groupId = kafkaParams.get("group.id").get 27 | // 在zookeeper上读取offsets前先根据实际情况更新offsets 28 | setOrUpdateOffsets(topics, groupId) 29 | 30 | //从zookeeper上读取offset开始消费message 31 | val messages = { 32 | val partitionsE = kc.getPartitions(topics) 33 | if (partitionsE.isLeft) 34 | throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}") 35 | val partitions = partitionsE.right.get 36 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions) 37 | if (consumerOffsetsE.isLeft) 38 | throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}") 39 | val consumerOffsets = consumerOffsetsE.right.get 40 | KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)]( 41 | ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message)) 42 | } 43 | messages 44 | } 45 | 46 | /** 47 | * 创建数据流前,根据实际消费情况更新消费offsets 48 | * @param topics 49 | * @param groupId 50 | */ 51 | private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = { 52 | topics.foreach(topic => { 53 | var hasConsumed = true 54 | val partitionsE = kc.getPartitions(Set(topic)) 55 | if (partitionsE.isLeft) 56 | throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}") 57 | val partitions = partitionsE.right.get 58 | val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions) 59 | if (consumerOffsetsE.isLeft) hasConsumed = false 60 | if (hasConsumed) {// 消费过 61 | /** 62 | * 如果streaming程序执行的时候出现kafka.common.OffsetOutOfRangeException, 63 | * 说明zk上保存的offsets已经过时了,即kafka的定时清理策略已经将包含该offsets的文件删除。 64 | * 针对这种情况,只要判断一下zk上的consumerOffsets和earliestLeaderOffsets的大小, 65 | * 如果consumerOffsets比earliestLeaderOffsets还小的话,说明consumerOffsets已过时, 66 | * 这时把consumerOffsets更新为earliestLeaderOffsets 67 | */ 68 | val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions) 69 | if (earliestLeaderOffsetsE.isLeft) 70 | throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}") 71 | val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get 72 | val consumerOffsets = consumerOffsetsE.right.get 73 | 74 | // 可能只是存在部分分区consumerOffsets过时,所以只更新过时分区的consumerOffsets为earliestLeaderOffsets 75 | var offsets: Map[TopicAndPartition, Long] = Map() 76 | consumerOffsets.foreach({ case(tp, n) => 77 | val earliestLeaderOffset = earliestLeaderOffsets(tp).offset 78 | if (n < earliestLeaderOffset) { 79 | println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition + 80 | " offsets已经过时,更新为" + earliestLeaderOffset) 81 | offsets += (tp -> earliestLeaderOffset) 82 | } 83 | }) 84 | if (!offsets.isEmpty) { 85 | kc.setConsumerOffsets(groupId, offsets) 86 | } 87 | } else {// 没有消费过 88 | val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) 89 | var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null 90 | if (reset == Some("smallest")) { 91 | val leaderOffsetsE = kc.getEarliestLeaderOffsets(partitions) 92 | if (leaderOffsetsE.isLeft) 93 | throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsE.left.get}") 94 | leaderOffsets = leaderOffsetsE.right.get 95 | } else { 96 | val leaderOffsetsE = 
kc.getLatestLeaderOffsets(partitions) 97 | if (leaderOffsetsE.isLeft) 98 | throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsE.left.get}") 99 | leaderOffsets = leaderOffsetsE.right.get 100 | } 101 | val offsets = leaderOffsets.map { 102 | case (tp, offset) => (tp, offset.offset) 103 | } 104 | kc.setConsumerOffsets(groupId, offsets) 105 | } 106 | }) 107 | } 108 | 109 | /** 110 | * 更新zookeeper上的消费offsets 111 | * @param rdd 112 | */ 113 | def updateZKOffsets(rdd: RDD[(String, String)]) : Unit = { 114 | val groupId = kafkaParams.get("group.id").get 115 | val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 116 | 117 | for (offsets <- offsetsList) { 118 | val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition) 119 | val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset))) 120 | if (o.isLeft) { 121 | println(s"Error updating the offset to Kafka cluster: ${o.left.get}") 122 | } 123 | } 124 | } 125 | } 126 | 127 | --------------------------------------------------------------------------------