├── README.md
├── flink-coding
│   ├── pom.xml
│   └── src
│       └── main
│           ├── resources
│           │   └── hive-site.xml
│           └── scala
│               └── com
│                   └── anryg
│                       ├── FlinkDSFromKafka2HDFS.scala
│                       ├── FlinkTest04.scala
│                       ├── hive_cdc
│                       │   ├── FlinkReadKafka2Hive.scala
│                       │   └── FlinkWithHive.scala
│                       └── window_and_watermark
│                           ├── FlinkDSFromKafkaWithWatermark.scala
│                           ├── FlinkSQLFromKafkaWithWatermarkAndWindow.scala
│                           └── FlinkTBFromKafkaWithWatermark.scala
├── pom.xml
├── redis
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── anryg
│                       └── bigdata
│                           ├── IPUtils.java
│                           ├── IpSearch.java
│                           ├── RedisClientUtils.java
│                           └── RedisParam.java
└── spark-coding
    ├── pom.xml
    ├── spark-coding.iml
    └── src
        └── main
            ├── java
            │   └── com
            │       └── anryg
            │           └── bigdata
            │               └── clickhouse
            │                   └── CKSink.java
            └── scala
                └── com
                    └── anryg
                        └── bigdata
                            ├── hive
                            │   ├── ConnectHive.scala
                            │   └── Spark3ConnectHive3.scala
                            ├── streaming
                            │   ├── Kafka2CK.scala
                            │   ├── StreamingProcessHelper.scala
                            │   ├── StructuredStreamingTest.scala
                            │   ├── demo
                            │   │   ├── StructuredStreaming4Kafka2CSV.scala
                            │   │   ├── StructuredStreamingFromKafka.scala
                            │   │   ├── StructuredStreamingFromKafka2ES.scala
                            │   │   ├── StructuredStreamingFromKafka2Hive.scala
                            │   │   ├── StructuredStreamingReadHive.scala
                            │   │   └── window_watermark
                            │   │       └── WorldCountWithWatermark.scala
                            │   ├── dwd
                            │   │   └── StreamingFromOds2Dwd.scala
                            │   └── ods
                            │       └── StreamingSource2HiveOds.scala
                            └── test
                                ├── data_skew
                                │   ├── DataSkew01.scala
                                │   ├── DataSkew02.scala
                                │   └── MyPartitioner.scala
                                └── map_pk_mappartition
                                    ├── MapPartitionTest.scala
                                    └── MapTest.scala
/README.md:
--------------------------------------------------------------------------------
1 | # internet_behavior_project
2 | A big-data project for analyzing users' internet access behavior.
3 |
4 |
5 | About the data source
6 | The data has a very regular layout of 9 fields (already cleansed for you). To make it easy to read, it has been exported as a CSV file whose first row is the schema.
7 |
8 | For easy access, the file is hosted on a cloud drive. The original file is 12 GB, and even after compression it is still about 3 GB. To make sure it is really being used for learning and nothing else, the download link is only shared after you add me on WeChat.
9 |
10 | Here is a walkthrough of the 9 fields and what each of them means:
11 | client_ip: the IP address of the user going online; you can derive the user's approximate location from it, and there are dedicated APIs for that lookup;
12 | domain: the website the user visited; the nature of the site tells you something about the user's browsing behavior;
13 | time: when the user went online;
14 | target_ip: the IP address of the website the user visited;
15 | rcode: the response status code returned for the site, where 0 means a normal response and 2 means abnormal;
16 | query_type: the query type, almost always 1, i.e. normal browsing;
17 | authority_record: the domain actually returned by the web server, which may differ from domain; if it does, the site may be something like a phishing site and is worth analyzing;
18 | add_msg: additional information, almost always empty; if it does contain something, it is worth checking what it actually is;
19 | dns_ip: the DNS server that resolved the site being visited; a DNS server usually serves one region, so records resolved by the same DNS server are likely from the same broad area;
20 |
21 | That covers the field-by-field breakdown; from these explanations you should already have a rough idea of what this dataset can be used for. A minimal sketch of how a record can be modeled in code follows below.
22 |
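23 | Below is a minimal, hypothetical Scala sketch (not part of this repo's source code) of how one record of this dataset could be modeled and parsed. The field names follow the schema above; the default delimiter is an assumption, since the CSV export is comma-separated while the Kafka feed consumed by the Flink jobs in this repo is split on "|".
24 |
25 | ```scala
26 | /** One record of the internet-behavior dataset; field names follow the schema above. */
27 | case class InternetBehaviorRecord(
28 |   clientIp: String,        // client_ip: IP address of the user going online
29 |   domain: String,          // domain: the website the user visited
30 |   time: String,            // time: when the visit happened
31 |   targetIp: String,        // target_ip: resolved IP of the visited website
32 |   rcode: String,           // rcode: 0 = normal response, 2 = abnormal
33 |   queryType: String,       // query_type: almost always 1, i.e. normal browsing
34 |   authorityRecord: String, // authority_record: domain actually returned by the server
35 |   addMsg: String,          // add_msg: additional information, usually empty
36 |   dnsIp: String            // dns_ip: DNS server that resolved the query
37 | )
38 |
39 | object InternetBehaviorRecord {
40 |   /** Parse one delimited line; returns None unless the line has exactly 9 fields. */
41 |   def parse(line: String, delimiter: Char = ','): Option[InternetBehaviorRecord] = {
42 |     val f = line.split(delimiter).map(_.trim)
43 |     if (f.length != 9) None
44 |     else Some(InternetBehaviorRecord(f(0), f(1), f(2), f(3), f(4), f(5), f(6), f(7), f(8)))
45 |   }
46 | }
47 | ```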
--------------------------------------------------------------------------------
/flink-coding/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | internet_behavior_project
7 | com.anryg.bigdata
8 | 1.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | flink-coding
13 |
14 | flink-coding
15 |
16 | http://www.example.com
17 |
18 |
19 | UTF-8
20 | 1.8
21 | 1.8
22 |
23 | 1.15.2
24 | 3.1.0
25 |
26 |
27 |
28 |
29 |
30 | org.apache.flink
31 | flink-streaming-scala_2.12
32 | ${flink.version}
33 |
34 |
35 | commons-math3
36 | org.apache.commons
37 |
38 |
39 |
40 |
41 | org.apache.flink
42 | flink-clients
43 | ${flink.version}
44 |
45 |
46 |
47 | org.apache.flink
48 | flink-connector-kafka
49 | ${flink.version}
50 |
51 |
52 |
53 | org.apache.flink
54 | flink-connector-hive_2.12
55 | ${flink.version}
56 |
57 |
58 | org.apache.hive
59 | hive-exec
60 | ${hadoop.version}
61 |
62 |
63 | calcite-core
64 | org.apache.calcite
65 |
66 |
67 | calcite-linq4j
68 | org.apache.calcite
69 |
70 |
71 |
72 |
73 |
74 |
75 | org.apache.flink
76 | flink-table-api-scala-bridge_2.12
77 | ${flink.version}
78 |
79 |
80 | org.apache.flink
81 | flink-table-planner_2.12
82 | ${flink.version}
83 | provided
84 |
85 |
86 |
87 |
88 | org.apache.flink
89 | flink-connector-elasticsearch7
90 | ${flink.version}
91 |
92 |
93 |
94 | org.apache.hadoop
95 | hadoop-common
96 | ${hadoop.version}
97 |
98 |
99 | commons-compress
100 | org.apache.commons
101 |
102 |
103 |
104 |
105 | org.apache.hadoop
106 | hadoop-client
107 | ${hadoop.version}
108 |
109 |
110 | commons-compress
111 | org.apache.commons
112 |
113 |
114 |
115 |
116 | org.apache.hadoop
117 | hadoop-hdfs
118 | ${hadoop.version}
119 |
120 |
121 |
122 | org.apache.flink
123 | flink-csv
124 | ${flink.version}
125 |
126 |
127 | org.apache.flink
128 | flink-hadoop-compatibility_2.12
129 | ${flink.version}
130 |
131 |
132 |
133 | com.alibaba
134 | fastjson
135 | 1.2.71
136 |
137 |
138 | junit
139 | junit
140 | 4.11
141 | test
142 |
143 |
144 |
145 |
146 | src/main/scala
147 | src/main/test
148 |
149 |
150 |
151 |
152 | org.apache.maven.plugins
153 | maven-shade-plugin
154 | 3.2.0
155 |
156 | true
157 | with-dependencies
158 |
159 |
160 | *:*
161 |
162 |
163 | junit:junit
164 |
165 |
166 |
167 |
168 |
169 | *:*
170 |
171 | META-INF/*.SF
172 | META-INF/*.DSA
173 | META-INF/*.RSA
174 |
175 |
176 |
177 | false
178 |
179 |
184 |
185 |
186 |
187 | package
188 |
189 | shade
190 |
191 |
192 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 | org.codehaus.mojo
218 | build-helper-maven-plugin
219 | 3.0.0
220 |
221 |
222 | add-source
223 | generate-sources
224 |
225 | add-source
226 |
227 |
228 |
229 | src/main/java
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 | net.alchim31.maven
238 | scala-maven-plugin
239 | 3.2.1
240 |
241 |
242 |
243 | compile
244 | testCompile
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
--------------------------------------------------------------------------------
/flink-coding/src/main/resources/hive-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ambari.hive.db.schema.name
5 | hive
6 |
7 |
8 |
9 | atlas.hook.hive.maxThreads
10 | 1
11 |
12 |
13 |
14 | atlas.hook.hive.minThreads
15 | 1
16 |
17 |
18 |
19 | credentialStoreClassPath
20 | /var/lib/ambari-agent/cred/lib/*
21 |
22 |
23 |
24 | datanucleus.autoCreateSchema
25 | false
26 |
27 |
28 |
29 | datanucleus.cache.level2.type
30 | none
31 |
32 |
33 |
34 | datanucleus.fixedDatastore
35 | true
36 |
37 |
38 |
39 | hadoop.security.credential.provider.path
40 | jceks://file/usr/hdp/current/hive-server2/conf/hive-site.jceks
41 |
42 |
43 |
44 | hive.auto.convert.join
45 | true
46 |
47 |
48 |
49 | hive.auto.convert.join.noconditionaltask
50 | true
51 |
52 |
53 |
54 | hive.auto.convert.join.noconditionaltask.size
55 | 2147483648
56 |
57 |
58 |
59 | hive.auto.convert.sortmerge.join
60 | true
61 |
62 |
63 |
64 | hive.auto.convert.sortmerge.join.to.mapjoin
65 | true
66 |
67 |
68 |
69 | hive.cbo.enable
70 | true
71 |
72 |
73 |
74 | hive.cli.print.header
75 | false
76 |
77 |
78 |
79 | hive.cluster.delegation.token.store.class
80 | org.apache.hadoop.hive.thrift.ZooKeeperTokenStore
81 |
82 |
83 |
84 | hive.cluster.delegation.token.store.zookeeper.connectString
85 | hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181
86 |
87 |
88 |
89 | hive.cluster.delegation.token.store.zookeeper.znode
90 | /hive/cluster/delegation
91 |
92 |
93 |
94 | hive.compactor.abortedtxn.threshold
95 | 1000
96 |
97 |
98 |
99 | hive.compactor.check.interval
100 | 300
101 |
102 |
103 |
104 | hive.compactor.delta.num.threshold
105 | 10
106 |
107 |
108 |
109 | hive.compactor.delta.pct.threshold
110 | 0.1f
111 |
112 |
113 |
114 | hive.compactor.initiator.on
115 | true
116 |
117 |
118 |
119 | hive.compactor.worker.threads
120 | 7
121 |
122 |
123 |
124 | hive.compactor.worker.timeout
125 | 86400
126 |
127 |
128 |
129 | hive.compute.query.using.stats
130 | true
131 |
132 |
133 |
134 | hive.convert.join.bucket.mapjoin.tez
135 | false
136 |
137 |
138 |
139 | hive.create.as.insert.only
140 | true
141 |
142 |
143 |
144 | hive.default.fileformat
145 | TextFile
146 |
147 |
148 |
149 | hive.default.fileformat.managed
150 | ORC
151 |
152 |
153 |
154 | hive.driver.parallel.compilation
155 | true
156 |
157 |
158 |
159 | hive.enforce.sortmergebucketmapjoin
160 | true
161 |
162 |
163 |
164 | hive.exec.compress.intermediate
165 | false
166 |
167 |
168 |
169 | hive.exec.compress.output
170 | false
171 |
172 |
173 |
174 | hive.exec.dynamic.partition
175 | true
176 |
177 |
178 |
179 | hive.exec.dynamic.partition.mode
180 | nonstrict
181 |
182 |
183 |
184 | hive.exec.failure.hooks
185 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook
186 |
187 |
188 |
189 | hive.exec.max.created.files
190 | 100000
191 |
192 |
193 |
194 | hive.exec.max.dynamic.partitions
195 | 5000
196 |
197 |
198 |
199 | hive.exec.max.dynamic.partitions.pernode
200 | 2000
201 |
202 |
203 |
204 | hive.exec.orc.split.strategy
205 | HYBRID
206 |
207 |
208 |
209 | hive.exec.parallel
210 | false
211 |
212 |
213 |
214 | hive.exec.parallel.thread.number
215 | 8
216 |
217 |
218 |
219 | hive.exec.post.hooks
220 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook
221 |
222 |
223 |
224 | hive.exec.pre.hooks
225 | org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook
226 |
227 |
228 |
229 | hive.exec.reducers.bytes.per.reducer
230 | 1083179008
231 |
232 |
233 |
234 | hive.exec.reducers.max
235 | 1009
236 |
237 |
238 |
239 | hive.exec.scratchdir
240 | /tmp/hive
241 |
242 |
243 |
244 | hive.exec.submit.local.task.via.child
245 | true
246 |
247 |
248 |
249 | hive.exec.submitviachild
250 | false
251 |
252 |
253 |
254 | hive.execution.engine
255 | tez
256 |
257 |
258 |
259 | hive.execution.mode
260 | container
261 |
262 |
263 |
264 | hive.fetch.task.aggr
265 | false
266 |
267 |
268 |
269 | hive.fetch.task.conversion
270 | more
271 |
272 |
273 |
274 | hive.fetch.task.conversion.threshold
275 | 1073741824
276 |
277 |
278 |
279 | hive.heapsize
280 | 1024
281 |
282 |
283 |
284 | hive.hook.proto.base-directory
285 | /warehouse/tablespace/external/hive/sys.db/query_data/
286 |
287 |
288 |
289 | hive.limit.optimize.enable
290 | true
291 |
292 |
293 |
294 | hive.limit.pushdown.memory.usage
295 | 0.04
296 |
297 |
298 |
299 | hive.load.data.owner
300 | hive
301 |
302 |
303 |
304 | hive.lock.manager
305 |
306 |
307 |
308 |
309 | hive.map.aggr
310 | true
311 |
312 |
313 |
314 | hive.map.aggr.hash.force.flush.memory.threshold
315 | 0.9
316 |
317 |
318 |
319 | hive.map.aggr.hash.min.reduction
320 | 0.5
321 |
322 |
323 |
324 | hive.map.aggr.hash.percentmemory
325 | 0.5
326 |
327 |
328 |
329 | hive.mapjoin.bucket.cache.size
330 | 10000
331 |
332 |
333 |
334 | hive.mapjoin.hybridgrace.hashtable
335 | false
336 |
337 |
338 |
339 | hive.mapjoin.optimized.hashtable
340 | true
341 |
342 |
343 |
344 | hive.mapred.reduce.tasks.speculative.execution
345 | false
346 |
347 |
348 |
349 | hive.materializedview.rewriting.incremental
350 | false
351 |
352 |
353 |
354 | hive.merge.mapfiles
355 | true
356 |
357 |
358 |
359 | hive.merge.mapredfiles
360 | false
361 |
362 |
363 |
364 | hive.merge.orcfile.stripe.level
365 | true
366 |
367 |
368 |
369 | hive.merge.rcfile.block.level
370 | true
371 |
372 |
373 |
374 | hive.merge.size.per.task
375 | 256000000
376 |
377 |
378 |
379 | hive.merge.smallfiles.avgsize
380 | 16000000
381 |
382 |
383 |
384 | hive.merge.tezfiles
385 | false
386 |
387 |
388 |
389 | hive.metastore.authorization.storage.checks
390 | false
391 |
392 |
393 |
394 | hive.metastore.cache.pinobjtypes
395 | Table,Database,Type,FieldSchema,Order
396 |
397 |
398 |
399 | hive.metastore.client.connect.retry.delay
400 | 5s
401 |
402 |
403 |
404 | hive.metastore.client.socket.timeout
405 | 1800s
406 |
407 |
408 |
409 | hive.metastore.connect.retries
410 | 24
411 |
412 |
413 |
414 | hive.metastore.db.type
415 | MYSQL
416 |
417 |
418 |
419 | hive.metastore.dml.events
420 | true
421 |
422 |
423 |
424 | hive.metastore.event.listeners
425 |
426 |
427 |
428 |
429 | hive.metastore.execute.setugi
430 | true
431 |
432 |
433 |
434 | hive.metastore.failure.retries
435 | 24
436 |
437 |
438 |
439 | hive.metastore.kerberos.keytab.file
440 | /etc/security/keytabs/hive.service.keytab
441 |
442 |
443 |
444 | hive.metastore.kerberos.principal
445 | hive/_HOST@EXAMPLE.COM
446 |
447 |
448 |
449 | hive.metastore.pre.event.listeners
450 | org.apache.hadoop.hive.ql.security.authorization.AuthorizationPreEventListener
451 |
452 |
453 |
454 | hive.metastore.sasl.enabled
455 | false
456 |
457 |
458 |
459 | hive.metastore.server.max.threads
460 | 100000
461 |
462 |
463 |
464 | hive.metastore.transactional.event.listeners
465 | org.apache.hive.hcatalog.listener.DbNotificationListener
466 |
467 |
468 |
469 | hive.metastore.uris
470 | thrift://hdp01.pcl-test.com:9083
471 |
472 |
473 |
474 | hive.metastore.warehouse.dir
475 | /warehouse/tablespace/managed/hive
476 |
477 |
478 |
479 | hive.metastore.warehouse.external.dir
480 | /warehouse/tablespace/external/hive
481 |
482 |
483 |
484 | hive.optimize.bucketmapjoin
485 | true
486 |
487 |
488 |
489 | hive.optimize.bucketmapjoin.sortedmerge
490 | false
491 |
492 |
493 |
494 | hive.optimize.constant.propagation
495 | true
496 |
497 |
498 |
499 | hive.optimize.dynamic.partition.hashjoin
500 | true
501 |
502 |
503 |
504 | hive.optimize.index.filter
505 | true
506 |
507 |
508 |
509 | hive.optimize.metadataonly
510 | true
511 |
512 |
513 |
514 | hive.optimize.null.scan
515 | true
516 |
517 |
518 |
519 | hive.optimize.reducededuplication
520 | true
521 |
522 |
523 |
524 | hive.optimize.reducededuplication.min.reducer
525 | 4
526 |
527 |
528 |
529 | hive.optimize.sort.dynamic.partition
530 | false
531 |
532 |
533 |
534 | hive.orc.compute.splits.num.threads
535 | 10
536 |
537 |
538 |
539 | hive.orc.splits.include.file.footer
540 | false
541 |
542 |
543 |
544 | hive.prewarm.enabled
545 | false
546 |
547 |
548 |
549 | hive.prewarm.numcontainers
550 | 3
551 |
552 |
553 |
554 | hive.repl.cm.enabled
555 |
556 |
557 |
558 |
559 | hive.repl.cmrootdir
560 |
561 |
562 |
563 |
564 | hive.repl.rootdir
565 |
566 |
567 |
568 |
569 | hive.security.metastore.authenticator.manager
570 | org.apache.hadoop.hive.ql.security.HadoopDefaultMetastoreAuthenticator
571 |
572 |
573 |
574 | hive.security.metastore.authorization.auth.reads
575 | true
576 |
577 |
578 |
579 | hive.security.metastore.authorization.manager
580 | org.apache.hadoop.hive.ql.security.authorization.StorageBasedAuthorizationProvider
581 |
582 |
583 |
584 | hive.server2.allow.user.substitution
585 | true
586 |
587 |
588 |
589 | hive.server2.authentication
590 | NONE
591 |
592 |
593 |
594 | hive.server2.authentication.spnego.keytab
595 | HTTP/_HOST@EXAMPLE.COM
596 |
597 |
598 |
599 | hive.server2.authentication.spnego.principal
600 | /etc/security/keytabs/spnego.service.keytab
601 |
602 |
603 |
604 | hive.server2.enable.doAs
605 | true
606 |
607 |
608 |
609 | hive.server2.idle.operation.timeout
610 | 6h
611 |
612 |
613 |
614 | hive.server2.idle.session.timeout
615 | 1d
616 |
617 |
618 |
619 | hive.server2.logging.operation.enabled
620 | true
621 |
622 |
623 |
624 | hive.server2.logging.operation.log.location
625 | /tmp/hive/operation_logs
626 |
627 |
628 |
629 | hive.server2.max.start.attempts
630 | 5
631 |
632 |
633 |
634 | hive.server2.support.dynamic.service.discovery
635 | true
636 |
637 |
638 |
639 | hive.server2.table.type.mapping
640 | CLASSIC
641 |
642 |
643 |
644 | hive.server2.tez.default.queues
645 | default,llap
646 |
647 |
648 |
649 | hive.server2.tez.initialize.default.sessions
650 | false
651 |
652 |
653 |
654 | hive.server2.tez.sessions.per.default.queue
655 | 1
656 |
657 |
658 |
659 | hive.server2.thrift.http.path
660 | cliservice
661 |
662 |
663 |
664 | hive.server2.thrift.http.port
665 | 10001
666 |
667 |
668 |
669 | hive.server2.thrift.max.worker.threads
670 | 500
671 |
672 |
673 |
674 | hive.server2.thrift.port
675 | 10000
676 |
677 |
678 |
679 | hive.server2.thrift.sasl.qop
680 | auth
681 |
682 |
683 |
684 | hive.server2.transport.mode
685 | binary
686 |
687 |
688 |
689 | hive.server2.use.SSL
690 | false
691 |
692 |
693 |
694 | hive.server2.webui.cors.allowed.headers
695 | X-Requested-With,Content-Type,Accept,Origin,X-Requested-By,x-requested-by
696 |
697 |
698 |
699 | hive.server2.webui.enable.cors
700 | true
701 |
702 |
703 |
704 | hive.server2.webui.port
705 | 10002
706 |
707 |
708 |
709 | hive.server2.webui.use.ssl
710 | false
711 |
712 |
713 |
714 | hive.server2.zookeeper.namespace
715 | hiveserver2
716 |
717 |
718 |
719 | hive.service.metrics.codahale.reporter.classes
720 | org.apache.hadoop.hive.common.metrics.metrics2.JsonFileMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.JmxMetricsReporter,org.apache.hadoop.hive.common.metrics.metrics2.Metrics2Reporter
721 |
722 |
723 |
724 | hive.smbjoin.cache.rows
725 | 10000
726 |
727 |
728 |
729 | hive.stats.autogather
730 | true
731 |
732 |
733 |
734 | hive.stats.dbclass
735 | fs
736 |
737 |
738 |
739 | hive.stats.fetch.column.stats
740 | true
741 |
742 |
743 |
744 | hive.stats.fetch.partition.stats
745 | true
746 |
747 |
748 |
749 | hive.stats.jdbc.timeout
750 | 0
751 |
752 |
753 |
754 | hive.strict.managed.tables
755 | false
756 |
757 |
758 |
759 | hive.support.concurrency
760 | true
761 |
762 |
763 |
764 | hive.tez.auto.reducer.parallelism
765 | true
766 |
767 |
768 |
769 | hive.tez.bucket.pruning
770 | true
771 |
772 |
773 |
774 | hive.tez.cartesian-product.enabled
775 | true
776 |
777 |
778 |
779 | hive.tez.container.size
780 | 7680
781 |
782 |
783 |
784 | hive.tez.cpu.vcores
785 | -1
786 |
787 |
788 |
789 | hive.tez.dynamic.partition.pruning
790 | true
791 |
792 |
793 |
794 | hive.tez.dynamic.partition.pruning.max.data.size
795 | 104857600
796 |
797 |
798 |
799 | hive.tez.dynamic.partition.pruning.max.event.size
800 | 1048576
801 |
802 |
803 |
804 | hive.tez.exec.print.summary
805 | true
806 |
807 |
808 |
809 | hive.tez.input.format
810 | org.apache.hadoop.hive.ql.io.HiveInputFormat
811 |
812 |
813 |
814 | hive.tez.input.generate.consistent.splits
815 | true
816 |
817 |
818 |
819 | hive.tez.java.opts
820 | -server -Djava.net.preferIPv4Stack=true -XX:NewRatio=8 -XX:+UseNUMA -XX:+UseG1GC -XX:+ResizeTLAB -XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps
821 |
822 |
823 |
824 | hive.tez.log.level
825 | INFO
826 |
827 |
828 |
829 | hive.tez.max.partition.factor
830 | 2.0
831 |
832 |
833 |
834 | hive.tez.min.partition.factor
835 | 0.25
836 |
837 |
838 |
839 | hive.tez.smb.number.waves
840 | 0.5
841 |
842 |
843 |
844 | hive.txn.manager
845 | org.apache.hadoop.hive.ql.lockmgr.DbTxnManager
846 |
847 |
848 |
849 | hive.txn.max.open.batch
850 | 1000
851 |
852 |
853 |
854 | hive.txn.strict.locking.mode
855 | false
856 |
857 |
858 |
859 | hive.txn.timeout
860 | 1000
861 |
862 |
863 |
864 | hive.user.install.directory
865 | /user/
866 |
867 |
868 |
869 | hive.vectorized.execution.enabled
870 | true
871 |
872 |
873 |
874 | hive.vectorized.execution.mapjoin.minmax.enabled
875 | true
876 |
877 |
878 |
879 | hive.vectorized.execution.mapjoin.native.enabled
880 | true
881 |
882 |
883 |
884 | hive.vectorized.execution.mapjoin.native.fast.hashtable.enabled
885 | true
886 |
887 |
888 |
889 | hive.vectorized.execution.reduce.enabled
890 | true
891 |
892 |
893 |
894 | hive.vectorized.groupby.checkinterval
895 | 4096
896 |
897 |
898 |
899 | hive.vectorized.groupby.flush.percent
900 | 0.1
901 |
902 |
903 |
904 | hive.vectorized.groupby.maxentries
905 | 100000
906 |
907 |
908 |
909 | hive.zookeeper.client.port
910 | 2181
911 |
912 |
913 |
914 | hive.zookeeper.namespace
915 | hive_zookeeper_namespace
916 |
917 |
918 |
919 | hive.zookeeper.quorum
920 | hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181
921 |
922 |
923 |
924 | javax.jdo.option.ConnectionDriverName
925 | com.mysql.jdbc.Driver
926 |
927 |
928 |
929 | javax.jdo.option.ConnectionURL
930 | jdbc:mysql://hdp01.pcl-test.com/hive
931 |
932 |
933 |
934 | javax.jdo.option.ConnectionUserName
935 | hive
936 |
937 |
938 |
939 | metastore.create.as.acid
940 | true
941 |
942 |
943 |
944 |
953 |
954 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/FlinkDSFromKafka2HDFS.scala:
--------------------------------------------------------------------------------
1 | package com.anryg
2 |
3 | import java.time.Duration
4 |
5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
6 | import org.apache.flink.api.common.serialization.{SimpleStringEncoder, SimpleStringSchema}
7 | import org.apache.flink.configuration.MemorySize
8 | import org.apache.flink.connector.kafka.source.KafkaSource
9 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
10 | import org.apache.flink.core.fs.Path
11 | import org.apache.flink.runtime.state.CheckpointStorage
12 | import org.apache.flink.streaming.api.CheckpointingMode
13 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
14 | import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy
15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
16 |
17 |
18 | /**
19 | * @DESC: Read Kafka data and write it to HDFS via the DataStream API
20 | * @Author: Anryg
21 | * @Date: 2022/8/14 19:08
22 | */
23 | object FlinkDSFromKafka2HDFS {
24 |
25 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020"
26 |
27 | def main(args: Array[String]): Unit = {
28 | //get the execution environment for the streaming job
29 | val env = StreamExecutionEnvironment.getExecutionEnvironment
30 | .enableCheckpointing(3000, CheckpointingMode.EXACTLY_ONCE) //enable checkpointing
31 |
32 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkDSFromKafka2HDFS") //set the HDFS directory for checkpoints
33 |
34 | val kafkaSource = KafkaSource.builder() //build the Kafka source
35 | .setBootstrapServers("192.168.211.107:6667")
36 | .setTopics("qianxin")
37 | .setGroupId("FlinkDSFromKafka2HDFS2")
38 | .setStartingOffsets(OffsetsInitializer.latest())
39 | .setValueOnlyDeserializer(new SimpleStringSchema())
40 | .build()
41 |
42 | import org.apache.flink.streaming.api.scala._ //import implicit conversions
43 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data") //read the source into a DataStream
44 |
45 | val targetDS = kafkaDS.map(line => { //simple ETL on the source data
46 | line.split("\\|")
47 | }).filter(_.length == 9).map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8)))
48 |
49 | /**Filesystem sink strategy introduced after Flink 1.14; it differs from the official docs, so watch out for pitfalls*/
50 | val hdfsSink2 = StreamingFileSink.forRowFormat(new Path(hdfsPrefix + "/tmp/flink_sink3"),
51 | new SimpleStringEncoder[(String,String,String,String,String,String,String,String,String)]("UTF-8"))
52 | //.withBucketAssigner(new DateTimeBucketAssigner) /**the default time-based bucket assigner*/
53 | .withRollingPolicy( //set the rolling (file-splitting) policy; the file naming rule could also be set here, defaults are used for now
54 | DefaultRollingPolicy.builder()
55 | .withRolloverInterval(Duration.ofSeconds(300)) //rollover interval of 5 minutes, i.e. a new file every 5 minutes
56 | .withInactivityInterval(Duration.ofSeconds(20)) //inactivity interval: roll the current file once it has gone this long without new data
57 | .withMaxPartSize(MemorySize.ofMebiBytes(800)) //maximum size of a single part file, set to 800 MB
58 | .build()).build()
59 |
60 | targetDS.addSink(hdfsSink2) //attach the sink to the target DataStream
61 |
62 | env.execute("FlinkDSFromKafka2HDFS") //launch the job
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/FlinkTest04.scala:
--------------------------------------------------------------------------------
1 | package com.anryg
2 |
3 | import com.alibaba.fastjson.JSON
4 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
5 | import org.apache.flink.api.common.serialization.SimpleStringSchema
6 | import org.apache.flink.connector.kafka.source.KafkaSource
7 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
8 | import org.apache.flink.streaming.api.scala._
9 | import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
10 |
11 |
12 | /**
13 | * @DESC: Read Kafka data, convert the DataStream to a Table, and write the result to ES
14 | * @Author: Anryg
15 | * @Date: 2022/8/14 19:08
16 | */
17 | object FlinkTest04 {
18 | case class InternetBehavior(id:String, client_ip:String, domain:String, do_time:String, target_ip:String,rcode:String, query_type:String, authority_record:String, add_msg:String, dns_ip:String)//data class for the current records
19 |
20 | def main(args: Array[String]): Unit = {
21 | val env = StreamExecutionEnvironment.getExecutionEnvironment
22 |
23 | val tableEnv = StreamTableEnvironment.create(env)
24 |
25 | val kafkaSource = KafkaSource.builder()
26 | .setBootstrapServers("192.168.211.107:6667")
27 | .setTopics("test")
28 | .setGroupId("group01")
29 | .setStartingOffsets(OffsetsInitializer.earliest())
30 | .setValueOnlyDeserializer(new SimpleStringSchema())
31 | .build()
32 |
33 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data")
34 | val targetDS = kafkaDS.map(line => {
35 | val rawJson = JSON.parseObject(line) //the raw string is JSON, parse it
36 | val message = rawJson.getString("message") //extract the business payload
37 | val msgArray = message.split(",") //split the fields on the given delimiter
38 | msgArray
39 | }).filter(_.length == 9).map(array => {
40 | InternetBehavior(array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))
41 | })
42 |
43 | val targetTable = tableEnv.fromDataStream(targetDS)//convert to a Table
44 | //targetTable.execute().print()
45 |
46 | /**define the sink*/
47 | tableEnv.executeSql("CREATE TABLE InternetBehavior (\n\tid String,\n client_ip STRING,\n domain STRING,\n do_time STRING,\n target_ip STRING,\n rcode int,\n query_type string,\n authority_record string,\n add_msg string,\n dns_ip string,\n PRIMARY KEY (id) NOT ENFORCED\n) WITH (\n 'connector' = 'elasticsearch-7',\n 'hosts' = 'http://192.168.211.106:9201',\n 'index' = 'internet_behavior-flink'\n)")
48 |
49 | targetTable.executeInsert("InternetBehavior")
50 | //targetDS.addSink()
51 | //targetTable.executeInsert()
52 |
53 | //env.execute("FlinkTest03")
54 |
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/hive_cdc/FlinkReadKafka2Hive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.hive_cdc
2 |
3 | import org.apache.flink.configuration.Configuration
4 | import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect, TableEnvironment}
5 | import org.apache.flink.table.catalog.hive.HiveCatalog
6 |
7 | /**
8 | * @DESC: Flink reads Kafka data and writes it to a dynamically partitioned Hive table
9 | * @Author: Anryg
10 | * @Date: 2022/12/19 10:36
11 | */
12 | object FlinkReadKafka2Hive {
13 |
14 | def main(args: Array[String]): Unit = {
15 | val settings = EnvironmentSettings.newInstance().inStreamingMode()
16 | .withConfiguration(setConf())
17 | .build() //build the settings
18 | val tableEnv = TableEnvironment.create(settings) //create the table environment
19 | setHive(tableEnv)
20 |
21 | /**register the Kafka source*/
22 | getDataSource(tableEnv)
23 |
24 | tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE) //set the SQL dialect to Hive; the dialect can be switched back and forth throughout the session
25 | /**create the Hive table*/
26 | createHiveTable(tableEnv)
27 |
28 | tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT) //switch the SQL dialect back to the default Flink dialect
29 | /**sink the data into the Hive table*/
30 | sinkData(tableEnv)
31 |
32 | }
33 |
34 | /**
35 | * @DESC: Configure the Flink settings
36 | * */
37 | private def setConf(): Configuration ={
38 | val config = new Configuration() //checkpoint-related settings
39 | config.setString("execution.checkpointing.interval","10000")
40 | config.setString("state.backend", "filesystem")
41 | config.setString("state.checkpoints.dir","hdfs://192.168.211.106:8020/tmp/checkpoint/FlinkWithHive")
42 | config
43 | }
44 |
45 | /**
46 | * @DESC: Set up the Hive catalog
47 | * */
48 | private def setHive(tableEnv: TableEnvironment): Unit ={
49 | val name = "hive_test" //pick a name for the catalog
50 | val database = "test" //specify the Hive database
51 | //val hiveConf = "./flink-coding/src/main/resources/" //location of the hive-site.xml configuration file
52 |
53 | /**load the Hive configuration and build the Hive catalog object*/
54 | val hive = new HiveCatalog(name,database, null) //with hiveConf set to null, the program looks for hive-site.xml on the classpath
55 | tableEnv.registerCatalog(name, hive) //register the catalog with Flink's table environment so Flink can access Hive tables directly
56 |
57 | tableEnv.useCatalog(name) //make the current Flink environment use this catalog
58 | }
59 |
60 | /**
61 | * @DESC: Register the Kafka data source
62 | * */
63 | private def getDataSource(tableEnv: TableEnvironment): Unit ={
64 | tableEnv.executeSql(
65 | """
66 | |drop table if exists test.kafkaTable;
67 | """.stripMargin)
68 |
69 | tableEnv.executeSql(
70 | """
71 | |Create table test.kafkaTable(
72 | |client_ip STRING,
73 | |domain STRING,
74 | |`time` STRING,
75 | |target_ip STRING,
76 | |rcode STRING,
77 | |query_type STRING,
78 | |authority_record STRING,
79 | |add_msg STRING,
80 | |dns_ip STRING
81 | |)
82 | |with(
83 | |'connector' = 'kafka',
84 | |'topic' = 'qianxin',
85 | |'properties.bootstrap.servers' = '192.168.211.107:6667',
86 | |'properties.group.id' = 'FlinkWithHive',
87 | |'scan.startup.mode' = 'latest-offset',
88 | |'value.format'='csv', --the source is plain text in CSV format
89 | |'value.csv.field-delimiter'='|' --field delimiter of the text source
90 | |);
91 | """.stripMargin)
92 | }
93 |
94 | /**
95 | * @DESC: 创建hive目标数据表
96 | * */
97 | private def createHiveTable(tableEnv: TableEnvironment): Unit ={
98 | tableEnv.executeSql(
99 | """
100 | |CREATE TABLE if not exists test.kafka_flink_hive (
101 | |client_ip STRING,
102 | |domain STRING,
103 | |target_ip STRING,
104 | |rcode STRING,
105 | |query_type STRING,
106 | |authority_record STRING,
107 | |add_msg STRING,
108 | |dns_ip STRING
109 | |)
110 | |PARTITIONED BY (`time` STRING)
111 | |STORED AS textfile TBLPROPERTIES (
112 | | 'partition.time-extractor.timestamp-pattern'='$time',
113 | | 'sink.partition-commit.trigger'='partition-time',
114 | | 'sink.partition-commit.delay'='1 h',
115 | | 'sink.partition-commit.policy.kind'='metastore,success-file'
116 | |);
117 | """.stripMargin)
118 | }
119 |
120 | /**
121 | * @DESC: Write the data into the target table
122 | * */
123 | private def sinkData(tableEnv: TableEnvironment): Unit ={
124 | tableEnv.executeSql(
125 | """
126 | |INSERT INTO test.kafka_flink_hive
127 | |SELECT client_ip,domain,target_ip,rcode,query_type,authority_record,add_msg,dns_ip,`time`
128 | |FROM test.kafkaTable;
129 | """.stripMargin)
130 | }
131 |
132 | }
133 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/hive_cdc/FlinkWithHive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.hive_cdc
2 |
3 | import org.apache.flink.table.api.{EnvironmentSettings, SqlDialect, TableEnvironment}
4 | import org.apache.flink.table.catalog.hive.HiveCatalog
5 | import org.apache.hadoop.conf.Configuration
6 | import org.apache.hadoop.hive.conf.HiveConf
7 |
8 | /**
9 | * @DESC: Connect Flink to Hive
10 | * @Author: Anryg
11 | * @Date: 2022/12/19 10:36
12 | */
13 | object FlinkWithHive {
14 |
15 | def main(args: Array[String]): Unit = {
16 | val settings = EnvironmentSettings.newInstance().inStreamingMode().build() //use the default settings
17 | val tableEnv = TableEnvironment.create(settings) //create the table environment
18 | tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE) //set the SQL dialect to Hive; the dialect can be switched back and forth throughout the session
19 | setHive(tableEnv)
20 |
21 | /**list the tables in the current database*/
22 | tableEnv.executeSql(
23 | """
24 | |SHOW tables;
25 | """.stripMargin).print()
26 |
27 |
28 | /**sink the data*/
29 | }
30 |
31 | /**
32 | * @DESC: Set up the Hive catalog
33 | * */
34 | private def setHive(tableEnv: TableEnvironment): Unit ={
35 | val name = "hive_test" //pick a name for the catalog
36 | val database = "test" //specify the Hive database
37 | //val hiveConf = "./flink-coding/src/main/resources/" //location of the hive-site.xml configuration file
38 |
39 | /**load the Hive configuration and build the Hive catalog object*/
40 | val hive = new HiveCatalog(name,database, null) //with hiveConf set to null, the program looks for hive-site.xml on the classpath
41 | tableEnv.registerCatalog(name, hive) //register the catalog with Flink's table environment so Flink can access Hive tables directly
42 |
43 | tableEnv.useCatalog(name) //make the current Flink environment use this catalog
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkDSFromKafkaWithWatermark.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.window_and_watermark
2 |
3 | import java.text.SimpleDateFormat
4 | import java.time.Duration
5 | import java.util.Locale
6 |
7 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
8 | import org.apache.flink.api.common.serialization.{SimpleStringEncoder, SimpleStringSchema}
9 | import org.apache.flink.configuration.MemorySize
10 | import org.apache.flink.connector.kafka.source.KafkaSource
11 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
12 | import org.apache.flink.core.fs.Path
13 | import org.apache.flink.streaming.api.CheckpointingMode
14 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
16 | import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, SlidingProcessingTimeWindows, TumblingEventTimeWindows, TumblingProcessingTimeWindows}
17 | import org.apache.flink.streaming.api.windowing.time
18 | import org.apache.flink.streaming.api.windowing.time.Time
19 |
20 |
21 | /**
22 | * @DESC: Read Kafka data with the DataStream API and aggregate it with watermarks and windows
23 | * @Author: Anryg
24 | * @Date: 2022/8/14 19:08
25 | */
26 | object FlinkDSFromKafkaWithWatermark {
27 |
28 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020"
29 |
30 | def main(args: Array[String]): Unit = {
31 | //get the execution environment for the streaming job
32 | val env = StreamExecutionEnvironment.getExecutionEnvironment
33 | .enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE) //enable checkpointing
34 |
35 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkDSFromKafkaWithWatermark") //set the HDFS directory for checkpoints
36 | env.getCheckpointConfig.setExternalizedCheckpointCleanup(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) //retention policy for externalized checkpoints
37 |
38 | val kafkaSource = KafkaSource.builder() //build the Kafka source
39 | .setBootstrapServers("192.168.211.107:6667")
40 | .setTopics("qianxin")
41 | .setGroupId("FlinkDSFromKafkaWithWatermark")
42 | .setStartingOffsets(OffsetsInitializer.latest())
43 | .setValueOnlyDeserializer(new SimpleStringSchema())
44 | .build()
45 |
46 | import org.apache.flink.streaming.api.scala._ //import implicit conversions
47 |
48 | val kafkaDS = env.fromSource(kafkaSource,
49 | WatermarkStrategy.noWatermarks()
50 | ,"kafka-data") //read the source into a DataStream
51 |
52 | val targetDS = kafkaDS.map(line => { //simple ETL on the source data
53 | line.split("\\|")
54 | }).filter(_.length == 9).filter(_(1).endsWith("com"))
55 | .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofHours(10)) //specify the watermark strategy
56 | .withTimestampAssigner(new SerializableTimestampAssigner[Array[String]] {
57 | override def extractTimestamp(element: Array[String], recordTimestamp: Long): Long = {
58 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss")
59 | sdf.parse(element(2)).getTime //the assigned watermark field must be a Long timestamp
60 | }
61 | }))
62 | .map(array => (array(0), 1))
63 | .keyBy(kv => kv._1) //aggregate by client_ip
64 | .window(SlidingProcessingTimeWindows.of(Time.minutes(2), Time.seconds(30))) //specify the window; the assigner here must be based on processing time rather than event time, because the data timestamps lag far behind the current wall-clock time
65 | .sum(1)
66 |
67 | targetDS.print() //print the results
68 |
69 | env.execute("FlinkDSFromKafkaWithWatermark") //launch the job
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkSQLFromKafkaWithWatermarkAndWindow.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.window_and_watermark
2 |
3 | import org.apache.flink.configuration.Configuration
4 | import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment}
5 |
6 |
7 | /**
8 | * @DESC: Read Kafka data with the SQL API and use watermarks and windows to aggregate it
9 | * @Author: Anryg
10 | * @Date: 2022/8/14 19:08
11 | */
12 | object FlinkSQLFromKafkaWithWatermarkAndWindow {
13 |
14 | def main(args: Array[String]): Unit = {
15 | val streamingSetting = EnvironmentSettings.newInstance().inStreamingMode().build()
16 |
17 | val config = new Configuration() //checkpoint-related settings
18 | config.setString("execution.checkpointing.interval","10000")
19 | config.setString("state.backend", "filesystem")
20 | config.setString("state.checkpoints.dir","hdfs://192.168.211.106:8020/tmp/checkpoint/FlinkSQLFromKafkaWithWatermarkAndWindow")
21 |
22 | streamingSetting.getConfiguration.addAll(config)
23 |
24 | val tableEnv = TableEnvironment.create(streamingSetting)
25 |
26 | tableEnv.executeSql(
27 | """
28 | |Create table kafkaTable(
29 | |client_ip STRING,
30 | |domain STRING,
31 | |`time` STRING,
32 | |target_ip STRING,
33 | |rcode STRING,
34 | |query_type STRING,
35 | |authority_record STRING,
36 | |add_msg STRING,
37 | |dns_ip STRING,
38 | |event_time AS to_timestamp(`time`, 'yyyyMMddHHmmss'), --define the event time as the time the data was actually produced; note the time field must be wrapped in backticks
39 | |watermark for event_time as event_time - interval '10' second --declare the watermark on the event_time column
40 | |)
41 | |with(
42 | |'connector' = 'kafka',
43 | |'topic' = 'qianxin',
44 | |'properties.bootstrap.servers' = '192.168.211.107:6667',
45 | |'properties.group.id' = 'FlinkSQLFromKafkaWithWatermarkAndWindow',
46 | |'scan.startup.mode' = 'latest-offset',
47 | |'value.format'='csv', --the source is plain text in CSV format
48 | |'value.csv.field-delimiter'='|' --field delimiter of the text source
49 | |)
50 | """.stripMargin)
51 |
52 | tableEnv.executeSql(
53 | """
54 | |SELECT
55 | |window_start,
56 | |window_end,
57 | |client_ip,
58 | |count(client_ip) as ip_count
59 | |FROM TABLE(
60 | |HOP( --choose the window strategy (hopping window)
61 | |TABLE kafkaTable,
62 | |DESCRIPTOR(event_time),
63 | |INTERVAL '30' SECONDS, --the slide interval
64 | |INTERVAL '2' MINUTES) --the window size
65 | |)
66 | |GROUP BY
67 | |window_start,
68 | |window_end,
69 | |client_ip
70 | |ORDER BY ip_count
71 | |DESC
72 | |LIMIT 10
73 | """.stripMargin
74 | ).print()
75 |
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/flink-coding/src/main/scala/com/anryg/window_and_watermark/FlinkTBFromKafkaWithWatermark.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.window_and_watermark
2 |
3 | import java.sql.Timestamp
4 | import java.text.SimpleDateFormat
5 | import java.time.Duration
6 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
7 | import org.apache.flink.api.common.serialization.SimpleStringSchema
8 | import org.apache.flink.connector.kafka.source.KafkaSource
9 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
10 | import org.apache.flink.streaming.api.CheckpointingMode
11 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
12 | import org.apache.flink.streaming.api.scala._
13 | import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
14 |
15 |
16 |
17 | /**
18 | * @DESC: Read Kafka data, go from the DataStream API to the Table API, and use watermarks
19 | * @Author: Anryg
20 | * @Date: 2022/8/14 19:08
21 | */
22 | object FlinkTBFromKafkaWithWatermark {
23 | private final val hdfsPrefix = "hdfs://192.168.211.106:8020"//HDFS address prefix
24 |
25 | def main(args: Array[String]): Unit = {
26 | val env = StreamExecutionEnvironment.getExecutionEnvironment //get the streaming execution environment
27 | .enableCheckpointing(10000, CheckpointingMode.EXACTLY_ONCE) //enable checkpointing
28 |
29 | val tableEnv = StreamTableEnvironment.create(env) //create the table environment
30 | env.getCheckpointConfig.setCheckpointStorage(hdfsPrefix + "/tmp/flink_checkpoint/FlinkTBFromKafkaWithWatermark") //set the HDFS directory for checkpoints
31 | env.getCheckpointConfig.setExternalizedCheckpointCleanup(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) //retention policy for externalized checkpoints
32 |
33 | val kafkaSource = KafkaSource.builder()
34 | .setBootstrapServers("192.168.211.107:6667")
35 | .setTopics("qianxin")
36 | .setGroupId("FlinkTBFromKafkaWithWatermark")
37 | .setStartingOffsets(OffsetsInitializer.latest())
38 | .setValueOnlyDeserializer(new SimpleStringSchema())
39 | .build()
40 | val kafkaDS = env.fromSource(kafkaSource,WatermarkStrategy.noWatermarks(),"kafka-data")
41 | val targetDS = kafkaDS.map(_.split("\\|"))
42 | .filter(_.length == 9)
43 | .filter(_(1).endsWith("com"))
44 | .assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(10)) //assign the watermark based on a business field
45 | .withTimestampAssigner(new SerializableTimestampAssigner[Array[String]] {
46 | override def extractTimestamp(element: Array[String], recordTimestamp: Long): Long = { //implement the timestamp extraction for the watermark
47 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss")
48 | sdf.parse(element(2)).getTime
49 | }
50 | }))
51 | .map(array => (array(0), array(2)))
52 | .map(kv => {
53 | val date = kv._2
54 | val sdf = new SimpleDateFormat("yyyyMMddhhmmss").parse(date)
55 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(sdf)
56 | (kv._1, Timestamp.valueOf(time)) //convert the time into the required time attribute, i.e. a Timestamp
57 | })
58 |
59 | import org.apache.flink.table.api._ //import implicit conversions; otherwise the $ syntax below is not recognized
60 |
61 | val targetTable = tableEnv.fromDataStream(targetDS)
62 | .as("client_ip", "time") //assign the schema (column names)
63 | .window(
64 | Slide over 1.minute every 30.seconds() on $"time" as $"w" //add the sliding window
65 | )
66 | .groupBy($"client_ip", $"w")
67 | .select(
68 | $"client_ip",
69 | $"w".start(), //时间窗口的开始时间
70 | $"w".end(), //时间窗口的解释时间
71 | $"client_ip".count() as "count"
72 | )
73 | .orderBy($"count")
74 | .limit(10)
75 | targetTable.execute().print()
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.anryg.bigdata
8 | internet_behavior_project
9 | pom
10 | 1.0-SNAPSHOT
11 |
12 | spark-coding
13 | flink-coding
14 | redis
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/redis/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | internet_behavior_project
7 | com.anryg.bigdata
8 | 1.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | com.anryg.bigdata
13 | redis
14 | 1.0-SNAPSHOT
15 | jar
16 | redis
17 |
18 | http://www.example.com
19 |
20 |
21 | UTF-8
22 | 1.8
23 | 1.8
24 |
25 |
26 |
27 |
28 | redis.clients
29 | jedis
30 | 3.3.0
31 |
32 |
33 | junit
34 | junit
35 | 4.11
36 | test
37 |
38 |
39 |
40 |
41 | src/main/scala
42 |
43 |
44 |
45 | org.apache.maven.plugins
46 | maven-shade-plugin
47 | 3.1.0
48 |
49 | true
50 | with-dependencies
51 |
52 |
53 | *:*
54 |
55 | META-INF/*.SF
56 | META-INF/*.DSA
57 | META-INF/*.RSA
58 |
59 |
60 |
61 |
62 |
63 |
64 | junit:junit
65 |
66 |
67 |
68 |
69 |
70 | package
71 |
72 | shade
73 |
74 |
75 |
76 |
77 | com.google.guava
78 | com.shade2.google.guava
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 | org.codehaus.mojo
91 | build-helper-maven-plugin
92 | 3.0.0
93 |
94 |
95 | add-source
96 | generate-sources
97 |
98 | add-source
99 |
100 |
101 |
102 | src/main/java
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 | net.alchim31.maven
111 | scala-maven-plugin
112 | 3.2.1
113 |
114 |
115 |
116 | compile
117 | testCompile
118 |
119 |
120 |
121 | -make:transitive
122 | -dependencyfile
123 | ${project.build.directory}/.scala_dependencies
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/IPUtils.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 | //import com.googlecode.ipv6.IPv6Network;
4 | import org.slf4j.Logger;
5 | import org.slf4j.LoggerFactory;
6 | import redis.clients.jedis.Jedis;
7 |
8 | import java.io.BufferedReader;
9 | import java.io.FileInputStream;
10 | import java.io.InputStreamReader;
11 | import java.math.BigInteger;
12 | import java.net.InetAddress;
13 | import java.net.UnknownHostException;
14 | import java.util.HashMap;
15 | import java.util.Map;
16 |
17 | /**
18 | * @DESC: Utility operations on IP address data
19 | * @Author Anryg
20 | * */
21 |
22 | public class IPUtils {
23 | private static Logger logger = LoggerFactory.getLogger(IPUtils.class);
24 |
25 |
26 | /**
27 | * @DESC: Import the IP addresses from the local ip.merge.txt file into a Redis zset
28 | * @param filePath : file mapping IP address ranges to geographic locations
29 | * @param dbNo : the Redis database number
30 | * */
31 | public static void ipCountryImport(String filePath, int dbNo) throws Exception {
32 | FileInputStream inputStream = new FileInputStream(filePath);
33 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
34 | String line = null; /*each line read from the file*/
35 | HashMap map = new HashMap(1024,1); //key is the assembled record, value is the end IP of the range
36 | int i = 0;
37 | while((line=bufferedReader.readLine()) != null){
38 | String[] args=line.split("\\|");
39 | String ipStart=args[0];
40 | String ipEnd=args[1];
41 | //Long ipStartLong= IPUtils.ip2Long(ipStart);
42 | Long ipEndLong= IPUtils.ip2Long(ipEnd); //convert the end IP of each range into a long value
43 | String country = args[2]; //country
44 | String province = args[4]; //province
45 | String city = args[5]; //city
46 | String operator = args[6]; //carrier/ISP
47 | StringBuilder rowBuffer = new StringBuilder(11); //holds the assembled IP range and location information
48 | rowBuffer.append(ipStart).append("-").append(ipEnd).append("-").append(country).append("-")
49 | .append(province).append("-").append(city).append("-").append(operator);
50 | map.put(rowBuffer.toString(),ipEndLong.doubleValue());
51 | ++i;
52 | if (i == 1024) {/**flush to Redis in batches of 1024*/
53 | toRedis(RedisClientUtils.getSingleRedisClient(),map, dbNo,"ipAndAddr");
54 | map.clear();
55 | i = 0;
56 | }
57 | }
58 | if (map.size() > 0) toRedis(RedisClientUtils.getSingleRedisClient(),map, dbNo,"ipAndAddr");
59 | }
60 |
61 | /**
62 | * @DESC: Convert an IP address to its decimal (long) value
63 | * */
64 | public static long ip2Long(String ipstr) {
65 | InetAddress ip = null;
66 | try {
67 | ip = InetAddress.getByName(ipstr);
68 | } catch (UnknownHostException e) {
69 | logger.error("UnknownHost...",e);
70 | }
71 | byte[] octets = ip.getAddress();
72 | long result = 0;
73 | for (byte octet : octets) {
74 | result <<= 8;
75 | result |= octet & 0xff;
76 | }
77 | return result;
78 | }
79 |
80 | /**
81 | * @DESC: Convert a decimal value back into an IPv4 address string
82 | * */
83 | public static String Long2Ip(long ten) {
84 | StringBuilder sb = new StringBuilder();
85 | for (int i = 0; i < 4; i++) {
86 | sb.insert(0, Long.toString(ten & 0xff));
87 | if (i < 3) {
88 | sb.insert(0, '.');
89 | }
90 | ten = ten >> 8;
91 | }
92 | return sb.toString();
93 | }
94 |
95 | /**
96 | * Compute an IPv4 address range from an IPv4 address and prefix length, e.g. 192.168.1.53/27 --> 3232235808,3232235839
97 | * @param ipAndMask
98 | * @return the IPv4 address range
99 | */
100 | public static long[] getIPLongScope(String ipAndMask) {
101 | String[] ipArr = ipAndMask.split("/");
102 | if (ipArr.length != 2) {
103 | throw new IllegalArgumentException("invalid ipAndMask with: "
104 | + ipAndMask);
105 | }
106 | int netMask = Integer.valueOf(ipArr[1].trim());
107 | if (netMask < 0 || netMask > 32) {
108 | throw new IllegalArgumentException("invalid ipAndMask with: "
109 | + ipAndMask);
110 | }
111 | long ipInt = ip2Long(ipArr[0]);
112 | long netIP = ipInt & (0xFFFFFFFF << (32 - netMask));
113 | long hostScope = (0xFFFFFFFF >>> netMask);
114 | return new long[] { netIP, netIP + hostScope };
115 | }
116 |
117 | /**
118 | * Compute an IPv4 address range from an IPv4 address and subnet mask, e.g. ip: 192.168.1.53, mask: 255.255.255.224 --> 3232235808,3232235839
119 | * @param ipaddr,mask IPv4 address and subnet mask, e.g. 192.168.1.53,255.255.255.224
120 | * @return the IPv4 address range as a string
121 | */
122 | public static String getIPNetworkAddr(String ipaddr, String mask){
123 | //AND the IP address with the subnet mask to get the network address
124 | Long ipNetworkAddr = ip2Long(ipaddr)&ip2Long(mask);
125 | Long ipBroadcastAddr = ((ipNetworkAddr^ip2Long(mask))^0xffffffffL);
126 |
127 | //System.out.println(Long.toBinaryString(ipBroadcastAddr));
128 | return Long2Ip(ipNetworkAddr+1)+"-->"+Long2Ip(ipBroadcastAddr-1);
129 | }
130 |
131 | /**
132 | * Convert an IPv6 address string to an integer
133 | * @param ipv6
134 | * @return
135 | */
136 | public static BigInteger ipv6ToBigInt(String ipv6)
137 | {
138 |
139 | int compressIndex = ipv6.indexOf("::");
140 | if (compressIndex != -1)
141 | {
142 | String part1s = ipv6.substring(0, compressIndex);
143 | String part2s = ipv6.substring(compressIndex + 1);
144 | BigInteger part1 = ipv6ToBigInt(part1s);
145 | BigInteger part2 = ipv6ToBigInt(part2s);
146 | int part1hasDot = 0;
147 | char[] ch = part1s.toCharArray();
148 | for (char c : ch)
149 | {
150 | if (c == ':')
151 | {
152 | part1hasDot++;
153 | }
154 | }
155 | // ipv6 has most 7 dot
156 | return part1.shiftLeft(16 * (7 - part1hasDot )).add(part2);
157 | }
158 | String[] str = ipv6.split(":");
159 | BigInteger big = BigInteger.ZERO;
160 | for (int i = 0; i < str.length; i++)
161 | {
162 | //::1
163 | if (str[i].isEmpty())
164 | {
165 | str[i] = "0";
166 | }
167 | big = big.add(BigInteger.valueOf(Long.valueOf(str[i], 16))
168 | .shiftLeft(16 * (str.length - i - 1)));
169 | }
170 | return big;
171 | }
172 |
173 |
174 | /**
175 | * @Author liuxh02
176 | * @Description Convert an integer to an IPv6 address string
177 | * @Date 2020/8/5
178 | * @Param [big]
179 | * @return java.lang.String
180 | **/
181 | public static String bigIntToipv6(BigInteger big)
182 | {
183 | String str = "";
184 | BigInteger ff = BigInteger.valueOf(0xffff);
185 | for (int i = 0; i < 8 ; i++)
186 | {
187 | str = big.and(ff).toString(16) + ":" + str;
188 |
189 | big = big.shiftRight(16);
190 | }
191 | //the last :
192 | str = str.substring(0, str.length() - 1);
193 |
194 | return str.replaceFirst("(^|:)(0+(:|$)){2,8}", "::");
195 | }
196 |
197 |
198 | /**
199 | * @DESC: Write to Redis in batches
200 | * */
201 | private static Long toRedis(Jedis jedis, Map map, int dbno, String key) {
202 | try {
203 | jedis.select(dbno);
204 | return jedis.zadd(key,map);
205 | } finally {
206 | RedisClientUtils.returnResource(jedis);
207 | }
208 |
209 | }
210 |
211 |
212 | /**
213 | * @Author liuxh02
214 | * @Description Compute the IP range from an IPv6 address and subnet mask, returning an array
215 | * @Date 2020/8/6
216 | * @Param [start IP, end IP]
217 | * @return java.math.BigInteger[]
218 | **/
219 | /* public static BigInteger[] getIPV6LongScope(String ipv6AndMask ){
220 |
221 | IPv6Network network = IPv6Network.fromString(ipv6AndMask);
222 | BigInteger start=network.getFirst().toBigInteger();//start IP
223 | BigInteger end=network.getLast().toBigInteger();//end IP
224 | System.out.println(end);
225 | return new BigInteger[]{start,end};
226 |
227 | }*/
228 | }
229 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/IpSearch.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 |
4 | import org.slf4j.LoggerFactory;
5 | import redis.clients.jedis.Jedis;
6 | import redis.clients.jedis.Tuple;
7 |
8 | import java.math.BigInteger;
9 | import java.net.UnknownHostException;
10 | import java.util.Set;
11 |
12 | public class IpSearch {
13 | private static org.slf4j.Logger logger = LoggerFactory.getLogger(IpSearch.class);
14 |
15 | /**
16 | * Look up the location information for an IP in Redis database db1
17 | * @param jedis
18 | * @param ip
19 | * @return the range the given IP falls into
20 | * @throws UnknownHostException
21 | */
22 | public static String getAddrByIP(Jedis jedis, String ip) {
23 | try {
24 | jedis.select(1);
25 | long ipscore = IPUtils.ip2Long(ip);
26 | Set tuples = jedis.zrangeByScoreWithScores("ipAndAddr", String.valueOf(ipscore),"+inf",0,1);
27 | String value = "";
28 | for (Tuple tuple : tuples) {
29 | value = tuple.getElement();
30 | }
31 | String[] valueSplits = value.split("-");
32 | long begin = IPUtils.ip2Long(valueSplits[0]);
33 | long end = IPUtils.ip2Long(valueSplits[1]);
34 | //String[] scope = value.substring(startpos+1,endpos).split(",");
35 | if(ipscore >= begin && ipscore <= end){
36 | return value;
37 | }
38 | else return "";
39 | } finally {
40 | //RedisClientUtils.returnResource(jedis);/**return the connection to the pool*/
41 |
42 | }
43 | }
44 | /**
45 | * @Author liuxh02
46 | * @Description Look up IPv4/IPv6 address information in Redis database db2
47 | * @Date 2020/8/6
48 | * @Param [jedis, ip]
49 | * @return java.lang.String
50 | **/
51 | public static String getAddr(Jedis jedis, String ip) {
52 | jedis.select(2);
53 | //convert the IP address to an integer
54 | BigInteger ipscore=null;
55 | if(ip.contains(":")){
56 | //IPv6 to integer
57 | ipscore=IPUtils.ipv6ToBigInt(ip);
58 | }else{
59 | //IPv4 to integer
60 | ipscore = BigInteger.valueOf(IPUtils.ip2Long(ip));
61 | }
62 | Set tuples = jedis.zrangeByScoreWithScores("ipAndAddr",ipscore.toString(),"+inf",0,1);
63 | String value = "";
64 | for (Tuple tuple : tuples) {
65 | value = tuple.getElement();
66 | }
67 | String[] valueArray = value.split("-");
68 | //extract the IP and subnet mask
69 | String ipAndMask=valueArray[0];
70 | BigInteger start=null;
71 | BigInteger end=null;
72 | if(ipAndMask.contains(":")){
73 | //IPv6 range calculation (would rely on IPUtils.getIPV6LongScope, which is currently commented out, so ipv6AndMask stays null here)
74 | BigInteger[] ipv6AndMask = null;
75 | start=ipv6AndMask[0];
76 | end=ipv6AndMask[1];
77 | }else{
78 | //IPv4 range calculation
79 | long[] ipv4AndMask=IPUtils.getIPLongScope(ipAndMask);
80 | start= BigInteger.valueOf(ipv4AndMask[0]);
81 | end= BigInteger.valueOf(ipv4AndMask[1]);
82 | }
83 | if(ipscore.compareTo(start)>0 && ipscore.compareTo(end)<0){
84 | return value;
85 | }
86 | else return "";
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/RedisClientUtils.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 | import redis.clients.jedis.Jedis;
4 | import redis.clients.jedis.JedisPool;
5 | import redis.clients.jedis.JedisPoolConfig;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | import java.util.Set;
10 |
11 | /**
12 | * Created by Anryg on 2018/5/9.
13 | */
14 | public class RedisClientUtils implements RedisParam {
15 | private static volatile JedisPool jedisPool = null;/**managed with a connection pool to avoid the various issues of connecting to Redis from multiple threads*/
16 | private static volatile Jedis jedis = null;
17 |
18 | /**
19 | * @DESC: Initialize the connection pool
20 | * */
21 | private static void initPool(){
22 | JedisPoolConfig config = null;
23 | try {
24 | config = new JedisPoolConfig();
25 | config.setMaxTotal(MAX_ACTIVE);
26 | config.setMaxIdle(MAX_IDLE);
27 | config.setMaxWaitMillis(MAX_WAIT);
28 | config.setTestOnBorrow(TEST_ON_BORROW);//validate connections on borrow so every one handed out is usable
29 | config.setTestWhileIdle(true);//validate connections while they sit idle
30 | config.setTestOnReturn(true);//validate connections when they are returned to the pool
31 | } catch (Exception e) {
32 | throw e;
33 | }
34 | /*milliseconds the idle object evictor sleeps between two scans
35 | config.setTimeBetweenEvictionRunsMillis(30000);
36 | maximum number of objects examined per evictor scan
37 | config.setNumTestsPerEvictionRun(10);
38 | minimum time an object must stay idle before the idle object evictor may evict it; only meaningful when timeBetweenEvictionRunsMillis is greater than 0
39 | config.setMinEvictableIdleTimeMillis(60000);*/
40 | jedisPool = new JedisPool(config, HOSTS.split(",")[0], PORT, TIMEOUT, PASSWD);
41 | }
42 | /**
43 | *@DESC: Ensure only one connection pool is initialized in a multi-threaded environment
44 | */
45 | private static void poolInit() {
46 | if (jedisPool == null){
47 | synchronized (RedisClientUtils.class){
48 | if (jedisPool == null) initPool();
49 | }
50 | }
51 | }
52 |
53 | /**
54 | * @DESC: Get the connection pool object; with multiple threads, use it to obtain multiple Jedis clients
55 | * */
56 | public static JedisPool getJedisPool(){
57 | poolInit();
58 | return jedisPool;
59 | }
60 | /**
61 | * @DESC: Get a Jedis instance synchronously; suitable for single-threaded use
62 | * @return Jedis
63 | */
64 | public static Jedis getSingleRedisClient() {
65 | poolInit();
66 | if (jedis == null){
67 | synchronized (RedisClientUtils.class){
68 | if (jedis == null) {
69 | jedis = jedisPool.getResource();
70 | }
71 | }
72 | }
73 | return jedis;
74 | }
75 | /**
76 | * @DESC: Release the Jedis resource by returning it to the connection pool
77 | * @param jedis
78 | */
79 | public static void returnResource(final Jedis jedis) {
80 | if (jedis != null && jedisPool != null) jedis.close();
81 | }
82 |
83 | /**
84 | * @DESC: Delete all data in a given database
85 | * */
86 | public static void delDataPerDB(Jedis redis, int dbNum){
87 | redis.select(dbNum);
88 | Set keySet = redis.keys("*");
89 | for (String key:keySet){
90 | try {
91 | Set fields = redis.hkeys(key);
92 | redis.hdel(key,fields.toArray(new String[fields.size()]));//convert the Set to an Array
93 | } catch (Exception e) {
94 | throw e;
95 | }finally {
96 | //redis.close();
97 | }
98 | }
99 | }
100 |
101 | /**
102 | * @DESC: Store a set object
103 | * */
104 | public static boolean save2RedisBySet(Jedis redis, int redisNo, String key, String[] strArray){
105 | redis.select(redisNo);
106 | long count = 0;
107 | try {
108 | count = redis.sadd(key,strArray);/**only distinct members are stored*/
109 | } catch (Exception e) {
110 | e.printStackTrace();
111 | } finally {
112 | redis.close();
113 | }
114 | if (count > 0) return true;
115 | else return false;
116 | }
117 |
118 | /**
119 | * @DESC: Store key:value pairs in bulk
120 | * @param kvList : a container where odd positions are keys and even positions are values; the total count must be even
121 | * */
122 | public static void save2RedisByKVs(Jedis redis, int redisNo, List kvList){
123 | redis.select(redisNo);
124 | try {
125 | redis.mset(kvList.toArray(new String[kvList.size()]));
126 | } finally {
127 | redis.close();
128 | }
129 | }
130 | /**
131 | * @DESC: Get the members of a set object
132 | * */
133 | public static Set getSetResult(Jedis redis, int redisNo, String key){
134 | redis.select(redisNo);
135 | Set scanResult = null;
136 | try {
137 | scanResult = redis.smembers(key);
138 | } catch (Exception e) {
139 | e.printStackTrace();
140 | } finally {
141 | redis.close();
142 | }
143 | return scanResult;
144 | }
145 |
146 | /**
147 | *@DESC: delete the given keys (in whichever database the caller has currently selected)
148 | * */
149 | public static void deleteKeys(Jedis redis , List<String> keys){
150 | redis.del(keys.toArray(new String[keys.size()]));
151 | }
152 |
153 | /**
154 | * @DESC: delete the given fields under a hash key (in whichever database the caller has currently selected)
155 | * */
156 | public static void deleteFieldByKey(Jedis redis, String key, List<String> fields){
157 | redis.hdel(key,fields.toArray(new String[fields.size()]));
158 | }
159 |
160 | }
161 |
--------------------------------------------------------------------------------
/redis/src/main/java/com/anryg/bigdata/RedisParam.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata;
2 |
3 |
4 |
5 | /**
6 | * Created by Anryg on 2018/5/9.
7 | * @DESC: provides the basic Redis connection settings
8 | */
9 | public interface RedisParam {
10 | String HOSTS = "192.168.211.106";/**Redis server list; currently a single node*/
11 | int PORT = 6379;
12 | String PASSWD = "pcl@2020";
13 | //maximum number of connection instances the pool may hold; the default is 8;
14 | //a value of -1 means no limit; once the pool has handed out maxActive jedis instances its state becomes exhausted
15 | int MAX_ACTIVE = 1500;
16 | //maximum number of idle jedis instances the pool may keep; the default is also 8
17 | int MAX_IDLE = 100;
18 | //maximum time (in milliseconds) to wait for an available connection; the default of -1 means wait forever; if exceeded, a JedisConnectionException is thrown
19 | int MAX_WAIT = 100 * 1000;
20 | int TIMEOUT = 100 * 1000;//connection/read timeout in milliseconds
21 | //whether to validate a jedis instance before it is borrowed; if true, every instance handed out is usable
22 | boolean TEST_ON_BORROW = true;
23 | }
24 |
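
A minimal usage sketch (not part of the repository) showing how the two classes above work together from Scala; the database number 1 and the key "black_domain" are illustrative only, and the Redis instance configured in RedisParam is assumed to be reachable:

    import com.anryg.bigdata.RedisClientUtils

    object RedisQuickCheck {
      def main(args: Array[String]): Unit = {
        val pool = RedisClientUtils.getJedisPool                 //lazily initializes the pool on first use
        //both helpers select the database and return the client to the pool in their own finally blocks
        val added = RedisClientUtils.save2RedisBySet(pool.getResource, 1, "black_domain", Array("a.com", "b.com"))
        val members = RedisClientUtils.getSetResult(pool.getResource, 1, "black_domain")
        println(s"new members added: $added, current set: $members")
      }
    }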
--------------------------------------------------------------------------------
/spark-coding/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 |
6 | internet_behavior_project
7 | com.anryg.bigdata
8 | 1.0-SNAPSHOT
9 |
10 | 4.0.0
11 | spark-coding
12 | spark-coding
13 |
14 | http://www.example.com
15 |
16 |
17 | UTF-8
18 | 1.8
19 | 1.8
20 |
21 |
22 |
23 |
24 |
25 | org.apache.spark
26 | spark-core_2.12
27 | 3.2.0
28 |
29 |
30 |
31 | org.apache.spark
32 | spark-sql_2.12
33 | 3.2.0
34 |
35 |
36 |
37 | org.apache.spark
38 | spark-sql-kafka-0-10_2.12
39 | 3.2.0
40 |
41 |
42 |
43 | com.alibaba
44 | fastjson
45 | 1.2.71
46 |
47 |
48 |
49 | org.apache.spark
50 | spark-hive_2.12
51 | 3.2.0
52 |
53 |
54 |
55 | com.anryg.bigdata
56 | redis
57 | 1.0-SNAPSHOT
58 |
59 |
60 |
61 | org.elasticsearch
62 | elasticsearch-spark-30_2.12
63 | 7.12.0
64 |
65 |
66 | scala-library
67 | org.scala-lang
68 |
69 |
70 | spark-core_2.12
71 | org.apache.spark
72 |
73 |
74 | spark-sql_2.12
75 | org.apache.spark
76 |
77 |
78 | spark-catalyst_2.12
79 | org.apache.spark
80 |
81 |
82 | slf4j-api
83 | org.slf4j
84 |
85 |
86 |
87 |
88 | commons-httpclient
89 | commons-httpclient
90 | 3.1
91 |
92 |
93 |
94 |
95 | com.clickhouse
96 | clickhouse-jdbc
97 | 0.4.6
98 |
99 |
100 |
101 |
102 |
103 | junit
104 | junit
105 | 4.11
106 | test
107 |
108 |
109 |
110 |
111 |
112 |
113 | src/main/scala
114 | src/main/test
115 |
116 |
117 |
118 |
119 | org.apache.maven.plugins
120 | maven-shade-plugin
121 | 3.2.0
122 |
123 | true
124 | with-dependencies
125 |
126 |
127 | *:*
128 |
129 |
130 | junit:junit
131 |
132 |
133 |
134 |
135 |
136 | *:*
137 |
138 | META-INF/*.SF
139 | META-INF/*.DSA
140 | META-INF/*.RSA
141 |
142 |
143 |
144 | false
145 |
146 |
151 |
152 |
153 |
154 | package
155 |
156 | shade
157 |
158 |
159 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 | org.codehaus.mojo
185 | build-helper-maven-plugin
186 | 3.0.0
187 |
188 |
189 | add-source
190 | generate-sources
191 |
192 | add-source
193 |
194 |
195 |
196 | src/main/java
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 | net.alchim31.maven
205 | scala-maven-plugin
206 | 3.2.1
207 |
208 |
209 |
210 | compile
211 | testCompile
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
--------------------------------------------------------------------------------
/spark-coding/spark-coding.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/spark-coding/src/main/java/com/anryg/bigdata/clickhouse/CKSink.java:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.clickhouse;
2 |
3 | import com.clickhouse.jdbc.ClickHouseConnection;
4 | import com.clickhouse.jdbc.ClickHouseDataSource;
5 | import org.apache.spark.sql.ForeachWriter;
6 | import org.apache.spark.sql.Row;
7 |
8 | import java.sql.PreparedStatement;
9 | import java.sql.SQLException;
10 |
11 | /**
12 | * @DESC: custom Structured Streaming sink that writes data to ClickHouse over JDBC
13 | * @Author: Anryg
14 | * @Date: 2023/7/3 20:24
15 | */
16 | public class CKSink extends ForeachWriter<Row> {
17 | private static final String jdbcUrl = "jdbc:ch://192.168.211.107:8123,192.168.211.108:8123,192.168.211.109:8123/local_db"; //list every node of the cluster so the local table can always be found
18 | //private static final Properties properties = new Properties();
19 | private static volatile ClickHouseDataSource ckDataSource;
20 | private static volatile ClickHouseConnection connection;
21 |
22 | private static final String user = "default"; //use ClickHouse's default user
23 | private static final String pwd = ""; //the default user has no password set
24 | private static final String tableName = "dns_logs_from_spark"; //target ClickHouse table to write into
25 |
26 |
27 | /**
28 | * @DESC: preparation before any data is processed: create the database connection and keep it a singleton; open() is invoked once per partition
29 | * */
30 | @Override
31 | public boolean open(long partitionId, long epochId){
32 | if (ckDataSource == null || connection == null) {
33 | synchronized (CKSink.class){
34 | if (ckDataSource == null || connection == null) {
35 | try {
36 | ckDataSource = new ClickHouseDataSource(jdbcUrl);
37 | connection = ckDataSource.getConnection(user, pwd);
38 | } catch (SQLException e) {
39 | e.printStackTrace();
40 | System.exit(-1); //exit the process if the connection cannot be established
41 | }
42 | }
43 | }
44 | }
45 |
46 | if (connection == null) return false;
47 | else return true;
48 | }
49 |
50 |
51 | /**
52 | * @DESC: called for every Row of the partition once open() has returned true
53 | * */
54 | @Override
55 | public void process(Row value) {
56 | try (PreparedStatement preparedStatement = connection.prepareStatement(
57 | "insert into " + tableName + " values(?,?,?,?,?,?,?,?,?)")) { //try-with-resources so the statement is closed after every row
58 | preparedStatement.setString(1,value.getString(0));
59 | preparedStatement.setString(2,value.getString(1));
60 | preparedStatement.setString(3,value.getString(2));
61 | preparedStatement.setString(4,value.getString(3));
62 | preparedStatement.setString(5,value.getString(4));
63 | preparedStatement.setString(6,value.getString(5));
64 | preparedStatement.setString(7,value.getString(6));
65 | preparedStatement.setString(8,value.getString(7));
66 | preparedStatement.setString(9,value.getString(8));
67 | preparedStatement.addBatch(); //a single-row batch: each Row is inserted on its own
68 | preparedStatement.executeBatch();
69 | } catch (SQLException e) {
70 | e.printStackTrace();
71 | System.exit(-1); //exit the process if the insert fails
72 | }
73 |
74 | }
75 |
76 | /**
77 | * @DESC: called after the two methods above have finished; normally used to close the connection
78 | * */
79 | @Override
80 | public void close(Throwable errorOrNull) {
81 | //the connection is long-lived and deliberately left open
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/hive/ConnectHive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.hive
2 |
3 | import org.apache.spark.SparkConf
4 | //import org.apache.spark.sql.SparkSession
5 |
6 | /**
7 | * @DESC: read a Hive table through SparkSession (kept commented out for reference)
8 | * @Author: Anryg
9 | * @Date: 2022/4/8 16:30
10 | */
11 | /*object ConnectHive {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf()
15 | conf.setAppName("connect_hive")
16 | val sparkSession = SparkSession.builder().config(conf)
17 | //.config("spark.sql.warehouse.dir","hdfs://192.168.211.106:8020/warehouse/tablespace/managed/hive")
18 | //.config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
19 | //.config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
20 | .enableHiveSupport()
21 | .getOrCreate()
22 |
23 | val result = sparkSession.sql("select * from xas.as_bgp_bak limit 3")
24 | result.show()
25 |
26 | sparkSession.close()
27 | sparkSession.stop()
28 | }
29 | }*/
30 |
31 |
32 | /*val hive = HiveWarehouseSession.session(sparkSession).build()//obtain the HiveWarehouseSession (HWC) object
33 | val result = hive.executeQuery("select * from doi_data limit 2")//query the Hive table
34 | result.show()*/
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/hive/Spark3ConnectHive3.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.hive
2 |
3 | import org.apache.spark.SparkConf
4 | //import org.apache.spark.sql.SparkSession
5 |
6 | /**
7 | * @DESC: the table must be a non-ACID table, otherwise the read silently returns an empty result instead of failing
8 | * @Author: Anryg
9 | * @Date: 2022/4/8 16:30
10 | */
11 | /*object Spark3ConnectHive3 {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf()
15 | conf.setAppName("connect_hive")
16 | val sparkSession = SparkSession.builder().config(conf)
17 | //.config("spark.sql.warehouse.dir","hdfs://192.168.211.106:8020/warehouse/tablespace/managed/hive")
18 | //.config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
19 | //.config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
20 | //.config("spark.sql.hive.strict.managed.tables", false)
21 | .enableHiveSupport()
22 | .getOrCreate()
23 |
24 | val result = sparkSession.sql("select * from xas.as_bgp_bak limit 3")
25 | result.show()
26 |
27 | sparkSession.close()
28 | sparkSession.stop()
29 | }
30 | }*/
31 |
32 |
33 | /*val hive = HiveWarehouseSession.session(sparkSession).build()//obtain the HiveWarehouseSession (HWC) object
34 | val result = hive.executeQuery("select * from doi_data limit 2")//query the Hive table
35 | result.show()*/
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/Kafka2CK.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.clickhouse
2 |
3 |
4 | import com.anryg.bigdata.clickhouse.CKSink
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
8 |
9 | /**
10 | * @DESC: consume Kafka with Spark Structured Streaming and write the data into ClickHouse through the custom ForeachWriter (CKSink)
11 | * @Author: Anryg
12 | * @Date: 2023/7/3 10:18
13 | */
14 | object Kafka2CK {
15 |
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setAppName("Kafka2CK").setMaster("local[*]")
18 | val spark = SparkSession.builder().config(conf).getOrCreate()
19 |
20 | val rawDF = spark.readStream //obtain the data source
21 | .format("kafka") //source format
22 | .option("kafka.bootstrap.servers", "192.168.211.107:6667,192.168.211.108:6667,192.168.211.109:6667") //kafka cluster address; in theory a single broker would be enough
23 | .option("subscribe","qianxin") //topic to subscribe to
24 | //.option("group.id","test9999") /**offsets are no longer bound this way: each application has a unique id tied to checkpointLocation; group.id still exists at runtime but is no longer bound to the offsets*/
25 | .option("failOnDataLoss",false) //whether to fail immediately if source data suddenly goes missing, e.g. was deleted
26 | .option("fetchOffset.numRetries",3) //maximum retries when fetching offsets
27 | //.option("maxOffsetsPerTrigger",99000000)/**rate limiting: caps the number of records read per trigger; if unset the job reads as fast as possible, but only the latest data, not the old*/
28 | .option("startingOffsets","latest") //where to start reading kafka on the first run
29 | .load()
30 |
31 | import spark.implicits._
32 |
33 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //cast the kafka value to string; it is binary by default
34 | .map(row => {
35 | val line = row.getAs[String]("value") //fetch the field from the Row; there is only one field anyway
36 | val msgArray = line.split("\\|") //split into fields on the separator
37 | msgArray
38 | }).filter(_.length == 9) //keep only records with exactly 9 fields
39 | .map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //turn the array into a tuple so a schema can be attached in the next step
40 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //assign column names to the raw data
41 |
42 |
43 | val query = ds.writeStream
44 | .outputMode(OutputMode.Append()) //how the data is written out
45 | .foreach(new CKSink)
46 | //.format("console") //external sink; note that two sinks cannot be set at the same time, only the last one would take effect
47 | //.trigger(Trigger.ProcessingTime(6,TimeUnit.SECONDS))/**run on a fixed 6-second trigger; without a trigger it runs as fast as possible*/
48 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/Kafka2CK2") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id, analogous to changing group.id*/
49 | .start()
50 |
51 | query.awaitTermination()
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/StreamingProcessHelper.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming
2 |
3 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
4 | import org.apache.spark.sql.streaming.{DataStreamReader, DataStreamWriter, OutputMode}
5 |
6 | /**
7 | * @DESC: extracts the parts of stream processing that are common to every job
8 | * @Author: Anryg
9 | * @Date: 2022/8/31 17:50
10 | */
11 | trait StreamingProcessHelper[Any] {
12 |
13 | /**
14 | * @DESC: obtain the data source as a stream
15 | * @param sparkSession:
16 | * @param dataSource: type of source, e.g. kafka
17 | * @param config: configuration for the streaming source
18 | * */
19 | def getStreamingReader(sparkSession:SparkSession, dataSource:String, config:Map[String,String]): DataStreamReader ={
20 | val streamingReader = sparkSession.readStream
21 | .format(dataSource)
22 | .options(config)
23 | streamingReader
24 | }
25 |
26 | /**
27 | * @DESC: sink the data as a stream
28 | * @param dataSet: the processed result set
29 | * @param outputMode: output mode of the sink: Complete, Append or Update
30 | * @param config: configuration for the sink
31 | * */
32 | def getStreamingWriter(dataSet:DataFrame, outputMode:OutputMode, outputFormat:String, config:Map[String,String]): DataStreamWriter[Row] ={
33 | val streamingWriter = dataSet.writeStream
34 | .format(outputFormat)
35 | .outputMode(outputMode)
36 | .options(config)
37 | streamingWriter
38 | }
39 |
40 | }
41 |
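
A minimal sketch (not part of the repository) of how a job can mix in this trait; the broker address, topic and checkpoint path are placeholders, and StreamingSource2HiveOds / StreamingFromOds2Dwd further below show the real usage:

    import com.anryg.bigdata.streaming.StreamingProcessHelper
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.streaming.OutputMode

    object MyStreamingJob extends StreamingProcessHelper[Any] {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("MyStreamingJob").getOrCreate()
        val source = getStreamingReader(spark, "kafka",
          Map("kafka.bootstrap.servers" -> "broker:6667", "subscribe" -> "some_topic")).load()   //streaming DataFrame from Kafka
        getStreamingWriter(source.selectExpr("CAST(value AS STRING)"), OutputMode.Append(), "console",
          Map("checkpointLocation" -> "/tmp/offset/MyStreamingJob"))                              //console sink in Append mode
          .start()
          .awaitTermination()
      }
    }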
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/StructuredStreamingTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | package com.anryg.bigdata.streaming
3 |
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.streaming.OutputMode
7 |
8 | import scala.collection.mutable
9 |
10 | /**
11 | * @DESC: test daemon; note that to run it on Windows you must set the HADOOP_HOME environment variable and put hadoop.dll and winutils under $HADOOP_HOME/bin
12 | * @Author: Anryg
13 | * @Date: 2021/3/1 11:09
14 | */
15 | object StructuredStreamingTest {
16 |
17 |
18 | def main(args: Array[String]): Unit = {
19 | val conf = new SparkConf().setMaster("local[2]").setAppName("Structured streaming test")
20 | //val conf = SparkConfFactory.newSparkConf().setMaster("local[2]").setAppName("Structured streaming test")
21 |
22 | val spark = SparkSession.builder().config(conf).getOrCreate()
23 |
24 | val rawDF = spark.readStream.format("socket") /**with Complete output mode the raw DF cannot be written out directly; it has to go through an aggregation first, otherwise the following error is thrown*/
25 | /*Exception in thread "main" org.apache.spark.sql.AnalysisException:
26 | Complete output mode not supported when there are no streaming aggregations on streaming DataFrames/Datasets;;*/
27 | .option("host","192.168.211.106")
28 | .option("port",9998)
29 | .load()
30 |
31 | import spark.implicits._
32 |
33 |
34 |
35 | val xxx = rawDF.as[String].foreachPartition(iter => {
36 | while (iter.hasNext) println(iter.next())
37 | })
38 | /*mapPartitions(iterator => {
39 | val array = new mutable.ArrayBuffer[String]
40 | while (iterator.hasNext){
41 | val next = iterator.next()
42 | array.+=(next)
43 | }
44 | array.toIterator
45 | })*/
46 |
47 | val query = rawDF.writeStream
48 | .outputMode(OutputMode.Append())
49 | .format("console")
50 | .start()
51 |
52 | query.awaitTermination()
53 |
54 |
55 | //rawDF.take(10).foreach(println(_))
56 |
57 |
58 | }
59 |
60 | }
61 | */
62 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreaming4Kafka2CSV.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo;
2 | import java.util.concurrent.TimeUnit
3 |
4 | import com.alibaba.fastjson.JSON
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
8 | /**
9 | * @DESC: land the real-time internet-access data from Kafka into CSV files on HDFS
10 | * @Author: Anryg
11 | * @Date: 2020/12/17 09:56
12 | */
13 | object StructuredStreaming4Kafka2CSV {
14 |
15 | def main(args: Array[String]): Unit = {
16 | val conf = new SparkConf().setAppName("StructuredStreaming4Kafka2CSV").setMaster("local[*]")
17 | val spark = SparkSession.builder().config(conf).getOrCreate()
18 |
19 | val rawDF = spark.readStream
20 | .format("kafka")
21 | .option("kafka.bootstrap.servers", "192.168.211.107:6667")
22 | .option("subscribe","qianxin")
23 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
24 | .option("failOnDataLoss",false)
25 | .option("fetchOffset.numRetries",3)
26 | .option("maxOffsetsPerTrigger",90000000)/**用于限流,限定每个批次取的数据条数,确定写入HDFS单个文件的条数*/
27 | .option("startingOffsets","earliest")
28 | .load()
29 |
30 | import spark.implicits._
31 | val ds = rawDF.selectExpr("CAST(value AS STRING)")
32 | .map(row => {
33 | val line = row.getAs[String]("value")
34 | val fieldArray:Array[String] = line.split("\\|")
35 | fieldArray
36 | }).filter(_.length == 9).map(array =>(array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8)))
37 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip")
38 |
39 | ds.printSchema()
40 |
41 | //val ds1 = ds.select($"client_ip")
42 | val query = ds.writeStream
43 | .outputMode(OutputMode.Append()).trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**write a file every 60 seconds*/
44 | .option("format", "append") /**append new files into the same directory; otherwise the job errors out after writing a single batch*/
45 | .option("header", "true") /**write the schema as a CSV header line*/
46 | .format("csv").option("path","hdfs://192.168.211.106:8020/DATA/qianxin/3/")
47 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-03") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id, analogous to changing group.id*/
48 | .start()
49 |
50 | query.awaitTermination()
51 | }
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo
2 |
3 | import com.alibaba.fastjson.JSON
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.streaming.OutputMode
7 |
8 |
9 | /**
10 | * @DESC: read the internet-access data from Kafka
11 | * @Author: Anryg
12 | * @Date: 2020/12/17 09:56
13 | */
14 | object StructuredStreamingFromKafka {
15 |
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setAppName("StructuredStreamingFromKafka").setMaster("local[*]")
18 | val spark = SparkSession.builder().config(conf).getOrCreate()
19 |
20 | val rawDF = spark.readStream //获取数据源
21 | .format("kafka") //确定数据源的来源格式
22 | .option("kafka.bootstrap.servers", "192.168.211.108:6667") //指定kafka集群的地址,理论上写一个broker就可以了
23 | .option("subscribe","test") //指定topic
24 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
25 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常
26 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数
27 | .option("maxOffsetsPerTrigger",10)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible,但是每次只取最新的数据,不取旧的*/
28 | .option("startingOffsets","latest") //第一次消费时,读取kafka数据的位置
29 | //.option("startingOffsets","""{"test":{"0":-2,"1":-2,"2":-2,"3":-2}}""")
30 | .load()
31 |
32 | import spark.implicits._
33 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型
34 | .map(row => {
35 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field
36 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析
37 | val message = rawJson.getString("message") //获取业务数据部分
38 | val msgArray = message.split(",") //指定分隔符进行字段切分
39 | msgArray
40 | }).filter(_.length == 9) //只留字段数为9的数据
41 | .map(array => (array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //将其转化成为元组,为了方便下一步赋予schema
42 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //给裸数据添加字段名
43 |
44 | ds.printSchema() //打印schema,确认没有问题
45 |
46 | val query = ds.writeStream
47 | .outputMode(OutputMode.Append()) //how the data is written out
48 | .format("console") //external sink
49 | //.trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**run every 60 seconds; without a trigger it runs as fast as possible*/
50 | .option("format", "append") /**append new files into the same directory; otherwise the job errors out after writing a single batch*/
51 | //.option("header", "true") /**write the schema as a CSV header line*/
52 | // .format("csv").option("path","hdfs://192.168.211.106:8020/DATA/qianxin/3/")
53 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-08") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id, analogous to changing group.id*/
54 | .start()
55 |
56 | query.awaitTermination()
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka2ES.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo;
2 | import com.alibaba.fastjson.JSON
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.streaming.OutputMode
6 | /**
7 | * @DESC: read the internet-access data from Kafka and write it into ES
8 | * @Author: Anryg
9 | * @Date: 2020/12/17 09:56
10 | */
11 | object StructuredStreamingFromKafka2ES {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf().setAppName("StructuredStreamingFromKafka").setMaster("local[*]")
15 | val spark = SparkSession.builder().config(conf).getOrCreate()
16 |
17 | val rawDF = spark.readStream
18 | .format("kafka") //确定数据源的来源格式
19 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") //指定kafka集群的地址,理论上写一个broker就可以了
20 | .option("subscribe","test") //指定topic
21 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
22 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常
23 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数
24 | //.option("maxOffsetsPerTrigger",100)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible*/
25 | .option("startingOffsets","earliest") //第一次消费时,读取kafka数据的位置
26 | .load()
27 |
28 | import spark.implicits._
29 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型
30 | .map(row => {
31 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field
32 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析
33 | val message = rawJson.getString("message") //获取业务数据部分
34 | val msgArray = message.split(",") //指定分隔符进行字段切分
35 | msgArray
36 | }).filter(_.length == 9) //只留字段数为9的数据
37 | .map(array => (array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),array(4),array(5),array(6),array(7),array(8))) //将其转化成为元组,为了方便下一步赋予schema
38 | .toDF("id","client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip") //给裸数据添加字段名
39 |
40 | ds.printSchema() //打印schema,确认没有问题
41 |
42 | val query = ds.writeStream
43 | .outputMode(OutputMode.Append()) //指定数据的写入方式
44 | .format("org.elasticsearch.spark.sql") //指定外部输出为ES
45 | .option("es.nodes","192.168.211.106")
46 | .option("es.port","9201")
47 | .option("es.write.operation","upsert")
48 | .option("es.mapping.id","id")
49 | //.option("es.mapping.exclude","id")
50 | //.trigger(Trigger.ProcessingTime(60,TimeUnit.SECONDS))/**每60秒执行一次,不指定就是as fast as possible*/
51 | .option("format", "append") /**追加写入*/
52 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/kafka_datasource-05") /**用来保存offset,用该目录来绑定对应的offset,如果该目录发生改变则程序运行的id会发生变化,类比group.id的变化*/
53 | .start("internet_behavior-flink")
54 |
55 | query.awaitTermination()
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingFromKafka2Hive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo
2 |
3 | import java.util.concurrent.TimeUnit
4 |
5 | import com.alibaba.fastjson.JSON
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.streaming.{OutputMode, Trigger}
9 |
10 |
11 | /**
12 | * @DESC: read the internet-access data from Kafka and write it into a dynamically partitioned Hive table
13 | * @Author: Anryg
14 | * @Date: 2020/12/17 09:56
15 | */
16 | object StructuredStreamingFromKafka2Hive {
17 |
18 | def main(args: Array[String]): Unit = {
19 | val conf = new SparkConf()
20 | .setAppName("StructuredStreamingFromKafka2Hive")
21 | .setMaster("local[*]")//本地运行模式,如果提交集群,注释掉这行
22 | val spark = SparkSession.builder().config(conf)
23 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
24 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
25 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
26 | .getOrCreate()
27 |
28 | val rawDF = spark.readStream
29 | .format("kafka") //确定数据源的来源格式
30 | .option("kafka.bootstrap.servers", "192.168.211.107:6667") //指定kafka集群的地址,理论上写一个broker就可以了
31 | .option("subscribe","test") //指定topic
32 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
33 | .option("failOnDataLoss",false) //如果读取数据源时,发现数据突然缺失,比如被删,则是否马上抛出异常
34 | .option("fetchOffset.numRetries",3) //获取消息的偏移量时,最多进行的重试次数
35 | .option("maxOffsetsPerTrigger",500)/**用于限流,限定每次读取数据的最大条数,不指定则是as fast as possible*/
36 | .option("startingOffsets","earliest") //第一次消费时,读取kafka数据的位置
37 | .load()
38 |
39 | import spark.implicits._
40 | val ds = rawDF.selectExpr("CAST(value AS STRING)") //cast the kafka value to string; it is binary by default
41 | .map(row => {
42 | val line = row.getAs[String]("value") //fetch the field from the Row; there is only one field anyway
43 | val rawJson = JSON.parseObject(line) //the raw string is JSON, so parse it
44 | val message = rawJson.getString("message") //extract the business payload
45 | val msgArray = message.split(",") //split into fields on the separator
46 | msgArray
47 | }).filter(_.length == 9) //keep only records with exactly 9 fields
48 | .filter(array => array(2).length >= 8)//make sure the date field is well formed
49 | .map(array => (array(0)+array(1)+array(2),array(0),array(1),array(2),array(3),
50 | scala.util.Try(array(4).toInt).getOrElse(99),array(5),array(6),array(7),array(8),array(2).substring(0,4),array(2).substring(4,6),array(2).substring(6,8))) //turn it into a tuple so a schema can be attached next; rcode falls back to 99 when it is not a valid integer
51 | .toDF("id","client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day") //assign column names to the raw data
52 |
53 | ds.printSchema() //print the schema to confirm it looks right
54 | spark.sql("show databases;").show()
55 |
56 | val query = ds.writeStream
57 | .outputMode(OutputMode.Append()) //how the data is written out
58 | .format("orc") //file format used by the sink
59 | .option("format", "append")
60 | .trigger(Trigger.ProcessingTime(10,TimeUnit.SECONDS))/**run every 10 seconds; without a trigger it runs as fast as possible*/
61 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StructuredStreamingFromKafka2Hive01") /**stores the offsets; the directory is bound to this query's offsets, and changing it changes the run id (analogous to changing group.id); never change it casually when writing to Hive*/
62 | .partitionBy("year","month","day")//partition columns
63 | .toTable("test.test")//write into the Hive table
64 | query.awaitTermination()
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/StructuredStreamingReadHive.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo;
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.sql.SparkSession
6 |
7 | /**
8 | * @DESC: read back the data that the streaming job wrote into the dynamically partitioned Hive table
9 | * @Author: Anryg
10 | * @Date: 2022/08/31 09:56
11 | */
12 | object StructuredStreamingReadHive {
13 |
14 | def main(args: Array[String]): Unit = {
15 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
16 | val conf = new SparkConf()
17 | .setAppName("StructuredStreamingReadHive")
18 | .setMaster("local[*]")//本地运行模式,如果提交集群,注释掉这行
19 | val spark = SparkSession.builder().config(conf)
20 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
21 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
22 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
23 | .getOrCreate()
24 |
25 | spark.readStream
26 | .table("ods.ods_kafka_internetlog1")
27 | .select("client_ip")
28 | .writeStream
29 | .format("console")
30 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StructuredStreamingReadHive1")
31 | .start().awaitTermination()
32 |
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/demo/window_watermark/WorldCountWithWatermark.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.demo.window_watermark
2 |
3 | import java.sql.Timestamp
4 | import java.text.SimpleDateFormat
5 | import org.apache.log4j.{Level, Logger}
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.streaming.OutputMode
9 |
10 | /**
11 | * @DESC: use a time window plus a watermark to do a word-count style tally of client_ip
12 | * @Author: Anryg
13 | * @Date: 2022/11/30 10:04
14 | */
15 | object WorldCountWithWatermark {
16 |
17 | def main(args: Array[String]): Unit = {
18 | val conf = new SparkConf().setAppName("WorldCountWithWatermark").setMaster("local")
19 | val spark = SparkSession.builder()
20 | .config(conf)
21 | .getOrCreate()
22 | Logger.getLogger("org.apache").setLevel(Level.WARN) //减少INFO日志的输出
23 |
24 | val rawDF = spark.readStream
25 | .format("kafka")
26 | .option("kafka.bootstrap.servers", "192.168.211.107:6667")
27 | .option("subscribe", "qianxin")
28 | //.option("group.id","test9999") /**不再用该方式来绑定offset,而是每个程序有个唯一的id,该id跟checkpointLocation绑定,虽然group.id属性在运行中依然保留,但是不再跟offset绑定*/
29 | .option("failOnDataLoss",false)
30 | .option("fetchOffset.numRetries",3)
31 | //.option("maxOffsetsPerTrigger",Integer.MAX_VALUE)/**用于限流,限定每个批次取的数据条数,确定写入HDFS单个文件的条数*/
32 | .option("startingOffsets","latest")
33 | .load()
34 |
35 | import spark.implicits._
36 | val df1 = rawDF.selectExpr("CAST(value AS string)")
37 | .map(row =>{
38 | val line = row.getAs[String]("value")
39 | val fieldArray:Array[String] = line.split("\\|")
40 | fieldArray
41 | })
42 | .filter(_.length == 9) //the record must have exactly 9 fields
43 | .filter(_(1).endsWith("com")) //a small restriction on the visited domains to keep the volume down
44 | .map(array =>{
45 | val sdf = new SimpleDateFormat("yyyyMMddHHmmss").parse(array(2)) //use HH (24-hour clock) so afternoon timestamps parse correctly
46 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(sdf)
47 | (array(0), Timestamp.valueOf(time)) //time will be the watermark column, so it must be a Timestamp in yyyy-MM-dd HH:mm:ss form
48 | })
49 | .toDF("client_ip", "time") //attach the schema
50 |
51 | import org.apache.spark.sql.functions._ /**bring in Spark's built-in functions*/
52 |
53 | val df2 = df1.withWatermark("time", "10 seconds") //normally used together with a window
54 | .groupBy(window($"time","2 minutes","30 seconds"), $"client_ip") //group by the aggregation window (2-minute length, sliding every 30 seconds) and client_ip
55 | .count()
56 | .orderBy($"count".desc)
57 | .limit(10)
58 |
59 | val query = df2.writeStream
60 | .format("console") //打印到控制台
61 | .option("truncate", false) //将结果的内容完整输出,默认会砍掉内容过长的部分
62 | .option("numRows",30) //一次最多打印多少行,默认20行
63 | .option("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/WorldCountWithWatermark") //确定checkpoint目录
64 | //.outputMode(OutputMode.Update())//不支持排序的结果
65 | .outputMode(OutputMode.Complete()) //确定输出模式,默认为Append
66 | .start()
67 |
68 | query.awaitTermination()
69 | }
70 |
71 | }
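
A quick worked example of the sliding window above, assuming a single record with event time 12:00:50: window($"time", "2 minutes", "30 seconds") assigns it to the four overlapping windows [11:59:00, 12:01:00), [11:59:30, 12:01:30), [12:00:00, 12:02:00) and [12:00:30, 12:02:30), so its client_ip is counted once in each of them. Note that in Complete output mode Spark keeps all window state regardless of the watermark (which is what allows the global orderBy/limit here); the 10-second watermark would only start dropping state in Update or Append mode.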
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/dwd/StreamingFromOds2Dwd.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.dwd
2 |
3 | import com.anryg.bigdata.{IpSearch, RedisClientUtils}
4 | import com.anryg.bigdata.streaming.StreamingProcessHelper
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
7 | import org.apache.spark.sql.{DataFrame, SparkSession}
8 |
9 |
10 | /**
11 | * @DESC: read the ODS-layer data, enrich it and write it into the DWD layer
12 | * @Author: Anryg
13 | * @Date: 2022/9/1 09:53
14 | */
15 | object StreamingFromOds2Dwd extends StreamingProcessHelper[Any]{
16 |
17 | def main(args: Array[String]): Unit = {
18 | val conf = new SparkConf().setAppName("StreamingFromOds2Dwd").setMaster("local")
19 | val spark = SparkSession.builder().config(conf)
20 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
21 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
22 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
23 | .getOrCreate()
24 |
25 | clickProcess(spark,"ods.ods_kafka_internetlog","dwd.dwd_internetlog_detail")
26 | }
27 |
28 | /**
29 | *@DESC: read the Hive source table as a stream
30 | * */
31 | def readHive2DF(sparkSession: SparkSession, sourceTable:String): DataFrame ={
32 | sparkSession.readStream.table(sourceTable)
33 | }
34 |
35 | /**
36 | *@DESC: enrich the ODS data by filling in additional fields (country, province, city and operator looked up from the client ip)
37 | * */
38 | def handleData(sparkSession: SparkSession, dataFrame: DataFrame, tableName:String): DataFrame ={
39 | import sparkSession.implicits._
40 | dataFrame.printSchema()
41 | dataFrame.map(row => {
42 | val clientIP = row.getAs[String]("client_ip")
43 | val ipAndAddr = IpSearch.getAddrByIP(RedisClientUtils.getSingleRedisClient,clientIP).split("-")
44 | val country = ipAndAddr(2)
45 | val province = ipAndAddr(3)
46 | val city = ipAndAddr(4)
47 | val operator = ipAndAddr(5)
48 | val domain = row.getAs[String]("domain").toLowerCase//将域名转成小写
49 | val time = row.getAs[String]("time")
50 | val targetIP = row.getAs[String]("target_ip")
51 | val rcode = row.getAs[String]("rcode")
52 | val queryType = row.getAs[String]("query_type")
53 | val authRecord = row.getAs[String]("authority_record").toLowerCase
54 | val addMsg = row.getAs[String]("add_msg")
55 | val dnsIP = row.getAs[String]("dns_ip")
56 | val year = row.getAs[String]("year")
57 | val month = row.getAs[String]("month")
58 | val day = row.getAs[String]("day")
59 | (clientIP,country,province,city,operator,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP,year,month,day)
60 | }).toDF("client_ip","country","province","city","operator","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day")
61 | }
62 |
63 | /**
64 | *@DESC: sink the processed data into the DWD table
65 | * */
66 | def sinkData(targetDS:DataFrame, tableName:String): StreamingQuery ={
67 | val config = Map(("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StreamingFromOds2Dwd"),
68 | ("format","append"))
69 | getStreamingWriter(targetDS,OutputMode.Append(),"orc",config)
70 | .partitionBy("year","month","day")
71 | .toTable(tableName)
72 | }
73 |
74 | /**
75 | * @DESC: chain all the processing steps together
76 | * */
77 | def clickProcess(sparkSession: SparkSession,sourceTable:String, sinkTable:String): Unit ={
78 | val rawDF = readHive2DF(sparkSession, sourceTable)
79 | val targetDS = handleData(sparkSession, rawDF, sourceTable)
80 | sinkData(targetDS, sinkTable).awaitTermination()
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/streaming/ods/StreamingSource2HiveOds.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.streaming.ods
2 |
3 | import com.alibaba.fastjson.JSON
4 | import com.anryg.bigdata.streaming.StreamingProcessHelper
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
7 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
8 |
9 |
10 | /**
11 | * @DESC: land the Kafka source data into the Hive ODS layer
12 | * @Author: Anryg
13 | * @Date: 2022/8/31 19:03
14 | */
15 | object StreamingSource2HiveOds extends StreamingProcessHelper[Any]{
16 |
17 |
18 | /**
19 | * @DESC: main entry point of the application
20 | * */
21 | def main(args: Array[String]): Unit = {
22 | val conf = new SparkConf().setAppName("StreamingSource2HiveOds").setMaster("local[*]")
23 | val spark = SparkSession.builder().config(conf)
24 | .config("spark.sql.hive.hiveserver2.jdbc.url","jdbc:hive2://hdp01.pcl-test.com:2181,hdp03.pcl-test.com:2181,hdp02.pcl-test.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2")
25 | .config("spark.datasource.hive.warehouse.metastoreUri","thrift://hdp01.pcl-test.com:9083")
26 | .enableHiveSupport() //enable Hive support so the job shares the Hive catalog
27 | .getOrCreate()
28 |
29 | clickProcess(spark)
30 |
31 | }
32 |
33 | /**
34 | *@DESC: read the Kafka source into a DataFrame
35 | * */
36 | def readKafka2DF(sparkSession: SparkSession): DataFrame ={
37 | val config = Map(("kafka.bootstrap.servers", "192.168.211.107:6667"),("subscribe","test"),
38 | ("failOnDataLoss","false"),("fetchOffset.numRetries","3"),("startingOffsets","earliest"))
39 |
40 | getStreamingReader(sparkSession,"kafka",config).load()
41 | }
42 |
43 | /**
44 | *@DESC: apply the business-logic transformations to the raw DF
45 | * */
46 |
47 | def handleData(sparkSession: SparkSession, rawDF:DataFrame): DataFrame ={
48 | import sparkSession.implicits._
49 | val targetDS = rawDF.selectExpr("CAST(value AS STRING)") //将kafka中的数据的value转为为string,原始为binary类型
50 | .map(row => {
51 | val line = row.getAs[String]("value") //获取row对象中的field,其实也只有一个field
52 | val rawJson = JSON.parseObject(line) //原始string是一个json,对其进行解析
53 | val message = rawJson.getString("message") //获取业务数据部分
54 | val msgArray = message.split(",") //指定分隔符进行字段切分
55 | msgArray
56 | }).filter(_.length == 9).filter(array => array(2).length >= 8)//确保日期字段符合规范
57 | .map(array =>(array(0),array(1),array(2),array(3), array(4),array(5),array(6),array(7),array(8),
58 | array(2).substring(0,4),array(2).substring(4,6),array(2).substring(6,8)))
59 | .toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip","year","month","day") //给裸数据添加字段名
60 |
61 | targetDS
62 | }
63 |
64 | /**
65 | *@DESC: write the target dataset into the Hive ODS table
66 | * */
67 | def sinkData(targetDS:DataFrame): StreamingQuery ={
68 | val config = Map(("checkpointLocation","hdfs://192.168.211.106:8020/tmp/offset/test/StreamingSource2HiveOds"),
69 | ("format","append"))
70 | getStreamingWriter(targetDS, OutputMode.Append(),"orc",config)
71 | .partitionBy("year","month","day")
72 | .toTable("ods.ods_kafka_internetlog")
73 | }
74 |
75 | /**
76 | * @DESC: chain all the processing steps together
77 | * */
78 | def clickProcess(sparkSession: SparkSession): Unit ={
79 | val rawDF = readKafka2DF(sparkSession)
80 | val targetDS = handleData(sparkSession, rawDF)
81 | sinkData(targetDS).awaitTermination()
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/DataSkew01.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.data_skew
2 |
3 | import java.util
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | /**
8 | * @DESC: an example that deliberately produces data skew
9 | * @Author: Anryg
10 | * @Date: 2022/10/10 17:00
11 | */
12 | object DataSkew01 {
13 |
14 | def main(args: Array[String]): Unit = {
15 | val conf = new SparkConf().setAppName("DataSkewTest01")/*.setMaster("local[*]")*/
16 | val spark = new SparkContext(conf)
17 |
18 | val rawRDD = spark.textFile(args(0))//read the data source
19 |
20 | val filteredRDD = rawRDD.filter(line => { /**keep only the records we need, so as to create the data skew*/
21 | val array = line.split(",")
22 | val target_ip = array(3)
23 | target_ip.equals("106.38.176.185") || target_ip.equals("106.38.176.117") || target_ip.equals("106.38.176.118") || target_ip.equals("106.38.176.116")
24 | })
25 |
26 | val reducedRDD = filteredRDD.map(line => {/**aggregate by target ip: collect all client ips that visited the same target ip*/
27 | val array = line.split(",")
28 | val target_ip = array(3)
29 | val client_ip = array(0)
30 | val index = client_ip.lastIndexOf(".")
31 | val subClientIP = client_ip.substring(0, index) //keep only the ip prefix so the aggregated values stay as small as possible
32 | (target_ip,Array(subClientIP))
33 | }).reduceByKey(new MyPartitioner(4), _++_)//merge the Arrays of client ip prefixes
34 |
35 | val targetRDD = reducedRDD.map(kv => {/**for each target ip, count how many times each client ip prefix visited it*/
36 | val map = new util.HashMap[String,Int]()
37 | val target_ip = kv._1
38 | val clientIPArray = kv._2
39 | clientIPArray.foreach(clientIP => {
40 | if (map.containsKey(clientIP)) {
41 | val sum = map.get(clientIP) + 1
42 | map.put(clientIP,sum)
43 | }
44 | else map.put(clientIP,1)
45 | })
46 | (target_ip,map)
47 | })
48 |
49 | targetRDD.saveAsTextFile("/tmp/DataSkew01") //output directory for the results
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/DataSkew02.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.data_skew
2 |
3 | import java.util
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | import scala.util.Random
8 |
9 | /**
10 | * @DESC: an example of fixing data skew with two-stage salting
11 | * @Author: Anryg
12 | * @Date: 2022/10/10 17:00
13 | */
14 | object DataSkew02 {
15 |
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setAppName("DataSkewTest02")/*.setMaster("local[*]")*/
18 | val spark = new SparkContext(conf)
19 |
20 | val rawRDD = spark.textFile(args(0)) //read the raw data source
21 |
22 | val filteredRDD = rawRDD.filter(line => { /**keep only the records we need, so as to create the data skew*/
23 | val array = line.split(",")
24 | val target_ip = array(3)
25 | target_ip.equals("106.38.176.185") || target_ip.equals("106.38.176.117") || target_ip.equals("106.38.176.118") || target_ip.equals("106.38.176.116")
26 | })
27 |
28 | val reducedRDD_01 = filteredRDD.map(line => {/**step 1 of the fix: salting spreads the hot key's data from a single partition across many partitions*/
29 | val array = line.split(",")
30 | val target_ip = array(3)
31 | val client_ip = array(0)
32 | val index = client_ip.lastIndexOf(".")
33 | val subClientIP = client_ip.substring(0, index)//keep only the ip prefix so the aggregated values stay as small as possible
34 | if (target_ip.equals("106.38.176.185")){/**salt only the specific skewed key*/
35 | val saltNum = 99 //spread the single hot key across 99 salted keys
36 | val salt = new Random().nextInt(saltNum)
37 | (target_ip + "-" + salt,Array(subClientIP))
38 | }
39 | else (target_ip,Array(subClientIP))
40 | }).reduceByKey(_++_,103)//merge the Arrays and set the number of partitions
41 |
42 | val targetRDD_01 = reducedRDD_01.map(kv => {/**step 2: pre-aggregate within each salted key to shrink the size of every value*/
43 | val map = new util.HashMap[String,Int]()
44 | val target_ip = kv._1
45 | val clientIPArray = kv._2
46 | clientIPArray.foreach(clientIP => {//count the occurrences of each client ip
47 | if (map.containsKey(clientIP)) {
48 | val sum = map.get(clientIP) + 1
49 | map.put(clientIP,sum)
50 | }
51 | else map.put(clientIP,1)
52 | })
53 | (target_ip,map)
54 | })
55 |
56 | val reducedRDD_02 = targetRDD_01.map(kv => {/**step 3: reduce the salt on the skewed key, shrinking the number of salted keys from 99 to 9*/
57 | val targetIPWithSalt01 = kv._1
58 | val clientIPMap = kv._2
59 | if (targetIPWithSalt01.startsWith("106.38.176.185")){
60 | val targetIP = targetIPWithSalt01.split("-")(0)
61 | val saltNum = 9 //go from 99 salted keys down to 9
62 | val salt = new Random().nextInt(saltNum)
63 | (targetIP + "-" + salt,clientIPMap)
64 | }
65 | else kv
66 | }).reduceByKey((map1,map2) => { /**merge the two maps, adding the values of identical keys*/
67 | val map3 = new util.HashMap[String,Int](map1)
68 | map2.forEach((key,value) => {
69 | map3.merge(key, value, (v1,v2) => v1 + v2) //merge map1 and map2 into map3, summing the values of identical keys
70 | })
71 | map3
72 | },13)//adjust the number of partitions
73 |
74 | val finalRDD = reducedRDD_02.map(kv => {/**step 4: remove the salt entirely, collapsing the salted keys back into the single original key*/
75 | val targetIPWithSalt01 = kv._1
76 | val clientIPMap = kv._2
77 | if (targetIPWithSalt01.startsWith("106.38.176.185")){
78 | val targetIP = targetIPWithSalt01.split("-")(0)
79 | (targetIP,clientIPMap)//strip the salt completely
80 | }
81 | else kv
82 | }).reduceByKey(new MyPartitioner(4), (map1,map2) => { /**merge the two maps, adding the values of identical keys*/
83 | val map3 = new util.HashMap[String,Int](map1)
84 | map2.forEach((key,value) => {
85 | map3.merge(key, value, (v1,v2) => v1 + v2)
86 | })
87 | map3
88 | })//the custom partitioner sets the final number of partitions
89 |
90 | finalRDD.saveAsTextFile(args(1))
91 | }
92 | }
93 |
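
To make the two-stage salting above concrete, a small illustration of what happens to the hot key (the salt values are random; the ones shown are illustrative):

    // stage 1: "106.38.176.185"         -> "106.38.176.185-0" ... "106.38.176.185-98"  (99 salted keys, reduced in parallel)
    // stage 2: "106.38.176.185-{0..98}" -> "106.38.176.185-0" ... "106.38.176.185-8"   (re-salted into 9 keys, partial maps merged)
    // stage 3: "106.38.176.185-{0..8}"  -> "106.38.176.185"                            (salt removed, final merge under one key)
    // each reduce step therefore works on a fraction of the hot key's records instead of one overloaded partition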
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/data_skew/MyPartitioner.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.data_skew
2 |
3 | import org.apache.spark.Partitioner
4 |
5 | /**
6 | * @DESC: a custom partitioning strategy
7 | * @Author: Anryg
8 | * @Date: 2022/10/13 09:52
9 | */
10 | class MyPartitioner(partitionNum: Int) extends Partitioner{
11 | override def numPartitions: Int = partitionNum //total number of partitions
12 |
13 | override def getPartition(key: Any): Int = {//decides which partition a given key goes to
14 | val keyStr = key.toString
15 | val keyTag = keyStr.substring(keyStr.length - 1, keyStr.length)
16 | keyTag.toInt % partitionNum
17 | }
18 | }
19 |
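
A small worked example of the rule above (key values purely illustrative): getPartition only looks at the last character of the key, so with partitionNum = 4 the key "106.38.176.185" (last character '5') lands in partition 5 % 4 = 1, while a salted key such as "106.38.176.185-3" lands in partition 3 % 4 = 3. This assumes every key ends in a digit; a key ending in anything else would make keyTag.toInt throw a NumberFormatException.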
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/map_pk_mappartition/MapPartitionTest.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.map_pk_mappartition
2 |
3 | import java.util
4 |
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.SparkSession
7 |
8 | import scala.collection.mutable
9 |
10 | /**
11 | * @DESC: mapPartitions version of the CSV cleaning job, for comparison with MapTest
12 | * @Author: Anryg
13 | * @Date: 2022/9/20 10:10
14 | */
15 | object MapPartitionTest {
16 |
17 | def main(args: Array[String]): Unit = {
18 | val conf = new SparkConf().setAppName("MapPartitionTest")/*.setMaster("local")*/
19 | val spark = SparkSession.builder().config(conf).getOrCreate()
20 | val rawDF = spark.read/*.option("header",true)*/.csv(args(0))
21 |
22 | import spark.implicits._
23 | rawDF.printSchema()
24 | rawDF.show()
25 | val resultDF = rawDF.mapPartitions(iterator => {
26 | //val array = new mutable.ArrayBuffer[(String,String,String,String,String,String,String,String,String)]
27 | //val seq = mutable.Seq[(String,String,String,String,String,String,String,String,String)]
28 | //val list = new util.LinkedList[(String,String,String,String,String,String,String,String,String)]
29 | val set = new mutable.LinkedHashSet[(String,String,String,String,String,String,String,String,String)]
30 | while (iterator.hasNext){
31 | val next = iterator.next()
32 | val clientIP = next.getAs[String]("_c0")
33 | val domain = next.getAs[String]("_c1").toLowerCase//将域名转成小写
34 | val time = next.getAs[String]("_c2")
35 | val targetIP = next.getAs[String]("_c3")
36 | val rcode = next.getAs[String]("_c4")
37 | val queryType = next.getAs[String]("_c5")
38 | val authRecord = if (next.getAs[String]("_c6") == null ) "" else next.getAs[String]("_c6").toLowerCase
39 | val addMsg = if (next.getAs[String]("_c7") == null ) "" else next.getAs[String]("_c7")
40 | val dnsIP = next.getAs[String]("_c8")
41 |
42 | set.+=((clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP))
43 | //array.+=((clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP))
44 | }
45 | //array.toIterator
46 | set.toIterator
47 | }).toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip")
48 |
49 | resultDF.write.csv(args(1))
50 | }
51 |
52 |
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/spark-coding/src/main/scala/com/anryg/bigdata/test/map_pk_mappartition/MapTest.scala:
--------------------------------------------------------------------------------
1 | package com.anryg.bigdata.test.map_pk_mappartition
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.sql.SparkSession
5 |
6 | /**
7 | * @DESC: map version of the CSV cleaning job, for comparison with MapPartitionTest
8 | * @Author: Anryg
9 | * @Date: 2022/9/20 10:10
10 | */
11 | object MapTest {
12 |
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf().setAppName("MapTest")/*.setMaster("local")*/
15 | val spark = SparkSession.builder().config(conf).getOrCreate()
16 | val rawDF = spark.read/*.option("header",true)*/.csv(args(0))//read the source data from HDFS
17 |
18 | import spark.implicits._
19 | rawDF.printSchema() //spark job 1
20 | rawDF.show() //spark job 2
21 | val resultDF = rawDF.map(row => {
22 | val clientIP = row.getAs[String]("_c0")
23 | val domain = row.getAs[String]("_c1").toLowerCase//将域名转成小写
24 | val time = row.getAs[String]("_c2")
25 | val targetIP = row.getAs[String]("_c3")
26 | val rcode = row.getAs[String]("_c4")
27 | val queryType = row.getAs[String]("_c5")
28 | val authRecord = if (row.getAs[String]("_c6") == null ) "" else row.getAs[String]("_c6").toLowerCase
29 | val addMsg = if (row.getAs[String]("_c7") == null ) "" else row.getAs[String]("_c7")
30 | val dnsIP = row.getAs[String]("_c8")
31 | (clientIP,domain,time,targetIP,rcode,queryType,authRecord,addMsg,dnsIP)
32 | }).toDF("client_ip","domain","time","target_ip","rcode","query_type","authority_record","add_msg","dns_ip")
33 |
34 |
35 | /**write the transformed data to HDFS*/
36 | resultDF.write.csv(args(1))//spark job 3
37 | }
38 | }
39 |
--------------------------------------------------------------------------------