├── .gitignore ├── .gitattributes ├── dmp ├── dmp.iml ├── data │ ├── isp-mapping.dic │ ├── device-mapping.dic │ ├── network-mapping.dic │ └── data.txt ├── src │ └── main │ │ ├── scala │ │ └── com │ │ │ └── awebone │ │ │ └── dmp │ │ │ ├── tags │ │ │ ├── Tags.scala │ │ │ ├── AppTag.scala │ │ │ ├── ChannelTag.scala │ │ │ ├── AdPositionTag.scala │ │ │ ├── AreaTag.scala │ │ │ ├── KeyWordTag.scala │ │ │ └── DeviceTag.scala │ │ │ ├── constants │ │ │ └── AdTagConstants.scala │ │ │ ├── util │ │ │ └── Utils.scala │ │ │ ├── etl │ │ │ ├── DMPLogETLOps.scala │ │ │ └── DMPLogETLHDFSOps.scala │ │ │ ├── report │ │ │ ├── ProvinceCityQuantityJob.scala │ │ │ └── AreaRequestDistributionJob.scala │ │ │ ├── Logs.scala │ │ │ └── personas │ │ │ └── DmpPersonasJob.scala │ │ └── resources │ │ ├── hive-site.xml │ │ ├── hbase-site.xml │ │ ├── core-site.xml │ │ └── hdfs-site.xml ├── script │ └── mysql-create.sql └── pom.xml ├── mllib ├── mllib.iml ├── src │ └── main │ │ ├── resources │ │ ├── ml-1m │ │ │ ├── movies.dat │ │ │ └── README │ │ ├── core-site.xml │ │ └── hdfs-site.xml │ │ ├── scala │ │ └── com │ │ │ └── awebone │ │ │ └── spark │ │ │ ├── WordCountScala.scala │ │ │ ├── MovieLensSparkShell.scala │ │ │ └── MovieLensALS.scala │ │ └── java │ │ └── com │ │ └── awebone │ │ └── spark │ │ ├── WordCountJava8.java │ │ └── WordCountJava7.java └── pom.xml ├── akka_rpc ├── akka_rpc.iml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── awebone │ │ │ └── hadoop_rpc │ │ │ ├── MyDataNode.java │ │ │ ├── MyServerProtocal.java │ │ │ ├── MyServerImpl.java │ │ │ ├── NameNodeClient.java │ │ │ └── MyNamenode.java │ │ └── scala │ │ └── com │ │ └── awebone │ │ ├── yarn │ │ ├── Constant.scala │ │ ├── Message.scala │ │ ├── MyNodeManager.scala │ │ └── MyResourceManager.scala │ │ └── akka_rpc │ │ ├── Worker.scala │ │ └── Master.scala └── pom.xml ├── flink-train ├── flink-train.iml ├── src │ └── main │ │ ├── resources │ │ ├── scripts │ │ │ ├── kafka-script │ │ │ ├── mysql.sql │ │ │ └── es-scripts │ │ ├── hive-site.xml │ │ ├── hbase-site.xml │ │ ├── core-site.xml │ │ └── hdfs-site.xml │ │ └── scala │ │ └── com │ │ └── awebone │ │ └── flink │ │ ├── connetcor │ │ └── FileSystemSinkApp.scala │ │ └── project │ │ ├── MySQLSource.scala │ │ ├── MockKafkaProducer.scala │ │ ├── LogAnalysis.scala │ │ └── LogAnalysisWithMySQL.scala └── pom.xml ├── weblog ├── .settings │ ├── org.eclipse.m2e.core.prefs │ ├── org.eclipse.core.resources.prefs │ └── org.eclipse.jdt.core.prefs ├── src │ ├── main │ │ └── java │ │ │ ├── log4j.properties │ │ │ ├── core-site.xml │ │ │ ├── com │ │ │ └── awebone │ │ │ │ ├── pre │ │ │ │ ├── WebLogParse.java │ │ │ │ └── WebLogPreProcess.java │ │ │ │ ├── bean │ │ │ │ ├── VisitBean.java │ │ │ │ ├── PageViewsBean.java │ │ │ │ └── WebLogBean.java │ │ │ │ └── click │ │ │ │ ├── ClickModel.java │ │ │ │ └── ClickSessionStream.java │ │ │ ├── hdfs-site.xml │ │ │ └── hive-op.txt │ └── test │ │ └── java │ │ └── com │ │ └── awebone │ │ └── weblog │ │ └── AppTest.java ├── .project ├── pom.xml └── .classpath ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | target/ 4 | out/ -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /dmp/dmp.iml: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mllib/mllib.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /akka_rpc/akka_rpc.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dmp/data/isp-mapping.dic: -------------------------------------------------------------------------------- 1 | 1=移动 D0003001 2 | 2=联通 D0003002 3 | 3=电信 D0003003 4 | 4=OPERATOROTHER D0003004 -------------------------------------------------------------------------------- /flink-train/flink-train.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dmp/data/device-mapping.dic: -------------------------------------------------------------------------------- 1 | 1=Android D0001001 2 | 2=IOS D0001002 3 | 3=Winphone D0001003 4 | 4=其他 D0001004 -------------------------------------------------------------------------------- /dmp/data/network-mapping.dic: -------------------------------------------------------------------------------- 1 | 1=WIFI D0002001 2 | 2=4G D0002002 3 | 3=3G D0002003 4 | 4=2G D0002004 5 | 5=NWTWORKOTHER D0004004 -------------------------------------------------------------------------------- /mllib/src/main/resources/ml-1m/movies.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuyanbo03/bigdata-projects/HEAD/mllib/src/main/resources/ml-1m/movies.dat -------------------------------------------------------------------------------- /weblog/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /weblog/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyDataNode.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | public class MyDataNode { 4 | 5 | public static void main(String[] args) { 6 | 7 | 8 | 9 | 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/Tags.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | 5 | /** 6 | * 用户提取标签的特质 7 | */ 8 | trait Tags { 9 | 10 | def extractTag(logs:Logs):Map[String, Int] 11 | } 12 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyServerProtocal.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | public interface 
MyServerProtocal { 4 | 5 | long versionID = 12345678L; 6 | 7 | void hello(); 8 | 9 | String getName(); 10 | } 11 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/Constant.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | object Constant { 4 | val RMAS = "MyResourceManagerActorSystem" 5 | val RMA = "MyResourceManagerActor" 6 | val NMAS = "MyNodeManagerActorSystem" 7 | val NMA = "MyNodeManagerActor" 8 | } 9 | -------------------------------------------------------------------------------- /weblog/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 3 | org.eclipse.jdt.core.compiler.compliance=1.5 4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 5 | org.eclipse.jdt.core.compiler.source=1.5 6 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyServerImpl.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | public class MyServerImpl implements MyServerProtocal{ 4 | 5 | @Override 6 | public void hello() { 7 | System.out.println("hi"); 8 | } 9 | 10 | @Override 11 | public String getName() { 12 | return "mynamenode"; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /weblog/src/main/java/log4j.properties: -------------------------------------------------------------------------------- 1 | ###set log levels### 2 | log4j.rootLogger=info, stdout 3 | ###output to the console### 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=[%d{dd/MM/yy HH:mm:ss:SSS z}] %t %5p %c{2}: %m%n -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/AppTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | object AppTag extends Tags { 7 | override def extractTag(logs: Logs) = { 8 | val map = Map[String, Int]((AdTagConstants.PREFIX_AD_APP_TAG + logs.appname -> 1)) 9 | map 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/ChannelTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | /** 7 | * 3)渠道(标签格式:CNxxxx->1)xxxx为渠道ID 8 | */ 9 | object ChannelTag extends Tags { 10 | override def extractTag(logs: Logs) = { 11 | if(logs.channelid == null) { 12 | Map[String, Int]() 13 | } else { 14 | Map[String, Int]((AdTagConstants.PREFIX_AD_CHANNEL_TAG + logs.channelid -> 1)) 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/scripts/kafka-script: -------------------------------------------------------------------------------- 1 | 启动: 2 | zkServer.sh start 3 | nohup 
kafka-server-start.sh $KAFKA_HOME/config/server.properties 1>~/logs/kafka_std.log 2>~/logs/kafka_err.log & 4 | 5 | 查看topics: 6 | kafka-topics.sh --list --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka 7 | 8 | 创建topic:cdnlog 9 | kafka-topics.sh --create --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka --replication-factor 1 --partitions 1 --topic cdnlog 10 | 11 | 控制台消费: 12 | kafka-console-consumer.sh --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka --topic cdnlog -------------------------------------------------------------------------------- /weblog/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | weblog 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/AdPositionTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | import com.awebone.dmp.util.Utils 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * 标签一: 11 | 1)广告位类型(标签格式:LC03->1或者LC16->1)xx为数字,小于10 补0 12 | */ 13 | object AdPositionTag extends Tags { 14 | 15 | override def extractTag(logs: Logs) = { 16 | val map = mutable.Map[String, Int]() 17 | val adspacetype = Utils.fulfill(logs.adspacetype) 18 | map.put(AdTagConstants.PREFIX_AD_SPACE_TAG + "" + adspacetype, 1) 19 | map.toMap 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/scripts/mysql.sql: -------------------------------------------------------------------------------- 1 | create table uesr_domain_config( 2 | id int unsigned auto_increment, 3 | user_id varchar(40) not null, 4 | domain varchar(40) not null, 5 | primary key (id) 6 | ); 7 | 8 | insert into uesr_domain_config(user_id,domain) values('8000001','v1.awebone.com'); 9 | insert into uesr_domain_config(user_id,domain) values('8000002','v2.awebone.com'); 10 | insert into uesr_domain_config(user_id,domain) values('8000003','v3.awebone.com'); 11 | insert into uesr_domain_config(user_id,domain) values('8000004','v4.awebone.com'); 12 | insert into uesr_domain_config(user_id,domain) values('8000005','vmi.awebone.com'); 13 | 14 | select * from uesr_domain_config; -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/constants/AdTagConstants.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.constants 2 | 3 | /** 4 | * dmp中常见广告标签前缀常量 5 | */ 6 | object AdTagConstants { 7 | //广告位标签前缀 8 | val PREFIX_AD_SPACE_TAG = "LC_" 9 | //APP 10 | val PREFIX_AD_APP_TAG = "APP_" 11 | //渠道前缀 12 | val PREFIX_AD_CHANNEL_TAG = "CN_" 13 | //设备前缀 14 | val PREFIX_AD_DEVICE_TAG = "DEVICE_" 15 | //联网方式前缀 16 | val PREFIX_AD_NETWORK_TAG = "NET_" 17 | //设备运营商前缀 18 | val PREFIX_AD_ISP_TAG = "ISP_" 19 | //关键字前缀 20 | val PREFIX_AD_KEYWORD_TAG = "KW_" 21 | //省份地域前缀 22 | val PREFIX_AD_PROVINCE_TAG = "ZP_" 23 | //城市地域前缀 24 | val PREFIX_AD_CITY_TAG = "ZC_" 25 | } 26 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/AreaTag.scala: 
-------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 地域标签(省标签格式:ZPxxx->1,地市标签格式:ZCxxx->1)xxx为省或市名称 10 | */ 11 | object AreaTag extends Tags { 12 | override def extractTag(logs: Logs) = { 13 | val areaMap = mutable.Map[String, Int]() 14 | if(logs.provincename != null) { 15 | areaMap.put(AdTagConstants.PREFIX_AD_PROVINCE_TAG + logs.provincename, 1) 16 | } 17 | if(logs.cityname != null) { 18 | areaMap.put(AdTagConstants.PREFIX_AD_CITY_TAG + logs.cityname, 1) 19 | } 20 | areaMap.toMap 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/KeyWordTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 5)关键词(标签格式:Kxxx->1)xxx为关键字。 10 | * 关键词个数不能少于3个字符,且不能超过8个字符; 11 | * 关键字中如包含”|”,则分割成数组,转化成多个关键字标签 12 | “麻辣小龙虾|麻辣香锅|与神对话|家” 13 | */ 14 | object KeyWordTag extends Tags { 15 | override def extractTag(logs: Logs) = { 16 | val map = mutable.Map[String, Int]() 17 | if(logs.keywords != null) { 18 | val kws = logs.keywords.split("\\|") 19 | for (kw <- kws) { 20 | if(kw.length >= 3 && kw.length <= 8) { 21 | map.put(AdTagConstants.PREFIX_AD_KEYWORD_TAG + kw, 1) 22 | } 23 | } 24 | } 25 | map.toMap 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /weblog/src/test/java/com/awebone/weblog/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.awebone.weblog; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/NameNodeClient.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.ipc.RPC; 8 | 9 | public class NameNodeClient { 10 | 11 | public static void main(String[] args) { 12 | 13 | 14 | try { 15 | MyServerProtocal proxy = RPC.getProxy(MyServerProtocal.class, 16 | MyServerProtocal.versionID, 17 | new InetSocketAddress("localhost", 9988), new Configuration()); 18 | 19 | /** 20 | * proxy.hello(); 21 | * 的底层,其实就是调用: 22 | * 23 | * 服务器中的 setInstance这个参数对象中的hello方法 24 | */ 25 | proxy.hello(); 26 | System.out.println(proxy.getName()); 27 | 28 | 29 | } catch (IOException e) { 30 | e.printStackTrace(); 31 | } 32 | 33 | 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/scripts/es-scripts: -------------------------------------------------------------------------------- 1 | 创建索引库: 2 | curl -XPUT http://localhost:9200/cdn 3 | 4 | 删除索引库: 5 | curl -XDELETE http://localhost:9200/cdn 6 | 7 | 创建type表: 8 | curl -H "Content-Type: application/json" -XPOST http://localhost:9200/cdn/traffic/_mapping -d'{ 9 | "traffic": { 10 | "properties": { 11 | "domain": {"type": "keyword"}, 12 | "traffics": {"type": "long"}, 13 | "time": {"type": "date","format": "yyyy-MM-dd HH:mm"} 14 | } 15 | } 16 | }' 17 | 18 | curl -H "Content-Type: application/json" -XPOST http://localhost:9200/cdn/traffic-userid/_mapping -d'{ 19 | "traffic": { 20 | "properties": { 21 | "userid": {"type": "keyword"}, 22 | "domain": {"type": "text"}, 23 | "traffics": {"type": "long"}, 24 | "time": {"type": "date","format": "yyyy-MM-dd HH:mm"} 25 | } 26 | } 27 | }' -------------------------------------------------------------------------------- /dmp/script/mysql-create.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE `dmp`; 2 | CREATE TABLE `p_c_quantity` ( 3 | `data_date` date NOT NULL, 4 | `province` VARCHAR(40), 5 | `city` VARCHAR(40), 6 | `countz` bigint(20) NOT NULL 7 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 8 | 9 | CREATE TABLE `area_ad_req` ( 10 | `data_date` date NOT NULL, 11 | `province` VARCHAR(40), 12 | `city` VARCHAR(40), 13 | `orginal_req` bigint(20) DEFAULT NULL, 14 | `valid_req` bigint(20) DEFAULT NULL, 15 | `ad_req` bigint(20) DEFAULT NULL, 16 | `tpi_bid_num` bigint(20) DEFAULT NULL, 17 | `win_bid_num` bigint(20) DEFAULT NULL, 18 | `show_ad_master_num` bigint(20) DEFAULT NULL, 19 | `click_ad_master_num` bigint(20) DEFAULT NULL, 20 | `show_ad_media_num` bigint(20) DEFAULT NULL, 21 | `click_ad_media_num` bigint(20) DEFAULT NULL, 22 | `dsp_ad_xf` double DEFAULT NULL, 23 | `dsp_ad_cost` double DEFAULT NULL 24 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 25 
| -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big Data Project Collection 2 | 3 | ## 1. Offline user behavior log analysis with Hadoop (weblog) 4 | 5 | **Tech stack: Hadoop** 6 | 7 | - [x] Beans 8 | - [x] Click-stream data processing 9 | - [x] Click-session stream model construction 10 | - [x] Hive detail table construction 11 | - [x] User behavior metrics analysis 12 | 13 |
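As an illustration of the click-session step, here is a minimal sessionization sketch in plain Scala: order one visitor's page views by time and start a new session whenever the gap between consecutive views exceeds 30 minutes. The `PageView` case class, the 30-minute threshold and the helper names are assumptions for illustration only; the actual job builds `PageViewsBean`/`VisitBean` with MapReduce in `ClickSessionStream.java`.

```scala
import java.util.UUID

// Hypothetical, simplified record type; the real project uses WebLogBean / PageViewsBean.
case class PageView(ip: String, timeMillis: Long, request: String)

object SessionSketch {
  private def newSessionId(): String = UUID.randomUUID().toString

  // Split one visitor's page views into sessions: a gap larger than
  // gapMillis (default 30 minutes) between consecutive views starts a new session.
  def sessionize(views: Seq[PageView], gapMillis: Long = 30L * 60 * 1000): Seq[(String, PageView)] = {
    val sorted = views.sortBy(_.timeMillis)
    var sessionId = newSessionId()
    var prev = Long.MinValue
    sorted.map { pv =>
      if (prev != Long.MinValue && pv.timeMillis - prev > gapMillis) sessionId = newSessionId()
      prev = pv.timeMillis
      (sessionId, pv)
    }
  }
}
```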
14 | 15 | 16 | 17 | ## 2. RPC communication with Akka (akka_rpc) 18 | 19 | **Tech stack: Akka** 20 | 21 | - [x] Simulate Hadoop inter-node communication 22 | - [x] Simulate Spark inter-node communication 23 | - [x] Simulate YARN communication 24 | 25 |
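The `Master`/`Worker` pair under `akka_rpc/src/main/scala/com/awebone/akka_rpc` shows the pattern; below is a minimal local-only sketch of the same request/reply idea with remoting left out. The class and actor names here are illustrative, not the project's.

```scala
import akka.actor.{Actor, ActorSystem, Props}

// Minimal local echo pair mirroring the Master/Worker pattern used in akka_rpc.
class EchoMaster extends Actor {
  def receive: Receive = {
    case "hello" => sender() ! "hi" // reply to whoever sent the message
  }
}

class EchoWorker(masterPath: String) extends Actor {
  // On start, look up the master by path and send it a message.
  override def preStart(): Unit = context.actorSelection(masterPath) ! "hello"
  def receive: Receive = {
    case "hi" => println("master replied: hi")
  }
}

object EchoDemo extends App {
  // The real Master/Worker additionally configure akka.remote.netty.tcp.hostname/port
  // and connect with "akka.tcp://MasterActorSystem@localhost:6789/user/master".
  val system = ActorSystem("DemoSystem")
  system.actorOf(Props(new EchoMaster), "master")
  system.actorOf(Props(new EchoWorker("/user/master")), "worker")
}
```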
26 | 27 | 28 | 29 | ## 3. Advertising data management platform (dmp) 30 | 31 | **Tech stack: Spark, Scala** 32 | 33 | - [x] Ad log ETL 34 | - [x] Report statistics 35 | - [x] User persona construction 36 | - [x] Ad tag statistics 37 | - [x] Load DMP results into HBase 38 | 39 |
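Each `Tags` implementation in `dmp/src/main/scala/com/awebone/dmp/tags` turns one log record into a `Map[String, Int]` of tag counts. Below is a minimal sketch of how such per-record maps can be merged into one profile per user with Spark; the user-id key and the use of `reduceByKey` are assumptions for illustration — the project's actual aggregation lives in `DmpPersonasJob`, which is not excerpted here.

```scala
import org.apache.spark.rdd.RDD

object PersonaSketch {
  // Merge two tag maps by summing the counts of common tags.
  def mergeTags(a: Map[String, Int], b: Map[String, Int]): Map[String, Int] =
    (a.keySet ++ b.keySet).map(k => k -> (a.getOrElse(k, 0) + b.getOrElse(k, 0))).toMap

  // Collapse (userId, tagMap) pairs into a single tag map per user.
  def buildProfiles(tagged: RDD[(String, Map[String, Int])]): RDD[(String, Map[String, Int])] =
    tagged.reduceByKey(mergeTags)
}
```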
40 | 41 | 42 | 43 | ## 4. Personalized recommendation with Spark MLlib (mllib) 44 | 45 | **Tech stack: Spark, Scala** 46 | 47 | - [x] Build the MovieLens data model 48 | - [x] Cold start: the user rates 10 randomly chosen movies at startup 49 | - [x] Split the dataset 50 | - [x] Build the ALS model 51 | - [x] Model evaluation 52 | - [x] Personalized recommendation 53 | 54 |
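A hedged sketch of the split / train / evaluate / recommend steps with the RDD-based `spark.mllib` ALS API follows; the rank, iteration count, regularization value and the RMSE metric are illustrative placeholders, not the settings used in `MovieLensALS.scala`.

```scala
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.rdd.RDD

object AlsSketch {
  def trainAndEvaluate(ratings: RDD[Rating]): Unit = {
    // Split into training and test sets.
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), seed = 42L)

    // Train an ALS model (rank = 10, iterations = 10, lambda = 0.01 are placeholders).
    val model = ALS.train(training, 10, 10, 0.01)

    // Evaluate with RMSE on the held-out set.
    val predictions = model
      .predict(test.map(r => (r.user, r.product)))
      .map(p => ((p.user, p.product), p.rating))
    val rmse = math.sqrt(
      test.map(r => ((r.user, r.product), r.rating))
        .join(predictions)
        .map { case (_, (actual, predicted)) => math.pow(actual - predicted, 2) }
        .mean())
    println(s"test RMSE = $rmse")

    // Top-10 recommendations for user 1.
    model.recommendProducts(1, 10).foreach(println)
  }
}
```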
55 | 56 | 57 | 58 | ## 5. CDN log analysis with Flink (flink-train) 59 | 60 | **Tech stack: Flink, Scala** 61 | 62 | - [x] Mock Kafka producer generating log data 63 | - [x] CDN log analysis 64 | 65 |
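`MockKafkaProducer` emits tab-separated records of the form `cdnlog<TAB>CN<TAB>level<TAB>time<TAB>ip<TAB>domain<TAB>traffic` to the `cdnlog` topic. Below is a minimal sketch of the aggregation idea — summing traffic per domain over one-minute windows — using a socket source as a stand-in for the Kafka consumer; the window size and the source are assumptions, and the real jobs (`LogAnalysis.scala` / `LogAnalysisWithMySQL.scala`, not excerpted here) consume the Kafka topic instead.

```scala
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

object CdnTrafficSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    env.socketTextStream("localhost", 9999)                // stand-in for the Kafka source
      .map(_.split("\t"))
      .filter(_.length >= 7)
      .map(fields => (fields(5), fields(6).trim.toLong))   // (domain, traffic)
      .keyBy(0)
      .timeWindow(Time.minutes(1))
      .sum(1)
      .print()

    env.execute("CdnTrafficSketch")
  }
}
```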
66 | 67 | -------------------------------------------------------------------------------- /mllib/src/main/scala/com/awebone/spark/WordCountScala.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.spark 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | object WordCountScala { 7 | def main(args: Array[String]): Unit = { 8 | //获取程序入口 9 | val sparkConf: SparkConf = new SparkConf() 10 | sparkConf.setAppName(WordCountScala.getClass.getSimpleName) 11 | sparkConf.setMaster("local") 12 | val sparkContext: SparkContext = new SparkContext(sparkConf) 13 | 14 | //WorkCount 15 | val linesRDD: RDD[String] = sparkContext.textFile(args(0)) 16 | val wordRDD: RDD[String] = linesRDD.flatMap(_.split(" ")) 17 | val wordAndOneRDD: RDD[(String, Int)] = wordRDD.map((_, 1)) 18 | val wordsCountRDD = wordAndOneRDD.reduceByKey((x: Int, y: Int) => x + y) 19 | wordsCountRDD.foreach(x => println(x._1, x._2)) 20 | wordsCountRDD.saveAsTextFile(args(1)) 21 | 22 | sparkContext.stop() 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/Message.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | //样例类,做模式匹配 4 | 5 | //注册消息 nodemanager -> resourcemanager 6 | case class RegisterNodeManager(val nodemanagerid: String, val memory: Int, val cpu: Int) 7 | 8 | //资源: 不是说哪个任务需要多少资源,就把资源给这个任务 9 | //而是,某个节点有多少适合用于做计算的资源,那么就把这个任务启动在这个节点上 10 | 11 | 12 | //注册完成消息 resourcemanager -》 nodemanager 13 | case class RegisteredNodeManager(val resourcemanagerhostname: String) 14 | 15 | 16 | //心跳消息 nodemanager -》 resourcemanager 17 | case class Heartbeat(val nodemanagerid: String) 18 | 19 | /** 20 | * 是在RM中,为了维持整个集群中,到底哪个节点有多少资源 21 | * 所以吧每个节点的资源都封装在一个NodeManagerInfo对象里 22 | * 然后在RM中就维持了一个NodeManagerInfo对象的集合 23 | */ 24 | class NodeManagerInfo(val nodemanagerid: String, val memory: Int, val cpu: Int) { 25 | //用来存储nomanagerid这个NodeManager的最后一次心跳时间 26 | //_是一个默认值 27 | var lastHeartBeatTime: Long = _ 28 | } 29 | 30 | //单例 31 | case object SendMessage //仅仅是一个标志 32 | case object CheckTimeOut //也是一个标志 -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/connetcor/FileSystemSinkApp.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.connetcor 2 | 3 | 4 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 5 | import org.apache.flink.streaming.connectors.fs.StringWriter 6 | import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer} 7 | 8 | object FileSystemSinkApp { 9 | def main(args: Array[String]): Unit = { 10 | System.setProperty("HADOOP_USER_NAME","hadoop") 11 | val env = StreamExecutionEnvironment.getExecutionEnvironment 12 | val data = env.socketTextStream("hadoop04",9999) 13 | 14 | data.print().setParallelism(1) 15 | val filepath = "/tmpdata/flink/hdfssink" 16 | 17 | val sink = new BucketingSink[String](filepath) 18 | sink.setBucketer(new DateTimeBucketer[String]("yyyy-MM-dd--HHmm")) 19 | sink.setWriter(new StringWriter()) 20 | sink.setBatchRolloverInterval(20) 21 | 22 | data.addSink(sink) 23 | env.execute("FileSystemSinkApp") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyNamenode.java: 
-------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.HadoopIllegalArgumentException; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.ipc.RPC; 8 | import org.apache.hadoop.ipc.RPC.Server; 9 | 10 | public class MyNamenode { 11 | 12 | public static void main(String[] args) { 13 | 14 | 15 | try { 16 | 17 | /** 18 | * new MyServerImpl().hello() .getName() 19 | */ 20 | Server server = new RPC.Builder(new Configuration()) 21 | .setProtocol(MyServerProtocal.class) 22 | .setInstance(new MyServerImpl()) 23 | .setBindAddress("localhost") 24 | .setPort(9988) 25 | .build(); 26 | 27 | 28 | server.start(); 29 | System.out.println("SERVER START ......"); 30 | 31 | 32 | } catch (HadoopIllegalArgumentException e) { 33 | e.printStackTrace(); 34 | } catch (IOException e) { 35 | e.printStackTrace(); 36 | } 37 | 38 | 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /weblog/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.awebone 6 | weblog 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | weblog 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | 26 | org.apache.hadoop 27 | hadoop-client 28 | 2.7.6 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /weblog/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /mllib/src/main/java/com/awebone/spark/WordCountJava8.java: -------------------------------------------------------------------------------- 1 | package com.awebone.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import scala.Tuple2; 8 | 9 | import java.util.Arrays; 10 | 11 | public class WordCountJava8 { 12 | public static void main(String[] args) { 13 | //获取程序入口 14 | SparkConf sparkConf = new SparkConf(); 15 | sparkConf.setAppName("WordCountJava8"); 16 | sparkConf.setMaster("local"); 17 | JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf); 18 | 19 | //获取数据 20 | JavaRDD linesRDD = javaSparkContext.textFile("hdfs://myha/wc/input"); 21 | 22 | //计算 23 | JavaRDD rdd1 = linesRDD.flatMap(s -> Arrays.asList(s.split(" ")).iterator()); 24 | JavaPairRDD rdd2 = rdd1.mapToPair(s -> new Tuple2<>(s, 1)); 25 | JavaPairRDD rdd3 = rdd2.reduceByKey((x, y) -> x + y); 26 | 27 | rdd3.foreach(t -> System.out.println(t._1 + "\t" + t._2)); 28 | 29 | javaSparkContext.stop(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Awebone 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/akka_rpc/Worker.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.akka_rpc 2 | 3 | import akka.actor.{Actor, ActorSelection, ActorSystem, Props} 4 | import com.typesafe.config.ConfigFactory 5 | 6 | class Worker extends Actor{ 7 | 8 | override def preStart(): Unit = { 9 | //指定访问哪个节点上的哪个actorSystem的哪个actor 10 | val connectStr = "akka.tcp://MasterActorSystem@localhost:6789/user/master" 11 | val selection: ActorSelection = context.actorSelection(connectStr) 12 | 13 | selection ! "hello" 14 | } 15 | 16 | override def receive: Receive = { 17 | case "hi" => { 18 | println("master send hi") 19 | } 20 | 21 | case _ => println("非法消息") 22 | } 23 | } 24 | 25 | object WorkerRun{ 26 | def main(args: Array[String]): Unit = { 27 | val hostname = "localhost" 28 | val strConfig = 29 | s""" 30 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 31 | |akka.remote.netty.tcp.hostname = ${hostname} 32 | """.stripMargin 33 | 34 | val config = ConfigFactory.parseString(strConfig) 35 | val as = ActorSystem("WorkerActorSystem", config) 36 | 37 | as.actorOf(Props(new Worker()), "worker") 38 | } 39 | } -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/akka_rpc/Master.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.akka_rpc 2 | 3 | import akka.actor.{Actor, ActorSystem, Props} 4 | import com.typesafe.config.ConfigFactory 5 | 6 | class Master extends Actor{ 7 | 8 | override def preStart(): Unit = { 9 | //业务逻辑初始化 10 | println("prestart") 11 | } 12 | 13 | //相当于是一个run,处理业务逻辑时有消息传送过来 14 | override def receive: Receive = { 15 | case "hello" => { 16 | //这个注释代表模拟一个业务方法,得到结果 17 | println("receive hi") 18 | 19 | val result = "hi" 20 | //谁发送过来消息,谁就是sender() 21 | sender() ! 
result 22 | } 23 | 24 | case _ => println("非法新消息") 25 | } 26 | } 27 | 28 | object MasterRun{ 29 | def main(args: Array[String]): Unit = { 30 | val strConfig = 31 | """ 32 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 33 | |akka.remote.netty.tcp.hostname =localhost 34 | |akka.remote.netty.tcp.port=6789 35 | """.stripMargin 36 | 37 | val config = ConfigFactory.parseString(strConfig) 38 | val as = ActorSystem("MasterActorSystem",config) 39 | 40 | as.actorOf(Props(new Master()), "master") 41 | println("MasterActorSystem init") 42 | } 43 | } -------------------------------------------------------------------------------- /dmp/src/main/resources/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | javax.jdo.option.ConnectionURL 4 | jdbc:mysql://hadoop01:3306/hivedb_ms?createDatabaseIfNotExist=true 5 | JDBC connect string for a JDBC metastore 6 | 7 | 8 | javax.jdo.option.ConnectionDriverName 9 | com.mysql.jdbc.Driver 10 | Driver class name for a JDBC metastore 11 | 12 | 13 | javax.jdo.option.ConnectionUserName 14 | root 15 | username to use against metastore database 16 | 17 | 18 | javax.jdo.option.ConnectionPassword 19 | root 20 | password to use against metastore database 21 | 22 | 23 | 24 | hive.server2.thrift.port 25 | 10000 26 | 27 | 28 | hive.server2.thrift.bind.host 29 | hadoop04 30 | 31 | 32 | 33 | hive.metastore.uris 34 | thrift://hadoop04:9083 35 | 36 | 37 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | javax.jdo.option.ConnectionURL 4 | jdbc:mysql://hadoop01:3306/hivedb_ms?createDatabaseIfNotExist=true 5 | JDBC connect string for a JDBC metastore 6 | 7 | 8 | javax.jdo.option.ConnectionDriverName 9 | com.mysql.jdbc.Driver 10 | Driver class name for a JDBC metastore 11 | 12 | 13 | javax.jdo.option.ConnectionUserName 14 | root 15 | username to use against metastore database 16 | 17 | 18 | javax.jdo.option.ConnectionPassword 19 | root 20 | password to use against metastore database 21 | 22 | 23 | 24 | hive.server2.thrift.port 25 | 10000 26 | 27 | 28 | hive.server2.thrift.bind.host 29 | hadoop04 30 | 31 | 32 | 33 | hive.metastore.uris 34 | thrift://hadoop04:9083 35 | 36 | 37 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/DeviceTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 4)设备:操作系统|联网方式|运营商 10 | 设备操作系统 11 | 1 Android D0001001 12 | 2 IOS D0001002 13 | 3 Winphone D0001003 14 | 4 其他 D0001004 15 | 设备联网方式 16 | WIFI D0002001 17 | 4G D0002002 18 | 3G D0002003 19 | 2G D0002004 20 | NWTWORKOTHER D0004004 21 | 设备运营商方案 22 | 移动 D0003001 23 | 联通 D0003002 24 | 电信 D0003003 25 | OPERATOROTHER D0003004 26 | */ 27 | object DeviceTag extends Tags { 28 | override def extractTag(logs: Logs) = { 29 | val mMap = mutable.Map[String, Int]() 30 | //设备操作系统为:client 31 | if(logs.client != null) { 32 | mMap.put(AdTagConstants.PREFIX_AD_DEVICE_TAG + logs.client, 1) 33 | } 34 | //联网方式networkmannerid 35 | if(logs.networkmannerid != null) { 36 | mMap.put(AdTagConstants.PREFIX_AD_NETWORK_TAG + logs.networkmannerid, 1) 37 | } 38 | 39 | //设备运营商ispid 40 | if(logs.ispid != null) { 41 | 
mMap.put(AdTagConstants.PREFIX_AD_ISP_TAG + logs.ispid, 1) 42 | } 43 | mMap.toMap 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /dmp/src/main/resources/hbase-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | 26 | hbase.rootdir 27 | hdfs://myha/myhbase 28 | 29 | 30 | 31 | hbase.cluster.distributed 32 | true 33 | 34 | 35 | 36 | hbase.zookeeper.quorum 37 | hadoop01:2181,hadoop02:2181,hadoop03:2181 38 | 39 | 40 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/hbase-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | 26 | hbase.rootdir 27 | hdfs://myha/myhbase 28 | 29 | 30 | 31 | hbase.cluster.distributed 32 | true 33 | 34 | 35 | 36 | hbase.zookeeper.quorum 37 | hadoop01:2181,hadoop02:2181,hadoop03:2181 38 | 39 | 40 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.util 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | 5 | object Utils { 6 | def parseInt(str:String):Int = { 7 | if(StringUtils.isEmpty(str)) { 8 | 0 9 | } else { 10 | str.toInt 11 | } 12 | } 13 | 14 | def parseDouble(str:String):Double = { 15 | if(StringUtils.isEmpty(str)) { 16 | 0.0 17 | } else { 18 | str.toDouble 19 | } 20 | } 21 | 22 | //yyyy-MM-dd hh:mm:ss--->hh 23 | def fmtHour(str: String):Option[String] = { 24 | if(StringUtils.isEmpty(str)) { 25 | None 26 | } else { 27 | Some(str.substring(str.indexOf(" ") + 1, str.indexOf(" ") + 3)) 28 | } 29 | } 30 | 31 | //yyyy-MM-dd hh:mm:ss--->yyyy-MM-dd 32 | def fmtDate(str: String):Option[String] = { 33 | if(StringUtils.isEmpty(str)) { 34 | None 35 | } else { 36 | Some(str.substring(0, str.indexOf(" "))) 37 | } 38 | } 39 | 40 | //补全两位字符串 41 | def fulfill(str:String) = { 42 | if(str != null && str.length > 1) { 43 | str 44 | } else if(!"".equals(str) && str.length == 1){ 45 | 0 + "" + str 46 | } else { 47 | "other" 48 | } 49 | } 50 | //补全数字 51 | def fulfill(num:Int) = { 52 | if(num >= 0 && num < 10) { 53 | "0" + num 54 | } else { 55 | "" + num 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /dmp/data/data.txt: -------------------------------------------------------------------------------- 1 | 0bb49045000057eee4ed3a580019ca06,0,0,0,100002,未知,26C7B9C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,139.227.161.115,com.apptreehot.horse,马上赚,AQ+KIQeBhehxf6xf98BFFnl+CV00p,A10%E55F%BC%E6%AO%B%,1,4.1.1,,760,980,,,上海市,上海市,4,未知,3,Wifi,0,0,2,插屏,1,2,6,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+KIQeBhexf6x988FFnl+CVOOp,1,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17,, 2 | 0bfbf7c8000057eee4ed2a0b000ca4d3,0,0,0,100002,未知,26C07B8C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,58.47.147.169,cn.touchnagic.game.cllubpa2121bvnoolgwwel,其他,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,lenovo+A500,1,2.3.5,,480,800,,,湘南省,益阳市,4,未知,3,Wifi,0,0,2,插屏,1,2,999,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,2,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17 ,, 3 | 0bb49045000057eee4ed3a580019ca06,0,0,0,100002,未知,26C7B9C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 
06:19:17,139.227.161.115,com.apptreehot.horse,马上赚,AQ+KIQeBhehxf6xf98BFFnl+CV00p,A10%E55F%BC%E6%AO%B%,1,4.1.1,,760,980,,,上海市,上海市,4,未知,3,Wifi,0,0,2,插屏,1,2,6,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+KIQeBhexf6x988FFnl+CVOOp,1,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17,, 4 | 0bfbf7c8000057eee4ed2a0b000ca4d3,0,0,0,100002,未知,26C07B8C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,58.47.147.169,cn.touchnagic.game.cllubpa2121bvnoolgwwel,其他,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,lenovo+A500,1,2.3.5,,480,800,,,湘南省,益阳市,4,未知,3,Wifi,0,0,2,插屏,1,2,999,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,2,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17 ,, -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/etl/DMPLogETLOps.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.etl 2 | 3 | import com.awebone.dmp.Logs 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.serializer.KryoSerializer 8 | import org.apache.spark.sql.{Dataset, SaveMode, SparkSession} 9 | 10 | /** 11 | * 日志数据清洗过程 12 | * 13 | * 1)要求一:将数据转换成parquet文件格式 14 | * 2)要求二:序列化方式采用KryoSerializer方式 15 | * 3)要求三:parquet文件采用Sanppy压缩方式 16 | * 17 | * 通过处理分析,使用SparkCore只能完成KryoSerializer和Snappy,想要完成parquet比较困难, 18 | * 而SparkSQL处理parquet文件非常简单,所以需要将原先的编码做一稍微改动 19 | */ 20 | object DMPLogETLOps { 21 | def main(args: Array[String]): Unit = { 22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 25 | 26 | val conf: SparkConf = new SparkConf().setAppName("DMPLogETL").setMaster("local[*]") 27 | .set("spark.serializer",classOf[KryoSerializer].getName) 28 | .registerKryoClasses(Array(classOf[Logs])) //要求二:序列化方式采用KryoSerializer方式 29 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 30 | import spark.implicits._ 31 | 32 | val lines:RDD[String] = spark.sparkContext.textFile("file:///D:\\workplace\\dmp\\data\\data.txt") 33 | 34 | val retDS: Dataset[Logs] = lines.map(line => { 35 | val log: Logs = Logs.line2Logs(line) 36 | log 37 | }).toDS() 38 | 39 | /** 40 | * 要求一:将数据转换成parquet文件格式 41 | * 要求三:parquet文件采用Sanppy压缩方式 42 | */ 43 | retDS.write.mode(SaveMode.Overwrite).parquet("file:///D:\\workplace\\dmp\\data\\out\\") 44 | 45 | spark.stop() 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/MySQLSource.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | 4 | import java.sql.{Connection, DriverManager, PreparedStatement} 5 | 6 | import org.apache.flink.configuration.Configuration 7 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction} 8 | 9 | import scala.collection.mutable 10 | 11 | /** 12 | * 自定义Mysql 并行的Source 13 | */ 14 | class MySQLSource extends RichParallelSourceFunction[mutable.HashMap[String, String]] { 15 | var connection: Connection = null 16 | var ps: PreparedStatement = null 17 | 18 | //创建连接 19 | override def open(parameters: Configuration): Unit = { 20 | super.open(parameters) 21 | val driver = "com.mysql.jdbc.Driver" 22 | val url = "jdbc:mysql://hadoop01:3306/flink" 23 | 
val user = "root" 24 | val password = "root" 25 | Class.forName(driver) 26 | connection = DriverManager.getConnection(url, user, password) 27 | 28 | val sql = "select user_id,domain from user_domain_config" 29 | ps = connection.prepareStatement(sql) 30 | } 31 | 32 | //不断执行的函数 33 | override def run(sourceContext: SourceFunction.SourceContext[mutable.HashMap[String, String]]): Unit = { 34 | val resultSet = ps.executeQuery() 35 | val collect = mutable.HashMap[String,String]() 36 | 37 | //将查询结果放入HashMap中 38 | while (resultSet.next()){ 39 | collect.put(resultSet.getNString("domain"), resultSet.getNString("user_id")) 40 | } 41 | sourceContext.collect(collect) 42 | } 43 | 44 | override def cancel(): Unit = {} 45 | 46 | override def close(): Unit = { 47 | if(ps != null){ 48 | ps.close() 49 | } 50 | if(connection != null){ 51 | connection.close() 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /dmp/src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /weblog/src/main/java/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /mllib/src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 
34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/report/ProvinceCityQuantityJob.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.report 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 8 | 9 | /** 10 | * 省份:province 11 | * 城市:city 12 | * 结果存储到MySQL数据库 13 | * select 14 | * province, 15 | * city, 16 | * count(1) 17 | * from logs 18 | * group by province, city 19 | **/ 20 | object ProvinceCityQuantityJob { 21 | def main(args: Array[String]): Unit = { 22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 25 | 26 | if(args == null || args.length < 2){ 27 | println( 28 | """Parameter Errors! Usage: 29 | |inputpath : input path 30 | |table : mysql table name 31 | """.stripMargin) 32 | System.exit(-1) 33 | } 34 | val Array(inputpath, table) = args 35 | 36 | val conf: SparkConf = new SparkConf().setAppName("ProvinceCityQuantityJob").setMaster("local[*]") 37 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 38 | 39 | val input: DataFrame = spark.read.parquet(inputpath) 40 | input.createOrReplaceTempView("logs") 41 | 42 | val sql = 43 | """ 44 | |select 45 | | date_sub(current_date(), 0) data_date, 46 | | provincename province, 47 | | cityname city, 48 | | count(1) as countz 49 | |from logs 50 | |group by provincename, cityname 51 | """.stripMargin 52 | 53 | val url = "jdbc:mysql://hadoop01:3306/dmp" 54 | val properties = new Properties 55 | properties.put("user","root") 56 | properties.put("password","root") 57 | 58 | spark.sql(sql).write.mode(SaveMode.Append).jdbc(url,table,properties) 59 | 60 | spark.stop() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/etl/DMPLogETLHDFSOps.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.etl 2 | 3 | import com.awebone.dmp.Logs 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.serializer.KryoSerializer 8 | import org.apache.spark.sql.{Dataset, SaveMode, SparkSession} 9 | 10 | /** 11 | * 日志数据清洗过程 12 | * 13 | * 1)要求一:将数据转换成parquet文件格式 14 | * 2)要求二:序列化方式采用KryoSerializer方式 15 | * 3)要求三:parquet文件采用Sanppy压缩方式 16 | * 17 | * 通过处理分析,使用SparkCore只能完成KryoSerializer和Snappy,想要完成parquet比较困难, 18 | * 而SparkSQL处理parquet文件非常简单,所以需要将原先的编码做一稍微改动 19 | */ 20 | object DMPLogETLHDFSOps { 21 | def main(args: Array[String]): Unit = { 22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 25 | 26 | if(args == null || args.length < 2){ 27 | println( 28 | """Parameter Errors! 
Usage: 29 | |inputpath : input path 30 | |outputpath : output path 31 | """.stripMargin) 32 | System.exit(-1) 33 | } 34 | val Array(inputpath, outputpath) = args 35 | 36 | val conf: SparkConf = new SparkConf().setAppName("DMPLogETL").setMaster("local[*]") 37 | .set("spark.serializer",classOf[KryoSerializer].getName) 38 | .registerKryoClasses(Array(classOf[Logs])) //要求二:序列化方式采用KryoSerializer方式 39 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 40 | import spark.implicits._ 41 | 42 | val lines:RDD[String] = spark.sparkContext.textFile(inputpath) 43 | 44 | val retDS: Dataset[Logs] = lines.map(line => { 45 | val log: Logs = Logs.line2Logs(line) 46 | log 47 | }).toDS() 48 | 49 | /** 50 | * 要求一:将数据转换成parquet文件格式 51 | * 要求三:parquet文件采用Sanppy压缩方式 52 | */ 53 | retDS.write.mode(SaveMode.Overwrite).parquet(outputpath) 54 | 55 | spark.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/pre/WebLogParse.java: -------------------------------------------------------------------------------- 1 | package com.awebone.pre; 2 | 3 | import java.text.ParseException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.HashSet; 6 | import java.util.Locale; 7 | import java.util.Set; 8 | 9 | import com.awebone.bean.WebLogBean; 10 | 11 | public class WebLogParse { 12 | static SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:hh:mm:ss", Locale.US); 13 | static SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss"); 14 | static Set pages = new HashSet(); 15 | static { 16 | pages.add("/about"); 17 | pages.add("/black-ip-list/"); 18 | pages.add("/cassandra-clustor/"); 19 | pages.add("/finance-rhive-repurchase/"); 20 | pages.add("/hadoop-family-roadmap/"); 21 | pages.add("/hadoop-hive-intro/"); 22 | pages.add("/hadoop-zookeeper-intro/"); 23 | pages.add("/hadoop-mahout-roadmap/"); 24 | } 25 | 26 | public static WebLogBean parse(String line) throws ParseException { 27 | // 参数代表一行日志信息 28 | String[] log_datas = line.split(" "); 29 | if (log_datas.length >= 12) { 30 | String addr = log_datas[0]; 31 | String user = log_datas[2]; 32 | String local_time = log_datas[3]; 33 | // 时间解析 34 | String format_time = sdf2.format(sdf1.parse(local_time.substring(1))); 35 | if (null == format_time || "".equals(format_time)) { 36 | format_time = "_invalid_"; 37 | } 38 | String request = log_datas[6]; 39 | String status = log_datas[8]; 40 | String byte_sent = log_datas[9]; 41 | String http_refer = log_datas[10]; 42 | // 拼接浏览器对象 43 | StringBuffer sb = new StringBuffer(); 44 | for (int i = 11; i < log_datas.length; i++) { 45 | sb.append(log_datas[i] + " "); 46 | } 47 | String user_agent = sb.substring(1, sb.length() - 2); 48 | 49 | WebLogBean bean = new WebLogBean(false, addr, user, format_time, request, status, byte_sent, http_refer, 50 | user_agent); 51 | // 判断数据有效性 52 | if ("_invalid_".equals(format_time)) { 53 | bean.setValid(false); 54 | } 55 | if (Integer.parseInt(bean.getStatus()) > 400) { 56 | bean.setValid(false); 57 | } 58 | if (pages.contains(bean.getRequest())) { 59 | bean.setValid(true); 60 | } 61 | return bean; 62 | }else{ 63 | return null; 64 | } 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/pre/WebLogPreProcess.java: -------------------------------------------------------------------------------- 1 | package com.awebone.pre; 2 | 3 | import java.io.IOException; 4 | import 
java.text.ParseException; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | import com.awebone.bean.WebLogBean; 17 | 18 | // Pre-process the raw log data 19 | public class WebLogPreProcess { 20 | /** 21 | * @author Awebone 22 | * Map side: 23 | * one input line --- one log record --- one Hive row 24 | * split it, wrap it into a bean, emit it, and write it out to HDFS 25 | * key: null 26 | * value: custom writable bean 27 | */ 28 | static class WebLogPreProcessMapper extends Mapper<LongWritable, Text, NullWritable, WebLogBean> { 29 | @Override 30 | protected void map(LongWritable key, Text value, 31 | Mapper<LongWritable, Text, NullWritable, WebLogBean>.Context context) 32 | throws IOException, InterruptedException { 33 | String line = value.toString(); 34 | try { 35 | WebLogBean webLogBean = WebLogParse.parse(line); 36 | if (webLogBean != null) { 37 | context.write(NullWritable.get(), webLogBean); 38 | } 39 | } catch (ParseException e) { 40 | e.printStackTrace(); 41 | } 42 | } 43 | } 44 | 45 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException { 46 | System.setProperty("HADOOP_USER_NAME", "hadoop"); 47 | Configuration conf = new Configuration(); 48 | conf.set("fs.defaultFS", "hdfs://myha/"); 49 | Job job = Job.getInstance(conf); 50 | 51 | job.setJarByClass(WebLogPreProcess.class); 52 | 53 | job.setMapperClass(WebLogPreProcessMapper.class); 54 | job.setOutputKeyClass(NullWritable.class); 55 | job.setOutputValueClass(WebLogBean.class); 56 | 57 | FileInputFormat.setInputPaths(job, new Path("/weblog/20200221")); 58 | FileOutputFormat.setOutputPath(job, new Path("/weblog/pre/20200221")); 59 | 60 | // no reducer needed, so set the number of reduce tasks to 0 61 | job.setNumReduceTasks(0); 62 | 63 | boolean res = job.waitForCompletion(true); 64 | System.exit(res ? 
0 : 1); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/MockKafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.{Date, Properties} 5 | 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | import org.apache.kafka.common.serialization.StringSerializer 8 | 9 | import scala.util.Random 10 | 11 | object MockKafkaProducer { 12 | 13 | private def getLevels() = { 14 | val levels = Array[String]("M","E") 15 | 16 | levels(new Random().nextInt(levels.length)) 17 | } 18 | 19 | private def getIps() = { 20 | val ips = Array[String]("233.104.18.110", 21 | "113.101.75.194", 22 | "27.17.127.135", 23 | "185.225.139.16", 24 | "112.1.66.34", 25 | "175.148.211.190", 26 | "183.227.58.21", 27 | "59.83.198.84", 28 | "117.28.38.28", 29 | "117.59.39.169") 30 | 31 | ips(new Random().nextInt(ips.length)) 32 | } 33 | 34 | private def getDomains() = { 35 | val domains = Array[String]("v1.awebone.com", "v2.awebone.com", "v3.awebone.com", "v4.awebone.com", "vmi.awebone.com") 36 | 37 | domains(new Random().nextInt(domains.length)) 38 | } 39 | 40 | private def getTraffic() = new Random().nextInt(10000) 41 | 42 | def main(args: Array[String]): Unit = { 43 | val properties: Properties = new Properties() 44 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092") 45 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk 46 | // properties.put("metadata.broker.list", "hadoop04:9092") // 声明kafka broker 47 | properties.setProperty("key.serializer", classOf[StringSerializer].getName) 48 | properties.setProperty("value.serializer", classOf[StringSerializer].getName) 49 | 50 | val producer = new KafkaProducer[String, String](properties) 51 | val topic = "cdnlog" 52 | 53 | while (true){ 54 | val builder = new StringBuilder() 55 | builder.append("cdnlog").append("\t") 56 | .append("CN").append("\t") 57 | .append(getLevels()).append("\t") 58 | .append(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())).append("\t") 59 | .append(getIps()).append("\t") 60 | .append(getDomains()).append("\t") 61 | .append(getTraffic()).append("\t") 62 | 63 | println(builder.toString()) 64 | val pr = new ProducerRecord[String, String](topic, builder.toString()) 65 | producer.send(pr) 66 | Thread.sleep(2000) 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/MyNodeManager.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | import java.util.UUID 4 | 5 | import akka.actor.{Actor, ActorSelection, ActorSystem, Props} 6 | import com.typesafe.config.ConfigFactory 7 | import sun.plugin2.message.HeartbeatMessage 8 | 9 | class MyNodeManager(val resourcemanagerhostname: String, val resourcemanagerport: Int, val memory: Int, val cpu: Int) extends Actor { 10 | 11 | var nodemanagerid: String = _ 12 | var rmRef: ActorSelection = _ 13 | 14 | override def preStart(): Unit = { 15 | // 远程path   akka.tcp://(ActorSystem的名称)@(远程地址的IP) : (远程地址的端口)/user/(Actor的名称) 16 | rmRef = context.actorSelection(s"akka.tcp://${Constant.RMAS}@${resourcemanagerhostname}:${resourcemanagerport}/user/${Constant.RMA}") 17 | 18 | // val 
nodemanagerid:String 19 | // val memory:Int 20 | // val cpu:Int 21 | nodemanagerid = UUID.randomUUID().toString 22 | //发送注册消息 23 | rmRef ! RegisterNodeManager(nodemanagerid, memory, cpu) 24 | } 25 | 26 | override def receive: Receive = { 27 | case RegisteredNodeManager(masterURL) => { 28 | println(masterURL); 29 | 30 | /** 31 | * initialDelay: FiniteDuration, 多久以后开始执行 32 | * interval: FiniteDuration, 每隔多长时间执行一次 33 | * receiver: ActorRef, 给谁发送这个消息 34 | * message: Any 发送的消息是啥 35 | */ 36 | import scala.concurrent.duration._ 37 | import context.dispatcher 38 | //每个4秒对自己发送信息,然后就可以发送心跳信息 39 | context.system.scheduler.schedule(0 millis, 4000 millis, self, SendMessage) 40 | } 41 | 42 | case SendMessage => { 43 | 44 | //向主节点发送心跳信息 45 | rmRef ! Heartbeat(nodemanagerid) 46 | 47 | println(Thread.currentThread().getId) 48 | } 49 | } 50 | } 51 | 52 | object MyNodeManager { 53 | def main(args: Array[String]): Unit = { 54 | val HOSTNAME = args(0) 55 | val RM_HOSTNAME = args(1) 56 | val RM_PORT = args(2).toInt 57 | val NODEMANAGER_MEMORY = args(3).toInt 58 | val NODEMANAGER_CORE = args(4).toInt 59 | var NODEMANAGER_PORT = args(5).toInt 60 | val str = 61 | s""" 62 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 63 | |akka.remote.netty.tcp.hostname =${HOSTNAME} 64 | |akka.remote.netty.tcp.port=${NODEMANAGER_PORT} 65 | """.stripMargin 66 | val conf = ConfigFactory.parseString(str) 67 | val actorSystem = ActorSystem(Constant.NMAS, conf) 68 | actorSystem.actorOf(Props(new MyNodeManager(RM_HOSTNAME, RM_PORT, NODEMANAGER_MEMORY, NODEMANAGER_CORE)), Constant.NMA) 69 | } 70 | } -------------------------------------------------------------------------------- /mllib/src/main/java/com/awebone/spark/WordCountJava7.java: -------------------------------------------------------------------------------- 1 | package com.awebone.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.FlatMapFunction; 8 | import org.apache.spark.api.java.function.Function2; 9 | import org.apache.spark.api.java.function.PairFunction; 10 | import org.apache.spark.api.java.function.VoidFunction; 11 | import scala.Tuple2; 12 | 13 | import java.util.Arrays; 14 | import java.util.Iterator; 15 | 16 | public class WordCountJava7 { 17 | public static void main(String[] args) { 18 | //获取程序入口 19 | SparkConf sparkConf = new SparkConf(); 20 | sparkConf.setAppName("WordCountJava7"); 21 | sparkConf.setMaster("local"); 22 | JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf); 23 | 24 | //获取数据 25 | JavaRDD linesRDD = javaSparkContext.textFile("hdfs://myha/wc/input"); 26 | 27 | //计算 28 | JavaRDD wordsRDD = linesRDD.flatMap(new FlatMapFunction() { 29 | @Override 30 | public Iterator call(String s) throws Exception { 31 | return Arrays.asList(s.split(" ")).iterator(); 32 | } 33 | }); 34 | 35 | JavaPairRDD wordAndOneRDD = wordsRDD.mapToPair(new PairFunction() { 36 | @Override 37 | public Tuple2 call(String s) throws Exception { 38 | return new Tuple2<>(s, 1); 39 | } 40 | }); 41 | 42 | JavaPairRDD wordsCountRDD = wordAndOneRDD.reduceByKey(new Function2() { 43 | @Override 44 | public Integer call(Integer integer, Integer integer2) throws Exception { 45 | return integer + integer2; 46 | } 47 | }); 48 | 49 | JavaPairRDD newWordsCountRDD = wordsCountRDD.mapToPair(new PairFunction, Integer, String>() { 50 | @Override 51 | public Tuple2 call(Tuple2 
stringIntegerTuple2) throws Exception { 52 | return stringIntegerTuple2.swap(); 53 | } 54 | }); 55 | JavaPairRDD sortedRDD = newWordsCountRDD.sortByKey(false); 56 | JavaPairRDD lastSortWordCoundRDD = sortedRDD.mapToPair(new PairFunction, String, Integer>() { 57 | @Override 58 | public Tuple2 call(Tuple2 integerStringTuple2) throws Exception { 59 | return integerStringTuple2.swap(); 60 | } 61 | }); 62 | 63 | lastSortWordCoundRDD.foreach(new VoidFunction>() { 64 | @Override 65 | public void call(Tuple2 t) throws Exception { 66 | System.out.println(t._1 + "\t" + t._2); 67 | } 68 | }); 69 | 70 | javaSparkContext.stop(); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/MyResourceManager.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | import akka.actor.{Actor, ActorSystem, Props} 4 | import com.typesafe.config.ConfigFactory 5 | 6 | import scala.collection.mutable 7 | 8 | class MyResourceManager(var hostname: String, var port: Int) extends Actor { 9 | 10 | // 用来存储每个注册的NodeManager节点的信息 11 | private var id2nodemanagerinfo = new mutable.HashMap[String, NodeManagerInfo]() 12 | // 对所有注册的NodeManager进行去重,其实就是一个HashSet 13 | private var nodemanagerInfoes = new mutable.HashSet[NodeManagerInfo]() 14 | 15 | // actor在最开始的时候,会执行一次 16 | override def preStart(): Unit = { 17 | import scala.concurrent.duration._ 18 | import context.dispatcher 19 | 20 | // 调度一个任务, 每隔五秒钟执行一次,每隔5秒给自己发送一次信息 21 | context.system.scheduler.schedule(0 millis, 5000 millis, self, CheckTimeOut) 22 | } 23 | 24 | override def receive: Receive = { 25 | 26 | case RegisterNodeManager(nodemanagerid, memory, cpu) => { 27 | val nodeManagerInfo = new NodeManagerInfo(nodemanagerid, memory, cpu) 28 | 29 | // 对注册的NodeManager节点进行存储管理 30 | id2nodemanagerinfo.put(nodemanagerid, nodeManagerInfo) 31 | nodemanagerInfoes += nodeManagerInfo 32 | 33 | //把信息存到zookeeper 34 | sender() ! 
RegisteredNodeManager(hostname + ":" + port) 35 | } 36 | 37 | case Heartbeat(nodemanagerid) => { 38 | val currentTime = System.currentTimeMillis() 39 | val nodeManagerInfo = id2nodemanagerinfo(nodemanagerid) 40 | nodeManagerInfo.lastHeartBeatTime = currentTime 41 | 42 | id2nodemanagerinfo(nodemanagerid) = nodeManagerInfo 43 | nodemanagerInfoes += nodeManagerInfo 44 | } 45 | 46 | // 检查过期失效的 NodeManager 47 | case CheckTimeOut => { 48 | val currentTime = System.currentTimeMillis() 49 | 50 | // 15 秒钟失效 51 | //foreach:遍历 52 | //filter:拿到所有的已经宕机的节点 53 | nodemanagerInfoes.filter(nm => currentTime - nm.lastHeartBeatTime > 15000) 54 | .foreach(deadnm => { 55 | nodemanagerInfoes -= deadnm 56 | id2nodemanagerinfo.remove(deadnm.nodemanagerid) 57 | }) 58 | println("当前注册成功的节点数" + nodemanagerInfoes.size); 59 | } 60 | } 61 | } 62 | 63 | object MyResourceManager { 64 | def main(args: Array[String]): Unit = { 65 | val RESOURCEMANAGER_HOSTNAME = args(0) //解析的配置的日志 66 | val RESOURCEMANAGER_PORT = args(1).toInt 67 | 68 | //解析运行时所需要的参数 69 | val str = 70 | s""" 71 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 72 | |akka.remote.netty.tcp.hostname =${RESOURCEMANAGER_HOSTNAME} 73 | |akka.remote.netty.tcp.port=${RESOURCEMANAGER_PORT} 74 | """.stripMargin 75 | 76 | val conf = ConfigFactory.parseString(str) 77 | val actorSystem = ActorSystem(Constant.RMAS, conf) 78 | 79 | //启动一个actor 80 | actorSystem.actorOf(Props(new MyResourceManager(RESOURCEMANAGER_HOSTNAME, RESOURCEMANAGER_PORT)), Constant.RMA) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/bean/VisitBean.java: -------------------------------------------------------------------------------- 1 | package com.awebone.bean; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class VisitBean implements Writable { 10 | 11 | private String session; 12 | private String remote_addr; 13 | private String inTime; 14 | private String outTime; 15 | private String inPage; 16 | private String outPage; 17 | private String referal; 18 | private int pageVisits; 19 | 20 | public void set(String session, String remote_addr, String inTime, String outTime, String inPage, String outPage, String referal, int pageVisits) { 21 | this.session = session; 22 | this.remote_addr = remote_addr; 23 | this.inTime = inTime; 24 | this.outTime = outTime; 25 | this.inPage = inPage; 26 | this.outPage = outPage; 27 | this.referal = referal; 28 | this.pageVisits = pageVisits; 29 | } 30 | 31 | public String getSession() { 32 | return session; 33 | } 34 | 35 | public void setSession(String session) { 36 | this.session = session; 37 | } 38 | 39 | public String getRemote_addr() { 40 | return remote_addr; 41 | } 42 | 43 | public void setRemote_addr(String remote_addr) { 44 | this.remote_addr = remote_addr; 45 | } 46 | 47 | public String getInTime() { 48 | return inTime; 49 | } 50 | 51 | public void setInTime(String inTime) { 52 | this.inTime = inTime; 53 | } 54 | 55 | public String getOutTime() { 56 | return outTime; 57 | } 58 | 59 | public void setOutTime(String outTime) { 60 | this.outTime = outTime; 61 | } 62 | 63 | public String getInPage() { 64 | return inPage; 65 | } 66 | 67 | public void setInPage(String inPage) { 68 | this.inPage = inPage; 69 | } 70 | 71 | public String getOutPage() { 72 | return outPage; 73 | } 74 | 75 | public void setOutPage(String outPage) { 76 | this.outPage = outPage; 77 | } 
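// VisitBean is a Hadoop Writable: the write(DataOutput) and readFields(DataInput) methods
// further down this class must keep the fields in exactly the same order
// (session, remote_addr, inTime, outTime, inPage, outPage, referal, pageVisits),
// otherwise deserialized records come back with their fields shuffled.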
78 | 79 | public String getReferal() { 80 | return referal; 81 | } 82 | 83 | public void setReferal(String referal) { 84 | this.referal = referal; 85 | } 86 | 87 | public int getPageVisits() { 88 | return pageVisits; 89 | } 90 | 91 | public void setPageVisits(int pageVisits) { 92 | this.pageVisits = pageVisits; 93 | } 94 | 95 | public void readFields(DataInput in) throws IOException { 96 | this.session = in.readUTF(); 97 | this.remote_addr = in.readUTF(); 98 | this.inTime = in.readUTF(); 99 | this.outTime = in.readUTF(); 100 | this.inPage = in.readUTF(); 101 | this.outPage = in.readUTF(); 102 | this.referal = in.readUTF(); 103 | this.pageVisits = in.readInt(); 104 | 105 | } 106 | 107 | public void write(DataOutput out) throws IOException { 108 | out.writeUTF(session); 109 | out.writeUTF(remote_addr); 110 | out.writeUTF(inTime); 111 | out.writeUTF(outTime); 112 | out.writeUTF(inPage); 113 | out.writeUTF(outPage); 114 | out.writeUTF(referal); 115 | out.writeInt(pageVisits); 116 | 117 | } 118 | 119 | @Override 120 | public String toString() { 121 | return session + "\001" + remote_addr + "\001" + inTime + "\001" + outTime + "\001" + inPage + "\001" + outPage + "\001" + referal + "\001" + pageVisits; 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/bean/PageViewsBean.java: -------------------------------------------------------------------------------- 1 | package com.awebone.bean; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class PageViewsBean implements Writable { 10 | 11 | private String session; 12 | private String remote_addr; 13 | private String timestr; 14 | private String request; 15 | private int step; 16 | private String staylong; 17 | private String referal; 18 | private String useragent; 19 | private String bytes_send; 20 | private String status; 21 | 22 | public void set(String session, String remote_addr, String useragent, String timestr, String request, int step, String staylong, String referal, String bytes_send, String status) { 23 | this.session = session; 24 | this.remote_addr = remote_addr; 25 | this.useragent = useragent; 26 | this.timestr = timestr; 27 | this.request = request; 28 | this.step = step; 29 | this.staylong = staylong; 30 | this.referal = referal; 31 | this.bytes_send = bytes_send; 32 | this.status = status; 33 | } 34 | 35 | public String getSession() { 36 | return session; 37 | } 38 | 39 | public void setSession(String session) { 40 | this.session = session; 41 | } 42 | 43 | public String getRemote_addr() { 44 | return remote_addr; 45 | } 46 | 47 | public void setRemote_addr(String remote_addr) { 48 | this.remote_addr = remote_addr; 49 | } 50 | 51 | public String getTimestr() { 52 | return timestr; 53 | } 54 | 55 | public void setTimestr(String timestr) { 56 | this.timestr = timestr; 57 | } 58 | 59 | public String getRequest() { 60 | return request; 61 | } 62 | 63 | public void setRequest(String request) { 64 | this.request = request; 65 | } 66 | 67 | public int getStep() { 68 | return step; 69 | } 70 | 71 | public void setStep(int step) { 72 | this.step = step; 73 | } 74 | 75 | public String getStaylong() { 76 | return staylong; 77 | } 78 | 79 | public void setStaylong(String staylong) { 80 | this.staylong = staylong; 81 | } 82 | 83 | public String getReferal() { 84 | return referal; 85 | } 86 | 87 | public void setReferal(String referal) { 88 | this.referal = 
referal; 89 | } 90 | 91 | public String getUseragent() { 92 | return useragent; 93 | } 94 | 95 | public void setUseragent(String useragent) { 96 | this.useragent = useragent; 97 | } 98 | 99 | public String getBytes_send() { 100 | return bytes_send; 101 | } 102 | 103 | public void setBytes_send(String bytes_send) { 104 | this.bytes_send = bytes_send; 105 | } 106 | 107 | public String getStatus() { 108 | return status; 109 | } 110 | 111 | public void setStatus(String status) { 112 | this.status = status; 113 | } 114 | 115 | public void readFields(DataInput in) throws IOException { 116 | this.session = in.readUTF(); 117 | this.remote_addr = in.readUTF(); 118 | this.timestr = in.readUTF(); 119 | this.request = in.readUTF(); 120 | this.step = in.readInt(); 121 | this.staylong = in.readUTF(); 122 | this.referal = in.readUTF(); 123 | this.useragent = in.readUTF(); 124 | this.bytes_send = in.readUTF(); 125 | this.status = in.readUTF(); 126 | } 127 | 128 | public void write(DataOutput out) throws IOException { 129 | out.writeUTF(session); 130 | out.writeUTF(remote_addr); 131 | out.writeUTF(timestr); 132 | out.writeUTF(request); 133 | out.writeInt(step); 134 | out.writeUTF(staylong); 135 | out.writeUTF(referal); 136 | out.writeUTF(useragent); 137 | out.writeUTF(bytes_send); 138 | out.writeUTF(status); 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/report/AreaRequestDistributionJob.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.report 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 8 | 9 | /** 10 | * 广告请求地域分布统计 11 | * 省市/城市 总请求 有效请求 广告请求 |参与竞价数 竞价成功数 竞价成功率 |展示量 点击量 点击率 |广告成本 广告消费 12 | * 汇总结果,是可以保存到mysql(hbase)表中的,全量结果不建议保存到mysql 13 | */ 14 | object AreaRequestDistributionJob { 15 | def main(args: Array[String]): Unit = { 16 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 17 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 18 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 19 | 20 | if(args == null || args.length < 2){ 21 | println( 22 | """Parameter Errors! Usage:
23 | |inputpath : input path 24 | |table : mysql table name 25 | """.stripMargin) 26 | System.exit(-1) 27 | } 28 | val Array(inputpath, table) = args 29 | 30 | val conf: SparkConf = new SparkConf().setAppName("AreaRequestDistributionJob").setMaster("local[*]") 31 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 32 | 33 | val input: DataFrame = spark.read.parquet(inputpath) 34 | input.createOrReplaceTempView("logs") 35 | 36 | val sql = 37 | """ 38 | |select 39 | | date_sub(current_date(), 1) data_date, 40 | | provincename province, 41 | | cityname city, 42 | | sum(if(requestmode = 1 and processnode >= 1, 1, 0)) orginal_req, 43 | | sum(if(requestmode = 1 and processnode >= 2, 1, 0)) valid_req, 44 | | sum(if(requestmode = 1 and processnode = 3, 1, 0)) ad_req, 45 | | sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and isbid = 1 and adorderid != 0 46 | | then 1 47 | | else 0 48 | | end) tpi_bid_num, 49 | | sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 50 | | then 1 51 | | else 0 52 | | end) win_bid_num, 53 | | sum(case when requestmode = 2 and iseffective = 1 54 | | then 1 55 | | else 0 56 | | end) show_ad_master_num, 57 | | sum(case when requestmode = 3 and iseffective = 1 58 | | then 1 59 | | else 0 60 | | end) click_ad_master_num, 61 | | sum(case when requestmode = 2 and iseffective = 1 and isbilling = 1 62 | | then 1 63 | | else 0 64 | | end) show_ad_media_num, 65 | | sum(case when requestmode = 3 and iseffective = 1 and isbilling = 1 66 | | then 1 67 | | else 0 68 | | end) click_ad_media_num, 69 | | round(sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 and adorderid >=200000 and adcreativeid >=200000 70 | | then winprice 71 | | else 0.0 72 | | end) / 1000, 2) dsp_ad_xf, 73 | | round(sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 and adorderid >=200000 and adcreativeid >=200000 74 | | then adpayment 75 | | else 0.0 76 | | end) / 1000, 2) dsp_ad_cost 77 | |from logs 78 | |group by provincename, cityname 79 | """.stripMargin 80 | 81 | val url = "jdbc:mysql://hadoop01:3306/dmp" 82 | val properties = new Properties 83 | properties.put("user","root") 84 | properties.put("password","root") 85 | 86 | spark.sql(sql).write.mode(SaveMode.Append).jdbc(url,table,properties) 87 | 88 | spark.stop() 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/bean/WebLogBean.java: -------------------------------------------------------------------------------- 1 | package com.awebone.bean; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class WebLogBean implements Writable { 10 | private boolean valid = true; // 判断数据是否合法 11 | private String remote_addr; // 记录客户端的ip地址 12 | private String remote_user; // 记录客户端用户名称,忽略属性"-" 13 | private String time_local; // 记录访问时间与时区 14 | private String request; // 记录请求的url与http协议 15 | private String status; // 记录请求状态;成功是200 16 | private String body_bytes_sent; // 记录发送给客户端文件主体内容大小 17 | private String http_referer; // 用来记录从那个页面链接访问过来的 18 | private String http_user_agent; // 记录客户浏览器的相关信息 19 | 20 | public boolean isValid() { 21 | return valid; 22 | } 23 | 24 | public void setValid(boolean valid) { 25 | this.valid = valid; 26 | } 27 | 28 | public String getRemote_addr() { 29 | return 
remote_addr; 30 | } 31 | 32 | public void setRemote_addr(String remote_addr) { 33 | this.remote_addr = remote_addr; 34 | } 35 | 36 | public String getRemote_user() { 37 | return remote_user; 38 | } 39 | 40 | public void setRemote_user(String remote_user) { 41 | this.remote_user = remote_user; 42 | } 43 | 44 | public String getTime_local() { 45 | return time_local; 46 | } 47 | 48 | public void setTime_local(String time_local) { 49 | this.time_local = time_local; 50 | } 51 | 52 | public String getRequest() { 53 | return request; 54 | } 55 | 56 | public void setRequest(String request) { 57 | this.request = request; 58 | } 59 | 60 | public String getStatus() { 61 | return status; 62 | } 63 | 64 | public void setStatus(String status) { 65 | this.status = status; 66 | } 67 | 68 | public String getBody_bytes_sent() { 69 | return body_bytes_sent; 70 | } 71 | 72 | public void setBody_bytes_sent(String body_bytes_sent) { 73 | this.body_bytes_sent = body_bytes_sent; 74 | } 75 | 76 | public String getHttp_referer() { 77 | return http_referer; 78 | } 79 | 80 | public void setHttp_referer(String http_referer) { 81 | this.http_referer = http_referer; 82 | } 83 | 84 | public String getHttp_user_agent() { 85 | return http_user_agent; 86 | } 87 | 88 | public void setHttp_user_agent(String http_user_agent) { 89 | this.http_user_agent = http_user_agent; 90 | } 91 | 92 | public WebLogBean() { 93 | super(); 94 | } 95 | 96 | public WebLogBean(boolean valid, String remote_addr, String remote_user, String time_local, String request, 97 | String status, String body_bytes_sent, String http_referer, String http_user_agent) { 98 | super(); 99 | this.valid = valid; 100 | this.remote_addr = remote_addr; 101 | this.remote_user = remote_user; 102 | this.time_local = time_local; 103 | this.request = request; 104 | this.status = status; 105 | this.body_bytes_sent = body_bytes_sent; 106 | this.http_referer = http_referer; 107 | this.http_user_agent = http_user_agent; 108 | } 109 | 110 | @Override 111 | public String toString() { 112 | return valid + "\001" + remote_addr + "\001" + remote_user + "\001" + time_local + "\001" + request + "\001" 113 | + status + "\001" + body_bytes_sent + "\001" + http_referer + "\001" + http_user_agent; 114 | } 115 | 116 | // 反序列化 117 | public void readFields(DataInput in) throws IOException { 118 | this.valid = in.readBoolean(); 119 | this.remote_addr = in.readUTF(); 120 | this.remote_user = in.readUTF(); 121 | this.time_local = in.readUTF(); 122 | this.request = in.readUTF(); 123 | this.status = in.readUTF(); 124 | this.body_bytes_sent = in.readUTF(); 125 | this.http_referer = in.readUTF(); 126 | this.http_user_agent = in.readUTF(); 127 | } 128 | 129 | // 序列化 130 | public void write(DataOutput out) throws IOException { 131 | out.writeBoolean(valid); 132 | out.writeUTF(remote_addr); 133 | out.writeUTF(remote_user); 134 | out.writeUTF(time_local); 135 | out.writeUTF(request); 136 | out.writeUTF(status); 137 | out.writeUTF(body_bytes_sent); 138 | out.writeUTF(http_referer); 139 | out.writeUTF(http_user_agent); 140 | } 141 | 142 | } 143 | -------------------------------------------------------------------------------- /weblog/src/main/java/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | 
dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /dmp/src/main/resources/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /mllib/src/main/resources/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | 
dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/click/ClickModel.java: -------------------------------------------------------------------------------- 1 | package com.awebone.click; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | import java.util.Comparator; 8 | 9 | import org.apache.commons.beanutils.BeanUtils; 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.LongWritable; 13 | import 
org.apache.hadoop.io.NullWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | import com.awebone.bean.PageViewsBean; 22 | import com.awebone.bean.VisitBean; 23 | import com.awebone.bean.WebLogBean; 24 | 25 | /** 26 | * map端: 相同的会话的数据 发送到 reduce 27 | * key: session 28 | * value: 其他的字段 29 | 访问时间 url step 外链 ip 30 | reduce端: 31 | 相同session的数据过来了 32 | 按照step排序 33 | list 第一个开始 34 | list 最后一个结束 35 | 封装 发送 36 | * 37 | */ 38 | public class ClickModel { 39 | static class ClickModelMapper extends Mapper{ 40 | Text mk = new Text(); 41 | PageViewsBean pbean = new PageViewsBean(); 42 | 43 | @Override 44 | protected void map(LongWritable key, Text value, 45 | Mapper.Context context) 46 | throws IOException, InterruptedException { 47 | String[] fields = value.toString().split("\001"); 48 | if (fields.length == 11){ 49 | mk.set(fields[0]); 50 | int step=Integer.parseInt(fields[6]); 51 | pbean.set(fields[0], fields[1], fields[10], fields[3], fields[4],step, 52 | fields[5], fields[9], fields[8], fields[7]); 53 | context.write(mk, pbean); 54 | } 55 | } 56 | } 57 | 58 | static class ClickModelReducer extends Reducer{ 59 | VisitBean vb=new VisitBean(); 60 | 61 | @Override 62 | protected void reduce(Text key, Iterable values, 63 | Reducer.Context context) 64 | throws IOException, InterruptedException { 65 | ArrayList list = new ArrayList(); 66 | for (PageViewsBean v:values){ 67 | PageViewsBean pb = new PageViewsBean(); 68 | try { 69 | BeanUtils.copyProperties(pb, v); 70 | list.add(pb); 71 | } catch (IllegalAccessException e) { 72 | // TODO Auto-generated catch block 73 | e.printStackTrace(); 74 | } catch (InvocationTargetException e) { 75 | // TODO Auto-generated catch block 76 | e.printStackTrace(); 77 | } 78 | } 79 | 80 | Collections.sort(list, new Comparator() { 81 | public int compare(PageViewsBean o1, PageViewsBean o2) { 82 | if(o1 == null || o2 == null){ 83 | return 0; 84 | } 85 | return o1.getStep()-o2.getStep(); 86 | } 87 | }); 88 | 89 | //构造发送的对象 90 | vb.set(key.toString(), list.get(0).getRemote_addr(), 91 | list.get(0).getTimestr(), list.get(list.size()-1).getTimestr(), 92 | list.get(0).getRequest(), list.get(list.size()-1).getRequest(), 93 | list.get(0).getReferal(), list.get(list.size()-1).getStep()); 94 | context.write(vb, NullWritable.get()); 95 | } 96 | } 97 | 98 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException { 99 | System.setProperty("HADOOP_USER_NAME", "hadoop"); 100 | Configuration conf = new Configuration(); 101 | conf.set("fs.defaultFS", "hdfs://myha/"); 102 | Job job = Job.getInstance(conf); 103 | 104 | job.setJarByClass(ClickModel.class); 105 | 106 | job.setMapperClass(ClickModelMapper.class); 107 | job.setReducerClass(ClickModelReducer.class); 108 | 109 | job.setMapOutputKeyClass(Text.class); 110 | job.setMapOutputValueClass(PageViewsBean.class); 111 | job.setOutputKeyClass(VisitBean.class); 112 | job.setOutputValueClass(NullWritable.class); 113 | 114 | FileInputFormat.setInputPaths(job, new Path("/weblog/click/stream/20200221")); 115 | FileOutputFormat.setOutputPath(job, new Path("/weblog/click/model/20200221")); 116 | 117 | boolean res = job.waitForCompletion(true); 118 | System.exit(res ? 
0 : 1); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /akka_rpc/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | awebone 8 | akka_rpc 9 | 1.0-SNAPSHOT 10 | 11 | akka_rpc 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF8 17 | 1.8 18 | 1.8 19 | UTF-8 20 | 2.11.8 21 | 2.11.8 22 | 2.4.17 23 | 24 | 25 | 26 | 27 | org.scala-lang 28 | scala-library 29 | ${scala.version} 30 | 31 | 32 | 33 | com.typesafe.akka 34 | akka-actor_2.11 35 | ${akka.version} 36 | 37 | 38 | 39 | 40 | org.scala-lang 41 | scala-actors 42 | ${scala.actors.version} 43 | 44 | 45 | 46 | com.typesafe.akka 47 | akka-remote_2.11 48 | ${akka.version} 49 | 50 | 51 | 52 | 53 | org.apache.hadoop 54 | hadoop-client 55 | 2.7.6 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | net.alchim31.maven 65 | scala-maven-plugin 66 | 3.2.2 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-compiler-plugin 71 | 3.5.1 72 | 73 | 74 | 75 | 76 | 77 | net.alchim31.maven 78 | scala-maven-plugin 79 | 80 | 81 | scala-compile-first 82 | process-resources 83 | 84 | add-source 85 | compile 86 | 87 | 88 | 89 | scala-test-compile 90 | process-test-resources 91 | 92 | testCompile 93 | 94 | 95 | 96 | 97 | 98 | 99 | org.apache.maven.plugins 100 | maven-compiler-plugin 101 | 102 | 103 | compile 104 | 105 | compile 106 | 107 | 108 | 109 | 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-shade-plugin 114 | 2.4.3 115 | 116 | 117 | package 118 | 119 | shade 120 | 121 | 122 | 123 | 124 | *:* 125 | 126 | META-INF/*.SF 127 | META-INF/*.DSA 128 | META-INF/*.RSA 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /mllib/src/main/scala/com/awebone/spark/MovieLensSparkShell.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.spark 2 | 3 | import org.apache.spark.mllib.evaluation.RegressionMetrics 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} 7 | import org.apache.spark.sql.{SQLContext, SparkSession} 8 | 9 | case class Movie(movieId: Int, title: String, genres: Seq[String]) 10 | 11 | case class User(userId: Int, gender: String, age: Int, occupation: Int, zip: String) 12 | 13 | object DataProcess { 14 | //获取程序入口 15 | val sparkConf: SparkConf = new SparkConf() 16 | sparkConf.setAppName(DataProcess.getClass.getSimpleName) 17 | sparkConf.setMaster("local") 18 | val sc: SparkContext = new SparkContext(sparkConf) 19 | val sqlContext = new SQLContext(sc) 20 | 21 | import sqlContext.implicits._ 22 | // val spark:SparkSession = SparkSession.builder().appName("MyFirstSparkSQL").config("someKey", "someValue").master("local").getOrCreate() 23 | 24 | //Define parse function 25 | def parseMovie(str: String): Movie = { 26 | val fields = str.split("::") 27 | assert(fields.size == 3) 28 | Movie(fields(0).toInt, fields(1).toString, Seq(fields(2))) 29 | } 30 | 31 | def parseUser(str: String): User = { 32 | val fields = str.split("::") 33 | assert(fields.size == 5) 34 | User(fields(0).toInt, fields(1).toString, fields(2).toInt, fields(3).toInt, fields(4).toString) 35 | } 36 | 37 | def parseRating(str: String): Rating = { 38 | val fields = str.split("::") 39 | assert(fields.size == 4) 40 | Rating(fields(0).toInt, fields(1).toInt, fields(2).toInt) 41 | } 42 | 43 | //Rating analysis 44 
| val ratingText = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\ratings.dat") 45 | ratingText.first() 46 | val ratingRDD = ratingText.map(parseRating).cache() 47 | println("Total number of ratings: " + ratingRDD.count()) 48 | println("Total number of movies rated: " + ratingRDD.map(_.product).distinct().count()) 49 | println("Total number of users who rated movies: " + ratingRDD.map(_.user).distinct().count()) 50 | 51 | //Create DataFrames 52 | val ratingDF = ratingRDD.toDF 53 | // val ratingDF = spark.createDataFrame(ratingRDD) 54 | val movieDF = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\movies.dat").map(parseMovie).toDF 55 | val userDF = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\users.dat").map(parseUser).toDF 56 | ratingDF.printSchema 57 | // ratingDF.show 58 | movieDF.printSchema 59 | userDF.printSchema 60 | 61 | //注册成表 62 | ratingDF.registerTempTable("ratings") 63 | // ratingDF.createOrReplaceTempView(“ratings”) 64 | movieDF.registerTempTable("movies") 65 | userDF.registerTempTable("users") 66 | 67 | //数据探索 68 | val rantingMovies = sqlContext.sql( 69 | """ 70 | |select title,ramx,rmin,ucnt from 71 | |(select product, max(rating) as rmax, min(rating) as rmin, count(distinct user) as ucnt from ratings group by product) rantingsCNT 72 | |join movies on product=movieId 73 | |order by ucnt desc 74 | """.stripMargin) 75 | rantingMovies.show() 76 | 77 | val mostActiveUser = sqlContext.sql( 78 | """ 79 | |select user,count(*) as cnt 80 | |from ratings group by user order by cnt desc limit 10 81 | """.stripMargin) 82 | mostActiveUser.show() 83 | 84 | val userRating = sqlContext.sql( 85 | """ 86 | |select distinct title,rating 87 | |from ratings join movies on movieId=product 88 | |where user=4169 and rating>4 89 | """.stripMargin) 90 | userRating.show() 91 | 92 | //ALS model 93 | //数据切分 94 | val splitsData = ratingRDD.randomSplit(Array(0.8, 0.2), 0L) 95 | val trainingSet = splitsData(0).cache() 96 | val testSet = splitsData(0).cache() 97 | trainingSet.count() 98 | testSet.count() 99 | 100 | //构建模型 101 | val model = new ALS() 102 | .setRank(20) 103 | .setIterations(10) 104 | .run(trainingSet) 105 | 106 | //进行推荐 107 | val recomForTopUser = model.recommendProducts(4169, 5) 108 | val movieTitle = movieDF.rdd.map(x => (x(0), x(1))).collectAsMap 109 | val recomResult = recomForTopUser.map(rating => (movieTitle(rating.product), rating.rating)).foreach(println) 110 | 111 | //测试集预测 112 | val testUserProduct = testSet.map { 113 | case Rating(user, product, rating) => (user, product) 114 | } 115 | val testUserProductPredict = model.predict(testUserProduct) 116 | testUserProductPredict.take(10).mkString("\n") 117 | 118 | //模型评估 119 | val testSetPair = testSet.map { 120 | case Rating(user, product, rating) => ((user, product), rating) 121 | } 122 | val predictionsPair = testUserProductPredict.map { 123 | case Rating(user, product, rating) => ((user, product), rating) 124 | } 125 | 126 | val joinTestPredict = testSetPair.join(predictionsPair) 127 | val mae = joinTestPredict.map { 128 | case ((user, product), (ratingT, ratingP)) => 129 | val err = ratingT - ratingP 130 | Math.abs(err) 131 | }.mean() 132 | val fp = joinTestPredict.filter { 133 | case ((user, product), (ratingT, ratingP)) => 134 | (ratingT <= 1 & ratingP >= 4) 135 | }.count() 136 | 137 | //使用库进行评估 138 | val ratingTP = joinTestPredict.map { 139 | case ((user, product), (ratingT, ratingP)) => 140 | (ratingP, ratingT) 141 | } 142 | val evalutor = 
new RegressionMetrics(ratingTP) 143 | evalutor.meanAbsoluteError 144 | evalutor.rootMeanSquaredError 145 | } 146 | -------------------------------------------------------------------------------- /weblog/src/main/java/hive-op.txt: -------------------------------------------------------------------------------- 1 | 启动服务 2 | nohup hiveserver2 1>~/logs/hive_std.log 2>~/logs/hive_err.log & 3 | 4 | 连接服务 5 | beeline或者hive 6 | !connect jdbc:hive2://hadoop04:10000 7 | show databases; 8 | show tables; 9 | 10 | 11 | 创建表 12 | ODS层 13 | 原始数据表: 14 | create database if not exists weblog; 15 | use weblog; 16 | drop table if exists weblog.ods_weblog_origin; 17 | create table weblog.ods_weblog_origin( 18 | valid string, 19 | remote_addr string, 20 | remote_user string, 21 | time_local string, 22 | request string, 23 | status string, 24 | body_bytes_sent string, 25 | http_referer string, 26 | http_user_agent string) 27 | partitioned by (datestr string) 28 | row format delimited 29 | fields terminated by '\001'; 30 | 31 | 32 | dw层 33 | 点击流事件表: 34 | create database if not exists weblog; 35 | use weblog; 36 | drop table if exists weblog.click_stream_pageviews; 37 | create table weblog.click_stream_pageviews ( 38 | session string, 39 | remote_addr string, 40 | remote_user string, 41 | time_local string, 42 | request string, 43 | page_staylong string, 44 | visit_step string, 45 | status string, 46 | body_bytes_sent string, 47 | http_referer string, 48 | http_user_agent string) 49 | partitioned by (datestr string) 50 | row format delimited 51 | fields terminated by '\001'; 52 | 53 | 会话访问统计表 点击流访客表 54 | create database if not exists weblog; 55 | use weblog; 56 | drop table if exists weblog.click_stream_visit; 57 | create table weblog.click_stream_visit( 58 | session string, 59 | remote_addr string, 60 | inTime string, 61 | outTime string, 62 | inPage string, 63 | outPage string, 64 | referal string, 65 | pageVisits int) 66 | partitioned by (datestr string); 67 | 68 | 69 | 加载数据 70 | /weblog/pre/20200221 原始表 71 | load data inpath '/weblog/pre/20200221' into table weblog.ods_weblog_origin partition(datestr = "20200221"); 72 | 73 | /weblog/click/stream/20200221 点击流事件表 74 | load data inpath "/weblog/click/stream/20200221" into table weblog.click_stream_pageviews partition(datestr ="20200221"); 75 | 76 | /weblog/click/model/20200221 点击流访客表 77 | load data inpath "/weblog/click/model/20200221" into table weblog.click_stream_visit partition(datestr ="20200221"); 78 | 79 | 查询数据 80 | select * from weblog.ods_weblog_origin limit 1; 81 | select * from weblog.click_stream_pageviews limit 1; 82 | select * from weblog.click_stream_visit limit 1; 83 | 84 | 85 | dw层创建明细宽表: 86 | create database if not exists weblog; 87 | use weblog; 88 | drop table if exists weblog.ods_weblog_detail; 89 | create table weblog.ods_weblog_detail( 90 | valid string comment "有效标识", 91 | remote_addr string comment "来源 IP", 92 | remote_user string comment "用户标识", 93 | time_local string comment "访问完整时间", 94 | daystr string comment "访问日期", 95 | timestr string comment "访问时间", 96 | year string comment "访问年", 97 | month string comment "访问月", 98 | day string comment "访问日", 99 | hour string comment "访问时", 100 | request string comment "请求的 url", 101 | status string comment "响应码", 102 | body_bytes_sent string comment "传输字节数", 103 | http_referer string comment "来源 url", 104 | ref_host string comment "来源的 host", 105 | ref_path string comment "来源的路径", 106 | ref_query string comment "来源参数 query", 107 | ref_query_id string comment "来源参数 query 的值", 108 | http_user_agent 
string comment "客户终端标识" 109 | ) 110 | partitioned by(datestr string) 111 | row format delimited fields terminated by '\001'; 112 | 113 | 114 | 设置本地模式和打印表头 115 | set hive.exec.mode.local.auto=true; 116 | set hive.cli.print.header=true; 117 | 118 | 119 | 解析url:解析外链的信息 120 | create database if not exists weblog; 121 | use weblog; 122 | drop table if exists weblog.t_ods_tmp_referurl; 123 | create table weblog.t_ods_tmp_referurl as 124 | SELECT a.*, b.* 125 | FROM ods_weblog_origin a 126 | LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH', 'QUERY','QUERY:id') b 127 | as host, path, query, query_id; 128 | 129 | 查询外链信息临时表 130 | select * from weblog.t_ods_tmp_referurl a where a.host is not null limit 1; 131 | 132 | 最终明细宽表 133 | create database if not exists weblog; 134 | use weblog; 135 | drop table if exists weblog.t_ods_tmp_detail; 136 | create table weblog.t_ods_tmp_detail as 137 | select b.*,substring(time_local,0,10) as daystr, 138 | substring(time_local,11) as tmstr, 139 | substring(time_local,0,4) as year, 140 | substring(time_local,6,2) as month, 141 | substring(time_local,9,2) as day, 142 | substring(time_local,12,2) as hour 143 | From t_ods_tmp_referurl b; 144 | 145 | 查询宽表 146 | select * from weblog.t_ods_tmp_detail where month is not null limit 3; 147 | 148 | 149 | 统计日志中的相关指标 150 | 1)pv:page view 151 | click_stream_pageviews 76 152 | select count(*) from click_stream_pageviews; 153 | 154 | 2)uv:独立用户数 独立会话数,统计的会话的个数 155 | click_stream_visit 57 156 | select count(*) from click_stream_visit; 157 | 158 | 3)dv:平均每一个会话的访问深度,所有的pv / uv 159 | 关联 160 | set hive.strict.checks.cartesian.product=false; 161 | set hive.mapred.mode=nonstrict; 162 | 163 | select a.pv/b.uv avgdv 164 | from 165 | (select count(*) pv from click_stream_pageviews ) a join 166 | (select count(*) uv from click_stream_visit) b; 167 | 168 | 4)转化率 169 | 数据order.txt 170 | 1,广告,10000 171 | 2,菜单,3000 172 | 3,商品详情,2600 173 | 4,购物车,300 174 | 5,下单,200 175 | 6,支付,190 176 | 7,支付成功,189 177 | 178 | 建表加载数据 179 | create database if not exists hive_order; 180 | use hive_order; 181 | drop table if exists t_order; 182 | create table t_order(step int, name string, pv int) row format delimited fields terminated by ","; 183 | load data local inpath "/home/hadoop/tmpdata/order.txt" into table t_order; 184 | select * from t_order limit 10; 185 | 186 | 查转化率 187 | select step,name,pv,pv/lpv t 188 | from 189 | (select step,name,pv,lag(pv,1,pv) over(order by step) lpv from t_order) a; 190 | 191 | -------------------------------------------------------------------------------- /mllib/src/main/resources/ml-1m/README: -------------------------------------------------------------------------------- 1 | SUMMARY 2 | ================================================================================ 3 | 4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 5 | made by 6,040 MovieLens users who joined MovieLens in 2000. 6 | 7 | USAGE LICENSE 8 | ================================================================================ 9 | 10 | Neither the University of Minnesota nor any of the researchers 11 | involved can guarantee the correctness of the data, its suitability 12 | for any particular purpose, or the validity of results based on the 13 | use of the data set. The data set may be used for any research 14 | purposes under the following conditions: 15 | 16 | * The user may not state or imply any endorsement from the 17 | University of Minnesota or the GroupLens Research Group. 
18 | 19 | * The user must acknowledge the use of the data set in 20 | publications resulting from the use of the data set, and must 21 | send us an electronic or paper copy of those publications. 22 | 23 | * The user may not redistribute the data without separate 24 | permission. 25 | 26 | * The user may not use this information for any commercial or 27 | revenue-bearing purposes without first obtaining permission 28 | from a faculty member of the GroupLens Research Project at the 29 | University of Minnesota. 30 | 31 | If you have any further questions or comments, please contact GroupLens 32 | . 33 | 34 | ACKNOWLEDGEMENTS 35 | ================================================================================ 36 | 37 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data 38 | set. 39 | 40 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT 41 | ================================================================================ 42 | 43 | The GroupLens Research Project is a research group in the Department of 44 | Computer Science and Engineering at the University of Minnesota. Members of 45 | the GroupLens Research Project are involved in many research projects related 46 | to the fields of information filtering, collaborative filtering, and 47 | recommender systems. The project is lead by professors John Riedl and Joseph 48 | Konstan. The project began to explore automated collaborative filtering in 49 | 1992, but is most well known for its world wide trial of an automated 50 | collaborative filtering system for Usenet news in 1996. Since then the project 51 | has expanded its scope to research overall information filtering solutions, 52 | integrating in content-based methods as well as improving current collaborative 53 | filtering technology. 54 | 55 | Further information on the GroupLens Research project, including research 56 | publications, can be found at the following web site: 57 | 58 | http://www.grouplens.org/ 59 | 60 | GroupLens Research currently operates a movie recommender based on 61 | collaborative filtering: 62 | 63 | http://www.movielens.org/ 64 | 65 | RATINGS FILE DESCRIPTION 66 | ================================================================================ 67 | 68 | All ratings are contained in the file "ratings.dat" and are in the 69 | following format: 70 | 71 | UserID::MovieID::Rating::Timestamp 72 | 73 | - UserIDs range between 1 and 6040 74 | - MovieIDs range between 1 and 3952 75 | - Ratings are made on a 5-star scale (whole-star ratings only) 76 | - Timestamp is represented in seconds since the epoch as returned by time(2) 77 | - Each user has at least 20 ratings 78 | 79 | USERS FILE DESCRIPTION 80 | ================================================================================ 81 | 82 | User information is in the file "users.dat" and is in the following 83 | format: 84 | 85 | UserID::Gender::Age::Occupation::Zip-code 86 | 87 | All demographic information is provided voluntarily by the users and is 88 | not checked for accuracy. Only users who have provided some demographic 89 | information are included in this data set. 
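As a minimal sketch of reading this layout with the Spark RDD API used elsewhere in this repository (the relative path and the SparkContext value sc are assumptions for illustration), the file can be split on "::" and the documented age buckets tallied like so:

val users = sc.textFile("ml-1m/users.dat")            // assumed local path
  .map(_.split("::"))
  .filter(_.length == 5)                              // UserID::Gender::Age::Occupation::Zip-code
  .map(f => (f(0).toInt, f(1), f(2).toInt, f(3).toInt, f(4)))

// count users per documented age bucket: 1, 18, 25, 35, 45, 50, 56
users.map(_._3).countByValue().toSeq.sortBy(_._1).foreach(println)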
90 | 91 | - Gender is denoted by a "M" for male and "F" for female 92 | - Age is chosen from the following ranges: 93 | 94 | * 1: "Under 18" 95 | * 18: "18-24" 96 | * 25: "25-34" 97 | * 35: "35-44" 98 | * 45: "45-49" 99 | * 50: "50-55" 100 | * 56: "56+" 101 | 102 | - Occupation is chosen from the following choices: 103 | 104 | * 0: "other" or not specified 105 | * 1: "academic/educator" 106 | * 2: "artist" 107 | * 3: "clerical/admin" 108 | * 4: "college/grad student" 109 | * 5: "customer service" 110 | * 6: "doctor/health care" 111 | * 7: "executive/managerial" 112 | * 8: "farmer" 113 | * 9: "homemaker" 114 | * 10: "K-12 student" 115 | * 11: "lawyer" 116 | * 12: "programmer" 117 | * 13: "retired" 118 | * 14: "sales/marketing" 119 | * 15: "scientist" 120 | * 16: "self-employed" 121 | * 17: "technician/engineer" 122 | * 18: "tradesman/craftsman" 123 | * 19: "unemployed" 124 | * 20: "writer" 125 | 126 | MOVIES FILE DESCRIPTION 127 | ================================================================================ 128 | 129 | Movie information is in the file "movies.dat" and is in the following 130 | format: 131 | 132 | MovieID::Title::Genres 133 | 134 | - Titles are identical to titles provided by the IMDB (including 135 | year of release) 136 | - Genres are pipe-separated and are selected from the following genres: 137 | 138 | * Action 139 | * Adventure 140 | * Animation 141 | * Children's 142 | * Comedy 143 | * Crime 144 | * Documentary 145 | * Drama 146 | * Fantasy 147 | * Film-Noir 148 | * Horror 149 | * Musical 150 | * Mystery 151 | * Romance 152 | * Sci-Fi 153 | * Thriller 154 | * War 155 | * Western 156 | 157 | - Some MovieIDs do not correspond to a movie due to accidental duplicate 158 | entries and/or test entries 159 | - Movies are mostly entered by hand, so errors and inconsistencies may exist 160 | -------------------------------------------------------------------------------- /mllib/src/main/scala/com/awebone/spark/MovieLensALS.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.spark 2 | 3 | import java.io.File 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.mllib.evaluation.RegressionMetrics 7 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | 11 | import scala.util.Random 12 | 13 | object MovieLensALS { 14 | //1. Define a rating elicitation function 15 | def elicitateRating(movies: Seq[(Int, String)]) = { 16 | val prompt = "Please rate the following movie(1-5(best) or 0 if not seen: )" 17 | println(prompt) 18 | 19 | val ratings = movies.flatMap { x => 20 | var rating: Option[Rating] = None 21 | var vaild = false 22 | while (!vaild) { 23 | println(x._2 + " :") 24 | try { 25 | val r = Console.readInt() 26 | if (r > 5 || r < 0) { 27 | println(prompt) 28 | } else { 29 | vaild = true 30 | if (r > 0) { 31 | rating = Some(Rating(0, x._1, r)) 32 | } 33 | } 34 | } catch { 35 | case e: Exception => println(prompt) 36 | } 37 | } 38 | rating match { 39 | case Some(r) => Iterator(r) 40 | case None => Iterator.empty 41 | } 42 | } 43 | if (ratings.isEmpty) { 44 | error("No ratings provided!") 45 | } else { 46 | ratings 47 | } 48 | } 49 | 50 | //2. 
Define a RMSE computation function 51 | def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]) = { 52 | val prediction = model.predict(data.map(x => (x.user, x.product))) 53 | val predDataJoined = prediction 54 | .map(x => ((x.user, x.product), x.rating)) 55 | .join(data.map(x => ((x.user, x.product), x.rating))) 56 | .values 57 | new RegressionMetrics(predDataJoined).rootMeanSquaredError 58 | } 59 | 60 | //3. Main 61 | def main(args: Array[String]) = { 62 | //3.1 Setup env 63 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 64 | if (args.length != 1) { 65 | println("Usage: movieLensDir") 66 | sys.exit(1) 67 | } 68 | 69 | val conf = new SparkConf() 70 | .setAppName("MovieLensALS") 71 | .setMaster("local") 72 | .set("spark.executor.memory", "500m") 73 | val sc = new SparkContext(conf) 74 | 75 | //3.2 Load ratings data and know your data 76 | val movieLensHomeDir = args(0) 77 | val ratings = sc 78 | .textFile(new File(movieLensHomeDir, "ratings.dat").toString) 79 | .map { line => 80 | val fields = line.split("::") 81 | (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)) 82 | } 83 | val movies = sc 84 | .textFile(new File(movieLensHomeDir, "movies.dat").toString) 85 | .map { line => 86 | val fields = line.split("::") 87 | (fields(0).toInt, fields(1).toString) 88 | } 89 | .collectAsMap() 90 | 91 | val numRatings = ratings.count() 92 | val numUser = ratings.map(x => x._2.user).distinct().count() 93 | val numMovie = ratings.map(x => x._2.product).distinct().count() 94 | println("Got " + numRatings + " ratings from " + numUser + " users on " + numMovie + " movies.") 95 | 96 | //3.3 Elicitate personal rating 97 | val topMovies = ratings 98 | .map(_._2.product) 99 | .countByValue() 100 | .toSeq 101 | .sortBy(-_._2) 102 | .take(50) 103 | .map(_._1) 104 | val random = new Random(0) 105 | val selectMovies = topMovies 106 | .filter(x => random.nextDouble() < 0.2) 107 | .map(x => (x, movies(x))) 108 | 109 | val myRatings = elicitateRating(selectMovies) 110 | val myRatingsRDD = sc.parallelize(myRatings, 1) 111 | 112 | //3.4 Split data into train(60%), validation(20%) and test(20%) 113 | val numPartitions = 10 114 | val trainSet = ratings 115 | .filter(x => x._1 < 6) 116 | .map(_._2) 117 | .union(myRatingsRDD) 118 | .repartition(numPartitions) 119 | .persist() 120 | val validationSet = ratings 121 | .filter(x => x._1 >= 6 && x._1 < 8) 122 | .map(_._2) 123 | .persist() 124 | val testSet = ratings 125 | .filter(x => x._1 >= 8) 126 | .map(_._2) 127 | .persist() 128 | 129 | val numTrain = trainSet.count() 130 | val numValidation = validationSet.count() 131 | val numTest = testSet.count() 132 | println("Training data: " + numTrain + " Validation data: " + numValidation + " Test data: " + numTest) 133 | 134 | //3.5 Train model and optimize model with validation set 135 | val numRanks = List(8, 12) 136 | val numIters = List(10, 20) 137 | val numLambdas = List(0.1, 0.01) 138 | var bestRmse = Double.MaxValue 139 | var bestModel: Option[MatrixFactorizationModel] = None 140 | var bestRanks = -1 141 | var bestIters = 0 142 | var bestLambdas = -1.0 143 | 144 | for (rank <- numRanks; iter <- numIters; lambda <- numLambdas) { 145 | val model = ALS.train(trainSet, rank, iter, lambda) 146 | val validationRmse = computeRmse(model, validationSet) 147 | println("RMSE(validation) = " + validationRmse + " with ranks = " + rank + ", iter = " + iter + ", Lambda = " + lambda) 148 | 149 | if (validationRmse < bestRmse) { 150 | bestRmse = validationRmse 151 | bestModel = 
Some(model) 152 | bestIters = iter 153 | bestLambdas = lambda 154 | bestRanks = rank 155 | } 156 | } 157 | 158 | //3.6 Evaluate model with test set 159 | val testRmse = computeRmse(bestModel.get, testSet) 160 | println("The best model was trained with rank = " + bestRanks + ", iter = " + bestIters + ", Lambda = " + bestLambdas + " and compute RMSE on test set is " + testRmse) 161 | 162 | //3.7 Create a baseline and compare it with best model 163 | val meanRating = trainSet.union(validationSet).map(_.rating).mean() 164 | val baselineRmse = new RegressionMetrics(testSet.map(x => (x.rating, meanRating))).rootMeanSquaredError 165 | val improvement = (baselineRmse - testRmse) / baselineRmse * 100 166 | println("The best model improves the baseline by %1.2f".format(improvement) + "%.") 167 | 168 | //3.8 Make a personal recommendation 169 | val moviesId = myRatings.map(_.product) 170 | val candidates = sc.parallelize(movies.keys.filter(!moviesId.contains(_)).toSeq) 171 | val recommendations = bestModel.get 172 | .predict(candidates.map(x => (0, x))) 173 | .sortBy(-_.rating) 174 | .take(50) 175 | 176 | var i = 1 177 | println("Movies recommended for you: ") 178 | recommendations.foreach { line => 179 | println("%2d".format(i) + " : " + movies(line.product)) 180 | i += 1 181 | } 182 | 183 | sc.stop() 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/LogAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | import java.util.{Date, Properties} 6 | 7 | import org.apache.flink.api.common.functions.RuntimeContext 8 | import org.apache.flink.api.common.serialization.SimpleStringSchema 9 | import org.apache.flink.api.java.tuple.Tuple 10 | import org.apache.flink.streaming.api.TimeCharacteristic 11 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks 12 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 13 | import org.apache.flink.streaming.api.scala.function.WindowFunction 14 | import org.apache.flink.streaming.api.watermark.Watermark 15 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 16 | import org.apache.flink.streaming.api.windowing.time.Time 17 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 18 | import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer} 19 | import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink 20 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 21 | import org.apache.flink.util.Collector 22 | import org.apache.http.HttpHost 23 | import org.elasticsearch.action.index.IndexRequest 24 | import org.elasticsearch.client.Requests 25 | import org.slf4j.LoggerFactory 26 | 27 | import scala.collection.mutable.ArrayBuffer 28 | 29 | /** 30 | * 日志分析系统 31 | * * 功能: 32 | * * 最近一分钟每个域名产生的流量统计 33 | */ 34 | object LogAnalysis { 35 | 36 | def main(args: Array[String]): Unit = { 37 | //在生产上进行日志的输出,采用以下方式 38 | val logger = LoggerFactory.getLogger("LogAnalysis") 39 | 40 | val env = StreamExecutionEnvironment.getExecutionEnvironment 41 | //设置事件时间作为flink处理的基准时间 42 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 43 | import org.apache.flink.api.scala._ 44 | 45 | /** 46 | * 读取kafka集群数据 47 | */ 48 | val topic = "cdnlog" 49 | val properties: 
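// NOTE: the properties below configure the Kafka source. For the universal FlinkKafkaConsumer
// imported here, only "bootstrap.servers" and "group.id" are strictly needed; "zookeeper.connect"
// is a Kafka 0.8-era setting that newer clients ignore. A minimal equivalent setup (illustrative,
// mirroring the code that follows):
//   val props = new Properties()
//   props.setProperty("bootstrap.servers", "hadoop01:9092")
//   props.setProperty("group.id", "test-cdnlog")
//   val source = new FlinkKafkaConsumer[String]("cdnlog", new SimpleStringSchema(), props)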
Properties = new Properties() 50 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092") 51 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk 52 | // properties.setProperty("enable.auto.commit", "true") 53 | // properties.setProperty("bootstrap.servers","hadoop04:9092") 54 | properties.setProperty("group.id","test-cdnlog") 55 | 56 | val consumer = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties) 57 | val data = env.addSource(consumer) // 接受kafka数据 58 | // data.print().setParallelism(1) // 测试是否连通 59 | 60 | /** 61 | * 数据清洗: 62 | * 在生产上进行业务处理的时候,一定要考虑处理的健壮性以及数据的准确性 63 | * 脏数据或者是不符合业务规则的数据是需要全部过滤掉之后 64 | * 再进行相应业务逻辑的处理 65 | */ 66 | val logData = data.map(x => { 67 | val strings = x.split("\t") 68 | 69 | val level = strings(2) 70 | val timeStr = strings(3) 71 | var time = 0l 72 | try { 73 | val sourceFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") 74 | time = sourceFormat.parse(timeStr).getTime 75 | } catch { 76 | case e:Exception => { 77 | logger.error(s"time parse error: $timeStr", e.getMessage) 78 | } 79 | } 80 | 81 | val domain = strings(5) 82 | val traffic = strings(6).toLong 83 | (level, time, domain, traffic) 84 | }).filter(_._2 != 0).filter(_._1 == "E") 85 | .map(x => { 86 | (x._2, x._3, x._4) //数据清洗按照业务规则取相关数据 1level(不需要可以抛弃) 2time 3 domain 4traffic 87 | }) 88 | // logData.print.setParallelism(1) 89 | 90 | /** 91 | * Flink watermarks 定义 92 | * 设置timestamp和watermark,解决时序性问题 93 | * Windows function 使用 94 | * AssignerWithPeriodicWatermarks[T] 对应logdata的tuple类型 95 | */ 96 | val resultData = logData.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(Long, String, Long)] { 97 | //最大无序容忍的时间 10s 98 | val maxOutOfOrderness = 10000L // 3.5 seconds 99 | //当前最大的TimeStamp 100 | var currentMaxTimestamp: Long = _ 101 | 102 | //设置TimeStamp生成WaterMark 103 | override def getCurrentWatermark: Watermark = { 104 | new Watermark(currentMaxTimestamp - maxOutOfOrderness) 105 | } 106 | 107 | //抽取时间 108 | override def extractTimestamp(element: (Long, String, Long), previousElementTimestamp: Long): Long = { 109 | //获取数据的event time 110 | val timestamp: Long = element._1 111 | currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp) 112 | timestamp 113 | } 114 | }) //根据window进行业务逻辑的处理 最近一分钟每个域名产生的流量 115 | .keyBy(1) //以域名进行分组,按照域名进行keyby 116 | .window(TumblingEventTimeWindows.of(Time.seconds(60))) //每60秒为一个窗口,进行统计 117 | .apply(new WindowFunction[(Long, String, Long), (String, String, Long), Tuple, TimeWindow] { 118 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(Long, String, Long)], out: Collector[(String, String, Long)]): Unit = { 119 | val domain = key.getField(0).toString //拿到key,域名 120 | 121 | var sum = 0l 122 | val times = ArrayBuffer[Long]() 123 | val iterator = input.iterator 124 | while (iterator.hasNext) { 125 | val next = iterator.next() 126 | sum += next._3 //统计流量 127 | times.append(next._1) //记录这一分钟,格式:yyyy-MM-dd HH:mm 128 | } 129 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(times.max)) // 这一分钟的时间,格式化 130 | 131 | /** 132 | * 输出结果: 133 | * 第一个参数:这一分钟的时间 134 | * 第二个参数:域名 135 | * 第三个参数:traffic流量的和 136 | */ 137 | out.collect((time, domain, sum)) 138 | } 139 | }) 140 | resultData.print().setParallelism(1) 141 | 142 | 143 | /** 144 | * 连接es库,导入数据 145 | * 使用kibana可视化 146 | */ 147 | val httpHosts = new java.util.ArrayList[HttpHost] 148 | httpHosts.add(new HttpHost("redhat", 9200, "http")) 149 | 150 | val 
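// NOTE: the assigner above emits watermark = maxEventTime - 10s, so a 60-second tumbling window
// such as [12:01:00, 12:02:00) fires once an event with timestamp >= 12:02:10 has been seen, and
// events running more than 10s behind the largest timestamp risk being dropped after their window
// has fired. Each per-domain window emits one (minute, domain, totalTraffic) tuple, e.g.
// ("2020-02-21 12:01", "example.com", 8945L) -- values here are purely illustrative.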
esSinkBuilder = new ElasticsearchSink.Builder[(String, String, Long)]( 151 | httpHosts, 152 | new ElasticsearchSinkFunction[(String, String, Long)] { 153 | override def process(t: (String, String, Long), runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = { 154 | requestIndexer.add(createIndexRequest(t)) 155 | } 156 | 157 | def createIndexRequest(element: (String, String, Long)): IndexRequest = { 158 | val json = new java.util.HashMap[String, Any] 159 | json.put("time", element._1) 160 | json.put("domain", element._2) 161 | json.put("traffics", element._3) 162 | val id = element._1 + "-" + element._2 163 | return Requests.indexRequest() 164 | .index("cdn") 165 | .`type`("traffic") 166 | .id(id) 167 | .source(json) 168 | } 169 | } 170 | ) 171 | 172 | //设置要为每个批量请求缓冲的最大操作数 173 | esSinkBuilder.setBulkFlushMaxActions(1) 174 | resultData.addSink(esSinkBuilder.build()) //.setParallelism(5) 175 | env.execute("LogAnalysis") 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/Logs.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp 2 | 3 | import com.awebone.dmp.util.Utils 4 | import org.apache.commons.lang3.StringUtils 5 | 6 | case class Logs(val sessionid: String, //会话标识 7 | val advertisersid: Int, //广告主id 8 | val adorderid: Int, //广告id 9 | val adcreativeid: Int, //广告创意id ( >= 200000 : dsp , < 200000 oss) 10 | val adplatformproviderid: Int, //广告平台商id (>= 100000: rtb , < 100000 : api ) 11 | val sdkversionnumber: String, //sdk版本号 12 | val adplatformkey: String, //平台商key 13 | val putinmodeltype: Int, //针对广告主的投放模式,1:展示量投放 2:点击量投放 14 | val requestmode: Int, //数据请求方式(1:请求、2:展示、3:点击) 15 | val adprice: Double, //广告价格 16 | val adppprice: Double, //平台商价格 17 | val requestdate: String, //请求时间,格式为:yyyy-MM-dd hh:mm:ss 18 | val ip: String, //设备用户的真实ip地址 19 | val appid: String, //应用id 20 | val appname: String, //应用名称 21 | val uuid: String, //设备唯一标识,比如imei或者androidid等 22 | val device: String, //设备型号,如htc、iphone 23 | val client: Int, //设备类型 (1:android 2:ios 3:wp) 24 | val osversion: String, //设备操作系统版本,如4.0 25 | val density: String, //备屏幕的密度 android的取值为0.75、1、1.5,ios的取值为:1、2 26 | val pw: Int, //设备屏幕宽度 27 | val ph: Int, //设备屏幕高度 28 | val longitude: String, //设备所在经度 29 | val lat: String, //设备所在纬度 30 | val provincename: String, //设备所在省份名称 31 | val cityname: String, //设备所在城市名称 32 | val ispid: Int, //运营商id 33 | val ispname: String, //运营商名称 34 | val networkmannerid: Int, //联网方式id 35 | val networkmannername: String, //联网方式名称 36 | val iseffective: Int, //有效标识(有效指可以正常计费的)(0:无效 1:有效) 37 | val isbilling: Int, //是否收费(0:未收费 1:已收费) 38 | val adspacetype: Int, //广告位类型(1:banner 2:插屏 3:全屏) 39 | val adspacetypename: String, //广告位类型名称(banner、插屏、全屏) 40 | val devicetype: Int, //设备类型(1:手机 2:平板) 41 | val processnode: Int, //流程节点(1:请求量kpi 2:有效请求 3:广告请求) 42 | val apptype: Int, //应用类型id 43 | val district: String, //设备所在县名称 44 | val paymode: Int, //针对平台商的支付模式,1:展示量投放(CPM) 2:点击量投放(CPC) 45 | val isbid: Int, //是否rtb 46 | val bidprice: Double, //rtb竞价价格 47 | val winprice: Double, //rtb竞价成功价格 48 | val iswin: Int, //是否竞价成功 49 | val cur: String, //values:usd|rmb等 50 | val rate: Double, //汇率 51 | val cnywinprice: Double, //rtb竞价成功转换成人民币的价格 52 | val imei: String, //imei 53 | val mac: String, //mac 54 | val idfa: String, //idfa 55 | val openudid: String, //openudid 56 | val androidid: String, //androidid 57 | val rtbprovince: String, //rtb 省 58 | val rtbcity: String, //rtb 市 59 | val 
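// NOTE: the fields above and below document the raw ad-log schema; each record arrives as one
// comma-separated line and is mapped onto this case class largely positionally by Logs.line2Logs
// further down, so the field order here has to match the column order of the input file.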
rtbdistrict: String, //rtb 区 60 | val rtbstreet: String, //rtb 街道 61 | val storeurl: String, //app的市场下载地址 62 | val realip: String, //真实ip 63 | val isqualityapp: Int, //优选标识 64 | val bidfloor: Double, //底价 65 | val aw: Int, //广告位的宽 66 | val ah: Int, //广告位的高 67 | val imeimd5: String, //imei_md5 68 | val macmd5: String, //mac_md5 69 | val idfamd5: String, //idfa_md5 70 | val openudidmd5: String, //openudid_md5 71 | val androididmd5: String, //androidid_md5 72 | val imeisha1: String, //imei_sha1 73 | val macsha1: String, //mac_sha1 74 | val idfasha1: String, //idfa_sha1 75 | val openudidsha1: String, //openudid_sha1 76 | val androididsha1: String, //androidid_sha1 77 | val uuidunknow: String, //uuid_unknow tanx密文 78 | val decuuidunknow: String, // 解密的tanx 明文 79 | val userid: String, //平台用户id 80 | val reqdate: String, //日期 81 | val reqhour: String, //小时 82 | val iptype: Int, //表示ip库类型,1为点媒ip库,2为广告协会的ip地理信息标准库,默认为1 83 | val initbidprice: Double, //初始出价 84 | val adpayment: Double, //转换后的广告消费(保留小数点后6位) 85 | val agentrate: Double, //代理商利润率 86 | val lomarkrate: Double, //代理利润率 87 | val adxrate: Double, //媒介利润率 88 | val title: String, //标题 89 | val keywords: String, //关键字 90 | val tagid: String, //广告位标识(当视频流量时值为视频ID号) 91 | val callbackdate: String, //回调时间 格式为:YYYY/mm/dd hh:mm:ss 92 | val channelid: String, //频道ID 93 | val mediatype: Int ) {//媒体类型:1长尾媒体 2视频媒体 3独立媒体 默认:1) 94 | 95 | } 96 | 97 | object Logs { 98 | 99 | // 生成一个空的对象 100 | def makeLogs(): Logs = { 101 | new Logs("", 0, 0, 0, 0, "", "", 0, 0, 0.0, 0.0, "", "", "", "", "", "", 0, "", 102 | "", 0, 0, "", "", "", "", 0, "", 0, "", 0, 0, 0, "", 0, 0, 0, "", 0, 0, 103 | 0.0, 0.0, 0, "", 0.0, 0.0, "", "", "", "", "", "", "", "", "", "", "", 0, 0.0, 0, 0, 104 | "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 0, 0.0, 0.0, 0.0, 0.0, 0.0, "", "", "", "", "", 0 105 | ) 106 | } 107 | 108 | def line2Logs(s:String):Logs ={ 109 | if(StringUtils.isNotEmpty(s)){ 110 | val fields = s.split(",") 111 | if(fields.length >= 79){ 112 | Logs(fields(0), Utils.parseInt(fields(1)), Utils.parseInt(fields(2)), Utils.parseInt(fields(3)), Utils.parseInt(fields(4)), fields(5), fields(6), Utils.parseInt(fields(7)), Utils.parseInt(fields(8)), Utils.parseDouble(fields(9)), Utils.parseDouble(fields(10)), 113 | fields(11), fields(12), fields(13), fields(14), fields(15), fields(16), Utils.parseInt(fields(17)), fields(18), fields(19), Utils.parseInt(fields(20)), 114 | Utils.parseInt(fields(21)), fields(22), fields(23), fields(24), fields(25), Utils.parseInt(fields(26)), fields(27), Utils.parseInt(fields(28)), fields(29), Utils.parseInt(fields(30)), 115 | Utils.parseInt(fields(31)), Utils.parseInt(fields(32)), fields(33), Utils.parseInt(fields(34)), Utils.parseInt(fields(35)), Utils.parseInt(fields(36)), fields(37), Utils.parseInt(fields(38)), Utils.parseInt(fields(39)), Utils.parseDouble(fields(40)), 116 | Utils.parseDouble(fields(41)), Utils.parseInt(fields(42)), fields(43), Utils.parseDouble(fields(44)), Utils.parseDouble(fields(45)), fields(46), fields(47), fields(48), fields(49), fields(50), 117 | fields(51), fields(52), fields(53), fields(54), fields(55), fields(56), Utils.parseInt(fields(57)), Utils.parseDouble(fields(58)), Utils.parseInt(fields(59)), Utils.parseInt(fields(60)), 118 | fields(61), fields(62), fields(63), fields(64), fields(65), fields(66), fields(67), fields(68), fields(69), fields(70), 119 | fields(71), "", fields(72), Utils.fmtDate(fields(11)).getOrElse("unkown"), Utils.fmtHour(fields(11)).getOrElse("unkown"), 120 | Utils.parseInt(fields(73)), 
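// NOTE: line2Logs maps the >= 79 comma-separated columns positionally onto the case class.
// Utils.parseInt / Utils.parseDouble come from Utils.scala, which is not shown here; they are
// presumably null-safe parsers along the lines of this sketch:
//   def parseInt(s: String): Int = scala.util.Try(s.trim.toInt).getOrElse(0)
//   def parseDouble(s: String): Double = scala.util.Try(s.trim.toDouble).getOrElse(0.0)
// Rows with fewer than 79 columns, or empty lines, fall back to the empty record from makeLogs().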
Utils.parseDouble(fields(74)), Utils.parseDouble(fields(75)), Utils.parseDouble(fields(76)), Utils.parseDouble(fields(77)), Utils.parseDouble(fields(78)), "", "", "", "", "", 1) 121 | }else{ 122 | makeLogs() 123 | } 124 | }else{ 125 | makeLogs() 126 | } 127 | } 128 | } -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/click/ClickSessionStream.java: -------------------------------------------------------------------------------- 1 | package com.awebone.click; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.text.ParseException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.Comparator; 10 | import java.util.Date; 11 | import java.util.Iterator; 12 | import java.util.UUID; 13 | 14 | import org.apache.commons.beanutils.BeanUtils; 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.LongWritable; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 25 | 26 | import com.awebone.bean.WebLogBean; 27 | 28 | /** 29 | * 抽取,转化 点击会话流的数据 30 | * map端: 31 | * key: ip 32 | * value: 自定义类 字符串 33 | * reduce: 34 | * 相同ip的数据 35 | * 排序 按照访问时间 升序 排序 36 | * 计算相邻两个的时间差 37 | * 判断 38 | * 39 | */ 40 | public class ClickSessionStream { 41 | static class ClickSessionStreamMapper extends Mapper{ 42 | Text mk = new Text(); 43 | WebLogBean bean = new WebLogBean(); 44 | 45 | @Override 46 | protected void map(LongWritable key, Text value, Mapper.Context context) 47 | throws IOException, InterruptedException { 48 | String line = value.toString(); 49 | String[] pre_datas = line.split("\001"); 50 | if(pre_datas.length==9){ 51 | bean.setValid(pre_datas[0].equals("true")?true:false); 52 | bean.setRemote_addr(pre_datas[1]); 53 | bean.setRemote_user(pre_datas[2]); 54 | bean.setTime_local(pre_datas[3]); 55 | bean.setRequest(pre_datas[4]); 56 | bean.setStatus(pre_datas[5]); 57 | bean.setBody_bytes_sent(pre_datas[6]); 58 | bean.setHttp_referer(pre_datas[7]); 59 | bean.setHttp_user_agent(pre_datas[8]); 60 | 61 | //过滤数据 62 | if(bean.isValid()){ 63 | mk.set(bean.getRemote_addr()); 64 | context.write(mk, bean); 65 | } 66 | } 67 | } 68 | 69 | } 70 | 71 | static class ClickSessionStreamReducer extends Reducer{ 72 | Text rk = new Text(); 73 | 74 | @Override 75 | protected void reduce(Text key, Iterable values, 76 | Reducer.Context context) throws IOException, InterruptedException { 77 | //相同ip的所有数据,循环遍历放在list中,按时间升序排序 78 | ArrayList list = new ArrayList(); 79 | //reducer的坑:k和v都各自只有一个地址,因此要新建对象,再存在list中 80 | for (WebLogBean v:values){ 81 | //新建对象 82 | WebLogBean bean = new WebLogBean(); 83 | //将迭代器对象中的属性复制到新对象上 84 | try { 85 | BeanUtils.copyProperties(bean, v); 86 | list.add(bean); 87 | } catch (IllegalAccessException e) { 88 | // TODO Auto-generated catch block 89 | e.printStackTrace(); 90 | } catch (InvocationTargetException e) { 91 | // TODO Auto-generated catch block 92 | e.printStackTrace(); 93 | } 94 | } 95 | 96 | //按时间排序 97 | Collections.sort(list, new Comparator() { 98 | public int compare(WebLogBean o1, WebLogBean o2) { 99 | Date date1 = null; 100 | Date date2 
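// NOTE: the loop above copies every value into a fresh WebLogBean via BeanUtils.copyProperties
// because Hadoop reuses a single value instance across the reduce iterator; keeping the iterated
// object itself would leave the list holding many references to the same (last) record. The
// copies are then sorted by time_local with the comparator below.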
= null;
101 | try {
102 | date1 = toDate(o1.getTime_local());
103 | date2 = toDate(o2.getTime_local());
104 | } catch (ParseException e) {
105 | // TODO Auto-generated catch block
106 | e.printStackTrace();
107 | }
108 | if(date1==null || date2==null){
109 | return 0;
110 | }
111 | return date1.compareTo(date2);
112 | }
113 | });
114 | 
115 | //Walk the sorted list, computing dwell time, session id and step (step starts at 1)
116 | int step = 1;
117 | UUID sessionid = UUID.randomUUID();
118 | for (int i = 0; i < list.size(); i++) {
119 | WebLogBean bean = list.get(i);
120 | //Only one hit for this IP: emit it directly with the default dwell time of 60s
121 | if(list.size()==1){
122 | rk.set(sessionid+"\001"+bean.getRemote_addr()+"\001"+bean.getRemote_user()+"\001"+
123 | bean.getTime_local()+"\001"+bean.getRequest()+"\001"+(60)+"\001"+step+"\001"+
124 | bean.getStatus()+"\001"+bean.getBody_bytes_sent()+"\001"+bean.getHttp_referer()+"\001"+
125 | bean.getHttp_user_agent());
126 | context.write(rk, NullWritable.get());
127 | sessionid = UUID.randomUUID();
128 | break;
129 | }
130 | 
131 | //More than one hit: compute the gap between the current hit and the previous one
132 | if (i==0){
133 | continue;
134 | }
135 | try {
136 | long diffDate = diffDate(bean.getTime_local(), list.get(i-1).getTime_local());
137 | //Gap shorter than 30 minutes: still the same session
138 | if(diffDate < 30*60*1000){
139 | WebLogBean lb = list.get(i-1);
140 | //Emit the previous hit; its dwell time is the gap to the current hit, in seconds
141 | rk.set(sessionid+"\001"+lb.getRemote_addr()+"\001"+lb.getRemote_user()+"\001"+
142 | lb.getTime_local()+"\001"+lb.getRequest()+"\001"+(diffDate)/1000+"\001"+step+"\001"+
143 | lb.getStatus()+"\001"+lb.getBody_bytes_sent()+"\001"+lb.getHttp_referer()+"\001"+
144 | lb.getHttp_user_agent());
145 | context.write(rk, NullWritable.get());
146 | step++;
147 | }else{
148 | //Gap of 30 minutes or more: a new session starts, so emit the previous hit as the last hit of the old session
149 | WebLogBean lsl = list.get(i-1);
150 | rk.set(sessionid+"\001"+lsl.getRemote_addr()+"\001"+lsl.getRemote_user()+"\001"+
151 | lsl.getTime_local()+"\001"+lsl.getRequest()+"\001"+(60)+"\001"+step+"\001"+
152 | lsl.getStatus()+"\001"+lsl.getBody_bytes_sent()+"\001"+lsl.getHttp_referer()+"\001"+
153 | lsl.getHttp_user_agent());
154 | context.write(rk, NullWritable.get());
155 | 
156 | //Reset step and session id for the new session
157 | step = 1;
158 | sessionid = UUID.randomUUID();
159 | }
160 | 
161 | //Emit the last hit of the list: it has no successor, so it gets the default dwell time of 60s
162 | if(i == list.size()-1){
163 | WebLogBean cb = list.get(i);
164 | rk.set(sessionid+"\001"+cb.getRemote_addr()+"\001"+cb.getRemote_user()+"\001"+
165 | cb.getTime_local()+"\001"+cb.getRequest()+"\001"+(60)+"\001"+step+"\001"+
166 | cb.getStatus()+"\001"+cb.getBody_bytes_sent()+"\001"+cb.getHttp_referer()+"\001"+
167 | cb.getHttp_user_agent());
168 | context.write(rk, NullWritable.get());
169 | sessionid = UUID.randomUUID();
170 | }
171 | } catch (ParseException e) {
172 | // TODO Auto-generated catch block
173 | e.printStackTrace();
174 | }
175 | }
176 | }
177 | 
178 | public static Date toDate(String time) throws ParseException {
179 | SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
180 | Date date = sdf.parse(time);
181 | return date;
182 | }
183 | 
184 | public static long diffDate(String date1,String date2) throws ParseException {
185 | Date d1 = toDate(date1);
186 | Date d2 = toDate(date2);
187 | return d1.getTime() - d2.getTime();
188 | }
189 | }
190 | 
191 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
192 | System.setProperty("HADOOP_USER_NAME", "hadoop");
193 | Configuration conf = new Configuration();
194 | conf.set("fs.defaultFS", "hdfs://myha/");
195 | Job job = Job.getInstance(conf);
196 | 
197 | job.setJarByClass(ClickSessionStream.class);
198 | 
199 | 
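// NOTE: the reducer above sessionizes clicks per IP with a 30-minute inactivity rule: while
// consecutive hits are less than 30 minutes apart they share one session id, the dwell time of a
// hit is the gap to the next hit in seconds, and the last hit of a session gets a default of 60
// seconds; a gap of 30 minutes or more starts a new UUID session with step reset to 1.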
job.setMapperClass(ClickSessionStreamMapper.class); 200 | job.setReducerClass(ClickSessionStreamReducer.class); 201 | 202 | job.setMapOutputKeyClass(Text.class); 203 | job.setMapOutputValueClass(WebLogBean.class); 204 | job.setOutputKeyClass(Text.class); 205 | job.setOutputValueClass(NullWritable.class); 206 | 207 | FileInputFormat.setInputPaths(job, new Path("/weblog/pre/20200221")); 208 | FileOutputFormat.setOutputPath(job, new Path("/weblog/click/stream/20200221")); 209 | 210 | boolean res = job.waitForCompletion(true); 211 | System.exit(res ? 0 : 1); 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/LogAnalysisWithMySQL.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.{Date, Properties} 5 | 6 | import org.apache.flink.api.common.functions.RuntimeContext 7 | import org.apache.flink.api.common.serialization.SimpleStringSchema 8 | import org.apache.flink.api.java.tuple.Tuple 9 | import org.apache.flink.streaming.api.TimeCharacteristic 10 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks 11 | import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction 12 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 13 | import org.apache.flink.streaming.api.scala.function.WindowFunction 14 | import org.apache.flink.streaming.api.watermark.Watermark 15 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 16 | import org.apache.flink.streaming.api.windowing.time.Time 17 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 18 | import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer} 19 | import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink 20 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 21 | import org.apache.flink.util.Collector 22 | import org.apache.http.HttpHost 23 | import org.elasticsearch.action.index.IndexRequest 24 | import org.elasticsearch.client.Requests 25 | import org.slf4j.LoggerFactory 26 | 27 | import scala.collection.mutable 28 | import scala.collection.mutable.ArrayBuffer 29 | 30 | object LogAnalysisWithMySQL { 31 | def main(args: Array[String]): Unit = { 32 | //在生产上进行日志的输出,采用以下方式 33 | val logger = LoggerFactory.getLogger("LogAnalysis") 34 | 35 | val env = StreamExecutionEnvironment.getExecutionEnvironment 36 | //设置事件时间作为flink处理的基准时间 37 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 38 | import org.apache.flink.api.scala._ 39 | 40 | /** 41 | * 读取kafka集群数据 42 | */ 43 | val topic = "cdnlog" 44 | val properties: Properties = new Properties() 45 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092") 46 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk 47 | properties.setProperty("group.id","test-cdnlog-mysql") 48 | 49 | val consumer = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties) 50 | val data = env.addSource(consumer) // 接受kafka数据 51 | // data.print().setParallelism(1) // 测试是否连通 52 | 53 | /** 54 | * 数据清洗: 55 | * 在生产上进行业务处理的时候,一定要考虑处理的健壮性以及数据的准确性 56 | * 脏数据或者是不符合业务规则的数据是需要全部过滤掉之后 57 | * 再进行相应业务逻辑的处理 58 | */ 59 | val logData = data.map(x => { 60 | val strings = x.split("\t") 61 | 62 | val level 
= strings(2)
63 | val timeStr = strings(3)
64 | var time = 0L
65 | try {
66 | val sourceFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
67 | time = sourceFormat.parse(timeStr).getTime
68 | } catch {
69 | case e:Exception => {
70 | logger.error(s"time parse error: $timeStr", e)
71 | }
72 | }
73 | 
74 | val domain = strings(5)
75 | val traffic = strings(6).toLong
76 | (level, time, domain, traffic)
77 | }).filter(_._2 != 0).filter(_._1 == "E")
78 | .map(x => {
79 | (x._2, x._3, x._4) //keep only the fields required downstream: time, domain, traffic (level was only needed for filtering)
80 | })
81 | 
82 | /**
83 | * Connect to the MySQL dimension stream and attach the userId that owns each domain
84 | */
85 | val mysqlData = env.addSource(new MySQLSource)
86 | // mysqlData.print()
87 | val connectData = logData.connect(mysqlData)
88 | .flatMap(new CoFlatMapFunction[(Long, String, Long), mutable.HashMap[String, String], (Long, String, Long, String)] {
89 | var userDomainMap: mutable.HashMap[String, String] = mutable.HashMap[String, String]()
90 | 
91 | //log stream: enrich each record with the userId of its domain
92 | override def flatMap1(in1: (Long, String, Long), collector: Collector[(Long, String, Long, String)]): Unit = {
93 | val domain = in1._2
94 | val userId = userDomainMap.getOrElse(domain, "")
95 | // collector.collect(in1._1 + "\t" + in1._2 + "\t" + in1._3 + "\t" + userId)
96 | collector.collect((in1._1, domain, in1._3, userId))
97 | }
98 | 
99 | override def flatMap2(in2: mutable.HashMap[String, String], collector: Collector[(Long, String, Long, String)]): Unit = { //MySQL stream: refresh the domain -> userId mapping
100 | userDomainMap = in2
101 | }
102 | })
103 | 
104 | // connectData.print()
105 | 
106 | /**
107 | * Assign timestamps and watermarks so that out-of-order events are handled correctly
108 | * AssignerWithPeriodicWatermarks[T] matches the tuple type of the enriched log stream
109 | */
110 | val resultData = connectData.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(Long, String, Long, String)] {
111 | //maximum tolerated out-of-orderness: 10 seconds
112 | val maxOutOfOrderness = 10000L // 10 seconds
113 | //largest event timestamp seen so far
114 | var currentMaxTimestamp: Long = _
115 | 
116 | //derive the watermark from the largest timestamp
117 | override def getCurrentWatermark: Watermark = {
118 | new Watermark(currentMaxTimestamp - maxOutOfOrderness)
119 | }
120 | 
121 | //extract the event time
122 | override def extractTimestamp(element: (Long, String, Long, String), previousElementTimestamp: Long): Long = {
123 | //the event time carried by the record
124 | val timestamp: Long = element._1
125 | currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp)
126 | timestamp
127 | }
128 | }) //window-based business logic: traffic produced by each user during the last minute
129 | .keyBy(3) //group by userId
130 | .window(TumblingEventTimeWindows.of(Time.seconds(60))) //tumbling 60-second windows
131 | .apply(new WindowFunction[(Long, String, Long, String), (String, String, Long, String), Tuple, TimeWindow] {
132 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(Long, String, Long, String)], out: Collector[(String, String, Long, String)]): Unit = {
133 | val userid = key.getField(0).toString //the key, i.e. the userId
134 | 
135 | var sum = 0L; var domain = "" //track the domain of the records in this window (last one seen; a user's window may span several domains)
136 | val times = ArrayBuffer[Long]()
137 | val iterator = input.iterator
138 | while (iterator.hasNext) {
139 | val next = iterator.next()
140 | sum += next._3; domain = next._2 //accumulate traffic and remember the domain
141 | times.append(next._1) //collect event times to derive the minute this window covers
142 | }
143 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(times.max)) //format the minute of this window
144 | 
145 | /**
146 | * Emitted result:
147 | * 1st field: the minute of this window
148 | * 2nd field: domain
149 | * 3rd field: total traffic; 4th field: userId
150 | */
151 | out.collect((time, domain, sum, userid))
152 | }
153 | })
154 | resultData.print().setParallelism(1)
155 | 
156 | 
157 | /**
158 | * Sink the aggregated results into Elasticsearch
159 | * and visualize them with Kibana
160 | */
161 | val httpHosts = new java.util.ArrayList[HttpHost]
162 | 
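// NOTE: the connect(...) + CoFlatMapFunction above enriches each log tuple with the userId that
// MySQLSource periodically emits as a domain -> userId map; e.g. if the map holds
// ("example.com" -> "user_01"), the record (1582264860000L, "example.com", 2000L) becomes
// (1582264860000L, "example.com", 2000L, "user_01"), while unknown domains get an empty userId
// (values here are illustrative). The map lives in per-subtask state, so lookups are only as
// fresh as the latest flatMap2 update.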
httpHosts.add(new HttpHost("redhat", 9200, "http")) 163 | 164 | val esSinkBuilder = new ElasticsearchSink.Builder[(String, String, Long, String)]( 165 | httpHosts, 166 | new ElasticsearchSinkFunction[(String, String, Long, String)] { 167 | override def process(t: (String, String, Long, String), runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = { 168 | requestIndexer.add(createIndexRequest(t)) 169 | } 170 | 171 | def createIndexRequest(element: (String, String, Long, String)): IndexRequest = { 172 | val json = new java.util.HashMap[String, Any] 173 | json.put("time", element._1) 174 | json.put("domain", element._2) 175 | json.put("traffics", element._3) 176 | json.put("userid", element._4) 177 | val id = element._1 + "-" + element._2 178 | return Requests.indexRequest() 179 | .index("cdn") 180 | .`type`("traffic-userid") 181 | .id(id) 182 | .source(json) 183 | } 184 | } 185 | ) 186 | 187 | //设置要为每个批量请求缓冲的最大操作数 188 | esSinkBuilder.setBulkFlushMaxActions(1) 189 | resultData.addSink(esSinkBuilder.build()) //.setParallelism(5) 190 | 191 | env.execute("LogAnalysisWithMySQL") 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/personas/DmpPersonasJob.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.personas 2 | 3 | import java.io.FileInputStream 4 | import java.util.Properties 5 | 6 | import com.awebone.dmp.Logs 7 | import com.awebone.dmp.constants.AdTagConstants 8 | import com.awebone.dmp.tags._ 9 | import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} 10 | import org.apache.hadoop.hbase.client.{ConnectionFactory, Put} 11 | import org.apache.log4j.{Level, Logger} 12 | import org.apache.spark.SparkConf 13 | import org.apache.spark.rdd.RDD 14 | import org.apache.spark.sql.{Dataset, SparkSession} 15 | 16 | import scala.collection.{JavaConversions, mutable} 17 | 18 | /** 19 | * dmp用户画像便签统计 20 | */ 21 | object DmpPersonasJob { 22 | def main(args: Array[String]): Unit = { 23 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 24 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 25 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 26 | 27 | if (args == null || args.length < 1) { 28 | println( 29 | """Parameter Errors! 
Usage: 30 | |inputpath : input path 31 | """.stripMargin) 32 | System.exit(-1) 33 | } 34 | val Array(inputpath) = args 35 | 36 | val conf: SparkConf = new SparkConf().setAppName("DmpPersonasJob").setMaster("local[*]") 37 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 38 | import spark.implicits._ 39 | 40 | val input: Dataset[Logs] = spark.read.parquet(inputpath).as[Logs] 41 | val logs: RDD[Logs] = input.rdd 42 | 43 | //提取用户的标签 | | 44 | val userid2Tags: RDD[(String, Map[String, Int])] = logs.map { case logs: Logs => { 45 | var userid: String = logs.userid 46 | if (userid == null) { 47 | userid = getNotEmptyID(logs).getOrElse("UnKnown") 48 | } 49 | 50 | val adspaceTags: Map[String, Int] = AdPositionTag.extractTag(logs) 51 | val appTags: Map[String, Int] = AppTag.extractTag(logs) 52 | val channelTags: Map[String, Int] = ChannelTag.extractTag(logs) 53 | val deviceTags: Map[String, Int] = DeviceTag.extractTag(logs) 54 | val kwTags: Map[String, Int] = KeyWordTag.extractTag(logs) 55 | val areaTags: Map[String, Int] = AreaTag.extractTag(logs) 56 | 57 | (userid, adspaceTags.++(appTags).++(channelTags).++(deviceTags).++(kwTags).++(areaTags)) 58 | } 59 | } 60 | 61 | //map1 map2 --> 62 | val userid2AggrTags: RDD[(String, Map[String, Int])] = userid2Tags.reduceByKey { case (map1, map2) => { 63 | val map = mutable.Map[String, Int]() 64 | map.++=(map1) 65 | 66 | for ((k, v) <- map2) { 67 | map.put(k, map.getOrElse(k, 0) + v) 68 | } 69 | map.toMap 70 | } 71 | } 72 | // userid2AggrTags.foreach(println) 73 | // (2,Map(NET_3 -> 2, ZC_益阳市 -> 2, DEVICE_1 -> 2, APP_其他 -> 2, ZP_湘南省 -> 2, LC_02 -> 2, ISP_4 -> 2, CN_ -> 2)) 74 | // (1,Map(ZP_上海市 -> 2, NET_3 -> 2, DEVICE_1 -> 2, APP_马上赚 -> 2, LC_02 -> 2, ISP_4 -> 2, CN_ -> 2, ZC_上海市 -> 2)) 75 | 76 | //转换属性 77 | val props = loadProerties() 78 | val propsBC = spark.sparkContext.broadcast(props) 79 | 80 | val aggrTags = userid2AggrTags.map{case (userid, tagMap) => { 81 | val map = mutable.Map[String, Int]() 82 | val propsMap = propsBC.value 83 | 84 | for((k,v) <- tagMap){ 85 | var key = k 86 | 87 | if(k.contains(AdTagConstants.PREFIX_AD_DEVICE_TAG)){ 88 | val dMap = propsMap(AdTagConstants.PREFIX_AD_DEVICE_TAG) 89 | val id = k.split("_")(1) 90 | val dName = dMap.get(id).get.split("\\s+")(1) 91 | //k --> prefix_id 92 | key = AdTagConstants.PREFIX_AD_DEVICE_TAG + dName 93 | }else if(k.contains(AdTagConstants.PREFIX_AD_ISP_TAG)) { 94 | val ispMap = propsMap(AdTagConstants.PREFIX_AD_ISP_TAG) 95 | val id = k.split("_")(1) 96 | val ispName = ispMap.get(id).get.split("\\s+")(1) 97 | key = AdTagConstants.PREFIX_AD_ISP_TAG + ispName 98 | } else if(k.contains(AdTagConstants.PREFIX_AD_NETWORK_TAG)) { 99 | val nwMap = propsMap(AdTagConstants.PREFIX_AD_NETWORK_TAG) 100 | val id = k.split("_")(1) 101 | val nwName = nwMap.get(id).get.split("\\s+")(1) 102 | key = AdTagConstants.PREFIX_AD_NETWORK_TAG + nwName 103 | } 104 | map.put(key, v) 105 | } 106 | 107 | (userid, map) 108 | }} 109 | 110 | /** 111 | * 将标签聚合结果存储到hbase中 112 | * 因为,经过我们分析,计算得出的标签可能半结构化的数据,同时如果在dmp和dsp中进行交互的时候,流量比较大的情况下 113 | * 我们使用mysql没有办法保证时效性,所以我们这里使用hbase进行存储 114 | * create_space bigdata 115 | * create 'bigdata:dmp_tag', 'cf' 116 | * HBase api 117 | */ 118 | aggrTags.foreachPartition(partition => { 119 | if(partition != null){ 120 | val connection = ConnectionFactory.createConnection(HBaseConfiguration.create()) 121 | val table = connection.getTable(TableName.valueOf("bigdata:dmp_tag")) 122 | 123 | partition.foreach{case (userid, tagMap) => { 124 | val put = new 
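// NOTE: userid2AggrTags above merges all tag maps of one user by summing counts per key, e.g.
// merging Map("NET_3" -> 1, "APP_xx" -> 1) with Map("NET_3" -> 2) yields
// Map("NET_3" -> 3, "APP_xx" -> 1). A compact equivalent of that reduce function, for clarity:
//   def mergeTags(a: Map[String, Int], b: Map[String, Int]): Map[String, Int] =
//     b.foldLeft(a) { case (acc, (k, v)) => acc.updated(k, acc.getOrElse(k, 0) + v) }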
Put(userid.getBytes())
125 | 
126 | //tagMap holds (tag column, count) pairs, e.g. [DEVICE_xxxx, 5]
127 | for((col,value) <- tagMap){
128 | put.addColumn("cf".getBytes(), col.getBytes(), value.toString.getBytes())
129 | }
130 | table.put(put)
131 | }}
132 | 
133 | table.close()
134 | connection.close()
135 | }
136 | })
137 | 
138 | spark.stop()
139 | }
140 | 
141 | /**
142 | * Load the mapping dictionaries
143 | * keyed by tag type:
144 | * device (id -> name)
145 | * isp
146 | * network
147 | */
148 | def loadProerties():mutable.Map[String, mutable.Map[String, String]] = {
149 | val props = mutable.Map[String, mutable.Map[String, String]]()
150 | val properties = new Properties()
151 | 
152 | //load the device mapping
153 | properties.load(new FileInputStream("data/device-mapping.dic"))
154 | val deviceMap = mutable.Map[String, String]()
155 | 
156 | for (dk <- JavaConversions.asScalaSet(properties.keySet())){
157 | deviceMap.put(dk.toString,properties.getProperty(dk.toString))
158 | }
159 | props.put(AdTagConstants.PREFIX_AD_DEVICE_TAG, deviceMap)
160 | 
161 | //load the isp mapping
162 | properties.clear()
163 | properties.load(new FileInputStream("data/isp-mapping.dic"))
164 | val ispMap = mutable.Map[String, String]()
165 | for(dk <- JavaConversions.asScalaSet(properties.keySet())) {
166 | ispMap.put(dk.toString, properties.getProperty(dk.toString))
167 | }
168 | props.put(AdTagConstants.PREFIX_AD_ISP_TAG, ispMap)
169 | 
170 | //load the network mapping
171 | properties.clear()
172 | properties.load(new FileInputStream("data/network-mapping.dic"))
173 | val nwMap = mutable.Map[String, String]()
174 | for(dk <- JavaConversions.asScalaSet(properties.keySet())) {
175 | nwMap.put(dk.toString, properties.getProperty(dk.toString))
176 | }
177 | props.put(AdTagConstants.PREFIX_AD_NETWORK_TAG, nwMap)
178 | 
179 | props
180 | }
181 | 
182 | 
183 | // Pick the first non-empty device identifier to use as the user id
184 | def getNotEmptyID(log: Logs): Option[String] = {
185 | log match {
186 | case v if v.imei.nonEmpty => Some("IMEI:" + v.imei.replaceAll(":|-", "").toUpperCase)
187 | case v if v.imeimd5.nonEmpty => Some("IMEIMD5:" + v.imeimd5.toUpperCase)
188 | case v if v.imeisha1.nonEmpty => Some("IMEISHA1:" + v.imeisha1.toUpperCase)
189 | 
190 | case v if v.androidid.nonEmpty => Some("ANDROIDID:" + v.androidid.toUpperCase)
191 | case v if v.androididmd5.nonEmpty => Some("ANDROIDIDMD5:" + v.androididmd5.toUpperCase)
192 | case v if v.androididsha1.nonEmpty => Some("ANDROIDIDSHA1:" + v.androididsha1.toUpperCase)
193 | 
194 | case v if v.mac.nonEmpty => Some("MAC:" + v.mac.replaceAll(":|-", "").toUpperCase)
195 | case v if v.macmd5.nonEmpty => Some("MACMD5:" + v.macmd5.toUpperCase)
196 | case v if v.macsha1.nonEmpty => Some("MACSHA1:" + v.macsha1.toUpperCase)
197 | 
198 | case v if v.idfa.nonEmpty => Some("IDFA:" + v.idfa.replaceAll(":|-", "").toUpperCase)
199 | case v if v.idfamd5.nonEmpty => Some("IDFAMD5:" + v.idfamd5.toUpperCase)
200 | case v if v.idfasha1.nonEmpty => Some("IDFASHA1:" + v.idfasha1.toUpperCase)
201 | 
202 | case v if v.openudid.nonEmpty => Some("OPENUDID:" + v.openudid.toUpperCase)
203 | case v if v.openudidmd5.nonEmpty => Some("OPENUDIDMD5:" + v.openudidmd5.toUpperCase)
204 | case v if v.openudidsha1.nonEmpty => Some("OPENUDIDSHA1:" + v.openudidsha1.toUpperCase)
205 | 
206 | case _ => None
207 | }
208 | }
209 | }
210 | -------------------------------------------------------------------------------- /mllib/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.awebone.spark 8 | mllib 9 | 1.0-SNAPSHOT 10 | 11 | mllib 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF-8 17 | 1.8 18 | 1.8 19 | 
UTF-8 20 | 2.11.8 21 | 2.3.1 22 | 2.7.6 23 | 2.11 24 | 25 | 26 | 27 | 28 | org.scala-lang 29 | scala-library 30 | ${scala.version} 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-core_2.11 36 | ${spark.version} 37 | 38 | 39 | 40 | org.apache.spark 41 | spark-sql_2.11 42 | ${spark.version} 43 | 44 | 45 | 46 | org.apache.spark 47 | spark-streaming_2.11 48 | ${spark.version} 49 | 50 | 51 | 52 | org.apache.spark 53 | spark-graphx_2.11 54 | ${spark.version} 55 | 56 | 57 | 58 | org.apache.spark 59 | spark-mllib_2.11 60 | ${spark.version} 61 | 62 | 63 | 64 | org.apache.hadoop 65 | hadoop-client 66 | ${hadoop.version} 67 | 68 | 69 | 70 | org.apache.spark 71 | spark-streaming-kafka-0-10_2.11 72 | 2.3.1 73 | 74 | 75 | 76 | org.apache.spark 77 | spark-streaming-flume_2.11 78 | ${spark.version} 79 | 80 | 81 | 82 | mysql 83 | mysql-connector-java 84 | 5.1.46 85 | 86 | 87 | 88 | org.apache.spark 89 | spark-hive_2.11 90 | ${spark.version} 91 | 92 | 93 | 94 | 95 | org.apache.kafka 96 | kafka_2.11 97 | 1.1.0 98 | 99 | 100 | 101 | junit 102 | junit 103 | 4.11 104 | test 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | net.alchim31.maven 113 | scala-maven-plugin 114 | 3.2.2 115 | 116 | 117 | org.apache.maven.plugins 118 | maven-compiler-plugin 119 | 3.5.1 120 | 121 | 122 | 123 | 124 | 125 | net.alchim31.maven 126 | scala-maven-plugin 127 | 128 | 129 | scala-compile-first 130 | process-resources 131 | 132 | add-source 133 | compile 134 | 135 | 136 | 137 | scala-test-compile 138 | process-test-resources 139 | 140 | testCompile 141 | 142 | 143 | 144 | 145 | 146 | 147 | org.apache.maven.plugins 148 | maven-compiler-plugin 149 | 150 | 151 | compile 152 | 153 | compile 154 | 155 | 156 | 157 | 158 | 159 | 160 | org.apache.maven.plugins 161 | maven-shade-plugin 162 | 2.4.3 163 | 164 | 165 | package 166 | 167 | shade 168 | 169 | 170 | 171 | 172 | *:* 173 | 174 | META-INF/*.SF 175 | META-INF/*.DSA 176 | META-INF/*.RSA 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | -------------------------------------------------------------------------------- /dmp/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.awebone 8 | dmp 9 | 1.0-SNAPSHOT 10 | 11 | dmp 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF-8 17 | 2.11.8 18 | 2.3.1 19 | 20 | 21 | 22 | 23 | scala-tools.org 24 | Scala-Tools Maven2 Repository 25 | http://scala-tools.org/repo-releases 26 | 27 | 28 | 29 | 30 | 31 | scala-tools.org 32 | Scala-Tools Maven2 Repository 33 | http://scala-tools.org/repo-releases 34 | 35 | 36 | 37 | 38 | 39 | org.scala-lang 40 | scala-library 41 | ${scala.version} 42 | 43 | 44 | junit 45 | junit 46 | 4.11 47 | 54 | test 55 | 56 | 57 | org.apache.spark 58 | spark-core_2.11 59 | ${spark.version} 60 | 61 | 62 | org.apache.spark 63 | spark-sql_2.11 64 | ${spark.version} 65 | 66 | 67 | org.apache.spark 68 | spark-hive_2.11 69 | ${spark.version} 70 | 71 | 72 | mysql 73 | mysql-connector-java 74 | 5.1.40 75 | 76 | 77 | 78 | org.apache.hbase 79 | hbase-client 80 | 1.2.6 81 | 82 | 83 | org.apache.hbase 84 | hbase-server 85 | 1.2.6 86 | 87 | 88 | 89 | 90 | 91 | 92 | org.scala-tools 93 | maven-scala-plugin 94 | 2.15.0 95 | 96 | 97 | 98 | compile 99 | testCompile 100 | 101 | 102 | 103 | 104 | 
${scala.version} 105 | 106 | -target:jvm-1.5 107 | 108 | 109 | 110 | 111 | org.apache.maven.plugins 112 | maven-eclipse-plugin 113 | 2.10 114 | 115 | true 116 | 117 | ch.epfl.lamp.sdt.core.scalabuilder 118 | 119 | 120 | ch.epfl.lamp.sdt.core.scalanature 121 | 122 | 123 | org.eclipse.jdt.launching.JRE_CONTAINER 124 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 125 | 126 | 127 | 128 | 129 | maven-assembly-plugin 130 | 131 | 132 | jar-with-dependencies 133 | 134 | 135 | 138 | 139 | 140 | 141 | 142 | make-assembly 143 | package 144 | 145 | single 146 | 147 | 148 | 149 | 150 | 151 | org.apache.maven.plugins 152 | maven-compiler-plugin 153 | 154 | 1.8 155 | 1.8 156 | 157 | 158 | 159 | org.codehaus.mojo 160 | build-helper-maven-plugin 161 | 1.10 162 | 163 | 164 | add-source 165 | generate-sources 166 | 167 | add-source 168 | 169 | 170 | 171 | 172 | src/main/java 173 | src/main/scala 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /flink-train/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.awebone 8 | flink 9 | 1.0-SNAPSHOT 10 | 11 | flink 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF-8 17 | 1.8 18 | 1.8 19 | 1.7.2 20 | 2.11 21 | 2.11.8 22 | 2.7.6 23 | 1.4.3 24 | 1.2.7 25 | 26 | 27 | 28 | 29 | 30 | org.scala-lang 31 | scala-library 32 | ${scala.version} 33 | 34 | 35 | 36 | 37 | org.apache.flink 38 | flink-scala_${scala.binary.version} 39 | ${flink.version} 40 | 41 | 42 | org.apache.flink 43 | flink-streaming-scala_${scala.binary.version} 44 | ${flink.version} 45 | 46 | 47 | 48 | 49 | org.apache.flink 50 | flink-java 51 | ${flink.version} 52 | compile 53 | 54 | 55 | org.apache.flink 56 | flink-streaming-java_2.11 57 | ${flink.version} 58 | compile 59 | 60 | 61 | 62 | 63 | org.apache.flink 64 | flink-table_2.11 65 | ${flink.version} 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | org.apache.flink 81 | flink-connector-filesystem_2.11 82 | ${flink.version} 83 | 84 | 85 | org.apache.flink 86 | flink-connector-kafka_2.11 87 | ${flink.version} 88 | 89 | 90 | org.apache.flink 91 | flink-avro 92 | ${flink.version} 93 | 94 | 95 | 96 | 97 | org.apache.bahir 98 | flink-connector-redis_2.11 99 | 1.0 100 | 101 | 102 | 103 | org.apache.flink 104 | flink-connector-kafka-0.10_${scala.binary.version} 105 | ${flink.version} 106 | 107 | 108 | org.apache.flink 109 | flink-connector-elasticsearch6_2.11 110 | ${flink.version} 111 | 112 | 113 | org.apache.flink 114 | flink-json 115 | ${flink.version} 116 | 117 | 118 | org.apache.flink 119 | flink-hbase_2.11 120 | ${flink.version} 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | org.slf4j 161 | slf4j-log4j12 162 | 1.7.10 163 | runtime 164 | 165 | 166 | log4j 167 | log4j 168 | 1.2.17 169 | runtime 170 | 171 | 172 | mysql 173 | mysql-connector-java 174 | 5.1.40 175 | 176 | 177 | org.apache.hadoop 178 | hadoop-client 179 | ${hadoop.version} 180 | 181 | 182 | org.apache.kafka 183 | kafka-clients 184 | 1.1.0 185 | 186 | 187 | 188 | 189 | 190 | 
191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | junit 206 | junit 207 | 4.11 208 | test 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | org.apache.maven.plugins 218 | maven-shade-plugin 219 | 3.0.0 220 | 221 | 222 | 223 | package 224 | 225 | shade 226 | 227 | 228 | 229 | 230 | org.apache.flink:force-shading 231 | com.google.code.findbugs:jsr305 232 | org.slf4j:* 233 | log4j:* 234 | 235 | 236 | 237 | 238 | 240 | *:* 241 | 242 | META-INF/*.SF 243 | META-INF/*.DSA 244 | META-INF/*.RSA 245 | 246 | 247 | 248 | 249 | 251 | com.lp.demo.StreamingJob 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | net.alchim31.maven 262 | scala-maven-plugin 263 | 3.2.2 264 | 265 | 266 | 267 | -target:jvm-1.8 268 | -feature 269 | -deprecation 270 | -explaintypes 271 | -unchecked 272 | -Xlint 273 | 274 | 275 | 276 | 277 | 278 | compile 279 | testCompile 280 | 281 | 282 | 283 | 284 | 285 | 286 | org.codehaus.mojo 287 | build-helper-maven-plugin 288 | 1.8 289 | 290 | 291 | 292 | add-source 293 | generate-sources 294 | 295 | add-source 296 | 297 | 298 | 299 | src/main/scala 300 | 301 | 302 | 303 | 304 | 305 | add-test-source 306 | generate-test-sources 307 | 308 | add-test-source 309 | 310 | 311 | 312 | src/test/scala 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | add-dependencies-for-IDEA 410 | 411 | 412 | 413 | idea.version 414 | 415 | 416 | 417 | 418 | 419 | org.apache.flink 420 | flink-scala_${scala.binary.version} 421 | ${flink.version} 422 | compile 423 | 424 | 425 | org.apache.flink 426 | flink-streaming-scala_${scala.binary.version} 427 | ${flink.version} 428 | compile 429 | 430 | 431 | org.scala-lang 432 | scala-library 433 | ${scala.version} 434 | compile 435 | 436 | 437 | org.apache.flink 438 | flink-java 439 | ${flink.version} 440 | compile 441 | 442 | 443 | org.apache.flink 444 | flink-streaming-java_2.11 445 | ${flink.version} 446 | compile 447 | 448 | 449 | 450 | 451 | 452 | 453 | --------------------------------------------------------------------------------