├── .gitignore
├── .gitattributes
├── dmp
│   ├── dmp.iml
│   ├── data
│   │   ├── isp-mapping.dic
│   │   ├── device-mapping.dic
│   │   ├── network-mapping.dic
│   │   └── data.txt
│   ├── src
│   │   └── main
│   │       ├── scala
│   │       │   └── com
│   │       │       └── awebone
│   │       │           └── dmp
│   │       │               ├── tags
│   │       │               │   ├── Tags.scala
│   │       │               │   ├── AppTag.scala
│   │       │               │   ├── ChannelTag.scala
│   │       │               │   ├── AdPositionTag.scala
│   │       │               │   ├── AreaTag.scala
│   │       │               │   ├── KeyWordTag.scala
│   │       │               │   └── DeviceTag.scala
│   │       │               ├── constants
│   │       │               │   └── AdTagConstants.scala
│   │       │               ├── util
│   │       │               │   └── Utils.scala
│   │       │               ├── etl
│   │       │               │   ├── DMPLogETLOps.scala
│   │       │               │   └── DMPLogETLHDFSOps.scala
│   │       │               ├── report
│   │       │               │   ├── ProvinceCityQuantityJob.scala
│   │       │               │   └── AreaRequestDistributionJob.scala
│   │       │               ├── Logs.scala
│   │       │               └── personas
│   │       │                   └── DmpPersonasJob.scala
│   │       └── resources
│   │           ├── hive-site.xml
│   │           ├── hbase-site.xml
│   │           ├── core-site.xml
│   │           └── hdfs-site.xml
│   ├── script
│   │   └── mysql-create.sql
│   └── pom.xml
├── mllib
│   ├── mllib.iml
│   ├── src
│   │   └── main
│   │       ├── resources
│   │       │   ├── ml-1m
│   │       │   │   ├── movies.dat
│   │       │   │   └── README
│   │       │   ├── core-site.xml
│   │       │   └── hdfs-site.xml
│   │       ├── scala
│   │       │   └── com
│   │       │       └── awebone
│   │       │           └── spark
│   │       │               ├── WordCountScala.scala
│   │       │               ├── MovieLensSparkShell.scala
│   │       │               └── MovieLensALS.scala
│   │       └── java
│   │           └── com
│   │               └── awebone
│   │                   └── spark
│   │                       ├── WordCountJava8.java
│   │                       └── WordCountJava7.java
│   └── pom.xml
├── akka_rpc
│   ├── akka_rpc.iml
│   ├── src
│   │   └── main
│   │       ├── java
│   │       │   └── com
│   │       │       └── awebone
│   │       │           └── hadoop_rpc
│   │       │               ├── MyDataNode.java
│   │       │               ├── MyServerProtocal.java
│   │       │               ├── MyServerImpl.java
│   │       │               ├── NameNodeClient.java
│   │       │               └── MyNamenode.java
│   │       └── scala
│   │           └── com
│   │               └── awebone
│   │                   ├── yarn
│   │                   │   ├── Constant.scala
│   │                   │   ├── Message.scala
│   │                   │   ├── MyNodeManager.scala
│   │                   │   └── MyResourceManager.scala
│   │                   └── akka_rpc
│   │                       ├── Worker.scala
│   │                       └── Master.scala
│   └── pom.xml
├── flink-train
│   ├── flink-train.iml
│   ├── src
│   │   └── main
│   │       ├── resources
│   │       │   ├── scripts
│   │       │   │   ├── kafka-script
│   │       │   │   ├── mysql.sql
│   │       │   │   └── es-scripts
│   │       │   ├── hive-site.xml
│   │       │   ├── hbase-site.xml
│   │       │   ├── core-site.xml
│   │       │   └── hdfs-site.xml
│   │       └── scala
│   │           └── com
│   │               └── awebone
│   │                   └── flink
│   │                       ├── connetcor
│   │                       │   └── FileSystemSinkApp.scala
│   │                       └── project
│   │                           ├── MySQLSource.scala
│   │                           ├── MockKafkaProducer.scala
│   │                           ├── LogAnalysis.scala
│   │                           └── LogAnalysisWithMySQL.scala
│   └── pom.xml
├── weblog
│   ├── .settings
│   │   ├── org.eclipse.m2e.core.prefs
│   │   ├── org.eclipse.core.resources.prefs
│   │   └── org.eclipse.jdt.core.prefs
│   ├── src
│   │   ├── main
│   │   │   └── java
│   │   │       ├── log4j.properties
│   │   │       ├── core-site.xml
│   │   │       ├── com
│   │   │       │   └── awebone
│   │   │       │       ├── pre
│   │   │       │       │   ├── WebLogParse.java
│   │   │       │       │   └── WebLogPreProcess.java
│   │   │       │       ├── bean
│   │   │       │       │   ├── VisitBean.java
│   │   │       │       │   ├── PageViewsBean.java
│   │   │       │       │   └── WebLogBean.java
│   │   │       │       └── click
│   │   │       │           ├── ClickModel.java
│   │   │       │           └── ClickSessionStream.java
│   │   │       ├── hdfs-site.xml
│   │   │       └── hive-op.txt
│   │   └── test
│   │       └── java
│   │           └── com
│   │               └── awebone
│   │                   └── weblog
│   │                       └── AppTest.java
│   ├── .project
│   ├── pom.xml
│   └── .classpath
├── README.md
└── LICENSE
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | .vscode
3 | target/
4 | out/
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/dmp/dmp.iml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/mllib/mllib.iml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/akka_rpc/akka_rpc.iml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/dmp/data/isp-mapping.dic:
--------------------------------------------------------------------------------
1 | 1=移动 D0003001
2 | 2=联通 D0003002
3 | 3=电信 D0003003
4 | 4=OPERATOROTHER D0003004
--------------------------------------------------------------------------------
/flink-train/flink-train.iml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/dmp/data/device-mapping.dic:
--------------------------------------------------------------------------------
1 | 1=Android D0001001
2 | 2=IOS D0001002
3 | 3=Winphone D0001003
4 | 4=其他 D0001004
--------------------------------------------------------------------------------
/dmp/data/network-mapping.dic:
--------------------------------------------------------------------------------
1 | 1=WIFI D0002001
2 | 2=4G D0002002
3 | 3=3G D0002003
4 | 4=2G D0002004
5 | 5=NWTWORKOTHER D0004004
--------------------------------------------------------------------------------
/mllib/src/main/resources/ml-1m/movies.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanbo03/bigdata-projects/HEAD/mllib/src/main/resources/ml-1m/movies.dat
--------------------------------------------------------------------------------
/weblog/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/weblog/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/test/java=UTF-8
4 | encoding/=UTF-8
5 |
--------------------------------------------------------------------------------
/akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyDataNode.java:
--------------------------------------------------------------------------------
1 | package com.awebone.hadoop_rpc;
2 |
3 | public class MyDataNode {
4 |
5 | public static void main(String[] args) {
6 |
7 |
8 |
9 |
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/tags/Tags.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.tags
2 |
3 | import com.awebone.dmp.Logs
4 |
5 | /**
6 | * Trait for extracting user tags from a log record
7 | */
8 | trait Tags {
9 |
10 | def extractTag(logs:Logs):Map[String, Int]
11 | }
12 |
--------------------------------------------------------------------------------
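
A minimal sketch (not part of the repository) of how the Tags implementations in this package can be driven together: each object contributes one Map[String, Int] for its dimension, and the maps are merged by summing counts per tag key. The object name and merge helper are hypothetical.

```scala
import com.awebone.dmp.Logs
import com.awebone.dmp.tags._

object TagMergeSketch {
  def mergeTags(logs: Logs): Map[String, Int] = {
    val extractors: Seq[Tags] =
      Seq(AdPositionTag, AppTag, ChannelTag, DeviceTag, KeyWordTag, AreaTag)
    extractors
      .map(_.extractTag(logs))                        // one map per tag dimension
      .foldLeft(Map.empty[String, Int]) { (acc, m) => // sum counts on key collisions
        m.foldLeft(acc) { case (a, (k, v)) => a.updated(k, a.getOrElse(k, 0) + v) }
      }
  }
}
```
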
/akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyServerProtocal.java:
--------------------------------------------------------------------------------
1 | package com.awebone.hadoop_rpc;
2 |
3 | public interface MyServerProtocal {
4 |
5 | long versionID = 12345678L;
6 |
7 | void hello();
8 |
9 | String getName();
10 | }
11 |
--------------------------------------------------------------------------------
/akka_rpc/src/main/scala/com/awebone/yarn/Constant.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.yarn
2 |
3 | object Constant {
4 | val RMAS = "MyResourceManagerActorSystem"
5 | val RMA = "MyResourceManagerActor"
6 | val NMAS = "MyNodeManagerActorSystem"
7 | val NMA = "MyNodeManagerActor"
8 | }
9 |
--------------------------------------------------------------------------------
/weblog/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
3 | org.eclipse.jdt.core.compiler.compliance=1.5
4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
5 | org.eclipse.jdt.core.compiler.source=1.5
6 |
--------------------------------------------------------------------------------
/akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyServerImpl.java:
--------------------------------------------------------------------------------
1 | package com.awebone.hadoop_rpc;
2 |
3 | public class MyServerImpl implements MyServerProtocal{
4 |
5 | @Override
6 | public void hello() {
7 | System.out.println("hi");
8 | }
9 |
10 | @Override
11 | public String getName() {
12 | return "mynamenode";
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/weblog/src/main/java/log4j.properties:
--------------------------------------------------------------------------------
1 | ###set log levels###
2 | log4j.rootLogger=info, stdout
3 | ###output to the console###
4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
5 | log4j.appender.stdout.Target=System.out
6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
7 | log4j.appender.stdout.layout.ConversionPattern=[%d{dd/MM/yy HH:mm:ss:SSS z}] %t %5p %c{2}: %m%n
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/tags/AppTag.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.tags
2 |
3 | import com.awebone.dmp.Logs
4 | import com.awebone.dmp.constants.AdTagConstants
5 |
6 | object AppTag extends Tags {
7 | override def extractTag(logs: Logs) = {
8 | val map = Map[String, Int]((AdTagConstants.PREFIX_AD_APP_TAG + logs.appname -> 1))
9 | map
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/tags/ChannelTag.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.tags
2 |
3 | import com.awebone.dmp.Logs
4 | import com.awebone.dmp.constants.AdTagConstants
5 |
6 | /**
7 | * 3) Channel (tag format: CN_xxxx -> 1), where xxxx is the channel ID
8 | */
9 | object ChannelTag extends Tags {
10 | override def extractTag(logs: Logs) = {
11 | if(logs.channelid == null) {
12 | Map[String, Int]()
13 | } else {
14 | Map[String, Int]((AdTagConstants.PREFIX_AD_CHANNEL_TAG + logs.channelid -> 1))
15 | }
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/flink-train/src/main/resources/scripts/kafka-script:
--------------------------------------------------------------------------------
1 | Start ZooKeeper and Kafka:
2 | zkServer.sh start
3 | nohup kafka-server-start.sh $KAFKA_HOME/config/server.properties 1>~/logs/kafka_std.log 2>~/logs/kafka_err.log &
4 |
5 | List topics:
6 | kafka-topics.sh --list --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka
7 |
8 | Create topic cdnlog:
9 | kafka-topics.sh --create --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka --replication-factor 1 --partitions 1 --topic cdnlog
10 |
11 | Console consumer:
12 | kafka-console-consumer.sh --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka --topic cdnlog
--------------------------------------------------------------------------------
/weblog/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | weblog
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 | org.eclipse.m2e.core.maven2Builder
15 |
16 |
17 |
18 |
19 |
20 | org.eclipse.jdt.core.javanature
21 | org.eclipse.m2e.core.maven2Nature
22 |
23 |
24 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/tags/AdPositionTag.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.tags
2 |
3 | import com.awebone.dmp.Logs
4 | import com.awebone.dmp.constants.AdTagConstants
5 | import com.awebone.dmp.util.Utils
6 |
7 | import scala.collection.mutable
8 |
9 | /**
10 | * Tag one:
11 | 1) Ad position type (tag format: LC_03 -> 1 or LC_16 -> 1); the number is zero-padded when below 10
12 | */
13 | object AdPositionTag extends Tags {
14 |
15 | override def extractTag(logs: Logs) = {
16 | val map = mutable.Map[String, Int]()
17 | val adspacetype = Utils.fulfill(logs.adspacetype)
18 | map.put(AdTagConstants.PREFIX_AD_SPACE_TAG + "" + adspacetype, 1)
19 | map.toMap
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/flink-train/src/main/resources/scripts/mysql.sql:
--------------------------------------------------------------------------------
1 | create table user_domain_config(
2 | id int unsigned auto_increment,
3 | user_id varchar(40) not null,
4 | domain varchar(40) not null,
5 | primary key (id)
6 | );
7 |
8 | insert into user_domain_config(user_id,domain) values('8000001','v1.awebone.com');
9 | insert into user_domain_config(user_id,domain) values('8000002','v2.awebone.com');
10 | insert into user_domain_config(user_id,domain) values('8000003','v3.awebone.com');
11 | insert into user_domain_config(user_id,domain) values('8000004','v4.awebone.com');
12 | insert into user_domain_config(user_id,domain) values('8000005','vmi.awebone.com');
13 |
14 | select * from user_domain_config;
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/constants/AdTagConstants.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.constants
2 |
3 | /**
4 | * Common ad tag prefix constants used across the DMP
5 | */
6 | object AdTagConstants {
7 | // Ad position tag prefix
8 | val PREFIX_AD_SPACE_TAG = "LC_"
9 | // App tag prefix
10 | val PREFIX_AD_APP_TAG = "APP_"
11 | // Channel prefix
12 | val PREFIX_AD_CHANNEL_TAG = "CN_"
13 | // Device prefix
14 | val PREFIX_AD_DEVICE_TAG = "DEVICE_"
15 | // Network type prefix
16 | val PREFIX_AD_NETWORK_TAG = "NET_"
17 | // Carrier (ISP) prefix
18 | val PREFIX_AD_ISP_TAG = "ISP_"
19 | // Keyword prefix
20 | val PREFIX_AD_KEYWORD_TAG = "KW_"
21 | // Province prefix
22 | val PREFIX_AD_PROVINCE_TAG = "ZP_"
23 | // City prefix
24 | val PREFIX_AD_CITY_TAG = "ZC_"
25 | }
26 |
--------------------------------------------------------------------------------
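
Purely illustrative (not from the repository): the shape of tag keys these prefixes produce, using values that appear in dmp/data/data.txt. The object name is hypothetical and the ad-position code "02" is only an example.

```scala
import com.awebone.dmp.constants.AdTagConstants

object TagKeySamples {
  def main(args: Array[String]): Unit = {
    val sampleTags = Map(
      AdTagConstants.PREFIX_AD_SPACE_TAG + "02"        -> 1, // LC_02
      AdTagConstants.PREFIX_AD_APP_TAG + "马上赚"       -> 1, // APP_马上赚
      AdTagConstants.PREFIX_AD_PROVINCE_TAG + "上海市"  -> 1, // ZP_上海市
      AdTagConstants.PREFIX_AD_CITY_TAG + "上海市"      -> 1  // ZC_上海市
    )
    sampleTags.foreach(println)
  }
}
```
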
/dmp/src/main/scala/com/awebone/dmp/tags/AreaTag.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.tags
2 |
3 | import com.awebone.dmp.Logs
4 | import com.awebone.dmp.constants.AdTagConstants
5 |
6 | import scala.collection.mutable
7 |
8 | /**
9 | * Area tags (province tag format: ZP_xxx -> 1, city tag format: ZC_xxx -> 1); xxx is the province or city name
10 | */
11 | object AreaTag extends Tags {
12 | override def extractTag(logs: Logs) = {
13 | val areaMap = mutable.Map[String, Int]()
14 | if(logs.provincename != null) {
15 | areaMap.put(AdTagConstants.PREFIX_AD_PROVINCE_TAG + logs.provincename, 1)
16 | }
17 | if(logs.cityname != null) {
18 | areaMap.put(AdTagConstants.PREFIX_AD_CITY_TAG + logs.cityname, 1)
19 | }
20 | areaMap.toMap
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/tags/KeyWordTag.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.tags
2 |
3 | import com.awebone.dmp.Logs
4 | import com.awebone.dmp.constants.AdTagConstants
5 |
6 | import scala.collection.mutable
7 |
8 | /**
9 | * 5) Keywords (tag format: KW_xxx -> 1); xxx is the keyword.
10 | * A keyword must be at least 3 and at most 8 characters long;
11 | * if the keywords field contains "|", it is split into an array and turned into multiple keyword tags,
12 | * e.g. "麻辣小龙虾|麻辣香锅|与神对话|家"
13 | */
14 | object KeyWordTag extends Tags {
15 | override def extractTag(logs: Logs) = {
16 | val map = mutable.Map[String, Int]()
17 | if(logs.keywords != null) {
18 | val kws = logs.keywords.split("\\|")
19 | for (kw <- kws) {
20 | if(kw.length >= 3 && kw.length <= 8) {
21 | map.put(AdTagConstants.PREFIX_AD_KEYWORD_TAG + kw, 1)
22 | }
23 | }
24 | }
25 | map.toMap
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/weblog/src/test/java/com/awebone/weblog/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.awebone.weblog;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigourous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/akka_rpc/src/main/java/com/awebone/hadoop_rpc/NameNodeClient.java:
--------------------------------------------------------------------------------
1 | package com.awebone.hadoop_rpc;
2 |
3 | import java.io.IOException;
4 | import java.net.InetSocketAddress;
5 |
6 | import org.apache.hadoop.conf.Configuration;
7 | import org.apache.hadoop.ipc.RPC;
8 |
9 | public class NameNodeClient {
10 |
11 | public static void main(String[] args) {
12 |
13 |
14 | try {
15 | MyServerProtocal proxy = RPC.getProxy(MyServerProtocal.class,
16 | MyServerProtocal.versionID,
17 | new InetSocketAddress("localhost", 9988), new Configuration());
18 |
19 | /**
20 | * Under the hood, proxy.hello()
21 | * actually invokes the hello() method of the object
22 | * that was passed to setInstance()
23 | * on the server side.
24 | */
25 | proxy.hello();
26 | System.out.println(proxy.getName());
27 |
28 |
29 | } catch (IOException e) {
30 | e.printStackTrace();
31 | }
32 |
33 |
34 |
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/flink-train/src/main/resources/scripts/es-scripts:
--------------------------------------------------------------------------------
1 | Create the index:
2 | curl -XPUT http://localhost:9200/cdn
3 |
4 | Delete the index:
5 | curl -XDELETE http://localhost:9200/cdn
6 |
7 | Create the type mappings:
8 | curl -H "Content-Type: application/json" -XPOST http://localhost:9200/cdn/traffic/_mapping -d'{
9 | "traffic": {
10 | "properties": {
11 | "domain": {"type": "keyword"},
12 | "traffics": {"type": "long"},
13 | "time": {"type": "date","format": "yyyy-MM-dd HH:mm"}
14 | }
15 | }
16 | }'
17 |
18 | curl -H "Content-Type: application/json" -XPOST http://localhost:9200/cdn/traffic-userid/_mapping -d'{
19 | "traffic": {
20 | "properties": {
21 | "userid": {"type": "keyword"},
22 | "domain": {"type": "text"},
23 | "traffics": {"type": "long"},
24 | "time": {"type": "date","format": "yyyy-MM-dd HH:mm"}
25 | }
26 | }
27 | }'
--------------------------------------------------------------------------------
/dmp/script/mysql-create.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE `dmp`;
2 | CREATE TABLE `p_c_quantity` (
3 | `data_date` date NOT NULL,
4 | `province` VARCHAR(40),
5 | `city` VARCHAR(40),
6 | `countz` bigint(20) NOT NULL
7 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
8 |
9 | CREATE TABLE `area_ad_req` (
10 | `data_date` date NOT NULL,
11 | `province` VARCHAR(40),
12 | `city` VARCHAR(40),
13 | `orginal_req` bigint(20) DEFAULT NULL,
14 | `valid_req` bigint(20) DEFAULT NULL,
15 | `ad_req` bigint(20) DEFAULT NULL,
16 | `tpi_bid_num` bigint(20) DEFAULT NULL,
17 | `win_bid_num` bigint(20) DEFAULT NULL,
18 | `show_ad_master_num` bigint(20) DEFAULT NULL,
19 | `click_ad_master_num` bigint(20) DEFAULT NULL,
20 | `show_ad_media_num` bigint(20) DEFAULT NULL,
21 | `click_ad_media_num` bigint(20) DEFAULT NULL,
22 | `dsp_ad_xf` double DEFAULT NULL,
23 | `dsp_ad_cost` double DEFAULT NULL
24 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Big Data Project Collection
2 |
3 | ## 1. Hadoop-based offline user behavior log analysis (weblog)
4 |
5 | **Tech stack: Hadoop**
6 |
7 | - [x] Beans
8 | - [x] Clickstream data processing
9 | - [x] Click session stream model construction
10 | - [x] Hive detail table construction
11 | - [x] User behavior metrics analysis
12 |
13 |
14 |
15 |
16 |
17 | ## 2. RPC communication with Akka (akka_rpc)
18 |
19 | **Tech stack: Akka**
20 |
21 | - [x] Simulated communication between Hadoop cluster nodes
22 | - [x] Simulated communication between Spark cluster nodes
23 | - [x] Simulated Yarn communication
24 |
25 |
26 |
27 |
28 |
29 | ## 3. Advertising data management platform (dmp)
30 |
31 | **Tech stack: Spark, Scala**
32 |
33 | - [x] Ad log ETL
34 | - [x] Report statistics
35 | - [x] User profile construction
36 | - [x] Ad tag statistics
37 | - [x] Writing DMP results into HBase
38 |
39 |
40 |
41 |
42 |
43 | ## 4. Personalized recommendation with Spark MLlib (mllib)
44 |
45 | **Tech stack: Spark, Scala**
46 |
47 | - [x] MovieLens DataModel construction
48 | - [x] Cold start: the user rates 10 random movies at startup
49 | - [x] Dataset splitting
50 | - [x] ALS model construction
51 | - [x] Model evaluation
52 | - [x] Personalized recommendation
53 |
54 |
55 |
56 |
57 |
58 | ## 5. Flink-based CDN log analysis (flink-train)
59 |
60 | **Tech stack: Flink, Scala**
61 |
62 | - [x] Mock Kafka producer generating log data
63 | - [x] CDN log analysis
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
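
The MovieLensALS.scala referenced in section 4 of the README is not included in this excerpt; the following is only a hedged, generic sketch of the split/train/evaluate/recommend flow the checklist describes, using the standard spark.mllib API. The ratings path, hyperparameters, and object name are placeholders.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object ALSFlowSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ALSFlowSketch").setMaster("local[*]"))

    // ml-1m ratings.dat format: UserID::MovieID::Rating::Timestamp (path is a placeholder)
    val ratings = sc.textFile("file:///path/to/ml-1m/ratings.dat").map { line =>
      val f = line.split("::")
      Rating(f(0).toInt, f(1).toInt, f(2).toDouble)
    }

    // Split the dataset, train an ALS model
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), seed = 42L)
    val model = ALS.train(training, 10, 10, 0.01) // rank, iterations, lambda: placeholder values

    // Evaluate with RMSE on the held-out set
    val predictions = model
      .predict(test.map(r => (r.user, r.product)))
      .map(p => ((p.user, p.product), p.rating))
    val ratesAndPreds = test.map(r => ((r.user, r.product), r.rating)).join(predictions)
    val rmse = math.sqrt(ratesAndPreds.map { case (_, (r, p)) => (r - p) * (r - p) }.mean())
    println(s"RMSE = $rmse")

    // Personalized recommendation for one user
    model.recommendProducts(1, 10).foreach(println)
    sc.stop()
  }
}
```
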
/mllib/src/main/scala/com/awebone/spark/WordCountScala.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.spark
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | object WordCountScala {
7 | def main(args: Array[String]): Unit = {
8 | // Get the program entry point
9 | val sparkConf: SparkConf = new SparkConf()
10 | sparkConf.setAppName(WordCountScala.getClass.getSimpleName)
11 | sparkConf.setMaster("local")
12 | val sparkContext: SparkContext = new SparkContext(sparkConf)
13 |
14 | // WordCount
15 | val linesRDD: RDD[String] = sparkContext.textFile(args(0))
16 | val wordRDD: RDD[String] = linesRDD.flatMap(_.split(" "))
17 | val wordAndOneRDD: RDD[(String, Int)] = wordRDD.map((_, 1))
18 | val wordsCountRDD = wordAndOneRDD.reduceByKey((x: Int, y: Int) => x + y)
19 | wordsCountRDD.foreach(x => println(x._1, x._2))
20 | wordsCountRDD.saveAsTextFile(args(1))
21 |
22 | sparkContext.stop()
23 | }
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/akka_rpc/src/main/scala/com/awebone/yarn/Message.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.yarn
2 |
3 | // Case classes used for pattern matching
4 |
5 | // Registration message: nodemanager -> resourcemanager
6 | case class RegisterNodeManager(val nodemanagerid: String, val memory: Int, val cpu: Int)
7 |
8 | // Resources: it is not that a task asks for a certain amount of resources and is handed exactly that;
9 | // rather, the task is launched on whichever node has enough compute resources available
10 |
11 |
12 | // Registration-complete message: resourcemanager -> nodemanager
13 | case class RegisteredNodeManager(val resourcemanagerhostname: String)
14 |
15 |
16 | // Heartbeat message: nodemanager -> resourcemanager
17 | case class Heartbeat(val nodemanagerid: String)
18 |
19 | /**
20 | * To keep track of how many resources each node in the cluster has,
21 | * the RM wraps every node's resources in a NodeManagerInfo object
22 | * and maintains a collection of NodeManagerInfo objects.
23 | */
24 | class NodeManagerInfo(val nodemanagerid: String, val memory: Int, val cpu: Int) {
25 | // Stores the last heartbeat time of this NodeManager
26 | // _ is the default value
27 | var lastHeartBeatTime: Long = _
28 | }
29 |
30 | // Singletons, used purely as marker messages
31 | case object SendMessage // just a marker
32 | case object CheckTimeOut // also just a marker
--------------------------------------------------------------------------------
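
MyResourceManager.scala is not shown in this excerpt. Purely as a hypothetical sketch, the RM side could keep a NodeManagerInfo per node, refresh lastHeartBeatTime on each Heartbeat, and periodically send itself CheckTimeOut to drop stale nodes, mirroring the scheduler pattern used in MyNodeManager. The class name and timeout values below are placeholders, not the repository's implementation.

```scala
package com.awebone.yarn

import akka.actor.Actor
import scala.collection.mutable
import scala.concurrent.duration._

class TimeoutCheckerSketch extends Actor {
  private val nodes = mutable.Map[String, NodeManagerInfo]()

  override def preStart(): Unit = {
    import context.dispatcher
    // Periodically send CheckTimeOut to ourselves
    context.system.scheduler.schedule(0.millis, 5000.millis, self, CheckTimeOut)
  }

  override def receive: Receive = {
    case RegisterNodeManager(id, memory, cpu) =>
      val info = new NodeManagerInfo(id, memory, cpu)
      info.lastHeartBeatTime = System.currentTimeMillis()
      nodes.put(id, info)
      sender() ! RegisteredNodeManager("rm-host") // placeholder hostname
    case Heartbeat(id) =>
      nodes.get(id).foreach(_.lastHeartBeatTime = System.currentTimeMillis())
    case CheckTimeOut =>
      val now = System.currentTimeMillis()
      nodes.retain((_, info) => now - info.lastHeartBeatTime < 15000) // 15 s timeout, placeholder
  }
}
```
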
/flink-train/src/main/scala/com/awebone/flink/connetcor/FileSystemSinkApp.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.flink.connetcor
2 |
3 |
4 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
5 | import org.apache.flink.streaming.connectors.fs.StringWriter
6 | import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer}
7 |
8 | object FileSystemSinkApp {
9 | def main(args: Array[String]): Unit = {
10 | System.setProperty("HADOOP_USER_NAME","hadoop")
11 | val env = StreamExecutionEnvironment.getExecutionEnvironment
12 | val data = env.socketTextStream("hadoop04",9999)
13 |
14 | data.print().setParallelism(1)
15 | val filepath = "/tmpdata/flink/hdfssink"
16 |
17 | val sink = new BucketingSink[String](filepath)
18 | sink.setBucketer(new DateTimeBucketer[String]("yyyy-MM-dd--HHmm"))
19 | sink.setWriter(new StringWriter())
20 | sink.setBatchRolloverInterval(20)
21 |
22 | data.addSink(sink)
23 | env.execute("FileSystemSinkApp")
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyNamenode.java:
--------------------------------------------------------------------------------
1 | package com.awebone.hadoop_rpc;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.HadoopIllegalArgumentException;
6 | import org.apache.hadoop.conf.Configuration;
7 | import org.apache.hadoop.ipc.RPC;
8 | import org.apache.hadoop.ipc.RPC.Server;
9 |
10 | public class MyNamenode {
11 |
12 | public static void main(String[] args) {
13 |
14 |
15 | try {
16 |
17 | /**
18 | * new MyServerImpl().hello() .getName()
19 | */
20 | Server server = new RPC.Builder(new Configuration())
21 | .setProtocol(MyServerProtocal.class)
22 | .setInstance(new MyServerImpl())
23 | .setBindAddress("localhost")
24 | .setPort(9988)
25 | .build();
26 |
27 |
28 | server.start();
29 | System.out.println("SERVER START ......");
30 |
31 |
32 | } catch (HadoopIllegalArgumentException e) {
33 | e.printStackTrace();
34 | } catch (IOException e) {
35 | e.printStackTrace();
36 | }
37 |
38 |
39 |
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/weblog/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.awebone
6 | weblog
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | weblog
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | junit
20 | junit
21 | 3.8.1
22 | test
23 |
24 |
25 |
26 | org.apache.hadoop
27 | hadoop-client
28 | 2.7.6
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/weblog/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/mllib/src/main/java/com/awebone/spark/WordCountJava8.java:
--------------------------------------------------------------------------------
1 | package com.awebone.spark;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.api.java.JavaPairRDD;
5 | import org.apache.spark.api.java.JavaRDD;
6 | import org.apache.spark.api.java.JavaSparkContext;
7 | import scala.Tuple2;
8 |
9 | import java.util.Arrays;
10 |
11 | public class WordCountJava8 {
12 | public static void main(String[] args) {
13 | // Get the program entry point
14 | SparkConf sparkConf = new SparkConf();
15 | sparkConf.setAppName("WordCountJava8");
16 | sparkConf.setMaster("local");
17 | JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
18 |
19 | // Load the data
20 | JavaRDD<String> linesRDD = javaSparkContext.textFile("hdfs://myha/wc/input");
21 |
22 | // Compute
23 | JavaRDD<String> rdd1 = linesRDD.flatMap(s -> Arrays.asList(s.split(" ")).iterator());
24 | JavaPairRDD<String, Integer> rdd2 = rdd1.mapToPair(s -> new Tuple2<>(s, 1));
25 | JavaPairRDD<String, Integer> rdd3 = rdd2.reduceByKey((x, y) -> x + y);
26 |
27 | rdd3.foreach(t -> System.out.println(t._1 + "\t" + t._2));
28 |
29 | javaSparkContext.stop();
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Awebone
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/akka_rpc/src/main/scala/com/awebone/akka_rpc/Worker.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.akka_rpc
2 |
3 | import akka.actor.{Actor, ActorSelection, ActorSystem, Props}
4 | import com.typesafe.config.ConfigFactory
5 |
6 | class Worker extends Actor{
7 |
8 | override def preStart(): Unit = {
9 | // Specify which actor, of which ActorSystem, on which node to connect to
10 | val connectStr = "akka.tcp://MasterActorSystem@localhost:6789/user/master"
11 | val selection: ActorSelection = context.actorSelection(connectStr)
12 |
13 | selection ! "hello"
14 | }
15 |
16 | override def receive: Receive = {
17 | case "hi" => {
18 | println("master send hi")
19 | }
20 |
21 | case _ => println("非法消息")
22 | }
23 | }
24 |
25 | object WorkerRun{
26 | def main(args: Array[String]): Unit = {
27 | val hostname = "localhost"
28 | val strConfig =
29 | s"""
30 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider"
31 | |akka.remote.netty.tcp.hostname = ${hostname}
32 | """.stripMargin
33 |
34 | val config = ConfigFactory.parseString(strConfig)
35 | val as = ActorSystem("WorkerActorSystem", config)
36 |
37 | as.actorOf(Props(new Worker()), "worker")
38 | }
39 | }
--------------------------------------------------------------------------------
/akka_rpc/src/main/scala/com/awebone/akka_rpc/Master.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.akka_rpc
2 |
3 | import akka.actor.{Actor, ActorSystem, Props}
4 | import com.typesafe.config.ConfigFactory
5 |
6 | class Master extends Actor{
7 |
8 | override def preStart(): Unit = {
9 | // Business-logic initialization
10 | println("prestart")
11 | }
12 |
13 | // Works like a run() method: messages arrive here while business logic is handled
14 | override def receive: Receive = {
15 | case "hello" => {
16 | // Simulates a business method that produces a result
17 | println("receive hi")
18 |
19 | val result = "hi"
20 | // sender() is whoever sent the message
21 | sender() ! result
22 | }
23 |
24 | case _ => println("非法新消息")
25 | }
26 | }
27 |
28 | object MasterRun{
29 | def main(args: Array[String]): Unit = {
30 | val strConfig =
31 | """
32 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider"
33 | |akka.remote.netty.tcp.hostname =localhost
34 | |akka.remote.netty.tcp.port=6789
35 | """.stripMargin
36 |
37 | val config = ConfigFactory.parseString(strConfig)
38 | val as = ActorSystem("MasterActorSystem",config)
39 |
40 | as.actorOf(Props(new Master()), "master")
41 | println("MasterActorSystem init")
42 | }
43 | }
--------------------------------------------------------------------------------
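
A hypothetical local smoke test (not in the repository), assuming both ActorSystems can bind on the same machine: start the Master first, then the Worker, and expect "receive hi" on the master side and "master send hi" on the worker side.

```scala
import com.awebone.akka_rpc.{MasterRun, WorkerRun}

object LocalAkkaRpcDemo {
  def main(args: Array[String]): Unit = {
    MasterRun.main(Array.empty[String]) // MasterActorSystem on localhost:6789
    Thread.sleep(2000)                  // give the master a moment to bind
    WorkerRun.main(Array.empty[String]) // worker connects, sends "hello", gets "hi" back
  }
}
```
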
/dmp/src/main/resources/hive-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | javax.jdo.option.ConnectionURL
4 | jdbc:mysql://hadoop01:3306/hivedb_ms?createDatabaseIfNotExist=true
5 | JDBC connect string for a JDBC metastore
6 |
7 |
8 | javax.jdo.option.ConnectionDriverName
9 | com.mysql.jdbc.Driver
10 | Driver class name for a JDBC metastore
11 |
12 |
13 | javax.jdo.option.ConnectionUserName
14 | root
15 | username to use against metastore database
16 |
17 |
18 | javax.jdo.option.ConnectionPassword
19 | root
20 | password to use against metastore database
21 |
22 |
23 |
24 | hive.server2.thrift.port
25 | 10000
26 |
27 |
28 | hive.server2.thrift.bind.host
29 | hadoop04
30 |
31 |
32 |
33 | hive.metastore.uris
34 | thrift://hadoop04:9083
35 |
36 |
37 |
--------------------------------------------------------------------------------
/flink-train/src/main/resources/hive-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | javax.jdo.option.ConnectionURL
4 | jdbc:mysql://hadoop01:3306/hivedb_ms?createDatabaseIfNotExist=true
5 | JDBC connect string for a JDBC metastore
6 |
7 |
8 | javax.jdo.option.ConnectionDriverName
9 | com.mysql.jdbc.Driver
10 | Driver class name for a JDBC metastore
11 |
12 |
13 | javax.jdo.option.ConnectionUserName
14 | root
15 | username to use against metastore database
16 |
17 |
18 | javax.jdo.option.ConnectionPassword
19 | root
20 | password to use against metastore database
21 |
22 |
23 |
24 | hive.server2.thrift.port
25 | 10000
26 |
27 |
28 | hive.server2.thrift.bind.host
29 | hadoop04
30 |
31 |
32 |
33 | hive.metastore.uris
34 | thrift://hadoop04:9083
35 |
36 |
37 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/tags/DeviceTag.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.tags
2 |
3 | import com.awebone.dmp.Logs
4 | import com.awebone.dmp.constants.AdTagConstants
5 |
6 | import scala.collection.mutable
7 |
8 | /**
9 | * 4) Device: operating system | network type | carrier
10 | Device operating system
11 | 1 Android D0001001
12 | 2 IOS D0001002
13 | 3 Winphone D0001003
14 | 4 其他 D0001004
15 | Device network type
16 | WIFI D0002001
17 | 4G D0002002
18 | 3G D0002003
19 | 2G D0002004
20 | NWTWORKOTHER D0004004
21 | Device carrier
22 | 移动 D0003001
23 | 联通 D0003002
24 | 电信 D0003003
25 | OPERATOROTHER D0003004
26 | */
27 | object DeviceTag extends Tags {
28 | override def extractTag(logs: Logs) = {
29 | val mMap = mutable.Map[String, Int]()
30 | // Device operating system: the client field
31 | if(logs.client != null) {
32 | mMap.put(AdTagConstants.PREFIX_AD_DEVICE_TAG + logs.client, 1)
33 | }
34 | // Network type: networkmannerid
35 | if(logs.networkmannerid != null) {
36 | mMap.put(AdTagConstants.PREFIX_AD_NETWORK_TAG + logs.networkmannerid, 1)
37 | }
38 |
39 | // Carrier: ispid
40 | if(logs.ispid != null) {
41 | mMap.put(AdTagConstants.PREFIX_AD_ISP_TAG + logs.ispid, 1)
42 | }
43 | mMap.toMap
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/dmp/src/main/resources/hbase-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 |
26 | hbase.rootdir
27 | hdfs://myha/myhbase
28 |
29 |
30 |
31 | hbase.cluster.distributed
32 | true
33 |
34 |
35 |
36 | hbase.zookeeper.quorum
37 | hadoop01:2181,hadoop02:2181,hadoop03:2181
38 |
39 |
40 |
--------------------------------------------------------------------------------
/flink-train/src/main/resources/hbase-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 |
26 | hbase.rootdir
27 | hdfs://myha/myhbase
28 |
29 |
30 |
31 | hbase.cluster.distributed
32 | true
33 |
34 |
35 |
36 | hbase.zookeeper.quorum
37 | hadoop01:2181,hadoop02:2181,hadoop03:2181
38 |
39 |
40 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/util/Utils.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.util
2 |
3 | import org.apache.commons.lang3.StringUtils
4 |
5 | object Utils {
6 | def parseInt(str:String):Int = {
7 | if(StringUtils.isEmpty(str)) {
8 | 0
9 | } else {
10 | str.toInt
11 | }
12 | }
13 |
14 | def parseDouble(str:String):Double = {
15 | if(StringUtils.isEmpty(str)) {
16 | 0.0
17 | } else {
18 | str.toDouble
19 | }
20 | }
21 |
22 | //yyyy-MM-dd hh:mm:ss--->hh
23 | def fmtHour(str: String):Option[String] = {
24 | if(StringUtils.isEmpty(str)) {
25 | None
26 | } else {
27 | Some(str.substring(str.indexOf(" ") + 1, str.indexOf(" ") + 3))
28 | }
29 | }
30 |
31 | //yyyy-MM-dd hh:mm:ss--->yyyy-MM-dd
32 | def fmtDate(str: String):Option[String] = {
33 | if(StringUtils.isEmpty(str)) {
34 | None
35 | } else {
36 | Some(str.substring(0, str.indexOf(" ")))
37 | }
38 | }
39 |
40 | // Pad a string to two characters
41 | def fulfill(str:String) = {
42 | if(str != null && str.length > 1) {
43 | str
44 | } else if(!"".equals(str) && str.length == 1){
45 | 0 + "" + str
46 | } else {
47 | "other"
48 | }
49 | }
50 | // Pad a number to two digits
51 | def fulfill(num:Int) = {
52 | if(num >= 0 && num < 10) {
53 | "0" + num
54 | } else {
55 | "" + num
56 | }
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
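
A small usage sketch (not in the repository) showing what the helpers above return; the object name is hypothetical.

```scala
import com.awebone.dmp.util.Utils

object UtilsDemo {
  def main(args: Array[String]): Unit = {
    println(Utils.fulfill(3))                     // "03"
    println(Utils.fulfill("7"))                   // "07"
    println(Utils.fmtDate("2016-10-01 06:19:17")) // Some(2016-10-01)
    println(Utils.fmtHour("2016-10-01 06:19:17")) // Some(06)
    println(Utils.parseInt(""))                   // 0
  }
}
```
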
/dmp/data/data.txt:
--------------------------------------------------------------------------------
1 | 0bb49045000057eee4ed3a580019ca06,0,0,0,100002,未知,26C7B9C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,139.227.161.115,com.apptreehot.horse,马上赚,AQ+KIQeBhehxf6xf98BFFnl+CV00p,A10%E55F%BC%E6%AO%B%,1,4.1.1,,760,980,,,上海市,上海市,4,未知,3,Wifi,0,0,2,插屏,1,2,6,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+KIQeBhexf6x988FFnl+CVOOp,1,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17,,
2 | 0bfbf7c8000057eee4ed2a0b000ca4d3,0,0,0,100002,未知,26C07B8C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,58.47.147.169,cn.touchnagic.game.cllubpa2121bvnoolgwwel,其他,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,lenovo+A500,1,2.3.5,,480,800,,,湘南省,益阳市,4,未知,3,Wifi,0,0,2,插屏,1,2,999,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,2,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17 ,,
3 | 0bb49045000057eee4ed3a580019ca06,0,0,0,100002,未知,26C7B9C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,139.227.161.115,com.apptreehot.horse,马上赚,AQ+KIQeBhehxf6xf98BFFnl+CV00p,A10%E55F%BC%E6%AO%B%,1,4.1.1,,760,980,,,上海市,上海市,4,未知,3,Wifi,0,0,2,插屏,1,2,6,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+KIQeBhexf6x988FFnl+CVOOp,1,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17,,
4 | 0bfbf7c8000057eee4ed2a0b000ca4d3,0,0,0,100002,未知,26C07B8C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,58.47.147.169,cn.touchnagic.game.cllubpa2121bvnoolgwwel,其他,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,lenovo+A500,1,2.3.5,,480,800,,,湘南省,益阳市,4,未知,3,Wifi,0,0,2,插屏,1,2,999,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,2,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17 ,,
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/etl/DMPLogETLOps.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.etl
2 |
3 | import com.awebone.dmp.Logs
4 | import org.apache.log4j.{Level, Logger}
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.spark.serializer.KryoSerializer
8 | import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
9 |
10 | /**
11 | * Log data cleaning (ETL)
12 | *
13 | * Requirement 1: convert the data to parquet format
14 | * Requirement 2: use KryoSerializer for serialization
15 | * Requirement 3: compress the parquet files with Snappy
16 | *
17 | * Analysis: with Spark Core alone, Kryo and Snappy are easy to satisfy but writing parquet is awkward,
18 | * while Spark SQL handles parquet trivially, so the original code only needs a small rework
19 | */
20 | object DMPLogETLOps {
21 | def main(args: Array[String]): Unit = {
22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN)
25 |
26 | val conf: SparkConf = new SparkConf().setAppName("DMPLogETL").setMaster("local[*]")
27 | .set("spark.serializer",classOf[KryoSerializer].getName)
28 | .registerKryoClasses(Array(classOf[Logs])) // Requirement 2: KryoSerializer serialization
29 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
30 | import spark.implicits._
31 |
32 | val lines:RDD[String] = spark.sparkContext.textFile("file:///D:\\workplace\\dmp\\data\\data.txt")
33 |
34 | val retDS: Dataset[Logs] = lines.map(line => {
35 | val log: Logs = Logs.line2Logs(line)
36 | log
37 | }).toDS()
38 |
39 | /**
40 | * Requirement 1: convert the data to parquet format
41 | * Requirement 3: compress the parquet files with Snappy
42 | */
43 | retDS.write.mode(SaveMode.Overwrite).parquet("file:///D:\\workplace\\dmp\\data\\out\\")
44 |
45 | spark.stop()
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
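
A hedged sketch (not part of the repository): Snappy is Spark SQL's default parquet codec, but it can be pinned explicitly, and the ETL output can be read back for a quick sanity check. The object name and path are placeholders.

```scala
import org.apache.spark.sql.SparkSession

object EtlOutputCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("EtlOutputCheck")
      .master("local[*]")
      .config("spark.sql.parquet.compression.codec", "snappy") // make requirement 3 explicit
      .getOrCreate()

    // Read back the parquet written by the ETL job (placeholder path)
    val logsDF = spark.read.parquet("file:///path/to/dmp/data/out")
    logsDF.printSchema()
    println(s"rows: ${logsDF.count()}")
    spark.stop()
  }
}
```
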
/flink-train/src/main/scala/com/awebone/flink/project/MySQLSource.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.flink.project
2 |
3 |
4 | import java.sql.{Connection, DriverManager, PreparedStatement}
5 |
6 | import org.apache.flink.configuration.Configuration
7 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
8 |
9 | import scala.collection.mutable
10 |
11 | /**
12 | * Custom parallel MySQL source
13 | */
14 | class MySQLSource extends RichParallelSourceFunction[mutable.HashMap[String, String]] {
15 | var connection: Connection = null
16 | var ps: PreparedStatement = null
17 |
18 | // Open the JDBC connection
19 | override def open(parameters: Configuration): Unit = {
20 | super.open(parameters)
21 | val driver = "com.mysql.jdbc.Driver"
22 | val url = "jdbc:mysql://hadoop01:3306/flink"
23 | val user = "root"
24 | val password = "root"
25 | Class.forName(driver)
26 | connection = DriverManager.getConnection(url, user, password)
27 |
28 | val sql = "select user_id,domain from user_domain_config"
29 | ps = connection.prepareStatement(sql)
30 | }
31 |
32 | // The function executed by the source
33 | override def run(sourceContext: SourceFunction.SourceContext[mutable.HashMap[String, String]]): Unit = {
34 | val resultSet = ps.executeQuery()
35 | val collect = mutable.HashMap[String,String]()
36 |
37 | // Put the query results into the HashMap
38 | while (resultSet.next()){
39 | collect.put(resultSet.getNString("domain"), resultSet.getNString("user_id"))
40 | }
41 | sourceContext.collect(collect)
42 | }
43 |
44 | override def cancel(): Unit = {}
45 |
46 | override def close(): Unit = {
47 | if(ps != null){
48 | ps.close()
49 | }
50 | if(connection != null){
51 | connection.close()
52 | }
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
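
A minimal usage sketch (not in the repository) wiring the source above into a Flink job; in the real pipeline the domain -> user_id map would be connected with the cdnlog stream, here it is simply printed. The object name is hypothetical.

```scala
import org.apache.flink.streaming.api.scala._
import com.awebone.flink.project.MySQLSource

object MySQLSourceDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.addSource(new MySQLSource)
      .setParallelism(1) // one copy is enough for a small config table
      .print()
    env.execute("MySQLSourceDemo")
  }
}
```
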
/dmp/src/main/resources/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | fs.defaultFS
23 | hdfs://myha/
24 |
25 |
26 |
27 |
28 | hadoop.tmp.dir
29 | /home/hadoop/data/hadoopdata/
30 |
31 |
32 |
33 |
34 | ha.zookeeper.quorum
35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181
36 |
37 |
38 |
39 |
40 | ha.zookeeper.session-timeout.ms
41 | 1000
42 | ms
43 |
44 |
45 |
46 | topology.script.file.name
47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh
48 |
49 |
50 |
51 | hadoop.proxyuser.hadoop.hosts
52 | *
53 |
54 |
55 | hadoop.proxyuser.hadoop.groups
56 | *
57 |
58 |
59 |
--------------------------------------------------------------------------------
/weblog/src/main/java/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | fs.defaultFS
23 | hdfs://myha/
24 |
25 |
26 |
27 |
28 | hadoop.tmp.dir
29 | /home/hadoop/data/hadoopdata/
30 |
31 |
32 |
33 |
34 | ha.zookeeper.quorum
35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181
36 |
37 |
38 |
39 |
40 | ha.zookeeper.session-timeout.ms
41 | 1000
42 | ms
43 |
44 |
45 |
46 | topology.script.file.name
47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh
48 |
49 |
50 |
51 | hadoop.proxyuser.hadoop.hosts
52 | *
53 |
54 |
55 | hadoop.proxyuser.hadoop.groups
56 | *
57 |
58 |
59 |
--------------------------------------------------------------------------------
/mllib/src/main/resources/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | fs.defaultFS
23 | hdfs://myha/
24 |
25 |
26 |
27 |
28 | hadoop.tmp.dir
29 | /home/hadoop/data/hadoopdata/
30 |
31 |
32 |
33 |
34 | ha.zookeeper.quorum
35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181
36 |
37 |
38 |
39 |
40 | ha.zookeeper.session-timeout.ms
41 | 1000
42 | ms
43 |
44 |
45 |
46 | topology.script.file.name
47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh
48 |
49 |
50 |
51 | hadoop.proxyuser.hadoop.hosts
52 | *
53 |
54 |
55 | hadoop.proxyuser.hadoop.groups
56 | *
57 |
58 |
59 |
--------------------------------------------------------------------------------
/flink-train/src/main/resources/core-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | fs.defaultFS
23 | hdfs://myha/
24 |
25 |
26 |
27 |
28 | hadoop.tmp.dir
29 | /home/hadoop/data/hadoopdata/
30 |
31 |
32 |
33 |
34 | ha.zookeeper.quorum
35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181
36 |
37 |
38 |
39 |
40 | ha.zookeeper.session-timeout.ms
41 | 1000
42 | ms
43 |
44 |
45 |
46 | topology.script.file.name
47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh
48 |
49 |
50 |
51 | hadoop.proxyuser.hadoop.hosts
52 | *
53 |
54 |
55 | hadoop.proxyuser.hadoop.groups
56 | *
57 |
58 |
59 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/report/ProvinceCityQuantityJob.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.report
2 |
3 | import java.util.Properties
4 |
5 | import org.apache.log4j.{Level, Logger}
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
8 |
9 | /**
10 | * Province: province
11 | * City: city
12 | * Results are stored in a MySQL database
13 | * select
14 | * province,
15 | * city,
16 | * count(1)
17 | * from logs
18 | * group by province, city
19 | **/
20 | object ProvinceCityQuantityJob {
21 | def main(args: Array[String]): Unit = {
22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN)
25 |
26 | if(args == null || args.length < 2){
27 | println(
28 | """Parameter Errors! Usage:
29 | |inputpath : input path
30 | |table : mysql table name
31 | """.stripMargin)
32 | System.exit(-1)
33 | }
34 | val Array(inputpath, table) = args
35 |
36 | val conf: SparkConf = new SparkConf().setAppName("ProvinceCityQuantityJob").setMaster("local[*]")
37 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
38 |
39 | val input: DataFrame = spark.read.parquet(inputpath)
40 | input.createOrReplaceTempView("logs")
41 |
42 | val sql =
43 | """
44 | |select
45 | | date_sub(current_date(), 0) data_date,
46 | | provincename province,
47 | | cityname city,
48 | | count(1) as countz
49 | |from logs
50 | |group by provincename, cityname
51 | """.stripMargin
52 |
53 | val url = "jdbc:mysql://hadoop01:3306/dmp"
54 | val properties = new Properties
55 | properties.put("user","root")
56 | properties.put("password","root")
57 |
58 | spark.sql(sql).write.mode(SaveMode.Append).jdbc(url,table,properties)
59 |
60 | spark.stop()
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
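
A hedged sketch (not in the repository) that reads the job's MySQL output back through Spark's JDBC source to eyeball the per-province/city counts; it reuses the URL and credentials from the job above, and the object name is a placeholder.

```scala
import java.util.Properties
import org.apache.spark.sql.SparkSession

object ProvinceCityQuantityCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ProvinceCityQuantityCheck").master("local[*]").getOrCreate()

    val props = new Properties()
    props.put("user", "root")
    props.put("password", "root")

    // Read the p_c_quantity table written by ProvinceCityQuantityJob
    val df = spark.read.jdbc("jdbc:mysql://hadoop01:3306/dmp", "p_c_quantity", props)
    df.orderBy(df("countz").desc).show(20, truncate = false)

    spark.stop()
  }
}
```
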
/dmp/src/main/scala/com/awebone/dmp/etl/DMPLogETLHDFSOps.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.etl
2 |
3 | import com.awebone.dmp.Logs
4 | import org.apache.log4j.{Level, Logger}
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.spark.serializer.KryoSerializer
8 | import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
9 |
10 | /**
11 | * Log data cleaning (ETL)
12 | *
13 | * Requirement 1: convert the data to parquet format
14 | * Requirement 2: use KryoSerializer for serialization
15 | * Requirement 3: compress the parquet files with Snappy
16 | *
17 | * Analysis: with Spark Core alone, Kryo and Snappy are easy to satisfy but writing parquet is awkward,
18 | * while Spark SQL handles parquet trivially, so the original code only needs a small rework
19 | */
20 | object DMPLogETLHDFSOps {
21 | def main(args: Array[String]): Unit = {
22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN)
25 |
26 | if(args == null || args.length < 2){
27 | println(
28 | """Parameter Errors! Usage:
29 | |inputpath : input path
30 | |outputpath : output path
31 | """.stripMargin)
32 | System.exit(-1)
33 | }
34 | val Array(inputpath, outputpath) = args
35 |
36 | val conf: SparkConf = new SparkConf().setAppName("DMPLogETL").setMaster("local[*]")
37 | .set("spark.serializer",classOf[KryoSerializer].getName)
38 | .registerKryoClasses(Array(classOf[Logs])) // Requirement 2: KryoSerializer serialization
39 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
40 | import spark.implicits._
41 |
42 | val lines:RDD[String] = spark.sparkContext.textFile(inputpath)
43 |
44 | val retDS: Dataset[Logs] = lines.map(line => {
45 | val log: Logs = Logs.line2Logs(line)
46 | log
47 | }).toDS()
48 |
49 | /**
50 | * Requirement 1: convert the data to parquet format
51 | * Requirement 3: compress the parquet files with Snappy
52 | */
53 | retDS.write.mode(SaveMode.Overwrite).parquet(outputpath)
54 |
55 | spark.stop()
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/weblog/src/main/java/com/awebone/pre/WebLogParse.java:
--------------------------------------------------------------------------------
1 | package com.awebone.pre;
2 |
3 | import java.text.ParseException;
4 | import java.text.SimpleDateFormat;
5 | import java.util.HashSet;
6 | import java.util.Locale;
7 | import java.util.Set;
8 |
9 | import com.awebone.bean.WebLogBean;
10 |
11 | public class WebLogParse {
12 | static SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US); // HH: access logs use a 24-hour clock
13 | static SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
14 | static Set<String> pages = new HashSet<String>();
15 | static {
16 | pages.add("/about");
17 | pages.add("/black-ip-list/");
18 | pages.add("/cassandra-clustor/");
19 | pages.add("/finance-rhive-repurchase/");
20 | pages.add("/hadoop-family-roadmap/");
21 | pages.add("/hadoop-hive-intro/");
22 | pages.add("/hadoop-zookeeper-intro/");
23 | pages.add("/hadoop-mahout-roadmap/");
24 | }
25 |
26 | public static WebLogBean parse(String line) throws ParseException {
27 | // The parameter is one line of the log
28 | String[] log_datas = line.split(" ");
29 | if (log_datas.length >= 12) {
30 | String addr = log_datas[0];
31 | String user = log_datas[2];
32 | String local_time = log_datas[3];
33 | // Parse the timestamp
34 | String format_time = sdf2.format(sdf1.parse(local_time.substring(1)));
35 | if (null == format_time || "".equals(format_time)) {
36 | format_time = "_invalid_";
37 | }
38 | String request = log_datas[6];
39 | String status = log_datas[8];
40 | String byte_sent = log_datas[9];
41 | String http_refer = log_datas[10];
42 | // Concatenate the user-agent fields
43 | StringBuffer sb = new StringBuffer();
44 | for (int i = 11; i < log_datas.length; i++) {
45 | sb.append(log_datas[i] + " ");
46 | }
47 | String user_agent = sb.substring(1, sb.length() - 2);
48 |
49 | WebLogBean bean = new WebLogBean(false, addr, user, format_time, request, status, byte_sent, http_refer,
50 | user_agent);
51 | // Check data validity
52 | if ("_invalid_".equals(format_time)) {
53 | bean.setValid(false);
54 | }
55 | if (Integer.parseInt(bean.getStatus()) > 400) {
56 | bean.setValid(false);
57 | }
58 | if (pages.contains(bean.getRequest())) {
59 | bean.setValid(true);
60 | }
61 | return bean;
62 | }else{
63 | return null;
64 | }
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/weblog/src/main/java/com/awebone/pre/WebLogPreProcess.java:
--------------------------------------------------------------------------------
1 | package com.awebone.pre;
2 |
3 | import java.io.IOException;
4 | import java.text.ParseException;
5 |
6 | import org.apache.hadoop.conf.Configuration;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.LongWritable;
9 | import org.apache.hadoop.io.NullWritable;
10 | import org.apache.hadoop.io.Text;
11 | import org.apache.hadoop.mapreduce.Job;
12 | import org.apache.hadoop.mapreduce.Mapper;
13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
15 |
16 | import com.awebone.bean.WebLogBean;
17 |
18 | // Pre-process the raw data
19 | public class WebLogPreProcess {
20 | /**
21 | * @author Awebone
22 | * Map side:
23 | * one input line --- one log record --- one Hive row
24 | * split, wrap into a bean, emit, write out to HDFS
25 | * key: null
26 | * value: custom bean
27 | */
28 | static class WebLogPreProcessMapper extends Mapper<LongWritable, Text, NullWritable, WebLogBean> {
29 | @Override
30 | protected void map(LongWritable key, Text value,
31 | Mapper<LongWritable, Text, NullWritable, WebLogBean>.Context context)
32 | throws IOException, InterruptedException {
33 | String line = value.toString();
34 | try {
35 | WebLogBean webLogBean = WebLogParse.parse(line);
36 | if (webLogBean != null) {
37 | context.write(NullWritable.get(), webLogBean);
38 | }
39 | } catch (ParseException e) {
40 | e.printStackTrace();
41 | }
42 | }
43 | }
44 |
45 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
46 | System.setProperty("HADOOP_USER_NAME", "hadoop");
47 | Configuration conf = new Configuration();
48 | conf.set("fs.defaultFS", "hdfs://myha/");
49 | Job job = Job.getInstance(conf);
50 |
51 | job.setJarByClass(WebLogPreProcess.class);
52 |
53 | job.setMapperClass(WebLogPreProcessMapper.class);
54 | job.setOutputKeyClass(NullWritable.class);
55 | job.setOutputValueClass(WebLogBean.class);
56 |
57 | FileInputFormat.setInputPaths(job, new Path("/weblog/20200221"));
58 | FileOutputFormat.setOutputPath(job, new Path("/weblog/pre/20200221"));
59 |
60 | // Reducers not needed; set to 0
61 | job.setNumReduceTasks(0);
62 |
63 | boolean res = job.waitForCompletion(true);
64 | System.exit(res ? 0 : 1);
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/flink-train/src/main/scala/com/awebone/flink/project/MockKafkaProducer.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.flink.project
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util.{Date, Properties}
5 |
6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
7 | import org.apache.kafka.common.serialization.StringSerializer
8 |
9 | import scala.util.Random
10 |
11 | object MockKafkaProducer {
12 |
13 | private def getLevels() = {
14 | val levels = Array[String]("M","E")
15 |
16 | levels(new Random().nextInt(levels.length))
17 | }
18 |
19 | private def getIps() = {
20 | val ips = Array[String]("233.104.18.110",
21 | "113.101.75.194",
22 | "27.17.127.135",
23 | "185.225.139.16",
24 | "112.1.66.34",
25 | "175.148.211.190",
26 | "183.227.58.21",
27 | "59.83.198.84",
28 | "117.28.38.28",
29 | "117.59.39.169")
30 |
31 | ips(new Random().nextInt(ips.length))
32 | }
33 |
34 | private def getDomains() = {
35 | val domains = Array[String]("v1.awebone.com", "v2.awebone.com", "v3.awebone.com", "v4.awebone.com", "vmi.awebone.com")
36 |
37 | domains(new Random().nextInt(domains.length))
38 | }
39 |
40 | private def getTraffic() = new Random().nextInt(10000)
41 |
42 | def main(args: Array[String]): Unit = {
43 | val properties: Properties = new Properties()
44 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092")
45 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk
46 | // properties.put("metadata.broker.list", "hadoop04:9092") // 声明kafka broker
47 | properties.setProperty("key.serializer", classOf[StringSerializer].getName)
48 | properties.setProperty("value.serializer", classOf[StringSerializer].getName)
49 |
50 | val producer = new KafkaProducer[String, String](properties)
51 | val topic = "cdnlog"
52 |
53 | while (true){
54 | val builder = new StringBuilder()
55 | builder.append("cdnlog").append("\t")
56 | .append("CN").append("\t")
57 | .append(getLevels()).append("\t")
58 | .append(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())).append("\t")
59 | .append(getIps()).append("\t")
60 | .append(getDomains()).append("\t")
61 | .append(getTraffic()).append("\t")
62 |
63 | println(builder.toString())
64 | val pr = new ProducerRecord[String, String](topic, builder.toString())
65 | producer.send(pr)
66 | Thread.sleep(2000)
67 | }
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
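
A hedged consumer-side sketch (the repository's LogAnalysis.scala is not shown here): read the tab-separated cdnlog records produced above and extract (domain, traffic) pairs. The Kafka connector class name varies by Flink version (older releases use FlinkKafkaConsumer010/011); the universal connector is assumed, and the group id is a placeholder.

```scala
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer

object CdnLogConsumerSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val props = new Properties()
    props.setProperty("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092")
    props.setProperty("group.id", "cdnlog-sketch")

    env.addSource(new FlinkKafkaConsumer[String]("cdnlog", new SimpleStringSchema(), props))
      .map(_.split("\t"))
      .filter(_.length >= 7)         // cdnlog, CN, level, time, ip, domain, traffic
      .map(f => (f(5), f(6).toLong)) // (domain, traffic)
      .print()

    env.execute("CdnLogConsumerSketch")
  }
}
```
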
/akka_rpc/src/main/scala/com/awebone/yarn/MyNodeManager.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.yarn
2 |
3 | import java.util.UUID
4 |
5 | import akka.actor.{Actor, ActorSelection, ActorSystem, Props}
6 | import com.typesafe.config.ConfigFactory
7 |
8 |
9 | class MyNodeManager(val resourcemanagerhostname: String, val resourcemanagerport: Int, val memory: Int, val cpu: Int) extends Actor {
10 |
11 | var nodemanagerid: String = _
12 | var rmRef: ActorSelection = _
13 |
14 | override def preStart(): Unit = {
15 | // remote path: akka.tcp://(ActorSystem name)@(remote host):(remote port)/user/(actor name)
16 | rmRef = context.actorSelection(s"akka.tcp://${Constant.RMAS}@${resourcemanagerhostname}:${resourcemanagerport}/user/${Constant.RMA}")
17 |
18 | // val nodemanagerid:String
19 | // val memory:Int
20 | // val cpu:Int
21 | nodemanagerid = UUID.randomUUID().toString
22 | // send the registration message to the ResourceManager
23 | rmRef ! RegisterNodeManager(nodemanagerid, memory, cpu)
24 | }
25 |
26 | override def receive: Receive = {
27 | case RegisteredNodeManager(masterURL) => {
28 | println(masterURL);
29 |
30 | /**
31 |  * initialDelay: FiniteDuration  -- delay before the first execution
32 |  * interval: FiniteDuration      -- interval between subsequent executions
33 |  * receiver: ActorRef            -- the actor that receives the message
34 |  * message: Any                  -- the message to send
35 |  */
36 | import scala.concurrent.duration._
37 | import context.dispatcher
38 | // every 4 seconds send SendMessage to self, which in turn sends a heartbeat
39 | context.system.scheduler.schedule(0 millis, 4000 millis, self, SendMessage)
40 | }
41 |
42 | case SendMessage => {
43 |
44 | // send a heartbeat to the master (ResourceManager)
45 | rmRef ! Heartbeat(nodemanagerid)
46 |
47 | println(Thread.currentThread().getId)
48 | }
49 | }
50 | }
51 |
52 | object MyNodeManager {
53 | def main(args: Array[String]): Unit = {
54 | val HOSTNAME = args(0)
55 | val RM_HOSTNAME = args(1)
56 | val RM_PORT = args(2).toInt
57 | val NODEMANAGER_MEMORY = args(3).toInt
58 | val NODEMANAGER_CORE = args(4).toInt
59 | var NODEMANAGER_PORT = args(5).toInt
60 | val str =
61 | s"""
62 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider"
63 | |akka.remote.netty.tcp.hostname =${HOSTNAME}
64 | |akka.remote.netty.tcp.port=${NODEMANAGER_PORT}
65 | """.stripMargin
66 | val conf = ConfigFactory.parseString(str)
67 | val actorSystem = ActorSystem(Constant.NMAS, conf)
68 | actorSystem.actorOf(Props(new MyNodeManager(RM_HOSTNAME, RM_PORT, NODEMANAGER_MEMORY, NODEMANAGER_CORE)), Constant.NMA)
69 | }
70 | }
--------------------------------------------------------------------------------
/mllib/src/main/java/com/awebone/spark/WordCountJava7.java:
--------------------------------------------------------------------------------
1 | package com.awebone.spark;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.api.java.JavaPairRDD;
5 | import org.apache.spark.api.java.JavaRDD;
6 | import org.apache.spark.api.java.JavaSparkContext;
7 | import org.apache.spark.api.java.function.FlatMapFunction;
8 | import org.apache.spark.api.java.function.Function2;
9 | import org.apache.spark.api.java.function.PairFunction;
10 | import org.apache.spark.api.java.function.VoidFunction;
11 | import scala.Tuple2;
12 |
13 | import java.util.Arrays;
14 | import java.util.Iterator;
15 |
16 | public class WordCountJava7 {
17 | public static void main(String[] args) {
18 | // create the Spark entry point
19 | SparkConf sparkConf = new SparkConf();
20 | sparkConf.setAppName("WordCountJava7");
21 | sparkConf.setMaster("local");
22 | JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
23 |
24 | // load the input data
25 | JavaRDD<String> linesRDD = javaSparkContext.textFile("hdfs://myha/wc/input");
26 |
27 | // computation
28 | JavaRDD<String> wordsRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {
29 | @Override
30 | public Iterator<String> call(String s) throws Exception {
31 | return Arrays.asList(s.split(" ")).iterator();
32 | }
33 | });
34 |
35 | JavaPairRDD<String, Integer> wordAndOneRDD = wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
36 | @Override
37 | public Tuple2<String, Integer> call(String s) throws Exception {
38 | return new Tuple2<>(s, 1);
39 | }
40 | });
41 |
42 | JavaPairRDD<String, Integer> wordsCountRDD = wordAndOneRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
43 | @Override
44 | public Integer call(Integer integer, Integer integer2) throws Exception {
45 | return integer + integer2;
46 | }
47 | });
48 |
49 | JavaPairRDD<Integer, String> newWordsCountRDD = wordsCountRDD.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
50 | @Override
51 | public Tuple2<Integer, String> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
52 | return stringIntegerTuple2.swap();
53 | }
54 | });
55 | JavaPairRDD<Integer, String> sortedRDD = newWordsCountRDD.sortByKey(false);
56 | JavaPairRDD<String, Integer> lastSortWordCoundRDD = sortedRDD.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
57 | @Override
58 | public Tuple2<String, Integer> call(Tuple2<Integer, String> integerStringTuple2) throws Exception {
59 | return integerStringTuple2.swap();
60 | }
61 | });
62 |
63 | lastSortWordCoundRDD.foreach(new VoidFunction<Tuple2<String, Integer>>() {
64 | @Override
65 | public void call(Tuple2<String, Integer> t) throws Exception {
66 | System.out.println(t._1 + "\t" + t._2);
67 | }
68 | });
69 |
70 | javaSparkContext.stop();
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/akka_rpc/src/main/scala/com/awebone/yarn/MyResourceManager.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.yarn
2 |
3 | import akka.actor.{Actor, ActorSystem, Props}
4 | import com.typesafe.config.ConfigFactory
5 |
6 | import scala.collection.mutable
7 |
8 | class MyResourceManager(var hostname: String, var port: Int) extends Actor {
9 |
10 | // stores the info of every registered NodeManager, keyed by its id
11 | private var id2nodemanagerinfo = new mutable.HashMap[String, NodeManagerInfo]()
12 | // deduplicated set of all registered NodeManagers (backed by a HashSet)
13 | private var nodemanagerInfoes = new mutable.HashSet[NodeManagerInfo]()
14 |
15 | // runs once when the actor starts
16 | override def preStart(): Unit = {
17 | import scala.concurrent.duration._
18 | import context.dispatcher
19 |
20 | // schedule a task: every 5 seconds send CheckTimeOut to self
21 | context.system.scheduler.schedule(0 millis, 5000 millis, self, CheckTimeOut)
22 | }
23 |
24 | override def receive: Receive = {
25 |
26 | case RegisterNodeManager(nodemanagerid, memory, cpu) => {
27 | val nodeManagerInfo = new NodeManagerInfo(nodemanagerid, memory, cpu)
28 |
29 | // keep track of the newly registered NodeManager
30 | id2nodemanagerinfo.put(nodemanagerid, nodeManagerInfo)
31 | nodemanagerInfoes += nodeManagerInfo
32 |
33 | // reply with the master URL (the registration could also be persisted, e.g. to ZooKeeper)
34 | sender() ! RegisteredNodeManager(hostname + ":" + port)
35 | }
36 |
37 | case Heartbeat(nodemanagerid) => {
38 | val currentTime = System.currentTimeMillis()
39 | val nodeManagerInfo = id2nodemanagerinfo(nodemanagerid)
40 | nodeManagerInfo.lastHeartBeatTime = currentTime
41 |
42 | id2nodemanagerinfo(nodemanagerid) = nodeManagerInfo
43 | nodemanagerInfoes += nodeManagerInfo
44 | }
45 |
46 | // check for NodeManagers whose heartbeat has expired
47 | case CheckTimeOut => {
48 | val currentTime = System.currentTimeMillis()
49 |
50 | // a node is considered dead after 15 seconds without a heartbeat
51 | // filter: select the nodes that have timed out
52 | // foreach: remove each of them from both collections
53 | nodemanagerInfoes.filter(nm => currentTime - nm.lastHeartBeatTime > 15000)
54 | .foreach(deadnm => {
55 | nodemanagerInfoes -= deadnm
56 | id2nodemanagerinfo.remove(deadnm.nodemanagerid)
57 | })
58 | println("当前注册成功的节点数" + nodemanagerInfoes.size);
59 | }
60 | }
61 | }
62 |
63 | object MyResourceManager {
64 | def main(args: Array[String]): Unit = {
65 | val RESOURCEMANAGER_HOSTNAME = args(0)
66 | val RESOURCEMANAGER_PORT = args(1).toInt
67 |
68 | // build the runtime akka remote configuration
69 | val str =
70 | s"""
71 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider"
72 | |akka.remote.netty.tcp.hostname =${RESOURCEMANAGER_HOSTNAME}
73 | |akka.remote.netty.tcp.port=${RESOURCEMANAGER_PORT}
74 | """.stripMargin
75 |
76 | val conf = ConfigFactory.parseString(str)
77 | val actorSystem = ActorSystem(Constant.RMAS, conf)
78 |
79 | // start the ResourceManager actor
80 | actorSystem.actorOf(Props(new MyResourceManager(RESOURCEMANAGER_HOSTNAME, RESOURCEMANAGER_PORT)), Constant.RMA)
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
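Note: the message types and naming constants used by MyNodeManager and MyResourceManager above are defined in Message.scala and Constant.scala (part of this module, not reproduced in this excerpt). A minimal sketch consistent with how they are used, with illustrative constant values, would be:

package com.awebone.yarn

// registration / heartbeat protocol between NodeManager and ResourceManager
case class RegisterNodeManager(nodemanagerid: String, memory: Int, cpu: Int)
case class RegisteredNodeManager(masterURL: String)
case class Heartbeat(nodemanagerid: String)
case object SendMessage   // NodeManager self-message that triggers a heartbeat
case object CheckTimeOut  // ResourceManager self-message that triggers the liveness check

class NodeManagerInfo(val nodemanagerid: String, val memory: Int, val cpu: Int) {
  var lastHeartBeatTime: Long = System.currentTimeMillis() // updated on every Heartbeat
}

object Constant {
  // ActorSystem / actor names; the actual values in Constant.scala may differ
  val RMAS = "ResourceManagerActorSystem"
  val RMA = "ResourceManagerActor"
  val NMAS = "NodeManagerActorSystem"
  val NMA = "NodeManagerActor"
}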
/weblog/src/main/java/com/awebone/bean/VisitBean.java:
--------------------------------------------------------------------------------
1 | package com.awebone.bean;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.io.Writable;
8 |
9 | public class VisitBean implements Writable {
10 |
11 | private String session;
12 | private String remote_addr;
13 | private String inTime;
14 | private String outTime;
15 | private String inPage;
16 | private String outPage;
17 | private String referal;
18 | private int pageVisits;
19 |
20 | public void set(String session, String remote_addr, String inTime, String outTime, String inPage, String outPage, String referal, int pageVisits) {
21 | this.session = session;
22 | this.remote_addr = remote_addr;
23 | this.inTime = inTime;
24 | this.outTime = outTime;
25 | this.inPage = inPage;
26 | this.outPage = outPage;
27 | this.referal = referal;
28 | this.pageVisits = pageVisits;
29 | }
30 |
31 | public String getSession() {
32 | return session;
33 | }
34 |
35 | public void setSession(String session) {
36 | this.session = session;
37 | }
38 |
39 | public String getRemote_addr() {
40 | return remote_addr;
41 | }
42 |
43 | public void setRemote_addr(String remote_addr) {
44 | this.remote_addr = remote_addr;
45 | }
46 |
47 | public String getInTime() {
48 | return inTime;
49 | }
50 |
51 | public void setInTime(String inTime) {
52 | this.inTime = inTime;
53 | }
54 |
55 | public String getOutTime() {
56 | return outTime;
57 | }
58 |
59 | public void setOutTime(String outTime) {
60 | this.outTime = outTime;
61 | }
62 |
63 | public String getInPage() {
64 | return inPage;
65 | }
66 |
67 | public void setInPage(String inPage) {
68 | this.inPage = inPage;
69 | }
70 |
71 | public String getOutPage() {
72 | return outPage;
73 | }
74 |
75 | public void setOutPage(String outPage) {
76 | this.outPage = outPage;
77 | }
78 |
79 | public String getReferal() {
80 | return referal;
81 | }
82 |
83 | public void setReferal(String referal) {
84 | this.referal = referal;
85 | }
86 |
87 | public int getPageVisits() {
88 | return pageVisits;
89 | }
90 |
91 | public void setPageVisits(int pageVisits) {
92 | this.pageVisits = pageVisits;
93 | }
94 |
95 | public void readFields(DataInput in) throws IOException {
96 | this.session = in.readUTF();
97 | this.remote_addr = in.readUTF();
98 | this.inTime = in.readUTF();
99 | this.outTime = in.readUTF();
100 | this.inPage = in.readUTF();
101 | this.outPage = in.readUTF();
102 | this.referal = in.readUTF();
103 | this.pageVisits = in.readInt();
104 |
105 | }
106 |
107 | public void write(DataOutput out) throws IOException {
108 | out.writeUTF(session);
109 | out.writeUTF(remote_addr);
110 | out.writeUTF(inTime);
111 | out.writeUTF(outTime);
112 | out.writeUTF(inPage);
113 | out.writeUTF(outPage);
114 | out.writeUTF(referal);
115 | out.writeInt(pageVisits);
116 |
117 | }
118 |
119 | @Override
120 | public String toString() {
121 | return session + "\001" + remote_addr + "\001" + inTime + "\001" + outTime + "\001" + inPage + "\001" + outPage + "\001" + referal + "\001" + pageVisits;
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/weblog/src/main/java/com/awebone/bean/PageViewsBean.java:
--------------------------------------------------------------------------------
1 | package com.awebone.bean;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.io.Writable;
8 |
9 | public class PageViewsBean implements Writable {
10 |
11 | private String session;
12 | private String remote_addr;
13 | private String timestr;
14 | private String request;
15 | private int step;
16 | private String staylong;
17 | private String referal;
18 | private String useragent;
19 | private String bytes_send;
20 | private String status;
21 |
22 | public void set(String session, String remote_addr, String useragent, String timestr, String request, int step, String staylong, String referal, String bytes_send, String status) {
23 | this.session = session;
24 | this.remote_addr = remote_addr;
25 | this.useragent = useragent;
26 | this.timestr = timestr;
27 | this.request = request;
28 | this.step = step;
29 | this.staylong = staylong;
30 | this.referal = referal;
31 | this.bytes_send = bytes_send;
32 | this.status = status;
33 | }
34 |
35 | public String getSession() {
36 | return session;
37 | }
38 |
39 | public void setSession(String session) {
40 | this.session = session;
41 | }
42 |
43 | public String getRemote_addr() {
44 | return remote_addr;
45 | }
46 |
47 | public void setRemote_addr(String remote_addr) {
48 | this.remote_addr = remote_addr;
49 | }
50 |
51 | public String getTimestr() {
52 | return timestr;
53 | }
54 |
55 | public void setTimestr(String timestr) {
56 | this.timestr = timestr;
57 | }
58 |
59 | public String getRequest() {
60 | return request;
61 | }
62 |
63 | public void setRequest(String request) {
64 | this.request = request;
65 | }
66 |
67 | public int getStep() {
68 | return step;
69 | }
70 |
71 | public void setStep(int step) {
72 | this.step = step;
73 | }
74 |
75 | public String getStaylong() {
76 | return staylong;
77 | }
78 |
79 | public void setStaylong(String staylong) {
80 | this.staylong = staylong;
81 | }
82 |
83 | public String getReferal() {
84 | return referal;
85 | }
86 |
87 | public void setReferal(String referal) {
88 | this.referal = referal;
89 | }
90 |
91 | public String getUseragent() {
92 | return useragent;
93 | }
94 |
95 | public void setUseragent(String useragent) {
96 | this.useragent = useragent;
97 | }
98 |
99 | public String getBytes_send() {
100 | return bytes_send;
101 | }
102 |
103 | public void setBytes_send(String bytes_send) {
104 | this.bytes_send = bytes_send;
105 | }
106 |
107 | public String getStatus() {
108 | return status;
109 | }
110 |
111 | public void setStatus(String status) {
112 | this.status = status;
113 | }
114 |
115 | public void readFields(DataInput in) throws IOException {
116 | this.session = in.readUTF();
117 | this.remote_addr = in.readUTF();
118 | this.timestr = in.readUTF();
119 | this.request = in.readUTF();
120 | this.step = in.readInt();
121 | this.staylong = in.readUTF();
122 | this.referal = in.readUTF();
123 | this.useragent = in.readUTF();
124 | this.bytes_send = in.readUTF();
125 | this.status = in.readUTF();
126 | }
127 |
128 | public void write(DataOutput out) throws IOException {
129 | out.writeUTF(session);
130 | out.writeUTF(remote_addr);
131 | out.writeUTF(timestr);
132 | out.writeUTF(request);
133 | out.writeInt(step);
134 | out.writeUTF(staylong);
135 | out.writeUTF(referal);
136 | out.writeUTF(useragent);
137 | out.writeUTF(bytes_send);
138 | out.writeUTF(status);
139 | }
140 |
141 | }
142 |
--------------------------------------------------------------------------------
/dmp/src/main/scala/com/awebone/dmp/report/AreaRequestDistributionJob.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.report
2 |
3 | import java.util.Properties
4 |
5 | import org.apache.log4j.{Level, Logger}
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
8 |
9 | /**
10 |  * Regional distribution of ad requests.
11 |  * Per province/city: total requests, valid requests, ad requests | bids joined, bids won, win rate | impressions, clicks, CTR | ad cost, ad spend.
12 |  * The aggregated result can be stored in a MySQL (or HBase) table; the full detail data should not be written to MySQL.
13 |  */
14 | object AreaRequestDistributionJob {
15 | def main(args: Array[String]): Unit = {
16 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
17 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
18 | Logger.getLogger("org.spark-project").setLevel(Level.WARN)
19 |
20 | if(args == null || args.length < 2){
21 | println(
22 | """Parameter Errors! Usage:
23 | |inputpath : input path
24 | |table : mysql table name
25 | """.stripMargin)
26 | System.exit(-1)
27 | }
28 | val Array(inputpath, table) = args
29 |
30 | val conf: SparkConf = new SparkConf().setAppName("AreaRequestDistributionJob").setMaster("local[*]")
31 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
32 |
33 | val input: DataFrame = spark.read.parquet(inputpath)
34 | input.createOrReplaceTempView("logs")
35 |
36 | val sql =
37 | """
38 | |select
39 | | date_sub(current_date(), 1) data_date,
40 | | provincename province,
41 | | cityname city,
42 | | sum(if(requestmode = 1 and processnode >= 1, 1, 0)) orginal_req,
43 | | sum(if(requestmode = 1 and processnode >= 2, 1, 0)) valid_req,
44 | | sum(if(requestmode = 1 and processnode = 3, 1, 0)) ad_req,
45 | | sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and isbid = 1 and adorderid != 0
46 | | then 1
47 | | else 0
48 | | end) tpi_bid_num,
49 | | sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1
50 | | then 1
51 | | else 0
52 | | end) win_bid_num,
53 | | sum(case when requestmode = 2 and iseffective = 1
54 | | then 1
55 | | else 0
56 | | end) show_ad_master_num,
57 | | sum(case when requestmode = 3 and iseffective = 1
58 | | then 1
59 | | else 0
60 | | end) click_ad_master_num,
61 | | sum(case when requestmode = 2 and iseffective = 1 and isbilling = 1
62 | | then 1
63 | | else 0
64 | | end) show_ad_media_num,
65 | | sum(case when requestmode = 3 and iseffective = 1 and isbilling = 1
66 | | then 1
67 | | else 0
68 | | end) click_ad_media_num,
69 | | round(sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 and adorderid >=200000 and adcreativeid >=200000
70 | | then winprice
71 | | else 0.0
72 | | end) / 1000, 2) dsp_ad_xf,
73 | | round(sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 and adorderid >=200000 and adcreativeid >=200000
74 | | then adpayment
75 | | else 0.0
76 | | end) / 1000, 2) dsp_ad_cost
77 | |from logs
78 | |group by provincename, cityname
79 | """.stripMargin
80 |
81 | val url = "jdbc:mysql://hadoop01:3306/dmp"
82 | val properties = new Properties
83 | properties.put("user","root")
84 | properties.put("password","root")
85 |
86 | spark.sql(sql).write.mode(SaveMode.Append).jdbc(url,table,properties)
87 |
88 | spark.stop()
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
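Note: to sanity-check the append performed above, the same JDBC settings can be reused to read the report back. A small sketch (run in the same job before spark.stop(), reusing the url, table and properties values defined in the file above):

// Sketch only: read back the aggregated report and print a few rows.
val written = spark.read.jdbc(url, table, properties)
written.show(20, false)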
/weblog/src/main/java/com/awebone/bean/WebLogBean.java:
--------------------------------------------------------------------------------
1 | package com.awebone.bean;
2 |
3 | import java.io.DataInput;
4 | import java.io.DataOutput;
5 | import java.io.IOException;
6 |
7 | import org.apache.hadoop.io.Writable;
8 |
9 | public class WebLogBean implements Writable {
10 | private boolean valid = true; // whether the record is valid
11 | private String remote_addr; // client IP address
12 | private String remote_user; // client user name; "-" when absent
13 | private String time_local; // access time and time zone
14 | private String request; // requested url and http protocol
15 | private String status; // request status; 200 on success
16 | private String body_bytes_sent; // size of the body sent to the client
17 | private String http_referer; // page the request was referred from
18 | private String http_user_agent; // client browser information
19 |
20 | public boolean isValid() {
21 | return valid;
22 | }
23 |
24 | public void setValid(boolean valid) {
25 | this.valid = valid;
26 | }
27 |
28 | public String getRemote_addr() {
29 | return remote_addr;
30 | }
31 |
32 | public void setRemote_addr(String remote_addr) {
33 | this.remote_addr = remote_addr;
34 | }
35 |
36 | public String getRemote_user() {
37 | return remote_user;
38 | }
39 |
40 | public void setRemote_user(String remote_user) {
41 | this.remote_user = remote_user;
42 | }
43 |
44 | public String getTime_local() {
45 | return time_local;
46 | }
47 |
48 | public void setTime_local(String time_local) {
49 | this.time_local = time_local;
50 | }
51 |
52 | public String getRequest() {
53 | return request;
54 | }
55 |
56 | public void setRequest(String request) {
57 | this.request = request;
58 | }
59 |
60 | public String getStatus() {
61 | return status;
62 | }
63 |
64 | public void setStatus(String status) {
65 | this.status = status;
66 | }
67 |
68 | public String getBody_bytes_sent() {
69 | return body_bytes_sent;
70 | }
71 |
72 | public void setBody_bytes_sent(String body_bytes_sent) {
73 | this.body_bytes_sent = body_bytes_sent;
74 | }
75 |
76 | public String getHttp_referer() {
77 | return http_referer;
78 | }
79 |
80 | public void setHttp_referer(String http_referer) {
81 | this.http_referer = http_referer;
82 | }
83 |
84 | public String getHttp_user_agent() {
85 | return http_user_agent;
86 | }
87 |
88 | public void setHttp_user_agent(String http_user_agent) {
89 | this.http_user_agent = http_user_agent;
90 | }
91 |
92 | public WebLogBean() {
93 | super();
94 | }
95 |
96 | public WebLogBean(boolean valid, String remote_addr, String remote_user, String time_local, String request,
97 | String status, String body_bytes_sent, String http_referer, String http_user_agent) {
98 | super();
99 | this.valid = valid;
100 | this.remote_addr = remote_addr;
101 | this.remote_user = remote_user;
102 | this.time_local = time_local;
103 | this.request = request;
104 | this.status = status;
105 | this.body_bytes_sent = body_bytes_sent;
106 | this.http_referer = http_referer;
107 | this.http_user_agent = http_user_agent;
108 | }
109 |
110 | @Override
111 | public String toString() {
112 | return valid + "\001" + remote_addr + "\001" + remote_user + "\001" + time_local + "\001" + request + "\001"
113 | + status + "\001" + body_bytes_sent + "\001" + http_referer + "\001" + http_user_agent;
114 | }
115 |
116 | // deserialization
117 | public void readFields(DataInput in) throws IOException {
118 | this.valid = in.readBoolean();
119 | this.remote_addr = in.readUTF();
120 | this.remote_user = in.readUTF();
121 | this.time_local = in.readUTF();
122 | this.request = in.readUTF();
123 | this.status = in.readUTF();
124 | this.body_bytes_sent = in.readUTF();
125 | this.http_referer = in.readUTF();
126 | this.http_user_agent = in.readUTF();
127 | }
128 |
129 | // serialization
130 | public void write(DataOutput out) throws IOException {
131 | out.writeBoolean(valid);
132 | out.writeUTF(remote_addr);
133 | out.writeUTF(remote_user);
134 | out.writeUTF(time_local);
135 | out.writeUTF(request);
136 | out.writeUTF(status);
137 | out.writeUTF(body_bytes_sent);
138 | out.writeUTF(http_referer);
139 | out.writeUTF(http_user_agent);
140 | }
141 |
142 | }
143 |
--------------------------------------------------------------------------------
/weblog/src/main/java/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | dfs.replication
23 | 2
24 |
25 |
26 |
27 |
28 | dfs.namenode.name.dir
29 | /home/hadoop/data/hadoopdata/dfs/name
30 |
31 |
32 | dfs.datanode.data.dir
33 | /home/hadoop/data/hadoopdata/dfs/data
34 |
35 |
36 |
37 |
38 | dfs.webhdfs.enabled
39 | true
40 |
41 |
42 |
43 |
44 | dfs.nameservices
45 | myha
46 |
47 |
48 |
49 |
50 | dfs.ha.namenodes.myha
51 | nn1,nn2
52 |
53 |
54 |
55 |
56 | dfs.namenode.rpc-address.myha.nn1
57 | hadoop01:9000
58 |
59 |
60 |
61 |
62 | dfs.namenode.http-address.myha.nn1
63 | hadoop01:50070
64 |
65 |
66 |
67 |
68 | dfs.namenode.rpc-address.myha.nn2
69 | hadoop02:9000
70 |
71 |
72 |
73 |
74 | dfs.namenode.http-address.myha.nn2
75 | hadoop02:50070
76 |
77 |
78 |
79 |
80 | dfs.namenode.shared.edits.dir
81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha
82 |
83 |
84 |
85 |
86 | dfs.journalnode.edits.dir
87 | /home/hadoop/data/journaldata
88 |
89 |
90 |
91 |
92 | dfs.ha.automatic-failover.enabled
93 | true
94 |
95 |
96 |
97 |
98 | dfs.client.failover.proxy.provider.myha
99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
100 |
101 |
102 |
103 |
104 | dfs.ha.fencing.methods
105 |
106 | sshfence
107 | shell(/bin/true)
108 |
109 |
110 |
111 |
112 |
113 | dfs.ha.fencing.ssh.private-key-files
114 | /home/hadoop/.ssh/id_rsa
115 |
116 |
117 |
118 |
119 | dfs.ha.fencing.ssh.connect-timeout
120 | 30000
121 |
122 |
123 |
124 | ha.failover-controller.cli-check.rpc-timeout.ms
125 | 60000
126 |
127 |
128 |
--------------------------------------------------------------------------------
/dmp/src/main/resources/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | dfs.replication
23 | 2
24 |
25 |
26 |
27 |
28 | dfs.namenode.name.dir
29 | /home/hadoop/data/hadoopdata/dfs/name
30 |
31 |
32 | dfs.datanode.data.dir
33 | /home/hadoop/data/hadoopdata/dfs/data
34 |
35 |
36 |
37 |
38 | dfs.webhdfs.enabled
39 | true
40 |
41 |
42 |
43 |
44 | dfs.nameservices
45 | myha
46 |
47 |
48 |
49 |
50 | dfs.ha.namenodes.myha
51 | nn1,nn2
52 |
53 |
54 |
55 |
56 | dfs.namenode.rpc-address.myha.nn1
57 | hadoop01:9000
58 |
59 |
60 |
61 |
62 | dfs.namenode.http-address.myha.nn1
63 | hadoop01:50070
64 |
65 |
66 |
67 |
68 | dfs.namenode.rpc-address.myha.nn2
69 | hadoop02:9000
70 |
71 |
72 |
73 |
74 | dfs.namenode.http-address.myha.nn2
75 | hadoop02:50070
76 |
77 |
78 |
79 |
80 | dfs.namenode.shared.edits.dir
81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha
82 |
83 |
84 |
85 |
86 | dfs.journalnode.edits.dir
87 | /home/hadoop/data/journaldata
88 |
89 |
90 |
91 |
92 | dfs.ha.automatic-failover.enabled
93 | true
94 |
95 |
96 |
97 |
98 | dfs.client.failover.proxy.provider.myha
99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
100 |
101 |
102 |
103 |
104 | dfs.ha.fencing.methods
105 |
106 | sshfence
107 | shell(/bin/true)
108 |
109 |
110 |
111 |
112 |
113 | dfs.ha.fencing.ssh.private-key-files
114 | /home/hadoop/.ssh/id_rsa
115 |
116 |
117 |
118 |
119 | dfs.ha.fencing.ssh.connect-timeout
120 | 30000
121 |
122 |
123 |
124 | ha.failover-controller.cli-check.rpc-timeout.ms
125 | 60000
126 |
127 |
128 |
--------------------------------------------------------------------------------
/mllib/src/main/resources/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | dfs.replication
23 | 2
24 |
25 |
26 |
27 |
28 | dfs.namenode.name.dir
29 | /home/hadoop/data/hadoopdata/dfs/name
30 |
31 |
32 | dfs.datanode.data.dir
33 | /home/hadoop/data/hadoopdata/dfs/data
34 |
35 |
36 |
37 |
38 | dfs.webhdfs.enabled
39 | true
40 |
41 |
42 |
43 |
44 | dfs.nameservices
45 | myha
46 |
47 |
48 |
49 |
50 | dfs.ha.namenodes.myha
51 | nn1,nn2
52 |
53 |
54 |
55 |
56 | dfs.namenode.rpc-address.myha.nn1
57 | hadoop01:9000
58 |
59 |
60 |
61 |
62 | dfs.namenode.http-address.myha.nn1
63 | hadoop01:50070
64 |
65 |
66 |
67 |
68 | dfs.namenode.rpc-address.myha.nn2
69 | hadoop02:9000
70 |
71 |
72 |
73 |
74 | dfs.namenode.http-address.myha.nn2
75 | hadoop02:50070
76 |
77 |
78 |
79 |
80 | dfs.namenode.shared.edits.dir
81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha
82 |
83 |
84 |
85 |
86 | dfs.journalnode.edits.dir
87 | /home/hadoop/data/journaldata
88 |
89 |
90 |
91 |
92 | dfs.ha.automatic-failover.enabled
93 | true
94 |
95 |
96 |
97 |
98 | dfs.client.failover.proxy.provider.myha
99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
100 |
101 |
102 |
103 |
104 | dfs.ha.fencing.methods
105 |
106 | sshfence
107 | shell(/bin/true)
108 |
109 |
110 |
111 |
112 |
113 | dfs.ha.fencing.ssh.private-key-files
114 | /home/hadoop/.ssh/id_rsa
115 |
116 |
117 |
118 |
119 | dfs.ha.fencing.ssh.connect-timeout
120 | 30000
121 |
122 |
123 |
124 | ha.failover-controller.cli-check.rpc-timeout.ms
125 | 60000
126 |
127 |
128 |
--------------------------------------------------------------------------------
/flink-train/src/main/resources/hdfs-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | dfs.replication
23 | 2
24 |
25 |
26 |
27 |
28 | dfs.namenode.name.dir
29 | /home/hadoop/data/hadoopdata/dfs/name
30 |
31 |
32 | dfs.datanode.data.dir
33 | /home/hadoop/data/hadoopdata/dfs/data
34 |
35 |
36 |
37 |
38 | dfs.webhdfs.enabled
39 | true
40 |
41 |
42 |
43 |
44 | dfs.nameservices
45 | myha
46 |
47 |
48 |
49 |
50 | dfs.ha.namenodes.myha
51 | nn1,nn2
52 |
53 |
54 |
55 |
56 | dfs.namenode.rpc-address.myha.nn1
57 | hadoop01:9000
58 |
59 |
60 |
61 |
62 | dfs.namenode.http-address.myha.nn1
63 | hadoop01:50070
64 |
65 |
66 |
67 |
68 | dfs.namenode.rpc-address.myha.nn2
69 | hadoop02:9000
70 |
71 |
72 |
73 |
74 | dfs.namenode.http-address.myha.nn2
75 | hadoop02:50070
76 |
77 |
78 |
79 |
80 | dfs.namenode.shared.edits.dir
81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha
82 |
83 |
84 |
85 |
86 | dfs.journalnode.edits.dir
87 | /home/hadoop/data/journaldata
88 |
89 |
90 |
91 |
92 | dfs.ha.automatic-failover.enabled
93 | true
94 |
95 |
96 |
97 |
98 | dfs.client.failover.proxy.provider.myha
99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
100 |
101 |
102 |
103 |
104 | dfs.ha.fencing.methods
105 |
106 | sshfence
107 | shell(/bin/true)
108 |
109 |
110 |
111 |
112 |
113 | dfs.ha.fencing.ssh.private-key-files
114 | /home/hadoop/.ssh/id_rsa
115 |
116 |
117 |
118 |
119 | dfs.ha.fencing.ssh.connect-timeout
120 | 30000
121 |
122 |
123 |
124 | ha.failover-controller.cli-check.rpc-timeout.ms
125 | 60000
126 |
127 |
128 |
--------------------------------------------------------------------------------
/weblog/src/main/java/com/awebone/click/ClickModel.java:
--------------------------------------------------------------------------------
1 | package com.awebone.click;
2 |
3 | import java.io.IOException;
4 | import java.lang.reflect.InvocationTargetException;
5 | import java.util.ArrayList;
6 | import java.util.Collections;
7 | import java.util.Comparator;
8 |
9 | import org.apache.commons.beanutils.BeanUtils;
10 | import org.apache.hadoop.conf.Configuration;
11 | import org.apache.hadoop.fs.Path;
12 | import org.apache.hadoop.io.LongWritable;
13 | import org.apache.hadoop.io.NullWritable;
14 | import org.apache.hadoop.io.Text;
15 | import org.apache.hadoop.mapreduce.Job;
16 | import org.apache.hadoop.mapreduce.Mapper;
17 | import org.apache.hadoop.mapreduce.Reducer;
18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
20 |
21 | import com.awebone.bean.PageViewsBean;
22 | import com.awebone.bean.VisitBean;
23 | import com.awebone.bean.WebLogBean;
24 |
25 | /**
26 |  * Map side: records of the same session are sent to the same reducer
27 |  *   key: session
28 |  *   value: the remaining fields
29 |  *          (visit time, url, step, referer, ip)
30 |  * Reduce side:
31 |  *   all records of one session arrive together
32 |  *   sort them by step
33 |  *   the first element of the list is the session entry
34 |  *   the last element is the session exit
35 |  *   build a VisitBean from them and emit it
36 |  *
37 |  */
38 | public class ClickModel {
39 | static class ClickModelMapper extends Mapper<LongWritable, Text, Text, PageViewsBean> {
40 | Text mk = new Text();
41 | PageViewsBean pbean = new PageViewsBean();
42 |
43 | @Override
44 | protected void map(LongWritable key, Text value,
45 | Mapper<LongWritable, Text, Text, PageViewsBean>.Context context)
46 | throws IOException, InterruptedException {
47 | String[] fields = value.toString().split("\001");
48 | if (fields.length == 11){
49 | mk.set(fields[0]);
50 | int step=Integer.parseInt(fields[6]);
51 | pbean.set(fields[0], fields[1], fields[10], fields[3], fields[4],step,
52 | fields[5], fields[9], fields[8], fields[7]);
53 | context.write(mk, pbean);
54 | }
55 | }
56 | }
57 |
58 | static class ClickModelReducer extends Reducer<Text, PageViewsBean, VisitBean, NullWritable> {
59 | VisitBean vb=new VisitBean();
60 |
61 | @Override
62 | protected void reduce(Text key, Iterable<PageViewsBean> values,
63 | Reducer<Text, PageViewsBean, VisitBean, NullWritable>.Context context)
64 | throws IOException, InterruptedException {
65 | ArrayList<PageViewsBean> list = new ArrayList<PageViewsBean>();
66 | for (PageViewsBean v:values){
67 | PageViewsBean pb = new PageViewsBean();
68 | try {
69 | BeanUtils.copyProperties(pb, v);
70 | list.add(pb);
71 | } catch (IllegalAccessException e) {
72 | // TODO Auto-generated catch block
73 | e.printStackTrace();
74 | } catch (InvocationTargetException e) {
75 | // TODO Auto-generated catch block
76 | e.printStackTrace();
77 | }
78 | }
79 |
80 | Collections.sort(list, new Comparator<PageViewsBean>() {
81 | public int compare(PageViewsBean o1, PageViewsBean o2) {
82 | if(o1 == null || o2 == null){
83 | return 0;
84 | }
85 | return o1.getStep()-o2.getStep();
86 | }
87 | });
88 |
89 | // build the VisitBean for this session and emit it
90 | vb.set(key.toString(), list.get(0).getRemote_addr(),
91 | list.get(0).getTimestr(), list.get(list.size()-1).getTimestr(),
92 | list.get(0).getRequest(), list.get(list.size()-1).getRequest(),
93 | list.get(0).getReferal(), list.get(list.size()-1).getStep());
94 | context.write(vb, NullWritable.get());
95 | }
96 | }
97 |
98 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
99 | System.setProperty("HADOOP_USER_NAME", "hadoop");
100 | Configuration conf = new Configuration();
101 | conf.set("fs.defaultFS", "hdfs://myha/");
102 | Job job = Job.getInstance(conf);
103 |
104 | job.setJarByClass(ClickModel.class);
105 |
106 | job.setMapperClass(ClickModelMapper.class);
107 | job.setReducerClass(ClickModelReducer.class);
108 |
109 | job.setMapOutputKeyClass(Text.class);
110 | job.setMapOutputValueClass(PageViewsBean.class);
111 | job.setOutputKeyClass(VisitBean.class);
112 | job.setOutputValueClass(NullWritable.class);
113 |
114 | FileInputFormat.setInputPaths(job, new Path("/weblog/click/stream/20200221"));
115 | FileOutputFormat.setOutputPath(job, new Path("/weblog/click/model/20200221"));
116 |
117 | boolean res = job.waitForCompletion(true);
118 | System.exit(res ? 0 : 1);
119 | }
120 | }
121 |
--------------------------------------------------------------------------------
/akka_rpc/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | 4.0.0
6 |
7 | awebone
8 | akka_rpc
9 | 1.0-SNAPSHOT
10 |
11 | akka_rpc
12 |
13 | http://www.example.com
14 |
15 |
16 | UTF8
17 | 1.8
18 | 1.8
19 | UTF-8
20 | 2.11.8
21 | 2.11.8
22 | 2.4.17
23 |
24 |
25 |
26 |
27 | org.scala-lang
28 | scala-library
29 | ${scala.version}
30 |
31 |
32 |
33 | com.typesafe.akka
34 | akka-actor_2.11
35 | ${akka.version}
36 |
37 |
38 |
39 |
40 | org.scala-lang
41 | scala-actors
42 | ${scala.actors.version}
43 |
44 |
45 |
46 | com.typesafe.akka
47 | akka-remote_2.11
48 | ${akka.version}
49 |
50 |
51 |
52 |
53 | org.apache.hadoop
54 | hadoop-client
55 | 2.7.6
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 | net.alchim31.maven
65 | scala-maven-plugin
66 | 3.2.2
67 |
68 |
69 | org.apache.maven.plugins
70 | maven-compiler-plugin
71 | 3.5.1
72 |
73 |
74 |
75 |
76 |
77 | net.alchim31.maven
78 | scala-maven-plugin
79 |
80 |
81 | scala-compile-first
82 | process-resources
83 |
84 | add-source
85 | compile
86 |
87 |
88 |
89 | scala-test-compile
90 | process-test-resources
91 |
92 | testCompile
93 |
94 |
95 |
96 |
97 |
98 |
99 | org.apache.maven.plugins
100 | maven-compiler-plugin
101 |
102 |
103 | compile
104 |
105 | compile
106 |
107 |
108 |
109 |
110 |
111 |
112 | org.apache.maven.plugins
113 | maven-shade-plugin
114 | 2.4.3
115 |
116 |
117 | package
118 |
119 | shade
120 |
121 |
122 |
123 |
124 | *:*
125 |
126 | META-INF/*.SF
127 | META-INF/*.DSA
128 | META-INF/*.RSA
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/mllib/src/main/scala/com/awebone/spark/MovieLensSparkShell.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.spark
2 |
3 | import org.apache.spark.mllib.evaluation.RegressionMetrics
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
7 | import org.apache.spark.sql.{SQLContext, SparkSession}
8 |
9 | case class Movie(movieId: Int, title: String, genres: Seq[String])
10 |
11 | case class User(userId: Int, gender: String, age: Int, occupation: Int, zip: String)
12 |
13 | object DataProcess {
14 | // create the Spark entry point
15 | val sparkConf: SparkConf = new SparkConf()
16 | sparkConf.setAppName(DataProcess.getClass.getSimpleName)
17 | sparkConf.setMaster("local")
18 | val sc: SparkContext = new SparkContext(sparkConf)
19 | val sqlContext = new SQLContext(sc)
20 |
21 | import sqlContext.implicits._
22 | // val spark:SparkSession = SparkSession.builder().appName("MyFirstSparkSQL").config("someKey", "someValue").master("local").getOrCreate()
23 |
24 | //Define parse function
25 | def parseMovie(str: String): Movie = {
26 | val fields = str.split("::")
27 | assert(fields.size == 3)
28 | Movie(fields(0).toInt, fields(1).toString, Seq(fields(2)))
29 | }
30 |
31 | def parseUser(str: String): User = {
32 | val fields = str.split("::")
33 | assert(fields.size == 5)
34 | User(fields(0).toInt, fields(1).toString, fields(2).toInt, fields(3).toInt, fields(4).toString)
35 | }
36 |
37 | def parseRating(str: String): Rating = {
38 | val fields = str.split("::")
39 | assert(fields.size == 4)
40 | Rating(fields(0).toInt, fields(1).toInt, fields(2).toInt)
41 | }
42 |
43 | //Rating analysis
44 | val ratingText = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\ratings.dat")
45 | ratingText.first()
46 | val ratingRDD = ratingText.map(parseRating).cache()
47 | println("Total number of ratings: " + ratingRDD.count())
48 | println("Total number of movies rated: " + ratingRDD.map(_.product).distinct().count())
49 | println("Total number of users who rated movies: " + ratingRDD.map(_.user).distinct().count())
50 |
51 | //Create DataFrames
52 | val ratingDF = ratingRDD.toDF
53 | // val ratingDF = spark.createDataFrame(ratingRDD)
54 | val movieDF = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\movies.dat").map(parseMovie).toDF
55 | val userDF = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\users.dat").map(parseUser).toDF
56 | ratingDF.printSchema
57 | // ratingDF.show
58 | movieDF.printSchema
59 | userDF.printSchema
60 |
61 | // register the DataFrames as temp tables
62 | ratingDF.registerTempTable("ratings")
63 | // ratingDF.createOrReplaceTempView(“ratings”)
64 | movieDF.registerTempTable("movies")
65 | userDF.registerTempTable("users")
66 |
67 | // data exploration
68 | val rantingMovies = sqlContext.sql(
69 | """
70 | |select title,rmax,rmin,ucnt from
71 | |(select product, max(rating) as rmax, min(rating) as rmin, count(distinct user) as ucnt from ratings group by product) rantingsCNT
72 | |join movies on product=movieId
73 | |order by ucnt desc
74 | """.stripMargin)
75 | rantingMovies.show()
76 |
77 | val mostActiveUser = sqlContext.sql(
78 | """
79 | |select user,count(*) as cnt
80 | |from ratings group by user order by cnt desc limit 10
81 | """.stripMargin)
82 | mostActiveUser.show()
83 |
84 | val userRating = sqlContext.sql(
85 | """
86 | |select distinct title,rating
87 | |from ratings join movies on movieId=product
88 | |where user=4169 and rating>4
89 | """.stripMargin)
90 | userRating.show()
91 |
92 | //ALS model
93 | // split the data into training and test sets
94 | val splitsData = ratingRDD.randomSplit(Array(0.8, 0.2), 0L)
95 | val trainingSet = splitsData(0).cache()
96 | val testSet = splitsData(1).cache()
97 | trainingSet.count()
98 | testSet.count()
99 |
100 | // build the ALS model
101 | val model = new ALS()
102 | .setRank(20)
103 | .setIterations(10)
104 | .run(trainingSet)
105 |
106 | // recommend 5 movies for user 4169
107 | val recomForTopUser = model.recommendProducts(4169, 5)
108 | val movieTitle = movieDF.rdd.map(x => (x(0), x(1))).collectAsMap
109 | val recomResult = recomForTopUser.map(rating => (movieTitle(rating.product), rating.rating)).foreach(println)
110 |
111 | // predict on the test set
112 | val testUserProduct = testSet.map {
113 | case Rating(user, product, rating) => (user, product)
114 | }
115 | val testUserProductPredict = model.predict(testUserProduct)
116 | testUserProductPredict.take(10).mkString("\n")
117 |
118 | // model evaluation: mean absolute error computed by hand
119 | val testSetPair = testSet.map {
120 | case Rating(user, product, rating) => ((user, product), rating)
121 | }
122 | val predictionsPair = testUserProductPredict.map {
123 | case Rating(user, product, rating) => ((user, product), rating)
124 | }
125 |
126 | val joinTestPredict = testSetPair.join(predictionsPair)
127 | val mae = joinTestPredict.map {
128 | case ((user, product), (ratingT, ratingP)) =>
129 | val err = ratingT - ratingP
130 | Math.abs(err)
131 | }.mean()
132 | val fp = joinTestPredict.filter {
133 | case ((user, product), (ratingT, ratingP)) =>
134 | (ratingT <= 1 & ratingP >= 4)
135 | }.count()
136 |
137 | // evaluation with RegressionMetrics
138 | val ratingTP = joinTestPredict.map {
139 | case ((user, product), (ratingT, ratingP)) =>
140 | (ratingP, ratingT)
141 | }
142 | val evaluator = new RegressionMetrics(ratingTP)
143 | evaluator.meanAbsoluteError
144 | evaluator.rootMeanSquaredError
145 | }
146 |
--------------------------------------------------------------------------------
/weblog/src/main/java/hive-op.txt:
--------------------------------------------------------------------------------
1 | Start the HiveServer2 service
2 | nohup hiveserver2 1>~/logs/hive_std.log 2>~/logs/hive_err.log &
3 |
4 | Connect to the service
5 | with beeline or the hive CLI
6 | !connect jdbc:hive2://hadoop04:10000
7 | show databases;
8 | show tables;
9 |
10 |
11 | Create tables
12 | ODS layer
13 | Raw data table:
14 | create database if not exists weblog;
15 | use weblog;
16 | drop table if exists weblog.ods_weblog_origin;
17 | create table weblog.ods_weblog_origin(
18 | valid string,
19 | remote_addr string,
20 | remote_user string,
21 | time_local string,
22 | request string,
23 | status string,
24 | body_bytes_sent string,
25 | http_referer string,
26 | http_user_agent string)
27 | partitioned by (datestr string)
28 | row format delimited
29 | fields terminated by '\001';
30 |
31 |
32 | DW layer
33 | Clickstream pageview table:
34 | create database if not exists weblog;
35 | use weblog;
36 | drop table if exists weblog.click_stream_pageviews;
37 | create table weblog.click_stream_pageviews (
38 | session string,
39 | remote_addr string,
40 | remote_user string,
41 | time_local string,
42 | request string,
43 | page_staylong string,
44 | visit_step string,
45 | status string,
46 | body_bytes_sent string,
47 | http_referer string,
48 | http_user_agent string)
49 | partitioned by (datestr string)
50 | row format delimited
51 | fields terminated by '\001';
52 |
53 | Session visit table (clickstream visit table)
54 | create database if not exists weblog;
55 | use weblog;
56 | drop table if exists weblog.click_stream_visit;
57 | create table weblog.click_stream_visit(
58 | session string,
59 | remote_addr string,
60 | inTime string,
61 | outTime string,
62 | inPage string,
63 | outPage string,
64 | referal string,
65 | pageVisits int)
66 | partitioned by (datestr string);
67 |
68 |
69 | Load data
70 | /weblog/pre/20200221  -> raw table
71 | load data inpath '/weblog/pre/20200221' into table weblog.ods_weblog_origin partition(datestr = "20200221");
72 |
73 | /weblog/click/stream/20200221  -> clickstream pageview table
74 | load data inpath "/weblog/click/stream/20200221" into table weblog.click_stream_pageviews partition(datestr ="20200221");
75 |
76 | /weblog/click/model/20200221  -> clickstream visit table
77 | load data inpath "/weblog/click/model/20200221" into table weblog.click_stream_visit partition(datestr ="20200221");
78 |
79 | Query the data
80 | select * from weblog.ods_weblog_origin limit 1;
81 | select * from weblog.click_stream_pageviews limit 1;
82 | select * from weblog.click_stream_visit limit 1;
83 |
84 |
85 | DW layer: create the detailed wide table:
86 | create database if not exists weblog;
87 | use weblog;
88 | drop table if exists weblog.ods_weblog_detail;
89 | create table weblog.ods_weblog_detail(
90 | valid string comment "valid flag",
91 | remote_addr string comment "source IP",
92 | remote_user string comment "user identifier",
93 | time_local string comment "full access time",
94 | daystr string comment "access date",
95 | timestr string comment "access time",
96 | year string comment "access year",
97 | month string comment "access month",
98 | day string comment "access day",
99 | hour string comment "access hour",
100 | request string comment "requested url",
101 | status string comment "response code",
102 | body_bytes_sent string comment "bytes sent",
103 | http_referer string comment "referer url",
104 | ref_host string comment "referer host",
105 | ref_path string comment "referer path",
106 | ref_query string comment "referer query",
107 | ref_query_id string comment "referer query id value",
108 | http_user_agent string comment "client user agent"
109 | )
110 | partitioned by(datestr string)
111 | row format delimited fields terminated by '\001';
112 |
113 |
114 | Enable local mode and print column headers
115 | set hive.exec.mode.local.auto=true;
116 | set hive.cli.print.header=true;
117 |
118 |
119 | Parse the referer URL: extract the referer components
120 | create database if not exists weblog;
121 | use weblog;
122 | drop table if exists weblog.t_ods_tmp_referurl;
123 | create table weblog.t_ods_tmp_referurl as
124 | SELECT a.*, b.*
125 | FROM ods_weblog_origin a
126 | LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH', 'QUERY','QUERY:id') b
127 | as host, path, query, query_id;
128 |
129 | Query the referer temp table
130 | select * from weblog.t_ods_tmp_referurl a where a.host is not null limit 1;
131 |
132 | Final detailed wide table
133 | create database if not exists weblog;
134 | use weblog;
135 | drop table if exists weblog.t_ods_tmp_detail;
136 | create table weblog.t_ods_tmp_detail as
137 | select b.*,substring(time_local,0,10) as daystr,
138 | substring(time_local,11) as tmstr,
139 | substring(time_local,0,4) as year,
140 | substring(time_local,6,2) as month,
141 | substring(time_local,9,2) as day,
142 | substring(time_local,12,2) as hour
143 | From t_ods_tmp_referurl b;
144 |
145 | Query the wide table
146 | select * from weblog.t_ods_tmp_detail where month is not null limit 3;
147 |
148 |
149 | Compute the key metrics from the logs
150 | 1)pv:page view
151 | click_stream_pageviews 76
152 | select count(*) from click_stream_pageviews;
153 |
154 | 2) uv: unique visitors, counted here as the number of distinct sessions
155 | click_stream_visit 57
156 | select count(*) from click_stream_visit;
157 |
158 | 3) dv: average visit depth per session, i.e. total pv / uv
159 | join the two counts
160 | set hive.strict.checks.cartesian.product=false;
161 | set hive.mapred.mode=nonstrict;
162 |
163 | select a.pv/b.uv avgdv
164 | from
165 | (select count(*) pv from click_stream_pageviews ) a join
166 | (select count(*) uv from click_stream_visit) b;
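With the counts noted above (pv = 76, uv = 57) this yields avgdv = 76 / 57 ≈ 1.33 pages per session.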
167 |
168 | 4) Conversion rate
169 | Data file order.txt:
170 | 1,ad,10000
171 | 2,menu,3000
172 | 3,product_detail,2600
173 | 4,cart,300
174 | 5,order,200
175 | 6,payment,190
176 | 7,payment_success,189
177 |
178 | Create the table and load the data
179 | create database if not exists hive_order;
180 | use hive_order;
181 | drop table if exists t_order;
182 | create table t_order(step int, name string, pv int) row format delimited fields terminated by ",";
183 | load data local inpath "/home/hadoop/tmpdata/order.txt" into table t_order;
184 | select * from t_order limit 10;
185 |
186 | Query the conversion rate
187 | select step,name,pv,pv/lpv t
188 | from
189 | (select step,name,pv,lag(pv,1,pv) over(order by step) lpv from t_order) a;
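For example, with the data above lag(pv,1,pv) returns 10000 as the previous-step pv for step 2, so its conversion rate is 3000/10000 = 0.3; for step 1 the lag falls back to its own pv, so it reports 1.0.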
190 |
191 |
--------------------------------------------------------------------------------
/mllib/src/main/resources/ml-1m/README:
--------------------------------------------------------------------------------
1 | SUMMARY
2 | ================================================================================
3 |
4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
5 | made by 6,040 MovieLens users who joined MovieLens in 2000.
6 |
7 | USAGE LICENSE
8 | ================================================================================
9 |
10 | Neither the University of Minnesota nor any of the researchers
11 | involved can guarantee the correctness of the data, its suitability
12 | for any particular purpose, or the validity of results based on the
13 | use of the data set. The data set may be used for any research
14 | purposes under the following conditions:
15 |
16 | * The user may not state or imply any endorsement from the
17 | University of Minnesota or the GroupLens Research Group.
18 |
19 | * The user must acknowledge the use of the data set in
20 | publications resulting from the use of the data set, and must
21 | send us an electronic or paper copy of those publications.
22 |
23 | * The user may not redistribute the data without separate
24 | permission.
25 |
26 | * The user may not use this information for any commercial or
27 | revenue-bearing purposes without first obtaining permission
28 | from a faculty member of the GroupLens Research Project at the
29 | University of Minnesota.
30 |
31 | If you have any further questions or comments, please contact GroupLens
32 | .
33 |
34 | ACKNOWLEDGEMENTS
35 | ================================================================================
36 |
37 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
38 | set.
39 |
40 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
41 | ================================================================================
42 |
43 | The GroupLens Research Project is a research group in the Department of
44 | Computer Science and Engineering at the University of Minnesota. Members of
45 | the GroupLens Research Project are involved in many research projects related
46 | to the fields of information filtering, collaborative filtering, and
47 | recommender systems. The project is led by professors John Riedl and Joseph
48 | Konstan. The project began to explore automated collaborative filtering in
49 | 1992, but is most well known for its world wide trial of an automated
50 | collaborative filtering system for Usenet news in 1996. Since then the project
51 | has expanded its scope to research overall information filtering solutions,
52 | integrating in content-based methods as well as improving current collaborative
53 | filtering technology.
54 |
55 | Further information on the GroupLens Research project, including research
56 | publications, can be found at the following web site:
57 |
58 | http://www.grouplens.org/
59 |
60 | GroupLens Research currently operates a movie recommender based on
61 | collaborative filtering:
62 |
63 | http://www.movielens.org/
64 |
65 | RATINGS FILE DESCRIPTION
66 | ================================================================================
67 |
68 | All ratings are contained in the file "ratings.dat" and are in the
69 | following format:
70 |
71 | UserID::MovieID::Rating::Timestamp
72 |
73 | - UserIDs range between 1 and 6040
74 | - MovieIDs range between 1 and 3952
75 | - Ratings are made on a 5-star scale (whole-star ratings only)
76 | - Timestamp is represented in seconds since the epoch as returned by time(2)
77 | - Each user has at least 20 ratings
78 |
79 | USERS FILE DESCRIPTION
80 | ================================================================================
81 |
82 | User information is in the file "users.dat" and is in the following
83 | format:
84 |
85 | UserID::Gender::Age::Occupation::Zip-code
86 |
87 | All demographic information is provided voluntarily by the users and is
88 | not checked for accuracy. Only users who have provided some demographic
89 | information are included in this data set.
90 |
91 | - Gender is denoted by a "M" for male and "F" for female
92 | - Age is chosen from the following ranges:
93 |
94 | * 1: "Under 18"
95 | * 18: "18-24"
96 | * 25: "25-34"
97 | * 35: "35-44"
98 | * 45: "45-49"
99 | * 50: "50-55"
100 | * 56: "56+"
101 |
102 | - Occupation is chosen from the following choices:
103 |
104 | * 0: "other" or not specified
105 | * 1: "academic/educator"
106 | * 2: "artist"
107 | * 3: "clerical/admin"
108 | * 4: "college/grad student"
109 | * 5: "customer service"
110 | * 6: "doctor/health care"
111 | * 7: "executive/managerial"
112 | * 8: "farmer"
113 | * 9: "homemaker"
114 | * 10: "K-12 student"
115 | * 11: "lawyer"
116 | * 12: "programmer"
117 | * 13: "retired"
118 | * 14: "sales/marketing"
119 | * 15: "scientist"
120 | * 16: "self-employed"
121 | * 17: "technician/engineer"
122 | * 18: "tradesman/craftsman"
123 | * 19: "unemployed"
124 | * 20: "writer"
125 |
126 | MOVIES FILE DESCRIPTION
127 | ================================================================================
128 |
129 | Movie information is in the file "movies.dat" and is in the following
130 | format:
131 |
132 | MovieID::Title::Genres
133 |
134 | - Titles are identical to titles provided by the IMDB (including
135 | year of release)
136 | - Genres are pipe-separated and are selected from the following genres:
137 |
138 | * Action
139 | * Adventure
140 | * Animation
141 | * Children's
142 | * Comedy
143 | * Crime
144 | * Documentary
145 | * Drama
146 | * Fantasy
147 | * Film-Noir
148 | * Horror
149 | * Musical
150 | * Mystery
151 | * Romance
152 | * Sci-Fi
153 | * Thriller
154 | * War
155 | * Western
156 |
157 | - Some MovieIDs do not correspond to a movie due to accidental duplicate
158 | entries and/or test entries
159 | - Movies are mostly entered by hand, so errors and inconsistencies may exist
160 |
--------------------------------------------------------------------------------
/mllib/src/main/scala/com/awebone/spark/MovieLensALS.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.spark
2 |
3 | import java.io.File
4 |
5 | import org.apache.log4j.{Level, Logger}
6 | import org.apache.spark.mllib.evaluation.RegressionMetrics
7 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
8 | import org.apache.spark.rdd.RDD
9 | import org.apache.spark.{SparkConf, SparkContext}
10 |
11 | import scala.util.Random
12 |
13 | object MovieLensALS {
14 | //1. Define a rating elicitation function
15 | def elicitateRating(movies: Seq[(Int, String)]) = {
16 | val prompt = "Please rate the following movie(1-5(best) or 0 if not seen: )"
17 | println(prompt)
18 |
19 | val ratings = movies.flatMap { x =>
20 | var rating: Option[Rating] = None
21 | var valid = false
22 | while (!valid) {
23 | println(x._2 + " :")
24 | try {
25 | val r = Console.readInt()
26 | if (r > 5 || r < 0) {
27 | println(prompt)
28 | } else {
29 | valid = true
30 | if (r > 0) {
31 | rating = Some(Rating(0, x._1, r))
32 | }
33 | }
34 | } catch {
35 | case e: Exception => println(prompt)
36 | }
37 | }
38 | rating match {
39 | case Some(r) => Iterator(r)
40 | case None => Iterator.empty
41 | }
42 | }
43 | if (ratings.isEmpty) {
44 | sys.error("No ratings provided!")
45 | } else {
46 | ratings
47 | }
48 | }
49 |
50 | //2. Define a RMSE computation function
51 | def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]) = {
52 | val prediction = model.predict(data.map(x => (x.user, x.product)))
53 | val predDataJoined = prediction
54 | .map(x => ((x.user, x.product), x.rating))
55 | .join(data.map(x => ((x.user, x.product), x.rating)))
56 | .values
57 | new RegressionMetrics(predDataJoined).rootMeanSquaredError
58 | }
59 |
60 | //3. Main
61 | def main(args: Array[String]) = {
62 | //3.1 Setup env
63 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
64 | if (args.length != 1) {
65 | println("Usage: movieLensDir")
66 | sys.exit(1)
67 | }
68 |
69 | val conf = new SparkConf()
70 | .setAppName("MovieLensALS")
71 | .setMaster("local")
72 | .set("spark.executor.memory", "500m")
73 | val sc = new SparkContext(conf)
74 |
75 | //3.2 Load ratings data and know your data
76 | val movieLensHomeDir = args(0)
77 | val ratings = sc
78 | .textFile(new File(movieLensHomeDir, "ratings.dat").toString)
79 | .map { line =>
80 | val fields = line.split("::")
81 | (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble))
82 | }
83 | val movies = sc
84 | .textFile(new File(movieLensHomeDir, "movies.dat").toString)
85 | .map { line =>
86 | val fields = line.split("::")
87 | (fields(0).toInt, fields(1).toString)
88 | }
89 | .collectAsMap()
90 |
91 | val numRatings = ratings.count()
92 | val numUser = ratings.map(x => x._2.user).distinct().count()
93 | val numMovie = ratings.map(x => x._2.product).distinct().count()
94 | println("Got " + numRatings + " ratings from " + numUser + " users on " + numMovie + " movies.")
95 |
96 | //3.3 Elicitate personal rating
97 | val topMovies = ratings
98 | .map(_._2.product)
99 | .countByValue()
100 | .toSeq
101 | .sortBy(-_._2)
102 | .take(50)
103 | .map(_._1)
104 | val random = new Random(0)
105 | val selectMovies = topMovies
106 | .filter(x => random.nextDouble() < 0.2)
107 | .map(x => (x, movies(x)))
108 |
109 | val myRatings = elicitateRating(selectMovies)
110 | val myRatingsRDD = sc.parallelize(myRatings, 1)
111 |
112 | //3.4 Split data into train(60%), validation(20%) and test(20%)
113 | val numPartitions = 10
114 | val trainSet = ratings
115 | .filter(x => x._1 < 6)
116 | .map(_._2)
117 | .union(myRatingsRDD)
118 | .repartition(numPartitions)
119 | .persist()
120 | val validationSet = ratings
121 | .filter(x => x._1 >= 6 && x._1 < 8)
122 | .map(_._2)
123 | .persist()
124 | val testSet = ratings
125 | .filter(x => x._1 >= 8)
126 | .map(_._2)
127 | .persist()
128 |
129 | val numTrain = trainSet.count()
130 | val numValidation = validationSet.count()
131 | val numTest = testSet.count()
132 | println("Training data: " + numTrain + " Validation data: " + numValidation + " Test data: " + numTest)
133 |
134 | //3.5 Train model and optimize model with validation set
135 | val numRanks = List(8, 12)
136 | val numIters = List(10, 20)
137 | val numLambdas = List(0.1, 0.01)
138 | var bestRmse = Double.MaxValue
139 | var bestModel: Option[MatrixFactorizationModel] = None
140 | var bestRanks = -1
141 | var bestIters = 0
142 | var bestLambdas = -1.0
143 |
144 | for (rank <- numRanks; iter <- numIters; lambda <- numLambdas) {
145 | val model = ALS.train(trainSet, rank, iter, lambda)
146 | val validationRmse = computeRmse(model, validationSet)
147 | println("RMSE(validation) = " + validationRmse + " with ranks = " + rank + ", iter = " + iter + ", Lambda = " + lambda)
148 |
149 | if (validationRmse < bestRmse) {
150 | bestRmse = validationRmse
151 | bestModel = Some(model)
152 | bestIters = iter
153 | bestLambdas = lambda
154 | bestRanks = rank
155 | }
156 | }
157 |
158 | //3.6 Evaluate model with test set
159 | val testRmse = computeRmse(bestModel.get, testSet)
160 | println("The best model was trained with rank = " + bestRanks + ", iter = " + bestIters + ", Lambda = " + bestLambdas + " and compute RMSE on test set is " + testRmse)
161 |
162 | //3.7 Create a baseline and compare it with best model
163 | val meanRating = trainSet.union(validationSet).map(_.rating).mean()
164 | val baselineRmse = new RegressionMetrics(testSet.map(x => (x.rating, meanRating))).rootMeanSquaredError
165 | val improvement = (baselineRmse - testRmse) / baselineRmse * 100
166 | println("The best model improves the baseline by %1.2f".format(improvement) + "%.")
167 |
168 | //3.8 Make a personal recommendation
169 | val moviesId = myRatings.map(_.product)
170 | val candidates = sc.parallelize(movies.keys.filter(!moviesId.contains(_)).toSeq)
171 | val recommendations = bestModel.get
172 | .predict(candidates.map(x => (0, x)))
173 | .sortBy(-_.rating)
174 | .take(50)
175 |
176 | var i = 1
177 | println("Movies recommended for you: ")
178 | recommendations.foreach { line =>
179 | println("%2d".format(i) + " : " + movies(line.product))
180 | i += 1
181 | }
182 |
183 | sc.stop()
184 | }
185 | }
186 |
--------------------------------------------------------------------------------
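The grid search above picks the (rank, iterations, lambda) combination with the lowest validation RMSE, and computeRmse joins the model's predictions back to the held-out ratings by (user, product). A minimal, self-contained sketch of that evaluation step on a tiny synthetic dataset (object and value names here are illustrative, not part of the project):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.evaluation.RegressionMetrics
    import org.apache.spark.mllib.recommendation.{ALS, Rating}

    object RmseSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("RmseSketch").setMaster("local[2]"))

        // a handful of synthetic ratings: Rating(user, product, rating)
        val train = sc.parallelize(Seq(Rating(1, 10, 4.0), Rating(1, 11, 2.0), Rating(2, 10, 5.0), Rating(2, 12, 1.0)))
        val test  = sc.parallelize(Seq(Rating(1, 12, 3.0), Rating(2, 11, 2.0)))

        val model = ALS.train(train, 4, 5, 0.1) // rank = 4, iterations = 5, lambda = 0.1

        // predict on the (user, product) pairs of the test set, then join back the true ratings
        val predictions = model.predict(test.map(r => (r.user, r.product)))
          .map(r => ((r.user, r.product), r.rating))
        val predAndTruth = predictions.join(test.map(r => ((r.user, r.product), r.rating))).values

        println("RMSE = " + new RegressionMetrics(predAndTruth).rootMeanSquaredError)
        sc.stop()
      }
    }
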
/flink-train/src/main/scala/com/awebone/flink/project/LogAnalysis.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.flink.project
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util
5 | import java.util.{Date, Properties}
6 |
7 | import org.apache.flink.api.common.functions.RuntimeContext
8 | import org.apache.flink.api.common.serialization.SimpleStringSchema
9 | import org.apache.flink.api.java.tuple.Tuple
10 | import org.apache.flink.streaming.api.TimeCharacteristic
11 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
12 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
13 | import org.apache.flink.streaming.api.scala.function.WindowFunction
14 | import org.apache.flink.streaming.api.watermark.Watermark
15 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
16 | import org.apache.flink.streaming.api.windowing.time.Time
17 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow
18 | import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
19 | import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink
20 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
21 | import org.apache.flink.util.Collector
22 | import org.apache.http.HttpHost
23 | import org.elasticsearch.action.index.IndexRequest
24 | import org.elasticsearch.client.Requests
25 | import org.slf4j.LoggerFactory
26 |
27 | import scala.collection.mutable.ArrayBuffer
28 |
29 | /**
30 | * Log analysis job.
31 | * Function:
32 | * per-domain traffic totals over the most recent one-minute window
33 | */
34 | object LogAnalysis {
35 |
36 | def main(args: Array[String]): Unit = {
37 | // In production, emit diagnostics through a logger as below
38 | val logger = LoggerFactory.getLogger("LogAnalysis")
39 |
40 | val env = StreamExecutionEnvironment.getExecutionEnvironment
41 | // use event time as the time characteristic for this job
42 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
43 | import org.apache.flink.api.scala._
44 |
45 | /**
46 | * Read the log stream from the Kafka cluster
47 | */
48 | val topic = "cdnlog"
49 | val properties: Properties = new Properties()
50 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092")
51 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk
52 | // properties.setProperty("enable.auto.commit", "true")
53 | // properties.setProperty("bootstrap.servers","hadoop04:9092")
54 | properties.setProperty("group.id","test-cdnlog")
55 |
56 | val consumer = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties)
57 | val data = env.addSource(consumer) // consume the Kafka stream
58 | // data.print().setParallelism(1) // uncomment to check connectivity
59 |
60 | /**
61 | * Data cleansing:
62 | * in production, business logic has to be robust and the data has to be accurate,
63 | * so dirty records and records that violate the business rules are filtered out
64 | * before any further processing
65 | */
66 | val logData = data.map(x => {
67 | val strings = x.split("\t")
68 |
69 | val level = strings(2)
70 | val timeStr = strings(3)
71 | var time = 0L
72 | try {
73 | val sourceFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
74 | time = sourceFormat.parse(timeStr).getTime
75 | } catch {
76 | case e:Exception => {
77 | logger.error(s"time parse error: $timeStr", e)
78 | }
79 | }
80 |
81 | val domain = strings(5)
82 | val traffic = strings(6).toLong
83 | (level, time, domain, traffic)
84 | }).filter(_._2 != 0).filter(_._1 == "E")
85 | .map(x => {
86 | (x._2, x._3, x._4) // keep only the fields the business rules need: drop level, keep (time, domain, traffic)
87 | })
88 | // logData.print.setParallelism(1)
89 |
90 | /**
91 | * Flink watermark definition:
92 | * assign timestamps and watermarks to handle out-of-order events,
93 | * then apply a window function;
94 | * AssignerWithPeriodicWatermarks[T] matches the tuple type of logData
95 | */
96 | val resultData = logData.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(Long, String, Long)] {
97 | // maximum tolerated out-of-orderness
98 | val maxOutOfOrderness = 10000L // 10 seconds
99 | // largest event timestamp seen so far
100 | var currentMaxTimestamp: Long = _
100 | var currentMaxTimestamp: Long = _
101 |
102 | // derive the watermark from the maximum timestamp seen so far
103 | override def getCurrentWatermark: Watermark = {
104 | new Watermark(currentMaxTimestamp - maxOutOfOrderness)
105 | }
106 |
107 | // extract the event time
108 | override def extractTimestamp(element: (Long, String, Long), previousElementTimestamp: Long): Long = {
109 | // the event time is carried in the first tuple field
110 | val timestamp: Long = element._1
111 | currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp)
112 | timestamp
113 | }
114 | }) // windowed business logic: traffic per domain over the most recent minute
115 | .keyBy(1) // group by domain
116 | .window(TumblingEventTimeWindows.of(Time.seconds(60))) // 60-second tumbling windows
117 | .apply(new WindowFunction[(Long, String, Long), (String, String, Long), Tuple, TimeWindow] {
118 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(Long, String, Long)], out: Collector[(String, String, Long)]): Unit = {
119 | val domain = key.getField(0).toString // the key is the domain
120 |
121 | var sum = 0L
122 | val times = ArrayBuffer[Long]()
123 | val iterator = input.iterator
124 | while (iterator.hasNext) {
125 | val next = iterator.next()
126 | sum += next._3 // accumulate traffic
127 | times.append(next._1) // collect the event times falling in this window
128 | }
129 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(times.max)) // the minute of this window, formatted as yyyy-MM-dd HH:mm
130 |
131 | /**
132 | * Emitted result:
133 | * 1st field: the minute of this window
134 | * 2nd field: domain
135 | * 3rd field: total traffic
136 | */
137 | out.collect((time, domain, sum))
138 | }
139 | })
140 | resultData.print().setParallelism(1)
141 |
142 |
143 | /**
144 | * Sink the results into Elasticsearch
145 | * and visualize them with Kibana
146 | */
147 | val httpHosts = new java.util.ArrayList[HttpHost]
148 | httpHosts.add(new HttpHost("redhat", 9200, "http"))
149 |
150 | val esSinkBuilder = new ElasticsearchSink.Builder[(String, String, Long)](
151 | httpHosts,
152 | new ElasticsearchSinkFunction[(String, String, Long)] {
153 | override def process(t: (String, String, Long), runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = {
154 | requestIndexer.add(createIndexRequest(t))
155 | }
156 |
157 | def createIndexRequest(element: (String, String, Long)): IndexRequest = {
158 | val json = new java.util.HashMap[String, Any]
159 | json.put("time", element._1)
160 | json.put("domain", element._2)
161 | json.put("traffics", element._3)
162 | val id = element._1 + "-" + element._2
163 | return Requests.indexRequest()
164 | .index("cdn")
165 | .`type`("traffic")
166 | .id(id)
167 | .source(json)
168 | }
169 | }
170 | )
171 |
172 | // maximum number of actions buffered per bulk request
173 | esSinkBuilder.setBulkFlushMaxActions(1)
174 | resultData.addSink(esSinkBuilder.build()) //.setParallelism(5)
175 | env.execute("LogAnalysis")
176 | }
177 | }
178 |
--------------------------------------------------------------------------------
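The hand-written AssignerWithPeriodicWatermarks above tolerates 10 seconds of out-of-order data. Flink also ships BoundedOutOfOrdernessTimestampExtractor, which captures the same idea with less code; a small sketch against the same (time, domain, traffic) tuple shape (the class name LogTimestampExtractor is illustrative):

    import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
    import org.apache.flink.streaming.api.windowing.time.Time

    // same tuple shape as logData in LogAnalysis: (event time in ms, domain, traffic)
    class LogTimestampExtractor
      extends BoundedOutOfOrdernessTimestampExtractor[(Long, String, Long)](Time.seconds(10)) {
      // the event time is carried in the first tuple field
      override def extractTimestamp(element: (Long, String, Long)): Long = element._1
    }

    // usage sketch, mirroring the pipeline above:
    // logData.assignTimestampsAndWatermarks(new LogTimestampExtractor)
    //   .keyBy(1)
    //   .window(TumblingEventTimeWindows.of(Time.seconds(60)))
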
/dmp/src/main/scala/com/awebone/dmp/Logs.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp
2 |
3 | import com.awebone.dmp.util.Utils
4 | import org.apache.commons.lang3.StringUtils
5 |
6 | case class Logs(val sessionid: String, //session id
7 | val advertisersid: Int, //advertiser id
8 | val adorderid: Int, //ad id
9 | val adcreativeid: Int, //ad creative id ( >= 200000 : dsp , < 200000 oss)
10 | val adplatformproviderid: Int, //ad platform provider id (>= 100000: rtb , < 100000 : api )
11 | val sdkversionnumber: String, //sdk version number
12 | val adplatformkey: String, //platform provider key
13 | val putinmodeltype: Int, //delivery mode for the advertiser, 1: by impressions 2: by clicks
14 | val requestmode: Int, //record type (1: request, 2: impression, 3: click)
15 | val adprice: Double, //ad price
16 | val adppprice: Double, //platform provider price
17 | val requestdate: String, //request time, format: yyyy-MM-dd hh:mm:ss
18 | val ip: String, //real ip address of the device user
19 | val appid: String, //app id
20 | val appname: String, //app name
21 | val uuid: String, //unique device identifier, e.g. imei or androidid
22 | val device: String, //device model, e.g. htc, iphone
23 | val client: Int, //client OS (1:android 2:ios 3:wp)
24 | val osversion: String, //device OS version, e.g. 4.0
25 | val density: String, //screen density, android values: 0.75, 1, 1.5; ios values: 1, 2
26 | val pw: Int, //screen width
27 | val ph: Int, //screen height
28 | val longitude: String, //device longitude
29 | val lat: String, //device latitude
30 | val provincename: String, //province where the device is located
31 | val cityname: String, //city where the device is located
32 | val ispid: Int, //ISP id
33 | val ispname: String, //ISP name
34 | val networkmannerid: Int, //network connection type id
35 | val networkmannername: String, //network connection type name
36 | val iseffective: Int, //validity flag (valid means billable) (0: invalid 1: valid)
37 | val isbilling: Int, //billed or not (0: not billed 1: billed)
38 | val adspacetype: Int, //ad slot type (1: banner 2: interstitial 3: full screen)
39 | val adspacetypename: String, //ad slot type name (banner, interstitial, full screen)
40 | val devicetype: Int, //device type (1: phone 2: tablet)
41 | val processnode: Int, //process node (1: request kpi 2: valid request 3: ad request)
42 | val apptype: Int, //app type id
43 | val district: String, //county/district where the device is located
44 | val paymode: Int, //payment mode for the platform provider, 1: by impressions (CPM) 2: by clicks (CPC)
45 | val isbid: Int, //whether this is an rtb request
46 | val bidprice: Double, //rtb bid price
47 | val winprice: Double, //rtb winning bid price
48 | val iswin: Int, //whether the bid was won
49 | val cur: String, //currency, values: usd|rmb etc.
50 | val rate: Double, //exchange rate
51 | val cnywinprice: Double, //rtb winning price converted to CNY
52 | val imei: String, //imei
53 | val mac: String, //mac
54 | val idfa: String, //idfa
55 | val openudid: String, //openudid
56 | val androidid: String, //androidid
57 | val rtbprovince: String, //rtb province
58 | val rtbcity: String, //rtb city
59 | val rtbdistrict: String, //rtb district
60 | val rtbstreet: String, //rtb street
61 | val storeurl: String, //app store download url
62 | val realip: String, //real ip
63 | val isqualityapp: Int, //preferred-app flag
64 | val bidfloor: Double, //floor price
65 | val aw: Int, //ad slot width
66 | val ah: Int, //ad slot height
67 | val imeimd5: String, //imei_md5
68 | val macmd5: String, //mac_md5
69 | val idfamd5: String, //idfa_md5
70 | val openudidmd5: String, //openudid_md5
71 | val androididmd5: String, //androidid_md5
72 | val imeisha1: String, //imei_sha1
73 | val macsha1: String, //mac_sha1
74 | val idfasha1: String, //idfa_sha1
75 | val openudidsha1: String, //openudid_sha1
76 | val androididsha1: String, //androidid_sha1
77 | val uuidunknow: String, //uuid_unknow, tanx ciphertext
78 | val decuuidunknow: String, // decrypted tanx plaintext
79 | val userid: String, //platform user id
80 | val reqdate: String, //date
81 | val reqhour: String, //hour
82 | val iptype: Int, //ip database type, 1: dianmei ip database, 2: ad association standard ip geo database, default 1
83 | val initbidprice: Double, //initial bid price
84 | val adpayment: Double, //converted ad spend (6 decimal places)
85 | val agentrate: Double, //agent margin
86 | val lomarkrate: Double, //agency margin
87 | val adxrate: Double, //adx (media) margin
88 | val title: String, //title
89 | val keywords: String, //keywords
90 | val tagid: String, //ad slot id (for video traffic this is the video id)
91 | val callbackdate: String, //callback time, format: YYYY/mm/dd hh:mm:ss
92 | val channelid: String, //channel ID
93 | val mediatype: Int ) {//media type: 1 long-tail media 2 video media 3 independent media, default: 1)
94 |
95 | }
96 |
97 | object Logs {
98 |
99 | // build an empty Logs object
100 | def makeLogs(): Logs = {
101 | new Logs("", 0, 0, 0, 0, "", "", 0, 0, 0.0, 0.0, "", "", "", "", "", "", 0, "",
102 | "", 0, 0, "", "", "", "", 0, "", 0, "", 0, 0, 0, "", 0, 0, 0, "", 0, 0,
103 | 0.0, 0.0, 0, "", 0.0, 0.0, "", "", "", "", "", "", "", "", "", "", "", 0, 0.0, 0, 0,
104 | "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 0, 0.0, 0.0, 0.0, 0.0, 0.0, "", "", "", "", "", 0
105 | )
106 | }
107 |
108 | def line2Logs(s:String):Logs ={
109 | if(StringUtils.isNotEmpty(s)){
110 | val fields = s.split(",")
111 | if(fields.length >= 79){
112 | Logs(fields(0), Utils.parseInt(fields(1)), Utils.parseInt(fields(2)), Utils.parseInt(fields(3)), Utils.parseInt(fields(4)), fields(5), fields(6), Utils.parseInt(fields(7)), Utils.parseInt(fields(8)), Utils.parseDouble(fields(9)), Utils.parseDouble(fields(10)),
113 | fields(11), fields(12), fields(13), fields(14), fields(15), fields(16), Utils.parseInt(fields(17)), fields(18), fields(19), Utils.parseInt(fields(20)),
114 | Utils.parseInt(fields(21)), fields(22), fields(23), fields(24), fields(25), Utils.parseInt(fields(26)), fields(27), Utils.parseInt(fields(28)), fields(29), Utils.parseInt(fields(30)),
115 | Utils.parseInt(fields(31)), Utils.parseInt(fields(32)), fields(33), Utils.parseInt(fields(34)), Utils.parseInt(fields(35)), Utils.parseInt(fields(36)), fields(37), Utils.parseInt(fields(38)), Utils.parseInt(fields(39)), Utils.parseDouble(fields(40)),
116 | Utils.parseDouble(fields(41)), Utils.parseInt(fields(42)), fields(43), Utils.parseDouble(fields(44)), Utils.parseDouble(fields(45)), fields(46), fields(47), fields(48), fields(49), fields(50),
117 | fields(51), fields(52), fields(53), fields(54), fields(55), fields(56), Utils.parseInt(fields(57)), Utils.parseDouble(fields(58)), Utils.parseInt(fields(59)), Utils.parseInt(fields(60)),
118 | fields(61), fields(62), fields(63), fields(64), fields(65), fields(66), fields(67), fields(68), fields(69), fields(70),
119 | fields(71), "", fields(72), Utils.fmtDate(fields(11)).getOrElse("unknown"), Utils.fmtHour(fields(11)).getOrElse("unknown"),
120 | Utils.parseInt(fields(73)), Utils.parseDouble(fields(74)), Utils.parseDouble(fields(75)), Utils.parseDouble(fields(76)), Utils.parseDouble(fields(77)), Utils.parseDouble(fields(78)), "", "", "", "", "", 1)
121 | }else{
122 | makeLogs()
123 | }
124 | }else{
125 | makeLogs()
126 | }
127 | }
128 | }
--------------------------------------------------------------------------------
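line2Logs relies on Utils.parseInt and Utils.parseDouble to turn possibly empty or malformed CSV fields into numbers without throwing. The real Utils lives in com.awebone.dmp.util and is not shown here; the following is only a sketch of what such helpers typically look like (names and default values are assumptions):

    import scala.util.Try

    object SafeParse {
      // empty or malformed fields fall back to a default instead of throwing
      def parseInt(s: String, default: Int = 0): Int = Try(s.trim.toInt).getOrElse(default)
      def parseDouble(s: String, default: Double = 0.0): Double = Try(s.trim.toDouble).getOrElse(default)
    }

    // e.g. SafeParse.parseInt("42") == 42, SafeParse.parseInt("") == 0, SafeParse.parseDouble("3.5") == 3.5
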
/weblog/src/main/java/com/awebone/click/ClickSessionStream.java:
--------------------------------------------------------------------------------
1 | package com.awebone.click;
2 |
3 | import java.io.IOException;
4 | import java.lang.reflect.InvocationTargetException;
5 | import java.text.ParseException;
6 | import java.text.SimpleDateFormat;
7 | import java.util.ArrayList;
8 | import java.util.Collections;
9 | import java.util.Comparator;
10 | import java.util.Date;
11 | import java.util.Iterator;
12 | import java.util.UUID;
13 |
14 | import org.apache.commons.beanutils.BeanUtils;
15 | import org.apache.hadoop.conf.Configuration;
16 | import org.apache.hadoop.fs.Path;
17 | import org.apache.hadoop.io.LongWritable;
18 | import org.apache.hadoop.io.NullWritable;
19 | import org.apache.hadoop.io.Text;
20 | import org.apache.hadoop.mapreduce.Job;
21 | import org.apache.hadoop.mapreduce.Mapper;
22 | import org.apache.hadoop.mapreduce.Reducer;
23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
24 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
25 |
26 | import com.awebone.bean.WebLogBean;
27 |
28 | /**
29 | * Extract and transform the click-session-stream data.
30 | * Map side:
31 | * key: ip
32 | * value: custom bean serialized as a string
33 | * Reduce side:
34 | * for all records of the same ip,
35 | * sort them by visit time in ascending order,
36 | * compute the time gap between adjacent records,
37 | * and decide whether they belong to the same session
38 | *
39 | */
40 | public class ClickSessionStream {
41 | static class ClickSessionStreamMapper extends Mapper<LongWritable, Text, Text, WebLogBean>{
42 | Text mk = new Text();
43 | WebLogBean bean = new WebLogBean();
44 |
45 | @Override
46 | protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, WebLogBean>.Context context)
47 | throws IOException, InterruptedException {
48 | String line = value.toString();
49 | String[] pre_datas = line.split("\001");
50 | if(pre_datas.length==9){
51 | bean.setValid(pre_datas[0].equals("true")?true:false);
52 | bean.setRemote_addr(pre_datas[1]);
53 | bean.setRemote_user(pre_datas[2]);
54 | bean.setTime_local(pre_datas[3]);
55 | bean.setRequest(pre_datas[4]);
56 | bean.setStatus(pre_datas[5]);
57 | bean.setBody_bytes_sent(pre_datas[6]);
58 | bean.setHttp_referer(pre_datas[7]);
59 | bean.setHttp_user_agent(pre_datas[8]);
60 |
61 | // keep only valid records
62 | if(bean.isValid()){
63 | mk.set(bean.getRemote_addr());
64 | context.write(mk, bean);
65 | }
66 | }
67 | }
68 |
69 | }
70 |
71 | static class ClickSessionStreamReducer extends Reducer<Text, WebLogBean, Text, NullWritable>{
72 | Text rk = new Text();
73 |
74 | @Override
75 | protected void reduce(Text key, Iterable<WebLogBean> values,
76 | Reducer<Text, WebLogBean, Text, NullWritable>.Context context) throws IOException, InterruptedException {
77 | // collect all records of this ip into a list, then sort them by time ascending
78 | ArrayList<WebLogBean> list = new ArrayList<WebLogBean>();
79 | // Hadoop reuses the key/value instances, so copy each value into a new object before adding it to the list
80 | for (WebLogBean v:values){
81 | // create a fresh object
82 | WebLogBean bean = new WebLogBean();
83 | // copy the properties of the reused iterator object onto the new one
84 | try {
85 | BeanUtils.copyProperties(bean, v);
86 | list.add(bean);
87 | } catch (IllegalAccessException e) {
88 | // TODO Auto-generated catch block
89 | e.printStackTrace();
90 | } catch (InvocationTargetException e) {
91 | // TODO Auto-generated catch block
92 | e.printStackTrace();
93 | }
94 | }
95 |
96 | // sort by time
97 | Collections.sort(list, new Comparator<WebLogBean>() {
98 | public int compare(WebLogBean o1, WebLogBean o2) {
99 | Date date1 = null;
100 | Date date2 = null;
101 | try {
102 | date1 = toDate(o1.getTime_local());
103 | date2 = toDate(o2.getTime_local());
104 | } catch (ParseException e) {
105 | // TODO Auto-generated catch block
106 | e.printStackTrace();
107 | }
108 | if(date1==null || date2==null){
109 | return 0;
110 | }
111 | return date1.compareTo(date2);
112 | }
113 | });
114 |
115 | // walk the list: compute page stay time, assign a session id, start with step=1
116 | int step = 1;
117 | UUID sessionid = UUID.randomUUID();
118 | for (int i = 0; i < list.size(); i++) {
119 | WebLogBean bean = list.get(i);
120 | // if there is only one record, emit it directly with a default stay time
121 | if(list.size()==1){
122 | rk.set(sessionid+"\001"+bean.getRemote_addr()+"\001"+bean.getRemote_user()+"\001"+
123 | bean.getTime_local()+"\001"+bean.getRequest()+"\001"+(60)+"\001"+step+"\001"+
124 | bean.getStatus()+"\001"+bean.getBody_bytes_sent()+"\001"+bean.getHttp_referer()+"\001"+
125 | bean.getHttp_user_agent());
126 | context.write(rk, NullWritable.get());
127 | sessionid = UUID.randomUUID();
128 | break;
129 | }
130 |
131 | // with more than one record, compute the gap: current time minus previous time
132 | if (i==0){
133 | continue;
134 | }
135 | try {
136 | long diffDate = diffDate(bean.getTime_local(), list.get(i-1).getTime_local());
137 | // gap smaller than 30 minutes: same session
138 | if(diffDate < 30*60*1000){
139 | WebLogBean lb = list.get(i-1);
140 | // emit the previous record
141 | rk.set(sessionid+"\001"+lb.getRemote_addr()+"\001"+lb.getRemote_user()+"\001"+
142 | lb.getTime_local()+"\001"+lb.getRequest()+"\001"+(diffDate)/1000+"\001"+step+"\001"+
143 | lb.getStatus()+"\001"+lb.getBody_bytes_sent()+"\001"+lb.getHttp_referer()+"\001"+
144 | lb.getHttp_user_agent());
145 | context.write(rk, NullWritable.get());
146 | step++;
147 | }else{
148 | // gap of 30 minutes or more: start a new session and emit the last record of the previous session with a default stay time
149 | WebLogBean lsl = list.get(i-1);
150 | rk.set(sessionid+"\001"+lsl.getRemote_addr()+"\001"+lsl.getRemote_user()+"\001"+
151 | lsl.getTime_local()+"\001"+lsl.getRequest()+"\001"+(60)+"\001"+step+"\001"+
152 | lsl.getStatus()+"\001"+lsl.getBody_bytes_sent()+"\001"+lsl.getHttp_referer()+"\001"+
153 | lsl.getHttp_user_agent());
154 | context.write(rk, NullWritable.get());
155 |
156 | // reset step and start a new session id
157 | step = 1;
158 | sessionid = UUID.randomUUID();
159 | }
160 |
161 | // emit the last record of the list
162 | if(i == list.size()-1){
163 | WebLogBean cb = list.get(i);
164 | rk.set(sessionid+"\001"+cb.getRemote_addr()+"\001"+cb.getRemote_user()+"\001"+
165 | cb.getTime_local()+"\001"+cb.getRequest()+"\001"+(60)+"\001"+step+"\001"+
166 | cb.getStatus()+"\001"+cb.getBody_bytes_sent()+"\001"+cb.getHttp_referer()+"\001"+
167 | cb.getHttp_user_agent());
168 | context.write(rk, NullWritable.get());
169 | sessionid = UUID.randomUUID();
170 | }
171 | } catch (ParseException e) {
172 | // TODO Auto-generated catch block
173 | e.printStackTrace();
174 | }
175 | }
176 | }
177 |
178 | public static Date toDate(String time) throws ParseException {
179 | SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
180 | Date date = sdf.parse(time);
181 | return date;
182 | }
183 |
184 | public static long diffDate(String date1,String date2) throws ParseException {
185 | Date d1 = toDate(date1);
186 | Date d2 = toDate(date2);
187 | return d1.getTime() - d2.getTime();
188 | }
189 | }
190 |
191 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
192 | System.setProperty("HADOOP_USER_NAME", "hadoop");
193 | Configuration conf = new Configuration();
194 | conf.set("fs.defaultFS", "hdfs://myha/");
195 | Job job = Job.getInstance(conf);
196 |
197 | job.setJarByClass(ClickSessionStream.class);
198 |
199 | job.setMapperClass(ClickSessionStreamMapper.class);
200 | job.setReducerClass(ClickSessionStreamReducer.class);
201 |
202 | job.setMapOutputKeyClass(Text.class);
203 | job.setMapOutputValueClass(WebLogBean.class);
204 | job.setOutputKeyClass(Text.class);
205 | job.setOutputValueClass(NullWritable.class);
206 |
207 | FileInputFormat.setInputPaths(job, new Path("/weblog/pre/20200221"));
208 | FileOutputFormat.setOutputPath(job, new Path("/weblog/click/stream/20200221"));
209 |
210 | boolean res = job.waitForCompletion(true);
211 | System.exit(res ? 0 : 1);
212 | }
213 | }
214 |
--------------------------------------------------------------------------------
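The reducer above starts a new session whenever two consecutive hits from the same IP are more than 30 minutes apart. Stripped of the MapReduce plumbing, the same rule can be sketched in a few lines of Scala over already-sorted timestamps (the 30-minute gap comes from the code above; the object and method names are illustrative):

    import java.util.UUID

    object Sessionize {
      val SessionGapMs: Long = 30 * 60 * 1000L

      // given the hit timestamps (ms, ascending) of one IP, return (sessionId, timestamp) pairs
      def assignSessions(timestamps: Seq[Long]): Seq[(String, Long)] = {
        var sessionId = UUID.randomUUID().toString
        var previous: Option[Long] = None
        timestamps.map { t =>
          if (previous.exists(p => t - p >= SessionGapMs)) {
            sessionId = UUID.randomUUID().toString // gap too large: start a new session
          }
          previous = Some(t)
          (sessionId, t)
        }
      }
    }
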
/flink-train/src/main/scala/com/awebone/flink/project/LogAnalysisWithMySQL.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.flink.project
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util.{Date, Properties}
5 |
6 | import org.apache.flink.api.common.functions.RuntimeContext
7 | import org.apache.flink.api.common.serialization.SimpleStringSchema
8 | import org.apache.flink.api.java.tuple.Tuple
9 | import org.apache.flink.streaming.api.TimeCharacteristic
10 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
11 | import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction
12 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
13 | import org.apache.flink.streaming.api.scala.function.WindowFunction
14 | import org.apache.flink.streaming.api.watermark.Watermark
15 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
16 | import org.apache.flink.streaming.api.windowing.time.Time
17 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow
18 | import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer}
19 | import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink
20 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
21 | import org.apache.flink.util.Collector
22 | import org.apache.http.HttpHost
23 | import org.elasticsearch.action.index.IndexRequest
24 | import org.elasticsearch.client.Requests
25 | import org.slf4j.LoggerFactory
26 |
27 | import scala.collection.mutable
28 | import scala.collection.mutable.ArrayBuffer
29 |
30 | object LogAnalysisWithMySQL {
31 | def main(args: Array[String]): Unit = {
32 | // In production, emit diagnostics through a logger as below
33 | val logger = LoggerFactory.getLogger("LogAnalysis")
34 |
35 | val env = StreamExecutionEnvironment.getExecutionEnvironment
37 | // use event time as the time characteristic for this job
37 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
38 | import org.apache.flink.api.scala._
39 |
40 | /**
41 | * Read the log stream from the Kafka cluster
42 | */
43 | val topic = "cdnlog"
44 | val properties: Properties = new Properties()
45 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092")
46 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk
47 | properties.setProperty("group.id","test-cdnlog-mysql")
48 |
49 | val consumer = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties)
50 | val data = env.addSource(consumer) // consume the Kafka stream
51 | // data.print().setParallelism(1) // uncomment to check connectivity
52 |
53 | /**
54 | * Data cleansing:
55 | * in production, business logic has to be robust and the data has to be accurate,
56 | * so dirty records and records that violate the business rules are filtered out
57 | * before any further processing
58 | */
59 | val logData = data.map(x => {
60 | val strings = x.split("\t")
61 |
62 | val level = strings(2)
63 | val timeStr = strings(3)
64 | var time = 0L
65 | try {
66 | val sourceFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
67 | time = sourceFormat.parse(timeStr).getTime
68 | } catch {
69 | case e:Exception => {
70 | logger.error(s"time parse error: $timeStr", e)
71 | }
72 | }
73 |
74 | val domain = strings(5)
75 | val traffic = strings(6).toLong
76 | (level, time, domain, traffic)
77 | }).filter(_._2 != 0).filter(_._1 == "E")
78 | .map(x => {
79 | (x._2, x._3, x._4) // keep only the fields the business rules need: drop level, keep (time, domain, traffic)
80 | })
81 |
82 | /**
83 | * Connect the MySQL stream and join in the user id field
84 | */
85 | val mysqlData = env.addSource(new MySQLSource)
86 | // mysqlData.print()
87 | val connectData = logData.connect(mysqlData)
88 | .flatMap(new CoFlatMapFunction[(Long, String, Long), mutable.HashMap[String, String], (Long, String, Long, String)] {
89 | var userDomainMap: mutable.HashMap[String, String] = mutable.HashMap[String, String]()
90 |
91 | // flatMap1: the log stream, look up the user id for this record's domain
92 | override def flatMap1(in1: (Long, String, Long), collector: Collector[(Long, String, Long, String)]): Unit = {
93 | val domain = in1._2
94 | val userId = userDomainMap.getOrElse(domain, "")
95 | // collector.collect(in1._1 + "\t" + in1._2 + "\t" + in1._3 + "\t" + userId)
96 | collector.collect((in1._1, domain, in1._3, userId))
97 | }
98 |
99 | override def flatMap2(in2: mutable.HashMap[String, String], collector: Collector[(Long, String, Long, String)]): Unit = {
100 | userDomainMap = in2
101 | }
102 | })
103 |
104 | // connectData.print()
105 |
106 | /**
107 | * Assign timestamps and watermarks to handle out-of-order events;
108 | * AssignerWithPeriodicWatermarks[T] matches the tuple type of the enriched stream
109 | */
110 | val resultData = connectData.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(Long, String, Long, String)] {
111 | // maximum tolerated out-of-orderness
112 | val maxOutOfOrderness = 10000L // 10 seconds
113 | // largest event timestamp seen so far
114 | var currentMaxTimestamp: Long = _
115 |
116 | // derive the watermark from the maximum timestamp seen so far
117 | override def getCurrentWatermark: Watermark = {
118 | new Watermark(currentMaxTimestamp - maxOutOfOrderness)
119 | }
120 |
121 | // extract the event time
122 | override def extractTimestamp(element: (Long, String, Long, String), previousElementTimestamp: Long): Long = {
123 | // the event time is carried in the first tuple field
124 | val timestamp: Long = element._1
125 | currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp)
126 | timestamp
127 | }
128 | }) // windowed business logic: traffic per user over the most recent minute
129 | .keyBy(3) // group by user id
130 | .window(TumblingEventTimeWindows.of(Time.seconds(60))) // 60-second tumbling windows
131 | .apply(new WindowFunction[(Long, String, Long, String), (String, String, Long, String), Tuple, TimeWindow] {
132 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(Long, String, Long, String)], out: Collector[(String, String, Long, String)]): Unit = {
133 | val userid = key.getField(0).toString // the key is the user id
134 |
135 | var sum = 0L; var domain = ""
136 | val times = ArrayBuffer[Long]()
137 | val iterator = input.iterator
138 | while (iterator.hasNext) {
139 | val next = iterator.next()
140 | sum += next._3; domain = next._2 // accumulate traffic; remember a domain seen for this user
141 | times.append(next._1) // collect the event times falling in this window
142 | }
143 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(times.max)) // the minute of this window, formatted
144 |
145 | /**
146 | * Emitted result:
147 | * 1st field: the minute of this window
148 | * 2nd field: domain
149 | * 3rd field: total traffic, 4th field: user id
150 | */
151 | out.collect((time, domain, sum, userid))
152 | }
153 | })
154 | resultData.print().setParallelism(1)
155 |
156 |
157 | /**
158 | * Sink the results into Elasticsearch
159 | * and visualize them with Kibana
160 | */
161 | val httpHosts = new java.util.ArrayList[HttpHost]
162 | httpHosts.add(new HttpHost("redhat", 9200, "http"))
163 |
164 | val esSinkBuilder = new ElasticsearchSink.Builder[(String, String, Long, String)](
165 | httpHosts,
166 | new ElasticsearchSinkFunction[(String, String, Long, String)] {
167 | override def process(t: (String, String, Long, String), runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = {
168 | requestIndexer.add(createIndexRequest(t))
169 | }
170 |
171 | def createIndexRequest(element: (String, String, Long, String)): IndexRequest = {
172 | val json = new java.util.HashMap[String, Any]
173 | json.put("time", element._1)
174 | json.put("domain", element._2)
175 | json.put("traffics", element._3)
176 | json.put("userid", element._4)
177 | val id = element._1 + "-" + element._2
178 | return Requests.indexRequest()
179 | .index("cdn")
180 | .`type`("traffic-userid")
181 | .id(id)
182 | .source(json)
183 | }
184 | }
185 | )
186 |
188 | // maximum number of actions buffered per bulk request
188 | esSinkBuilder.setBulkFlushMaxActions(1)
189 | resultData.addSink(esSinkBuilder.build()) //.setParallelism(5)
190 |
191 | env.execute("LogAnalysisWithMySQL")
192 | }
193 | }
194 |
--------------------------------------------------------------------------------
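The CoFlatMapFunction above enriches each log record with a user id looked up in the latest domain-to-user map pushed by MySQLSource. A stripped-down sketch of that enrichment pattern with simplified tuple types (the class name EnrichWithUser and the field layout are illustrative, not the project's):

    import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction
    import org.apache.flink.util.Collector

    import scala.collection.mutable

    // stream 1: (domain, traffic) log records; stream 2: full domain -> userId snapshots
    class EnrichWithUser
      extends CoFlatMapFunction[(String, Long), mutable.HashMap[String, String], (String, Long, String)] {
      private var domainToUser: mutable.HashMap[String, String] = mutable.HashMap.empty

      override def flatMap1(log: (String, Long), out: Collector[(String, Long, String)]): Unit = {
        // look up the owner of this domain; empty string if no snapshot has arrived yet
        out.collect((log._1, log._2, domainToUser.getOrElse(log._1, "")))
      }

      override def flatMap2(snapshot: mutable.HashMap[String, String], out: Collector[(String, Long, String)]): Unit = {
        domainToUser = snapshot // replace the whole mapping on every refresh
      }
    }

    // usage sketch: logStream.connect(mysqlStream).flatMap(new EnrichWithUser)
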
/dmp/src/main/scala/com/awebone/dmp/personas/DmpPersonasJob.scala:
--------------------------------------------------------------------------------
1 | package com.awebone.dmp.personas
2 |
3 | import java.io.FileInputStream
4 | import java.util.Properties
5 |
6 | import com.awebone.dmp.Logs
7 | import com.awebone.dmp.constants.AdTagConstants
8 | import com.awebone.dmp.tags._
9 | import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
10 | import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
11 | import org.apache.log4j.{Level, Logger}
12 | import org.apache.spark.SparkConf
13 | import org.apache.spark.rdd.RDD
14 | import org.apache.spark.sql.{Dataset, SparkSession}
15 |
16 | import scala.collection.{JavaConversions, mutable}
17 |
18 | /**
19 | * DMP user-profile tag aggregation job
20 | */
21 | object DmpPersonasJob {
22 | def main(args: Array[String]): Unit = {
23 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
24 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
25 | Logger.getLogger("org.spark-project").setLevel(Level.WARN)
26 |
27 | if (args == null || args.length < 1) {
28 | println(
29 | """Parameter Errors! Usage:
30 | |inputpath : input path
31 | """.stripMargin)
32 | System.exit(-1)
33 | }
34 | val Array(inputpath) = args
35 |
36 | val conf: SparkConf = new SparkConf().setAppName("DmpPersonasJob").setMaster("local[*]")
37 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
38 | import spark.implicits._
39 |
40 | val input: Dataset[Logs] = spark.read.parquet(inputpath).as[Logs]
41 | val logs: RDD[Logs] = input.rdd
42 |
43 | // extract the tags of each log record, keyed by user id
44 | val userid2Tags: RDD[(String, Map[String, Int])] = logs.map { case logs: Logs => {
45 | var userid: String = logs.userid
46 | if (userid == null) {
47 | userid = getNotEmptyID(logs).getOrElse("UnKnown")
48 | }
49 |
50 | val adspaceTags: Map[String, Int] = AdPositionTag.extractTag(logs)
51 | val appTags: Map[String, Int] = AppTag.extractTag(logs)
52 | val channelTags: Map[String, Int] = ChannelTag.extractTag(logs)
53 | val deviceTags: Map[String, Int] = DeviceTag.extractTag(logs)
54 | val kwTags: Map[String, Int] = KeyWordTag.extractTag(logs)
55 | val areaTags: Map[String, Int] = AreaTag.extractTag(logs)
56 |
57 | (userid, adspaceTags.++(appTags).++(channelTags).++(deviceTags).++(kwTags).++(areaTags))
58 | }
59 | }
60 |
61 | // merge map1 and map2 for the same user, summing the count of each tag
62 | val userid2AggrTags: RDD[(String, Map[String, Int])] = userid2Tags.reduceByKey { case (map1, map2) => {
63 | val map = mutable.Map[String, Int]()
64 | map.++=(map1)
65 |
66 | for ((k, v) <- map2) {
67 | map.put(k, map.getOrElse(k, 0) + v)
68 | }
69 | map.toMap
70 | }
71 | }
72 | // userid2AggrTags.foreach(println)
73 | // (2,Map(NET_3 -> 2, ZC_益阳市 -> 2, DEVICE_1 -> 2, APP_其他 -> 2, ZP_湘南省 -> 2, LC_02 -> 2, ISP_4 -> 2, CN_ -> 2))
74 | // (1,Map(ZP_上海市 -> 2, NET_3 -> 2, DEVICE_1 -> 2, APP_马上赚 -> 2, LC_02 -> 2, ISP_4 -> 2, CN_ -> 2, ZC_上海市 -> 2))
75 |
76 | // map dictionary ids to readable names using the loaded dictionaries
77 | val props = loadProperties()
78 | val propsBC = spark.sparkContext.broadcast(props)
79 |
80 | val aggrTags = userid2AggrTags.map{case (userid, tagMap) => {
81 | val map = mutable.Map[String, Int]()
82 | val propsMap = propsBC.value
83 |
84 | for((k,v) <- tagMap){
85 | var key = k
86 |
87 | if(k.contains(AdTagConstants.PREFIX_AD_DEVICE_TAG)){
88 | val dMap = propsMap(AdTagConstants.PREFIX_AD_DEVICE_TAG)
89 | val id = k.split("_")(1)
90 | val dName = dMap.get(id).get.split("\\s+")(1)
91 | //k --> prefix_id
92 | key = AdTagConstants.PREFIX_AD_DEVICE_TAG + dName
93 | }else if(k.contains(AdTagConstants.PREFIX_AD_ISP_TAG)) {
94 | val ispMap = propsMap(AdTagConstants.PREFIX_AD_ISP_TAG)
95 | val id = k.split("_")(1)
96 | val ispName = ispMap.get(id).get.split("\\s+")(1)
97 | key = AdTagConstants.PREFIX_AD_ISP_TAG + ispName
98 | } else if(k.contains(AdTagConstants.PREFIX_AD_NETWORK_TAG)) {
99 | val nwMap = propsMap(AdTagConstants.PREFIX_AD_NETWORK_TAG)
100 | val id = k.split("_")(1)
101 | val nwName = nwMap.get(id).get.split("\\s+")(1)
102 | key = AdTagConstants.PREFIX_AD_NETWORK_TAG + nwName
103 | }
104 | map.put(key, v)
105 | }
106 |
107 | (userid, map)
108 | }}
109 |
110 | /**
111 | * Store the aggregated tags in HBase.
112 | * The computed tags are semi-structured, and when the dmp exchanges data with the dsp under heavy traffic
113 | * MySQL cannot keep up, so HBase is used as the store instead.
114 | * create_namespace 'bigdata'
115 | * create 'bigdata:dmp_tag', 'cf'
116 | * HBase api
117 | */
118 | aggrTags.foreachPartition(partition => {
119 | if(partition != null){
120 | val connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
121 | val table = connection.getTable(TableName.valueOf("bigdata:dmp_tag"))
122 |
123 | partition.foreach{case (userid, tagMap) => {
124 | val put = new Put(userid.getBytes())
125 |
126 | //tagMap--[Device_xxxx, 5]
127 | for((col,value) <- tagMap){
128 | put.addColumn("cf".getBytes(), col.getBytes(), value.toString.getBytes())
129 | }
130 | table.put(put)
131 | }}
132 |
133 | table.close()
134 | connection.close()
135 | }
136 | })
137 |
138 | spark.stop()
139 | }
140 |
141 | /**
142 | * Load the mapping dictionaries:
143 | * for each type
144 | * (device, isp, network)
145 | * build a map of id -> name
146 | * from the corresponding .dic file under the data directory
147 | */
148 | def loadProperties():mutable.Map[String, mutable.Map[String, String]] = {
149 | val props = mutable.Map[String, mutable.Map[String, String]]()
150 | val properties = new Properties()
151 |
152 | // load the device mapping
153 | properties.load(new FileInputStream("data/device-mapping.dic"))
154 | val deviceMap = mutable.Map[String, String]()
155 |
156 | for (dk <- JavaConversions.asScalaSet(properties.keySet())){
157 | deviceMap.put(dk.toString,properties.getProperty(dk.toString))
158 | }
159 | props.put(AdTagConstants.PREFIX_AD_DEVICE_TAG, deviceMap)
160 |
161 | // load the isp mapping
162 | properties.clear()
163 | properties.load(new FileInputStream("data/isp-mapping.dic"))
164 | val ispMap = mutable.Map[String, String]()
165 | for(dk <- JavaConversions.asScalaSet(properties.keySet())) {
166 | ispMap.put(dk.toString, properties.getProperty(dk.toString))
167 | }
168 | props.put(AdTagConstants.PREFIX_AD_ISP_TAG, ispMap)
169 |
170 | //network
171 | properties.clear()
172 | properties.load(new FileInputStream("data/network-mapping.dic"))
173 | val nwMap = mutable.Map[String, String]()
174 | for(dk <- JavaConversions.asScalaSet(properties.keySet())) {
175 | nwMap.put(dk.toString, properties.getProperty(dk.toString))
176 | }
177 | props.put(AdTagConstants.PREFIX_AD_NETWORK_TAG, nwMap)
178 |
179 | props
180 | }
181 |
182 |
183 | // pick the first non-empty device identifier as the user id
184 | def getNotEmptyID(log: Logs): Option[String] = {
185 | log match {
186 | case v if v.imei.nonEmpty => Some("IMEI:" + v.imei.replaceAll(":|-", "").toUpperCase)
187 | case v if v.imeimd5.nonEmpty => Some("IMEIMD5:" + v.imeimd5.toUpperCase)
188 | case v if v.imeisha1.nonEmpty => Some("IMEISHA1:" + v.imeisha1.toUpperCase)
189 |
190 | case v if v.androidid.nonEmpty => Some("ANDROIDID:" + v.androidid.toUpperCase)
191 | case v if v.androididmd5.nonEmpty => Some("ANDROIDIDMD5:" + v.androididmd5.toUpperCase)
192 | case v if v.androididsha1.nonEmpty => Some("ANDROIDIDSHA1:" + v.androididsha1.toUpperCase)
193 |
194 | case v if v.mac.nonEmpty => Some("MAC:" + v.mac.replaceAll(":|-", "").toUpperCase)
195 | case v if v.macmd5.nonEmpty => Some("MACMD5:" + v.macmd5.toUpperCase)
196 | case v if v.macsha1.nonEmpty => Some("MACSHA1:" + v.macsha1.toUpperCase)
197 |
198 | case v if v.idfa.nonEmpty => Some("IDFA:" + v.idfa.replaceAll(":|-", "").toUpperCase)
199 | case v if v.idfamd5.nonEmpty => Some("IDFAMD5:" + v.idfamd5.toUpperCase)
200 | case v if v.idfasha1.nonEmpty => Some("IDFASHA1:" + v.idfasha1.toUpperCase)
201 |
202 | case v if v.openudid.nonEmpty => Some("OPENUDID:" + v.openudid.toUpperCase)
203 | case v if v.openudidmd5.nonEmpty => Some("OPENDUIDMD5:" + v.openudidmd5.toUpperCase)
204 | case v if v.openudidsha1.nonEmpty => Some("OPENUDIDSHA1:" + v.openudidsha1.toUpperCase)
205 |
206 | case _ => None
207 | }
208 | }
209 | }
210 |
--------------------------------------------------------------------------------
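The reduceByKey above folds the tag-count maps of the same user into one map by summing the count of each tag. That merge can be checked without Spark; a minimal sketch in plain Scala over the same Map[String, Int] shape (object name and sample tags are illustrative):

    object TagMerge {
      // sum the counts of tags appearing in either map
      def mergeTags(a: Map[String, Int], b: Map[String, Int]): Map[String, Int] =
        b.foldLeft(a) { case (acc, (tag, count)) =>
          acc + (tag -> (acc.getOrElse(tag, 0) + count))
        }

      def main(args: Array[String]): Unit = {
        val m1 = Map("APP_game" -> 2, "NET_3" -> 1)
        val m2 = Map("APP_game" -> 1, "ZP_Shanghai" -> 4)
        println(mergeTags(m1, m2)) // prints the merged counts: APP_game -> 3, NET_3 -> 1, ZP_Shanghai -> 4
      }
    }
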
/mllib/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | 4.0.0
6 |
7 | com.awebone.spark
8 | mllib
9 | 1.0-SNAPSHOT
10 |
11 | mllib
12 |
13 | http://www.example.com
14 |
15 |
16 | UTF-8
17 | 1.8
18 | 1.8
19 | UTF-8
20 | 2.11.8
21 | 2.3.1
22 | 2.7.6
23 | 2.11
24 |
25 |
26 |
27 |
28 | org.scala-lang
29 | scala-library
30 | ${scala.version}
31 |
32 |
33 |
34 | org.apache.spark
35 | spark-core_2.11
36 | ${spark.version}
37 |
38 |
39 |
40 | org.apache.spark
41 | spark-sql_2.11
42 | ${spark.version}
43 |
44 |
45 |
46 | org.apache.spark
47 | spark-streaming_2.11
48 | ${spark.version}
49 |
50 |
51 |
52 | org.apache.spark
53 | spark-graphx_2.11
54 | ${spark.version}
55 |
56 |
57 |
58 | org.apache.spark
59 | spark-mllib_2.11
60 | ${spark.version}
61 |
62 |
63 |
64 | org.apache.hadoop
65 | hadoop-client
66 | ${hadoop.version}
67 |
68 |
69 |
70 | org.apache.spark
71 | spark-streaming-kafka-0-10_2.11
72 | 2.3.1
73 |
74 |
75 |
76 | org.apache.spark
77 | spark-streaming-flume_2.11
78 | ${spark.version}
79 |
80 |
81 |
82 | mysql
83 | mysql-connector-java
84 | 5.1.46
85 |
86 |
87 |
88 | org.apache.spark
89 | spark-hive_2.11
90 | ${spark.version}
91 |
92 |
93 |
94 |
95 | org.apache.kafka
96 | kafka_2.11
97 | 1.1.0
98 |
99 |
100 |
101 | junit
102 | junit
103 | 4.11
104 | test
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 | net.alchim31.maven
113 | scala-maven-plugin
114 | 3.2.2
115 |
116 |
117 | org.apache.maven.plugins
118 | maven-compiler-plugin
119 | 3.5.1
120 |
121 |
122 |
123 |
124 |
125 | net.alchim31.maven
126 | scala-maven-plugin
127 |
128 |
129 | scala-compile-first
130 | process-resources
131 |
132 | add-source
133 | compile
134 |
135 |
136 |
137 | scala-test-compile
138 | process-test-resources
139 |
140 | testCompile
141 |
142 |
143 |
144 |
145 |
146 |
147 | org.apache.maven.plugins
148 | maven-compiler-plugin
149 |
150 |
151 | compile
152 |
153 | compile
154 |
155 |
156 |
157 |
158 |
159 |
160 | org.apache.maven.plugins
161 | maven-shade-plugin
162 | 2.4.3
163 |
164 |
165 | package
166 |
167 | shade
168 |
169 |
170 |
171 |
172 | *:*
173 |
174 | META-INF/*.SF
175 | META-INF/*.DSA
176 | META-INF/*.RSA
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
--------------------------------------------------------------------------------
/dmp/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | 4.0.0
6 |
7 | com.awebone
8 | dmp
9 | 1.0-SNAPSHOT
10 |
11 | dmp
12 |
13 | http://www.example.com
14 |
15 |
16 | UTF-8
17 | 2.11.8
18 | 2.3.1
19 |
20 |
21 |
22 |
23 | scala-tools.org
24 | Scala-Tools Maven2 Repository
25 | http://scala-tools.org/repo-releases
26 |
27 |
28 |
29 |
30 |
31 | scala-tools.org
32 | Scala-Tools Maven2 Repository
33 | http://scala-tools.org/repo-releases
34 |
35 |
36 |
37 |
38 |
39 | org.scala-lang
40 | scala-library
41 | ${scala.version}
42 |
43 |
44 | junit
45 | junit
46 | 4.11
47 |
54 | test
55 |
56 |
57 | org.apache.spark
58 | spark-core_2.11
59 | ${spark.version}
60 |
61 |
62 | org.apache.spark
63 | spark-sql_2.11
64 | ${spark.version}
65 |
66 |
67 | org.apache.spark
68 | spark-hive_2.11
69 | ${spark.version}
70 |
71 |
72 | mysql
73 | mysql-connector-java
74 | 5.1.40
75 |
76 |
77 |
78 | org.apache.hbase
79 | hbase-client
80 | 1.2.6
81 |
82 |
83 | org.apache.hbase
84 | hbase-server
85 | 1.2.6
86 |
87 |
88 |
89 |
90 |
91 |
92 | org.scala-tools
93 | maven-scala-plugin
94 | 2.15.0
95 |
96 |
97 |
98 | compile
99 | testCompile
100 |
101 |
102 |
103 |
104 | ${scala.version}
105 |
106 | -target:jvm-1.5
107 |
108 |
109 |
110 |
111 | org.apache.maven.plugins
112 | maven-eclipse-plugin
113 | 2.10
114 |
115 | true
116 |
117 | ch.epfl.lamp.sdt.core.scalabuilder
118 |
119 |
120 | ch.epfl.lamp.sdt.core.scalanature
121 |
122 |
123 | org.eclipse.jdt.launching.JRE_CONTAINER
124 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER
125 |
126 |
127 |
128 |
129 | maven-assembly-plugin
130 |
131 |
132 | jar-with-dependencies
133 |
134 |
135 |
138 |
139 |
140 |
141 |
142 | make-assembly
143 | package
144 |
145 | single
146 |
147 |
148 |
149 |
150 |
151 | org.apache.maven.plugins
152 | maven-compiler-plugin
153 |
154 | 1.8
155 | 1.8
156 |
157 |
158 |
159 | org.codehaus.mojo
160 | build-helper-maven-plugin
161 | 1.10
162 |
163 |
164 | add-source
165 | generate-sources
166 |
167 | add-source
168 |
169 |
170 |
171 |
172 | src/main/java
173 | src/main/scala
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
--------------------------------------------------------------------------------
/flink-train/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | 4.0.0
6 |
7 | com.awebone
8 | flink
9 | 1.0-SNAPSHOT
10 |
11 | flink
12 |
13 | http://www.example.com
14 |
15 |
16 | UTF-8
17 | 1.8
18 | 1.8
19 | 1.7.2
20 | 2.11
21 | 2.11.8
22 | 2.7.6
23 | 1.4.3
24 | 1.2.7
25 |
26 |
27 |
28 |
29 |
30 | org.scala-lang
31 | scala-library
32 | ${scala.version}
33 |
34 |
35 |
36 |
37 | org.apache.flink
38 | flink-scala_${scala.binary.version}
39 | ${flink.version}
40 |
41 |
42 | org.apache.flink
43 | flink-streaming-scala_${scala.binary.version}
44 | ${flink.version}
45 |
46 |
47 |
48 |
49 | org.apache.flink
50 | flink-java
51 | ${flink.version}
52 | compile
53 |
54 |
55 | org.apache.flink
56 | flink-streaming-java_2.11
57 | ${flink.version}
58 | compile
59 |
60 |
61 |
62 |
63 | org.apache.flink
64 | flink-table_2.11
65 | ${flink.version}
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 | org.apache.flink
81 | flink-connector-filesystem_2.11
82 | ${flink.version}
83 |
84 |
85 | org.apache.flink
86 | flink-connector-kafka_2.11
87 | ${flink.version}
88 |
89 |
90 | org.apache.flink
91 | flink-avro
92 | ${flink.version}
93 |
94 |
95 |
96 |
97 | org.apache.bahir
98 | flink-connector-redis_2.11
99 | 1.0
100 |
101 |
102 |
103 | org.apache.flink
104 | flink-connector-kafka-0.10_${scala.binary.version}
105 | ${flink.version}
106 |
107 |
108 | org.apache.flink
109 | flink-connector-elasticsearch6_2.11
110 | ${flink.version}
111 |
112 |
113 | org.apache.flink
114 | flink-json
115 | ${flink.version}
116 |
117 |
118 | org.apache.flink
119 | flink-hbase_2.11
120 | ${flink.version}
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 | org.slf4j
161 | slf4j-log4j12
162 | 1.7.10
163 | runtime
164 |
165 |
166 | log4j
167 | log4j
168 | 1.2.17
169 | runtime
170 |
171 |
172 | mysql
173 | mysql-connector-java
174 | 5.1.40
175 |
176 |
177 | org.apache.hadoop
178 | hadoop-client
179 | ${hadoop.version}
180 |
181 |
182 | org.apache.kafka
183 | kafka-clients
184 | 1.1.0
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 | junit
206 | junit
207 | 4.11
208 | test
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 | org.apache.maven.plugins
218 | maven-shade-plugin
219 | 3.0.0
220 |
221 |
222 |
223 | package
224 |
225 | shade
226 |
227 |
228 |
229 |
230 | org.apache.flink:force-shading
231 | com.google.code.findbugs:jsr305
232 | org.slf4j:*
233 | log4j:*
234 |
235 |
236 |
237 |
238 |
240 | *:*
241 |
242 | META-INF/*.SF
243 | META-INF/*.DSA
244 | META-INF/*.RSA
245 |
246 |
247 |
248 |
249 |
251 | com.lp.demo.StreamingJob
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 | net.alchim31.maven
262 | scala-maven-plugin
263 | 3.2.2
264 |
265 |
266 |
267 | -target:jvm-1.8
268 | -feature
269 | -deprecation
270 | -explaintypes
271 | -unchecked
272 | -Xlint
273 |
274 |
275 |
276 |
277 |
278 | compile
279 | testCompile
280 |
281 |
282 |
283 |
284 |
285 |
286 | org.codehaus.mojo
287 | build-helper-maven-plugin
288 | 1.8
289 |
290 |
291 |
292 | add-source
293 | generate-sources
294 |
295 | add-source
296 |
297 |
298 |
299 | src/main/scala
300 |
301 |
302 |
303 |
304 |
305 | add-test-source
306 | generate-test-sources
307 |
308 | add-test-source
309 |
310 |
311 |
312 | src/test/scala
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 | add-dependencies-for-IDEA
410 |
411 |
412 |
413 | idea.version
414 |
415 |
416 |
417 |
418 |
419 | org.apache.flink
420 | flink-scala_${scala.binary.version}
421 | ${flink.version}
422 | compile
423 |
424 |
425 | org.apache.flink
426 | flink-streaming-scala_${scala.binary.version}
427 | ${flink.version}
428 | compile
429 |
430 |
431 | org.scala-lang
432 | scala-library
433 | ${scala.version}
434 | compile
435 |
436 |
437 | org.apache.flink
438 | flink-java
439 | ${flink.version}
440 | compile
441 |
442 |
443 | org.apache.flink
444 | flink-streaming-java_2.11
445 | ${flink.version}
446 | compile
447 |
448 |
449 |
450 |
451 |
452 |
453 |
--------------------------------------------------------------------------------