├── .gitignore ├── .gitattributes ├── dmp ├── dmp.iml ├── data │ ├── isp-mapping.dic │ ├── device-mapping.dic │ ├── network-mapping.dic │ └── data.txt ├── src │ └── main │ │ ├── scala │ │ └── com │ │ │ └── awebone │ │ │ └── dmp │ │ │ ├── tags │ │ │ ├── Tags.scala │ │ │ ├── AppTag.scala │ │ │ ├── ChannelTag.scala │ │ │ ├── AdPositionTag.scala │ │ │ ├── AreaTag.scala │ │ │ ├── KeyWordTag.scala │ │ │ └── DeviceTag.scala │ │ │ ├── constants │ │ │ └── AdTagConstants.scala │ │ │ ├── util │ │ │ └── Utils.scala │ │ │ ├── etl │ │ │ ├── DMPLogETLOps.scala │ │ │ └── DMPLogETLHDFSOps.scala │ │ │ ├── report │ │ │ ├── ProvinceCityQuantityJob.scala │ │ │ └── AreaRequestDistributionJob.scala │ │ │ ├── Logs.scala │ │ │ └── personas │ │ │ └── DmpPersonasJob.scala │ │ └── resources │ │ ├── hive-site.xml │ │ ├── hbase-site.xml │ │ ├── core-site.xml │ │ └── hdfs-site.xml ├── script │ └── mysql-create.sql └── pom.xml ├── mllib ├── mllib.iml ├── src │ └── main │ │ ├── resources │ │ ├── ml-1m │ │ │ ├── movies.dat │ │ │ └── README │ │ ├── core-site.xml │ │ └── hdfs-site.xml │ │ ├── scala │ │ └── com │ │ │ └── awebone │ │ │ └── spark │ │ │ ├── WordCountScala.scala │ │ │ ├── MovieLensSparkShell.scala │ │ │ └── MovieLensALS.scala │ │ └── java │ │ └── com │ │ └── awebone │ │ └── spark │ │ ├── WordCountJava8.java │ │ └── WordCountJava7.java └── pom.xml ├── akka_rpc ├── akka_rpc.iml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── awebone │ │ │ └── hadoop_rpc │ │ │ ├── MyDataNode.java │ │ │ ├── MyServerProtocal.java │ │ │ ├── MyServerImpl.java │ │ │ ├── NameNodeClient.java │ │ │ └── MyNamenode.java │ │ └── scala │ │ └── com │ │ └── awebone │ │ ├── yarn │ │ ├── Constant.scala │ │ ├── Message.scala │ │ ├── MyNodeManager.scala │ │ └── MyResourceManager.scala │ │ └── akka_rpc │ │ ├── Worker.scala │ │ └── Master.scala └── pom.xml ├── flink-train ├── flink-train.iml ├── src │ └── main │ │ ├── resources │ │ ├── scripts │ │ │ ├── kafka-script │ │ │ ├── mysql.sql │ │ │ └── es-scripts │ │ ├── hive-site.xml │ │ ├── hbase-site.xml │ │ ├── core-site.xml │ │ └── hdfs-site.xml │ │ └── scala │ │ └── com │ │ └── awebone │ │ └── flink │ │ ├── connetcor │ │ └── FileSystemSinkApp.scala │ │ └── project │ │ ├── MySQLSource.scala │ │ ├── MockKafkaProducer.scala │ │ ├── LogAnalysis.scala │ │ └── LogAnalysisWithMySQL.scala └── pom.xml ├── weblog ├── .settings │ ├── org.eclipse.m2e.core.prefs │ ├── org.eclipse.core.resources.prefs │ └── org.eclipse.jdt.core.prefs ├── src │ ├── main │ │ └── java │ │ │ ├── log4j.properties │ │ │ ├── core-site.xml │ │ │ ├── com │ │ │ └── awebone │ │ │ │ ├── pre │ │ │ │ ├── WebLogParse.java │ │ │ │ └── WebLogPreProcess.java │ │ │ │ ├── bean │ │ │ │ ├── VisitBean.java │ │ │ │ ├── PageViewsBean.java │ │ │ │ └── WebLogBean.java │ │ │ │ └── click │ │ │ │ ├── ClickModel.java │ │ │ │ └── ClickSessionStream.java │ │ │ ├── hdfs-site.xml │ │ │ └── hive-op.txt │ └── test │ │ └── java │ │ └── com │ │ └── awebone │ │ └── weblog │ │ └── AppTest.java ├── .project ├── pom.xml └── .classpath ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | target/ 4 | out/ -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /dmp/dmp.iml: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mllib/mllib.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /akka_rpc/akka_rpc.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dmp/data/isp-mapping.dic: -------------------------------------------------------------------------------- 1 | 1=移动 D0003001 2 | 2=联通 D0003002 3 | 3=电信 D0003003 4 | 4=OPERATOROTHER D0003004 -------------------------------------------------------------------------------- /flink-train/flink-train.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dmp/data/device-mapping.dic: -------------------------------------------------------------------------------- 1 | 1=Android D0001001 2 | 2=IOS D0001002 3 | 3=Winphone D0001003 4 | 4=其他 D0001004 -------------------------------------------------------------------------------- /dmp/data/network-mapping.dic: -------------------------------------------------------------------------------- 1 | 1=WIFI D0002001 2 | 2=4G D0002002 3 | 3=3G D0002003 4 | 4=2G D0002004 5 | 5=NWTWORKOTHER D0004004 -------------------------------------------------------------------------------- /mllib/src/main/resources/ml-1m/movies.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xuyanbo03/bigdata-projects/HEAD/mllib/src/main/resources/ml-1m/movies.dat -------------------------------------------------------------------------------- /weblog/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /weblog/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyDataNode.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | public class MyDataNode { 4 | 5 | public static void main(String[] args) { 6 | 7 | 8 | 9 | 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/Tags.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | 5 | /** 6 | * 用户提取标签的特质 7 | */ 8 | trait Tags { 9 | 10 | def extractTag(logs:Logs):Map[String, Int] 11 | } 12 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyServerProtocal.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | public interface 
MyServerProtocal { 4 | 5 | long versionID = 12345678L; 6 | 7 | void hello(); 8 | 9 | String getName(); 10 | } 11 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/Constant.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | object Constant { 4 | val RMAS = "MyResourceManagerActorSystem" 5 | val RMA = "MyResourceManagerActor" 6 | val NMAS = "MyNodeManagerActorSystem" 7 | val NMA = "MyNodeManagerActor" 8 | } 9 | -------------------------------------------------------------------------------- /weblog/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 3 | org.eclipse.jdt.core.compiler.compliance=1.5 4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 5 | org.eclipse.jdt.core.compiler.source=1.5 6 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyServerImpl.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | public class MyServerImpl implements MyServerProtocal{ 4 | 5 | @Override 6 | public void hello() { 7 | System.out.println("hi"); 8 | } 9 | 10 | @Override 11 | public String getName() { 12 | return "mynamenode"; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /weblog/src/main/java/log4j.properties: -------------------------------------------------------------------------------- 1 | ###set log levels### 2 | log4j.rootLogger=info, stdout 3 | ###output to the console### 4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | log4j.appender.stdout.Target=System.out 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.ConversionPattern=[%d{dd/MM/yy HH:mm:ss:SSS z}] %t %5p %c{2}: %m%n -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/AppTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | object AppTag extends Tags { 7 | override def extractTag(logs: Logs) = { 8 | val map = Map[String, Int]((AdTagConstants.PREFIX_AD_APP_TAG + logs.appname -> 1)) 9 | map 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/ChannelTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | /** 7 | * 3)渠道(标签格式:CNxxxx->1)xxxx为渠道ID 8 | */ 9 | object ChannelTag extends Tags { 10 | override def extractTag(logs: Logs) = { 11 | if(logs.channelid == null) { 12 | Map[String, Int]() 13 | } else { 14 | Map[String, Int]((AdTagConstants.PREFIX_AD_CHANNEL_TAG + logs.channelid -> 1)) 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/scripts/kafka-script: -------------------------------------------------------------------------------- 1 | 启动: 2 | zkServer.sh start 3 | nohup 
kafka-server-start.sh $KAFKA_HOME/config/server.properties 1>~/logs/kafka_std.log 2>~/logs/kafka_err.log & 4 | 5 | 查看topics: 6 | kafka-topics.sh --list --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka 7 | 8 | 创建topic:cdnlog 9 | kafka-topics.sh --create --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka --replication-factor 1 --partitions 1 --topic cdnlog 10 | 11 | 控制台消费: 12 | kafka-console-consumer.sh --zookeeper hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka --topic cdnlog -------------------------------------------------------------------------------- /weblog/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | weblog 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/AdPositionTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | import com.awebone.dmp.util.Utils 6 | 7 | import scala.collection.mutable 8 | 9 | /** 10 | * 标签一: 11 | 1)广告位类型(标签格式:LC03->1或者LC16->1)xx为数字,小于10 补0 12 | */ 13 | object AdPositionTag extends Tags { 14 | 15 | override def extractTag(logs: Logs) = { 16 | val map = mutable.Map[String, Int]() 17 | val adspacetype = Utils.fulfill(logs.adspacetype) 18 | map.put(AdTagConstants.PREFIX_AD_SPACE_TAG + "" + adspacetype, 1) 19 | map.toMap 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/scripts/mysql.sql: -------------------------------------------------------------------------------- 1 | create table uesr_domain_config( 2 | id int unsigned auto_increment, 3 | user_id varchar(40) not null, 4 | domain varchar(40) not null, 5 | primary key (id) 6 | ); 7 | 8 | insert into uesr_domain_config(user_id,domain) values('8000001','v1.awebone.com'); 9 | insert into uesr_domain_config(user_id,domain) values('8000002','v2.awebone.com'); 10 | insert into uesr_domain_config(user_id,domain) values('8000003','v3.awebone.com'); 11 | insert into uesr_domain_config(user_id,domain) values('8000004','v4.awebone.com'); 12 | insert into uesr_domain_config(user_id,domain) values('8000005','vmi.awebone.com'); 13 | 14 | select * from uesr_domain_config; -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/constants/AdTagConstants.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.constants 2 | 3 | /** 4 | * dmp中常见广告标签前缀常量 5 | */ 6 | object AdTagConstants { 7 | //广告位标签前缀 8 | val PREFIX_AD_SPACE_TAG = "LC_" 9 | //APP 10 | val PREFIX_AD_APP_TAG = "APP_" 11 | //渠道前缀 12 | val PREFIX_AD_CHANNEL_TAG = "CN_" 13 | //设备前缀 14 | val PREFIX_AD_DEVICE_TAG = "DEVICE_" 15 | //联网方式前缀 16 | val PREFIX_AD_NETWORK_TAG = "NET_" 17 | //设备运营商前缀 18 | val PREFIX_AD_ISP_TAG = "ISP_" 19 | //关键字前缀 20 | val PREFIX_AD_KEYWORD_TAG = "KW_" 21 | //省份地域前缀 22 | val PREFIX_AD_PROVINCE_TAG = "ZP_" 23 | //城市地域前缀 24 | val PREFIX_AD_CITY_TAG = "ZC_" 25 | } 26 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/AreaTag.scala: 
-------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 地域标签(省标签格式:ZPxxx->1,地市标签格式:ZCxxx->1)xxx为省或市名称 10 | */ 11 | object AreaTag extends Tags { 12 | override def extractTag(logs: Logs) = { 13 | val areaMap = mutable.Map[String, Int]() 14 | if(logs.provincename != null) { 15 | areaMap.put(AdTagConstants.PREFIX_AD_PROVINCE_TAG + logs.provincename, 1) 16 | } 17 | if(logs.cityname != null) { 18 | areaMap.put(AdTagConstants.PREFIX_AD_CITY_TAG + logs.cityname, 1) 19 | } 20 | areaMap.toMap 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/KeyWordTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 5)关键词(标签格式:Kxxx->1)xxx为关键字。 10 | * 关键词个数不能少于3个字符,且不能超过8个字符; 11 | * 关键字中如包含”|”,则分割成数组,转化成多个关键字标签 12 | “麻辣小龙虾|麻辣香锅|与神对话|家” 13 | */ 14 | object KeyWordTag extends Tags { 15 | override def extractTag(logs: Logs) = { 16 | val map = mutable.Map[String, Int]() 17 | if(logs.keywords != null) { 18 | val kws = logs.keywords.split("\\|") 19 | for (kw <- kws) { 20 | if(kw.length >= 3 && kw.length <= 8) { 21 | map.put(AdTagConstants.PREFIX_AD_KEYWORD_TAG + kw, 1) 22 | } 23 | } 24 | } 25 | map.toMap 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /weblog/src/test/java/com/awebone/weblog/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.awebone.weblog; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/NameNodeClient.java: -------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | import java.io.IOException; 4 | import java.net.InetSocketAddress; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.ipc.RPC; 8 | 9 | public class NameNodeClient { 10 | 11 | public static void main(String[] args) { 12 | 13 | 14 | try { 15 | MyServerProtocal proxy = RPC.getProxy(MyServerProtocal.class, 16 | MyServerProtocal.versionID, 17 | new InetSocketAddress("localhost", 9988), new Configuration()); 18 | 19 | /** 20 | * proxy.hello(); 21 | * 的底层,其实就是调用: 22 | * 23 | * 服务器中的 setInstance这个参数对象中的hello方法 24 | */ 25 | proxy.hello(); 26 | System.out.println(proxy.getName()); 27 | 28 | 29 | } catch (IOException e) { 30 | e.printStackTrace(); 31 | } 32 | 33 | 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/scripts/es-scripts: -------------------------------------------------------------------------------- 1 | 创建索引库: 2 | curl -XPUT http://localhost:9200/cdn 3 | 4 | 删除索引库: 5 | curl -XDELETE http://localhost:9200/cdn 6 | 7 | 创建type表: 8 | curl -H "Content-Type: application/json" -XPOST http://localhost:9200/cdn/traffic/_mapping -d'{ 9 | "traffic": { 10 | "properties": { 11 | "domain": {"type": "keyword"}, 12 | "traffics": {"type": "long"}, 13 | "time": {"type": "date","format": "yyyy-MM-dd HH:mm"} 14 | } 15 | } 16 | }' 17 | 18 | curl -H "Content-Type: application/json" -XPOST http://localhost:9200/cdn/traffic-userid/_mapping -d'{ 19 | "traffic": { 20 | "properties": { 21 | "userid": {"type": "keyword"}, 22 | "domain": {"type": "text"}, 23 | "traffics": {"type": "long"}, 24 | "time": {"type": "date","format": "yyyy-MM-dd HH:mm"} 25 | } 26 | } 27 | }' -------------------------------------------------------------------------------- /dmp/script/mysql-create.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE `dmp`; 2 | CREATE TABLE `p_c_quantity` ( 3 | `data_date` date NOT NULL, 4 | `province` VARCHAR(40), 5 | `city` VARCHAR(40), 6 | `countz` bigint(20) NOT NULL 7 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 8 | 9 | CREATE TABLE `area_ad_req` ( 10 | `data_date` date NOT NULL, 11 | `province` VARCHAR(40), 12 | `city` VARCHAR(40), 13 | `orginal_req` bigint(20) DEFAULT NULL, 14 | `valid_req` bigint(20) DEFAULT NULL, 15 | `ad_req` bigint(20) DEFAULT NULL, 16 | `tpi_bid_num` bigint(20) DEFAULT NULL, 17 | `win_bid_num` bigint(20) DEFAULT NULL, 18 | `show_ad_master_num` bigint(20) DEFAULT NULL, 19 | `click_ad_master_num` bigint(20) DEFAULT NULL, 20 | `show_ad_media_num` bigint(20) DEFAULT NULL, 21 | `click_ad_media_num` bigint(20) DEFAULT NULL, 22 | `dsp_ad_xf` double DEFAULT NULL, 23 | `dsp_ad_cost` double DEFAULT NULL 24 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 25 
| -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big Data Project Collection 2 | 3 | ## 1. Offline user behavior log analysis with Hadoop (weblog) 4 | 5 | **Tech stack: Hadoop** 6 | 7 | - [x] Beans 8 | - [x] Click-stream data processing 9 | - [x] Click-session stream model construction 10 | - [x] Hive detail table construction 11 | - [x] User behavior metrics analysis 12 | 13 |
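As an illustration of the click-session step, here is a minimal sessionization sketch in plain Scala: order one visitor's page views by time and start a new session whenever the gap between consecutive views exceeds 30 minutes. The `PageView` case class, the 30-minute threshold and the helper names are assumptions for illustration only; the actual job builds `PageViewsBean`/`VisitBean` with MapReduce in `ClickSessionStream.java`.

```scala
import java.util.UUID

// Hypothetical, simplified record type; the real project uses WebLogBean / PageViewsBean.
case class PageView(ip: String, timeMillis: Long, request: String)

object SessionSketch {
  private def newSessionId(): String = UUID.randomUUID().toString

  // Split one visitor's page views into sessions: a gap larger than
  // gapMillis (default 30 minutes) between consecutive views starts a new session.
  def sessionize(views: Seq[PageView], gapMillis: Long = 30L * 60 * 1000): Seq[(String, PageView)] = {
    val sorted = views.sortBy(_.timeMillis)
    var sessionId = newSessionId()
    var prev = Long.MinValue
    sorted.map { pv =>
      if (prev != Long.MinValue && pv.timeMillis - prev > gapMillis) sessionId = newSessionId()
      prev = pv.timeMillis
      (sessionId, pv)
    }
  }
}
```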
14 | 15 | 16 | 17 | ## 2. RPC communication with Akka (akka_rpc) 18 | 19 | **Tech stack: Akka** 20 | 21 | - [x] Simulate Hadoop inter-node communication 22 | - [x] Simulate Spark inter-node communication 23 | - [x] Simulate YARN communication 24 | 25 |
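The `Master`/`Worker` pair under `akka_rpc/src/main/scala/com/awebone/akka_rpc` shows the pattern; below is a minimal local-only sketch of the same request/reply idea with remoting left out. The class and actor names here are illustrative, not the project's.

```scala
import akka.actor.{Actor, ActorSystem, Props}

// Minimal local echo pair mirroring the Master/Worker pattern used in akka_rpc.
class EchoMaster extends Actor {
  def receive: Receive = {
    case "hello" => sender() ! "hi" // reply to whoever sent the message
  }
}

class EchoWorker(masterPath: String) extends Actor {
  // On start, look up the master by path and send it a message.
  override def preStart(): Unit = context.actorSelection(masterPath) ! "hello"
  def receive: Receive = {
    case "hi" => println("master replied: hi")
  }
}

object EchoDemo extends App {
  // The real Master/Worker additionally configure akka.remote.netty.tcp.hostname/port
  // and connect with "akka.tcp://MasterActorSystem@localhost:6789/user/master".
  val system = ActorSystem("DemoSystem")
  system.actorOf(Props(new EchoMaster), "master")
  system.actorOf(Props(new EchoWorker("/user/master")), "worker")
}
```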
26 | 27 | 28 | 29 | ## 3. Advertising data management platform (dmp) 30 | 31 | **Tech stack: Spark, Scala** 32 | 33 | - [x] Ad log ETL 34 | - [x] Report statistics 35 | - [x] User persona construction 36 | - [x] Ad tag statistics 37 | - [x] Load DMP results into HBase 38 | 39 |
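Each `Tags` implementation in `dmp/src/main/scala/com/awebone/dmp/tags` turns one log record into a `Map[String, Int]` of tag counts. Below is a minimal sketch of how such per-record maps can be merged into one profile per user with Spark; the user-id key and the use of `reduceByKey` are assumptions for illustration — the project's actual aggregation lives in `DmpPersonasJob`, which is not excerpted here.

```scala
import org.apache.spark.rdd.RDD

object PersonaSketch {
  // Merge two tag maps by summing the counts of common tags.
  def mergeTags(a: Map[String, Int], b: Map[String, Int]): Map[String, Int] =
    (a.keySet ++ b.keySet).map(k => k -> (a.getOrElse(k, 0) + b.getOrElse(k, 0))).toMap

  // Collapse (userId, tagMap) pairs into a single tag map per user.
  def buildProfiles(tagged: RDD[(String, Map[String, Int])]): RDD[(String, Map[String, Int])] =
    tagged.reduceByKey(mergeTags)
}
```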
40 | 41 | 42 | 43 | ## 4. Personalized recommendation with Spark MLlib (mllib) 44 | 45 | **Tech stack: Spark, Scala** 46 | 47 | - [x] Build the MovieLens data model 48 | - [x] Cold start: the user rates 10 randomly chosen movies at startup 49 | - [x] Split the dataset 50 | - [x] Build the ALS model 51 | - [x] Model evaluation 52 | - [x] Personalized recommendation 53 | 54 |
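A hedged sketch of the split / train / evaluate / recommend steps with the RDD-based `spark.mllib` ALS API follows; the rank, iteration count, regularization value and the RMSE metric are illustrative placeholders, not the settings used in `MovieLensALS.scala`.

```scala
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.rdd.RDD

object AlsSketch {
  def trainAndEvaluate(ratings: RDD[Rating]): Unit = {
    // Split into training and test sets.
    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), seed = 42L)

    // Train an ALS model (rank = 10, iterations = 10, lambda = 0.01 are placeholders).
    val model = ALS.train(training, 10, 10, 0.01)

    // Evaluate with RMSE on the held-out set.
    val predictions = model
      .predict(test.map(r => (r.user, r.product)))
      .map(p => ((p.user, p.product), p.rating))
    val rmse = math.sqrt(
      test.map(r => ((r.user, r.product), r.rating))
        .join(predictions)
        .map { case (_, (actual, predicted)) => math.pow(actual - predicted, 2) }
        .mean())
    println(s"test RMSE = $rmse")

    // Top-10 recommendations for user 1.
    model.recommendProducts(1, 10).foreach(println)
  }
}
```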
55 | 56 | 57 | 58 | ## 5. CDN log analysis with Flink (flink-train) 59 | 60 | **Tech stack: Flink, Scala** 61 | 62 | - [x] Mock Kafka producer generating log data 63 | - [x] CDN log analysis 64 | 65 |
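`MockKafkaProducer` emits tab-separated records of the form `cdnlog<TAB>CN<TAB>level<TAB>time<TAB>ip<TAB>domain<TAB>traffic` to the `cdnlog` topic. Below is a minimal sketch of the aggregation idea — summing traffic per domain over one-minute windows — using a socket source as a stand-in for the Kafka consumer; the window size and the source are assumptions, and the real jobs (`LogAnalysis.scala` / `LogAnalysisWithMySQL.scala`, not excerpted here) consume the Kafka topic instead.

```scala
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

object CdnTrafficSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    env.socketTextStream("localhost", 9999)                // stand-in for the Kafka source
      .map(_.split("\t"))
      .filter(_.length >= 7)
      .map(fields => (fields(5), fields(6).trim.toLong))   // (domain, traffic)
      .keyBy(0)
      .timeWindow(Time.minutes(1))
      .sum(1)
      .print()

    env.execute("CdnTrafficSketch")
  }
}
```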
66 | 67 | -------------------------------------------------------------------------------- /mllib/src/main/scala/com/awebone/spark/WordCountScala.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.spark 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | object WordCountScala { 7 | def main(args: Array[String]): Unit = { 8 | //获取程序入口 9 | val sparkConf: SparkConf = new SparkConf() 10 | sparkConf.setAppName(WordCountScala.getClass.getSimpleName) 11 | sparkConf.setMaster("local") 12 | val sparkContext: SparkContext = new SparkContext(sparkConf) 13 | 14 | //WorkCount 15 | val linesRDD: RDD[String] = sparkContext.textFile(args(0)) 16 | val wordRDD: RDD[String] = linesRDD.flatMap(_.split(" ")) 17 | val wordAndOneRDD: RDD[(String, Int)] = wordRDD.map((_, 1)) 18 | val wordsCountRDD = wordAndOneRDD.reduceByKey((x: Int, y: Int) => x + y) 19 | wordsCountRDD.foreach(x => println(x._1, x._2)) 20 | wordsCountRDD.saveAsTextFile(args(1)) 21 | 22 | sparkContext.stop() 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/Message.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | //样例类,做模式匹配 4 | 5 | //注册消息 nodemanager -> resourcemanager 6 | case class RegisterNodeManager(val nodemanagerid: String, val memory: Int, val cpu: Int) 7 | 8 | //资源: 不是说哪个任务需要多少资源,就把资源给这个任务 9 | //而是,某个节点有多少适合用于做计算的资源,那么就把这个任务启动在这个节点上 10 | 11 | 12 | //注册完成消息 resourcemanager -》 nodemanager 13 | case class RegisteredNodeManager(val resourcemanagerhostname: String) 14 | 15 | 16 | //心跳消息 nodemanager -》 resourcemanager 17 | case class Heartbeat(val nodemanagerid: String) 18 | 19 | /** 20 | * 是在RM中,为了维持整个集群中,到底哪个节点有多少资源 21 | * 所以吧每个节点的资源都封装在一个NodeManagerInfo对象里 22 | * 然后在RM中就维持了一个NodeManagerInfo对象的集合 23 | */ 24 | class NodeManagerInfo(val nodemanagerid: String, val memory: Int, val cpu: Int) { 25 | //用来存储nomanagerid这个NodeManager的最后一次心跳时间 26 | //_是一个默认值 27 | var lastHeartBeatTime: Long = _ 28 | } 29 | 30 | //单例 31 | case object SendMessage //仅仅是一个标志 32 | case object CheckTimeOut //也是一个标志 -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/connetcor/FileSystemSinkApp.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.connetcor 2 | 3 | 4 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 5 | import org.apache.flink.streaming.connectors.fs.StringWriter 6 | import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer} 7 | 8 | object FileSystemSinkApp { 9 | def main(args: Array[String]): Unit = { 10 | System.setProperty("HADOOP_USER_NAME","hadoop") 11 | val env = StreamExecutionEnvironment.getExecutionEnvironment 12 | val data = env.socketTextStream("hadoop04",9999) 13 | 14 | data.print().setParallelism(1) 15 | val filepath = "/tmpdata/flink/hdfssink" 16 | 17 | val sink = new BucketingSink[String](filepath) 18 | sink.setBucketer(new DateTimeBucketer[String]("yyyy-MM-dd--HHmm")) 19 | sink.setWriter(new StringWriter()) 20 | sink.setBatchRolloverInterval(20) 21 | 22 | data.addSink(sink) 23 | env.execute("FileSystemSinkApp") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /akka_rpc/src/main/java/com/awebone/hadoop_rpc/MyNamenode.java: 
-------------------------------------------------------------------------------- 1 | package com.awebone.hadoop_rpc; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.HadoopIllegalArgumentException; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.ipc.RPC; 8 | import org.apache.hadoop.ipc.RPC.Server; 9 | 10 | public class MyNamenode { 11 | 12 | public static void main(String[] args) { 13 | 14 | 15 | try { 16 | 17 | /** 18 | * new MyServerImpl().hello() .getName() 19 | */ 20 | Server server = new RPC.Builder(new Configuration()) 21 | .setProtocol(MyServerProtocal.class) 22 | .setInstance(new MyServerImpl()) 23 | .setBindAddress("localhost") 24 | .setPort(9988) 25 | .build(); 26 | 27 | 28 | server.start(); 29 | System.out.println("SERVER START ......"); 30 | 31 | 32 | } catch (HadoopIllegalArgumentException e) { 33 | e.printStackTrace(); 34 | } catch (IOException e) { 35 | e.printStackTrace(); 36 | } 37 | 38 | 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /weblog/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.awebone 6 | weblog 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | weblog 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | 26 | org.apache.hadoop 27 | hadoop-client 28 | 2.7.6 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /weblog/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /mllib/src/main/java/com/awebone/spark/WordCountJava8.java: -------------------------------------------------------------------------------- 1 | package com.awebone.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import scala.Tuple2; 8 | 9 | import java.util.Arrays; 10 | 11 | public class WordCountJava8 { 12 | public static void main(String[] args) { 13 | //获取程序入口 14 | SparkConf sparkConf = new SparkConf(); 15 | sparkConf.setAppName("WordCountJava8"); 16 | sparkConf.setMaster("local"); 17 | JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf); 18 | 19 | //获取数据 20 | JavaRDD linesRDD = javaSparkContext.textFile("hdfs://myha/wc/input"); 21 | 22 | //计算 23 | JavaRDD rdd1 = linesRDD.flatMap(s -> Arrays.asList(s.split(" ")).iterator()); 24 | JavaPairRDD rdd2 = rdd1.mapToPair(s -> new Tuple2<>(s, 1)); 25 | JavaPairRDD rdd3 = rdd2.reduceByKey((x, y) -> x + y); 26 | 27 | rdd3.foreach(t -> System.out.println(t._1 + "\t" + t._2)); 28 | 29 | javaSparkContext.stop(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Awebone 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/akka_rpc/Worker.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.akka_rpc 2 | 3 | import akka.actor.{Actor, ActorSelection, ActorSystem, Props} 4 | import com.typesafe.config.ConfigFactory 5 | 6 | class Worker extends Actor{ 7 | 8 | override def preStart(): Unit = { 9 | //指定访问哪个节点上的哪个actorSystem的哪个actor 10 | val connectStr = "akka.tcp://MasterActorSystem@localhost:6789/user/master" 11 | val selection: ActorSelection = context.actorSelection(connectStr) 12 | 13 | selection ! "hello" 14 | } 15 | 16 | override def receive: Receive = { 17 | case "hi" => { 18 | println("master send hi") 19 | } 20 | 21 | case _ => println("非法消息") 22 | } 23 | } 24 | 25 | object WorkerRun{ 26 | def main(args: Array[String]): Unit = { 27 | val hostname = "localhost" 28 | val strConfig = 29 | s""" 30 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 31 | |akka.remote.netty.tcp.hostname = ${hostname} 32 | """.stripMargin 33 | 34 | val config = ConfigFactory.parseString(strConfig) 35 | val as = ActorSystem("WorkerActorSystem", config) 36 | 37 | as.actorOf(Props(new Worker()), "worker") 38 | } 39 | } -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/akka_rpc/Master.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.akka_rpc 2 | 3 | import akka.actor.{Actor, ActorSystem, Props} 4 | import com.typesafe.config.ConfigFactory 5 | 6 | class Master extends Actor{ 7 | 8 | override def preStart(): Unit = { 9 | //业务逻辑初始化 10 | println("prestart") 11 | } 12 | 13 | //相当于是一个run,处理业务逻辑时有消息传送过来 14 | override def receive: Receive = { 15 | case "hello" => { 16 | //这个注释代表模拟一个业务方法,得到结果 17 | println("receive hi") 18 | 19 | val result = "hi" 20 | //谁发送过来消息,谁就是sender() 21 | sender() ! 
result 22 | } 23 | 24 | case _ => println("非法新消息") 25 | } 26 | } 27 | 28 | object MasterRun{ 29 | def main(args: Array[String]): Unit = { 30 | val strConfig = 31 | """ 32 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 33 | |akka.remote.netty.tcp.hostname =localhost 34 | |akka.remote.netty.tcp.port=6789 35 | """.stripMargin 36 | 37 | val config = ConfigFactory.parseString(strConfig) 38 | val as = ActorSystem("MasterActorSystem",config) 39 | 40 | as.actorOf(Props(new Master()), "master") 41 | println("MasterActorSystem init") 42 | } 43 | } -------------------------------------------------------------------------------- /dmp/src/main/resources/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | javax.jdo.option.ConnectionURL 4 | jdbc:mysql://hadoop01:3306/hivedb_ms?createDatabaseIfNotExist=true 5 | JDBC connect string for a JDBC metastore 6 | 7 | 8 | javax.jdo.option.ConnectionDriverName 9 | com.mysql.jdbc.Driver 10 | Driver class name for a JDBC metastore 11 | 12 | 13 | javax.jdo.option.ConnectionUserName 14 | root 15 | username to use against metastore database 16 | 17 | 18 | javax.jdo.option.ConnectionPassword 19 | root 20 | password to use against metastore database 21 | 22 | 23 | 24 | hive.server2.thrift.port 25 | 10000 26 | 27 | 28 | hive.server2.thrift.bind.host 29 | hadoop04 30 | 31 | 32 | 33 | hive.metastore.uris 34 | thrift://hadoop04:9083 35 | 36 | 37 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | javax.jdo.option.ConnectionURL 4 | jdbc:mysql://hadoop01:3306/hivedb_ms?createDatabaseIfNotExist=true 5 | JDBC connect string for a JDBC metastore 6 | 7 | 8 | javax.jdo.option.ConnectionDriverName 9 | com.mysql.jdbc.Driver 10 | Driver class name for a JDBC metastore 11 | 12 | 13 | javax.jdo.option.ConnectionUserName 14 | root 15 | username to use against metastore database 16 | 17 | 18 | javax.jdo.option.ConnectionPassword 19 | root 20 | password to use against metastore database 21 | 22 | 23 | 24 | hive.server2.thrift.port 25 | 10000 26 | 27 | 28 | hive.server2.thrift.bind.host 29 | hadoop04 30 | 31 | 32 | 33 | hive.metastore.uris 34 | thrift://hadoop04:9083 35 | 36 | 37 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/tags/DeviceTag.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.tags 2 | 3 | import com.awebone.dmp.Logs 4 | import com.awebone.dmp.constants.AdTagConstants 5 | 6 | import scala.collection.mutable 7 | 8 | /** 9 | * 4)设备:操作系统|联网方式|运营商 10 | 设备操作系统 11 | 1 Android D0001001 12 | 2 IOS D0001002 13 | 3 Winphone D0001003 14 | 4 其他 D0001004 15 | 设备联网方式 16 | WIFI D0002001 17 | 4G D0002002 18 | 3G D0002003 19 | 2G D0002004 20 | NWTWORKOTHER D0004004 21 | 设备运营商方案 22 | 移动 D0003001 23 | 联通 D0003002 24 | 电信 D0003003 25 | OPERATOROTHER D0003004 26 | */ 27 | object DeviceTag extends Tags { 28 | override def extractTag(logs: Logs) = { 29 | val mMap = mutable.Map[String, Int]() 30 | //设备操作系统为:client 31 | if(logs.client != null) { 32 | mMap.put(AdTagConstants.PREFIX_AD_DEVICE_TAG + logs.client, 1) 33 | } 34 | //联网方式networkmannerid 35 | if(logs.networkmannerid != null) { 36 | mMap.put(AdTagConstants.PREFIX_AD_NETWORK_TAG + logs.networkmannerid, 1) 37 | } 38 | 39 | //设备运营商ispid 40 | if(logs.ispid != null) { 41 | 
mMap.put(AdTagConstants.PREFIX_AD_ISP_TAG + logs.ispid, 1) 42 | } 43 | mMap.toMap 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /dmp/src/main/resources/hbase-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | 26 | hbase.rootdir 27 | hdfs://myha/myhbase 28 | 29 | 30 | 31 | hbase.cluster.distributed 32 | true 33 | 34 | 35 | 36 | hbase.zookeeper.quorum 37 | hadoop01:2181,hadoop02:2181,hadoop03:2181 38 | 39 | 40 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/hbase-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | 26 | hbase.rootdir 27 | hdfs://myha/myhbase 28 | 29 | 30 | 31 | hbase.cluster.distributed 32 | true 33 | 34 | 35 | 36 | hbase.zookeeper.quorum 37 | hadoop01:2181,hadoop02:2181,hadoop03:2181 38 | 39 | 40 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.util 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | 5 | object Utils { 6 | def parseInt(str:String):Int = { 7 | if(StringUtils.isEmpty(str)) { 8 | 0 9 | } else { 10 | str.toInt 11 | } 12 | } 13 | 14 | def parseDouble(str:String):Double = { 15 | if(StringUtils.isEmpty(str)) { 16 | 0.0 17 | } else { 18 | str.toDouble 19 | } 20 | } 21 | 22 | //yyyy-MM-dd hh:mm:ss--->hh 23 | def fmtHour(str: String):Option[String] = { 24 | if(StringUtils.isEmpty(str)) { 25 | None 26 | } else { 27 | Some(str.substring(str.indexOf(" ") + 1, str.indexOf(" ") + 3)) 28 | } 29 | } 30 | 31 | //yyyy-MM-dd hh:mm:ss--->yyyy-MM-dd 32 | def fmtDate(str: String):Option[String] = { 33 | if(StringUtils.isEmpty(str)) { 34 | None 35 | } else { 36 | Some(str.substring(0, str.indexOf(" "))) 37 | } 38 | } 39 | 40 | //补全两位字符串 41 | def fulfill(str:String) = { 42 | if(str != null && str.length > 1) { 43 | str 44 | } else if(!"".equals(str) && str.length == 1){ 45 | 0 + "" + str 46 | } else { 47 | "other" 48 | } 49 | } 50 | //补全数字 51 | def fulfill(num:Int) = { 52 | if(num >= 0 && num < 10) { 53 | "0" + num 54 | } else { 55 | "" + num 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /dmp/data/data.txt: -------------------------------------------------------------------------------- 1 | 0bb49045000057eee4ed3a580019ca06,0,0,0,100002,未知,26C7B9C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,139.227.161.115,com.apptreehot.horse,马上赚,AQ+KIQeBhehxf6xf98BFFnl+CV00p,A10%E55F%BC%E6%AO%B%,1,4.1.1,,760,980,,,上海市,上海市,4,未知,3,Wifi,0,0,2,插屏,1,2,6,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+KIQeBhexf6x988FFnl+CVOOp,1,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17,, 2 | 0bfbf7c8000057eee4ed2a0b000ca4d3,0,0,0,100002,未知,26C07B8C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,58.47.147.169,cn.touchnagic.game.cllubpa2121bvnoolgwwel,其他,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,lenovo+A500,1,2.3.5,,480,800,,,湘南省,益阳市,4,未知,3,Wifi,0,0,2,插屏,1,2,999,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,2,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17 ,, 3 | 0bb49045000057eee4ed3a580019ca06,0,0,0,100002,未知,26C7B9C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 
06:19:17,139.227.161.115,com.apptreehot.horse,马上赚,AQ+KIQeBhehxf6xf98BFFnl+CV00p,A10%E55F%BC%E6%AO%B%,1,4.1.1,,760,980,,,上海市,上海市,4,未知,3,Wifi,0,0,2,插屏,1,2,6,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+KIQeBhexf6x988FFnl+CVOOp,1,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17,, 4 | 0bfbf7c8000057eee4ed2a0b000ca4d3,0,0,0,100002,未知,26C07B8C83DB4B6197CEB80D53B3F5DA,1,1,0,0,2016-10-01 06:19:17,58.47.147.169,cn.touchnagic.game.cllubpa2121bvnoolgwwel,其他,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,lenovo+A500,1,2.3.5,,480,800,,,湘南省,益阳市,4,未知,3,Wifi,0,0,2,插屏,1,2,999,未知,1,0,0,0,0,0,0,0,,,,,,,,,,,,0,555,240,290,,,,,,,,,,,AQ+CJwCFjOlxf6V98cdAmlja+SXQ,2,1,0,0,0,0,0,,,mm_26632353_8068780_27326559,2016-10-01 06:19:17 ,, -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/etl/DMPLogETLOps.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.etl 2 | 3 | import com.awebone.dmp.Logs 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.serializer.KryoSerializer 8 | import org.apache.spark.sql.{Dataset, SaveMode, SparkSession} 9 | 10 | /** 11 | * 日志数据清洗过程 12 | * 13 | * 1)要求一:将数据转换成parquet文件格式 14 | * 2)要求二:序列化方式采用KryoSerializer方式 15 | * 3)要求三:parquet文件采用Sanppy压缩方式 16 | * 17 | * 通过处理分析,使用SparkCore只能完成KryoSerializer和Snappy,想要完成parquet比较困难, 18 | * 而SparkSQL处理parquet文件非常简单,所以需要将原先的编码做一稍微改动 19 | */ 20 | object DMPLogETLOps { 21 | def main(args: Array[String]): Unit = { 22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 25 | 26 | val conf: SparkConf = new SparkConf().setAppName("DMPLogETL").setMaster("local[*]") 27 | .set("spark.serializer",classOf[KryoSerializer].getName) 28 | .registerKryoClasses(Array(classOf[Logs])) //要求二:序列化方式采用KryoSerializer方式 29 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 30 | import spark.implicits._ 31 | 32 | val lines:RDD[String] = spark.sparkContext.textFile("file:///D:\\workplace\\dmp\\data\\data.txt") 33 | 34 | val retDS: Dataset[Logs] = lines.map(line => { 35 | val log: Logs = Logs.line2Logs(line) 36 | log 37 | }).toDS() 38 | 39 | /** 40 | * 要求一:将数据转换成parquet文件格式 41 | * 要求三:parquet文件采用Sanppy压缩方式 42 | */ 43 | retDS.write.mode(SaveMode.Overwrite).parquet("file:///D:\\workplace\\dmp\\data\\out\\") 44 | 45 | spark.stop() 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/MySQLSource.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | 4 | import java.sql.{Connection, DriverManager, PreparedStatement} 5 | 6 | import org.apache.flink.configuration.Configuration 7 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction} 8 | 9 | import scala.collection.mutable 10 | 11 | /** 12 | * 自定义Mysql 并行的Source 13 | */ 14 | class MySQLSource extends RichParallelSourceFunction[mutable.HashMap[String, String]] { 15 | var connection: Connection = null 16 | var ps: PreparedStatement = null 17 | 18 | //创建连接 19 | override def open(parameters: Configuration): Unit = { 20 | super.open(parameters) 21 | val driver = "com.mysql.jdbc.Driver" 22 | val url = "jdbc:mysql://hadoop01:3306/flink" 23 | 
val user = "root" 24 | val password = "root" 25 | Class.forName(driver) 26 | connection = DriverManager.getConnection(url, user, password) 27 | 28 | val sql = "select user_id,domain from user_domain_config" 29 | ps = connection.prepareStatement(sql) 30 | } 31 | 32 | //不断执行的函数 33 | override def run(sourceContext: SourceFunction.SourceContext[mutable.HashMap[String, String]]): Unit = { 34 | val resultSet = ps.executeQuery() 35 | val collect = mutable.HashMap[String,String]() 36 | 37 | //将查询结果放入HashMap中 38 | while (resultSet.next()){ 39 | collect.put(resultSet.getNString("domain"), resultSet.getNString("user_id")) 40 | } 41 | sourceContext.collect(collect) 42 | } 43 | 44 | override def cancel(): Unit = {} 45 | 46 | override def close(): Unit = { 47 | if(ps != null){ 48 | ps.close() 49 | } 50 | if(connection != null){ 51 | connection.close() 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /dmp/src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /weblog/src/main/java/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /mllib/src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | fs.defaultFS 23 | hdfs://myha/ 24 | 25 | 26 | 27 | 28 | hadoop.tmp.dir 29 | /home/hadoop/data/hadoopdata/ 30 | 31 | 32 | 33 | 
34 | ha.zookeeper.quorum 35 | hadoop01:2181,hadoop02:2181,hadoop03:2181,hadoop04:2181 36 | 37 | 38 | 39 | 40 | ha.zookeeper.session-timeout.ms 41 | 1000 42 | ms 43 | 44 | 45 | 46 | topology.script.file.name 47 | /home/hadoop/apps/hadoop-2.7.6/etc/hadoop/topology.sh 48 | 49 | 50 | 51 | hadoop.proxyuser.hadoop.hosts 52 | * 53 | 54 | 55 | hadoop.proxyuser.hadoop.groups 56 | * 57 | 58 | 59 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/report/ProvinceCityQuantityJob.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.report 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 8 | 9 | /** 10 | * 省份:province 11 | * 城市:city 12 | * 结果存储到MySQL数据库 13 | * select 14 | * province, 15 | * city, 16 | * count(1) 17 | * from logs 18 | * group by province, city 19 | **/ 20 | object ProvinceCityQuantityJob { 21 | def main(args: Array[String]): Unit = { 22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 25 | 26 | if(args == null || args.length < 2){ 27 | println( 28 | """Parameter Errors! Usage: 29 | |inputpath : input path 30 | |table : mysql table name 31 | """.stripMargin) 32 | System.exit(-1) 33 | } 34 | val Array(inputpath, table) = args 35 | 36 | val conf: SparkConf = new SparkConf().setAppName("ProvinceCityQuantityJob").setMaster("local[*]") 37 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 38 | 39 | val input: DataFrame = spark.read.parquet(inputpath) 40 | input.createOrReplaceTempView("logs") 41 | 42 | val sql = 43 | """ 44 | |select 45 | | date_sub(current_date(), 0) data_date, 46 | | provincename province, 47 | | cityname city, 48 | | count(1) as countz 49 | |from logs 50 | |group by provincename, cityname 51 | """.stripMargin 52 | 53 | val url = "jdbc:mysql://hadoop01:3306/dmp" 54 | val properties = new Properties 55 | properties.put("user","root") 56 | properties.put("password","root") 57 | 58 | spark.sql(sql).write.mode(SaveMode.Append).jdbc(url,table,properties) 59 | 60 | spark.stop() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/etl/DMPLogETLHDFSOps.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.etl 2 | 3 | import com.awebone.dmp.Logs 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.serializer.KryoSerializer 8 | import org.apache.spark.sql.{Dataset, SaveMode, SparkSession} 9 | 10 | /** 11 | * 日志数据清洗过程 12 | * 13 | * 1)要求一:将数据转换成parquet文件格式 14 | * 2)要求二:序列化方式采用KryoSerializer方式 15 | * 3)要求三:parquet文件采用Sanppy压缩方式 16 | * 17 | * 通过处理分析,使用SparkCore只能完成KryoSerializer和Snappy,想要完成parquet比较困难, 18 | * 而SparkSQL处理parquet文件非常简单,所以需要将原先的编码做一稍微改动 19 | */ 20 | object DMPLogETLHDFSOps { 21 | def main(args: Array[String]): Unit = { 22 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 23 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 24 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 25 | 26 | if(args == null || args.length < 2){ 27 | println( 28 | """Parameter Errors! 
Usage: 29 | |inputpath : input path 30 | |outputpath : output path 31 | """.stripMargin) 32 | System.exit(-1) 33 | } 34 | val Array(inputpath, outputpath) = args 35 | 36 | val conf: SparkConf = new SparkConf().setAppName("DMPLogETL").setMaster("local[*]") 37 | .set("spark.serializer",classOf[KryoSerializer].getName) 38 | .registerKryoClasses(Array(classOf[Logs])) //要求二:序列化方式采用KryoSerializer方式 39 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 40 | import spark.implicits._ 41 | 42 | val lines:RDD[String] = spark.sparkContext.textFile(inputpath) 43 | 44 | val retDS: Dataset[Logs] = lines.map(line => { 45 | val log: Logs = Logs.line2Logs(line) 46 | log 47 | }).toDS() 48 | 49 | /** 50 | * 要求一:将数据转换成parquet文件格式 51 | * 要求三:parquet文件采用Sanppy压缩方式 52 | */ 53 | retDS.write.mode(SaveMode.Overwrite).parquet(outputpath) 54 | 55 | spark.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/pre/WebLogParse.java: -------------------------------------------------------------------------------- 1 | package com.awebone.pre; 2 | 3 | import java.text.ParseException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.HashSet; 6 | import java.util.Locale; 7 | import java.util.Set; 8 | 9 | import com.awebone.bean.WebLogBean; 10 | 11 | public class WebLogParse { 12 | static SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:hh:mm:ss", Locale.US); 13 | static SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss"); 14 | static Set pages = new HashSet(); 15 | static { 16 | pages.add("/about"); 17 | pages.add("/black-ip-list/"); 18 | pages.add("/cassandra-clustor/"); 19 | pages.add("/finance-rhive-repurchase/"); 20 | pages.add("/hadoop-family-roadmap/"); 21 | pages.add("/hadoop-hive-intro/"); 22 | pages.add("/hadoop-zookeeper-intro/"); 23 | pages.add("/hadoop-mahout-roadmap/"); 24 | } 25 | 26 | public static WebLogBean parse(String line) throws ParseException { 27 | // 参数代表一行日志信息 28 | String[] log_datas = line.split(" "); 29 | if (log_datas.length >= 12) { 30 | String addr = log_datas[0]; 31 | String user = log_datas[2]; 32 | String local_time = log_datas[3]; 33 | // 时间解析 34 | String format_time = sdf2.format(sdf1.parse(local_time.substring(1))); 35 | if (null == format_time || "".equals(format_time)) { 36 | format_time = "_invalid_"; 37 | } 38 | String request = log_datas[6]; 39 | String status = log_datas[8]; 40 | String byte_sent = log_datas[9]; 41 | String http_refer = log_datas[10]; 42 | // 拼接浏览器对象 43 | StringBuffer sb = new StringBuffer(); 44 | for (int i = 11; i < log_datas.length; i++) { 45 | sb.append(log_datas[i] + " "); 46 | } 47 | String user_agent = sb.substring(1, sb.length() - 2); 48 | 49 | WebLogBean bean = new WebLogBean(false, addr, user, format_time, request, status, byte_sent, http_refer, 50 | user_agent); 51 | // 判断数据有效性 52 | if ("_invalid_".equals(format_time)) { 53 | bean.setValid(false); 54 | } 55 | if (Integer.parseInt(bean.getStatus()) > 400) { 56 | bean.setValid(false); 57 | } 58 | if (pages.contains(bean.getRequest())) { 59 | bean.setValid(true); 60 | } 61 | return bean; 62 | }else{ 63 | return null; 64 | } 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/pre/WebLogPreProcess.java: -------------------------------------------------------------------------------- 1 | package com.awebone.pre; 2 | 3 | import java.io.IOException; 4 | import 
java.text.ParseException; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | import com.awebone.bean.WebLogBean; 17 | 18 | // Pre-process the raw log data 19 | public class WebLogPreProcess { 20 | /** 21 | * @author Awebone 22 | * Map side: 23 | * one input line --- one log record --- one Hive row 24 | * split it, wrap it into a bean, emit it, and write it out to HDFS 25 | * key: null 26 | * value: custom writable bean 27 | */ 28 | static class WebLogPreProcessMapper extends Mapper<LongWritable, Text, NullWritable, WebLogBean> { 29 | @Override 30 | protected void map(LongWritable key, Text value, 31 | Mapper<LongWritable, Text, NullWritable, WebLogBean>.Context context) 32 | throws IOException, InterruptedException { 33 | String line = value.toString(); 34 | try { 35 | WebLogBean webLogBean = WebLogParse.parse(line); 36 | if (webLogBean != null) { 37 | context.write(NullWritable.get(), webLogBean); 38 | } 39 | } catch (ParseException e) { 40 | e.printStackTrace(); 41 | } 42 | } 43 | } 44 | 45 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException { 46 | System.setProperty("HADOOP_USER_NAME", "hadoop"); 47 | Configuration conf = new Configuration(); 48 | conf.set("fs.defaultFS", "hdfs://myha/"); 49 | Job job = Job.getInstance(conf); 50 | 51 | job.setJarByClass(WebLogPreProcess.class); 52 | 53 | job.setMapperClass(WebLogPreProcessMapper.class); 54 | job.setOutputKeyClass(NullWritable.class); 55 | job.setOutputValueClass(WebLogBean.class); 56 | 57 | FileInputFormat.setInputPaths(job, new Path("/weblog/20200221")); 58 | FileOutputFormat.setOutputPath(job, new Path("/weblog/pre/20200221")); 59 | 60 | // no reducer needed, so set the number of reduce tasks to 0 61 | job.setNumReduceTasks(0); 62 | 63 | boolean res = job.waitForCompletion(true); 64 | System.exit(res ? 
0 : 1); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/MockKafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.{Date, Properties} 5 | 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | import org.apache.kafka.common.serialization.StringSerializer 8 | 9 | import scala.util.Random 10 | 11 | object MockKafkaProducer { 12 | 13 | private def getLevels() = { 14 | val levels = Array[String]("M","E") 15 | 16 | levels(new Random().nextInt(levels.length)) 17 | } 18 | 19 | private def getIps() = { 20 | val ips = Array[String]("233.104.18.110", 21 | "113.101.75.194", 22 | "27.17.127.135", 23 | "185.225.139.16", 24 | "112.1.66.34", 25 | "175.148.211.190", 26 | "183.227.58.21", 27 | "59.83.198.84", 28 | "117.28.38.28", 29 | "117.59.39.169") 30 | 31 | ips(new Random().nextInt(ips.length)) 32 | } 33 | 34 | private def getDomains() = { 35 | val domains = Array[String]("v1.awebone.com", "v2.awebone.com", "v3.awebone.com", "v4.awebone.com", "vmi.awebone.com") 36 | 37 | domains(new Random().nextInt(domains.length)) 38 | } 39 | 40 | private def getTraffic() = new Random().nextInt(10000) 41 | 42 | def main(args: Array[String]): Unit = { 43 | val properties: Properties = new Properties() 44 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092") 45 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk 46 | // properties.put("metadata.broker.list", "hadoop04:9092") // 声明kafka broker 47 | properties.setProperty("key.serializer", classOf[StringSerializer].getName) 48 | properties.setProperty("value.serializer", classOf[StringSerializer].getName) 49 | 50 | val producer = new KafkaProducer[String, String](properties) 51 | val topic = "cdnlog" 52 | 53 | while (true){ 54 | val builder = new StringBuilder() 55 | builder.append("cdnlog").append("\t") 56 | .append("CN").append("\t") 57 | .append(getLevels()).append("\t") 58 | .append(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date())).append("\t") 59 | .append(getIps()).append("\t") 60 | .append(getDomains()).append("\t") 61 | .append(getTraffic()).append("\t") 62 | 63 | println(builder.toString()) 64 | val pr = new ProducerRecord[String, String](topic, builder.toString()) 65 | producer.send(pr) 66 | Thread.sleep(2000) 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/MyNodeManager.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | import java.util.UUID 4 | 5 | import akka.actor.{Actor, ActorSelection, ActorSystem, Props} 6 | import com.typesafe.config.ConfigFactory 7 | import sun.plugin2.message.HeartbeatMessage 8 | 9 | class MyNodeManager(val resourcemanagerhostname: String, val resourcemanagerport: Int, val memory: Int, val cpu: Int) extends Actor { 10 | 11 | var nodemanagerid: String = _ 12 | var rmRef: ActorSelection = _ 13 | 14 | override def preStart(): Unit = { 15 | // 远程path   akka.tcp://(ActorSystem的名称)@(远程地址的IP) : (远程地址的端口)/user/(Actor的名称) 16 | rmRef = context.actorSelection(s"akka.tcp://${Constant.RMAS}@${resourcemanagerhostname}:${resourcemanagerport}/user/${Constant.RMA}") 17 | 18 | // val 
nodemanagerid:String 19 | // val memory:Int 20 | // val cpu:Int 21 | nodemanagerid = UUID.randomUUID().toString 22 | //发送注册消息 23 | rmRef ! RegisterNodeManager(nodemanagerid, memory, cpu) 24 | } 25 | 26 | override def receive: Receive = { 27 | case RegisteredNodeManager(masterURL) => { 28 | println(masterURL); 29 | 30 | /** 31 | * initialDelay: FiniteDuration, 多久以后开始执行 32 | * interval: FiniteDuration, 每隔多长时间执行一次 33 | * receiver: ActorRef, 给谁发送这个消息 34 | * message: Any 发送的消息是啥 35 | */ 36 | import scala.concurrent.duration._ 37 | import context.dispatcher 38 | //每个4秒对自己发送信息,然后就可以发送心跳信息 39 | context.system.scheduler.schedule(0 millis, 4000 millis, self, SendMessage) 40 | } 41 | 42 | case SendMessage => { 43 | 44 | //向主节点发送心跳信息 45 | rmRef ! Heartbeat(nodemanagerid) 46 | 47 | println(Thread.currentThread().getId) 48 | } 49 | } 50 | } 51 | 52 | object MyNodeManager { 53 | def main(args: Array[String]): Unit = { 54 | val HOSTNAME = args(0) 55 | val RM_HOSTNAME = args(1) 56 | val RM_PORT = args(2).toInt 57 | val NODEMANAGER_MEMORY = args(3).toInt 58 | val NODEMANAGER_CORE = args(4).toInt 59 | var NODEMANAGER_PORT = args(5).toInt 60 | val str = 61 | s""" 62 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 63 | |akka.remote.netty.tcp.hostname =${HOSTNAME} 64 | |akka.remote.netty.tcp.port=${NODEMANAGER_PORT} 65 | """.stripMargin 66 | val conf = ConfigFactory.parseString(str) 67 | val actorSystem = ActorSystem(Constant.NMAS, conf) 68 | actorSystem.actorOf(Props(new MyNodeManager(RM_HOSTNAME, RM_PORT, NODEMANAGER_MEMORY, NODEMANAGER_CORE)), Constant.NMA) 69 | } 70 | } -------------------------------------------------------------------------------- /mllib/src/main/java/com/awebone/spark/WordCountJava7.java: -------------------------------------------------------------------------------- 1 | package com.awebone.spark; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.FlatMapFunction; 8 | import org.apache.spark.api.java.function.Function2; 9 | import org.apache.spark.api.java.function.PairFunction; 10 | import org.apache.spark.api.java.function.VoidFunction; 11 | import scala.Tuple2; 12 | 13 | import java.util.Arrays; 14 | import java.util.Iterator; 15 | 16 | public class WordCountJava7 { 17 | public static void main(String[] args) { 18 | //获取程序入口 19 | SparkConf sparkConf = new SparkConf(); 20 | sparkConf.setAppName("WordCountJava7"); 21 | sparkConf.setMaster("local"); 22 | JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf); 23 | 24 | //获取数据 25 | JavaRDD linesRDD = javaSparkContext.textFile("hdfs://myha/wc/input"); 26 | 27 | //计算 28 | JavaRDD wordsRDD = linesRDD.flatMap(new FlatMapFunction() { 29 | @Override 30 | public Iterator call(String s) throws Exception { 31 | return Arrays.asList(s.split(" ")).iterator(); 32 | } 33 | }); 34 | 35 | JavaPairRDD wordAndOneRDD = wordsRDD.mapToPair(new PairFunction() { 36 | @Override 37 | public Tuple2 call(String s) throws Exception { 38 | return new Tuple2<>(s, 1); 39 | } 40 | }); 41 | 42 | JavaPairRDD wordsCountRDD = wordAndOneRDD.reduceByKey(new Function2() { 43 | @Override 44 | public Integer call(Integer integer, Integer integer2) throws Exception { 45 | return integer + integer2; 46 | } 47 | }); 48 | 49 | JavaPairRDD newWordsCountRDD = wordsCountRDD.mapToPair(new PairFunction, Integer, String>() { 50 | @Override 51 | public Tuple2 call(Tuple2 
stringIntegerTuple2) throws Exception { 52 | return stringIntegerTuple2.swap(); 53 | } 54 | }); 55 | JavaPairRDD sortedRDD = newWordsCountRDD.sortByKey(false); 56 | JavaPairRDD lastSortWordCoundRDD = sortedRDD.mapToPair(new PairFunction, String, Integer>() { 57 | @Override 58 | public Tuple2 call(Tuple2 integerStringTuple2) throws Exception { 59 | return integerStringTuple2.swap(); 60 | } 61 | }); 62 | 63 | lastSortWordCoundRDD.foreach(new VoidFunction>() { 64 | @Override 65 | public void call(Tuple2 t) throws Exception { 66 | System.out.println(t._1 + "\t" + t._2); 67 | } 68 | }); 69 | 70 | javaSparkContext.stop(); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /akka_rpc/src/main/scala/com/awebone/yarn/MyResourceManager.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.yarn 2 | 3 | import akka.actor.{Actor, ActorSystem, Props} 4 | import com.typesafe.config.ConfigFactory 5 | 6 | import scala.collection.mutable 7 | 8 | class MyResourceManager(var hostname: String, var port: Int) extends Actor { 9 | 10 | // 用来存储每个注册的NodeManager节点的信息 11 | private var id2nodemanagerinfo = new mutable.HashMap[String, NodeManagerInfo]() 12 | // 对所有注册的NodeManager进行去重,其实就是一个HashSet 13 | private var nodemanagerInfoes = new mutable.HashSet[NodeManagerInfo]() 14 | 15 | // actor在最开始的时候,会执行一次 16 | override def preStart(): Unit = { 17 | import scala.concurrent.duration._ 18 | import context.dispatcher 19 | 20 | // 调度一个任务, 每隔五秒钟执行一次,每隔5秒给自己发送一次信息 21 | context.system.scheduler.schedule(0 millis, 5000 millis, self, CheckTimeOut) 22 | } 23 | 24 | override def receive: Receive = { 25 | 26 | case RegisterNodeManager(nodemanagerid, memory, cpu) => { 27 | val nodeManagerInfo = new NodeManagerInfo(nodemanagerid, memory, cpu) 28 | 29 | // 对注册的NodeManager节点进行存储管理 30 | id2nodemanagerinfo.put(nodemanagerid, nodeManagerInfo) 31 | nodemanagerInfoes += nodeManagerInfo 32 | 33 | //把信息存到zookeeper 34 | sender() ! 
RegisteredNodeManager(hostname + ":" + port) 35 | } 36 | 37 | case Heartbeat(nodemanagerid) => { 38 | val currentTime = System.currentTimeMillis() 39 | val nodeManagerInfo = id2nodemanagerinfo(nodemanagerid) 40 | nodeManagerInfo.lastHeartBeatTime = currentTime 41 | 42 | id2nodemanagerinfo(nodemanagerid) = nodeManagerInfo 43 | nodemanagerInfoes += nodeManagerInfo 44 | } 45 | 46 | // 检查过期失效的 NodeManager 47 | case CheckTimeOut => { 48 | val currentTime = System.currentTimeMillis() 49 | 50 | // 15 秒钟失效 51 | //foreach:遍历 52 | //filter:拿到所有的已经宕机的节点 53 | nodemanagerInfoes.filter(nm => currentTime - nm.lastHeartBeatTime > 15000) 54 | .foreach(deadnm => { 55 | nodemanagerInfoes -= deadnm 56 | id2nodemanagerinfo.remove(deadnm.nodemanagerid) 57 | }) 58 | println("当前注册成功的节点数" + nodemanagerInfoes.size); 59 | } 60 | } 61 | } 62 | 63 | object MyResourceManager { 64 | def main(args: Array[String]): Unit = { 65 | val RESOURCEMANAGER_HOSTNAME = args(0) //解析的配置的日志 66 | val RESOURCEMANAGER_PORT = args(1).toInt 67 | 68 | //解析运行时所需要的参数 69 | val str = 70 | s""" 71 | |akka.actor.provider = "akka.remote.RemoteActorRefProvider" 72 | |akka.remote.netty.tcp.hostname =${RESOURCEMANAGER_HOSTNAME} 73 | |akka.remote.netty.tcp.port=${RESOURCEMANAGER_PORT} 74 | """.stripMargin 75 | 76 | val conf = ConfigFactory.parseString(str) 77 | val actorSystem = ActorSystem(Constant.RMAS, conf) 78 | 79 | //启动一个actor 80 | actorSystem.actorOf(Props(new MyResourceManager(RESOURCEMANAGER_HOSTNAME, RESOURCEMANAGER_PORT)), Constant.RMA) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/bean/VisitBean.java: -------------------------------------------------------------------------------- 1 | package com.awebone.bean; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class VisitBean implements Writable { 10 | 11 | private String session; 12 | private String remote_addr; 13 | private String inTime; 14 | private String outTime; 15 | private String inPage; 16 | private String outPage; 17 | private String referal; 18 | private int pageVisits; 19 | 20 | public void set(String session, String remote_addr, String inTime, String outTime, String inPage, String outPage, String referal, int pageVisits) { 21 | this.session = session; 22 | this.remote_addr = remote_addr; 23 | this.inTime = inTime; 24 | this.outTime = outTime; 25 | this.inPage = inPage; 26 | this.outPage = outPage; 27 | this.referal = referal; 28 | this.pageVisits = pageVisits; 29 | } 30 | 31 | public String getSession() { 32 | return session; 33 | } 34 | 35 | public void setSession(String session) { 36 | this.session = session; 37 | } 38 | 39 | public String getRemote_addr() { 40 | return remote_addr; 41 | } 42 | 43 | public void setRemote_addr(String remote_addr) { 44 | this.remote_addr = remote_addr; 45 | } 46 | 47 | public String getInTime() { 48 | return inTime; 49 | } 50 | 51 | public void setInTime(String inTime) { 52 | this.inTime = inTime; 53 | } 54 | 55 | public String getOutTime() { 56 | return outTime; 57 | } 58 | 59 | public void setOutTime(String outTime) { 60 | this.outTime = outTime; 61 | } 62 | 63 | public String getInPage() { 64 | return inPage; 65 | } 66 | 67 | public void setInPage(String inPage) { 68 | this.inPage = inPage; 69 | } 70 | 71 | public String getOutPage() { 72 | return outPage; 73 | } 74 | 75 | public void setOutPage(String outPage) { 76 | this.outPage = outPage; 77 | } 
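// VisitBean is a Hadoop Writable: the write(DataOutput) and readFields(DataInput) methods
// further down this class must keep the fields in exactly the same order
// (session, remote_addr, inTime, outTime, inPage, outPage, referal, pageVisits),
// otherwise deserialized records come back with their fields shuffled.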
78 | 79 | public String getReferal() { 80 | return referal; 81 | } 82 | 83 | public void setReferal(String referal) { 84 | this.referal = referal; 85 | } 86 | 87 | public int getPageVisits() { 88 | return pageVisits; 89 | } 90 | 91 | public void setPageVisits(int pageVisits) { 92 | this.pageVisits = pageVisits; 93 | } 94 | 95 | public void readFields(DataInput in) throws IOException { 96 | this.session = in.readUTF(); 97 | this.remote_addr = in.readUTF(); 98 | this.inTime = in.readUTF(); 99 | this.outTime = in.readUTF(); 100 | this.inPage = in.readUTF(); 101 | this.outPage = in.readUTF(); 102 | this.referal = in.readUTF(); 103 | this.pageVisits = in.readInt(); 104 | 105 | } 106 | 107 | public void write(DataOutput out) throws IOException { 108 | out.writeUTF(session); 109 | out.writeUTF(remote_addr); 110 | out.writeUTF(inTime); 111 | out.writeUTF(outTime); 112 | out.writeUTF(inPage); 113 | out.writeUTF(outPage); 114 | out.writeUTF(referal); 115 | out.writeInt(pageVisits); 116 | 117 | } 118 | 119 | @Override 120 | public String toString() { 121 | return session + "\001" + remote_addr + "\001" + inTime + "\001" + outTime + "\001" + inPage + "\001" + outPage + "\001" + referal + "\001" + pageVisits; 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/bean/PageViewsBean.java: -------------------------------------------------------------------------------- 1 | package com.awebone.bean; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class PageViewsBean implements Writable { 10 | 11 | private String session; 12 | private String remote_addr; 13 | private String timestr; 14 | private String request; 15 | private int step; 16 | private String staylong; 17 | private String referal; 18 | private String useragent; 19 | private String bytes_send; 20 | private String status; 21 | 22 | public void set(String session, String remote_addr, String useragent, String timestr, String request, int step, String staylong, String referal, String bytes_send, String status) { 23 | this.session = session; 24 | this.remote_addr = remote_addr; 25 | this.useragent = useragent; 26 | this.timestr = timestr; 27 | this.request = request; 28 | this.step = step; 29 | this.staylong = staylong; 30 | this.referal = referal; 31 | this.bytes_send = bytes_send; 32 | this.status = status; 33 | } 34 | 35 | public String getSession() { 36 | return session; 37 | } 38 | 39 | public void setSession(String session) { 40 | this.session = session; 41 | } 42 | 43 | public String getRemote_addr() { 44 | return remote_addr; 45 | } 46 | 47 | public void setRemote_addr(String remote_addr) { 48 | this.remote_addr = remote_addr; 49 | } 50 | 51 | public String getTimestr() { 52 | return timestr; 53 | } 54 | 55 | public void setTimestr(String timestr) { 56 | this.timestr = timestr; 57 | } 58 | 59 | public String getRequest() { 60 | return request; 61 | } 62 | 63 | public void setRequest(String request) { 64 | this.request = request; 65 | } 66 | 67 | public int getStep() { 68 | return step; 69 | } 70 | 71 | public void setStep(int step) { 72 | this.step = step; 73 | } 74 | 75 | public String getStaylong() { 76 | return staylong; 77 | } 78 | 79 | public void setStaylong(String staylong) { 80 | this.staylong = staylong; 81 | } 82 | 83 | public String getReferal() { 84 | return referal; 85 | } 86 | 87 | public void setReferal(String referal) { 88 | this.referal = 
referal; 89 | } 90 | 91 | public String getUseragent() { 92 | return useragent; 93 | } 94 | 95 | public void setUseragent(String useragent) { 96 | this.useragent = useragent; 97 | } 98 | 99 | public String getBytes_send() { 100 | return bytes_send; 101 | } 102 | 103 | public void setBytes_send(String bytes_send) { 104 | this.bytes_send = bytes_send; 105 | } 106 | 107 | public String getStatus() { 108 | return status; 109 | } 110 | 111 | public void setStatus(String status) { 112 | this.status = status; 113 | } 114 | 115 | public void readFields(DataInput in) throws IOException { 116 | this.session = in.readUTF(); 117 | this.remote_addr = in.readUTF(); 118 | this.timestr = in.readUTF(); 119 | this.request = in.readUTF(); 120 | this.step = in.readInt(); 121 | this.staylong = in.readUTF(); 122 | this.referal = in.readUTF(); 123 | this.useragent = in.readUTF(); 124 | this.bytes_send = in.readUTF(); 125 | this.status = in.readUTF(); 126 | } 127 | 128 | public void write(DataOutput out) throws IOException { 129 | out.writeUTF(session); 130 | out.writeUTF(remote_addr); 131 | out.writeUTF(timestr); 132 | out.writeUTF(request); 133 | out.writeInt(step); 134 | out.writeUTF(staylong); 135 | out.writeUTF(referal); 136 | out.writeUTF(useragent); 137 | out.writeUTF(bytes_send); 138 | out.writeUTF(status); 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/report/AreaRequestDistributionJob.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.report 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 8 | 9 | /** 10 | * 广告请求地域分布统计 11 | * 省市/城市 总请求 有效请求 广告请求 |参与竞价数 竞价成功数 竞价成功率 |展示量 点击量 点击率 |广告成本 广告消费 12 | * 汇总结果,是可以保存到mysql(hbase)表中的,全量结果不建议保存到mysql 13 | */ 14 | object AreaRequestDistributionJob { 15 | def main(args: Array[String]): Unit = { 16 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 17 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 18 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 19 | 20 | if(args == null || args.length < 2){ 21 | println( 22 | """Parameter Errors! Usage:
23 | |inputpath : input path 24 | |table : mysql table name 25 | """.stripMargin) 26 | System.exit(-1) 27 | } 28 | val Array(inputpath, table) = args 29 | 30 | val conf: SparkConf = new SparkConf().setAppName("AreaRequestDistributionJob").setMaster("local[*]") 31 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 32 | 33 | val input: DataFrame = spark.read.parquet(inputpath) 34 | input.createOrReplaceTempView("logs") 35 | 36 | val sql = 37 | """ 38 | |select 39 | | date_sub(current_date(), 1) data_date, 40 | | provincename province, 41 | | cityname city, 42 | | sum(if(requestmode = 1 and processnode >= 1, 1, 0)) orginal_req, 43 | | sum(if(requestmode = 1 and processnode >= 2, 1, 0)) valid_req, 44 | | sum(if(requestmode = 1 and processnode = 3, 1, 0)) ad_req, 45 | | sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and isbid = 1 and adorderid != 0 46 | | then 1 47 | | else 0 48 | | end) tpi_bid_num, 49 | | sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 50 | | then 1 51 | | else 0 52 | | end) win_bid_num, 53 | | sum(case when requestmode = 2 and iseffective = 1 54 | | then 1 55 | | else 0 56 | | end) show_ad_master_num, 57 | | sum(case when requestmode = 3 and iseffective = 1 58 | | then 1 59 | | else 0 60 | | end) click_ad_master_num, 61 | | sum(case when requestmode = 2 and iseffective = 1 and isbilling = 1 62 | | then 1 63 | | else 0 64 | | end) show_ad_media_num, 65 | | sum(case when requestmode = 3 and iseffective = 1 and isbilling = 1 66 | | then 1 67 | | else 0 68 | | end) click_ad_media_num, 69 | | round(sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 and adorderid >=200000 and adcreativeid >=200000 70 | | then winprice 71 | | else 0.0 72 | | end) / 1000, 2) dsp_ad_xf, 73 | | round(sum(case when ADPlatformProviderID >=100000 and iseffective = 1 and isbilling = 1 and iswin = 1 and adorderid >=200000 and adcreativeid >=200000 74 | | then adpayment 75 | | else 0.0 76 | | end) / 1000, 2) dsp_ad_cost 77 | |from logs 78 | |group by provincename, cityname 79 | """.stripMargin 80 | 81 | val url = "jdbc:mysql://hadoop01:3306/dmp" 82 | val properties = new Properties 83 | properties.put("user","root") 84 | properties.put("password","root") 85 | 86 | spark.sql(sql).write.mode(SaveMode.Append).jdbc(url,table,properties) 87 | 88 | spark.stop() 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/bean/WebLogBean.java: -------------------------------------------------------------------------------- 1 | package com.awebone.bean; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class WebLogBean implements Writable { 10 | private boolean valid = true; // 判断数据是否合法 11 | private String remote_addr; // 记录客户端的ip地址 12 | private String remote_user; // 记录客户端用户名称,忽略属性"-" 13 | private String time_local; // 记录访问时间与时区 14 | private String request; // 记录请求的url与http协议 15 | private String status; // 记录请求状态;成功是200 16 | private String body_bytes_sent; // 记录发送给客户端文件主体内容大小 17 | private String http_referer; // 用来记录从那个页面链接访问过来的 18 | private String http_user_agent; // 记录客户浏览器的相关信息 19 | 20 | public boolean isValid() { 21 | return valid; 22 | } 23 | 24 | public void setValid(boolean valid) { 25 | this.valid = valid; 26 | } 27 | 28 | public String getRemote_addr() { 29 | return 
remote_addr; 30 | } 31 | 32 | public void setRemote_addr(String remote_addr) { 33 | this.remote_addr = remote_addr; 34 | } 35 | 36 | public String getRemote_user() { 37 | return remote_user; 38 | } 39 | 40 | public void setRemote_user(String remote_user) { 41 | this.remote_user = remote_user; 42 | } 43 | 44 | public String getTime_local() { 45 | return time_local; 46 | } 47 | 48 | public void setTime_local(String time_local) { 49 | this.time_local = time_local; 50 | } 51 | 52 | public String getRequest() { 53 | return request; 54 | } 55 | 56 | public void setRequest(String request) { 57 | this.request = request; 58 | } 59 | 60 | public String getStatus() { 61 | return status; 62 | } 63 | 64 | public void setStatus(String status) { 65 | this.status = status; 66 | } 67 | 68 | public String getBody_bytes_sent() { 69 | return body_bytes_sent; 70 | } 71 | 72 | public void setBody_bytes_sent(String body_bytes_sent) { 73 | this.body_bytes_sent = body_bytes_sent; 74 | } 75 | 76 | public String getHttp_referer() { 77 | return http_referer; 78 | } 79 | 80 | public void setHttp_referer(String http_referer) { 81 | this.http_referer = http_referer; 82 | } 83 | 84 | public String getHttp_user_agent() { 85 | return http_user_agent; 86 | } 87 | 88 | public void setHttp_user_agent(String http_user_agent) { 89 | this.http_user_agent = http_user_agent; 90 | } 91 | 92 | public WebLogBean() { 93 | super(); 94 | } 95 | 96 | public WebLogBean(boolean valid, String remote_addr, String remote_user, String time_local, String request, 97 | String status, String body_bytes_sent, String http_referer, String http_user_agent) { 98 | super(); 99 | this.valid = valid; 100 | this.remote_addr = remote_addr; 101 | this.remote_user = remote_user; 102 | this.time_local = time_local; 103 | this.request = request; 104 | this.status = status; 105 | this.body_bytes_sent = body_bytes_sent; 106 | this.http_referer = http_referer; 107 | this.http_user_agent = http_user_agent; 108 | } 109 | 110 | @Override 111 | public String toString() { 112 | return valid + "\001" + remote_addr + "\001" + remote_user + "\001" + time_local + "\001" + request + "\001" 113 | + status + "\001" + body_bytes_sent + "\001" + http_referer + "\001" + http_user_agent; 114 | } 115 | 116 | // 反序列化 117 | public void readFields(DataInput in) throws IOException { 118 | this.valid = in.readBoolean(); 119 | this.remote_addr = in.readUTF(); 120 | this.remote_user = in.readUTF(); 121 | this.time_local = in.readUTF(); 122 | this.request = in.readUTF(); 123 | this.status = in.readUTF(); 124 | this.body_bytes_sent = in.readUTF(); 125 | this.http_referer = in.readUTF(); 126 | this.http_user_agent = in.readUTF(); 127 | } 128 | 129 | // 序列化 130 | public void write(DataOutput out) throws IOException { 131 | out.writeBoolean(valid); 132 | out.writeUTF(remote_addr); 133 | out.writeUTF(remote_user); 134 | out.writeUTF(time_local); 135 | out.writeUTF(request); 136 | out.writeUTF(status); 137 | out.writeUTF(body_bytes_sent); 138 | out.writeUTF(http_referer); 139 | out.writeUTF(http_user_agent); 140 | } 141 | 142 | } 143 | -------------------------------------------------------------------------------- /weblog/src/main/java/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | 
dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /dmp/src/main/resources/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /mllib/src/main/resources/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | 
dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | dfs.replication 23 | 2 24 | 25 | 26 | 27 | 28 | dfs.namenode.name.dir 29 | /home/hadoop/data/hadoopdata/dfs/name 30 | 31 | 32 | dfs.datanode.data.dir 33 | /home/hadoop/data/hadoopdata/dfs/data 34 | 35 | 36 | 37 | 38 | dfs.webhdfs.enabled 39 | true 40 | 41 | 42 | 43 | 44 | dfs.nameservices 45 | myha 46 | 47 | 48 | 49 | 50 | dfs.ha.namenodes.myha 51 | nn1,nn2 52 | 53 | 54 | 55 | 56 | dfs.namenode.rpc-address.myha.nn1 57 | hadoop01:9000 58 | 59 | 60 | 61 | 62 | dfs.namenode.http-address.myha.nn1 63 | hadoop01:50070 64 | 65 | 66 | 67 | 68 | dfs.namenode.rpc-address.myha.nn2 69 | hadoop02:9000 70 | 71 | 72 | 73 | 74 | dfs.namenode.http-address.myha.nn2 75 | hadoop02:50070 76 | 77 | 78 | 79 | 80 | dfs.namenode.shared.edits.dir 81 | qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/myha 82 | 83 | 84 | 85 | 86 | dfs.journalnode.edits.dir 87 | /home/hadoop/data/journaldata 88 | 89 | 90 | 91 | 92 | dfs.ha.automatic-failover.enabled 93 | true 94 | 95 | 96 | 97 | 98 | dfs.client.failover.proxy.provider.myha 99 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 100 | 101 | 102 | 103 | 104 | dfs.ha.fencing.methods 105 | 106 | sshfence 107 | shell(/bin/true) 108 | 109 | 110 | 111 | 112 | 113 | dfs.ha.fencing.ssh.private-key-files 114 | /home/hadoop/.ssh/id_rsa 115 | 116 | 117 | 118 | 119 | dfs.ha.fencing.ssh.connect-timeout 120 | 30000 121 | 122 | 123 | 124 | ha.failover-controller.cli-check.rpc-timeout.ms 125 | 60000 126 | 127 | 128 | -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/click/ClickModel.java: -------------------------------------------------------------------------------- 1 | package com.awebone.click; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | import java.util.Comparator; 8 | 9 | import org.apache.commons.beanutils.BeanUtils; 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.LongWritable; 13 | import 
org.apache.hadoop.io.NullWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | import com.awebone.bean.PageViewsBean; 22 | import com.awebone.bean.VisitBean; 23 | import com.awebone.bean.WebLogBean; 24 | 25 | /** 26 | * map端: 相同的会话的数据 发送到 reduce 27 | * key: session 28 | * value: 其他的字段 29 | 访问时间 url step 外链 ip 30 | reduce端: 31 | 相同session的数据过来了 32 | 按照step排序 33 | list 第一个开始 34 | list 最后一个结束 35 | 封装 发送 36 | * 37 | */ 38 | public class ClickModel { 39 | static class ClickModelMapper extends Mapper{ 40 | Text mk = new Text(); 41 | PageViewsBean pbean = new PageViewsBean(); 42 | 43 | @Override 44 | protected void map(LongWritable key, Text value, 45 | Mapper.Context context) 46 | throws IOException, InterruptedException { 47 | String[] fields = value.toString().split("\001"); 48 | if (fields.length == 11){ 49 | mk.set(fields[0]); 50 | int step=Integer.parseInt(fields[6]); 51 | pbean.set(fields[0], fields[1], fields[10], fields[3], fields[4],step, 52 | fields[5], fields[9], fields[8], fields[7]); 53 | context.write(mk, pbean); 54 | } 55 | } 56 | } 57 | 58 | static class ClickModelReducer extends Reducer{ 59 | VisitBean vb=new VisitBean(); 60 | 61 | @Override 62 | protected void reduce(Text key, Iterable values, 63 | Reducer.Context context) 64 | throws IOException, InterruptedException { 65 | ArrayList list = new ArrayList(); 66 | for (PageViewsBean v:values){ 67 | PageViewsBean pb = new PageViewsBean(); 68 | try { 69 | BeanUtils.copyProperties(pb, v); 70 | list.add(pb); 71 | } catch (IllegalAccessException e) { 72 | // TODO Auto-generated catch block 73 | e.printStackTrace(); 74 | } catch (InvocationTargetException e) { 75 | // TODO Auto-generated catch block 76 | e.printStackTrace(); 77 | } 78 | } 79 | 80 | Collections.sort(list, new Comparator() { 81 | public int compare(PageViewsBean o1, PageViewsBean o2) { 82 | if(o1 == null || o2 == null){ 83 | return 0; 84 | } 85 | return o1.getStep()-o2.getStep(); 86 | } 87 | }); 88 | 89 | //构造发送的对象 90 | vb.set(key.toString(), list.get(0).getRemote_addr(), 91 | list.get(0).getTimestr(), list.get(list.size()-1).getTimestr(), 92 | list.get(0).getRequest(), list.get(list.size()-1).getRequest(), 93 | list.get(0).getReferal(), list.get(list.size()-1).getStep()); 94 | context.write(vb, NullWritable.get()); 95 | } 96 | } 97 | 98 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException { 99 | System.setProperty("HADOOP_USER_NAME", "hadoop"); 100 | Configuration conf = new Configuration(); 101 | conf.set("fs.defaultFS", "hdfs://myha/"); 102 | Job job = Job.getInstance(conf); 103 | 104 | job.setJarByClass(ClickModel.class); 105 | 106 | job.setMapperClass(ClickModelMapper.class); 107 | job.setReducerClass(ClickModelReducer.class); 108 | 109 | job.setMapOutputKeyClass(Text.class); 110 | job.setMapOutputValueClass(PageViewsBean.class); 111 | job.setOutputKeyClass(VisitBean.class); 112 | job.setOutputValueClass(NullWritable.class); 113 | 114 | FileInputFormat.setInputPaths(job, new Path("/weblog/click/stream/20200221")); 115 | FileOutputFormat.setOutputPath(job, new Path("/weblog/click/model/20200221")); 116 | 117 | boolean res = job.waitForCompletion(true); 118 | System.exit(res ? 
0 : 1); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /akka_rpc/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | awebone 8 | akka_rpc 9 | 1.0-SNAPSHOT 10 | 11 | akka_rpc 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF8 17 | 1.8 18 | 1.8 19 | UTF-8 20 | 2.11.8 21 | 2.11.8 22 | 2.4.17 23 | 24 | 25 | 26 | 27 | org.scala-lang 28 | scala-library 29 | ${scala.version} 30 | 31 | 32 | 33 | com.typesafe.akka 34 | akka-actor_2.11 35 | ${akka.version} 36 | 37 | 38 | 39 | 40 | org.scala-lang 41 | scala-actors 42 | ${scala.actors.version} 43 | 44 | 45 | 46 | com.typesafe.akka 47 | akka-remote_2.11 48 | ${akka.version} 49 | 50 | 51 | 52 | 53 | org.apache.hadoop 54 | hadoop-client 55 | 2.7.6 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | net.alchim31.maven 65 | scala-maven-plugin 66 | 3.2.2 67 | 68 | 69 | org.apache.maven.plugins 70 | maven-compiler-plugin 71 | 3.5.1 72 | 73 | 74 | 75 | 76 | 77 | net.alchim31.maven 78 | scala-maven-plugin 79 | 80 | 81 | scala-compile-first 82 | process-resources 83 | 84 | add-source 85 | compile 86 | 87 | 88 | 89 | scala-test-compile 90 | process-test-resources 91 | 92 | testCompile 93 | 94 | 95 | 96 | 97 | 98 | 99 | org.apache.maven.plugins 100 | maven-compiler-plugin 101 | 102 | 103 | compile 104 | 105 | compile 106 | 107 | 108 | 109 | 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-shade-plugin 114 | 2.4.3 115 | 116 | 117 | package 118 | 119 | shade 120 | 121 | 122 | 123 | 124 | *:* 125 | 126 | META-INF/*.SF 127 | META-INF/*.DSA 128 | META-INF/*.RSA 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /mllib/src/main/scala/com/awebone/spark/MovieLensSparkShell.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.spark 2 | 3 | import org.apache.spark.mllib.evaluation.RegressionMetrics 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} 7 | import org.apache.spark.sql.{SQLContext, SparkSession} 8 | 9 | case class Movie(movieId: Int, title: String, genres: Seq[String]) 10 | 11 | case class User(userId: Int, gender: String, age: Int, occupation: Int, zip: String) 12 | 13 | object DataProcess { 14 | //获取程序入口 15 | val sparkConf: SparkConf = new SparkConf() 16 | sparkConf.setAppName(DataProcess.getClass.getSimpleName) 17 | sparkConf.setMaster("local") 18 | val sc: SparkContext = new SparkContext(sparkConf) 19 | val sqlContext = new SQLContext(sc) 20 | 21 | import sqlContext.implicits._ 22 | // val spark:SparkSession = SparkSession.builder().appName("MyFirstSparkSQL").config("someKey", "someValue").master("local").getOrCreate() 23 | 24 | //Define parse function 25 | def parseMovie(str: String): Movie = { 26 | val fields = str.split("::") 27 | assert(fields.size == 3) 28 | Movie(fields(0).toInt, fields(1).toString, Seq(fields(2))) 29 | } 30 | 31 | def parseUser(str: String): User = { 32 | val fields = str.split("::") 33 | assert(fields.size == 5) 34 | User(fields(0).toInt, fields(1).toString, fields(2).toInt, fields(3).toInt, fields(4).toString) 35 | } 36 | 37 | def parseRating(str: String): Rating = { 38 | val fields = str.split("::") 39 | assert(fields.size == 4) 40 | Rating(fields(0).toInt, fields(1).toInt, fields(2).toInt) 41 | } 42 | 43 | //Rating analysis 44 
| val ratingText = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\ratings.dat") 45 | ratingText.first() 46 | val ratingRDD = ratingText.map(parseRating).cache() 47 | println("Total number of ratings: " + ratingRDD.count()) 48 | println("Total number of movies rated: " + ratingRDD.map(_.product).distinct().count()) 49 | println("Total number of users who rated movies: " + ratingRDD.map(_.user).distinct().count()) 50 | 51 | //Create DataFrames 52 | val ratingDF = ratingRDD.toDF 53 | // val ratingDF = spark.createDataFrame(ratingRDD) 54 | val movieDF = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\movies.dat").map(parseMovie).toDF 55 | val userDF = sc.textFile("file://D:\\workplace\\spark\\core\\src\\main\\resources\\ml-1m\\users.dat").map(parseUser).toDF 56 | ratingDF.printSchema 57 | // ratingDF.show 58 | movieDF.printSchema 59 | userDF.printSchema 60 | 61 | //注册成表 62 | ratingDF.registerTempTable("ratings") 63 | // ratingDF.createOrReplaceTempView(“ratings”) 64 | movieDF.registerTempTable("movies") 65 | userDF.registerTempTable("users") 66 | 67 | //数据探索 68 | val rantingMovies = sqlContext.sql( 69 | """ 70 | |select title,ramx,rmin,ucnt from 71 | |(select product, max(rating) as rmax, min(rating) as rmin, count(distinct user) as ucnt from ratings group by product) rantingsCNT 72 | |join movies on product=movieId 73 | |order by ucnt desc 74 | """.stripMargin) 75 | rantingMovies.show() 76 | 77 | val mostActiveUser = sqlContext.sql( 78 | """ 79 | |select user,count(*) as cnt 80 | |from ratings group by user order by cnt desc limit 10 81 | """.stripMargin) 82 | mostActiveUser.show() 83 | 84 | val userRating = sqlContext.sql( 85 | """ 86 | |select distinct title,rating 87 | |from ratings join movies on movieId=product 88 | |where user=4169 and rating>4 89 | """.stripMargin) 90 | userRating.show() 91 | 92 | //ALS model 93 | //数据切分 94 | val splitsData = ratingRDD.randomSplit(Array(0.8, 0.2), 0L) 95 | val trainingSet = splitsData(0).cache() 96 | val testSet = splitsData(0).cache() 97 | trainingSet.count() 98 | testSet.count() 99 | 100 | //构建模型 101 | val model = new ALS() 102 | .setRank(20) 103 | .setIterations(10) 104 | .run(trainingSet) 105 | 106 | //进行推荐 107 | val recomForTopUser = model.recommendProducts(4169, 5) 108 | val movieTitle = movieDF.rdd.map(x => (x(0), x(1))).collectAsMap 109 | val recomResult = recomForTopUser.map(rating => (movieTitle(rating.product), rating.rating)).foreach(println) 110 | 111 | //测试集预测 112 | val testUserProduct = testSet.map { 113 | case Rating(user, product, rating) => (user, product) 114 | } 115 | val testUserProductPredict = model.predict(testUserProduct) 116 | testUserProductPredict.take(10).mkString("\n") 117 | 118 | //模型评估 119 | val testSetPair = testSet.map { 120 | case Rating(user, product, rating) => ((user, product), rating) 121 | } 122 | val predictionsPair = testUserProductPredict.map { 123 | case Rating(user, product, rating) => ((user, product), rating) 124 | } 125 | 126 | val joinTestPredict = testSetPair.join(predictionsPair) 127 | val mae = joinTestPredict.map { 128 | case ((user, product), (ratingT, ratingP)) => 129 | val err = ratingT - ratingP 130 | Math.abs(err) 131 | }.mean() 132 | val fp = joinTestPredict.filter { 133 | case ((user, product), (ratingT, ratingP)) => 134 | (ratingT <= 1 & ratingP >= 4) 135 | }.count() 136 | 137 | //使用库进行评估 138 | val ratingTP = joinTestPredict.map { 139 | case ((user, product), (ratingT, ratingP)) => 140 | (ratingP, ratingT) 141 | } 142 | val evalutor = 
new RegressionMetrics(ratingTP) 143 | evalutor.meanAbsoluteError 144 | evalutor.rootMeanSquaredError 145 | } 146 | -------------------------------------------------------------------------------- /weblog/src/main/java/hive-op.txt: -------------------------------------------------------------------------------- 1 | 启动服务 2 | nohup hiveserver2 1>~/logs/hive_std.log 2>~/logs/hive_err.log & 3 | 4 | 连接服务 5 | beeline或者hive 6 | !connect jdbc:hive2://hadoop04:10000 7 | show databases; 8 | show tables; 9 | 10 | 11 | 创建表 12 | ODS层 13 | 原始数据表: 14 | create database if not exists weblog; 15 | use weblog; 16 | drop table if exists weblog.ods_weblog_origin; 17 | create table weblog.ods_weblog_origin( 18 | valid string, 19 | remote_addr string, 20 | remote_user string, 21 | time_local string, 22 | request string, 23 | status string, 24 | body_bytes_sent string, 25 | http_referer string, 26 | http_user_agent string) 27 | partitioned by (datestr string) 28 | row format delimited 29 | fields terminated by '\001'; 30 | 31 | 32 | dw层 33 | 点击流事件表: 34 | create database if not exists weblog; 35 | use weblog; 36 | drop table if exists weblog.click_stream_pageviews; 37 | create table weblog.click_stream_pageviews ( 38 | session string, 39 | remote_addr string, 40 | remote_user string, 41 | time_local string, 42 | request string, 43 | page_staylong string, 44 | visit_step string, 45 | status string, 46 | body_bytes_sent string, 47 | http_referer string, 48 | http_user_agent string) 49 | partitioned by (datestr string) 50 | row format delimited 51 | fields terminated by '\001'; 52 | 53 | 会话访问统计表 点击流访客表 54 | create database if not exists weblog; 55 | use weblog; 56 | drop table if exists weblog.click_stream_visit; 57 | create table weblog.click_stream_visit( 58 | session string, 59 | remote_addr string, 60 | inTime string, 61 | outTime string, 62 | inPage string, 63 | outPage string, 64 | referal string, 65 | pageVisits int) 66 | partitioned by (datestr string); 67 | 68 | 69 | 加载数据 70 | /weblog/pre/20200221 原始表 71 | load data inpath '/weblog/pre/20200221' into table weblog.ods_weblog_origin partition(datestr = "20200221"); 72 | 73 | /weblog/click/stream/20200221 点击流事件表 74 | load data inpath "/weblog/click/stream/20200221" into table weblog.click_stream_pageviews partition(datestr ="20200221"); 75 | 76 | /weblog/click/model/20200221 点击流访客表 77 | load data inpath "/weblog/click/model/20200221" into table weblog.click_stream_visit partition(datestr ="20200221"); 78 | 79 | 查询数据 80 | select * from weblog.ods_weblog_origin limit 1; 81 | select * from weblog.click_stream_pageviews limit 1; 82 | select * from weblog.click_stream_visit limit 1; 83 | 84 | 85 | dw层创建明细宽表: 86 | create database if not exists weblog; 87 | use weblog; 88 | drop table if exists weblog.ods_weblog_detail; 89 | create table weblog.ods_weblog_detail( 90 | valid string comment "有效标识", 91 | remote_addr string comment "来源 IP", 92 | remote_user string comment "用户标识", 93 | time_local string comment "访问完整时间", 94 | daystr string comment "访问日期", 95 | timestr string comment "访问时间", 96 | year string comment "访问年", 97 | month string comment "访问月", 98 | day string comment "访问日", 99 | hour string comment "访问时", 100 | request string comment "请求的 url", 101 | status string comment "响应码", 102 | body_bytes_sent string comment "传输字节数", 103 | http_referer string comment "来源 url", 104 | ref_host string comment "来源的 host", 105 | ref_path string comment "来源的路径", 106 | ref_query string comment "来源参数 query", 107 | ref_query_id string comment "来源参数 query 的值", 108 | http_user_agent 
string comment "客户终端标识" 109 | ) 110 | partitioned by(datestr string) 111 | row format delimited fields terminated by '\001'; 112 | 113 | 114 | 设置本地模式和打印表头 115 | set hive.exec.mode.local.auto=true; 116 | set hive.cli.print.header=true; 117 | 118 | 119 | 解析url:解析外链的信息 120 | create database if not exists weblog; 121 | use weblog; 122 | drop table if exists weblog.t_ods_tmp_referurl; 123 | create table weblog.t_ods_tmp_referurl as 124 | SELECT a.*, b.* 125 | FROM ods_weblog_origin a 126 | LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH', 'QUERY','QUERY:id') b 127 | as host, path, query, query_id; 128 | 129 | 查询外链信息临时表 130 | select * from weblog.t_ods_tmp_referurl a where a.host is not null limit 1; 131 | 132 | 最终明细宽表 133 | create database if not exists weblog; 134 | use weblog; 135 | drop table if exists weblog.t_ods_tmp_detail; 136 | create table weblog.t_ods_tmp_detail as 137 | select b.*,substring(time_local,0,10) as daystr, 138 | substring(time_local,11) as tmstr, 139 | substring(time_local,0,4) as year, 140 | substring(time_local,6,2) as month, 141 | substring(time_local,9,2) as day, 142 | substring(time_local,12,2) as hour 143 | From t_ods_tmp_referurl b; 144 | 145 | 查询宽表 146 | select * from weblog.t_ods_tmp_detail where month is not null limit 3; 147 | 148 | 149 | 统计日志中的相关指标 150 | 1)pv:page view 151 | click_stream_pageviews 76 152 | select count(*) from click_stream_pageviews; 153 | 154 | 2)uv:独立用户数 独立会话数,统计的会话的个数 155 | click_stream_visit 57 156 | select count(*) from click_stream_visit; 157 | 158 | 3)dv:平均每一个会话的访问深度,所有的pv / uv 159 | 关联 160 | set hive.strict.checks.cartesian.product=false; 161 | set hive.mapred.mode=nonstrict; 162 | 163 | select a.pv/b.uv avgdv 164 | from 165 | (select count(*) pv from click_stream_pageviews ) a join 166 | (select count(*) uv from click_stream_visit) b; 167 | 168 | 4)转化率 169 | 数据order.txt 170 | 1,广告,10000 171 | 2,菜单,3000 172 | 3,商品详情,2600 173 | 4,购物车,300 174 | 5,下单,200 175 | 6,支付,190 176 | 7,支付成功,189 177 | 178 | 建表加载数据 179 | create database if not exists hive_order; 180 | use hive_order; 181 | drop table if exists t_order; 182 | create table t_order(step int, name string, pv int) row format delimited fields terminated by ","; 183 | load data local inpath "/home/hadoop/tmpdata/order.txt" into table t_order; 184 | select * from t_order limit 10; 185 | 186 | 查转化率 187 | select step,name,pv,pv/lpv t 188 | from 189 | (select step,name,pv,lag(pv,1,pv) over(order by step) lpv from t_order) a; 190 | 191 | -------------------------------------------------------------------------------- /mllib/src/main/resources/ml-1m/README: -------------------------------------------------------------------------------- 1 | SUMMARY 2 | ================================================================================ 3 | 4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 5 | made by 6,040 MovieLens users who joined MovieLens in 2000. 6 | 7 | USAGE LICENSE 8 | ================================================================================ 9 | 10 | Neither the University of Minnesota nor any of the researchers 11 | involved can guarantee the correctness of the data, its suitability 12 | for any particular purpose, or the validity of results based on the 13 | use of the data set. The data set may be used for any research 14 | purposes under the following conditions: 15 | 16 | * The user may not state or imply any endorsement from the 17 | University of Minnesota or the GroupLens Research Group. 
18 | 19 | * The user must acknowledge the use of the data set in 20 | publications resulting from the use of the data set, and must 21 | send us an electronic or paper copy of those publications. 22 | 23 | * The user may not redistribute the data without separate 24 | permission. 25 | 26 | * The user may not use this information for any commercial or 27 | revenue-bearing purposes without first obtaining permission 28 | from a faculty member of the GroupLens Research Project at the 29 | University of Minnesota. 30 | 31 | If you have any further questions or comments, please contact GroupLens 32 | . 33 | 34 | ACKNOWLEDGEMENTS 35 | ================================================================================ 36 | 37 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data 38 | set. 39 | 40 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT 41 | ================================================================================ 42 | 43 | The GroupLens Research Project is a research group in the Department of 44 | Computer Science and Engineering at the University of Minnesota. Members of 45 | the GroupLens Research Project are involved in many research projects related 46 | to the fields of information filtering, collaborative filtering, and 47 | recommender systems. The project is lead by professors John Riedl and Joseph 48 | Konstan. The project began to explore automated collaborative filtering in 49 | 1992, but is most well known for its world wide trial of an automated 50 | collaborative filtering system for Usenet news in 1996. Since then the project 51 | has expanded its scope to research overall information filtering solutions, 52 | integrating in content-based methods as well as improving current collaborative 53 | filtering technology. 54 | 55 | Further information on the GroupLens Research project, including research 56 | publications, can be found at the following web site: 57 | 58 | http://www.grouplens.org/ 59 | 60 | GroupLens Research currently operates a movie recommender based on 61 | collaborative filtering: 62 | 63 | http://www.movielens.org/ 64 | 65 | RATINGS FILE DESCRIPTION 66 | ================================================================================ 67 | 68 | All ratings are contained in the file "ratings.dat" and are in the 69 | following format: 70 | 71 | UserID::MovieID::Rating::Timestamp 72 | 73 | - UserIDs range between 1 and 6040 74 | - MovieIDs range between 1 and 3952 75 | - Ratings are made on a 5-star scale (whole-star ratings only) 76 | - Timestamp is represented in seconds since the epoch as returned by time(2) 77 | - Each user has at least 20 ratings 78 | 79 | USERS FILE DESCRIPTION 80 | ================================================================================ 81 | 82 | User information is in the file "users.dat" and is in the following 83 | format: 84 | 85 | UserID::Gender::Age::Occupation::Zip-code 86 | 87 | All demographic information is provided voluntarily by the users and is 88 | not checked for accuracy. Only users who have provided some demographic 89 | information are included in this data set. 
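As a minimal sketch of reading this layout with the Spark RDD API used elsewhere in this repository (the relative path and the SparkContext value sc are assumptions for illustration), the file can be split on "::" and the documented age buckets tallied like so:

val users = sc.textFile("ml-1m/users.dat")            // assumed local path
  .map(_.split("::"))
  .filter(_.length == 5)                              // UserID::Gender::Age::Occupation::Zip-code
  .map(f => (f(0).toInt, f(1), f(2).toInt, f(3).toInt, f(4)))

// count users per documented age bucket: 1, 18, 25, 35, 45, 50, 56
users.map(_._3).countByValue().toSeq.sortBy(_._1).foreach(println)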
90 | 91 | - Gender is denoted by a "M" for male and "F" for female 92 | - Age is chosen from the following ranges: 93 | 94 | * 1: "Under 18" 95 | * 18: "18-24" 96 | * 25: "25-34" 97 | * 35: "35-44" 98 | * 45: "45-49" 99 | * 50: "50-55" 100 | * 56: "56+" 101 | 102 | - Occupation is chosen from the following choices: 103 | 104 | * 0: "other" or not specified 105 | * 1: "academic/educator" 106 | * 2: "artist" 107 | * 3: "clerical/admin" 108 | * 4: "college/grad student" 109 | * 5: "customer service" 110 | * 6: "doctor/health care" 111 | * 7: "executive/managerial" 112 | * 8: "farmer" 113 | * 9: "homemaker" 114 | * 10: "K-12 student" 115 | * 11: "lawyer" 116 | * 12: "programmer" 117 | * 13: "retired" 118 | * 14: "sales/marketing" 119 | * 15: "scientist" 120 | * 16: "self-employed" 121 | * 17: "technician/engineer" 122 | * 18: "tradesman/craftsman" 123 | * 19: "unemployed" 124 | * 20: "writer" 125 | 126 | MOVIES FILE DESCRIPTION 127 | ================================================================================ 128 | 129 | Movie information is in the file "movies.dat" and is in the following 130 | format: 131 | 132 | MovieID::Title::Genres 133 | 134 | - Titles are identical to titles provided by the IMDB (including 135 | year of release) 136 | - Genres are pipe-separated and are selected from the following genres: 137 | 138 | * Action 139 | * Adventure 140 | * Animation 141 | * Children's 142 | * Comedy 143 | * Crime 144 | * Documentary 145 | * Drama 146 | * Fantasy 147 | * Film-Noir 148 | * Horror 149 | * Musical 150 | * Mystery 151 | * Romance 152 | * Sci-Fi 153 | * Thriller 154 | * War 155 | * Western 156 | 157 | - Some MovieIDs do not correspond to a movie due to accidental duplicate 158 | entries and/or test entries 159 | - Movies are mostly entered by hand, so errors and inconsistencies may exist 160 | -------------------------------------------------------------------------------- /mllib/src/main/scala/com/awebone/spark/MovieLensALS.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.spark 2 | 3 | import java.io.File 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.mllib.evaluation.RegressionMetrics 7 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | 11 | import scala.util.Random 12 | 13 | object MovieLensALS { 14 | //1. Define a rating elicitation function 15 | def elicitateRating(movies: Seq[(Int, String)]) = { 16 | val prompt = "Please rate the following movie(1-5(best) or 0 if not seen: )" 17 | println(prompt) 18 | 19 | val ratings = movies.flatMap { x => 20 | var rating: Option[Rating] = None 21 | var vaild = false 22 | while (!vaild) { 23 | println(x._2 + " :") 24 | try { 25 | val r = Console.readInt() 26 | if (r > 5 || r < 0) { 27 | println(prompt) 28 | } else { 29 | vaild = true 30 | if (r > 0) { 31 | rating = Some(Rating(0, x._1, r)) 32 | } 33 | } 34 | } catch { 35 | case e: Exception => println(prompt) 36 | } 37 | } 38 | rating match { 39 | case Some(r) => Iterator(r) 40 | case None => Iterator.empty 41 | } 42 | } 43 | if (ratings.isEmpty) { 44 | error("No ratings provided!") 45 | } else { 46 | ratings 47 | } 48 | } 49 | 50 | //2. 
Define a RMSE computation function 51 | def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]) = { 52 | val prediction = model.predict(data.map(x => (x.user, x.product))) 53 | val predDataJoined = prediction 54 | .map(x => ((x.user, x.product), x.rating)) 55 | .join(data.map(x => ((x.user, x.product), x.rating))) 56 | .values 57 | new RegressionMetrics(predDataJoined).rootMeanSquaredError 58 | } 59 | 60 | //3. Main 61 | def main(args: Array[String]) = { 62 | //3.1 Setup env 63 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 64 | if (args.length != 1) { 65 | println("Usage: movieLensDir") 66 | sys.exit(1) 67 | } 68 | 69 | val conf = new SparkConf() 70 | .setAppName("MovieLensALS") 71 | .setMaster("local") 72 | .set("spark.executor.memory", "500m") 73 | val sc = new SparkContext(conf) 74 | 75 | //3.2 Load ratings data and know your data 76 | val movieLensHomeDir = args(0) 77 | val ratings = sc 78 | .textFile(new File(movieLensHomeDir, "ratings.dat").toString) 79 | .map { line => 80 | val fields = line.split("::") 81 | (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)) 82 | } 83 | val movies = sc 84 | .textFile(new File(movieLensHomeDir, "movies.dat").toString) 85 | .map { line => 86 | val fields = line.split("::") 87 | (fields(0).toInt, fields(1).toString) 88 | } 89 | .collectAsMap() 90 | 91 | val numRatings = ratings.count() 92 | val numUser = ratings.map(x => x._2.user).distinct().count() 93 | val numMovie = ratings.map(x => x._2.product).distinct().count() 94 | println("Got " + numRatings + " ratings from " + numUser + " users on " + numMovie + " movies.") 95 | 96 | //3.3 Elicitate personal rating 97 | val topMovies = ratings 98 | .map(_._2.product) 99 | .countByValue() 100 | .toSeq 101 | .sortBy(-_._2) 102 | .take(50) 103 | .map(_._1) 104 | val random = new Random(0) 105 | val selectMovies = topMovies 106 | .filter(x => random.nextDouble() < 0.2) 107 | .map(x => (x, movies(x))) 108 | 109 | val myRatings = elicitateRating(selectMovies) 110 | val myRatingsRDD = sc.parallelize(myRatings, 1) 111 | 112 | //3.4 Split data into train(60%), validation(20%) and test(20%) 113 | val numPartitions = 10 114 | val trainSet = ratings 115 | .filter(x => x._1 < 6) 116 | .map(_._2) 117 | .union(myRatingsRDD) 118 | .repartition(numPartitions) 119 | .persist() 120 | val validationSet = ratings 121 | .filter(x => x._1 >= 6 && x._1 < 8) 122 | .map(_._2) 123 | .persist() 124 | val testSet = ratings 125 | .filter(x => x._1 >= 8) 126 | .map(_._2) 127 | .persist() 128 | 129 | val numTrain = trainSet.count() 130 | val numValidation = validationSet.count() 131 | val numTest = testSet.count() 132 | println("Training data: " + numTrain + " Validation data: " + numValidation + " Test data: " + numTest) 133 | 134 | //3.5 Train model and optimize model with validation set 135 | val numRanks = List(8, 12) 136 | val numIters = List(10, 20) 137 | val numLambdas = List(0.1, 0.01) 138 | var bestRmse = Double.MaxValue 139 | var bestModel: Option[MatrixFactorizationModel] = None 140 | var bestRanks = -1 141 | var bestIters = 0 142 | var bestLambdas = -1.0 143 | 144 | for (rank <- numRanks; iter <- numIters; lambda <- numLambdas) { 145 | val model = ALS.train(trainSet, rank, iter, lambda) 146 | val validationRmse = computeRmse(model, validationSet) 147 | println("RMSE(validation) = " + validationRmse + " with ranks = " + rank + ", iter = " + iter + ", Lambda = " + lambda) 148 | 149 | if (validationRmse < bestRmse) { 150 | bestRmse = validationRmse 151 | bestModel = 
Some(model) 152 | bestIters = iter 153 | bestLambdas = lambda 154 | bestRanks = rank 155 | } 156 | } 157 | 158 | //3.6 Evaluate model with test set 159 | val testRmse = computeRmse(bestModel.get, testSet) 160 | println("The best model was trained with rank = " + bestRanks + ", iter = " + bestIters + ", Lambda = " + bestLambdas + " and compute RMSE on test set is " + testRmse) 161 | 162 | //3.7 Create a baseline and compare it with best model 163 | val meanRating = trainSet.union(validationSet).map(_.rating).mean() 164 | val baselineRmse = new RegressionMetrics(testSet.map(x => (x.rating, meanRating))).rootMeanSquaredError 165 | val improvement = (baselineRmse - testRmse) / baselineRmse * 100 166 | println("The best model improves the baseline by %1.2f".format(improvement) + "%.") 167 | 168 | //3.8 Make a personal recommendation 169 | val moviesId = myRatings.map(_.product) 170 | val candidates = sc.parallelize(movies.keys.filter(!moviesId.contains(_)).toSeq) 171 | val recommendations = bestModel.get 172 | .predict(candidates.map(x => (0, x))) 173 | .sortBy(-_.rating) 174 | .take(50) 175 | 176 | var i = 1 177 | println("Movies recommended for you: ") 178 | recommendations.foreach { line => 179 | println("%2d".format(i) + " : " + movies(line.product)) 180 | i += 1 181 | } 182 | 183 | sc.stop() 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/LogAnalysis.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | import java.util.{Date, Properties} 6 | 7 | import org.apache.flink.api.common.functions.RuntimeContext 8 | import org.apache.flink.api.common.serialization.SimpleStringSchema 9 | import org.apache.flink.api.java.tuple.Tuple 10 | import org.apache.flink.streaming.api.TimeCharacteristic 11 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks 12 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 13 | import org.apache.flink.streaming.api.scala.function.WindowFunction 14 | import org.apache.flink.streaming.api.watermark.Watermark 15 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 16 | import org.apache.flink.streaming.api.windowing.time.Time 17 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 18 | import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer} 19 | import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink 20 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 21 | import org.apache.flink.util.Collector 22 | import org.apache.http.HttpHost 23 | import org.elasticsearch.action.index.IndexRequest 24 | import org.elasticsearch.client.Requests 25 | import org.slf4j.LoggerFactory 26 | 27 | import scala.collection.mutable.ArrayBuffer 28 | 29 | /** 30 | * 日志分析系统 31 | * * 功能: 32 | * * 最近一分钟每个域名产生的流量统计 33 | */ 34 | object LogAnalysis { 35 | 36 | def main(args: Array[String]): Unit = { 37 | //在生产上进行日志的输出,采用以下方式 38 | val logger = LoggerFactory.getLogger("LogAnalysis") 39 | 40 | val env = StreamExecutionEnvironment.getExecutionEnvironment 41 | //设置事件时间作为flink处理的基准时间 42 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 43 | import org.apache.flink.api.scala._ 44 | 45 | /** 46 | * 读取kafka集群数据 47 | */ 48 | val topic = "cdnlog" 49 | val properties: 
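// NOTE: the properties below configure the Kafka source. For the universal FlinkKafkaConsumer
// imported here, only "bootstrap.servers" and "group.id" are strictly needed; "zookeeper.connect"
// is a Kafka 0.8-era setting that newer clients ignore. A minimal equivalent setup (illustrative,
// mirroring the code that follows):
//   val props = new Properties()
//   props.setProperty("bootstrap.servers", "hadoop01:9092")
//   props.setProperty("group.id", "test-cdnlog")
//   val source = new FlinkKafkaConsumer[String]("cdnlog", new SimpleStringSchema(), props)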
Properties = new Properties() 50 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092") 51 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk 52 | // properties.setProperty("enable.auto.commit", "true") 53 | // properties.setProperty("bootstrap.servers","hadoop04:9092") 54 | properties.setProperty("group.id","test-cdnlog") 55 | 56 | val consumer = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties) 57 | val data = env.addSource(consumer) // 接受kafka数据 58 | // data.print().setParallelism(1) // 测试是否连通 59 | 60 | /** 61 | * 数据清洗: 62 | * 在生产上进行业务处理的时候,一定要考虑处理的健壮性以及数据的准确性 63 | * 脏数据或者是不符合业务规则的数据是需要全部过滤掉之后 64 | * 再进行相应业务逻辑的处理 65 | */ 66 | val logData = data.map(x => { 67 | val strings = x.split("\t") 68 | 69 | val level = strings(2) 70 | val timeStr = strings(3) 71 | var time = 0l 72 | try { 73 | val sourceFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") 74 | time = sourceFormat.parse(timeStr).getTime 75 | } catch { 76 | case e:Exception => { 77 | logger.error(s"time parse error: $timeStr", e.getMessage) 78 | } 79 | } 80 | 81 | val domain = strings(5) 82 | val traffic = strings(6).toLong 83 | (level, time, domain, traffic) 84 | }).filter(_._2 != 0).filter(_._1 == "E") 85 | .map(x => { 86 | (x._2, x._3, x._4) //数据清洗按照业务规则取相关数据 1level(不需要可以抛弃) 2time 3 domain 4traffic 87 | }) 88 | // logData.print.setParallelism(1) 89 | 90 | /** 91 | * Flink watermarks 定义 92 | * 设置timestamp和watermark,解决时序性问题 93 | * Windows function 使用 94 | * AssignerWithPeriodicWatermarks[T] 对应logdata的tuple类型 95 | */ 96 | val resultData = logData.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(Long, String, Long)] { 97 | //最大无序容忍的时间 10s 98 | val maxOutOfOrderness = 10000L // 3.5 seconds 99 | //当前最大的TimeStamp 100 | var currentMaxTimestamp: Long = _ 101 | 102 | //设置TimeStamp生成WaterMark 103 | override def getCurrentWatermark: Watermark = { 104 | new Watermark(currentMaxTimestamp - maxOutOfOrderness) 105 | } 106 | 107 | //抽取时间 108 | override def extractTimestamp(element: (Long, String, Long), previousElementTimestamp: Long): Long = { 109 | //获取数据的event time 110 | val timestamp: Long = element._1 111 | currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp) 112 | timestamp 113 | } 114 | }) //根据window进行业务逻辑的处理 最近一分钟每个域名产生的流量 115 | .keyBy(1) //以域名进行分组,按照域名进行keyby 116 | .window(TumblingEventTimeWindows.of(Time.seconds(60))) //每60秒为一个窗口,进行统计 117 | .apply(new WindowFunction[(Long, String, Long), (String, String, Long), Tuple, TimeWindow] { 118 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(Long, String, Long)], out: Collector[(String, String, Long)]): Unit = { 119 | val domain = key.getField(0).toString //拿到key,域名 120 | 121 | var sum = 0l 122 | val times = ArrayBuffer[Long]() 123 | val iterator = input.iterator 124 | while (iterator.hasNext) { 125 | val next = iterator.next() 126 | sum += next._3 //统计流量 127 | times.append(next._1) //记录这一分钟,格式:yyyy-MM-dd HH:mm 128 | } 129 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(times.max)) // 这一分钟的时间,格式化 130 | 131 | /** 132 | * 输出结果: 133 | * 第一个参数:这一分钟的时间 134 | * 第二个参数:域名 135 | * 第三个参数:traffic流量的和 136 | */ 137 | out.collect((time, domain, sum)) 138 | } 139 | }) 140 | resultData.print().setParallelism(1) 141 | 142 | 143 | /** 144 | * 连接es库,导入数据 145 | * 使用kibana可视化 146 | */ 147 | val httpHosts = new java.util.ArrayList[HttpHost] 148 | httpHosts.add(new HttpHost("redhat", 9200, "http")) 149 | 150 | val 
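// NOTE: the assigner above emits watermark = maxEventTime - 10s, so a 60-second tumbling window
// such as [12:01:00, 12:02:00) fires once an event with timestamp >= 12:02:10 has been seen, and
// events running more than 10s behind the largest timestamp risk being dropped after their window
// has fired. Each per-domain window emits one (minute, domain, totalTraffic) tuple, e.g.
// ("2020-02-21 12:01", "example.com", 8945L) -- values here are purely illustrative.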
esSinkBuilder = new ElasticsearchSink.Builder[(String, String, Long)]( 151 | httpHosts, 152 | new ElasticsearchSinkFunction[(String, String, Long)] { 153 | override def process(t: (String, String, Long), runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = { 154 | requestIndexer.add(createIndexRequest(t)) 155 | } 156 | 157 | def createIndexRequest(element: (String, String, Long)): IndexRequest = { 158 | val json = new java.util.HashMap[String, Any] 159 | json.put("time", element._1) 160 | json.put("domain", element._2) 161 | json.put("traffics", element._3) 162 | val id = element._1 + "-" + element._2 163 | return Requests.indexRequest() 164 | .index("cdn") 165 | .`type`("traffic") 166 | .id(id) 167 | .source(json) 168 | } 169 | } 170 | ) 171 | 172 | //设置要为每个批量请求缓冲的最大操作数 173 | esSinkBuilder.setBulkFlushMaxActions(1) 174 | resultData.addSink(esSinkBuilder.build()) //.setParallelism(5) 175 | env.execute("LogAnalysis") 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/Logs.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp 2 | 3 | import com.awebone.dmp.util.Utils 4 | import org.apache.commons.lang3.StringUtils 5 | 6 | case class Logs(val sessionid: String, //会话标识 7 | val advertisersid: Int, //广告主id 8 | val adorderid: Int, //广告id 9 | val adcreativeid: Int, //广告创意id ( >= 200000 : dsp , < 200000 oss) 10 | val adplatformproviderid: Int, //广告平台商id (>= 100000: rtb , < 100000 : api ) 11 | val sdkversionnumber: String, //sdk版本号 12 | val adplatformkey: String, //平台商key 13 | val putinmodeltype: Int, //针对广告主的投放模式,1:展示量投放 2:点击量投放 14 | val requestmode: Int, //数据请求方式(1:请求、2:展示、3:点击) 15 | val adprice: Double, //广告价格 16 | val adppprice: Double, //平台商价格 17 | val requestdate: String, //请求时间,格式为:yyyy-MM-dd hh:mm:ss 18 | val ip: String, //设备用户的真实ip地址 19 | val appid: String, //应用id 20 | val appname: String, //应用名称 21 | val uuid: String, //设备唯一标识,比如imei或者androidid等 22 | val device: String, //设备型号,如htc、iphone 23 | val client: Int, //设备类型 (1:android 2:ios 3:wp) 24 | val osversion: String, //设备操作系统版本,如4.0 25 | val density: String, //备屏幕的密度 android的取值为0.75、1、1.5,ios的取值为:1、2 26 | val pw: Int, //设备屏幕宽度 27 | val ph: Int, //设备屏幕高度 28 | val longitude: String, //设备所在经度 29 | val lat: String, //设备所在纬度 30 | val provincename: String, //设备所在省份名称 31 | val cityname: String, //设备所在城市名称 32 | val ispid: Int, //运营商id 33 | val ispname: String, //运营商名称 34 | val networkmannerid: Int, //联网方式id 35 | val networkmannername: String, //联网方式名称 36 | val iseffective: Int, //有效标识(有效指可以正常计费的)(0:无效 1:有效) 37 | val isbilling: Int, //是否收费(0:未收费 1:已收费) 38 | val adspacetype: Int, //广告位类型(1:banner 2:插屏 3:全屏) 39 | val adspacetypename: String, //广告位类型名称(banner、插屏、全屏) 40 | val devicetype: Int, //设备类型(1:手机 2:平板) 41 | val processnode: Int, //流程节点(1:请求量kpi 2:有效请求 3:广告请求) 42 | val apptype: Int, //应用类型id 43 | val district: String, //设备所在县名称 44 | val paymode: Int, //针对平台商的支付模式,1:展示量投放(CPM) 2:点击量投放(CPC) 45 | val isbid: Int, //是否rtb 46 | val bidprice: Double, //rtb竞价价格 47 | val winprice: Double, //rtb竞价成功价格 48 | val iswin: Int, //是否竞价成功 49 | val cur: String, //values:usd|rmb等 50 | val rate: Double, //汇率 51 | val cnywinprice: Double, //rtb竞价成功转换成人民币的价格 52 | val imei: String, //imei 53 | val mac: String, //mac 54 | val idfa: String, //idfa 55 | val openudid: String, //openudid 56 | val androidid: String, //androidid 57 | val rtbprovince: String, //rtb 省 58 | val rtbcity: String, //rtb 市 59 | val 
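// NOTE: the fields above and below document the raw ad-log schema; each record arrives as one
// comma-separated line and is mapped onto this case class largely positionally by Logs.line2Logs
// further down, so the field order here has to match the column order of the input file.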
rtbdistrict: String, //rtb 区 60 | val rtbstreet: String, //rtb 街道 61 | val storeurl: String, //app的市场下载地址 62 | val realip: String, //真实ip 63 | val isqualityapp: Int, //优选标识 64 | val bidfloor: Double, //底价 65 | val aw: Int, //广告位的宽 66 | val ah: Int, //广告位的高 67 | val imeimd5: String, //imei_md5 68 | val macmd5: String, //mac_md5 69 | val idfamd5: String, //idfa_md5 70 | val openudidmd5: String, //openudid_md5 71 | val androididmd5: String, //androidid_md5 72 | val imeisha1: String, //imei_sha1 73 | val macsha1: String, //mac_sha1 74 | val idfasha1: String, //idfa_sha1 75 | val openudidsha1: String, //openudid_sha1 76 | val androididsha1: String, //androidid_sha1 77 | val uuidunknow: String, //uuid_unknow tanx密文 78 | val decuuidunknow: String, // 解密的tanx 明文 79 | val userid: String, //平台用户id 80 | val reqdate: String, //日期 81 | val reqhour: String, //小时 82 | val iptype: Int, //表示ip库类型,1为点媒ip库,2为广告协会的ip地理信息标准库,默认为1 83 | val initbidprice: Double, //初始出价 84 | val adpayment: Double, //转换后的广告消费(保留小数点后6位) 85 | val agentrate: Double, //代理商利润率 86 | val lomarkrate: Double, //代理利润率 87 | val adxrate: Double, //媒介利润率 88 | val title: String, //标题 89 | val keywords: String, //关键字 90 | val tagid: String, //广告位标识(当视频流量时值为视频ID号) 91 | val callbackdate: String, //回调时间 格式为:YYYY/mm/dd hh:mm:ss 92 | val channelid: String, //频道ID 93 | val mediatype: Int ) {//媒体类型:1长尾媒体 2视频媒体 3独立媒体 默认:1) 94 | 95 | } 96 | 97 | object Logs { 98 | 99 | // 生成一个空的对象 100 | def makeLogs(): Logs = { 101 | new Logs("", 0, 0, 0, 0, "", "", 0, 0, 0.0, 0.0, "", "", "", "", "", "", 0, "", 102 | "", 0, 0, "", "", "", "", 0, "", 0, "", 0, 0, 0, "", 0, 0, 0, "", 0, 0, 103 | 0.0, 0.0, 0, "", 0.0, 0.0, "", "", "", "", "", "", "", "", "", "", "", 0, 0.0, 0, 0, 104 | "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 0, 0.0, 0.0, 0.0, 0.0, 0.0, "", "", "", "", "", 0 105 | ) 106 | } 107 | 108 | def line2Logs(s:String):Logs ={ 109 | if(StringUtils.isNotEmpty(s)){ 110 | val fields = s.split(",") 111 | if(fields.length >= 79){ 112 | Logs(fields(0), Utils.parseInt(fields(1)), Utils.parseInt(fields(2)), Utils.parseInt(fields(3)), Utils.parseInt(fields(4)), fields(5), fields(6), Utils.parseInt(fields(7)), Utils.parseInt(fields(8)), Utils.parseDouble(fields(9)), Utils.parseDouble(fields(10)), 113 | fields(11), fields(12), fields(13), fields(14), fields(15), fields(16), Utils.parseInt(fields(17)), fields(18), fields(19), Utils.parseInt(fields(20)), 114 | Utils.parseInt(fields(21)), fields(22), fields(23), fields(24), fields(25), Utils.parseInt(fields(26)), fields(27), Utils.parseInt(fields(28)), fields(29), Utils.parseInt(fields(30)), 115 | Utils.parseInt(fields(31)), Utils.parseInt(fields(32)), fields(33), Utils.parseInt(fields(34)), Utils.parseInt(fields(35)), Utils.parseInt(fields(36)), fields(37), Utils.parseInt(fields(38)), Utils.parseInt(fields(39)), Utils.parseDouble(fields(40)), 116 | Utils.parseDouble(fields(41)), Utils.parseInt(fields(42)), fields(43), Utils.parseDouble(fields(44)), Utils.parseDouble(fields(45)), fields(46), fields(47), fields(48), fields(49), fields(50), 117 | fields(51), fields(52), fields(53), fields(54), fields(55), fields(56), Utils.parseInt(fields(57)), Utils.parseDouble(fields(58)), Utils.parseInt(fields(59)), Utils.parseInt(fields(60)), 118 | fields(61), fields(62), fields(63), fields(64), fields(65), fields(66), fields(67), fields(68), fields(69), fields(70), 119 | fields(71), "", fields(72), Utils.fmtDate(fields(11)).getOrElse("unkown"), Utils.fmtHour(fields(11)).getOrElse("unkown"), 120 | Utils.parseInt(fields(73)), 
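// NOTE: line2Logs maps the >= 79 comma-separated columns positionally onto the case class.
// Utils.parseInt / Utils.parseDouble come from Utils.scala, which is not shown here; they are
// presumably null-safe parsers along the lines of this sketch:
//   def parseInt(s: String): Int = scala.util.Try(s.trim.toInt).getOrElse(0)
//   def parseDouble(s: String): Double = scala.util.Try(s.trim.toDouble).getOrElse(0.0)
// Rows with fewer than 79 columns, or empty lines, fall back to the empty record from makeLogs().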
Utils.parseDouble(fields(74)), Utils.parseDouble(fields(75)), Utils.parseDouble(fields(76)), Utils.parseDouble(fields(77)), Utils.parseDouble(fields(78)), "", "", "", "", "", 1) 121 | }else{ 122 | makeLogs() 123 | } 124 | }else{ 125 | makeLogs() 126 | } 127 | } 128 | } -------------------------------------------------------------------------------- /weblog/src/main/java/com/awebone/click/ClickSessionStream.java: -------------------------------------------------------------------------------- 1 | package com.awebone.click; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.InvocationTargetException; 5 | import java.text.ParseException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.Comparator; 10 | import java.util.Date; 11 | import java.util.Iterator; 12 | import java.util.UUID; 13 | 14 | import org.apache.commons.beanutils.BeanUtils; 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.LongWritable; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 25 | 26 | import com.awebone.bean.WebLogBean; 27 | 28 | /** 29 | * 抽取,转化 点击会话流的数据 30 | * map端: 31 | * key: ip 32 | * value: 自定义类 字符串 33 | * reduce: 34 | * 相同ip的数据 35 | * 排序 按照访问时间 升序 排序 36 | * 计算相邻两个的时间差 37 | * 判断 38 | * 39 | */ 40 | public class ClickSessionStream { 41 | static class ClickSessionStreamMapper extends Mapper{ 42 | Text mk = new Text(); 43 | WebLogBean bean = new WebLogBean(); 44 | 45 | @Override 46 | protected void map(LongWritable key, Text value, Mapper.Context context) 47 | throws IOException, InterruptedException { 48 | String line = value.toString(); 49 | String[] pre_datas = line.split("\001"); 50 | if(pre_datas.length==9){ 51 | bean.setValid(pre_datas[0].equals("true")?true:false); 52 | bean.setRemote_addr(pre_datas[1]); 53 | bean.setRemote_user(pre_datas[2]); 54 | bean.setTime_local(pre_datas[3]); 55 | bean.setRequest(pre_datas[4]); 56 | bean.setStatus(pre_datas[5]); 57 | bean.setBody_bytes_sent(pre_datas[6]); 58 | bean.setHttp_referer(pre_datas[7]); 59 | bean.setHttp_user_agent(pre_datas[8]); 60 | 61 | //过滤数据 62 | if(bean.isValid()){ 63 | mk.set(bean.getRemote_addr()); 64 | context.write(mk, bean); 65 | } 66 | } 67 | } 68 | 69 | } 70 | 71 | static class ClickSessionStreamReducer extends Reducer{ 72 | Text rk = new Text(); 73 | 74 | @Override 75 | protected void reduce(Text key, Iterable values, 76 | Reducer.Context context) throws IOException, InterruptedException { 77 | //相同ip的所有数据,循环遍历放在list中,按时间升序排序 78 | ArrayList list = new ArrayList(); 79 | //reducer的坑:k和v都各自只有一个地址,因此要新建对象,再存在list中 80 | for (WebLogBean v:values){ 81 | //新建对象 82 | WebLogBean bean = new WebLogBean(); 83 | //将迭代器对象中的属性复制到新对象上 84 | try { 85 | BeanUtils.copyProperties(bean, v); 86 | list.add(bean); 87 | } catch (IllegalAccessException e) { 88 | // TODO Auto-generated catch block 89 | e.printStackTrace(); 90 | } catch (InvocationTargetException e) { 91 | // TODO Auto-generated catch block 92 | e.printStackTrace(); 93 | } 94 | } 95 | 96 | //按时间排序 97 | Collections.sort(list, new Comparator() { 98 | public int compare(WebLogBean o1, WebLogBean o2) { 99 | Date date1 = null; 100 | Date date2 
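// NOTE: the loop above copies every value into a fresh WebLogBean via BeanUtils.copyProperties
// because Hadoop reuses a single value instance across the reduce iterator; keeping the iterated
// object itself would leave the list holding many references to the same (last) record. The
// copies are then sorted by time_local with the comparator below.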
= null;
101 | try {
102 | date1 = toDate(o1.getTime_local());
103 | date2 = toDate(o2.getTime_local());
104 | } catch (ParseException e) {
105 | // TODO Auto-generated catch block
106 | e.printStackTrace();
107 | }
108 | if(date1==null || date2==null){
109 | return 0;
110 | }
111 | return date1.compareTo(date2);
112 | }
113 | });
114 | 
115 | //Walk the sorted list, computing dwell time, session id and step (step starts at 1)
116 | int step = 1;
117 | UUID sessionid = UUID.randomUUID();
118 | for (int i = 0; i < list.size(); i++) {
119 | WebLogBean bean = list.get(i);
120 | //Only one hit for this IP: emit it directly with the default dwell time of 60s
121 | if(list.size()==1){
122 | rk.set(sessionid+"\001"+bean.getRemote_addr()+"\001"+bean.getRemote_user()+"\001"+
123 | bean.getTime_local()+"\001"+bean.getRequest()+"\001"+(60)+"\001"+step+"\001"+
124 | bean.getStatus()+"\001"+bean.getBody_bytes_sent()+"\001"+bean.getHttp_referer()+"\001"+
125 | bean.getHttp_user_agent());
126 | context.write(rk, NullWritable.get());
127 | sessionid = UUID.randomUUID();
128 | break;
129 | }
130 | 
131 | //More than one hit: compute the gap between the current hit and the previous one
132 | if (i==0){
133 | continue;
134 | }
135 | try {
136 | long diffDate = diffDate(bean.getTime_local(), list.get(i-1).getTime_local());
137 | //Gap shorter than 30 minutes: still the same session
138 | if(diffDate < 30*60*1000){
139 | WebLogBean lb = list.get(i-1);
140 | //Emit the previous hit; its dwell time is the gap to the current hit, in seconds
141 | rk.set(sessionid+"\001"+lb.getRemote_addr()+"\001"+lb.getRemote_user()+"\001"+
142 | lb.getTime_local()+"\001"+lb.getRequest()+"\001"+(diffDate)/1000+"\001"+step+"\001"+
143 | lb.getStatus()+"\001"+lb.getBody_bytes_sent()+"\001"+lb.getHttp_referer()+"\001"+
144 | lb.getHttp_user_agent());
145 | context.write(rk, NullWritable.get());
146 | step++;
147 | }else{
148 | //Gap of 30 minutes or more: a new session starts, so emit the previous hit as the last hit of the old session
149 | WebLogBean lsl = list.get(i-1);
150 | rk.set(sessionid+"\001"+lsl.getRemote_addr()+"\001"+lsl.getRemote_user()+"\001"+
151 | lsl.getTime_local()+"\001"+lsl.getRequest()+"\001"+(60)+"\001"+step+"\001"+
152 | lsl.getStatus()+"\001"+lsl.getBody_bytes_sent()+"\001"+lsl.getHttp_referer()+"\001"+
153 | lsl.getHttp_user_agent());
154 | context.write(rk, NullWritable.get());
155 | 
156 | //Reset step and session id for the new session
157 | step = 1;
158 | sessionid = UUID.randomUUID();
159 | }
160 | 
161 | //Emit the last hit of the list: it has no successor, so it gets the default dwell time of 60s
162 | if(i == list.size()-1){
163 | WebLogBean cb = list.get(i);
164 | rk.set(sessionid+"\001"+cb.getRemote_addr()+"\001"+cb.getRemote_user()+"\001"+
165 | cb.getTime_local()+"\001"+cb.getRequest()+"\001"+(60)+"\001"+step+"\001"+
166 | cb.getStatus()+"\001"+cb.getBody_bytes_sent()+"\001"+cb.getHttp_referer()+"\001"+
167 | cb.getHttp_user_agent());
168 | context.write(rk, NullWritable.get());
169 | sessionid = UUID.randomUUID();
170 | }
171 | } catch (ParseException e) {
172 | // TODO Auto-generated catch block
173 | e.printStackTrace();
174 | }
175 | }
176 | }
177 | 
178 | public static Date toDate(String time) throws ParseException {
179 | SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
180 | Date date = sdf.parse(time);
181 | return date;
182 | }
183 | 
184 | public static long diffDate(String date1,String date2) throws ParseException {
185 | Date d1 = toDate(date1);
186 | Date d2 = toDate(date2);
187 | return d1.getTime() - d2.getTime();
188 | }
189 | }
190 | 
191 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
192 | System.setProperty("HADOOP_USER_NAME", "hadoop");
193 | Configuration conf = new Configuration();
194 | conf.set("fs.defaultFS", "hdfs://myha/");
195 | Job job = Job.getInstance(conf);
196 | 
197 | job.setJarByClass(ClickSessionStream.class);
198 | 
199 | 
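// NOTE: the reducer above sessionizes clicks per IP with a 30-minute inactivity rule: while
// consecutive hits are less than 30 minutes apart they share one session id, the dwell time of a
// hit is the gap to the next hit in seconds, and the last hit of a session gets a default of 60
// seconds; a gap of 30 minutes or more starts a new UUID session with step reset to 1.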
job.setMapperClass(ClickSessionStreamMapper.class); 200 | job.setReducerClass(ClickSessionStreamReducer.class); 201 | 202 | job.setMapOutputKeyClass(Text.class); 203 | job.setMapOutputValueClass(WebLogBean.class); 204 | job.setOutputKeyClass(Text.class); 205 | job.setOutputValueClass(NullWritable.class); 206 | 207 | FileInputFormat.setInputPaths(job, new Path("/weblog/pre/20200221")); 208 | FileOutputFormat.setOutputPath(job, new Path("/weblog/click/stream/20200221")); 209 | 210 | boolean res = job.waitForCompletion(true); 211 | System.exit(res ? 0 : 1); 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/com/awebone/flink/project/LogAnalysisWithMySQL.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.flink.project 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.{Date, Properties} 5 | 6 | import org.apache.flink.api.common.functions.RuntimeContext 7 | import org.apache.flink.api.common.serialization.SimpleStringSchema 8 | import org.apache.flink.api.java.tuple.Tuple 9 | import org.apache.flink.streaming.api.TimeCharacteristic 10 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks 11 | import org.apache.flink.streaming.api.functions.co.CoFlatMapFunction 12 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 13 | import org.apache.flink.streaming.api.scala.function.WindowFunction 14 | import org.apache.flink.streaming.api.watermark.Watermark 15 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 16 | import org.apache.flink.streaming.api.windowing.time.Time 17 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 18 | import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer} 19 | import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink 20 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 21 | import org.apache.flink.util.Collector 22 | import org.apache.http.HttpHost 23 | import org.elasticsearch.action.index.IndexRequest 24 | import org.elasticsearch.client.Requests 25 | import org.slf4j.LoggerFactory 26 | 27 | import scala.collection.mutable 28 | import scala.collection.mutable.ArrayBuffer 29 | 30 | object LogAnalysisWithMySQL { 31 | def main(args: Array[String]): Unit = { 32 | //在生产上进行日志的输出,采用以下方式 33 | val logger = LoggerFactory.getLogger("LogAnalysis") 34 | 35 | val env = StreamExecutionEnvironment.getExecutionEnvironment 36 | //设置事件时间作为flink处理的基准时间 37 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 38 | import org.apache.flink.api.scala._ 39 | 40 | /** 41 | * 读取kafka集群数据 42 | */ 43 | val topic = "cdnlog" 44 | val properties: Properties = new Properties() 45 | properties.setProperty("bootstrap.servers","hadoop01:9092,hadoop02:9092,hadoop03:9092,hadoop04:9092") 46 | properties.setProperty("zookeeper.connect", "hadoop02:2181,hadoop03:2181,hadoop01:2181/kafka") //声明zk 47 | properties.setProperty("group.id","test-cdnlog-mysql") 48 | 49 | val consumer = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties) 50 | val data = env.addSource(consumer) // 接受kafka数据 51 | // data.print().setParallelism(1) // 测试是否连通 52 | 53 | /** 54 | * 数据清洗: 55 | * 在生产上进行业务处理的时候,一定要考虑处理的健壮性以及数据的准确性 56 | * 脏数据或者是不符合业务规则的数据是需要全部过滤掉之后 57 | * 再进行相应业务逻辑的处理 58 | */ 59 | val logData = data.map(x => { 60 | val strings = x.split("\t") 61 | 62 | val level 
= strings(2)
63 | val timeStr = strings(3)
64 | var time = 0L
65 | try {
66 | val sourceFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
67 | time = sourceFormat.parse(timeStr).getTime
68 | } catch {
69 | case e:Exception => {
70 | logger.error(s"time parse error: $timeStr", e)
71 | }
72 | }
73 | 
74 | val domain = strings(5)
75 | val traffic = strings(6).toLong
76 | (level, time, domain, traffic)
77 | }).filter(_._2 != 0).filter(_._1 == "E")
78 | .map(x => {
79 | (x._2, x._3, x._4) //keep only the fields required downstream: time, domain, traffic (level was only needed for filtering)
80 | })
81 | 
82 | /**
83 | * Connect to the MySQL dimension stream and attach the userId that owns each domain
84 | */
85 | val mysqlData = env.addSource(new MySQLSource)
86 | // mysqlData.print()
87 | val connectData = logData.connect(mysqlData)
88 | .flatMap(new CoFlatMapFunction[(Long, String, Long), mutable.HashMap[String, String], (Long, String, Long, String)] {
89 | var userDomainMap: mutable.HashMap[String, String] = mutable.HashMap[String, String]()
90 | 
91 | //log stream: enrich each record with the userId of its domain
92 | override def flatMap1(in1: (Long, String, Long), collector: Collector[(Long, String, Long, String)]): Unit = {
93 | val domain = in1._2
94 | val userId = userDomainMap.getOrElse(domain, "")
95 | // collector.collect(in1._1 + "\t" + in1._2 + "\t" + in1._3 + "\t" + userId)
96 | collector.collect((in1._1, domain, in1._3, userId))
97 | }
98 | 
99 | override def flatMap2(in2: mutable.HashMap[String, String], collector: Collector[(Long, String, Long, String)]): Unit = { //MySQL stream: refresh the domain -> userId mapping
100 | userDomainMap = in2
101 | }
102 | })
103 | 
104 | // connectData.print()
105 | 
106 | /**
107 | * Assign timestamps and watermarks so that out-of-order events are handled correctly
108 | * AssignerWithPeriodicWatermarks[T] matches the tuple type of the enriched log stream
109 | */
110 | val resultData = connectData.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(Long, String, Long, String)] {
111 | //maximum tolerated out-of-orderness: 10 seconds
112 | val maxOutOfOrderness = 10000L // 10 seconds
113 | //largest event timestamp seen so far
114 | var currentMaxTimestamp: Long = _
115 | 
116 | //derive the watermark from the largest timestamp
117 | override def getCurrentWatermark: Watermark = {
118 | new Watermark(currentMaxTimestamp - maxOutOfOrderness)
119 | }
120 | 
121 | //extract the event time
122 | override def extractTimestamp(element: (Long, String, Long, String), previousElementTimestamp: Long): Long = {
123 | //the event time carried by the record
124 | val timestamp: Long = element._1
125 | currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp)
126 | timestamp
127 | }
128 | }) //window-based business logic: traffic produced by each user during the last minute
129 | .keyBy(3) //group by userId
130 | .window(TumblingEventTimeWindows.of(Time.seconds(60))) //tumbling 60-second windows
131 | .apply(new WindowFunction[(Long, String, Long, String), (String, String, Long, String), Tuple, TimeWindow] {
132 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(Long, String, Long, String)], out: Collector[(String, String, Long, String)]): Unit = {
133 | val userid = key.getField(0).toString //the key, i.e. the userId
134 | 
135 | var sum = 0L; var domain = "" //track the domain of the records in this window (last one seen; a user's window may span several domains)
136 | val times = ArrayBuffer[Long]()
137 | val iterator = input.iterator
138 | while (iterator.hasNext) {
139 | val next = iterator.next()
140 | sum += next._3; domain = next._2 //accumulate traffic and remember the domain
141 | times.append(next._1) //collect event times to derive the minute this window covers
142 | }
143 | val time = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(new Date(times.max)) //format the minute of this window
144 | 
145 | /**
146 | * Emitted result:
147 | * 1st field: the minute of this window
148 | * 2nd field: domain
149 | * 3rd field: total traffic; 4th field: userId
150 | */
151 | out.collect((time, domain, sum, userid))
152 | }
153 | })
154 | resultData.print().setParallelism(1)
155 | 
156 | 
157 | /**
158 | * Sink the aggregated results into Elasticsearch
159 | * and visualize them with Kibana
160 | */
161 | val httpHosts = new java.util.ArrayList[HttpHost]
162 | 
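// NOTE: the connect(...) + CoFlatMapFunction above enriches each log tuple with the userId that
// MySQLSource periodically emits as a domain -> userId map; e.g. if the map holds
// ("example.com" -> "user_01"), the record (1582264860000L, "example.com", 2000L) becomes
// (1582264860000L, "example.com", 2000L, "user_01"), while unknown domains get an empty userId
// (values here are illustrative). The map lives in per-subtask state, so lookups are only as
// fresh as the latest flatMap2 update.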
httpHosts.add(new HttpHost("redhat", 9200, "http")) 163 | 164 | val esSinkBuilder = new ElasticsearchSink.Builder[(String, String, Long, String)]( 165 | httpHosts, 166 | new ElasticsearchSinkFunction[(String, String, Long, String)] { 167 | override def process(t: (String, String, Long, String), runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = { 168 | requestIndexer.add(createIndexRequest(t)) 169 | } 170 | 171 | def createIndexRequest(element: (String, String, Long, String)): IndexRequest = { 172 | val json = new java.util.HashMap[String, Any] 173 | json.put("time", element._1) 174 | json.put("domain", element._2) 175 | json.put("traffics", element._3) 176 | json.put("userid", element._4) 177 | val id = element._1 + "-" + element._2 178 | return Requests.indexRequest() 179 | .index("cdn") 180 | .`type`("traffic-userid") 181 | .id(id) 182 | .source(json) 183 | } 184 | } 185 | ) 186 | 187 | //设置要为每个批量请求缓冲的最大操作数 188 | esSinkBuilder.setBulkFlushMaxActions(1) 189 | resultData.addSink(esSinkBuilder.build()) //.setParallelism(5) 190 | 191 | env.execute("LogAnalysisWithMySQL") 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /dmp/src/main/scala/com/awebone/dmp/personas/DmpPersonasJob.scala: -------------------------------------------------------------------------------- 1 | package com.awebone.dmp.personas 2 | 3 | import java.io.FileInputStream 4 | import java.util.Properties 5 | 6 | import com.awebone.dmp.Logs 7 | import com.awebone.dmp.constants.AdTagConstants 8 | import com.awebone.dmp.tags._ 9 | import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} 10 | import org.apache.hadoop.hbase.client.{ConnectionFactory, Put} 11 | import org.apache.log4j.{Level, Logger} 12 | import org.apache.spark.SparkConf 13 | import org.apache.spark.rdd.RDD 14 | import org.apache.spark.sql.{Dataset, SparkSession} 15 | 16 | import scala.collection.{JavaConversions, mutable} 17 | 18 | /** 19 | * dmp用户画像便签统计 20 | */ 21 | object DmpPersonasJob { 22 | def main(args: Array[String]): Unit = { 23 | Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN) 24 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 25 | Logger.getLogger("org.spark-project").setLevel(Level.WARN) 26 | 27 | if (args == null || args.length < 1) { 28 | println( 29 | """Parameter Errors! 
Usage: 30 | |inputpath : input path 31 | """.stripMargin) 32 | System.exit(-1) 33 | } 34 | val Array(inputpath) = args 35 | 36 | val conf: SparkConf = new SparkConf().setAppName("DmpPersonasJob").setMaster("local[*]") 37 | val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate() 38 | import spark.implicits._ 39 | 40 | val input: Dataset[Logs] = spark.read.parquet(inputpath).as[Logs] 41 | val logs: RDD[Logs] = input.rdd 42 | 43 | //提取用户的标签 | | 44 | val userid2Tags: RDD[(String, Map[String, Int])] = logs.map { case logs: Logs => { 45 | var userid: String = logs.userid 46 | if (userid == null) { 47 | userid = getNotEmptyID(logs).getOrElse("UnKnown") 48 | } 49 | 50 | val adspaceTags: Map[String, Int] = AdPositionTag.extractTag(logs) 51 | val appTags: Map[String, Int] = AppTag.extractTag(logs) 52 | val channelTags: Map[String, Int] = ChannelTag.extractTag(logs) 53 | val deviceTags: Map[String, Int] = DeviceTag.extractTag(logs) 54 | val kwTags: Map[String, Int] = KeyWordTag.extractTag(logs) 55 | val areaTags: Map[String, Int] = AreaTag.extractTag(logs) 56 | 57 | (userid, adspaceTags.++(appTags).++(channelTags).++(deviceTags).++(kwTags).++(areaTags)) 58 | } 59 | } 60 | 61 | //map1 map2 --> 62 | val userid2AggrTags: RDD[(String, Map[String, Int])] = userid2Tags.reduceByKey { case (map1, map2) => { 63 | val map = mutable.Map[String, Int]() 64 | map.++=(map1) 65 | 66 | for ((k, v) <- map2) { 67 | map.put(k, map.getOrElse(k, 0) + v) 68 | } 69 | map.toMap 70 | } 71 | } 72 | // userid2AggrTags.foreach(println) 73 | // (2,Map(NET_3 -> 2, ZC_益阳市 -> 2, DEVICE_1 -> 2, APP_其他 -> 2, ZP_湘南省 -> 2, LC_02 -> 2, ISP_4 -> 2, CN_ -> 2)) 74 | // (1,Map(ZP_上海市 -> 2, NET_3 -> 2, DEVICE_1 -> 2, APP_马上赚 -> 2, LC_02 -> 2, ISP_4 -> 2, CN_ -> 2, ZC_上海市 -> 2)) 75 | 76 | //转换属性 77 | val props = loadProerties() 78 | val propsBC = spark.sparkContext.broadcast(props) 79 | 80 | val aggrTags = userid2AggrTags.map{case (userid, tagMap) => { 81 | val map = mutable.Map[String, Int]() 82 | val propsMap = propsBC.value 83 | 84 | for((k,v) <- tagMap){ 85 | var key = k 86 | 87 | if(k.contains(AdTagConstants.PREFIX_AD_DEVICE_TAG)){ 88 | val dMap = propsMap(AdTagConstants.PREFIX_AD_DEVICE_TAG) 89 | val id = k.split("_")(1) 90 | val dName = dMap.get(id).get.split("\\s+")(1) 91 | //k --> prefix_id 92 | key = AdTagConstants.PREFIX_AD_DEVICE_TAG + dName 93 | }else if(k.contains(AdTagConstants.PREFIX_AD_ISP_TAG)) { 94 | val ispMap = propsMap(AdTagConstants.PREFIX_AD_ISP_TAG) 95 | val id = k.split("_")(1) 96 | val ispName = ispMap.get(id).get.split("\\s+")(1) 97 | key = AdTagConstants.PREFIX_AD_ISP_TAG + ispName 98 | } else if(k.contains(AdTagConstants.PREFIX_AD_NETWORK_TAG)) { 99 | val nwMap = propsMap(AdTagConstants.PREFIX_AD_NETWORK_TAG) 100 | val id = k.split("_")(1) 101 | val nwName = nwMap.get(id).get.split("\\s+")(1) 102 | key = AdTagConstants.PREFIX_AD_NETWORK_TAG + nwName 103 | } 104 | map.put(key, v) 105 | } 106 | 107 | (userid, map) 108 | }} 109 | 110 | /** 111 | * 将标签聚合结果存储到hbase中 112 | * 因为,经过我们分析,计算得出的标签可能半结构化的数据,同时如果在dmp和dsp中进行交互的时候,流量比较大的情况下 113 | * 我们使用mysql没有办法保证时效性,所以我们这里使用hbase进行存储 114 | * create_space bigdata 115 | * create 'bigdata:dmp_tag', 'cf' 116 | * HBase api 117 | */ 118 | aggrTags.foreachPartition(partition => { 119 | if(partition != null){ 120 | val connection = ConnectionFactory.createConnection(HBaseConfiguration.create()) 121 | val table = connection.getTable(TableName.valueOf("bigdata:dmp_tag")) 122 | 123 | partition.foreach{case (userid, tagMap) => { 124 | val put = new 
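// NOTE: userid2AggrTags above merges all tag maps of one user by summing counts per key, e.g.
// merging Map("NET_3" -> 1, "APP_xx" -> 1) with Map("NET_3" -> 2) yields
// Map("NET_3" -> 3, "APP_xx" -> 1). A compact equivalent of that reduce function, for clarity:
//   def mergeTags(a: Map[String, Int], b: Map[String, Int]): Map[String, Int] =
//     b.foldLeft(a) { case (acc, (k, v)) => acc.updated(k, acc.getOrElse(k, 0) + v) }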
Put(userid.getBytes())
125 | 
126 | //tagMap holds (tag column, count) pairs, e.g. [DEVICE_xxxx, 5]
127 | for((col,value) <- tagMap){
128 | put.addColumn("cf".getBytes(), col.getBytes(), value.toString.getBytes())
129 | }
130 | table.put(put)
131 | }}
132 | 
133 | table.close()
134 | connection.close()
135 | }
136 | })
137 | 
138 | spark.stop()
139 | }
140 | 
141 | /**
142 | * Load the mapping dictionaries
143 | * keyed by tag type:
144 | * device (id -> name)
145 | * isp
146 | * network
147 | */
148 | def loadProerties():mutable.Map[String, mutable.Map[String, String]] = {
149 | val props = mutable.Map[String, mutable.Map[String, String]]()
150 | val properties = new Properties()
151 | 
152 | //load the device mapping
153 | properties.load(new FileInputStream("data/device-mapping.dic"))
154 | val deviceMap = mutable.Map[String, String]()
155 | 
156 | for (dk <- JavaConversions.asScalaSet(properties.keySet())){
157 | deviceMap.put(dk.toString,properties.getProperty(dk.toString))
158 | }
159 | props.put(AdTagConstants.PREFIX_AD_DEVICE_TAG, deviceMap)
160 | 
161 | //load the isp mapping
162 | properties.clear()
163 | properties.load(new FileInputStream("data/isp-mapping.dic"))
164 | val ispMap = mutable.Map[String, String]()
165 | for(dk <- JavaConversions.asScalaSet(properties.keySet())) {
166 | ispMap.put(dk.toString, properties.getProperty(dk.toString))
167 | }
168 | props.put(AdTagConstants.PREFIX_AD_ISP_TAG, ispMap)
169 | 
170 | //load the network mapping
171 | properties.clear()
172 | properties.load(new FileInputStream("data/network-mapping.dic"))
173 | val nwMap = mutable.Map[String, String]()
174 | for(dk <- JavaConversions.asScalaSet(properties.keySet())) {
175 | nwMap.put(dk.toString, properties.getProperty(dk.toString))
176 | }
177 | props.put(AdTagConstants.PREFIX_AD_NETWORK_TAG, nwMap)
178 | 
179 | props
180 | }
181 | 
182 | 
183 | // Pick the first non-empty device identifier to use as the user id
184 | def getNotEmptyID(log: Logs): Option[String] = {
185 | log match {
186 | case v if v.imei.nonEmpty => Some("IMEI:" + v.imei.replaceAll(":|-", "").toUpperCase)
187 | case v if v.imeimd5.nonEmpty => Some("IMEIMD5:" + v.imeimd5.toUpperCase)
188 | case v if v.imeisha1.nonEmpty => Some("IMEISHA1:" + v.imeisha1.toUpperCase)
189 | 
190 | case v if v.androidid.nonEmpty => Some("ANDROIDID:" + v.androidid.toUpperCase)
191 | case v if v.androididmd5.nonEmpty => Some("ANDROIDIDMD5:" + v.androididmd5.toUpperCase)
192 | case v if v.androididsha1.nonEmpty => Some("ANDROIDIDSHA1:" + v.androididsha1.toUpperCase)
193 | 
194 | case v if v.mac.nonEmpty => Some("MAC:" + v.mac.replaceAll(":|-", "").toUpperCase)
195 | case v if v.macmd5.nonEmpty => Some("MACMD5:" + v.macmd5.toUpperCase)
196 | case v if v.macsha1.nonEmpty => Some("MACSHA1:" + v.macsha1.toUpperCase)
197 | 
198 | case v if v.idfa.nonEmpty => Some("IDFA:" + v.idfa.replaceAll(":|-", "").toUpperCase)
199 | case v if v.idfamd5.nonEmpty => Some("IDFAMD5:" + v.idfamd5.toUpperCase)
200 | case v if v.idfasha1.nonEmpty => Some("IDFASHA1:" + v.idfasha1.toUpperCase)
201 | 
202 | case v if v.openudid.nonEmpty => Some("OPENUDID:" + v.openudid.toUpperCase)
203 | case v if v.openudidmd5.nonEmpty => Some("OPENUDIDMD5:" + v.openudidmd5.toUpperCase)
204 | case v if v.openudidsha1.nonEmpty => Some("OPENUDIDSHA1:" + v.openudidsha1.toUpperCase)
205 | 
206 | case _ => None
207 | }
208 | }
209 | }
210 | -------------------------------------------------------------------------------- /mllib/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.awebone.spark 8 | mllib 9 | 1.0-SNAPSHOT 10 | 11 | mllib 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF-8 17 | 1.8 18 | 1.8 19 | 
UTF-8 20 | 2.11.8 21 | 2.3.1 22 | 2.7.6 23 | 2.11 24 | 25 | 26 | 27 | 28 | org.scala-lang 29 | scala-library 30 | ${scala.version} 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-core_2.11 36 | ${spark.version} 37 | 38 | 39 | 40 | org.apache.spark 41 | spark-sql_2.11 42 | ${spark.version} 43 | 44 | 45 | 46 | org.apache.spark 47 | spark-streaming_2.11 48 | ${spark.version} 49 | 50 | 51 | 52 | org.apache.spark 53 | spark-graphx_2.11 54 | ${spark.version} 55 | 56 | 57 | 58 | org.apache.spark 59 | spark-mllib_2.11 60 | ${spark.version} 61 | 62 | 63 | 64 | org.apache.hadoop 65 | hadoop-client 66 | ${hadoop.version} 67 | 68 | 69 | 70 | org.apache.spark 71 | spark-streaming-kafka-0-10_2.11 72 | 2.3.1 73 | 74 | 75 | 76 | org.apache.spark 77 | spark-streaming-flume_2.11 78 | ${spark.version} 79 | 80 | 81 | 82 | mysql 83 | mysql-connector-java 84 | 5.1.46 85 | 86 | 87 | 88 | org.apache.spark 89 | spark-hive_2.11 90 | ${spark.version} 91 | 92 | 93 | 94 | 95 | org.apache.kafka 96 | kafka_2.11 97 | 1.1.0 98 | 99 | 100 | 101 | junit 102 | junit 103 | 4.11 104 | test 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | net.alchim31.maven 113 | scala-maven-plugin 114 | 3.2.2 115 | 116 | 117 | org.apache.maven.plugins 118 | maven-compiler-plugin 119 | 3.5.1 120 | 121 | 122 | 123 | 124 | 125 | net.alchim31.maven 126 | scala-maven-plugin 127 | 128 | 129 | scala-compile-first 130 | process-resources 131 | 132 | add-source 133 | compile 134 | 135 | 136 | 137 | scala-test-compile 138 | process-test-resources 139 | 140 | testCompile 141 | 142 | 143 | 144 | 145 | 146 | 147 | org.apache.maven.plugins 148 | maven-compiler-plugin 149 | 150 | 151 | compile 152 | 153 | compile 154 | 155 | 156 | 157 | 158 | 159 | 160 | org.apache.maven.plugins 161 | maven-shade-plugin 162 | 2.4.3 163 | 164 | 165 | package 166 | 167 | shade 168 | 169 | 170 | 171 | 172 | *:* 173 | 174 | META-INF/*.SF 175 | META-INF/*.DSA 176 | META-INF/*.RSA 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | -------------------------------------------------------------------------------- /dmp/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.awebone 8 | dmp 9 | 1.0-SNAPSHOT 10 | 11 | dmp 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF-8 17 | 2.11.8 18 | 2.3.1 19 | 20 | 21 | 22 | 23 | scala-tools.org 24 | Scala-Tools Maven2 Repository 25 | http://scala-tools.org/repo-releases 26 | 27 | 28 | 29 | 30 | 31 | scala-tools.org 32 | Scala-Tools Maven2 Repository 33 | http://scala-tools.org/repo-releases 34 | 35 | 36 | 37 | 38 | 39 | org.scala-lang 40 | scala-library 41 | ${scala.version} 42 | 43 | 44 | junit 45 | junit 46 | 4.11 47 | 54 | test 55 | 56 | 57 | org.apache.spark 58 | spark-core_2.11 59 | ${spark.version} 60 | 61 | 62 | org.apache.spark 63 | spark-sql_2.11 64 | ${spark.version} 65 | 66 | 67 | org.apache.spark 68 | spark-hive_2.11 69 | ${spark.version} 70 | 71 | 72 | mysql 73 | mysql-connector-java 74 | 5.1.40 75 | 76 | 77 | 78 | org.apache.hbase 79 | hbase-client 80 | 1.2.6 81 | 82 | 83 | org.apache.hbase 84 | hbase-server 85 | 1.2.6 86 | 87 | 88 | 89 | 90 | 91 | 92 | org.scala-tools 93 | maven-scala-plugin 94 | 2.15.0 95 | 96 | 97 | 98 | compile 99 | testCompile 100 | 101 | 102 | 103 | 104 | 
${scala.version} 105 | 106 | -target:jvm-1.5 107 | 108 | 109 | 110 | 111 | org.apache.maven.plugins 112 | maven-eclipse-plugin 113 | 2.10 114 | 115 | true 116 | 117 | ch.epfl.lamp.sdt.core.scalabuilder 118 | 119 | 120 | ch.epfl.lamp.sdt.core.scalanature 121 | 122 | 123 | org.eclipse.jdt.launching.JRE_CONTAINER 124 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 125 | 126 | 127 | 128 | 129 | maven-assembly-plugin 130 | 131 | 132 | jar-with-dependencies 133 | 134 | 135 | 138 | 139 | 140 | 141 | 142 | make-assembly 143 | package 144 | 145 | single 146 | 147 | 148 | 149 | 150 | 151 | org.apache.maven.plugins 152 | maven-compiler-plugin 153 | 154 | 1.8 155 | 1.8 156 | 157 | 158 | 159 | org.codehaus.mojo 160 | build-helper-maven-plugin 161 | 1.10 162 | 163 | 164 | add-source 165 | generate-sources 166 | 167 | add-source 168 | 169 | 170 | 171 | 172 | src/main/java 173 | src/main/scala 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /flink-train/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | com.awebone 8 | flink 9 | 1.0-SNAPSHOT 10 | 11 | flink 12 | 13 | http://www.example.com 14 | 15 | 16 | UTF-8 17 | 1.8 18 | 1.8 19 | 1.7.2 20 | 2.11 21 | 2.11.8 22 | 2.7.6 23 | 1.4.3 24 | 1.2.7 25 | 26 | 27 | 28 | 29 | 30 | org.scala-lang 31 | scala-library 32 | ${scala.version} 33 | 34 | 35 | 36 | 37 | org.apache.flink 38 | flink-scala_${scala.binary.version} 39 | ${flink.version} 40 | 41 | 42 | org.apache.flink 43 | flink-streaming-scala_${scala.binary.version} 44 | ${flink.version} 45 | 46 | 47 | 48 | 49 | org.apache.flink 50 | flink-java 51 | ${flink.version} 52 | compile 53 | 54 | 55 | org.apache.flink 56 | flink-streaming-java_2.11 57 | ${flink.version} 58 | compile 59 | 60 | 61 | 62 | 63 | org.apache.flink 64 | flink-table_2.11 65 | ${flink.version} 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | org.apache.flink 81 | flink-connector-filesystem_2.11 82 | ${flink.version} 83 | 84 | 85 | org.apache.flink 86 | flink-connector-kafka_2.11 87 | ${flink.version} 88 | 89 | 90 | org.apache.flink 91 | flink-avro 92 | ${flink.version} 93 | 94 | 95 | 96 | 97 | org.apache.bahir 98 | flink-connector-redis_2.11 99 | 1.0 100 | 101 | 102 | 103 | org.apache.flink 104 | flink-connector-kafka-0.10_${scala.binary.version} 105 | ${flink.version} 106 | 107 | 108 | org.apache.flink 109 | flink-connector-elasticsearch6_2.11 110 | ${flink.version} 111 | 112 | 113 | org.apache.flink 114 | flink-json 115 | ${flink.version} 116 | 117 | 118 | org.apache.flink 119 | flink-hbase_2.11 120 | ${flink.version} 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | org.slf4j 161 | slf4j-log4j12 162 | 1.7.10 163 | runtime 164 | 165 | 166 | log4j 167 | log4j 168 | 1.2.17 169 | runtime 170 | 171 | 172 | mysql 173 | mysql-connector-java 174 | 5.1.40 175 | 176 | 177 | org.apache.hadoop 178 | hadoop-client 179 | ${hadoop.version} 180 | 181 | 182 | org.apache.kafka 183 | kafka-clients 184 | 1.1.0 185 | 186 | 187 | 188 | 189 | 190 | 
191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | junit 206 | junit 207 | 4.11 208 | test 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | org.apache.maven.plugins 218 | maven-shade-plugin 219 | 3.0.0 220 | 221 | 222 | 223 | package 224 | 225 | shade 226 | 227 | 228 | 229 | 230 | org.apache.flink:force-shading 231 | com.google.code.findbugs:jsr305 232 | org.slf4j:* 233 | log4j:* 234 | 235 | 236 | 237 | 238 | 240 | *:* 241 | 242 | META-INF/*.SF 243 | META-INF/*.DSA 244 | META-INF/*.RSA 245 | 246 | 247 | 248 | 249 | 251 | com.lp.demo.StreamingJob 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | net.alchim31.maven 262 | scala-maven-plugin 263 | 3.2.2 264 | 265 | 266 | 267 | -target:jvm-1.8 268 | -feature 269 | -deprecation 270 | -explaintypes 271 | -unchecked 272 | -Xlint 273 | 274 | 275 | 276 | 277 | 278 | compile 279 | testCompile 280 | 281 | 282 | 283 | 284 | 285 | 286 | org.codehaus.mojo 287 | build-helper-maven-plugin 288 | 1.8 289 | 290 | 291 | 292 | add-source 293 | generate-sources 294 | 295 | add-source 296 | 297 | 298 | 299 | src/main/scala 300 | 301 | 302 | 303 | 304 | 305 | add-test-source 306 | generate-test-sources 307 | 308 | add-test-source 309 | 310 | 311 | 312 | src/test/scala 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | add-dependencies-for-IDEA 410 | 411 | 412 | 413 | idea.version 414 | 415 | 416 | 417 | 418 | 419 | org.apache.flink 420 | flink-scala_${scala.binary.version} 421 | ${flink.version} 422 | compile 423 | 424 | 425 | org.apache.flink 426 | flink-streaming-scala_${scala.binary.version} 427 | ${flink.version} 428 | compile 429 | 430 | 431 | org.scala-lang 432 | scala-library 433 | ${scala.version} 434 | compile 435 | 436 | 437 | org.apache.flink 438 | flink-java 439 | ${flink.version} 440 | compile 441 | 442 | 443 | org.apache.flink 444 | flink-streaming-java_2.11 445 | ${flink.version} 446 | compile 447 | 448 | 449 | 450 | 451 | 452 | 453 | --------------------------------------------------------------------------------