├── sqoop.txt
├── README.md
├── hadoop_learning
    ├── src
    │   └── main
    │   │   ├── resources
    │   │       └── log4j2.properties
    │   │   └── java
    │   │       ├── outline_calcul
    │   │           ├── StringUtils.java
    │   │           └── AppDataClean.java
    │   │       └── example
    │   │           └── WordCount.java
    └── pom.xml
├── createTable.sql
└── calculate.sh


/sqoop.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JRhuang-96/offline-hadoop/HEAD/sqoop.txt


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # offline-hadoop
 2 | Hadoop离线计算 :模拟 使用hadoop MR 进行数据清洗,再使用shell 脚本执行hive 进行数据统计,维度分析
 3 | 
 4 | ## 完整项目架构:
 5 | 
 6 | #### 1.收集数据: &nbsp;使用flume 收集 web logs 到 HDFS上;           
 7 | #### 2.清洗数据: &nbsp;使用 Hadoop MR 清洗数据;
 8 | #### 3.处理数据: &nbsp;使用 HQL 分析数据, 求出日活、日活维度分析、日新、日新维度分析等;  
 9 | #### 4.导出数据: &nbsp;使用sqoop将数据导出到mysql 中;
10 | #### 5.编写shell 脚本: &nbsp; 设置每天启动处理数据,重复以上流程.
11 | 
12 | -------
13 | <h3>本项目只实现</h3> 
14 | <h4>清洗数据-->处理数据-->shell脚本启动处理数据</h4>
15 | 


--------------------------------------------------------------------------------
/hadoop_learning/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
 1 | 
 2 | # Basic console setup in log4j2 properties syntax (disabled); the active settings below use log4j 1.x syntax.
 3 | #appender.console.type = Console
 4 | #appender.console.name = console
 5 | #appender.console.layout.type = PatternLayout
 6 | #
 7 | #rootLogger.level = info
 8 | #rootLogger.appenderRef.console.ref = console
 9 | 
10 | #-----------
11 | 
12 | log4j.rootCategory=ERROR, console
13 | log4j.appender.console=org.apache.log4j.ConsoleAppender
14 | log4j.appender.console.target=System.err
15 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
16 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
17 | 
18 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
19 | # log level for this class is used to overwrite the root logger's log level, so that
20 | # the user can have different defaults for the shell and regular Spark apps.
21 | log4j.logger.org.apache.spark.repl.Main=WARN
22 | 
23 | # Settings to quiet third party logs that are too verbose
24 | log4j.logger.org.spark_project.jetty=WARN
25 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
26 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
27 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
28 | log4j.logger.org.apache.parquet=ERROR
29 | log4j.logger.parquet=ERROR
30 | 
31 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
32 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
33 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR


--------------------------------------------------------------------------------
/hadoop_learning/src/main/java/outline_calcul/StringUtils.java:
--------------------------------------------------------------------------------
 1 | package outline_calcul;
 2 | 
 3 | import com.alibaba.fastjson.JSONObject;
 4 | import org.apache.commons.lang3.time.FastDateFormat;
 5 | 
 6 | import java.text.SimpleDateFormat;
 7 | 
 8 | public class StringUtils {
 9 | 
10 |     public static String getStr(JSONObject json){
11 |         StringBuffer buffer = new StringBuffer();
12 |         buffer.append(json.getString("imei")).append(" ").append(json.getString("sdk_ver")).append(" ");
13 |         buffer.append(json.getString("time_zone")).append(" ").append(json.getString("commit_id")).append(" ");
14 |         buffer.append(json.getString("commit_time")).append(" ").append(json.getString("pid")).append(" ");
15 |         buffer.append(json.getString("app_token")).append(" ").append(json.getString("app_id")).append(" ");
16 |         buffer.append(json.getString("device_id")).append(" ").append(json.getString("device_id_type")).append(" ");
17 |         buffer.append(json.getString("release_channel")).append(" ").append(json.getString("app_ver_name")).append(" ");
18 |         buffer.append(json.getString("app_ver_code")).append(" ").append(json.getString("os_name")).append(" ");
19 |         buffer.append(json.getString("os_ver")).append(" ").append(json.getString("language")).append(" ");
20 |         buffer.append(json.getString("country")).append(" ").append(json.getString("manufacture")).append(" ");
21 |         buffer.append(json.getString("device_model")).append(" ").append(json.getString("resolution")).append(" ");
22 |         buffer.append(json.getString("net_type")).append(" ").append(json.getString("account")).append(" ");
23 |         buffer.append(json.getString("app_device_id")).append(" ").append(json.getString("mac")).append(" ");
24 |         buffer.append(json.getString("android_id")).append(" ").append(json.getString("user_id")).append(" ");
25 |         buffer.append(json.getString("cid_sn")).append(" ").append(json.getString("build_num")).append(" ");
26 |         buffer.append(json.getString("mobile_data_type")).append(" ").append(json.getString("promotion_channel")).append(" ");
27 |         buffer.append(json.getString("carrier")).append(" ").append(json.getString("city"));
28 | 
29 |         return buffer.toString();
30 | 
31 |     }
32 | 
33 |     /**
34 |      获取当前的 年月日
35 |      */
36 |     public static String getFileName() {
37 |         SimpleDateFormat format = new SimpleDateFormat("yyyyMMdd");
38 |         return format.format(System.currentTimeMillis());
39 |     }
40 | 
41 |     public static String getTime(String oldTime){
42 |         FastDateFormat fastDateFormat =  FastDateFormat.getInstance("yyyyMMdd");
43 | 
44 |         return fastDateFormat.format(oldTime);
45 |     }
46 | 
47 | 
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/hadoop_learning/src/main/java/example/WordCount.java:
--------------------------------------------------------------------------------
  1 | package example;
  2 | 
  3 | import org.apache.hadoop.conf.Configuration;
  4 | import org.apache.hadoop.fs.Path;
  5 | import org.apache.hadoop.io.IntWritable;
  6 | import org.apache.hadoop.io.Text;
  7 | import org.apache.hadoop.mapreduce.Job;
  8 | import org.apache.hadoop.mapreduce.Mapper;
  9 | import org.apache.hadoop.mapreduce.Reducer;
 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 12 | import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
 13 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 15 | import outline_calcul.StringUtils;
 16 | 
 17 | import java.io.IOException;
 18 | import java.util.StringTokenizer;
 19 | 
 20 | /**
 21 |  Hadoop最新版本的MapReduce Release 0.20.0的API包括了一个全新的Mapreduce JAVA API，有时候也称为上下文对象。
 22 |  新的API类型上不兼容以前的API，所以，以前的应用程序需要重写才能使新的API发挥其作用 。
 23 |  新的API和旧的API之间有下面几个明显的区别。
 24 |  新的API倾向于使用抽象类，而不是接口，因为这更容易扩展。例如，你可以添加一个方法(用默认的实现)到一个抽象类而不需修改类之前的实现方法。在新的API中，Mapper和Reducer是抽象类。
 25 |  新的API是在org.apache.hadoop.mapreduce包(和子包)中的。之前版本的API则是放在org.apache.hadoop.mapred中的。
 26 |  新的API广泛使用context object(上下文对象)，并允许用户代码与MapReduce系统进行通信。例如，MapContext基本上充当着JobConf的OutputCollector和Reporter的角色。
 27 |  新的API同时支持"推"和"拉"式的迭代。在这两个新老API中，键/值记录对被推mapper中，但除此之外，新的API允许把记录从map()方法中拉出，这也适用于reducer。"拉"式的一个有用的例子是分批处理记录，而不是一个接一个。
 28 |  新的API统一了配置。旧的API有一个特殊的JobConf对象用于作业配置，这是一个对于Hadoop通常的Configuration对象的扩展。
 29 |  在新的API中，这种区别没有了，所以作业配置通过Configuration来完成。作业控制的执行由Job类来负责，而不是JobClient，它在新的API中已经荡然无存。
 30 | 
 31 |  */
 32 | //这是新版API
 33 | public class WordCount{
 34 | 
 35 |    /** Splits every input line on whitespace and emits (token, 1). */
 36 |    public static class TokenizerMapper extends Mapper<Object,Text, Text,IntWritable>{
 37 | 
 38 |        private static final IntWritable one = new IntWritable(1);
 39 |        private Text word = new Text();
 40 |        // Created in setup() and closed in cleanup(); map() itself never writes through it.
 41 |        MultipleOutputs<Text,IntWritable> mos = null;
 42 | 
 43 |        @Override
 44 |        protected void setup(Context context) {
 45 |            mos = new MultipleOutputs<>(context);
 46 |        }
 47 | 
 48 |        @Override
 49 |        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
 50 |            StringTokenizer tokenizer = new StringTokenizer(value.toString());
 51 |            while(tokenizer.hasMoreTokens()){
 52 |                word.set(tokenizer.nextToken());
 53 |                context.write(word,one);
 54 |            }
 55 |        }
 56 | 
 57 |        /** Releases the MultipleOutputs created in setup(), which was previously never closed. */
 58 |        @Override
 59 |        protected void cleanup(Context context) throws IOException, InterruptedException {
 60 |            if (mos != null) {
 61 |                mos.close();
 62 |            }
 63 |        }
 64 |    }
 65 | 
 66 |    /** Sums the counts received for one key and emits (key, total). */
 67 |    public static class InSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
 68 |        private IntWritable result = new IntWritable();
 69 | 
 70 |        @Override
 71 |        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
 72 |            int sum = 0;
 73 |            for (IntWritable val : values) {
 74 |                sum += val.get();
 75 |            }
 76 |            result.set(sum);
 77 |            context.write(key, result);
 78 |        }
 79 |    }
 80 | 
 81 |     /**
 82 |      * Configures and runs the job on a fixed local input file; the output
 83 |      * directory is named after the current date.
 84 |      */
 85 |     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
 86 |         Configuration conf = new Configuration();
 87 | 
 88 |         Job job= Job.getInstance(conf,"wordCount");
 89 | 
 90 |         job.setJarByClass(WordCount.class);
 91 | 
 92 |         job.setMapperClass(TokenizerMapper.class);
 93 |         job.setCombinerClass(InSumReducer.class);
 94 |         job.setReducerClass(InSumReducer.class);
 95 |         job.setOutputKeyClass(Text.class);
 96 |         job.setOutputValueClass(IntWritable.class);
 97 | 
 98 |         // Some jobs need no reduce phase; with zero reduce tasks the map output is
 99 |         // written straight to the output directory, unsorted, instead of being
100 |         // spilled locally as intermediate data. NOTE: this also means the reducer
101 |         // registered above does not run, so the output is (word, 1) pairs.
102 |         job.setNumReduceTasks(0);
103 |         // Avoid creating the default (empty) output files.
104 |         LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
105 | 
106 |         FileInputFormat.addInputPath(job, new Path("./tmp/tmpdata/learning.txt"));
107 |         FileOutputFormat.setOutputPath(job, new Path("./"+ StringUtils.getFileName()));
108 | 
109 |         System.exit(job.waitForCompletion(true)?0:1);
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/createTable.sql:
--------------------------------------------------------------------------------
  1 | -- Force-drop the database together with its tables (disabled on purpose).
  2 | --drop database etl_tb cascade;
  3 | 
  4 | -- Create the working database once and make it current.
  5 | CREATE DATABASE IF NOT EXISTS etl_tb;
  6 | USE etl_tb;
  7 | 
  8 | -- External partitioned table over the MR-cleaned data; IF NOT EXISTS lets the script be re-run.
  9 | CREATE external TABLE if not exists etl_cleared_info(
 10 | imei                string,
 11 | sdk_ver             string,
 12 | time_zone           string,
 13 | commit_id           string,
 14 | commit_time         string,
 15 | pid                 string,
 16 | app_token           string,
 17 | app_id              string,
 18 | device_id           string,
 19 | device_id_type      string,
 20 | release_channel     string,
 21 | app_ver_name        string,
 22 | app_ver_code        string,
 23 | os_name             string,
 24 | os_ver              string,
 25 | language            string,
 26 | country             string,
 27 | manufacture         string,
 28 | device_model        string,
 29 | resolution          string,
 30 | net_type            string,
 31 | account             string,
 32 | app_device_id       string,
 33 | mac                 string,
 34 | android_id          string,
 35 | user_id             string,
 36 | cid_sn              string,
 37 | build_num           string,
 38 | mobile_data_type    string,
 39 | promotion_channel   string,
 40 | carrier             string,
 41 | city                string
 42 | )
 43 | partitioned by (day string)
 44 | row format delimited
 45 | fields terminated by ' '
 46 | lines terminated by '\n'
 47 | location '/app_cleared_data/'
 48 | ;
 49 | 
 50 | 
 51 | -- Daily active users (DAU) table; IF NOT EXISTS lets the script be re-run.
 52 | CREATE TABLE if not exists etl_day_active_user_info (
 53 | imei                string,
 54 | sdk_ver             string,
 55 | time_zone           string,
 56 | commit_id           string,
 57 | commit_time         string,
 58 | pid                 string,
 59 | app_token           string,
 60 | app_id              string,
 61 | device_id           string,
 62 | device_id_type      string,
 63 | release_channel     string,
 64 | app_ver_name        string,
 65 | app_ver_code        string,
 66 | os_name             string,
 67 | os_ver              string,
 68 | language            string,
 69 | country             string,
 70 | manufacture         string,
 71 | device_model        string,
 72 | resolution          string,
 73 | net_type            string,
 74 | account             string,
 75 | app_device_id       string,
 76 | mac                 string,
 77 | android_id          string,
 78 | user_id             string,
 79 | cid_sn              string,
 80 | build_num           string,
 81 | mobile_data_type    string,
 82 | promotion_channel   string,
 83 | carrier             string,
 84 | city                string
 85 | )
 86 | partitioned BY (day string)
 87 | row format delimited
 88 | fields terminated BY ' '
 89 | lines terminated BY '\n';
 90 | 
 91 | 
 92 | -- Daily new users table, same layout as the DAU table; IF NOT EXISTS lets the script be re-run.
 93 | CREATE TABLE if not exists etl_day_new_user_info  like etl_day_active_user_info ;
 94 | 
 95 | -- History table of every user_id seen so far; IF NOT EXISTS lets the script be re-run.
 96 | CREATE TABLE if not exists etl_user_history_info(user_id string)  ;
 97 | 
 98 | -- Dimension counts of daily active users, partitioned by day and dimension code; safe to re-run.
 99 | CREATE TABLE if not exists dim_day_user_active_info(
100 | sdk_ver           string,
101 | app_ver_name      string,
102 | app_ver_code      string,
103 | os_name           string,
104 | city              string,
105 | manufacture       string,
106 | nums              int
107 | )
108 | partitioned by (day string, dim string)
109 | row format delimited
110 | fields terminated by ' '
111 | lines terminated by '\n'
112 | ;
113 | 
114 | -- Dimension counts of daily new users, partitioned by day and dimension code; safe to re-run.
115 | CREATE TABLE if not exists dim_day_new_user_info(
116 | sdk_ver           string,
117 | app_ver_name      string,
118 | app_ver_code      string,
119 | os_name           string,
120 | city              string,
121 | manufacture       string,
122 | nums              int
123 | )
124 | partitioned by (day string, dim string)
125 | row format delimited
126 | fields terminated by ' '
127 | lines terminated by '\n'
128 | ;
129 | 
130 | --CREATE TABLE dim_day_new_user_info like dim_day_user_active_info;
131 | 
132 | 
133 | -- Next-day retention table (default row format); IF NOT EXISTS lets the script be re-run.
134 | CREATE table if not exists retain_oneday_ago_info(
135 | imei                string,
136 | sdk_ver             string,
137 | time_zone           string,
138 | commit_id           string,
139 | commit_time         string,
140 | pid                 string,
141 | app_token           string,
142 | app_id              string,
143 | device_id           string,
144 | device_id_type      string,
145 | release_channel     string,
146 | app_ver_name        string,
147 | app_ver_code        string,
148 | os_name             string,
149 | os_ver              string,
150 | language            string,
151 | country             string,
152 | manufacture         string,
153 | device_model        string,
154 | resolution          string,
155 | net_type            string,
156 | account             string,
157 | app_device_id       string,
158 | mac                 string,
159 | android_id          string,
160 | user_id             string,
161 | cid_sn              string,
162 | build_num           string,
163 | mobile_data_type    string,
164 | promotion_channel   string,
165 | carrier             string,
166 | city                string
167 | )
168 | partitioned by (day string)
169 | ;
170 | 
171 | 


--------------------------------------------------------------------------------
/hadoop_learning/src/main/java/outline_calcul/AppDataClean.java:
--------------------------------------------------------------------------------
 1 | package outline_calcul;
 2 | 
 3 | import com.alibaba.fastjson.JSON;
 4 | import com.alibaba.fastjson.JSONObject;
 5 | import org.apache.hadoop.conf.Configuration;
 6 | import org.apache.hadoop.fs.Path;
 7 | import org.apache.hadoop.io.NullWritable;
 8 | import org.apache.hadoop.io.Text;
 9 | import org.apache.hadoop.mapreduce.Job;
10 | import org.apache.hadoop.mapreduce.Mapper;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13 | 
14 | import java.io.IOException;
15 | 
16 | 
17 | 
18 | public class AppDataClean {
19 |     public static class TokenMapper extends Mapper<Object,Text,Text,NullWritable> {
20 | 
21 |         Text k = new Text();
22 | 
23 |         NullWritable val = NullWritable.get();
24 | 
25 |         public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
26 | 
27 |             JSONObject record = (JSONObject)JSON.parse(value.toString());
28 |             String headerStr = record.getString("header");
29 | 
30 |             //获取内嵌 header
31 |             JSONObject header = (JSONObject)JSONObject.parse(headerStr);
32 | 
33 |             //必选字段--清理数据
34 |             if(header.getString("imei")==null||"".equals(header.getString("imei")))return;
35 |             if (header.getString("sdk_ver") ==null||"".equals(header.getString("sdk_ver")))return;
36 |             if (header.getString("time_zone") ==null||"".equals(header.getString("time_zone")))return;
37 |             if (header.getString("commit_id") ==null||"".equals(header.getString("commit_id")))return;
38 |             if (header.getString("commit_time") ==null||"".equals(header.getString("commit_time")))return;
39 |             if (header.getString("pid") ==null||" ".equals(header.getString("pid")))return;
40 |             if (header.getString("app_token") ==null||" ".equals(header.getString("app_token")))return;
41 |             if (header.getString("app_id") ==null||" ".equals(header.getString("app_id")))return;
42 |             if (header.getString("device_id") ==null||" ".equals(header.getString("device_id")))return;
43 |             if (header.getString("device_id_type") ==null||" ".equals(header.getString("device_id_type")))return;
44 |             if (header.getString("release_channel") ==null||" ".equals(header.getString("release_channel")))return;
45 |             if (header.getString("app_ver_name") ==null||" ".equals(header.getString("app_ver_name")))return;
46 |             if (header.getString("app_ver_code") ==null||" ".equals(header.getString("app_ver_code")))return;
47 |             if (header.getString("os_name") ==null||" ".equals(header.getString("os_name")))return;
48 |             if (header.getString("os_ver") ==null||" ".equals(header.getString("os_ver")))return;
49 |             if (header.getString("language") ==null||" ".equals(header.getString("language")))return;
50 |             if (header.getString("country") ==null||" ".equals(header.getString("country")))return;
51 |             if (header.getString("manufacture") ==null||" ".equals(header.getString("manufacture")))return;
52 |             if (header.getString("device_model") ==null||" ".equals(header.getString("device_model")))return;
53 |             if (header.getString("resolution") ==null||" ".equals(header.getString("resolution")))return;
54 |             if (header.getString("net_type") ==null||" ".equals(header.getString("net_type")))return;
55 |             if (header.getString("user_id") ==null||" ".equals(header.getString("user_id")))return;
56 |         
57 |             String res =  StringUtils.getStr(header);
58 | 
59 |             k.set(res);
60 |             context.write(k,val);
61 | 
62 |         }
63 |     }
64 | 
65 | 
66 |     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
67 |         Configuration conf =new Configuration();
68 |         Job job = Job.getInstance(conf,"data_clean");
69 | 
70 |         job.setJarByClass(AppDataClean.class);
71 | 
72 |         job.setMapperClass(TokenMapper.class);
73 | 
74 | 
75 |         job.setOutputKeyClass(Text.class);
76 |         job.setOutputValueClass(NullWritable.class);
77 | 
78 |         //设置为0 就只有 map 端输出
79 |         job.setNumReduceTasks(0);
80 | 
81 |         FileInputFormat.setInputPaths(job,new Path(args[0]));
82 |         FileOutputFormat.setOutputPath(job,new Path(args[1]));
83 | 
84 |         System.exit(job.waitForCompletion(true)?0:1);
85 |     }
86 | 
87 | 
88 | 
89 | 
90 | }
91 | 


--------------------------------------------------------------------------------
/hadoop_learning/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |     <groupId>groupId</groupId>
  8 |     <artifactId>hadoop_learning</artifactId>
  9 |     <version>1.0-SNAPSHOT</version>
 10 | 
 11 |     <properties>
 12 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 13 |         <sbt.project.name>spark-demo</sbt.project.name>
 14 |         <json.version>20180130</json.version>
 15 |         <junit.version>4.11</junit.version>
 16 |         <fastjson.version>1.2.28</fastjson.version>
 17 | 
 18 |     </properties>
 19 | 
 20 |     <dependencies>
 21 |         <!-- Unit testing -->
 22 |         <dependency>
 23 |             <groupId>junit</groupId>
 24 |             <artifactId>junit</artifactId>
 25 |             <version>${junit.version}</version>
 26 |             <scope>test</scope>
 27 |         </dependency>
 28 | 
 29 |         <dependency>
 30 |             <groupId>org.apache.commons</groupId>
 31 |             <artifactId>commons-email</artifactId>
 32 |             <version>1.4</version>
 33 |         </dependency>
 34 |         <!-- Declared explicitly: StringUtils imports org.apache.commons.lang3.time.FastDateFormat -->
 35 |         <dependency>
 36 |             <groupId>org.apache.commons</groupId>
 37 |             <artifactId>commons-lang3</artifactId>
 38 |             <version>3.4</version>
 39 |         </dependency>
 40 |         <dependency>
 41 |             <groupId>com.alibaba</groupId>
 42 |             <artifactId>fastjson</artifactId>
 43 |             <version>${fastjson.version}</version>
 44 |         </dependency>
 45 |         <dependency>
 46 |             <groupId>org.json</groupId>
 47 |             <artifactId>json</artifactId>
 48 |             <version>${json.version}</version>
 49 |         </dependency>
 50 | 
 51 |         <dependency>
 52 |             <groupId>org.apache.hadoop</groupId>
 53 |             <artifactId>hadoop-client</artifactId>
 54 |             <version>2.7.1</version>
 55 |         </dependency>
 56 | 
 57 |     </dependencies>
 52 | 
 53 |     <build>
 54 |         <plugins>
 55 |             <plugin>
 56 |                 <artifactId>maven-compiler-plugin</artifactId>
 57 |                 <configuration>
 58 |                     <source>1.8</source>
 59 |                     <target>1.8</target>
 60 |                     <encoding>UTF-8</encoding>
 61 |                 </configuration>
 62 |             </plugin>
 63 |             <plugin>
 64 | 
 65 |                 <groupId>org.apache.maven.plugins</groupId>
 66 |                 <artifactId>maven-shade-plugin</artifactId>
 67 |                 <version>2.1</version>
 68 |                 <executions>
 69 |                     <execution>
 70 |                         <phase>package</phase>
 71 |                         <goals>
 72 |                             <goal>shade</goal>
 73 |                         </goals>
 74 |                         <configuration>
 75 |                             <minimizeJar>false</minimizeJar>
 76 |                             <shadedArtifactAttached>true</shadedArtifactAttached>
 77 |                             <artifactSet>
 78 |                                 <includes>
 79 |                                     <!-- Include here the dependencies you
 80 |                                         want to be packed in your fat jar -->
 81 |                                     <include>*:*</include>
 82 |                                 </includes>
 83 |                                 <excludes>
 84 |                                     <exclude>test:test:jar:</exclude>
 85 |                                 </excludes>
 86 |                             </artifactSet>
 87 |                             <filters>
 88 |                                 <filter>
 89 |                                     <artifact>*:*</artifact>
 90 |                                     <excludes>
 91 |                                         <exclude>META-INF/*.SF</exclude>
 92 |                                         <exclude>META-INF/*.DSA</exclude>
 93 |                                         <exclude>META-INF/*.RSA</exclude>
 94 |                                     </excludes>
 95 |                                 </filter>
 96 |                             </filters>
 97 |                             <transformers>
 98 |                                 <transformer
 99 |                                         implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
100 |                                     <resource>reference.conf</resource>
101 |                                 </transformer>
102 |                             </transformers>
103 |                         </configuration>
104 |                     </execution>
105 |                 </executions>
106 |             </plugin>
107 |             <plugin>
108 |                 <groupId>net.alchim31.maven</groupId>
109 |                 <artifactId>scala-maven-plugin</artifactId>
110 |                 <version>3.2.2</version>
111 |             </plugin>
112 |         </plugins>
113 |         <resources>
114 |             <resource>
115 |                 <directory>src/main/resources</directory>
116 |                 <filtering>false</filtering>
117 |                 <includes>
118 |                     <include>**/*</include>
119 |                 </includes>
120 |             </resource>
121 |             <resource>
122 |                 <directory>src/main/scala</directory>
123 |                 <filtering>false</filtering>
124 |                 <includes>
125 |                     <include>**/*</include>
126 |                 </includes>
127 |             </resource>
128 |             <resource>
129 |                 <directory>src/main/java</directory>
130 |                 <filtering>false</filtering>
131 |                 <includes>
132 |                     <include>**/*</include>
133 |                 </includes>
134 |             </resource>
135 |         </resources>
136 |     </build>
137 | 
138 | 
139 | 
140 | 
141 |     
142 | </project>


--------------------------------------------------------------------------------
/calculate.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #
  3 | # Abort on the first failing command; enabled before the dates are computed so a failing `date` stops the run.
  4 | set -e
  5 | # Partition dates (GNU date syntax): yesterday and 2, 8 and 31 days ago.
  6 | oneday_ago=$(date -d yesterday +%Y-%m-%d)
  7 | twodays_ago=$(date -d "-2 days" +%Y-%m-%d)
  8 | eightdays_ago=$(date -d "-8 days" +%Y-%m-%d)
  9 | thirtyonedays_ago=$(date -d "-31 days" +%Y-%m-%d)
 10 | inpath=/app-log-data/data/
 11 | outpath=/app-log-data/clean/day=${oneday_ago}
 12 | #
 13 | hive_exec=/home/hadoop/software/hive/bin/hive
 14 | # Load yesterday's map output files (part-m*) into the matching partition of etl_cleared_info.
 15 | add_data_sql="
 16 | load data inpath \"${outpath}/part-m*\" into table etl_cleared_info partition(day=\"${oneday_ago}\");
 17 | "
 18 | #
 19 | # Daily active users: earliest record per user_id for yesterday; overwrite keeps a re-run from duplicating rows.
 20 | calculate_day_user_active_sql="
 21 | insert overwrite table etl_day_active_user_info partition(day=\"$oneday_ago\")
 22 | select
 23 | t.imei              ,
 24 | t.sdk_ver           ,
 25 | t.time_zone         ,
 26 | t.commit_id         ,
 27 | t.commit_time       ,
 28 | t.pid               ,
 29 | t.app_token         ,
 30 | t.app_id            ,
 31 | t.device_id         ,
 32 | t.device_id_type    ,
 33 | t.release_channel   ,
 34 | t.app_ver_name      ,
 35 | t.app_ver_code      ,
 36 | t.os_name           ,
 37 | t.os_ver            ,
 38 | t.language          ,
 39 | t.country           ,
 40 | t.manufacture       ,
 41 | t.device_model      ,
 42 | t.resolution        ,
 43 | t.net_type          ,
 44 | t.account           ,
 45 | t.app_device_id     ,
 46 | t.mac               ,
 47 | t.android_id        ,
 48 | t.user_id           ,
 49 | t.cid_sn            ,
 50 | t.build_num         ,
 51 | t.mobile_data_type  ,
 52 | t.promotion_channel ,
 53 | t.carrier           ,
 54 | t.city
 55 | from (
 56 |     select *,
 57 |     row_number()over(partition by user_id order by commit_time) as r
 58 |     from etl_cleared_info
 59 |     where day = \"$oneday_ago\")t
 60 | where r = 1;
 61 | 
 62 | "
 63 | #
 64 | # Daily new users: active users absent from the history table. Stays 'insert into': after a run the users are in history, so an overwrite on re-run would empty the partition.
 65 | calculate_daily_new_user_sql="
 66 | insert into table etl_day_new_user_info partition(day=\"$oneday_ago\")
 67 | select
 68 | t1.imei              ,
 69 | t1.sdk_ver           ,
 70 | t1.time_zone         ,
 71 | t1.commit_id         ,
 72 | t1.commit_time       ,
 73 | t1.pid               ,
 74 | t1.app_token         ,
 75 | t1.app_id            ,
 76 | t1.device_id         ,
 77 | t1.device_id_type    ,
 78 | t1.release_channel   ,
 79 | t1.app_ver_name      ,
 80 | t1.app_ver_code      ,
 81 | t1.os_name           ,
 82 | t1.os_ver            ,
 83 | t1.language          ,
 84 | t1.country           ,
 85 | t1.manufacture       ,
 86 | t1.device_model      ,
 87 | t1.resolution        ,
 88 | t1.net_type          ,
 89 | t1.account           ,
 90 | t1.app_device_id     ,
 91 | t1.mac               ,
 92 | t1.android_id        ,
 93 | t1.user_id           ,
 94 | t1.cid_sn            ,
 95 | t1.build_num         ,
 96 | t1.mobile_data_type  ,
 97 | t1.promotion_channel ,
 98 | t1.carrier           ,
 99 | t1.city
100 | from etl_day_active_user_info t1 left join etl_user_history_info  t2
101 | on t1.user_id = t2.user_id
102 | where t1.day = \"$oneday_ago\" and t2.user_id is null;
103 | 
104 | "
105 | # Append yesterday's new users to the history table.
106 | append_new_user_tohistory_sql="
107 | insert into table etl_user_history_info
108 | select user_id from etl_day_new_user_info where day=\"${oneday_ago}\" ;
109 | 
110 | "
111 | #
112 | # Dimension rollups of daily active users (one multi-insert over etl_day_active_user_info).
113 | # dim is a bit mask over (sdk_ver, app_ver_name, app_ver_code, os_name, city, manufacture); 'all' marks a rolled-up column.
114 | # Add further single or combined dimensions (such as 000101 below) as the business requires.
115 | # Comments live here, not inside the SQL string (some Hive CLIs may not strip them - verify); overwrite makes re-runs replace counts.
116 | dim_day_user_active_sql="
117 | from etl_day_active_user_info
118 | 
119 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000000')
120 |     select 'all','all','all','all','all','all',count(1)
121 |     where day = \"$oneday_ago\"
122 | 
123 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='100000')
124 |     select sdk_ver,'all','all','all','all','all',count(1)
125 |     where day = \"$oneday_ago\"
126 |     group by sdk_ver
127 | 
128 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='010000')
129 |     select 'all',app_ver_name,'all','all','all','all',count(1)
130 |     where day = \"$oneday_ago\"
131 |     group by app_ver_name
132 | 
133 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='001000')
134 |     select 'all','all',app_ver_code,'all','all','all',count(1)
135 |     where day = \"$oneday_ago\"
136 |     group by app_ver_code
137 | 
138 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000100')
139 |     select 'all','all','all',os_name,'all','all',count(1)
140 |     where day = \"$oneday_ago\"
141 |     group by os_name
142 | 
143 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000010')
144 |     select 'all','all','all','all',city ,'all',count(1)
145 |     where day = \"$oneday_ago\"
146 |     group by city
147 | 
148 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000001')
149 |     select 'all','all','all','all','all',manufacture,count(1)
150 |     where day = \"$oneday_ago\"
151 |     group by manufacture
152 | 
153 | 
154 | insert overwrite table dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000101')
155 |     select 'all','all','all',os_name,'all',manufacture,count(1)
156 |     where day = \"$oneday_ago\"
157 |     group by os_name,manufacture
158 | 
159 | 
160 |     ;
161 | 
162 | "
163 | # Dimension roll-ups for daily new users: multi-insert from etl_day_new_user_info into dim_day_new_user_info; each dim digit is 1 when that select column is grouped, 0 when it is 'all'.
164 | dim_day_new_user_sql="
165 | from etl_day_new_user_info 
166 | 
167 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000000')
168 | 	select 'all','all','all','all','all','all',count(1) 
169 | 	where day = \"$oneday_ago\"
170 | 
171 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='100000')
172 | 	select sdk_ver,'all','all','all','all','all',count(1)
173 | 	where day = \"$oneday_ago\"
174 | 	group by sdk_ver
175 | 
176 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='010000')
177 | 	select 'all',app_ver_name,'all','all','all','all',count(1) 
178 | 	where day = \"$oneday_ago\"
179 | 	group by app_ver_name
180 | 
181 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='001000')
182 | 	select 'all','all',app_ver_code,'all','all','all',count(1) 
183 | 	where day = \"$oneday_ago\"
184 | 	group by app_ver_code
185 | 
186 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000100')
187 | 	select 'all','all','all',os_name,'all','all',count(1) 
188 | 	where day = \"$oneday_ago\"
189 | 	group by os_name
190 | 
191 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000010')
192 | 	select 'all','all','all','all',city ,'all',count(1)
193 | 	where day = \"$oneday_ago\"
194 | 	group by city
195 | 
196 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000001')
197 | 	select 'all','all','all','all','all',manufacture,count(1)
198 | 	where day = \"$oneday_ago\"
199 | 	group by manufacture
200 | 
201 | 
202 | --根据业务添加其他的维度分析
203 | 
204 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000011')
205 | 	select 'all','all','all','all',city,manufacture,count(1)
206 | 	where day = \"$oneday_ago\"
207 | 	group by city,manufacture
208 | 
209 | 	;
210 | 
211 | "
212 | # Retention: next-day retention.
213 | # Writes into partition day=$twodays_ago the listed columns of every row active on
214 | # $twodays_ago whose user_id also appears in the active-user rows of $oneday_ago.
215 | # NOTE(review): left join + 'is not null' filter behaves like an inner join here - confirm intent.
216 | retain_oneday_sql="
217 | insert into table retain_oneday_ago_info partition (day=\"$twodays_ago\")
218 | select		
219 | t1.imei					,
220 | t1.sdk_ver  			,
221 | t1.time_zone			,
222 | t1.commit_id			,
223 | t1.commit_time			,
224 | t1.pid					,
225 | t1.app_token  			,
226 | t1.app_id	 			,
227 | t1.device_id			,
228 | t1.device_id_type		,
229 | t1.release_channel		,
230 | t1.app_ver_name 	 	,
231 | t1.app_ver_code		,
232 | t1.os_name				,
233 | t1.os_ver  			,
234 | t1.language			,
235 | t1.country				,
236 | t1.manufacture			,
237 | t1.device_model		,
238 | t1.resolution			,
239 | t1.net_type			,
240 | t1.account				,
241 | t1.app_device_id 		,
242 | t1.mac					,
243 | t1.android_id			,
244 | t1.user_id 			,
245 | t1.cid_sn  			,
246 | t1.build_num			,
247 | t1.mobile_data_type 	,
248 | t1.promotion_channel 	,
249 | t1.carrier 			,
250 | t1.city 				 
251 |  from 
252 | (select * from etl_day_active_user_info where day =\"$twodays_ago\")t1
253 | left join
254 | (select user_id from etl_day_active_user_info where day =\"$oneday_ago\")t2 
255 | on t1.user_id = t2.user_id
256 | where  t2.user_id is not null
257 | ;
258 | 
259 | "
260 | #.....
261 | #.....
262 | #.....
263 | # Run the pipeline: the MR cleaning job first, then every Hive statement in a single hive -e call.
264 | #
265 | echo "开始执行脚本..."
266 | # Hadoop MR job that cleans the raw data; abort on failure so Hive never runs on missing or stale output.
267 | hadoop jar /home/hadoop/script/app_cleandata.jar outline_calcul.AppDataClean $inpath  $outpath || { echo "command failed"; exit 1; }
268 | echo '清洗数据完成 ,进行数据计算'
269 | # Execute all SQL built above in order; a non-zero exit from hive aborts the script with status 1.
270 | hive -e  "
271 | use etl_tb;  
272 | -- append data into table etl_cleared_info 将数据导入数据表
273 | $add_data_sql;
274 | 
275 | --calculate  DAU(daily active user)  计算日活
276 | $calculate_day_user_active_sql
277 | 
278 | --calculate the dimension of DAU   计算日活的维度
279 | $dim_day_user_active_sql
280 | 
281 | --calculate DNU(daily new user)  计算日新
282 | $calculate_daily_new_user_sql
283 | 
284 | --append   将日新追加到历史用户表中
285 | $append_new_user_tohistory_sql
286 | 
287 | --calculate the dimension of DNU  计算日新的维度
288 | $dim_day_new_user_sql
289 | 
290 | --calculate the D1U   计算次日留存
291 | $retain_oneday_sql
292 | 
293 | 
294 | 
295 | " || { echo "command failed"; exit 1; }
296 | echo "脚本执行完成...."


--------------------------------------------------------------------------------