├── sqoop.txt
├── README.md
├── hadoop_learning
│   ├── src
│   │   └── main
│   │       ├── resources
│   │       │   └── log4j2.properties
│   │       └── java
│   │           ├── outline_calcul
│   │           │   ├── StringUtils.java
│   │           │   └── AppDataClean.java
│   │           └── example
│   │               └── WordCount.java
│   └── pom.xml
├── createTable.sql
└── calculate.sh

/sqoop.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JRhuang-96/offline-hadoop/HEAD/sqoop.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # offline-hadoop
2 | Hadoop offline computation: simulates cleaning data with a Hadoop MR job, then running Hive from a shell script for the statistics and dimension analysis
3 | 
4 | ## Full project architecture:
5 | 
6 | #### 1. Collect data:  use Flume to collect web logs onto HDFS;
7 | #### 2. Clean data:  use a Hadoop MR job to clean the data;
8 | #### 3. Process data:  use HQL to analyse the data: daily active users, DAU dimension analysis, daily new users, DNU dimension analysis, etc.;
9 | #### 4. Export data:  use Sqoop to export the results to MySQL (an illustrative Sqoop command is sketched at the end of this README);
10 | #### 5. Write a shell script:   schedule it to run every day, repeating the steps above (a sample crontab entry is sketched after calculate.sh at the end of this document).
11 | 
12 | -------
13 |

This project only implements the following part of the pipeline:

14 |

Clean data --> process data --> shell script to launch the daily processing run

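The Sqoop export of step 4 lives in sqoop.txt (only its raw URL is reproduced above). As a rough illustration of that step, the sketch below pushes one day of the DAU dimension results from the Hive warehouse into MySQL; the JDBC URL, credentials, MySQL table and warehouse path are placeholder assumptions, not values taken from this repository:

```bash
# Illustrative only: export yesterday's DAU dimension rollup (dim=000000) to MySQL.
# Adjust the connection string, credentials, table name and HDFS path to the real deployment.
target_day=$(date -d yesterday +%Y-%m-%d)
sqoop export \
  --connect jdbc:mysql://mysql-host:3306/app_report \
  --username report \
  --password '******' \
  --table dim_day_user_active_info \
  --export-dir /user/hive/warehouse/etl_tb.db/dim_day_user_active_info/day=${target_day}/dim=000000 \
  --input-fields-terminated-by ' ' \
  -m 1
```

The single-space field separator matches the `fields terminated by ' '` clause used for the dimension tables in createTable.sql.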
15 | -------------------------------------------------------------------------------- /hadoop_learning/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | 2 | # this is the basic properties 3 | #appender.console.type = Console 4 | #appender.console.blacklist = console 5 | #appender.console.layout.type = PatternLayout 6 | # 7 | #rootLogger.level = info 8 | #rootLogger.appenderRef.console.ref = console 9 | 10 | #----------- 11 | 12 | log4j.rootCategory=ERROR, console 13 | log4j.appender.console=org.apache.log4j.ConsoleAppender 14 | log4j.appender.console.target=System.err 15 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 17 | 18 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 19 | # log level for this class is used to overwrite the root logger's log level, so that 20 | # the user can have different defaults for the shell and regular Spark apps. 21 | log4j.logger.org.apache.spark.repl.Main=WARN 22 | 23 | # Settings to quiet third party logs that are too verbose 24 | log4j.logger.org.spark_project.jetty=WARN 25 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 26 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR 27 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR 28 | log4j.logger.org.apache.parquet=ERROR 29 | log4j.logger.parquet=ERROR 30 | 31 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 32 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 33 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR -------------------------------------------------------------------------------- /hadoop_learning/src/main/java/outline_calcul/StringUtils.java: -------------------------------------------------------------------------------- 1 | package outline_calcul; 2 | 3 | import com.alibaba.fastjson.JSONObject; 4 | import org.apache.commons.lang3.time.FastDateFormat; 5 | 6 | import java.text.SimpleDateFormat; 7 | 8 | public class StringUtils { 9 | 10 | public static String getStr(JSONObject json){ 11 | StringBuffer buffer = new StringBuffer(); 12 | buffer.append(json.getString("imei")).append(" ").append(json.getString("sdk_ver")).append(" "); 13 | buffer.append(json.getString("time_zone")).append(" ").append(json.getString("commit_id")).append(" "); 14 | buffer.append(json.getString("commit_time")).append(" ").append(json.getString("pid")).append(" "); 15 | buffer.append(json.getString("app_token")).append(" ").append(json.getString("app_id")).append(" "); 16 | buffer.append(json.getString("device_id")).append(" ").append(json.getString("device_id_type")).append(" "); 17 | buffer.append(json.getString("release_channel")).append(" ").append(json.getString("app_ver_name")).append(" "); 18 | buffer.append(json.getString("app_ver_code")).append(" ").append(json.getString("os_name")).append(" "); 19 | buffer.append(json.getString("os_ver")).append(" ").append(json.getString("language")).append(" "); 20 | buffer.append(json.getString("country")).append(" ").append(json.getString("manufacture")).append(" "); 21 | buffer.append(json.getString("device_model")).append(" ").append(json.getString("resolution")).append(" "); 22 | buffer.append(json.getString("net_type")).append(" ").append(json.getString("account")).append(" "); 23 | 
buffer.append(json.getString("app_device_id")).append(" ").append(json.getString("mac")).append(" "); 24 | buffer.append(json.getString("android_id")).append(" ").append(json.getString("user_id")).append(" "); 25 | buffer.append(json.getString("cid_sn")).append(" ").append(json.getString("build_num")).append(" "); 26 | buffer.append(json.getString("mobile_data_type")).append(" ").append(json.getString("promotion_channel")).append(" "); 27 | buffer.append(json.getString("carrier")).append(" ").append(json.getString("city")); 28 | 29 | return buffer.toString(); 30 | 31 | } 32 | 33 | /** 34 | 获取当前的 年月日 35 | */ 36 | public static String getFileName() { 37 | SimpleDateFormat format = new SimpleDateFormat("yyyyMMdd"); 38 | return format.format(System.currentTimeMillis()); 39 | } 40 | 41 | public static String getTime(String oldTime){ 42 | FastDateFormat fastDateFormat = FastDateFormat.getInstance("yyyyMMdd"); 43 | 44 | return fastDateFormat.format(oldTime); 45 | } 46 | 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /hadoop_learning/src/main/java/example/WordCount.java: -------------------------------------------------------------------------------- 1 | package example; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | import outline_calcul.StringUtils; 16 | 17 | import java.io.IOException; 18 | import java.util.StringTokenizer; 19 | 20 | /** 21 | Hadoop最新版本的MapReduce Release 0.20.0的API包括了一个全新的Mapreduce JAVA API,有时候也称为上下文对象。 22 | 新的API类型上不兼容以前的API,所以,以前的应用程序需要重写才能使新的API发挥其作用 。 23 | 新的API和旧的API之间有下面几个明显的区别。 24 | 新的API倾向于使用抽象类,而不是接口,因为这更容易扩展。例如,你可以添加一个方法(用默认的实现)到一个抽象类而不需修改类之前的实现方法。在新的API中,Mapper和Reducer是抽象类。 25 | 新的API是在org.apache.hadoop.mapreduce包(和子包)中的。之前版本的API则是放在org.apache.hadoop.mapred中的。 26 | 新的API广泛使用context object(上下文对象),并允许用户代码与MapReduce系统进行通信。例如,MapContext基本上充当着JobConf的OutputCollector和Reporter的角色。 27 | 新的API同时支持"推"和"拉"式的迭代。在这两个新老API中,键/值记录对被推mapper中,但除此之外,新的API允许把记录从map()方法中拉出,这也适用于reducer。"拉"式的一个有用的例子是分批处理记录,而不是一个接一个。 28 | 新的API统一了配置。旧的API有一个特殊的JobConf对象用于作业配置,这是一个对于Hadoop通常的Configuration对象的扩展。 29 | 在新的API中,这种区别没有了,所以作业配置通过Configuration来完成。作业控制的执行由Job类来负责,而不是JobClient,它在新的API中已经荡然无存。 30 | 31 | */ 32 | //这是新版API 33 | public class WordCount{ 34 | 35 | public static class TokenizerMapper extends Mapper{ 36 | 37 | private static final IntWritable one = new IntWritable(1); 38 | private Text word = new Text(); 39 | MultipleOutputs mos = null; 40 | 41 | @Override 42 | protected void setup(Context context) { 43 | mos = new MultipleOutputs<>(context); 44 | 45 | } 46 | 47 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 48 | StringTokenizer tokenizer = new StringTokenizer(value.toString()); 49 | while(tokenizer.hasMoreTokens()){ 50 | word.set(tokenizer.nextToken()); 51 | context.write(word,one); 52 | } 53 | 54 | } 55 | } 56 | 57 | public static class InSumReducer extends Reducer { 58 | 
private IntWritable result = new IntWritable(); 59 | 60 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 61 | int sum = 0; 62 | for (IntWritable val : values) { 63 | sum += val.get(); 64 | } 65 | result.set(sum); 66 | context.write(key, result); 67 | 68 | } 69 | } 70 | 71 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 72 | Configuration conf = new Configuration(); 73 | 74 | Job job= Job.getInstance(conf,"wordCount"); 75 | 76 | job.setJarByClass(WordCount.class); 77 | 78 | job.setMapperClass(TokenizerMapper.class); 79 | job.setCombinerClass(InSumReducer.class); 80 | job.setReducerClass(InSumReducer.class); 81 | job.setOutputKeyClass(Text.class); 82 | job.setOutputValueClass(IntWritable.class); 83 | 84 | //有些作业不需要进行归约进行处理,那么就可以设置reduce的数量为0来进行处理,这种情况下用户的作业运行速度相对较高, 85 | // map的输出会直接写入到 SetOutputPath(path)设置的输出目录,而不是作为中间结果写到本地。 86 | // 同时Hadoop框架在写入文件系统前并不对之进行排序。 87 | job.setNumReduceTasks(0); 88 | // 避免生成默认的文件 , 89 | LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); 90 | 91 | 92 | 93 | FileInputFormat.addInputPath(job, new Path("./tmp/tmpdata/learning.txt")); 94 | FileOutputFormat.setOutputPath(job, new Path("./"+ StringUtils.getFileName())); 95 | 96 | System.exit(job.waitForCompletion(true)?0:1); 97 | 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /createTable.sql: -------------------------------------------------------------------------------- 1 | --强制删除数据库 2 | --drop database etl_tb cascade; 3 | 4 | 5 | CREATE database if not exists etl_tb; 6 | use etl_tb; 7 | 8 | --创建外部分区表,映射MR清洗后的数据 9 | CREATE external TABLE etl_cleared_info( 10 | imei string, 11 | sdk_ver string, 12 | time_zone string, 13 | commit_id string, 14 | commit_time string, 15 | pid string, 16 | app_token string, 17 | app_id string, 18 | device_id string, 19 | device_id_type string, 20 | release_channel string, 21 | app_ver_name string, 22 | app_ver_code string, 23 | os_name string, 24 | os_ver string, 25 | language string, 26 | country string, 27 | manufacture string, 28 | device_model string, 29 | resolution string, 30 | net_type string, 31 | account string, 32 | app_device_id string, 33 | mac string, 34 | android_id string, 35 | user_id string, 36 | cid_sn string, 37 | build_num string, 38 | mobile_data_type string, 39 | promotion_channel string, 40 | carrier string, 41 | city string 42 | ) 43 | partitioned by (day string) 44 | row format delimited 45 | fields terminated by ' ' 46 | lines terminated by '\n' 47 | location '/app_cleared_data/' 48 | ; 49 | 50 | 51 | --日活表 -每日活动用户 (create DAU table) 52 | CREATE TABLE etl_day_active_user_info ( 53 | imei string, 54 | sdk_ver string, 55 | time_zone string, 56 | commit_id string, 57 | commit_time string, 58 | pid string, 59 | app_token string, 60 | app_id string, 61 | device_id string, 62 | device_id_type string, 63 | release_channel string, 64 | app_ver_name string, 65 | app_ver_code string, 66 | os_name string, 67 | os_ver string, 68 | language string, 69 | country string, 70 | manufacture string, 71 | device_model string, 72 | resolution string, 73 | net_type string, 74 | account string, 75 | app_device_id string, 76 | mac string, 77 | android_id string, 78 | user_id string, 79 | cid_sn string, 80 | build_num string, 81 | mobile_data_type string, 82 | promotion_channel string, 83 | carrier string, 84 | city string 85 | ) 86 | partitioned BY (day string) 87 | row format delimited 88 | 
fields terminated BY ' ' 89 | lines terminated BY '\n'; 90 | 91 | 92 | --日新表 -每日新增用户 93 | CREATE TABLE etl_day_new_user_info like etl_day_active_user_info ; 94 | 95 | --历史用户表 96 | CREATE TABLE etl_user_history_info(user_id string) ; 97 | 98 | --日活用户维度表 99 | CREATE TABLE dim_day_user_active_info( 100 | sdk_ver string, 101 | app_ver_name string, 102 | app_ver_code string, 103 | os_name string, 104 | city string, 105 | manufacture string, 106 | nums int 107 | ) 108 | partitioned by (day string, dim string) 109 | row format delimited 110 | fields terminated by ' ' 111 | lines terminated by '\n' 112 | ; 113 | 114 | --日新用户维度表 115 | CREATE TABLE dim_day_new_user_info( 116 | sdk_ver string, 117 | app_ver_name string, 118 | app_ver_code string, 119 | os_name string, 120 | city string, 121 | manufacture string, 122 | nums int 123 | ) 124 | partitioned by (day string, dim string) 125 | row format delimited 126 | fields terminated by ' ' 127 | lines terminated by '\n' 128 | ; 129 | 130 | --CREATE TABLE dim_day_new_user_info like dim_day_user_active_info; 131 | 132 | 133 | --次日留存表 134 | CREATE table retain_oneday_ago_info( 135 | imei string, 136 | sdk_ver string, 137 | time_zone string, 138 | commit_id string, 139 | commit_time string, 140 | pid string, 141 | app_token string, 142 | app_id string, 143 | device_id string, 144 | device_id_type string, 145 | release_channel string, 146 | app_ver_name string, 147 | app_ver_code string, 148 | os_name string, 149 | os_ver string, 150 | language string, 151 | country string, 152 | manufacture string, 153 | device_model string, 154 | resolution string, 155 | net_type string, 156 | account string, 157 | app_device_id string, 158 | mac string, 159 | android_id string, 160 | user_id string, 161 | cid_sn string, 162 | build_num string, 163 | mobile_data_type string, 164 | promotion_channel string, 165 | carrier string, 166 | city string 167 | ) 168 | partitioned by (day string) 169 | ; 170 | 171 | -------------------------------------------------------------------------------- /hadoop_learning/src/main/java/outline_calcul/AppDataClean.java: -------------------------------------------------------------------------------- 1 | package outline_calcul; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.alibaba.fastjson.JSONObject; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | 14 | import java.io.IOException; 15 | 16 | 17 | 18 | public class AppDataClean { 19 | public static class TokenMapper extends Mapper { 20 | 21 | Text k = new Text(); 22 | 23 | NullWritable val = NullWritable.get(); 24 | 25 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 26 | 27 | JSONObject record = (JSONObject)JSON.parse(value.toString()); 28 | String headerStr = record.getString("header"); 29 | 30 | //获取内嵌 header 31 | JSONObject header = (JSONObject)JSONObject.parse(headerStr); 32 | 33 | //必选字段--清理数据 34 | if(header.getString("imei")==null||"".equals(header.getString("imei")))return; 35 | if (header.getString("sdk_ver") ==null||"".equals(header.getString("sdk_ver")))return; 36 | if (header.getString("time_zone") ==null||"".equals(header.getString("time_zone")))return; 37 | if 
(header.getString("commit_id") ==null||"".equals(header.getString("commit_id")))return; 38 | if (header.getString("commit_time") ==null||"".equals(header.getString("commit_time")))return; 39 | if (header.getString("pid") ==null||" ".equals(header.getString("pid")))return; 40 | if (header.getString("app_token") ==null||" ".equals(header.getString("app_token")))return; 41 | if (header.getString("app_id") ==null||" ".equals(header.getString("app_id")))return; 42 | if (header.getString("device_id") ==null||" ".equals(header.getString("device_id")))return; 43 | if (header.getString("device_id_type") ==null||" ".equals(header.getString("device_id_type")))return; 44 | if (header.getString("release_channel") ==null||" ".equals(header.getString("release_channel")))return; 45 | if (header.getString("app_ver_name") ==null||" ".equals(header.getString("app_ver_name")))return; 46 | if (header.getString("app_ver_code") ==null||" ".equals(header.getString("app_ver_code")))return; 47 | if (header.getString("os_name") ==null||" ".equals(header.getString("os_name")))return; 48 | if (header.getString("os_ver") ==null||" ".equals(header.getString("os_ver")))return; 49 | if (header.getString("language") ==null||" ".equals(header.getString("language")))return; 50 | if (header.getString("country") ==null||" ".equals(header.getString("country")))return; 51 | if (header.getString("manufacture") ==null||" ".equals(header.getString("manufacture")))return; 52 | if (header.getString("device_model") ==null||" ".equals(header.getString("device_model")))return; 53 | if (header.getString("resolution") ==null||" ".equals(header.getString("resolution")))return; 54 | if (header.getString("net_type") ==null||" ".equals(header.getString("net_type")))return; 55 | if (header.getString("user_id") ==null||" ".equals(header.getString("user_id")))return; 56 | 57 | String res = StringUtils.getStr(header); 58 | 59 | k.set(res); 60 | context.write(k,val); 61 | 62 | } 63 | } 64 | 65 | 66 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 67 | Configuration conf =new Configuration(); 68 | Job job = Job.getInstance(conf,"data_clean"); 69 | 70 | job.setJarByClass(AppDataClean.class); 71 | 72 | job.setMapperClass(TokenMapper.class); 73 | 74 | 75 | job.setOutputKeyClass(Text.class); 76 | job.setOutputValueClass(NullWritable.class); 77 | 78 | //设置为0 就只有 map 端输出 79 | job.setNumReduceTasks(0); 80 | 81 | FileInputFormat.setInputPaths(job,new Path(args[0])); 82 | FileOutputFormat.setOutputPath(job,new Path(args[1])); 83 | 84 | System.exit(job.waitForCompletion(true)?0:1); 85 | } 86 | 87 | 88 | 89 | 90 | } 91 | -------------------------------------------------------------------------------- /hadoop_learning/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | groupId 8 | hadoop_learning 9 | 1.0-SNAPSHOT 10 | 11 | 12 | UTF-8 13 | spark-demo 14 | 20180130 15 | 4.11 16 | 1.2.28 17 | 18 | 19 | 20 | 21 | 22 | 23 | junit 24 | junit 25 | ${junit.version} 26 | test 27 | 28 | 29 | 30 | org.apache.commons 31 | commons-email 32 | 1.4 33 | 34 | 35 | com.alibaba 36 | fastjson 37 | ${fastjson.version} 38 | 39 | 40 | org.json 41 | json 42 | ${json.version} 43 | 44 | 45 | 46 | org.apache.hadoop 47 | hadoop-client 48 | 2.7.1 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | maven-compiler-plugin 57 | 58 | 1.8 59 | 1.8 60 | UTF-8 61 | 62 | 63 | 64 | 65 | org.apache.maven.plugins 66 | maven-shade-plugin 67 | 2.1 68 | 69 | 70 | package 71 | 72 | shade 73 | 74 
| 75 | false 76 | true 77 | 78 | 79 | 81 | *:* 82 | 83 | 84 | test:test:jar: 85 | 86 | 87 | 88 | 89 | *:* 90 | 91 | META-INF/*.SF 92 | META-INF/*.DSA 93 | META-INF/*.RSA 94 | 95 | 96 | 97 | 98 | 100 | reference.conf 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | net.alchim31.maven 109 | scala-maven-plugin 110 | 3.2.2 111 | 112 | 113 | 114 | 115 | src/main/resources 116 | false 117 | 118 | **/* 119 | 120 | 121 | 122 | src/main/scala 123 | false 124 | 125 | **/* 126 | 127 | 128 | 129 | src/main/java 130 | false 131 | 132 | **/* 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /calculate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | #get yesterday time 4 | oneday_ago=`date -d yesterday +%Y-%m-%d` 5 | twodays_ago=`date -d "-2 days" +%Y-%m-%d` 6 | eightdays_ago=`date -d "-8 days" +%Y-%m-%d` 7 | thirtyonedays_ago=`date -d"-31 days" +%Y-%m-%d` 8 | 9 | set -e 10 | inpath=/app-log-data/data/ 11 | outpath=/app-log-data/clean/day=${oneday_ago} 12 | # 13 | hive_exec=/home/hadoop/software/hive/bin/hive 14 | # 15 | add_data_sql=" 16 | load data inpath \"${outpath}/part-m*\" into table etl_cleared_info partition(day=\"$oneday_ago\"); 17 | " 18 | # 19 | #计算日活 20 | calculate_day_user_active_sql=" 21 | insert into etl_day_active_user_info partition(day=\"$oneday_ago\") 22 | select 23 | t.imei , 24 | t.sdk_ver , 25 | t.time_zone , 26 | t.commit_id , 27 | t.commit_time , 28 | t.pid , 29 | t.app_token , 30 | t.app_id , 31 | t.device_id , 32 | t.device_id_type , 33 | t.release_channel , 34 | t.app_ver_name , 35 | t.app_ver_code , 36 | t.os_name , 37 | t.os_ver , 38 | t.language , 39 | t.country , 40 | t.manufacture , 41 | t.device_model , 42 | t.resolution , 43 | t.net_type , 44 | t.account , 45 | t.app_device_id , 46 | t.mac , 47 | t.android_id , 48 | t.user_id , 49 | t.cid_sn , 50 | t.build_num , 51 | t.mobile_data_type , 52 | t.promotion_channel , 53 | t.carrier , 54 | t.city 55 | from ( 56 | select *, 57 | row_number()over(partition by user_id order by commit_time) as r 58 | from etl_cleared_info 59 | where day = \"$oneday_ago\")t 60 | where r = 1; 61 | 62 | " 63 | # 64 | #计算日新 65 | calculate_daily_new_user_sql=" 66 | insert into table etl_day_new_user_info partition(day=\"$oneday_ago\") 67 | select 68 | imei , 69 | sdk_ver , 70 | time_zone , 71 | commit_id , 72 | commit_time , 73 | pid , 74 | app_token , 75 | app_id , 76 | device_id , 77 | device_id_type , 78 | release_channel , 79 | app_ver_name , 80 | app_ver_code , 81 | os_name , 82 | os_ver , 83 | language , 84 | country , 85 | manufacture , 86 | device_model , 87 | resolution , 88 | net_type , 89 | account , 90 | app_device_id , 91 | mac , 92 | android_id , 93 | t1.user_id , 94 | cid_sn , 95 | build_num , 96 | mobile_data_type , 97 | promotion_channel , 98 | carrier , 99 | city 100 | from etl_day_active_user_info t1 left join etl_user_history_info t2 101 | on t1.user_id = t2.user_id 102 | where t1.day = \"$oneday_ago\" and t2.user_id is null; 103 | 104 | " 105 | #将日新追加到历史用户表中 106 | append_new_user_tohistory_sql=" 107 | insert into table etl_user_history_info 108 | select user_id from etl_day_new_user_info where day=\"$oneday_ago\" ; 109 | 110 | " 111 | # 112 | #维度计算 113 | #日活维度 114 | dim_day_user_active_sql=" 115 | from etl_day_active_user_info 116 | 117 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000000') 118 | select 'all','all','all','all','all','all',count(1) 119 
| where day = \"$oneday_ago\" 120 | 121 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='100000') 122 | select sdk_ver,'all','all','all','all','all',count(1) 123 | where day = \"$oneday_ago\" 124 | group by sdk_ver 125 | 126 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='010000') 127 | select 'all',app_ver_name,'all','all','all','all',count(1) 128 | where day = \"$oneday_ago\" 129 | group by app_ver_name 130 | 131 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='001000') 132 | select 'all','all',app_ver_code,'all','all','all',count(1) 133 | where day = \"$oneday_ago\" 134 | group by app_ver_code 135 | 136 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000100') 137 | select 'all','all','all',os_name,'all','all',count(1) 138 | where day = \"$oneday_ago\" 139 | group by os_name 140 | 141 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000010') 142 | select 'all','all','all','all',city ,'all',count(1) 143 | where day = \"$oneday_ago\" 144 | group by city 145 | 146 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000001') 147 | select 'all','all','all','all','all',manufacture,count(1) 148 | where day = \"$oneday_ago\" 149 | group by manufacture 150 | 151 | --根据业务添加其他的维度分析 152 | --可以多重维度分析 153 | 154 | insert into dim_day_user_active_info partition(day=\"$oneday_ago\",dim='000101') 155 | select 'all','all','all',os_name,'all',manufacture,count(1) 156 | where day = \"$oneday_ago\" 157 | group by os_name,manufacture 158 | 159 | 160 | ; 161 | 162 | " 163 | #日新维度计算 164 | dim_day_new_user_sql=" 165 | from etl_day_new_user_info 166 | 167 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000000') 168 | select 'all','all','all','all','all','all',count(1) 169 | where day = \"$oneday_ago\" 170 | 171 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='100000') 172 | select sdk_ver,'all','all','all','all','all',count(1) 173 | where day = \"$oneday_ago\" 174 | group by sdk_ver 175 | 176 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='010000') 177 | select 'all',app_ver_name,'all','all','all','all',count(1) 178 | where day = \"$oneday_ago\" 179 | group by app_ver_name 180 | 181 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='001000') 182 | select 'all','all',app_ver_code,'all','all','all',count(1) 183 | where day = \"$oneday_ago\" 184 | group by app_ver_code 185 | 186 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000100') 187 | select 'all','all','all',os_name,'all','all',count(1) 188 | where day = \"$oneday_ago\" 189 | group by os_name 190 | 191 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000010') 192 | select 'all','all','all','all',city ,'all',count(1) 193 | where day = \"$oneday_ago\" 194 | group by city 195 | 196 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000001') 197 | select 'all','all','all','all','all',manufacture,count(1) 198 | where day = \"$oneday_ago\" 199 | group by manufacture 200 | 201 | 202 | --根据业务添加其他的维度分析 203 | 204 | insert into dim_day_new_user_info partition(day=\"$oneday_ago\",dim='000011') 205 | select 'all','all','all','all',city,manufacture,count(1) 206 | where day = \"$oneday_ago\" 207 | group by city,manufacture 208 | 209 | ; 210 | 211 | " 212 | # 213 | #留存计算 214 | #次日留存计算 215 | # 216 | retain_oneday_sql=" 217 | insert into table retain_oneday_ago_info partition 
(day=\"$twodays_ago\") 218 | select 219 | t1.imei , 220 | t1.sdk_ver , 221 | t1.time_zone , 222 | t1.commit_id , 223 | t1.commit_time , 224 | t1.pid , 225 | t1.app_token , 226 | t1.app_id , 227 | t1.device_id , 228 | t1.device_id_type , 229 | t1.release_channel , 230 | t1.app_ver_name , 231 | t1.app_ver_code , 232 | t1.os_name , 233 | t1.os_ver , 234 | t1.language , 235 | t1.country , 236 | t1.manufacture , 237 | t1.device_model , 238 | t1.resolution , 239 | t1.net_type , 240 | t1.account , 241 | t1.app_device_id , 242 | t1.mac , 243 | t1.android_id , 244 | t1.user_id , 245 | t1.cid_sn , 246 | t1.build_num , 247 | t1.mobile_data_type , 248 | t1.promotion_channel , 249 | t1.carrier , 250 | t1.city 251 | from 252 | (select * from etl_day_active_user_info where day =\"$twodays_ago\")t1 253 | left join 254 | (select user_id from etl_day_active_user_info where day =\"$oneday_ago\")t2 255 | on t1.user_id = t2.user_id 256 | where t2.user_id is not null 257 | ; 258 | 259 | " 260 | #..... 261 | #..... 262 | #..... 263 | #执行命令 264 | # 265 | echo "开始执行脚本..." 266 | hadoop jar /home/hadoop/script/app_cleandata.jar outline_calcul.AppDataClean $inpath $outpath 267 | #hadoop MR clean the data --hadoop 执行 清洗数据的jar 268 | echo '清洗数据完成 ,进行数据计算' 269 | # 270 | hive -e " 271 | use etl_tb; 272 | -- append data into table etl_cleared_info 将数据导入数据表 273 | $add_data_sql; 274 | 275 | --calculate DAU(daily active user) 计算日活 276 | $calculate_day_user_active_sql 277 | 278 | --calculate the dimension of DAU 计算日活的维度 279 | $dim_day_user_active_sql 280 | 281 | --calculate DNU(daily new user) 计算日新 282 | $calculate_daily_new_user_sql 283 | 284 | --append 将日新追加到历史用户表中 285 | $append_new_user_tohistory_sql 286 | 287 | --calculate the dimension of DNU 计算日新的维度 288 | $dim_day_new_user_sql 289 | 290 | --calculate the D1U 计算次日留存 291 | $retain_oneday_sql 292 | 293 | 294 | 295 | " || { echo "command failed"; exit 1; } 296 | echo"脚本执行完成...." --------------------------------------------------------------------------------