├── .gitignore ├── .project ├── README.md ├── datasource ├── brand.py ├── command.sh ├── record.py └── user.py ├── etl ├── etl.py ├── loadDataToHive │ ├── loadDataToHive.iml │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── bigdata │ │ └── etl │ │ └── loadDataToHive.java └── start_etl.py ├── flume ├── command │ ├── start_flume_batch.sh │ └── start_flume_realtime.sh └── conf │ ├── flume-conf-logAnalysis-kafka.properties │ ├── flume-conf-logAnalysis.properties │ └── flume-env.sh ├── hadoop ├── .classpath ├── .project ├── command │ ├── start-dfs.sh │ ├── start-historyserver.sh │ ├── start-yarn.sh │ ├── stop-dfs.sh │ ├── stop-historyserver.sh │ └── stop-yarn.sh ├── conf │ ├── capacity-scheduler.xml │ ├── core-site.xml │ ├── hadoop-env.sh │ ├── hdfs-site.xml │ ├── mapred-env.sh │ ├── mapred-site.xml │ ├── slaves │ ├── yarn-env.sh │ └── yarn-site.xml ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── cn │ │ │ └── chinahadoop │ │ │ ├── hdfs │ │ │ └── HdfsExample.java │ │ │ └── mapreduce │ │ │ ├── Grep.java │ │ │ ├── InvertedIndex.java │ │ │ ├── JobFailureTest.java │ │ │ ├── OOMTest.java │ │ │ ├── TaskAttemptTest.java │ │ │ └── WordCount.java │ │ └── resources │ │ ├── input │ │ ├── input_1.txt │ │ └── input_2.txt │ │ └── output │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 └── streaming │ ├── mapper.cpp │ ├── mapper.php │ ├── mapper.sh │ ├── mapper2.cpp │ ├── mapper2.sh │ ├── reducer.cpp │ ├── reducer.php │ ├── reducer.sh │ ├── run_cpp_mr.sh │ ├── run_php_mr.sh │ ├── run_shell_mr.sh │ └── test.txt ├── hbase └── hbase-ingest │ ├── pom.xml │ └── src │ └── main │ └── java │ └── bigdata │ └── hbase │ ├── Ingest.java │ ├── ProfileIngest.java │ ├── Query.java │ └── RecordIngest.java ├── hive ├── README.md ├── command │ ├── add_partition.sql │ ├── age_price_list.sql │ ├── brand_price_list.sql │ ├── create_orc_table.sql │ ├── create_parquet_table.sql │ ├── create_table_brand.sql │ ├── create_table_record.sql │ ├── create_table_user.sql │ ├── employees.sql │ ├── employees_part.sql │ ├── load_data_to_orc.sql │ ├── load_data_to_parquet.sql │ ├── province_prince_list.sql │ ├── skewed.sql │ ├── start-hiveserver2.sh │ ├── start-metastore.sh │ ├── start-mysql.sh │ └── weblog.sql ├── conf │ ├── hive-env.sh │ ├── hive-log4j2.properties │ └── hive-site.xml └── data │ └── employees.txt ├── kafka └── command │ ├── start-kafka.sh │ └── start-zookeeper.sh ├── mysql ├── create_table_brand.sql ├── create_table_user.sql ├── load_table_brand.sql ├── load_table_user.sql └── start-client.txt ├── pom.xml ├── presto ├── command │ ├── age_price_list_presto.sql │ ├── brand_price_list_presto.sql │ ├── gender_brand_rank.sql │ ├── start-presto-client.sh │ ├── start-presto.sh │ └── stop-presto.sh └── conf │ └── etc │ ├── catalog │ └── hive.properties │ ├── config.properties │ ├── jvm.config │ └── node.properties ├── redis └── command │ ├── start-redis-client.sh │ └── start-redis.sh ├── sqoop └── command │ ├── brand_dimension_sqoop.sh │ └── user_dimension_sqoop.sh ├── storm ├── command │ ├── realtime_process.sh │ ├── start-storm-nimbus.sh │ ├── start-storm-supervisor.sh │ └── start-storm-ui.sh ├── conf │ ├── storm-env.sh │ └── storm.yaml └── storm_realtime_process │ ├── pom.xml │ └── src │ └── main │ └── java │ └── bigdata │ └── storm │ ├── ExtractBolt.java │ ├── LogProcessTopology.java │ ├── ProvinceBolt.java │ ├── ProvinceStoreMapper.java │ ├── WebsiteBolt.java │ └── WebsiteStoreMapper.java └── visualization ├── command └── start-web.sh ├── py-echarts ├── main.py ├── models.py ├── 
models.pyc ├── query_presto.py ├── query_presto.pyc ├── query_redis.py ├── query_redis.pyc └── templates │ ├── chart.html │ └── main.html └── result ├── image-1.png ├── image-2.png └── image-3.png /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | .DS_Store 4 | .idea 5 | *.iml 6 | .settings 7 | .class 8 | target/ 9 | .classpath 10 | .project 11 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | hadoop-example 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.m2e.core.maven2Builder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.m2e.core.maven2Nature 16 | 17 | 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bigdata_logAnalysis 2 | 3 | ##Version: 4 | | Component | Version | Download link | 5 | | ------------- |:-------------:|:----------------------------------------------------------------------------------------------------| 6 | | flume | 1.7.0 | http://mirrors.hust.edu.cn/apache/flume/1.7.0/apache-flume-1.7.0-bin.tar.gz | 7 | | hadoop | 2.7.3 | http://apache.fayea.com/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz | 8 | | oozie | 4.2.0 | http://apache.fayea.com/oozie/4.2.0/oozie-4.2.0.tar.gz | 9 | | hive | 2.1.0 | http://mirror.bit.edu.cn/apache/hive/stable-2/apache-hive-2.1.0-bin.tar.gz | 10 | | sqoop | 1.4.6 | http://mirror.bit.edu.cn/apache/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | 11 | | presto | 0.157 | https://repo1.maven.org/maven2/com/facebook/presto/presto-server/0.157/presto-server-0.157.tar.gz | 12 | | presto-client | 0.157 | https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/0.157/presto-cli-0.157-executable.jar | 13 | | kafka | 0.10.1.0 | http://mirrors.tuna.tsinghua.edu.cn/apache/kafka/0.10.1.0/kafka_2.11-0.10.1.0.tgz | 14 | | storm | 1.0.2 | http://mirrors.cnnic.cn/apache/storm/apache-storm-1.0.2/apache-storm-1.0.2.tar.gz | 15 | | redis | 3.2 | http://download.redis.io/releases/redis-3.2.5.tar.gz | 16 | | echarts | 3.3.2 | http://echarts.baidu.com/dist/echarts.min.js | 17 | -------------------------------------------------------------------------------- /datasource/brand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import random,sys 5 | 6 | BRAND_FILE="/home/bigdata/datasource/brand.list" 7 | BRAND={"computer":("APPLE","HP","ACER","LENOVO","DELL","SONY","ASUS"),"telephone":("IPHONE","SAMSUNG","HTC","MOTOROLA","HUAWEI","OPPO","VIVO","XIAOMI","MEIZU"),"television":("HISENSE","SAMSUNG","SKYWORTH","SHARP","HAIER","PHILIPS","TCL"),"sports":("NIKE","ADIDAS","LINING","PUMA","ANTA","MIZUNO","KAPPA","NB","PEAK","361"),"food":("MENGNIU","YILI","GUANGMING","SANYUAN","WULIANGYE","MOUTAI","HONGXING","NIULANSHAN","LANGJIU"),"clothes":("ZARA","HLA","UNIQLO","PEACEBIRD","GXG","SELECTED","SEMIR","SEPTWOLVES","CAMEL"),"cosmetic":("LOREAL","NIVEA","KANS","DHC","CLINIQUE","INNISFREE","MEIFUBAO","OLAY","LANCOME")} 8 | 9 | def get_one_brand(category_list,id): 10 | brand_id="%08d"%id 11 | category_size=len(category_list) 12 | category=category_list[random.randint(0,category_size-1)] 13 | brand_size=len(BRAND[category]) 14 | brand=BRAND[category][random.randint(0,brand_size-1)] 15 | return brand_id+","+category+","+brand 16 | 17 | 18 | 19 | def 
generate_brand(): 20 | category_list=[] 21 | for k in BRAND: 22 | category_list.append(k) 23 | f=open(BRAND_FILE,'w') 24 | for i in range(count): 25 | brand=get_one_brand(category_list,i) 26 | f.write(brand+"\n") 27 | f.close() 28 | 29 | 30 | 31 | 32 | 33 | if __name__ == '__main__': 34 | count=int(sys.argv[1]) 35 | print("start to generate brand data...") 36 | generate_brand() 37 | -------------------------------------------------------------------------------- /datasource/command.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/datasource/user.py 1000 4 | /home/bigdata/datasource/brand.py 1000 5 | /home/bigdata/datasource/record.py 100000 6 | -------------------------------------------------------------------------------- /datasource/record.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from faker import Factory 5 | import random,sys,time,uuid 6 | 7 | USER_FILE="/home/bigdata/datasource/user.list" 8 | BRAND_FILE="/home/bigdata/datasource/brand.list" 9 | RECORD_FILE="/home/bigdata/datasource/record.list" 10 | 11 | WEBSITE_LIST=("TAOBAO","TIANMAO","JUHUASUAN","TIANMAOCHAOSHI") 12 | EXPRESS_LIST=("SHENTONG","SHUNFENG","EMS","YUANTONG","YUNDA","ZHONGTONG") 13 | PROVINCE="BeiJing,ShangHai,TianJin,ChongQing,XiangGang,Aomen,AnHui,FuJian,GuangDong,GuangXi,GuiZhou,GanSu,HaiNan,HeBei,HeNan,HeiLongJiang,HuBei,HuNan,JiLin,JiangSu,JiangXi,LiaoNing,NeiMengGu,NingXia,QingHai,ShanXi1,ShanXi3,ShanDong,SiChuan,TaiWan,XiZang,XinJiang,YunNan,ZheJiang" 14 | PROVINCE_LIST=PROVINCE.split(","); 15 | 16 | def get_one_record(fake,user_list,brand_list,id): 17 | record_id="%010d"%id 18 | user_id=user_list[random.randint(0,len(user_list)-1)] 19 | brand_id=brand_list[random.randint(0,len(brand_list)-1)] 20 | transaction_time=int(time.time()) 21 | price=random.randint(0,1000) 22 | source_province=PROVINCE_LIST[random.randint(0,len(PROVINCE_LIST)-1)] 23 | target_province=PROVINCE_LIST[random.randint(0,len(PROVINCE_LIST)-1)] 24 | website=WEBSITE_LIST[random.randint(0,len(WEBSITE_LIST)-1)] 25 | express=EXPRESS_LIST[random.randint(0,len(EXPRESS_LIST)-1)] 26 | express_id=fake.credit_card_number() 27 | ip=fake.ipv4() 28 | language=fake.language_code() 29 | return record_id+","+user_id+","+brand_id+","+str(transaction_time)+","+str(price)+","+source_province+","+target_province+","+website+","+str(express_id)+","+express+","+ip+","+language 30 | 31 | 32 | 33 | 34 | def generate_record(total): 35 | fake = Factory.create() 36 | user_list=get_user_list() 37 | brand_list=get_brand_list() 38 | f=open(RECORD_FILE,'w') 39 | count=0 40 | while(count 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 
| 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /etl/loadDataToHive/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | bigdata 8 | etl 9 | 1.0-SNAPSHOT 10 | 11 | 1.8 12 | 2.7.3 13 | 2.1.0 14 | 15 | 16 | 17 | org.apache.hadoop 18 | hadoop-common 19 | ${hadoop.version} 20 | 21 | 22 | org.apache.hive 23 | hive-jdbc 24 | ${hive.version} 25 | 26 | 27 | 28 | 29 | 30 | maven-assembly-plugin 31 | 2.3 32 | 33 | dist 34 | true 35 | 36 | jar-with-dependencies 37 | 38 | 39 | 40 | 41 | make-assembly 42 | package 43 | 44 | single 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /etl/loadDataToHive/src/main/java/bigdata/etl/loadDataToHive.java: -------------------------------------------------------------------------------- 1 | package bigdata.etl; 2 | 3 | import java.sql.*; 4 | 5 | /** 6 | * Created by qianxi.zhang on 11/25/16. 7 | */ 8 | public class loadDataToHive { 9 | private static String LOAD_CMD = 10 | "load data inpath '%s' overwrite into table record partition(partition_date='%s',hour_minute='%s')"; 11 | 12 | private static String driverName = "org.apache.hive.jdbc.HiveDriver"; 13 | 14 | public static void loadData(String dataDir, String date, String hour_minute) throws SQLException { 15 | try { 16 | Class.forName(driverName); 17 | } catch (ClassNotFoundException e) { 18 | e.printStackTrace(); 19 | System.exit(1); 20 | } 21 | Connection con = 22 | DriverManager.getConnection("jdbc:hive2://bigdata:10000/default", "bigdata", "bigdata"); 23 | Statement stmt = con.createStatement(); 24 | String sql = String.format(LOAD_CMD, dataDir, date, hour_minute); 25 | stmt.execute(sql); 26 | } 27 | 28 | public static void main(String[] args) throws SQLException { 29 | if (args.length != 3) throw new IllegalArgumentException("need 3 args"); 30 | loadDataToHive.loadData(args[0], args[1], args[2]); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /etl/start_etl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os,time 5 | 6 | INTPUT_PATH="hdfs://bigdata:9000/flume/record/" 7 | OUTPUT_PATH="hdfs://bigdata:9000/etl/record/" 8 | LOAD_CMD="java -cp /home/bigdata/etl/etl-1.0-SNAPSHOT-jar-with-dependencies.jar bigdata.etl.loadDataToHive %s %s %s" 9 | HADOOP_CMD="hadoop jar /home/bigdata/hadoop-2.7.3/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar -D mapred.reduce.tasks=0 -D mapred.map.tasks=1 -input %s -output %s -mapper /home/bigdata/etl/etl.py -file /home/bigdata/etl/etl.py" 10 | 11 | def getCurrentYmdHM(): 12 | time_struct=time.localtime(time.time()-60*10) 13 | 14 | H= time.strftime('%H',time_struct) 15 | M= int(time.strftime('%M',time_struct)) 16 | M= "%02d" % ((M/10)*10) 17 | Ymd=time.strftime('%Y-%m-%d',time_struct) 18 | 19 | return Ymd+"/"+H+M 20 | 21 | def startETL(): 22 | subPath=getCurrentYmdHM() 23 | input=INTPUT_PATH+subPath 24 | output=OUTPUT_PATH+subPath 25 | 26 | hadoop_cmd=HADOOP_CMD %(input, output) 27 | print hadoop_cmd 28 | os.system(hadoop_cmd) 
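    # The streaming job above is map-only (mapred.reduce.tasks=0): etl.py just reshapes
    # the raw records. Its output directory is then handed to the Hive LOAD below, which
    # attaches it as the (partition_date, hour_minute) partition derived from subPath.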
29 | print 'loading data into Hive' 30 | load_cmd= LOAD_CMD %(output,subPath.split("/")[0],subPath.split("/")[1]) 31 | os.system(load_cmd) 32 | 33 | if __name__ == '__main__': 34 | startETL() 35 | -------------------------------------------------------------------------------- /flume/command/start_flume_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | flume-ng agent --conf /home/bigdata/apache-flume-1.7.0-bin/conf --conf-file /home/bigdata/apache-flume-1.7.0-bin/conf/flume-conf-logAnalysis.properties --name logAgent -Dflume.root.logger=DEBUG,console -Dflume.monitoring.type=http -Dflume.monitoring.port=34545 4 | 5 | -------------------------------------------------------------------------------- /flume/command/start_flume_realtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | flume-ng agent --conf /home/bigdata/apache-flume-1.7.0-bin/conf --conf-file /home/bigdata/apache-flume-1.7.0-bin/conf/flume-conf-logAnalysis-kafka.properties --name logAgent -Dflume.root.logger=DEBUG,console -Dflume.monitoring.type=http -Dflume.monitoring.port=34546 4 | -------------------------------------------------------------------------------- /flume/conf/flume-conf-logAnalysis-kafka.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | 19 | # The configuration file needs to define the sources, 20 | # the channels and the sinks. 21 | # Sources, channels and sinks are defined per agent, 22 | # in this case called 'agent' 23 | 24 | logAgent.sources = logSource 25 | logAgent.channels = fileChannel 26 | logAgent.sinks = kafkaSink 27 | 28 | # For each one of the sources, the type is defined 29 | logAgent.sources.logSource.type = exec 30 | logAgent.sources.logSource.command = tail -F /home/bigdata/datasource/record.list 31 | 32 | # The channel can be defined as follows. 33 | logAgent.sources.logSource.channels = fileChannel 34 | 35 | # Each sink's type must be defined 36 | logAgent.sinks.kafkaSink.type = org.apache.flume.sink.kafka.KafkaSink 37 | logAgent.sinks.kafkaSink.topic = log 38 | logAgent.sinks.kafkaSink.brokerList= bigdata:9092 39 | logAgent.sinks.kafkaSink.batchSize= 10 40 | #Specify the channel the sink should use 41 | logAgent.sinks.kafkaSink.channel = fileChannel 42 | 43 | # Each channel's type is defined. 
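# A file channel buffers events on disk so they survive an agent restart; the
# checkpoint and data directories below must be writable by the Flume user.
# Channel sizing can be tuned if needed, e.g. (illustrative values only):
# logAgent.channels.fileChannel.capacity = 1000000
# logAgent.channels.fileChannel.transactionCapacity = 10000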
44 | logAgent.channels.fileChannel.type = file 45 | logAgent.channels.fileChannel.checkpointDir= /home/bigdata/apache-flume-1.7.0-bin/dataCheckpointDir_realtime 46 | logAgent.channels.fileChannel.dataDirs= /home/bigdata/apache-flume-1.7.0-bin/dataDir_realtime 47 | 48 | # Other config values specific to each type of channel(sink or source) 49 | # can be defined as well 50 | # In this case, it specifies the capacity of the memory channel 51 | -------------------------------------------------------------------------------- /flume/conf/flume-conf-logAnalysis.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | 19 | # The configuration file needs to define the sources, 20 | # the channels and the sinks. 21 | # Sources, channels and sinks are defined per agent, 22 | # in this case called 'agent' 23 | 24 | logAgent.sources = logSource 25 | logAgent.channels = fileChannel 26 | logAgent.sinks = hdfsSink 27 | 28 | # For each one of the sources, the type is defined 29 | logAgent.sources.logSource.type = exec 30 | logAgent.sources.logSource.command = tail -F /home/bigdata/datasource/record.list 31 | 32 | # The channel can be defined as follows. 33 | logAgent.sources.logSource.channels = fileChannel 34 | 35 | # Each sink's type must be defined 36 | logAgent.sinks.hdfsSink.type = hdfs 37 | logAgent.sinks.hdfsSink.hdfs.path = hdfs://bigdata:9000/flume/record/%Y-%m-%d/%H%M 38 | logAgent.sinks.hdfsSink.hdfs.filePrefix= transaction_log 39 | logAgent.sinks.hdfsSink.hdfs.rollInterval= 600 40 | logAgent.sinks.hdfsSink.hdfs.rollCount= 10000 41 | logAgent.sinks.hdfsSink.hdfs.rollSize= 0 42 | logAgent.sinks.hdfsSink.hdfs.round = true 43 | logAgent.sinks.hdfsSink.hdfs.roundValue = 10 44 | logAgent.sinks.hdfsSink.hdfs.roundUnit = minute 45 | logAgent.sinks.hdfsSink.hdfs.fileType = DataStream 46 | logAgent.sinks.hdfsSink.hdfs.useLocalTimeStamp = true 47 | #Specify the channel the sink should use 48 | logAgent.sinks.hdfsSink.channel = fileChannel 49 | 50 | # Each channel's type is defined. 
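# As in the realtime agent, a durable file channel is used so events queued for the
# HDFS sink are not lost on restart; note the separate checkpoint/data directories,
# so the batch and realtime agents never share channel state.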
51 | logAgent.channels.fileChannel.type = file 52 | logAgent.channels.fileChannel.checkpointDir= /home/bigdata/apache-flume-1.7.0-bin/dataCheckpointDir 53 | logAgent.channels.fileChannel.dataDirs= /home/bigdata/apache-flume-1.7.0-bin/dataDir 54 | 55 | # Other config values specific to each type of channel(sink or source) 56 | # can be defined as well 57 | # In this case, it specifies the capacity of the memory channel 58 | -------------------------------------------------------------------------------- /flume/conf/flume-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # Give Flume more memory and pre-allocate, enable remote monitoring via JMX 19 | export JAVA_OPTS="-Xms100m -Xmx200m -Dcom.sun.management.jmxremote" 20 | 21 | # Let Flume write raw event data and configuration information to its log files for debugging 22 | # purposes. Enabling these flags is not recommended in production, 23 | # as it may result in logging sensitive user information or encryption secrets. 24 | # $JAVA_OPTS="$JAVA_OPTS -Dorg.apache.flume.log.rawdata=true -Dorg.apache.flume.log.printconfig=true " 25 | 26 | # Foll. classpath will be included in Flume's classpath. 27 | # Note that the Flume conf directory is always included in the classpath. 
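# The HDFS sink needs the Hadoop client classes at runtime; the jar added below must
# match the locally installed Hadoop version (2.7.3 here). Adjust the path if Hadoop
# is upgraded.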
28 | FLUME_CLASSPATH="$HADOOP_HOME/share/hadoop/common/hadoop-common-2.7.3.jar" # Example: "path1;path2;path3" 29 | -------------------------------------------------------------------------------- /hadoop/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /hadoop/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | hadoop-core 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /hadoop/command/start-dfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/start-dfs.sh 4 | -------------------------------------------------------------------------------- /hadoop/command/start-historyserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/mr-jobhistory-daemon.sh start historyserver 4 | 5 | -------------------------------------------------------------------------------- /hadoop/command/start-yarn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/start-yarn.sh 4 | 5 | -------------------------------------------------------------------------------- /hadoop/command/stop-dfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/stop-dfs.sh 4 | -------------------------------------------------------------------------------- /hadoop/command/stop-historyserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/mr-jobhistory-daemon.sh stop historyserver 4 | 5 | -------------------------------------------------------------------------------- /hadoop/command/stop-yarn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/stop-yarn.sh 4 | 5 | -------------------------------------------------------------------------------- /hadoop/conf/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | 17 | yarn.scheduler.capacity.maximum-applications 18 | 10000 19 | 20 | Maximum number of applications that can be pending and running. 21 | 22 | 23 | 24 | 25 | yarn.scheduler.capacity.maximum-am-resource-percent 26 | 0.5 27 | 28 | Maximum percent of resources in the cluster which can be used to run 29 | application masters i.e. controls number of concurrent running 30 | applications. 31 | 32 | 33 | 34 | 35 | yarn.scheduler.capacity.resource-calculator 36 | org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator 37 | 38 | The ResourceCalculator implementation to be used to compare 39 | Resources in the scheduler. 40 | The default i.e. 
DefaultResourceCalculator only uses Memory while 41 | DominantResourceCalculator uses dominant-resource to compare 42 | multi-dimensional resources such as Memory, CPU etc. 43 | 44 | 45 | 46 | 47 | yarn.scheduler.capacity.root.queues 48 | etl,report,default 49 | 50 | The queues at the this level (root is the root queue). 51 | 52 | 53 | 54 | 55 | yarn.scheduler.capacity.root.default.capacity 56 | 40 57 | Default queue target capacity. 58 | 59 | 60 | yarn.scheduler.capacity.root.etl.capacity 61 | 30 62 | etl queue target capacity. 63 | 64 | 65 | yarn.scheduler.capacity.root.report.capacity 66 | 30 67 | report queue target capacity. 68 | 69 | 70 | yarn.scheduler.capacity.root.default.user-limit-factor 71 | 1 72 | 73 | Default queue user limit a percentage from 0.0 to 1.0. 74 | 75 | 76 | 77 | 78 | yarn.scheduler.capacity.root.etl.user-limit-factor 79 | 1 80 | 81 | etl queue user limit a percentage from 0.0 to 1.0. 82 | 83 | 84 | 85 | yarn.scheduler.capacity.root.report.user-limit-factor 86 | 1 87 | 88 | report queue user limit a percentage from 0.0 to 1.0. 89 | 90 | 91 | 92 | yarn.scheduler.capacity.root.default.maximum-capacity 93 | 100 94 | 95 | The maximum capacity of the default queue. 96 | 97 | 98 | 99 | yarn.scheduler.capacity.root.etl.maximum-capacity 100 | 100 101 | 102 | The maximum capacity of the etl queue. 103 | 104 | 105 | 106 | yarn.scheduler.capacity.root.report.maximum-capacity 107 | 100 108 | 109 | The maximum capacity of the report queue. 110 | 111 | 112 | 113 | 114 | yarn.scheduler.capacity.root.default.state 115 | RUNNING 116 | 117 | The state of the default queue. State can be one of RUNNING or STOPPED. 118 | 119 | 120 | 121 | yarn.scheduler.capacity.root.etl.state 122 | RUNNING 123 | 124 | The state of the etl queue. State can be one of RUNNING or STOPPED. 125 | 126 | 127 | 128 | yarn.scheduler.capacity.root.report.state 129 | RUNNING 130 | 131 | The state of the report queue. State can be one of RUNNING or STOPPED. 132 | 133 | 134 | 135 | yarn.scheduler.capacity.root.default.acl_submit_applications 136 | * 137 | 138 | The ACL of who can submit jobs to the default queue. 139 | 140 | 141 | 142 | yarn.scheduler.capacity.root.etl.acl_submit_applications 143 | * 144 | 145 | The ACL of who can submit jobs to the etl queue. 146 | 147 | 148 | 149 | yarn.scheduler.capacity.root.report.acl_submit_applications 150 | * 151 | 152 | The ACL of who can submit jobs to the report queue. 153 | 154 | 155 | 156 | yarn.scheduler.capacity.root.default.acl_administer_queue 157 | * 158 | 159 | The ACL of who can administer jobs on the default queue. 160 | 161 | 162 | 163 | yarn.scheduler.capacity.root.etl.acl_administer_queue 164 | * 165 | 166 | The ACL of who can administer jobs on the etl queue. 167 | 168 | 169 | 170 | yarn.scheduler.capacity.root.report.acl_administer_queue 171 | * 172 | 173 | The ACL of who can administer jobs on the report queue. 174 | 175 | 176 | 177 | yarn.scheduler.capacity.node-locality-delay 178 | 40 179 | 180 | Number of missed scheduling opportunities after which the CapacityScheduler 181 | attempts to schedule rack-local containers. 182 | Typically this should be set to number of nodes in the cluster, By default is setting 183 | approximately number of nodes in one rack which is 40. 
184 | 185 | 186 | 187 | 188 | yarn.scheduler.capacity.queue-mappings 189 | 190 | 191 | A list of mappings that will be used to assign jobs to queues 192 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]* 193 | Typically this list will be used to map users to queues, 194 | for example, u:%user:%user maps all users to queues with the same name 195 | as the user. 196 | 197 | 198 | 199 | 200 | yarn.scheduler.capacity.queue-mappings-override.enable 201 | false 202 | 203 | If a queue mapping is present, will it override the value specified 204 | by the user? This can be used by administrators to place jobs in queues 205 | that are different than the one specified by the user. 206 | The default is false. 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /hadoop/conf/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | fs.defaultFS 21 | hdfs://bigdata:9000 22 | 23 | 24 | hadoop.tmp.dir 25 | /home/bigdata/hadoopdata 26 | 27 | 28 | hadoop.proxyuser.bigdata.hosts 29 | * 30 | 31 | 32 | hadoop.proxyuser.bigdata.groups 33 | * 34 | 35 | 36 | -------------------------------------------------------------------------------- /hadoop/conf/hadoop-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Set Hadoop-specific environment variables here. 18 | 19 | # The only required environment variable is JAVA_HOME. All others are 20 | # optional. When running a distributed configuration it is best to 21 | # set JAVA_HOME in this file, so that it is correctly defined on 22 | # remote nodes. 23 | 24 | # The java implementation to use. 25 | export JAVA_HOME=/usr/java/jdk1.8.0_101/ 26 | 27 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 28 | # that bind to privileged ports to provide authentication of data transfer 29 | # protocol. Jsvc is not required if SASL is configured for authentication of 30 | # data transfer protocol using non-privileged ports. 31 | #export JSVC_HOME=${JSVC_HOME} 32 | 33 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"} 34 | 35 | # Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. 36 | for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do 37 | if [ "$HADOOP_CLASSPATH" ]; then 38 | export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f 39 | else 40 | export HADOOP_CLASSPATH=$f 41 | fi 42 | done 43 | 44 | # The maximum amount of heap to use, in MB. Default is 1000. 45 | export HADOOP_HEAPSIZE=512 46 | #export HADOOP_NAMENODE_INIT_HEAPSIZE="" 47 | 48 | # Extra Java runtime options. Empty by default. 
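# -Djava.net.preferIPv4Stack=true keeps the daemons on IPv4, which avoids binding
# problems on hosts where IPv6 is only partially configured.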
49 | export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true" 50 | 51 | # Command specific options appended to HADOOP_OPTS when specified 52 | export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS" 53 | export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" 54 | 55 | export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS" 56 | 57 | export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS" 58 | export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS" 59 | 60 | # The following applies to multiple commands (fs, dfs, fsck, distcp etc) 61 | export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS" 62 | #HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS" 63 | 64 | # On secure datanodes, user to run the datanode as after dropping privileges. 65 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 66 | # to provide authentication of data transfer protocol. This **MUST NOT** be 67 | # defined if SASL is configured for authentication of data transfer protocol 68 | # using non-privileged ports. 69 | export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} 70 | 71 | # Where log files are stored. $HADOOP_HOME/logs by default. 72 | #export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER 73 | 74 | # Where log files are stored in the secure data environment. 75 | export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} 76 | 77 | ### 78 | # HDFS Mover specific parameters 79 | ### 80 | # Specify the JVM options to be used when starting the HDFS Mover. 81 | # These options will be appended to the options specified as HADOOP_OPTS 82 | # and therefore may override any similar flags set in HADOOP_OPTS 83 | # 84 | # export HADOOP_MOVER_OPTS="" 85 | 86 | ### 87 | # Advanced Users Only! 88 | ### 89 | 90 | # The directory where pid files are stored. /tmp by default. 91 | # NOTE: this should be set to a directory that can only be written to by 92 | # the user that will run the hadoop daemons. Otherwise there is the 93 | # potential for a symlink attack. 94 | export HADOOP_PID_DIR=${HADOOP_PID_DIR} 95 | export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} 96 | 97 | # A string representing this instance of hadoop. $USER by default. 98 | export HADOOP_IDENT_STRING=$USER 99 | -------------------------------------------------------------------------------- /hadoop/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.replication 22 | 1 23 | 24 | 25 | -------------------------------------------------------------------------------- /hadoop/conf/mapred-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # export JAVA_HOME=/home/y/libexec/jdk1.6.0/ 17 | 18 | export HADOOP_JOB_HISTORYSERVER_HEAPSIZE=512 19 | 20 | export HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA 21 | 22 | #export HADOOP_JOB_HISTORYSERVER_OPTS= 23 | #export HADOOP_MAPRED_LOG_DIR="" # Where log files are stored. $HADOOP_MAPRED_HOME/logs by default. 24 | #export HADOOP_JHS_LOGGER=INFO,RFA # Hadoop JobSummary logger. 25 | #export HADOOP_MAPRED_PID_DIR= # The pid files are stored. /tmp by default. 26 | #export HADOOP_MAPRED_IDENT_STRING= #A string representing this instance of hadoop. $USER by default 27 | #export HADOOP_MAPRED_NICENESS= #The scheduling priority for daemons. Defaults to 0. 28 | -------------------------------------------------------------------------------- /hadoop/conf/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapreduce.framework.name 22 | yarn 23 | 24 | 25 | mapreduce.jobhistory.done-dir 26 | /user/history/done 27 | 28 | 29 | mapreduce.jobhistory.intermediate-done-dir 30 | /user/history/done_intermediate 31 | 32 | 33 | -------------------------------------------------------------------------------- /hadoop/conf/slaves: -------------------------------------------------------------------------------- 1 | bigdata 2 | -------------------------------------------------------------------------------- /hadoop/conf/yarn-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # User for YARN daemons 17 | export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn} 18 | 19 | # resolve links - $0 may be a softlink 20 | export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}" 21 | 22 | # some Java parameters 23 | export JAVA_HOME=/usr/java/jdk1.8.0_101/ 24 | if [ "$JAVA_HOME" != "" ]; then 25 | #echo "run java in $JAVA_HOME" 26 | JAVA_HOME=$JAVA_HOME 27 | fi 28 | 29 | if [ "$JAVA_HOME" = "" ]; then 30 | echo "Error: JAVA_HOME is not set." 
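  # Fail fast: none of the YARN daemons can start without a usable JDK.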
31 | exit 1 32 | fi 33 | 34 | JAVA=$JAVA_HOME/bin/java 35 | JAVA_HEAP_MAX=-Xmx512m 36 | 37 | # For setting YARN specific HEAP sizes please use this 38 | # Parameter and set appropriately 39 | # YARN_HEAPSIZE=1000 40 | 41 | # check envvars which might override default args 42 | if [ "$YARN_HEAPSIZE" != "" ]; then 43 | JAVA_HEAP_MAX="-Xmx""$YARN_HEAPSIZE""m" 44 | fi 45 | 46 | # Resource Manager specific parameters 47 | 48 | # Specify the max Heapsize for the ResourceManager using a numerical value 49 | # in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set 50 | # the value to 1000. 51 | # This value will be overridden by an Xmx setting specified in either YARN_OPTS 52 | # and/or YARN_RESOURCEMANAGER_OPTS. 53 | # If not specified, the default value will be picked from either YARN_HEAPMAX 54 | # or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. 55 | #export YARN_RESOURCEMANAGER_HEAPSIZE=1000 56 | 57 | # Specify the max Heapsize for the timeline server using a numerical value 58 | # in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set 59 | # the value to 1000. 60 | # This value will be overridden by an Xmx setting specified in either YARN_OPTS 61 | # and/or YARN_TIMELINESERVER_OPTS. 62 | # If not specified, the default value will be picked from either YARN_HEAPMAX 63 | # or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. 64 | #export YARN_TIMELINESERVER_HEAPSIZE=1000 65 | 66 | # Specify the JVM options to be used when starting the ResourceManager. 67 | # These options will be appended to the options specified as YARN_OPTS 68 | # and therefore may override any similar flags set in YARN_OPTS 69 | #export YARN_RESOURCEMANAGER_OPTS= 70 | 71 | # Node Manager specific parameters 72 | 73 | # Specify the max Heapsize for the NodeManager using a numerical value 74 | # in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set 75 | # the value to 1000. 76 | # This value will be overridden by an Xmx setting specified in either YARN_OPTS 77 | # and/or YARN_NODEMANAGER_OPTS. 78 | # If not specified, the default value will be picked from either YARN_HEAPMAX 79 | # or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. 80 | #export YARN_NODEMANAGER_HEAPSIZE=1000 81 | 82 | # Specify the JVM options to be used when starting the NodeManager. 
83 | # These options will be appended to the options specified as YARN_OPTS 84 | # and therefore may override any similar flags set in YARN_OPTS 85 | #export YARN_NODEMANAGER_OPTS= 86 | 87 | # so that filenames w/ spaces are handled correctly in loops below 88 | IFS= 89 | 90 | 91 | # default log directory & file 92 | if [ "$YARN_LOG_DIR" = "" ]; then 93 | YARN_LOG_DIR="$HADOOP_YARN_HOME/logs" 94 | fi 95 | if [ "$YARN_LOGFILE" = "" ]; then 96 | YARN_LOGFILE='yarn.log' 97 | fi 98 | 99 | # default policy file for service-level authorization 100 | if [ "$YARN_POLICYFILE" = "" ]; then 101 | YARN_POLICYFILE="hadoop-policy.xml" 102 | fi 103 | 104 | # restore ordinary behaviour 105 | unset IFS 106 | 107 | 108 | YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR" 109 | YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR" 110 | YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE" 111 | YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE" 112 | YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME" 113 | YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING" 114 | YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" 115 | YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" 116 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then 117 | YARN_OPTS="$YARN_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" 118 | fi 119 | YARN_OPTS="$YARN_OPTS -Dyarn.policy.file=$YARN_POLICYFILE" 120 | 121 | 122 | -------------------------------------------------------------------------------- /hadoop/conf/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | 18 | 19 | yarn.nodemanager.aux-services 20 | mapreduce_shuffle 21 | 22 | 23 | 24 | yarn.resourcemanager.address 25 | bigdata:18040 26 | 27 | 28 | 29 | yarn.resourcemanager.scheduler.address 30 | bigdata:18030 31 | 32 | 33 | 34 | yarn.resourcemanager.resource-tracker.address 35 | bigdata:18025 36 | 37 | 38 | 39 | yarn.resourcemanager.admin.address 40 | bigdata:18141 41 | 42 | 43 | 44 | yarn.resourcemanager.webapp.address 45 | bigdata:18088 46 | 47 | 48 | yarn.resourcemanager.scheduler.class 49 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 50 | 51 | 52 | -------------------------------------------------------------------------------- /hadoop/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 4.0.0 7 | 8 | hadoop-example 9 | cn.chinahadoop 10 | 0.1.0-SNAPSHOT 11 | ../pom.xml 12 | 13 | 14 | cn.chinahadoop 15 | hadoop-core 16 | 0.1.0-SNAPSHOT 17 | Hadoop Core Examples 18 | hadoop core examples 19 | jar 20 | 21 | 22 | 2.7.3 23 | 24 | 25 | 26 | 27 | org.apache.hadoop 28 | hadoop-client 29 | ${hadoop.version} 30 | 31 | 32 | org.apache.hadoop 33 | hadoop-common 34 | 2.7.3 35 | 36 | 37 | org.apache.hadoop 38 | hadoop-mapreduce-client-core 39 | 2.7.3 40 | 41 | 42 | org.apache.hadoop 43 | hadoop-mapreduce-client-common 44 | 2.7.3 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/hdfs/HdfsExample.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.hdfs; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | 7 | public class HdfsExample { 8 | 9 | public static void testMkdirPath(String path) throws Exception { 10 | FileSystem fs = null; 11 | try { 12 | 
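      // Build a Configuration from the *-site.xml files on the classpath, resolve the
      // FileSystem that owns the given path, and create the directory (mkdirs also
      // creates any missing parent directories).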
System.out.println("Creating " + path + " on hdfs..."); 13 | Configuration conf = new Configuration(); 14 | // First create a new directory with mkdirs 15 | Path myPath = new Path(path); 16 | fs = myPath.getFileSystem(conf); 17 | 18 | fs.mkdirs(myPath); 19 | System.out.println("Create " + path + " on hdfs successfully."); 20 | } catch (Exception e) { 21 | System.out.println("Exception:" + e); 22 | } finally { 23 | if(fs != null) 24 | fs.close(); 25 | } 26 | } 27 | 28 | public static void testDeletePath(String path) throws Exception { 29 | FileSystem fs = null; 30 | try { 31 | System.out.println("Deleting " + path + " on hdfs..."); 32 | Configuration conf = new Configuration(); 33 | Path myPath = new Path(path); 34 | fs = myPath.getFileSystem(conf); 35 | 36 | fs.delete(myPath, true); 37 | System.out.println("Deleting " + path + " on hdfs successfully."); 38 | } catch (Exception e) { 39 | System.out.println("Exception:" + e); 40 | } finally { 41 | if(fs != null) 42 | fs.close(); 43 | } 44 | } 45 | 46 | public static void main(String[] args) { 47 | try { 48 | //String path = "hdfs:namenodehost:8020/test/mkdirs-test"; 49 | String path = "/test/mkdirs-test"; 50 | testMkdirPath(path); 51 | //testDeletePath(path); 52 | } catch (Exception e) { 53 | System.out.println("Exceptions:" + e); 54 | } 55 | System.out.println("timestamp:" + System.currentTimeMillis()); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/Grep.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.util.Random; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.map.InverseMapper; 14 | import org.apache.hadoop.mapreduce.lib.map.RegexMapper; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 17 | import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | import org.apache.hadoop.io.Text; 21 | 22 | public class Grep extends Configured implements Tool { 23 | private Grep() { 24 | } // singleton 25 | 26 | public int run(String[] args) throws Exception { 27 | if (args.length < 3) { 28 | System.out.println("Grep []"); 29 | ToolRunner.printGenericCommandUsage(System.out); 30 | return 2; 31 | } 32 | // the temp dir between two mapreduce jobs 33 | Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); 34 | 35 | Configuration conf = getConf(); 36 | conf.set(RegexMapper.PATTERN, args[2]); 37 | if (args.length == 4) 38 | conf.set(RegexMapper.GROUP, args[3]); 39 | //the first job 40 | // word count 41 | Job grepJob = new Job(conf); 42 | 43 | try { 44 | //define the first job 45 | grepJob.setJobName("grep-search"); 46 | 47 | FileInputFormat.setInputPaths(grepJob, args[0]); 48 | 49 | grepJob.setMapperClass(RegexMapper.class); 50 | 51 | grepJob.setCombinerClass(LongSumReducer.class); 52 | 
grepJob.setReducerClass(LongSumReducer.class); 53 | // output to tempDir 54 | FileOutputFormat.setOutputPath(grepJob, tempDir); 55 | grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); 56 | grepJob.setOutputKeyClass(Text.class); 57 | grepJob.setOutputValueClass(LongWritable.class); 58 | // result: word + count 59 | grepJob.waitForCompletion(true); 60 | //the second job 61 | //sort 62 | Job sortJob = new Job(conf); 63 | sortJob.setJobName("grep-sort"); 64 | //tempDir to input 65 | FileInputFormat.setInputPaths(sortJob, tempDir); 66 | sortJob.setInputFormatClass(SequenceFileInputFormat.class); 67 | 68 | sortJob.setMapperClass(InverseMapper.class); 69 | //just write the sort data out 70 | sortJob.setNumReduceTasks(1); // write a single file 71 | FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); 72 | sortJob.setSortComparatorClass( // sort by decreasing freq 73 | LongWritable.DecreasingComparator.class); 74 | 75 | FileSystem.get(conf).delete(new Path(args[1]),true); 76 | 77 | sortJob.waitForCompletion(true); 78 | } finally { 79 | FileSystem.get(conf).delete(tempDir, true); 80 | } 81 | return 0; 82 | } 83 | 84 | public static void main(String[] args) throws Exception { 85 | int res = ToolRunner.run(new Configuration(), new Grep(), args); 86 | System.exit(res); 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/InvertedIndex.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 12 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | import org.apache.hadoop.util.GenericOptionsParser; 16 | 17 | import java.io.IOException; 18 | import java.util.HashMap; 19 | import java.util.Map; 20 | import java.util.StringTokenizer; 21 | 22 | /** 23 | * Created by qianxi.zhang on 12/17/16. 
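 * The mapper emits (word, source file name) pairs; the reducer then counts how many
 * times each word occurs in each file and writes word with its {file=count, ...} map.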
24 | */ 25 | 26 | public class InvertedIndex { 27 | public static class WordToFileMapper extends Mapper { 28 | @Override 29 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 30 | // Get the name of the file using context.getInputSplit()method 31 | String fileName = ((FileSplit) context.getInputSplit()).getPath().getName(); 32 | // Split the line in words 33 | StringTokenizer itr = new StringTokenizer(value.toString()); 34 | while (itr.hasMoreTokens()) { 35 | // For each word emit word as key and file name as value 36 | context.write(new Text(itr.nextToken()), new Text(fileName)); 37 | } 38 | } 39 | } 40 | 41 | public static class WordToFileCountReducer extends Reducer { 42 | @Override 43 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 44 | // Declare the Hash Map to store File name as key to compute 45 | // and store number of times the filename is occurred for as value 46 | Map map = new HashMap(); 47 | for (Text fileText : values) { 48 | String file = fileText.toString(); 49 | if (map.containsKey(file)) { 50 | map.put(file, map.get(file) + 1); 51 | } else { 52 | map.put(file, 1); 53 | } 54 | } 55 | context.write(key, new Text(map.toString())); 56 | } 57 | } 58 | 59 | public static void main(String[] args) throws Exception { 60 | Configuration conf = new Configuration(); 61 | 62 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 63 | if (otherArgs.length < 2) { 64 | System.err.println("Usage: invertedindex [...] "); 65 | System.exit(2); 66 | } 67 | Job job = Job.getInstance(conf, "invert index"); 68 | job.setJarByClass(InvertedIndex.class); 69 | job.setMapperClass(WordToFileMapper.class); 70 | job.setReducerClass(WordToFileCountReducer.class); 71 | 72 | // Defining the output key and value class for the mapper 73 | job.setMapOutputKeyClass(Text.class); 74 | job.setMapOutputValueClass(Text.class); 75 | 76 | // Defining the output key and value class for the reducer 77 | job.setOutputKeyClass(Text.class); 78 | job.setOutputValueClass(Text.class); 79 | 80 | job.setInputFormatClass(TextInputFormat.class); 81 | job.setOutputFormatClass(TextOutputFormat.class); 82 | 83 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 84 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 85 | 86 | Path outputPath = new Path(otherArgs[1]); 87 | 88 | outputPath.getFileSystem(conf).delete(outputPath); 89 | 90 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 91 | } 92 | } -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/JobFailureTest.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.conf.Configured; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Reducer.Context; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | 21 | public class JobFailureTest extends Configured implements Tool { 22 | public static class TokenizerMapper extends Mapper { 23 | 24 | private final static IntWritable one = new IntWritable(1); 25 | private Text word = new Text(); 26 | 27 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 28 | //get the task id from context 29 | //for the first map task(task id=0), the task always fails 30 | int id = context.getTaskAttemptID().getTaskID().getId(); 31 | System.out.println("id:" + id); 32 | if (id == 0) 33 | System.exit(-1); 34 | StringTokenizer itr = new StringTokenizer(value.toString()); 35 | while (itr.hasMoreTokens()) { 36 | word.set(itr.nextToken()); 37 | context.write(word, one); 38 | } 39 | } 40 | } 41 | 42 | public static class IntSumReducer extends Reducer { 43 | private IntWritable result = new IntWritable(); 44 | 45 | public void reduce(Text key, Iterable values, Context context) 46 | throws IOException, InterruptedException { 47 | int sum = 0; 48 | for (IntWritable val : values) { 49 | sum += val.get(); 50 | } 51 | result.set(sum); 52 | context.write(key, result); 53 | } 54 | } 55 | 56 | public int run(String[] args) throws Exception { 57 | Configuration conf = getConf(); 58 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 59 | if (otherArgs.length < 2) { 60 | System.err.println("Usage: word count job failure test [...] "); 61 | System.exit(2); 62 | } 63 | Job job = new Job(conf, "word count job failure test"); 64 | job.setJarByClass(JobFailureTest.class); 65 | job.setMapperClass(TokenizerMapper.class); 66 | job.setCombinerClass(IntSumReducer.class); 67 | job.setReducerClass(IntSumReducer.class); 68 | job.setOutputKeyClass(Text.class); 69 | job.setOutputValueClass(IntWritable.class); 70 | for (int i = 0; i < otherArgs.length - 1; ++i) { 71 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 72 | } 73 | 74 | Path outputPath = new Path(otherArgs[1]); 75 | 76 | outputPath.getFileSystem(conf).delete(outputPath); 77 | 78 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 79 | return job.waitForCompletion(true) ? 
0 : 1; 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | int res = ToolRunner.run(new Configuration(), new JobFailureTest(), args); 84 | System.exit(res); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/OOMTest.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.Field; 5 | import java.util.StringTokenizer; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.conf.Configured; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | import sun.misc.Unsafe; 21 | 22 | public class OOMTest extends Configured implements Tool { 23 | 24 | public static final int on_heap_length = 1 * 100 * 1000 * 1000; 25 | public static final int off_heap_length = 1000 * 1000 * 1000; 26 | 27 | public static class TokenizerMapper extends Mapper { 28 | private final static IntWritable one = new IntWritable(1); 29 | private Text word = new Text(); 30 | 31 | // allocate on heap space 32 | private byte[] byteArray = new byte[on_heap_length]; 33 | 34 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 35 | Field f = null; 36 | try { 37 | f = Unsafe.class.getDeclaredField("theUnsafe"); 38 | } catch (NoSuchFieldException e) { 39 | e.printStackTrace(); 40 | } 41 | f.setAccessible(true); 42 | Unsafe us = null; 43 | try { 44 | us = (Unsafe) f.get(null); 45 | } catch (IllegalAccessException e) { 46 | e.printStackTrace(); 47 | } 48 | // allocate off heap space 49 | long id = us.allocateMemory(off_heap_length); 50 | 51 | StringTokenizer itr = new StringTokenizer(value.toString()); 52 | while (itr.hasMoreTokens()) { 53 | word.set(itr.nextToken()); 54 | context.write(word, one); 55 | } 56 | } 57 | } 58 | 59 | public static class IntSumReducer extends Reducer { 60 | private IntWritable result = new IntWritable(); 61 | 62 | public void reduce(Text key, Iterable values, Context context) 63 | throws IOException, InterruptedException { 64 | int sum = 0; 65 | for (IntWritable val : values) { 66 | sum += val.get(); 67 | } 68 | result.set(sum); 69 | context.write(key, result); 70 | } 71 | } 72 | 73 | public int run(String[] args) throws Exception { 74 | Configuration conf = getConf(); 75 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 76 | if (otherArgs.length < 2) { 77 | System.err.println("Usage: oom-test [...] 
"); 78 | System.exit(2); 79 | } 80 | Job job = new Job(conf, "oom test"); 81 | job.setJarByClass(OOMTest.class); 82 | job.setMapperClass(TokenizerMapper.class); 83 | job.setCombinerClass(IntSumReducer.class); 84 | job.setReducerClass(IntSumReducer.class); 85 | job.setOutputKeyClass(Text.class); 86 | job.setOutputValueClass(IntWritable.class); 87 | 88 | Path outputPath = new Path(otherArgs[1]); 89 | outputPath.getFileSystem(conf).delete(outputPath); 90 | 91 | for (int i = 0; i < otherArgs.length - 1; ++i) { 92 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 93 | } 94 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 95 | return job.waitForCompletion(true) ? 0 : 1; 96 | } 97 | 98 | public static void main(String[] args) throws Exception { 99 | int res = ToolRunner.run(new Configuration(), new OOMTest(), args); 100 | System.exit(res); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/TaskAttemptTest.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.conf.Configured; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Reducer.Context; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | 21 | public class TaskAttemptTest extends Configured implements Tool { 22 | public static class TokenizerMapper extends Mapper { 23 | 24 | private final static IntWritable one = new IntWritable(1); 25 | private Text word = new Text(); 26 | 27 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 28 | //get the task attempt id 29 | //for the 4 previous attempt for the task, the attempt always fails. 30 | int id = context.getTaskAttemptID().getId(); 31 | System.out.println("id:" + id); 32 | if (id < 4) 33 | System.exit(-1); 34 | StringTokenizer itr = new StringTokenizer(value.toString()); 35 | while (itr.hasMoreTokens()) { 36 | word.set(itr.nextToken()); 37 | context.write(word, one); 38 | } 39 | } 40 | } 41 | 42 | public static class IntSumReducer extends Reducer { 43 | private IntWritable result = new IntWritable(); 44 | 45 | public void reduce(Text key, Iterable values, Context context) 46 | throws IOException, InterruptedException { 47 | int sum = 0; 48 | for (IntWritable val : values) { 49 | sum += val.get(); 50 | } 51 | result.set(sum); 52 | context.write(key, result); 53 | } 54 | } 55 | 56 | public int run(String[] args) throws Exception { 57 | Configuration conf = getConf(); 58 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 59 | if (otherArgs.length < 2) { 60 | System.err.println("Usage: word count attempt test [...] 
"); 61 | System.exit(2); 62 | } 63 | Job job = new Job(conf, "word count attempt test"); 64 | job.setJarByClass(TaskAttemptTest.class); 65 | job.setMapperClass(TokenizerMapper.class); 66 | job.setCombinerClass(IntSumReducer.class); 67 | job.setReducerClass(IntSumReducer.class); 68 | job.setOutputKeyClass(Text.class); 69 | job.setOutputValueClass(IntWritable.class); 70 | for (int i = 0; i < otherArgs.length - 1; ++i) { 71 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 72 | } 73 | 74 | Path outputPath = new Path(otherArgs[1]); 75 | 76 | outputPath.getFileSystem(conf).delete(outputPath); 77 | 78 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 79 | return job.waitForCompletion(true) ? 0 : 1; 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | int res = ToolRunner.run(new Configuration(), new TaskAttemptTest(), args); 84 | System.exit(res); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/WordCount.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | import org.apache.hadoop.util.GenericOptionsParser; 16 | 17 | public class WordCount { 18 | 19 | public static class TokenizerMapper extends Mapper { 20 | 21 | private final static IntWritable one = new IntWritable(1); 22 | private Text word = new Text(); 23 | 24 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 25 | StringTokenizer itr = new StringTokenizer(value.toString()); 26 | while (itr.hasMoreTokens()) { 27 | word.set(itr.nextToken()); 28 | context.write(word, one); 29 | } 30 | } 31 | } 32 | 33 | public static class IntSumReducer extends Reducer { 34 | private IntWritable result = new IntWritable(); 35 | 36 | public void reduce(Text key, Iterable values, Context context) 37 | throws IOException, InterruptedException { 38 | int sum = 0; 39 | for (IntWritable val : values) { 40 | sum += val.get(); 41 | } 42 | result.set(sum); 43 | context.write(key, result); 44 | } 45 | } 46 | 47 | public static void main(String[] args) throws Exception { 48 | Configuration conf = new Configuration(); 49 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 50 | if (otherArgs.length < 2) { 51 | System.err.println("Usage: wordcount [...] 
"); 52 | System.exit(2); 53 | } 54 | Job job = new Job(conf, "word count"); 55 | job.setJarByClass(WordCount.class); 56 | job.setMapperClass(TokenizerMapper.class); 57 | job.setCombinerClass(IntSumReducer.class); 58 | job.setReducerClass(IntSumReducer.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(IntWritable.class); 61 | for (int i = 0; i < otherArgs.length - 1; ++i) { 62 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 63 | } 64 | 65 | Path outputPath = new Path(otherArgs[1]); 66 | 67 | outputPath.getFileSystem(conf).delete(outputPath); 68 | 69 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 70 | System.exit(job.waitForCompletion(true) ? 0 : 1); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /hadoop/src/main/resources/input/input_1.txt: -------------------------------------------------------------------------------- 1 | hello world 2 | I have a dream 3 | all over the world 4 | hello china 5 | -------------------------------------------------------------------------------- /hadoop/src/main/resources/input/input_2.txt: -------------------------------------------------------------------------------- 1 | I have a best friend 2 | She is very thin and kind 3 | She is hard-working 4 | -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crc'H݇ -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/hadoop/src/main/resources/output/_SUCCESS -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/part-r-00000: -------------------------------------------------------------------------------- 1 | I 2 2 | She 2 3 | a 2 4 | all 1 5 | and 1 6 | best 1 7 | china 1 8 | dream 1 9 | friend 1 10 | hard-working 1 11 | have 2 12 | hello 2 13 | is 2 14 | kind 1 15 | over 1 16 | the 1 17 | thin 1 18 | very 1 19 | world 2 20 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper.cpp: -------------------------------------------------------------------------------- 1 | // By dongxicheng, 2 | // blog:http://dongxicheng.org/ 3 | // mapper.cpp 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | using namespace std; 10 | string charArrayToString(char *str) { 11 | stringstream ss(str); 12 | return ss.str(); 13 | } 14 | 15 | vector& split( 16 | const string &s, char delim, vector &elems) { 17 | stringstream ss(s); 18 | string item; 19 | while(getline(ss, item, delim)) { 20 | elems.push_back(item); 21 | } 22 | return elems; 23 | } 24 | 25 | int main(int argc, char *argv[], char *env[]) { 26 | int reduce_task_no = -1; 27 | int iterator = -1; 28 | vector pairs; 29 | for(int i = 0; env[i] != NULL; i++) { 30 | pairs.clear(); 31 | split(charArrayToString(env[i]), '=', pairs); 32 | if(pairs.size() < 2) continue; 33 | if(pairs[0] == 
"mapreduce_job_reduces") // number of reduce tasks 34 | reduce_task_no = atoi(pairs[1].c_str()); 35 | else if(pairs[0] == "mapreduce_iterator_no") // user-defined attribute 36 | iterator = atoi(pairs[1].c_str()); 37 | } 38 | cerr << "mapreduce.job.reduces:" << reduce_task_no 39 | << ",mapreduce.iterator.no:" << iterator << endl; 40 | 41 | string key; 42 | while(cin >> key) { 43 | cout << key << "\t" << "1" << endl; 44 | // Define counter named counter_no in group counter_group 45 | cerr << "reporter:counter:counter_group,counter_no,1\n"; 46 | // dispaly status 47 | cerr << "reporter:status:processing......\n"; 48 | // Print logs for testing 49 | cerr << "This is log, will be printed in stdout file\n"; 50 | } 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/hadoop/streaming/mapper.php -------------------------------------------------------------------------------- /hadoop/streaming/mapper.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | while read LINE; do 3 | for word in $LINE 4 | do 5 | echo "$word 1" 6 | # in streaming, we define counter by 7 | # [reporter:counter:,,] 8 | # define a counter named counter_no, in group counter_group 9 | # increase this counter by 1 10 | # counter shoule be output through stderr 11 | echo "reporter:counter:counter_group,counter_no,1" >&2 12 | echo "reporter:counter:status,processing......" >&2 13 | echo "This is log for testing, will be printed in stdout file" >&2 14 | done 15 | done 16 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper2.cpp: -------------------------------------------------------------------------------- 1 | // By dongxicheng, 2 | // blog:http://dongxicheng.org/ 3 | // mapper.cpp 4 | #include 5 | #include 6 | using namespace std; 7 | 8 | int main() { 9 | string key; 10 | while(cin >> key) { 11 | cout << key << "\t" << "1" << endl; 12 | } 13 | return 0; 14 | } 15 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | while read LINE; do 3 | for word in $LINE 4 | do 5 | echo "$word 1" 6 | done 7 | done 8 | -------------------------------------------------------------------------------- /hadoop/streaming/reducer.cpp: -------------------------------------------------------------------------------- 1 | // By dongxicheng, 2 | // blog:http://dongxicheng.org/ 3 | // reducer.cpp 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | int main() { 9 | string cur_key, last_key, value; 10 | cin >> cur_key >> value; 11 | last_key = cur_key; 12 | int n = 1; 13 | while(cin >> cur_key) { 14 | cin >> value; 15 | if(last_key != cur_key) { 16 | cout << last_key << "\t" << n << endl; 17 | last_key = cur_key; 18 | n = 1; 19 | } else { 20 | n++; 21 | } 22 | } 23 | cout << last_key << "\t" << n << endl; 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /hadoop/streaming/reducer.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/hadoop/streaming/reducer.php -------------------------------------------------------------------------------- /hadoop/streaming/reducer.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | count=0 3 | started=0 4 | word="" 5 | while read LINE;do 6 | newword=`echo $LINE | cut -d ' ' -f 1` 7 | if [ "$word" != "$newword" ];then 8 | [ $started -ne 0 ] && echo "$word\t$count" 9 | word=$newword 10 | count=1 11 | started=1 12 | else 13 | count=$(( $count + 1 )) 14 | fi 15 | done 16 | echo "$word\t$count" 17 | -------------------------------------------------------------------------------- /hadoop/streaming/run_cpp_mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | HADOOP_HOME=/home/hadoop/hadoop-2.7.3 3 | INPUT_PATH=/test/input 4 | OUTPUT_PATH=/test/output 5 | echo "Clearing output path: $OUTPUT_PATH" 6 | $HADOOP_HOME/bin/hadoop fs -rmr $OUTPUT_PATH 7 | 8 | ${HADOOP_HOME}/bin/hadoop jar\ 9 | ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar\ 10 | -D mapred.reduce.tasks=2\ 11 | -files mapper,reducer\ 12 | -input $INPUT_PATH\ 13 | -output $OUTPUT_PATH\ 14 | -mapper mapper\ 15 | -reducer reducer 16 | -------------------------------------------------------------------------------- /hadoop/streaming/run_php_mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | HADOOP_HOME=/home/hadoop/hadoop-2.7.3 3 | INPUT_PATH=/test/input 4 | OUTPUT_PATH=/test/output 5 | echo "Clearing output path: $OUTPUT_PATH" 6 | $HADOOP_HOME/bin/hadoop fs -rmr $OUTPUT_PATH 7 | 8 | ${HADOOP_HOME}/bin/hadoop jar\ 9 | ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar\ 10 | -files mapper.php,reducer.php\ 11 | -input $INPUT_PATH\ 12 | -output $OUTPUT_PATH\ 13 | -mapper "php mapper.php" \ 14 | -reducer "php reducer.php" \ 15 | -------------------------------------------------------------------------------- /hadoop/streaming/run_shell_mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | HADOOP_HOME=/home/hadoop/hadoop-2.7.3 3 | INPUT_PATH=/test/input 4 | OUTPUT_PATH=/test/output 5 | echo "Clearing output path: $OUTPUT_PATH" 6 | $HADOOP_HOME/bin/hadoop fs -rmr $OUTPUT_PATH 7 | 8 | ${HADOOP_HOME}/bin/hadoop jar\ 9 | ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar\ 10 | -files 
mapper.sh,reducer.sh\ 11 | -input $INPUT_PATH\ 12 | -output $OUTPUT_PATH\ 13 | -mapper "sh mapper.sh"\ 14 | -reducer "sh reducer.sh" 15 | -------------------------------------------------------------------------------- /hadoop/streaming/test.txt: -------------------------------------------------------------------------------- 1 | i 2 | have 3 | a 4 | book 5 | you 6 | do 7 | not 8 | have 9 | one 10 | so 11 | i 12 | am 13 | better 14 | than 15 | you 16 | ha 17 | ha 18 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | bigdata.hbase 8 | ingest 9 | 1.0-SNAPSHOT 10 | 11 | 12 | aliyun 13 | http://maven.aliyun.com/nexus/content/groups/public/ 14 | 15 | 16 | 17 | 18 | org.apache.hadoop 19 | hadoop-common 20 | 2.6.0 21 | 22 | 23 | org.apache.hbase 24 | hbase-client 25 | 1.2.4 26 | 27 | 28 | 29 | 30 | 31 | maven-assembly-plugin 32 | 2.3 33 | 34 | dist 35 | true 36 | 37 | jar-with-dependencies 38 | 39 | 40 | 41 | 42 | make-assembly 43 | package 44 | 45 | single 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/Ingest.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FSDataInputStream; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.hbase.HBaseConfiguration; 8 | import org.apache.hadoop.hbase.HColumnDescriptor; 9 | import org.apache.hadoop.hbase.HTableDescriptor; 10 | import org.apache.hadoop.hbase.TableName; 11 | import org.apache.hadoop.hbase.client.*; 12 | import org.apache.hadoop.hbase.util.Bytes; 13 | 14 | import java.io.BufferedReader; 15 | import java.io.IOException; 16 | import java.io.InputStreamReader; 17 | 18 | /** 19 | * Created by qianxi.zhang on 5/1/17. 20 | */ 21 | public abstract class Ingest { 22 | public static final String TABLE_NAME = "user_behavior"; 23 | public static final String SEPARATOR = ","; 24 | //Connection to the cluster. 25 | private Connection connection = null; 26 | //A lightweight handler for a specific table. 27 | private Table table = null; 28 | public static final String FAMILY_NAME_P = "p"; 29 | public static final String FAMILY_NAME_B = "b"; 30 | 31 | public static Configuration getHBaseConfiguration() { 32 | Configuration conf = HBaseConfiguration.create(); 33 | conf.set("hbase.zookeeper.quorum", 34 | "bigdata"); 35 | conf.set("zookeeper.znode.parent", "/hbase"); 36 | 37 | return conf; 38 | } 39 | 40 | public void init() throws IOException { 41 | //establish the connection to the cluster. 
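// Note: Connection is the heavyweight, thread-safe handle to the cluster and is meant to
// be created once and shared, while the Table obtained from it is a lightweight,
// non-thread-safe wrapper; both are closed again in shutdown().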
42 | connection = ConnectionFactory.createConnection(getHBaseConfiguration()); 43 | //retrieve a handler to the target table 44 | table = connection.getTable(TableName.valueOf(TABLE_NAME)); 45 | } 46 | 47 | public void shutdown() throws IOException { 48 | if (table != null) { 49 | table.close(); 50 | } 51 | if (connection != null) { 52 | connection.close(); 53 | } 54 | } 55 | 56 | public void createTable() throws IOException { 57 | Admin admin = connection.getAdmin(); 58 | 59 | if (!admin.tableExists(TableName.valueOf(TABLE_NAME))) { 60 | HTableDescriptor tableDescriptor = new HTableDescriptor(TableName.valueOf(TABLE_NAME)); 61 | HColumnDescriptor columnDescriptor_1 = new HColumnDescriptor(Bytes.toBytes(FAMILY_NAME_P)); 62 | HColumnDescriptor columnDescriptor_2 = new HColumnDescriptor(Bytes.toBytes(FAMILY_NAME_B)); 63 | columnDescriptor_1.setMaxVersions(1); 64 | columnDescriptor_2.setMaxVersions(1000); 65 | tableDescriptor.addFamily(columnDescriptor_1); 66 | tableDescriptor.addFamily(columnDescriptor_2); 67 | admin.createTable(tableDescriptor); 68 | } 69 | } 70 | 71 | public void ingest(String path) throws IOException { 72 | init(); 73 | createTable(); 74 | FileSystem fs = null; 75 | Configuration conf = new Configuration(); 76 | Path myPath = new Path(path); 77 | fs = myPath.getFileSystem(conf); 78 | FSDataInputStream hdfsInStream = fs.open(new Path(path)); 79 | BufferedReader in = null; 80 | in = new BufferedReader(new InputStreamReader(hdfsInStream)); 81 | String line = null; 82 | while ((line = in.readLine()) != null) { 83 | System.out.println(line); 84 | Put put = process(line); 85 | //send the data 86 | table.put(put); 87 | } 88 | if (in != null) { 89 | in.close(); 90 | } 91 | shutdown(); 92 | } 93 | 94 | abstract public Put process(String line); 95 | 96 | } 97 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/ProfileIngest.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.hbase.client.Put; 4 | import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException; 5 | import org.apache.hadoop.hbase.util.Bytes; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Created by qianxi.zhang on 5/2/17. 
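 *
 * Ingests the user profile file: each CSV line is expected to hold
 * uid,name,gender,birth,province; the uid becomes the row key and the remaining
 * fields are stored in column family "p".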
11 | */ 12 | public class ProfileIngest extends Ingest { 13 | public static final String QUALIFIER_NAME_P_NAME = "name"; 14 | public static final String QUALIFIER_NAME_P_GENDER = "gender"; 15 | public static final String QUALIFIER_NAME_P_BIRTH = "birth"; 16 | public static final String QUALIFIER_NAME_P_PROVINCE = "province"; 17 | 18 | public Put process(String line) { 19 | String[] attributes = line.split(SEPARATOR); 20 | Put put = new Put(Bytes.toBytes(attributes[0])); 21 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_NAME), Bytes.toBytes(attributes[1])); 22 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_GENDER), Bytes.toBytes(attributes[2])); 23 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_BIRTH), Bytes.toBytes(attributes[3])); 24 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_PROVINCE), Bytes.toBytes(attributes[4])); 25 | 26 | return put; 27 | } 28 | 29 | public static void main(String[] args) throws IOException { 30 | if (args == null || args.length != 1) 31 | throw new IllegalArgumentIOException("path should be offered"); 32 | ProfileIngest ingest = new ProfileIngest(); 33 | ingest.ingest(args[0]); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/Query.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.HBaseConfiguration; 5 | import org.apache.hadoop.hbase.TableName; 6 | import org.apache.hadoop.hbase.client.*; 7 | import org.apache.hadoop.hbase.util.Bytes; 8 | 9 | import java.io.IOException; 10 | 11 | import static bigdata.hbase.Ingest.TABLE_NAME; 12 | 13 | /** 14 | * Created by qianxi.zhang on 5/2/17. 15 | */ 16 | public class Query { 17 | public static final String TABLE_NAME = "user_behavior"; 18 | public static final String FAMILY_NAME_P = "p"; 19 | public static final String FAMILY_NAME_B = "b"; 20 | public static final String QUALIFIER_NAME_B_RID = "rid"; 21 | 22 | public static Configuration getHBaseConfiguration() { 23 | Configuration conf = HBaseConfiguration.create(); 24 | conf.set("hbase.zookeeper.quorum", 25 | "bigdata"); 26 | conf.set("zookeeper.znode.parent", "/hbase"); 27 | 28 | return conf; 29 | } 30 | 31 | public void process() throws IOException { 32 | //establish the connection to the cluster. 33 | Connection connection = ConnectionFactory.createConnection(); 34 | //retrieve a handler to the target table 35 | Table table = connection.getTable(TableName.valueOf(TABLE_NAME)); 36 | 37 | Scan scan = new Scan(); 38 | scan.addColumn(Bytes.toBytes(FAMILY_NAME_B), Bytes.toBytes(QUALIFIER_NAME_B_RID)); 39 | scan.setMaxVersions(1000); 40 | scan.setCaching(100); 41 | ResultScanner results = table.getScanner(scan); 42 | 43 | for (Result result : results) { 44 | System.out.println(Bytes.toString(result.getRow()) + " : " + (result.isEmpty() ? 
0 : result.listCells().size())); 45 | } 46 | table.close(); 47 | connection.close(); 48 | } 49 | 50 | public static void main(String[] args) throws IOException { 51 | Query query = new Query(); 52 | query.process(); 53 | } 54 | 55 | } 56 |
-------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/RecordIngest.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.hbase.client.Put; 4 | import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException; 5 | import org.apache.hadoop.hbase.util.Bytes; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Created by qianxi.zhang on 5/2/17. 11 | */ 12 | public class RecordIngest extends Ingest { 13 | public static final String QUALIFIER_NAME_B_RID = "rid"; 14 | 15 | public Put process(String line) { 16 | String[] attributes = line.split(SEPARATOR); 17 | Put put = new Put(Bytes.toBytes(attributes[1])); 18 | put.addColumn(Bytes.toBytes(FAMILY_NAME_B), Bytes.toBytes(QUALIFIER_NAME_B_RID), Long.valueOf(attributes[3]), Bytes.toBytes(attributes[0])); 19 | return put; 20 | } 21 | 22 | public static void main(String[] args) throws IOException { 23 | if (args == null || args.length != 1) 24 | throw new IllegalArgumentIOException("path should be offered"); 25 | RecordIngest ingest = new RecordIngest(); 26 | ingest.ingest(args[0]); 27 | } 28 | } 29 |
-------------------------------------------------------------------------------- /hive/README.md: -------------------------------------------------------------------------------- 1 | # Hive Lab Manual 2 | 3 | ### 0. Download the Git project required for this lab 4 | 5 | ``` 6 | cd /home/bigdata 7 | git clone https://github.com/bigdataguide/hadooptraining.git 8 | cd hadooptraining/hive 9 | ``` 10 | The hive directory contains three subdirectories: 11 | * conf: the Hive configuration files 12 | * command: commands for starting the Hive services and the SQL statements used in the lab 13 | * data: external data files that can be loaded directly into Hive tables 14 | 15 | **Note: this lab uses bigdata as the user home directory; replace it with your own home directory here and below.** 16 | 17 | 18 | ### 1. Download and install Hadoop 19 | 20 | Refer to the earlier course material. Below we assume Hadoop is installed in /home/bigdata/hadoop-2.7.3; if you installed it somewhere else, replace it with your own directory. 21 | 22 | ### 2. Configure MySQL 23 | ``` 24 | #Ubuntu 25 | sudo apt-get install mysql libmysql-java 26 | #CentOS 27 | sudo yum install mysql mysql-connector-java 28 | #Start MySQL 29 | sudo service mysqld start 30 | ``` 31 | 32 | ### 3. Install Hive 33 | 34 | #### 3.1 Download the Hive binary package 35 | ``` 36 | wget http://apache.mirrors.pair.com/hive/hive-2.1.1/apache-hive-2.1.1-bin.tar.gz -P /tmp/ 37 | #Extract Hive into the working directory 38 | tar -zxvf /tmp/apache-hive-2.1.1-bin.tar.gz -C /home/bigdata/ 39 | cd /home/bigdata/apache-hive-2.1.1-bin/ 40 | ``` 41 | 42 | #### 3.2 Configure Hive: copy the configuration files from the Git project into the Hive directory 43 | ``` 44 | cp /home/bigdata/hadooptraining/hive/conf/hive-env.sh /home/bigdata/apache-hive-2.1.1-bin/conf/ 45 | cp /home/bigdata/hadooptraining/hive/conf/hive-site.xml /home/bigdata/apache-hive-2.1.1-bin/conf/ 46 | ``` 47 | **See the appendix for an explanation of the configuration files; adjust the home directory to your actual environment.** 48 | 49 | #### 3.3 Start the Hive components 50 | ``` 51 | export HADOOP_HOME=/home/bigdata/hadoop-2.7.3 52 | export PATH=/home/bigdata/apache-hive-2.1.1-bin:$PATH 53 | #Start the MetaStore server 54 | nohup hive --service metastore >> /home/bigdata/apache-hive-2.1.0-bin/logs/metastore.log 2>&1 & 55 | #Start HiveServer2 56 | nohup hive --service hiveserver2 >> /home/bigdata/apache-hive-2.1.0-bin/logs/hive.log 2>&1 & 57 | ```
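Before moving on, it is worth checking that both services are actually listening; the ports below are the defaults set in hive-site.xml (9083 for the metastore, 10000 for HiveServer2), so adjust them if you changed the configuration:
```
# both daemons can take a few seconds to come up
ss -lnt | grep -E '9083|10000'
# if a port is missing, check the corresponding log
tail -n 50 /home/bigdata/apache-hive-2.1.0-bin/logs/metastore.log
```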
58 | ### 4. Start Hive 59 | #### 4.1 Start the Hive CLI 60 | ``` 61 | hive 62 | ``` 63 | #### 4.2 Start the Beeline CLI 64 | ``` 65 | beeline -n bigdata -pbigdata -u "jdbc:hive2://localhost:10000/default;auth=noSasl" 66 | #or 67 | beeline 68 | beeline> !connect jdbc:hive2://localhost:10000/default bigdata bigdata 69 | ``` 70 | 71 | ### Appendix: Configuration file notes 72 | In hive-env.sh we configure the HADOOP_HOME directory; replace the home directory with your own: 73 | HADOOP_HOME=/home/bigdata/hadoop-2.7.3 74 | 75 | In hive-site.xml we configure: 76 | 1) MySQL as the metastore database 77 | ```xml 78 | <property> 79 | <name>javax.jdo.option.ConnectionURL</name> 80 | <value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value> 81 | </property> 82 | <property> 83 | <name>javax.jdo.option.ConnectionDriverName</name> 84 | <value>com.mysql.jdbc.Driver</value> 85 | </property> 86 | <property> 87 | <name>javax.jdo.option.ConnectionUserName</name> 88 | <value>root</value> 89 | </property> 90 | <property> 91 | <name>javax.jdo.option.ConnectionPassword</name> 92 | <value>root</value> 93 | </property> 94 | ``` 95 | 2) The Hive warehouse location on HDFS 96 | ```xml 97 | <property> 98 | <name>hive.metastore.warehouse.dir</name> 99 | <value>/warehouse</value> 100 | </property> 101 | <property> 102 | <name>fs.defaultFS</name> 103 | <value>hdfs://bigdata:9000</value> 104 | </property> 105 | ``` 106 | 3) The metastore port 107 | ```xml 108 | <property> 109 | <name>hive.metastore.uris</name> 110 | <value>thrift://bigdata:9083</value> 111 | </property> 112 | ``` 113 | 4) The HiveServer2 port 114 | ```xml 115 | <property> 116 | <name>hive.server2.thrift.port</name> 117 | <value>10000</value> 118 | </property> 119 | <property> 120 | <name>beeline.hs2.connection.user</name> 121 | <value>bigdata</value> 122 | </property> 123 | <property> 124 | <name>beeline.hs2.connection.password</name> 125 | <value>bigdata</value> 126 | </property> 127 | ``` 128 | 5) In addition, automatic creation of the metastore database and tables 129 | ```xml 130 | <property> 131 | <name>datanucleus.autoCreateSchema</name> 132 | <value>true</value> 133 | </property> 134 | <property> 135 | <name>datanucleus.autoStartMechanism</name> 136 | <value>SchemaTable</value> 137 | </property> 138 | <property> 139 | <name>datanucleus.schema.autoCreateTables</name> 140 | <value>true</value> 141 | </property> 142 | ``` 143 |
-------------------------------------------------------------------------------- /hive/command/add_partition.sql: -------------------------------------------------------------------------------- 1 | 2 | load data inpath "hdfs://bigdata:9000/etl/record/2016-11-24/2300" overwrite into table record partition(partition_date="2016-11-24",hour_minute="2300") 3 |
-------------------------------------------------------------------------------- /hive/command/age_price_list.sql: -------------------------------------------------------------------------------- 1 | select cast(DATEDIFF(CURRENT_DATE, birth)/365 as int) as age, 2 | sum(price) as totalPrice 3 | from record join user_dimension on record.uid=user_dimension.uid 4 | group by cast(DATEDIFF(CURRENT_DATE, birth)/365 as int) 5 | order by totalPrice desc; 6 |
-------------------------------------------------------------------------------- /hive/command/brand_price_list.sql: -------------------------------------------------------------------------------- 1 | select brand,sum(price) as totalPrice 2 | from record join brand_dimension on record.bid=brand_dimension.bid 3 | group by brand_dimension.brand 4 | order by totalPrice desc; 5 |
-------------------------------------------------------------------------------- /hive/command/create_orc_table.sql: -------------------------------------------------------------------------------- 1 | create table if not exists record_orc ( 2 | rid STRING, 3 | uid STRING, 4 | bid STRING, 5 | trancation_date TIMESTAMP, 6 | price INT, 7 | source_province STRING, 8 | target_province STRING, 9 | site STRING, 10 | express_number STRING, 11 | express_company STRING 12 | ) 13 | PARTITIONED BY ( 14 | partition_date STRING, 15 | hour_minute STRING 16 | ) 17 | STORED AS ORC; 18 | -------------------------------------------------------------------------------- /hive/command/create_parquet_table.sql:
-------------------------------------------------------------------------------- 1 | create table if not exists record_parquet ( 2 | rid STRING, 3 | uid STRING, 4 | bid STRING, 5 | trancation_date TIMESTAMP, 6 | price INT, 7 | source_province STRING, 8 | target_province STRING, 9 | site STRING, 10 | express_number STRING, 11 | express_company STRING 12 | ) 13 | PARTITIONED BY ( 14 | partition_date STRING, 15 | hour_minute STRING 16 | ) 17 | STORED AS PARQUET; 18 | -------------------------------------------------------------------------------- /hive/command/create_table_brand.sql: -------------------------------------------------------------------------------- 1 | create external table if not exists brand_dimension ( 2 | bid STRING, 3 | category STRING, 4 | brand STRING 5 | )ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY ',' 7 | location 'hdfs://bigdata:9000/warehouse/brand_dimension' 8 | ; 9 | -------------------------------------------------------------------------------- /hive/command/create_table_record.sql: -------------------------------------------------------------------------------- 1 | create table if not exists record ( 2 | rid STRING, 3 | uid STRING, 4 | bid STRING, 5 | trancation_date TIMESTAMP, 6 | price INT, 7 | source_province STRING, 8 | target_province STRING, 9 | site STRING, 10 | express_number STRING, 11 | express_company STRING 12 | ) 13 | PARTITIONED BY ( 14 | partition_date STRING, 15 | hour INT 16 | ) 17 | ROW FORMAT DELIMITED 18 | FIELDS TERMINATED BY ',' 19 | -------------------------------------------------------------------------------- /hive/command/create_table_user.sql: -------------------------------------------------------------------------------- 1 | create external table if not exists user_dimension ( 2 | uid STRING, 3 | name STRING, 4 | gender STRING, 5 | birth DATE, 6 | province STRING 7 | )ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY ',' 9 | location 'hdfs://bigdata:9000/warehouse/user_dimension' 10 | ; 11 | -------------------------------------------------------------------------------- /hive/command/employees.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS employees ( 2 | name STRING, 3 | salary FLOAT, 4 | subordinates ARRAY, 5 | decutions MAP, 6 | address STRUCT 7 | ) 8 | ROW FORMAT DELIMITED 9 | FIELDS TERMINATED BY '\001' 10 | COLLECTION ITEMS TERMINATED BY '\002' 11 | MAP KEYS TERMINATED BY '\003' 12 | LINES TERMINATED BY '\n' 13 | STORED AS TEXTFILE; 14 | 15 | -- LOAD DATA LOCAL INPATH '/home/bigdata/hadooop/training/hive/data/employees.txt' OVERWRITE INTO TABLE employees; 16 | -------------------------------------------------------------------------------- /hive/command/employees_part.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS employees_part ( 2 | name STRING, 3 | salary FLOAT, 4 | subordinates ARRAY, 5 | decutions MAP, 6 | address STRUCT 7 | ) 8 | PARTITIONED BY (state STRING) 9 | ROW FORMAT DELIMITED 10 | FIELDS TERMINATED BY '\001' 11 | COLLECTION ITEMS TERMINATED BY '\002' 12 | MAP KEYS TERMINATED BY '\003' 13 | LINES TERMINATED BY '\n' 14 | STORED AS TEXTFILE; 15 | 16 | -- LOAD DATA LOCAL INPATH '/home/bigdata/hadooptraining/hive/data/employees.txt' 17 | -- OVERWRITE INTO TABLE employees_part PARTITION(state='IL'); 18 | 19 | --INSERT INTO TABLE employees_part PARTITION(state = 'IL') 20 | --SELECT * FROM employees where address.state='IL'; 21 | 22 | -- FROM employees e 23 | -- INSERT OVERWRITE TABLE 
employees_part PARTITION(state = 'IL') SELECT e.* where e.address.state='IL' 24 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'CA') SELECT e.* where e.address.state='CA' 25 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'NY') SELECT e.* where e.address.state='NY'; 26 | 27 | FROM employees e 28 | INSERT OVERWRITE TABLE employees_part PARTITION(state) SELECT e.*,e.address.state 29 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'CA') SELECT * where e.address.state='CA' 30 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'NY') SELECT * where e.address.state='NY'; 31 | -------------------------------------------------------------------------------- /hive/command/load_data_to_orc.sql: -------------------------------------------------------------------------------- 1 | set hive.exec.dynamic.partition.mode=nonstrict; 2 | insert into table record_orc partition(partition_date,hour_minute) select * from record; 3 | -------------------------------------------------------------------------------- /hive/command/load_data_to_parquet.sql: -------------------------------------------------------------------------------- 1 | set hive.exec.dynamic.partition.mode=nonstrict; 2 | insert into table record_parquet partition(partition_date,hour_minute) select * from record; 3 | -------------------------------------------------------------------------------- /hive/command/province_prince_list.sql: -------------------------------------------------------------------------------- 1 | select province,sum(price) as totalPrice 2 | from record join user_dimension on record.uid=user_dimension.uid 3 | group by user_dimension.province 4 | order by totalPrice desc; 5 | -------------------------------------------------------------------------------- /hive/command/skewed.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE T1(key STRING, val STRING) 2 | SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE; 3 | -------------------------------------------------------------------------------- /hive/command/start-hiveserver2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup hive --service hiveserver2 >> /home/bigdata/apache-hive-2.1.0-bin/logs/hive.log 2>&1 & 4 | -------------------------------------------------------------------------------- /hive/command/start-metastore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup hive --service metastore >> /home/bigdata/apache-hive-2.1.0-bin/logs/hive.log 2>&1 & 4 | -------------------------------------------------------------------------------- /hive/command/start-mysql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo service mysqld start 4 | -------------------------------------------------------------------------------- /hive/command/weblog.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS weblog ( 2 | user_id INT, 3 | url STRING, 4 | source_ip STRING 5 | ) PARTITIONED BY (dt STRING) 6 | CLUSTERED BY (user_id) INTO 96 BUCKETS; 7 | 8 | -------------------------------------------------------------------------------- /hive/conf/hive-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license 
agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Set Hive and Hadoop environment variables here. These variables can be used 18 | # to control the execution of Hive. It should be used by admins to configure 19 | # the Hive installation (so that users do not have to set environment variables 20 | # or set command line parameters to get correct behavior). 21 | # 22 | # The hive service being invoked (CLI/HWI etc.) is available via the environment 23 | # variable SERVICE 24 | 25 | 26 | # Hive Client memory usage can be an issue if a large number of clients 27 | # are running at the same time. The flags below have been useful in 28 | # reducing memory usage: 29 | # 30 | # if [ "$SERVICE" = "cli" ]; then 31 | # if [ -z "$DEBUG" ]; then 32 | # export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:+UseParNewGC -XX:-UseGCOverheadLimit" 33 | # else 34 | # export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit" 35 | # fi 36 | # fi 37 | 38 | # The heap size of the jvm stared by hive shell script can be controlled via: 39 | # 40 | export HADOOP_HEAPSIZE=512 41 | # 42 | # Larger heap size may be required when running queries over large number of files or partitions. 43 | # By default hive shell scripts use a heap size of 256 (MB). Larger heap size would also be 44 | # appropriate for hive server (hwi etc). 45 | 46 | 47 | # Set HADOOP_HOME to point to a specific hadoop install directory 48 | # HADOOP_HOME=${bin}/../../hadoop 49 | HADOOP_HOME=/home/bigdata/hadoop-2.7.3 50 | 51 | # Hive Configuration Directory can be controlled by: 52 | # export HIVE_CONF_DIR= 53 | 54 | # Folder containing extra ibraries required for hive compilation/execution can be controlled by: 55 | # export HIVE_AUX_JARS_PATH= 56 | -------------------------------------------------------------------------------- /hive/conf/hive-log4j2.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | status = INFO 18 | name = HiveLog4j2 19 | packages = org.apache.hadoop.hive.ql.log 20 | 21 | # list of properties 22 | property.hive.log.level = INFO 23 | property.hive.root.logger = DRFA 24 | property.hive.log.dir = /home/bigdata/apache-hive-2.1.0-bin/logs 25 | property.hive.log.file = hive.log 26 | property.hive.perflogger.log.level = INFO 27 | 28 | # list of all appenders 29 | appenders = console, DRFA 30 | 31 | # console appender 32 | appender.console.type = Console 33 | appender.console.name = console 34 | appender.console.target = SYSTEM_ERR 35 | appender.console.layout.type = PatternLayout 36 | appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n 37 | 38 | # daily rolling file appender 39 | appender.DRFA.type = RollingRandomAccessFile 40 | appender.DRFA.name = DRFA 41 | appender.DRFA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file} 42 | # Use %pid in the filePattern to append @ to the filename if you want separate log files for different CLI session 43 | appender.DRFA.filePattern = ${sys:hive.log.dir}/${sys:hive.log.file}.%d{yyyy-MM-dd} 44 | appender.DRFA.layout.type = PatternLayout 45 | appender.DRFA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n 46 | appender.DRFA.policies.type = Policies 47 | appender.DRFA.policies.time.type = TimeBasedTriggeringPolicy 48 | appender.DRFA.policies.time.interval = 1 49 | appender.DRFA.policies.time.modulate = true 50 | appender.DRFA.strategy.type = DefaultRolloverStrategy 51 | appender.DRFA.strategy.max = 30 52 | 53 | # list of all loggers 54 | loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, PerfLogger 55 | 56 | logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn 57 | logger.NIOServerCnxn.level = WARN 58 | 59 | logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO 60 | logger.ClientCnxnSocketNIO.level = WARN 61 | 62 | logger.DataNucleus.name = DataNucleus 63 | logger.DataNucleus.level = ERROR 64 | 65 | logger.Datastore.name = Datastore 66 | logger.Datastore.level = ERROR 67 | 68 | logger.JPOX.name = JPOX 69 | logger.JPOX.level = ERROR 70 | 71 | logger.PerfLogger.name = org.apache.hadoop.hive.ql.log.PerfLogger 72 | logger.PerfLogger.level = ${sys:hive.perflogger.log.level} 73 | 74 | # root logger 75 | rootLogger.level = ${sys:hive.log.level} 76 | rootLogger.appenderRefs = root 77 | rootLogger.appenderRef.root.ref = ${sys:hive.root.logger} 78 | -------------------------------------------------------------------------------- /hive/conf/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 |         hive.metastore.uris 21 |         thrift://localhost:9083 22 | 23 | 24 |     hive.server2.thrift.port 25 |     10000 26 | 27 | 28 | javax.jdo.option.ConnectionURL 29 | jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true 30 | 31 | 32 | javax.jdo.option.ConnectionDriverName 33 | com.mysql.jdbc.Driver 34 | 35 | 36 | javax.jdo.option.ConnectionUserName 37 | root 38 | 39 | 40 | javax.jdo.option.ConnectionPassword 41 | root 42 | 43 | 44 | hive.metastore.warehouse.dir 45 | /warehouse 46 | 47 | 48 | fs.defaultFS 49 | hdfs://localhost:9000 50 | 51 | 52 | datanucleus.autoCreateSchema 53 | true 54 | 55 | 56 | datanucleus.autoStartMechanism 57 | SchemaTable 58 | 59 | 60 | datanucleus.schema.autoCreateTables 61 | true 62 | 63 | 64 | 65 | 
beeline.hs2.connection.user 66 | bigdata 67 | 68 | 69 | beeline.hs2.connection.password 70 | bigdata 71 | 72 | 73 | -------------------------------------------------------------------------------- /hive/data/employees.txt: -------------------------------------------------------------------------------- 1 | John Doe100000.0Mary SmithTodd JonesFederal Taxes.2State Taxes.05Insurance.11 Michigan Ave.ChicagoIL60600 2 | Mary Smith80000.0Bill KingFederal Taxes.2State Taxes.05Insurance.1100 Ontario St.ChicagoIL60601 3 | Todd Jones70000.0Federal Taxes.15State Taxes.03Insurance.1200 Chicago Ave.Oak ParkIL60700 4 | Bill King60000.0Federal Taxes.15State Taxes.03Insurance.1300 Obscure Dr.ObscuriaIL60100 5 | -------------------------------------------------------------------------------- /kafka/command/start-kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/kafka_2.11-0.10.1.0/bin/kafka-server-start.sh -daemon /home/bigdata/kafka_2.11-0.10.1.0/config/server.properties 4 | -------------------------------------------------------------------------------- /kafka/command/start-zookeeper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/kafka_2.11-0.10.1.0/bin/zookeeper-server-start.sh -daemon /home/bigdata/kafka_2.11-0.10.1.0/config/zookeeper.properties 4 | -------------------------------------------------------------------------------- /mysql/create_table_brand.sql: -------------------------------------------------------------------------------- 1 | create table log.brand_dimension( 2 | bid varchar(255), 3 | category varchar(255), 4 | brand varchar(255), 5 | primary key (bid) 6 | ); 7 | -------------------------------------------------------------------------------- /mysql/create_table_user.sql: -------------------------------------------------------------------------------- 1 | create table log.user_dimension( 2 | uid varchar(255), 3 | name varchar(255), 4 | gender varchar(255), 5 | birth date, 6 | province varchar(255), 7 | primary key (uid) 8 | ) 9 | -------------------------------------------------------------------------------- /mysql/load_table_brand.sql: -------------------------------------------------------------------------------- 1 | 2 | LOAD DATA LOCAL INFILE '/home/bigdata/datasource/brand.list' INTO TABLE log.brand_dimension 3 | FIELDS TERMINATED BY ',' ENCLOSED BY '"' 4 | LINES TERMINATED BY '\n' 5 | -------------------------------------------------------------------------------- /mysql/load_table_user.sql: -------------------------------------------------------------------------------- 1 | 2 | LOAD DATA LOCAL INFILE '/home/bigdata/datasource/user.list' INTO TABLE log.user_dimension 3 | FIELDS TERMINATED BY ',' ENCLOSED BY '"' 4 | LINES TERMINATED BY '\n' 5 | -------------------------------------------------------------------------------- /mysql/start-client.txt: -------------------------------------------------------------------------------- 1 | mysql -u root 2 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 4.0.0 7 | cn.chinahadoop 8 | hadoop-example 9 | 0.1.0-SNAPSHOT 10 | Hadoop Examples 11 | hadoop examples 12 | pom 13 | 14 | 15 | hadoop 16 | 17 | 18 | 19 | 20 | Maven2 21 | http://repo1.maven.org/maven2 22 | 23 | 24 | 25 | 26 | 28 | 29 | 30 | 31 | ${project.build.directory} 32 | 33 | webapps/** 34 | 
35 | 36 | 37 | 38 | src/main/resources 39 | 40 | **/** 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /presto/command/age_price_list_presto.sql: -------------------------------------------------------------------------------- 1 | select cast((year(CURRENT_DATE)-year(birth)) as integer) as age,sum(price) as totalPrice 2 | from record join user_dimension on record.uid=user_dimension.uid 3 | group by cast((year(CURRENT_DATE)-year(birth)) as integer) 4 | order by totalPrice desc 5 | -------------------------------------------------------------------------------- /presto/command/brand_price_list_presto.sql: -------------------------------------------------------------------------------- 1 | select brand,sum(price) as totalPrice 2 | from record join brand_dimension on record.bid=brand_dimension.bid 3 | group by brand_dimension.brand 4 | order by totalPrice desc 5 | -------------------------------------------------------------------------------- /presto/command/gender_brand_rank.sql: -------------------------------------------------------------------------------- 1 | select gender, brand,count(*) as purchase_count 2 | from record_orc join user_dimension_orc on record_orc.uid=user_dimension_orc.uid 3 | join brand_dimension_orc on record_orc.bid=brand_dimension_orc.bid 4 | group by gender, brand 5 | order by gender, purchase_count DESC 6 | 7 | -------------------------------------------------------------------------------- /presto/command/start-presto-client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | presto --server localhost:8080 --catalog hive --schema default 4 | -------------------------------------------------------------------------------- /presto/command/start-presto.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/presto-server-0.157/bin/launcher start 4 | -------------------------------------------------------------------------------- /presto/command/stop-presto.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/presto-server-0.157/bin/launcher stop 4 | -------------------------------------------------------------------------------- /presto/conf/etc/catalog/hive.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore.uri=thrift://bigdata:9083 3 | hive.config.resources=/home/bigdata/hadoop-2.7.3/etc/hadoop/core-site.xml,=/home/bigdata/hadoop-2.7.3/etc/hadoop/hdfs-site.xml 4 | -------------------------------------------------------------------------------- /presto/conf/etc/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | query.max-memory=512MB 5 | query.max-memory-per-node=512MB 6 | discovery-server.enabled=true 7 | discovery.uri=http://bigdata:8080 8 | -------------------------------------------------------------------------------- /presto/conf/etc/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx1G 3 | -XX:+UseG1GC 4 | -XX:G1HeapRegionSize=32M 5 | -XX:+UseGCOverheadLimit 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+HeapDumpOnOutOfMemoryError 8 | -XX:OnOutOfMemoryError=kill -9 %p 9 | 
-------------------------------------------------------------------------------- /presto/conf/etc/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=production 2 | node.id=bigdata 3 | node.data-dir=/home/bigdata/presto-server-0.157/presto_data 4 | -------------------------------------------------------------------------------- /redis/command/start-redis-client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/redis-stable/src/redis-cli 4 | -------------------------------------------------------------------------------- /redis/command/start-redis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/redis-stable/src/redis-server /home/bigdata/redis-stable/redis.conf 4 | -------------------------------------------------------------------------------- /sqoop/command/brand_dimension_sqoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sqoop import --connect jdbc:mysql://bigdata:3306/log --username root --password root --table brand_dimension --driver com.mysql.jdbc.Driver --m 10 --target-dir /warehouse/brand_dimension 4 | -------------------------------------------------------------------------------- /sqoop/command/user_dimension_sqoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sqoop import --connect jdbc:mysql://bigdata:3306/log --username root --password root --table user_dimension --driver com.mysql.jdbc.Driver --m 10 --target-dir /warehouse/user_dimension 4 | -------------------------------------------------------------------------------- /storm/command/realtime_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | storm jar /home/bigdata/real_time_process/storm-1.0-SNAPSHOT-jar-with-dependencies.jar bigdata.storm.LogProcessTopology LogProcess 4 | -------------------------------------------------------------------------------- /storm/command/start-storm-nimbus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup /home/bigdata/apache-storm-1.0.2/bin/storm nimbus >> /home/bigdata/apache-storm-1.0.2/logs/nimbus.log 2>&1 & 4 | -------------------------------------------------------------------------------- /storm/command/start-storm-supervisor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup /home/bigdata/apache-storm-1.0.2/bin/storm supervisor >> /home/bigdata/apache-storm-1.0.2/logs/supervisor.log 2>&1 & 4 | -------------------------------------------------------------------------------- /storm/command/start-storm-ui.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup /home/bigdata/apache-storm-1.0.2/bin/storm ui >> /home/bigdata/apache-storm-1.0.2/logs/ui.log 2>&1 & 4 | -------------------------------------------------------------------------------- /storm/conf/storm-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. 
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Set Storm specific environment variables here. 20 | 21 | # The java implementation to use. 22 | #export JAVA_HOME=/path/to/jdk/home 23 | 24 | # export STORM_CONF_DIR="" 25 | -------------------------------------------------------------------------------- /storm/conf/storm.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
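# Summary of the values below: a single-node layout in which ZooKeeper, Nimbus and the lone supervisor all resolve to host "bigdata", with two worker slots (6700/6701). ui.port is moved from Storm's default 8080 to 9080, presumably so the UI does not collide with the Presto coordinator already listening on 8080 (presto/conf/etc/config.properties).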
16 | 17 | ########### These MUST be filled in for a storm configuration 18 | storm.zookeeper.servers: 19 | - "bigdata" 20 | 21 | storm.local.dir: "/home/bigdata/apache-storm-1.0.2/storm_data" 22 | 23 | nimbus.seeds: ["bigdata"] 24 | supervisor.slots.ports: 25 | - 6700 26 | - 6701 27 | 28 | storm.exhibitor.port: 9080 29 | ui.port: 9080 30 | # 31 | # 32 | # ##### These may optionally be filled in: 33 | # 34 | ## List of custom serializations 35 | # topology.kryo.register: 36 | # - org.mycompany.MyType 37 | # - org.mycompany.MyType2: org.mycompany.MyType2Serializer 38 | # 39 | ## List of custom kryo decorators 40 | # topology.kryo.decorators: 41 | # - org.mycompany.MyDecorator 42 | # 43 | ## Locations of the drpc servers 44 | # drpc.servers: 45 | # - "server1" 46 | # - "server2" 47 | 48 | ## Metrics Consumers 49 | # topology.metrics.consumer.register: 50 | # - class: "org.apache.storm.metric.LoggingMetricsConsumer" 51 | # parallelism.hint: 1 52 | # - class: "org.mycompany.MyMetricsConsumer" 53 | # parallelism.hint: 1 54 | # argument: 55 | # - endpoint: "metrics-collector.mycompany.org" 56 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | bigdata 8 | storm 9 | 1.0-SNAPSHOT 10 | 11 | 1.8 12 | 1.0.2 13 | 0.8.2.1 14 | 15 | 16 | 17 | 18 | org.apache.storm 19 | storm-kafka 20 | ${storm.version} 21 | 22 | 23 | org.apache.storm 24 | storm-core 25 | ${storm.version} 26 | provided 27 | 28 | 29 | org.apache.storm 30 | storm-redis 31 | ${storm.version} 32 | 33 | 34 | org.apache.kafka 35 | kafka_2.11 36 | ${kafka.version} 37 | 38 | 39 | org.apache.zookeeper 40 | zookeeper 41 | 42 | 43 | log4j 44 | log4j 45 | 46 | 47 | org.slf4j 48 | slf4j-log4j12 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | maven-assembly-plugin 58 | 2.3 59 | 60 | dist 61 | true 62 | 63 | jar-with-dependencies 64 | 65 | 66 | 67 | 68 | make-assembly 69 | package 70 | 71 | single 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/ExtractBolt.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.topology.BasicOutputCollector; 4 | import org.apache.storm.topology.OutputFieldsDeclarer; 5 | import org.apache.storm.topology.base.BaseBasicBolt; 6 | import org.apache.storm.tuple.Fields; 7 | import org.apache.storm.tuple.Tuple; 8 | import org.apache.storm.tuple.Values; 9 | 10 | import java.io.DataOutputStream; 11 | import java.io.FileNotFoundException; 12 | import java.io.FileOutputStream; 13 | import java.io.IOException; 14 | 15 | /** 16 | * Created by qianxi.zhang on 11/26/16. 
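 * Splits each raw log line read from the Kafka spout. The line is expected to be a
 * 12-field comma-separated record; index 4 is the price, index 6 the province and
 * index 7 the website (see execute() below). Records that do not have exactly 12
 * fields fall back to price "0" and "other". Two named output streams are declared
 * so that ProvinceBolt and WebsiteBolt each receive only the pair of fields they aggregate.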
17 | */ 18 | public class ExtractBolt extends BaseBasicBolt { 19 | public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) { 20 | String word = (String) tuple.getValue(0); 21 | 22 | String price = "0"; 23 | String province = "other"; 24 | String website = "other"; 25 | 26 | String[] attributes_list = word.split(","); 27 | 28 | if (attributes_list.length == 12) { 29 | price = attributes_list[4]; 30 | province = attributes_list[6]; 31 | website = attributes_list[7]; 32 | } 33 | 34 | basicOutputCollector.emit("province", new Values(province, price)); 35 | basicOutputCollector.emit("website", new Values(website, price)); 36 | } 37 | 38 | public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) { 39 | outputFieldsDeclarer.declareStream("province", new Fields("province", "price")); 40 | outputFieldsDeclarer.declareStream("website", new Fields("website", "price")); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/LogProcessTopology.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.LocalCluster; 5 | import org.apache.storm.StormSubmitter; 6 | import org.apache.storm.generated.AlreadyAliveException; 7 | import org.apache.storm.generated.AuthorizationException; 8 | import org.apache.storm.generated.InvalidTopologyException; 9 | import org.apache.storm.generated.StormTopology; 10 | import org.apache.storm.kafka.*; 11 | import org.apache.storm.redis.bolt.RedisStoreBolt; 12 | import org.apache.storm.redis.common.config.JedisPoolConfig; 13 | import org.apache.storm.redis.common.mapper.RedisStoreMapper; 14 | import org.apache.storm.spout.SchemeAsMultiScheme; 15 | import org.apache.storm.topology.TopologyBuilder; 16 | import org.apache.storm.tuple.Fields; 17 | import org.apache.storm.utils.Utils; 18 | 19 | import java.util.HashMap; 20 | import java.util.Map; 21 | 22 | /** 23 | * Created by qianxi.zhang on 11/26/16. 
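 * Wires the real-time pipeline: a KafkaSpout reads the "log" topic through ZooKeeper at
 * bigdata:2181, ExtractBolt fans each record out into "province" and "website" streams,
 * ProvinceBolt and WebsiteBolt keep running price totals, and two RedisStoreBolts write
 * those totals into the Redis hashes "province" and "website"; the "province" hash is the
 * one read back by visualization/py-echarts/query_redis.py for the map chart.
 * Run with an argument (see storm/command/realtime_process.sh) to submit to the cluster,
 * or with no arguments to run in a LocalCluster for local testing.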
24 | */ 25 | public class LogProcessTopology { 26 | 27 | public static final String brokerZkStr = "bigdata:2181"; 28 | public static final String topicName = "log"; 29 | public static final String offsetZkRoot = "/storm" + "-" + topicName; 30 | public static final String offsetZkId = "offsetZkId"; 31 | public static final String redis_hots = "bigdata"; 32 | public static final int redis_port = 6379; 33 | 34 | public static StormTopology getStormTopology() { 35 | 36 | BrokerHosts hosts = new ZkHosts(brokerZkStr); 37 | SpoutConfig spoutConfig = new SpoutConfig(hosts, topicName, offsetZkRoot, offsetZkId); 38 | spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme()); 39 | KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig); 40 | 41 | JedisPoolConfig poolConfig = 42 | new JedisPoolConfig.Builder().setHost(redis_hots).setPort(redis_port).build(); 43 | 44 | RedisStoreMapper provinceStoreMapper = new ProvinceStoreMapper(); 45 | RedisStoreBolt provinceStoreBolt = new RedisStoreBolt(poolConfig, provinceStoreMapper); 46 | 47 | RedisStoreMapper websiteStoreMapper = new WebsiteStoreMapper(); 48 | RedisStoreBolt websiteStoreBolt = new RedisStoreBolt(poolConfig, websiteStoreMapper); 49 | 50 | TopologyBuilder builder = new TopologyBuilder(); 51 | builder.setSpout("spout", kafkaSpout, 1); 52 | builder.setBolt("extractbolt", new ExtractBolt(), 1).shuffleGrouping("spout"); 53 | 54 | builder.setBolt("provincebolt", new ProvinceBolt(), 1) 55 | .fieldsGrouping("extractbolt", "province", new Fields("province")); 56 | builder.setBolt("websitebolt", new WebsiteBolt(), 1) 57 | .fieldsGrouping("extractbolt", "website", new Fields("website")); 58 | 59 | builder.setBolt("provinceredisstore", provinceStoreBolt).shuffleGrouping("provincebolt"); 60 | builder.setBolt("websiteredisstore", websiteStoreBolt).shuffleGrouping("websitebolt"); 61 | 62 | return builder.createTopology(); 63 | } 64 | 65 | public static Config getConfig() { 66 | Config conf = new Config(); 67 | return conf; 68 | } 69 | 70 | public static void main(String[] args) { 71 | 72 | Config conf = getConfig(); 73 | StormTopology topology = getStormTopology(); 74 | 75 | if (args != null && args.length > 0) { 76 | //提交到集群运行 77 | try { 78 | StormSubmitter.submitTopology(args[0], conf, topology); 79 | } catch (AlreadyAliveException e) { 80 | e.printStackTrace(); 81 | } catch (InvalidTopologyException e) { 82 | e.printStackTrace(); 83 | } catch (AuthorizationException e) { 84 | e.printStackTrace(); 85 | } 86 | } else { 87 | //本地模式运行 88 | LocalCluster cluster = new LocalCluster(); 89 | cluster.submitTopology("Topotest", conf, topology); 90 | Utils.sleep(1000000); 91 | cluster.killTopology("Topotest"); 92 | cluster.shutdown(); 93 | } 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/ProvinceBolt.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.task.TopologyContext; 4 | import org.apache.storm.topology.BasicOutputCollector; 5 | import org.apache.storm.topology.OutputFieldsDeclarer; 6 | import org.apache.storm.topology.base.BaseBasicBolt; 7 | import org.apache.storm.tuple.Fields; 8 | import org.apache.storm.tuple.Tuple; 9 | import org.apache.storm.tuple.Values; 10 | 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | /** 15 | * Created by qianxi.zhang on 11/26/16. 
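 * Accumulates the total purchase price per province in an in-memory map and emits
 * (province, runningTotal) for every input tuple. The map lives inside the bolt task,
 * so totals are not persisted across restarts; the fieldsGrouping on "province" in
 * LogProcessTopology keeps all tuples for a given province on the same task.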
16 | */ 17 | public class ProvinceBolt extends BaseBasicBolt { 18 | 19 | Map province_price = new HashMap(); 20 | 21 | public void execute(Tuple input, BasicOutputCollector collector) { 22 | String province = input.getStringByField("province"); 23 | long price = Long.valueOf(input.getStringByField("price")); 24 | long totalPrice = price; 25 | if (province_price.containsKey(province)) { 26 | totalPrice += province_price.get(province); 27 | } 28 | province_price.put(province, totalPrice); 29 | collector.emit(new Values(province, String.valueOf(totalPrice))); 30 | } 31 | 32 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 33 | declarer.declare(new Fields("province", "price")); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/ProvinceStoreMapper.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.redis.common.mapper.RedisDataTypeDescription; 4 | import org.apache.storm.redis.common.mapper.RedisStoreMapper; 5 | import org.apache.storm.tuple.ITuple; 6 | 7 | /** 8 | * Created by qianxi.zhang on 11/26/16. 9 | */ 10 | public class ProvinceStoreMapper implements RedisStoreMapper { 11 | private RedisDataTypeDescription description; 12 | private final String hashKey = "province"; 13 | 14 | public ProvinceStoreMapper() { 15 | description = 16 | new RedisDataTypeDescription(RedisDataTypeDescription.RedisDataType.HASH, hashKey); 17 | } 18 | 19 | public RedisDataTypeDescription getDataTypeDescription() { 20 | return description; 21 | } 22 | 23 | public String getKeyFromTuple(ITuple iTuple) { 24 | return iTuple.getStringByField("province"); 25 | } 26 | 27 | public String getValueFromTuple(ITuple iTuple) { 28 | return iTuple.getStringByField("price"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/WebsiteBolt.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.topology.BasicOutputCollector; 4 | import org.apache.storm.topology.OutputFieldsDeclarer; 5 | import org.apache.storm.topology.base.BaseBasicBolt; 6 | import org.apache.storm.tuple.Fields; 7 | import org.apache.storm.tuple.Tuple; 8 | import org.apache.storm.tuple.Values; 9 | 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | /** 14 | * Created by qianxi.zhang on 11/26/16. 
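 * Same pattern as ProvinceBolt, keyed by website: keeps a running total of price per
 * website in an in-memory map and emits (website, runningTotal) to the Redis store bolt.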
15 | */ 16 | public class WebsiteBolt extends BaseBasicBolt { 17 | 18 | Map website_price = new HashMap(); 19 | 20 | public void execute(Tuple input, BasicOutputCollector collector) { 21 | String website = input.getStringByField("website"); 22 | long price = Long.valueOf(input.getStringByField("price")); 23 | long totalPrice = price; 24 | if (website_price.containsKey(website)) { 25 | totalPrice += website_price.get(website); 26 | } 27 | website_price.put(website, totalPrice); 28 | collector.emit(new Values(website, String.valueOf(totalPrice))); 29 | } 30 | 31 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 32 | declarer.declare(new Fields("website", "price")); 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/WebsiteStoreMapper.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.redis.common.mapper.RedisDataTypeDescription; 4 | import org.apache.storm.redis.common.mapper.RedisStoreMapper; 5 | import org.apache.storm.tuple.ITuple; 6 | 7 | /** 8 | * Created by qianxi.zhang on 11/26/16. 9 | */ 10 | public class WebsiteStoreMapper implements RedisStoreMapper { 11 | private RedisDataTypeDescription description; 12 | private final String hashKey = "website"; 13 | 14 | public WebsiteStoreMapper() { 15 | description = 16 | new RedisDataTypeDescription(RedisDataTypeDescription.RedisDataType.HASH, hashKey); 17 | } 18 | 19 | public RedisDataTypeDescription getDataTypeDescription() { 20 | return description; 21 | } 22 | 23 | public String getKeyFromTuple(ITuple iTuple) { 24 | return iTuple.getStringByField("website"); 25 | } 26 | 27 | public String getValueFromTuple(ITuple iTuple) { 28 | return iTuple.getStringByField("price"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /visualization/command/start-web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python /home/bigdata/visualization/py-echarts/main.py 4 | -------------------------------------------------------------------------------- /visualization/py-echarts/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from flask import Flask, render_template 4 | import json 5 | from models import Chart 6 | from query_presto import Presto_Query 7 | from query_redis import Redis_Query 8 | 9 | app = Flask(__name__) 10 | 11 | @app.route("/") 12 | def index(): 13 | presto=Presto_Query() 14 | age_price_tuples=presto.query_age_price() 15 | age_dict=presto.getAgeDict(age_price_tuples) 16 | chart1 = Chart().pie("饼图", data=age_dict 17 | ) 18 | 19 | tuples=presto.query_brand_price() 20 | keys=presto.getKeys(tuples) 21 | values=presto.getValues(tuples) 22 | chart2 = Chart() \ 23 | .x_axis(data=keys) \ 24 | .y_axis(formatter="{value}") \ 25 | .bar(u"Brand Price", values, show_item_label=True) 26 | 27 | redis=Redis_Query() 28 | province_price=redis.query_province() 29 | china_province_price=redis.get_province_price(province_price) 30 | print china_province_price 31 | chart3= Chart()\ 32 | .map(china_province_price) 33 | 34 | render = { 35 | "title": u"电商双十一大数据日志分析系统", 36 | "templates": [ 37 | {"type": "chart", "title":u"不同年龄消费的情况", "option": json.dumps(chart1, indent=2)}, 38 | {"type": "chart", "title":u"消费商品的情况", "option": json.dumps(chart2, indent=2)}, 39 | 
{"type": "chart", "title":u"各省购买情况", "option": json.dumps(chart3, indent=2)} 40 | ] 41 | } 42 | return render_template("main.html", **render) 43 | 44 | if __name__ == "__main__": 45 | app.run(debug=True) 46 | -------------------------------------------------------------------------------- /visualization/py-echarts/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | 5 | class Chart(dict): 6 | """ 7 | 图表模板 8 | """ 9 | def __init__(self): 10 | super(Chart, self).__init__() 11 | self["calculable"] = True 12 | self["tooltip"] = {"show": True} 13 | self["toolbox"] = { 14 | "show": True, 15 | "x": "left", 16 | "feature": { 17 | "dataView": { 18 | "show": True, 19 | "readOnly": False 20 | }, 21 | "magicType": { 22 | "show": True, 23 | "type": ["line", "bar"] 24 | }, 25 | "restore": { 26 | "show": True 27 | }, 28 | "saveAsImage": { 29 | "show": True 30 | }, 31 | "dataZoom": { 32 | "show": True, 33 | "title": { 34 | "dataZoom": u"区域缩放", 35 | "dataZoomReset": u"区域缩放后退" 36 | } 37 | } 38 | } 39 | } 40 | self["legend"] = { 41 | "show": True, 42 | "data": [] 43 | } 44 | self["series"] = [] 45 | 46 | def title(self, x="center", **kwargs): 47 | """ 48 | 设置图表标题 49 | """ 50 | self["title"].update({ 51 | "x": x 52 | }) 53 | self["title"].update(kwargs) 54 | return self 55 | 56 | def tooltip(self, show=True, trigger='axis', formatter=None, **kwargs): 57 | """ 58 | 设置提示信息 59 | """ 60 | self["tooltip"].update({ 61 | "show": show, 62 | "trigger": trigger 63 | }) 64 | if formatter is not None: 65 | self["tooltip"].update({"formatter": formatter}) 66 | self["tooltip"].update(kwargs) 67 | return self 68 | 69 | def legend(self, show=True, data=None, orient='horizontal', **kwargs): 70 | """ 71 | 设置图例 72 | `data`: [u"图例1", u"图例2", u"图例3"] 73 | `orient`: "vertical"|"horizontal" 74 | """ 75 | data = [] if data is None else data 76 | self["legend"].update({ 77 | "show": show, 78 | "data": data, 79 | "orient": orient 80 | }) 81 | self["legend"].update(kwargs) 82 | return self 83 | 84 | def toolbox(self, show=True, x='left', **kwargs): 85 | """ 86 | 设置工具箱 87 | """ 88 | self["toolbox"].update({ 89 | "show": show, 90 | "x": x 91 | }) 92 | self["toolbox"].update(kwargs) 93 | return self 94 | 95 | def pie(self, name, data=None, radius="55%", center=None, auto_legend=True, **kwargs): 96 | """ 97 | 添加一个饼图 98 | `data`: {u"名称": 100}, u"名称2": 200} 99 | """ 100 | center = ["50%", "60%"] if center is None else center 101 | data = {} if data is None else data 102 | self["series"].append(self.__merge_dict({ 103 | "type": "pie", 104 | "name": name, 105 | "radius": radius, 106 | "center": center, 107 | "data": [{"name": n, "value": v} for n, v in data.items()] 108 | }, kwargs)) 109 | if auto_legend: 110 | legend_data = self["legend"]["data"] 111 | [legend_data.append(x) for x in data if x not in legend_data] 112 | return self 113 | 114 | def bar(self, name, data=None, auto_legend=True, y_axis_index=0, **kwargs): 115 | """ 116 | 添加一个柱状图 117 | `data`: [10, 20, 30, 40] 118 | `auto_legend`: 自动生成图例 119 | """ 120 | data = [] if data is None else data 121 | self["series"].append(self.__merge_dict({ 122 | "type": "bar", 123 | "name": name, 124 | "data": data, 125 | "yAxisIndex": y_axis_index 126 | }, kwargs)) 127 | if "yAxis" not in self: 128 | self.y_axis() 129 | if name not in self["legend"]["data"] and auto_legend: 130 | self["legend"]["data"].append(name) 131 | return self 132 | 133 | def line(self, name, data=None, mark_max_point=False, 
mark_min_point=False, show_item_label=False, auto_legend=True, y_axis_index=0, **kwargs): 134 | """ 135 | 添加一个折线图 136 | `data`: [10, 20, 30, 40] 137 | """ 138 | data = [] if data is None else data 139 | mark_point = [] 140 | if mark_max_point: 141 | mark_point.append({"type": "max", "name": "最大值"}) 142 | if mark_min_point: 143 | mark_point.append({"type": "min", "name": "最小值"}) 144 | self["series"].append(self.__merge_dict({ 145 | "type": "line", 146 | "name": name, 147 | "data": data, 148 | "markPoint": { 149 | "data":mark_point 150 | }, 151 | "itemStyle": { 152 | "normal": { 153 | "label": {"show": show_item_label} 154 | } 155 | }, 156 | "yAxisIndex": y_axis_index 157 | }, kwargs)) 158 | if "yAxis" not in self: 159 | self.y_axis() 160 | if name not in self["legend"]["data"] and auto_legend: 161 | self["legend"]["data"].append(name) 162 | return self 163 | 164 | def x_axis(self, data=None, type_="category", name="", **kwargs): 165 | """ 166 | 添加X轴 167 | """ 168 | data = [] if data is None else data 169 | if "xAxis" not in self: 170 | self["xAxis"] = [] 171 | self["xAxis"].append(self.__merge_dict({ 172 | "type": type_, 173 | "name": name, 174 | "data": data 175 | }, kwargs)) 176 | return self 177 | 178 | def y_axis(self, data=None, type_="value", name="", formatter=None, **kwargs): 179 | """ 180 | 添加X轴 181 | """ 182 | if "yAxis" not in self: 183 | self["yAxis"] = [] 184 | self["yAxis"].append(self.__merge_dict({ 185 | "type": type_, 186 | "name": name, 187 | }, {"axisLabel": {"formatter": formatter}} if formatter is not None else {}, kwargs)) 188 | if data is not None: 189 | self["yAxis"] = data 190 | return self 191 | def map(self,data,**kwargs): 192 | self["legend"]={ 193 | "orient":"vertical", 194 | "left": "left", 195 | "data":['price'] 196 | } 197 | self["toolbox"]={ 198 | "show": True, 199 | "orient": "vertical", 200 | "left": "right", 201 | "top": "center", 202 | "feature": { 203 | "mark":{"show":True}, 204 | "dataView": {"show": True, "readOnly": False}, 205 | "restore": {"show":True}, 206 | "saveAsImage": {"show":True} 207 | } 208 | } 209 | #data={"name": '北京',"value": 10 } 210 | #data=json.dumps(data,ensure_ascii=False) 211 | self["series"]=[{ 212 | "name":"price", 213 | "type": "map", 214 | "mapType": "china", 215 | "roam": False, 216 | "label": { 217 | "normal": { 218 | "show": True 219 | }, 220 | "emphasis": { 221 | "show": True 222 | } 223 | }, 224 | #"data": [data] 225 | "data": [{"name": n, "value": v} for n, v in data.items()] 226 | }] 227 | self["visualMap"]={ 228 | "min": 0, 229 | "max": 2500, 230 | "left": 'left', 231 | "top": 'bottom', 232 | "text": ['高','低'], 233 | "calculable": True 234 | } 235 | self["tooltip"]={ 236 | "trigger":"item" 237 | } 238 | # self["title"]={ 239 | # "text": 'price', 240 | # "subtext": 'price', 241 | # "left": 'center' 242 | # } 243 | return self 244 | 245 | 246 | @staticmethod 247 | def __merge_dict(*args): 248 | """ 249 | 合并多个字典并返回 250 | """ 251 | return reduce(lambda x, y: dict(x.items() + y.items()), args) 252 | 253 | 254 | def main(): 255 | c = Chart().tooltip() 256 | print json.dumps(c) 257 | 258 | if __name__ == "__main__": 259 | main() 260 | -------------------------------------------------------------------------------- /visualization/py-echarts/models.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/py-echarts/models.pyc 
-------------------------------------------------------------------------------- /visualization/py-echarts/query_presto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pyhive import presto 5 | 6 | PRESTO_SERVER = {'host': 'bigdata', 'port': 8080, 'catalog': 'hive', 'schema': 'default'} 7 | BRAND_PRICE_QUERY="select brand,sum(price) as totalPrice from record join brand_dimension on record.bid=brand_dimension.bid group by brand_dimension.brand order by totalPrice desc limit 10" 8 | 9 | AGE_PRICE_QUERY="select cast((year(CURRENT_DATE)-year(birth)) as integer) as age,sum(price) as totalPrice from record join user_dimension on record.uid=user_dimension.uid group by cast((year(CURRENT_DATE)-year(birth)) as integer) order by totalPrice desc" 10 | 11 | class Presto_Query: 12 | 13 | def query_brand_price(self): 14 | conn = presto.connect(**PRESTO_SERVER) 15 | cursor = conn.cursor() 16 | cursor.execute(BRAND_PRICE_QUERY) 17 | tuples=cursor.fetchall() 18 | return tuples 19 | 20 | def getKeys(self,tuples): 21 | keys=[] 22 | for tuple in tuples: 23 | keys.append(tuple[0]) 24 | return keys 25 | 26 | def getValues(self, tuples): 27 | values=[] 28 | for tuple in tuples: 29 | values.append(tuple[1]) 30 | return values 31 | 32 | def query_age_price(self): 33 | conn = presto.connect(**PRESTO_SERVER) 34 | cursor = conn.cursor() 35 | cursor.execute(AGE_PRICE_QUERY) 36 | tuples=cursor.fetchall() 37 | return tuples 38 | 39 | def getAgeDict(self, tuples): 40 | dict={'<10':0L,'10~20':0L,'20~30':0L,'30~40':0L,'40~50':0L,'50~60':0L,'60~70':0L,'>70':0L} 41 | for tuple in tuples: 42 | age=int(tuple[0]) 43 | price=long(tuple[1]) 44 | age=age/10; 45 | if age<1: 46 | value=dict['<10'] 47 | dict['<10']=value+price 48 | elif age>=1 and age<2: 49 | value=dict['10~20'] 50 | dict['10~20']=value+price 51 | elif age>=2 and age<3: 52 | value=dict['20~30'] 53 | dict['20~30']=value+price 54 | elif age>=3 and age<4: 55 | value=dict['30~40'] 56 | dict['30~40']=value+price 57 | elif age>=4 and age<5: 58 | value=dict['40~50'] 59 | dict['40~50']=value+price 60 | elif age>=5 and age<6: 61 | value=dict['50~60'] 62 | dict['50~60']=value+price 63 | elif age>=6 and age<7: 64 | value=dict['60~70'] 65 | dict['60~70']=value+price 66 | else: 67 | value=dict['>70'] 68 | dict['>70']=value+price 69 | return dict 70 | -------------------------------------------------------------------------------- /visualization/py-echarts/query_presto.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/py-echarts/query_presto.pyc -------------------------------------------------------------------------------- /visualization/py-echarts/query_redis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import redis 6 | 7 | 
PROVINCE_MAP={"BeiJing":"北京","ShangHai":"上海","TianJin":"天津","ChongQing":"重庆","XiangGang":"香港","Aomen":"澳门","AnHui":"安徽","FuJian":"福建","GuangDong":"广东","GuangXi":"广西","GuiZhou":"贵州","GanSu":"甘肃","HaiNan":"海南","HeBei":"河北","HeNan":"河南","HeiLongJiang":"黑龙江","HuBei":"湖北","HuNan":"湖南","JiLin":"吉林","JiangSu":"江苏","JiangXi":"江西","LiaoNing":"辽宁","NeiMengGu":"内蒙古","NingXia":"宁夏","QingHai":"青海","ShanXi1":"山西","ShanXi3":"陕西","ShanDong":"山东","SiChuan":"四川","TaiWan":"台湾","XiZang":"西藏","XinJiang":"新疆","YunNan":"云南","ZheJiang":"浙江"} 8 | 9 | class Redis_Query: 10 | 11 | def query_province(self): 12 | r = redis.StrictRedis(host='127.0.0.1', port=6379) 13 | return r.hgetall('province') 14 | 15 | def get_province_price(self,dict): 16 | china_price={} 17 | for k,v in dict.items(): 18 | if k in PROVINCE_MAP: 19 | new_key=PROVINCE_MAP[k] 20 | china_price[new_key]=v 21 | return china_price 22 | -------------------------------------------------------------------------------- /visualization/py-echarts/query_redis.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/py-echarts/query_redis.pyc -------------------------------------------------------------------------------- /visualization/py-echarts/templates/chart.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 52 | -------------------------------------------------------------------------------- /visualization/py-echarts/templates/main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 电商双十一大数据日志分析系统 6 | 7 | 14 | 15 | 16 | 17 |

{{title}}

18 | {% set i = 1 %} 19 | {% for template in templates %} 20 | {% if template.type == 'chart' %} 21 |

22 | {{template.title}} 23 |

24 |
25 | 31 | 45 | 46 | {% set i = i + 1 %} 47 | {% endif %} 48 | {% endfor %} 49 | 50 | 51 | -------------------------------------------------------------------------------- /visualization/result/image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/result/image-1.png -------------------------------------------------------------------------------- /visualization/result/image-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/result/image-2.png -------------------------------------------------------------------------------- /visualization/result/image-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/result/image-3.png --------------------------------------------------------------------------------