├── .gitignore ├── .project ├── README.md ├── datasource ├── brand.py ├── command.sh ├── record.py └── user.py ├── etl ├── etl.py ├── loadDataToHive │ ├── loadDataToHive.iml │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── bigdata │ │ └── etl │ │ └── loadDataToHive.java └── start_etl.py ├── flume ├── command │ ├── start_flume_batch.sh │ └── start_flume_realtime.sh └── conf │ ├── flume-conf-logAnalysis-kafka.properties │ ├── flume-conf-logAnalysis.properties │ └── flume-env.sh ├── hadoop ├── .classpath ├── .project ├── command │ ├── start-dfs.sh │ ├── start-historyserver.sh │ ├── start-yarn.sh │ ├── stop-dfs.sh │ ├── stop-historyserver.sh │ └── stop-yarn.sh ├── conf │ ├── capacity-scheduler.xml │ ├── core-site.xml │ ├── hadoop-env.sh │ ├── hdfs-site.xml │ ├── mapred-env.sh │ ├── mapred-site.xml │ ├── slaves │ ├── yarn-env.sh │ └── yarn-site.xml ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── cn │ │ │ └── chinahadoop │ │ │ ├── hdfs │ │ │ └── HdfsExample.java │ │ │ └── mapreduce │ │ │ ├── Grep.java │ │ │ ├── InvertedIndex.java │ │ │ ├── JobFailureTest.java │ │ │ ├── OOMTest.java │ │ │ ├── TaskAttemptTest.java │ │ │ └── WordCount.java │ │ └── resources │ │ ├── input │ │ ├── input_1.txt │ │ └── input_2.txt │ │ └── output │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 └── streaming │ ├── mapper.cpp │ ├── mapper.php │ ├── mapper.sh │ ├── mapper2.cpp │ ├── mapper2.sh │ ├── reducer.cpp │ ├── reducer.php │ ├── reducer.sh │ ├── run_cpp_mr.sh │ ├── run_php_mr.sh │ ├── run_shell_mr.sh │ └── test.txt ├── hbase └── hbase-ingest │ ├── pom.xml │ └── src │ └── main │ └── java │ └── bigdata │ └── hbase │ ├── Ingest.java │ ├── ProfileIngest.java │ ├── Query.java │ └── RecordIngest.java ├── hive ├── README.md ├── command │ ├── add_partition.sql │ ├── age_price_list.sql │ ├── brand_price_list.sql │ ├── create_orc_table.sql │ ├── create_parquet_table.sql │ ├── create_table_brand.sql │ ├── create_table_record.sql │ ├── create_table_user.sql │ ├── employees.sql │ ├── employees_part.sql │ ├── load_data_to_orc.sql │ ├── load_data_to_parquet.sql │ ├── province_prince_list.sql │ ├── skewed.sql │ ├── start-hiveserver2.sh │ ├── start-metastore.sh │ ├── start-mysql.sh │ └── weblog.sql ├── conf │ ├── hive-env.sh │ ├── hive-log4j2.properties │ └── hive-site.xml └── data │ └── employees.txt ├── kafka └── command │ ├── start-kafka.sh │ └── start-zookeeper.sh ├── mysql ├── create_table_brand.sql ├── create_table_user.sql ├── load_table_brand.sql ├── load_table_user.sql └── start-client.txt ├── pom.xml ├── presto ├── command │ ├── age_price_list_presto.sql │ ├── brand_price_list_presto.sql │ ├── gender_brand_rank.sql │ ├── start-presto-client.sh │ ├── start-presto.sh │ └── stop-presto.sh └── conf │ └── etc │ ├── catalog │ └── hive.properties │ ├── config.properties │ ├── jvm.config │ └── node.properties ├── redis └── command │ ├── start-redis-client.sh │ └── start-redis.sh ├── sqoop └── command │ ├── brand_dimension_sqoop.sh │ └── user_dimension_sqoop.sh ├── storm ├── command │ ├── realtime_process.sh │ ├── start-storm-nimbus.sh │ ├── start-storm-supervisor.sh │ └── start-storm-ui.sh ├── conf │ ├── storm-env.sh │ └── storm.yaml └── storm_realtime_process │ ├── pom.xml │ └── src │ └── main │ └── java │ └── bigdata │ └── storm │ ├── ExtractBolt.java │ ├── LogProcessTopology.java │ ├── ProvinceBolt.java │ ├── ProvinceStoreMapper.java │ ├── WebsiteBolt.java │ └── WebsiteStoreMapper.java └── visualization ├── command └── start-web.sh ├── py-echarts ├── main.py ├── models.py ├── 
models.pyc ├── query_presto.py ├── query_presto.pyc ├── query_redis.py ├── query_redis.pyc └── templates │ ├── chart.html │ └── main.html └── result ├── image-1.png ├── image-2.png └── image-3.png /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | .DS_Store 4 | .idea 5 | *.iml 6 | .settings 7 | .class 8 | target/ 9 | .classpath 10 | .project 11 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | hadoop-example 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.m2e.core.maven2Builder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.m2e.core.maven2Nature 16 | 17 | 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bigdata_logAnalysis 2 | 3 | ##Version: 4 | | Component | Version | Download link | 5 | | ------------- |:-------------:|:----------------------------------------------------------------------------------------------------| 6 | | flume | 1.7.0 | http://mirrors.hust.edu.cn/apache/flume/1.7.0/apache-flume-1.7.0-bin.tar.gz | 7 | | hadoop | 2.7.3 | http://apache.fayea.com/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz | 8 | | oozie | 4.2.0 | http://apache.fayea.com/oozie/4.2.0/oozie-4.2.0.tar.gz | 9 | | hive | 2.1.0 | http://mirror.bit.edu.cn/apache/hive/stable-2/apache-hive-2.1.0-bin.tar.gz | 10 | | sqoop | 1.4.6 | http://mirror.bit.edu.cn/apache/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | 11 | | presto | 0.157 | https://repo1.maven.org/maven2/com/facebook/presto/presto-server/0.157/presto-server-0.157.tar.gz | 12 | | presto-client | 0.157 | https://repo1.maven.org/maven2/com/facebook/presto/presto-cli/0.157/presto-cli-0.157-executable.jar | 13 | | kafka | 0.10.1.0 | http://mirrors.tuna.tsinghua.edu.cn/apache/kafka/0.10.1.0/kafka_2.11-0.10.1.0.tgz | 14 | | storm | 1.0.2 | http://mirrors.cnnic.cn/apache/storm/apache-storm-1.0.2/apache-storm-1.0.2.tar.gz | 15 | | redis | 3.2 | http://download.redis.io/releases/redis-3.2.5.tar.gz | 16 | | echarts | 3.3.2 | http://echarts.baidu.com/dist/echarts.min.js | 17 | -------------------------------------------------------------------------------- /datasource/brand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import random,sys 5 | 6 | BRAND_FILE="/home/bigdata/datasource/brand.list" 7 | BRAND={"computer":("APPLE","HP","ACER","LENOVO","DELL","SONY","ASUS"),"telephone":("IPHONE","SAMSUNG","HTC","MOTOROLA","HUAWEI","OPPO","VIVO","XIAOMI","MEIZU"),"television":("HISENSE","SAMSUNG","SKYWORTH","SHARP","HAIER","PHILIPS","TCL"),"sports":("NIKE","ADIDAS","LINING","PUMA","ANTA","MIZUNO","KAPPA","NB","PEAK","361"),"food":("MENGNIU","YILI","GUANGMING","SANYUAN","WULIANGYE","MOUTAI","HONGXING","NIULANSHAN","LANGJIU"),"clothes":("ZARA","HLA","UNIQLO","PEACEBIRD","GXG","SELECTED","SEMIR","SEPTWOLVES","CAMEL"),"cosmetic":("LOREAL","NIVEA","KANS","DHC","CLINIQUE","INNISFREE","MEIFUBAO","OLAY","LANCOME")} 8 | 9 | def get_one_brand(category_list,id): 10 | brand_id="%08d"%id 11 | category_size=len(category_list) 12 | category=category_list[random.randint(0,category_size-1)] 13 | brand_size=len(BRAND[category]) 14 | brand=BRAND[category][random.randint(0,brand_size-1)] 15 | return brand_id+","+category+","+brand 16 | 17 | 18 | 19 | def 
generate_brand(): 20 | category_list=[] 21 | for k in BRAND: 22 | category_list.append(k) 23 | f=open(BRAND_FILE,'w') 24 | for i in range(count): 25 | brand=get_one_brand(category_list,i) 26 | f.write(brand+"\n") 27 | f.close() 28 | 29 | 30 | 31 | 32 | 33 | if __name__ == '__main__': 34 | count=int(sys.argv[1]) 35 | print("start to generate brand data...") 36 | generate_brand() 37 | -------------------------------------------------------------------------------- /datasource/command.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/datasource/user.py 1000 4 | /home/bigdata/datasource/brand.py 1000 5 | /home/bigdata/datasource/record.py 100000 6 | -------------------------------------------------------------------------------- /datasource/record.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from faker import Factory 5 | import random,sys,time,uuid 6 | 7 | USER_FILE="/home/bigdata/datasource/user.list" 8 | BRAND_FILE="/home/bigdata/datasource/brand.list" 9 | RECORD_FILE="/home/bigdata/datasource/record.list" 10 | 11 | WEBSITE_LIST=("TAOBAO","TIANMAO","JUHUASUAN","TIANMAOCHAOSHI") 12 | EXPRESS_LIST=("SHENTONG","SHUNFENG","EMS","YUANTONG","YUNDA","ZHONGTONG") 13 | PROVINCE="BeiJing,ShangHai,TianJin,ChongQing,XiangGang,Aomen,AnHui,FuJian,GuangDong,GuangXi,GuiZhou,GanSu,HaiNan,HeBei,HeNan,HeiLongJiang,HuBei,HuNan,JiLin,JiangSu,JiangXi,LiaoNing,NeiMengGu,NingXia,QingHai,ShanXi1,ShanXi3,ShanDong,SiChuan,TaiWan,XiZang,XinJiang,YunNan,ZheJiang" 14 | PROVINCE_LIST=PROVINCE.split(","); 15 | 16 | def get_one_record(fake,user_list,brand_list,id): 17 | record_id="%010d"%id 18 | user_id=user_list[random.randint(0,len(user_list)-1)] 19 | brand_id=brand_list[random.randint(0,len(brand_list)-1)] 20 | transaction_time=int(time.time()) 21 | price=random.randint(0,1000) 22 | source_province=PROVINCE_LIST[random.randint(0,len(PROVINCE_LIST)-1)] 23 | target_province=PROVINCE_LIST[random.randint(0,len(PROVINCE_LIST)-1)] 24 | website=WEBSITE_LIST[random.randint(0,len(WEBSITE_LIST)-1)] 25 | express=EXPRESS_LIST[random.randint(0,len(EXPRESS_LIST)-1)] 26 | express_id=fake.credit_card_number() 27 | ip=fake.ipv4() 28 | language=fake.language_code() 29 | return record_id+","+user_id+","+brand_id+","+str(transaction_time)+","+str(price)+","+source_province+","+target_province+","+website+","+str(express_id)+","+express+","+ip+","+language 30 | 31 | 32 | 33 | 34 | def generate_record(total): 35 | fake = Factory.create() 36 | user_list=get_user_list() 37 | brand_list=get_brand_list() 38 | f=open(RECORD_FILE,'w') 39 | count=0 40 | while(count 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 
| 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /etl/loadDataToHive/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | bigdata 8 | etl 9 | 1.0-SNAPSHOT 10 | 11 | 1.8 12 | 2.7.3 13 | 2.1.0 14 | 15 | 16 | 17 | org.apache.hadoop 18 | hadoop-common 19 | ${hadoop.version} 20 | 21 | 22 | org.apache.hive 23 | hive-jdbc 24 | ${hive.version} 25 | 26 | 27 | 28 | 29 | 30 | maven-assembly-plugin 31 | 2.3 32 | 33 | dist 34 | true 35 | 36 | jar-with-dependencies 37 | 38 | 39 | 40 | 41 | make-assembly 42 | package 43 | 44 | single 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /etl/loadDataToHive/src/main/java/bigdata/etl/loadDataToHive.java: -------------------------------------------------------------------------------- 1 | package bigdata.etl; 2 | 3 | import java.sql.*; 4 | 5 | /** 6 | * Created by qianxi.zhang on 11/25/16. 7 | */ 8 | public class loadDataToHive { 9 | private static String LOAD_CMD = 10 | "load data inpath '%s' overwrite into table record partition(partition_date='%s',hour_minute='%s')"; 11 | 12 | private static String driverName = "org.apache.hive.jdbc.HiveDriver"; 13 | 14 | public static void loadData(String dataDir, String date, String hour_minute) throws SQLException { 15 | try { 16 | Class.forName(driverName); 17 | } catch (ClassNotFoundException e) { 18 | e.printStackTrace(); 19 | System.exit(1); 20 | } 21 | Connection con = 22 | DriverManager.getConnection("jdbc:hive2://bigdata:10000/default", "bigdata", "bigdata"); 23 | Statement stmt = con.createStatement(); 24 | String sql = String.format(LOAD_CMD, dataDir, date, hour_minute); 25 | stmt.execute(sql); 26 | } 27 | 28 | public static void main(String[] args) throws SQLException { 29 | if (args.length != 3) throw new IllegalArgumentException("need 3 args"); 30 | loadDataToHive.loadData(args[0], args[1], args[2]); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /etl/start_etl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os,time 5 | 6 | INTPUT_PATH="hdfs://bigdata:9000/flume/record/" 7 | OUTPUT_PATH="hdfs://bigdata:9000/etl/record/" 8 | LOAD_CMD="java -cp /home/bigdata/etl/etl-1.0-SNAPSHOT-jar-with-dependencies.jar bigdata.etl.loadDataToHive %s %s %s" 9 | HADOOP_CMD="hadoop jar /home/bigdata/hadoop-2.7.3/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar -D mapred.reduce.tasks=0 -D mapred.map.tasks=1 -input %s -output %s -mapper /home/bigdata/etl/etl.py -file /home/bigdata/etl/etl.py" 10 | 11 | def getCurrentYmdHM(): 12 | time_struct=time.localtime(time.time()-60*10) 13 | 14 | H= time.strftime('%H',time_struct) 15 | M= int(time.strftime('%M',time_struct)) 16 | M= "%02d" % ((M/10)*10) 17 | Ymd=time.strftime('%Y-%m-%d',time_struct) 18 | 19 | return Ymd+"/"+H+M 20 | 21 | def startETL(): 22 | subPath=getCurrentYmdHM() 23 | input=INTPUT_PATH+subPath 24 | output=OUTPUT_PATH+subPath 25 | 26 | hadoop_cmd=HADOOP_CMD %(input, output) 27 | print hadoop_cmd 28 | os.system(hadoop_cmd) 
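    # The streaming job above is map-only (mapred.reduce.tasks=0): etl.py just reshapes
    # the raw records. Its output directory is then handed to the Hive LOAD below, which
    # attaches it as the (partition_date, hour_minute) partition derived from subPath.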
29 | print 'loading data into Hive' 30 | load_cmd= LOAD_CMD %(output,subPath.split("/")[0],subPath.split("/")[1]) 31 | os.system(load_cmd) 32 | 33 | if __name__ == '__main__': 34 | startETL() 35 | -------------------------------------------------------------------------------- /flume/command/start_flume_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | flume-ng agent --conf /home/bigdata/apache-flume-1.7.0-bin/conf --conf-file /home/bigdata/apache-flume-1.7.0-bin/conf/flume-conf-logAnalysis.properties --name logAgent -Dflume.root.logger=DEBUG,console -Dflume.monitoring.type=http -Dflume.monitoring.port=34545 4 | 5 | -------------------------------------------------------------------------------- /flume/command/start_flume_realtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | flume-ng agent --conf /home/bigdata/apache-flume-1.7.0-bin/conf --conf-file /home/bigdata/apache-flume-1.7.0-bin/conf/flume-conf-logAnalysis-kafka.properties --name logAgent -Dflume.root.logger=DEBUG,console -Dflume.monitoring.type=http -Dflume.monitoring.port=34546 4 | -------------------------------------------------------------------------------- /flume/conf/flume-conf-logAnalysis-kafka.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | 19 | # The configuration file needs to define the sources, 20 | # the channels and the sinks. 21 | # Sources, channels and sinks are defined per agent, 22 | # in this case called 'agent' 23 | 24 | logAgent.sources = logSource 25 | logAgent.channels = fileChannel 26 | logAgent.sinks = kafkaSink 27 | 28 | # For each one of the sources, the type is defined 29 | logAgent.sources.logSource.type = exec 30 | logAgent.sources.logSource.command = tail -F /home/bigdata/datasource/record.list 31 | 32 | # The channel can be defined as follows. 33 | logAgent.sources.logSource.channels = fileChannel 34 | 35 | # Each sink's type must be defined 36 | logAgent.sinks.kafkaSink.type = org.apache.flume.sink.kafka.KafkaSink 37 | logAgent.sinks.kafkaSink.topic = log 38 | logAgent.sinks.kafkaSink.brokerList= bigdata:9092 39 | logAgent.sinks.kafkaSink.batchSize= 10 40 | #Specify the channel the sink should use 41 | logAgent.sinks.kafkaSink.channel = fileChannel 42 | 43 | # Each channel's type is defined. 
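# A file channel buffers events on disk so they survive an agent restart; the
# checkpoint and data directories below must be writable by the Flume user.
# Channel sizing can be tuned if needed, e.g. (illustrative values only):
# logAgent.channels.fileChannel.capacity = 1000000
# logAgent.channels.fileChannel.transactionCapacity = 10000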
44 | logAgent.channels.fileChannel.type = file 45 | logAgent.channels.fileChannel.checkpointDir= /home/bigdata/apache-flume-1.7.0-bin/dataCheckpointDir_realtime 46 | logAgent.channels.fileChannel.dataDirs= /home/bigdata/apache-flume-1.7.0-bin/dataDir_realtime 47 | 48 | # Other config values specific to each type of channel(sink or source) 49 | # can be defined as well 50 | # In this case, it specifies the capacity of the memory channel 51 | -------------------------------------------------------------------------------- /flume/conf/flume-conf-logAnalysis.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | 19 | # The configuration file needs to define the sources, 20 | # the channels and the sinks. 21 | # Sources, channels and sinks are defined per agent, 22 | # in this case called 'agent' 23 | 24 | logAgent.sources = logSource 25 | logAgent.channels = fileChannel 26 | logAgent.sinks = hdfsSink 27 | 28 | # For each one of the sources, the type is defined 29 | logAgent.sources.logSource.type = exec 30 | logAgent.sources.logSource.command = tail -F /home/bigdata/datasource/record.list 31 | 32 | # The channel can be defined as follows. 33 | logAgent.sources.logSource.channels = fileChannel 34 | 35 | # Each sink's type must be defined 36 | logAgent.sinks.hdfsSink.type = hdfs 37 | logAgent.sinks.hdfsSink.hdfs.path = hdfs://bigdata:9000/flume/record/%Y-%m-%d/%H%M 38 | logAgent.sinks.hdfsSink.hdfs.filePrefix= transaction_log 39 | logAgent.sinks.hdfsSink.hdfs.rollInterval= 600 40 | logAgent.sinks.hdfsSink.hdfs.rollCount= 10000 41 | logAgent.sinks.hdfsSink.hdfs.rollSize= 0 42 | logAgent.sinks.hdfsSink.hdfs.round = true 43 | logAgent.sinks.hdfsSink.hdfs.roundValue = 10 44 | logAgent.sinks.hdfsSink.hdfs.roundUnit = minute 45 | logAgent.sinks.hdfsSink.hdfs.fileType = DataStream 46 | logAgent.sinks.hdfsSink.hdfs.useLocalTimeStamp = true 47 | #Specify the channel the sink should use 48 | logAgent.sinks.hdfsSink.channel = fileChannel 49 | 50 | # Each channel's type is defined. 
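# As in the realtime agent, a durable file channel is used so events queued for the
# HDFS sink are not lost on restart; note the separate checkpoint/data directories,
# so the batch and realtime agents never share channel state.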
51 | logAgent.channels.fileChannel.type = file 52 | logAgent.channels.fileChannel.checkpointDir= /home/bigdata/apache-flume-1.7.0-bin/dataCheckpointDir 53 | logAgent.channels.fileChannel.dataDirs= /home/bigdata/apache-flume-1.7.0-bin/dataDir 54 | 55 | # Other config values specific to each type of channel(sink or source) 56 | # can be defined as well 57 | # In this case, it specifies the capacity of the memory channel 58 | -------------------------------------------------------------------------------- /flume/conf/flume-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | # Give Flume more memory and pre-allocate, enable remote monitoring via JMX 19 | export JAVA_OPTS="-Xms100m -Xmx200m -Dcom.sun.management.jmxremote" 20 | 21 | # Let Flume write raw event data and configuration information to its log files for debugging 22 | # purposes. Enabling these flags is not recommended in production, 23 | # as it may result in logging sensitive user information or encryption secrets. 24 | # $JAVA_OPTS="$JAVA_OPTS -Dorg.apache.flume.log.rawdata=true -Dorg.apache.flume.log.printconfig=true " 25 | 26 | # Foll. classpath will be included in Flume's classpath. 27 | # Note that the Flume conf directory is always included in the classpath. 
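# The HDFS sink needs the Hadoop client classes at runtime; the jar added below must
# match the locally installed Hadoop version (2.7.3 here). Adjust the path if Hadoop
# is upgraded.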
28 | FLUME_CLASSPATH="$HADOOP_HOME/share/hadoop/common/hadoop-common-2.7.3.jar" # Example: "path1;path2;path3" 29 | -------------------------------------------------------------------------------- /hadoop/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /hadoop/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | hadoop-core 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /hadoop/command/start-dfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/start-dfs.sh 4 | -------------------------------------------------------------------------------- /hadoop/command/start-historyserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/mr-jobhistory-daemon.sh start historyserver 4 | 5 | -------------------------------------------------------------------------------- /hadoop/command/start-yarn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/start-yarn.sh 4 | 5 | -------------------------------------------------------------------------------- /hadoop/command/stop-dfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/stop-dfs.sh 4 | -------------------------------------------------------------------------------- /hadoop/command/stop-historyserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/mr-jobhistory-daemon.sh stop historyserver 4 | 5 | -------------------------------------------------------------------------------- /hadoop/command/stop-yarn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/hadoop-2.7.3/sbin/stop-yarn.sh 4 | 5 | -------------------------------------------------------------------------------- /hadoop/conf/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | 17 | yarn.scheduler.capacity.maximum-applications 18 | 10000 19 | 20 | Maximum number of applications that can be pending and running. 21 | 22 | 23 | 24 | 25 | yarn.scheduler.capacity.maximum-am-resource-percent 26 | 0.5 27 | 28 | Maximum percent of resources in the cluster which can be used to run 29 | application masters i.e. controls number of concurrent running 30 | applications. 31 | 32 | 33 | 34 | 35 | yarn.scheduler.capacity.resource-calculator 36 | org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator 37 | 38 | The ResourceCalculator implementation to be used to compare 39 | Resources in the scheduler. 40 | The default i.e. 
DefaultResourceCalculator only uses Memory while 41 | DominantResourceCalculator uses dominant-resource to compare 42 | multi-dimensional resources such as Memory, CPU etc. 43 | 44 | 45 | 46 | 47 | yarn.scheduler.capacity.root.queues 48 | etl,report,default 49 | 50 | The queues at the this level (root is the root queue). 51 | 52 | 53 | 54 | 55 | yarn.scheduler.capacity.root.default.capacity 56 | 40 57 | Default queue target capacity. 58 | 59 | 60 | yarn.scheduler.capacity.root.etl.capacity 61 | 30 62 | etl queue target capacity. 63 | 64 | 65 | yarn.scheduler.capacity.root.report.capacity 66 | 30 67 | report queue target capacity. 68 | 69 | 70 | yarn.scheduler.capacity.root.default.user-limit-factor 71 | 1 72 | 73 | Default queue user limit a percentage from 0.0 to 1.0. 74 | 75 | 76 | 77 | 78 | yarn.scheduler.capacity.root.etl.user-limit-factor 79 | 1 80 | 81 | etl queue user limit a percentage from 0.0 to 1.0. 82 | 83 | 84 | 85 | yarn.scheduler.capacity.root.report.user-limit-factor 86 | 1 87 | 88 | report queue user limit a percentage from 0.0 to 1.0. 89 | 90 | 91 | 92 | yarn.scheduler.capacity.root.default.maximum-capacity 93 | 100 94 | 95 | The maximum capacity of the default queue. 96 | 97 | 98 | 99 | yarn.scheduler.capacity.root.etl.maximum-capacity 100 | 100 101 | 102 | The maximum capacity of the etl queue. 103 | 104 | 105 | 106 | yarn.scheduler.capacity.root.report.maximum-capacity 107 | 100 108 | 109 | The maximum capacity of the report queue. 110 | 111 | 112 | 113 | 114 | yarn.scheduler.capacity.root.default.state 115 | RUNNING 116 | 117 | The state of the default queue. State can be one of RUNNING or STOPPED. 118 | 119 | 120 | 121 | yarn.scheduler.capacity.root.etl.state 122 | RUNNING 123 | 124 | The state of the etl queue. State can be one of RUNNING or STOPPED. 125 | 126 | 127 | 128 | yarn.scheduler.capacity.root.report.state 129 | RUNNING 130 | 131 | The state of the report queue. State can be one of RUNNING or STOPPED. 132 | 133 | 134 | 135 | yarn.scheduler.capacity.root.default.acl_submit_applications 136 | * 137 | 138 | The ACL of who can submit jobs to the default queue. 139 | 140 | 141 | 142 | yarn.scheduler.capacity.root.etl.acl_submit_applications 143 | * 144 | 145 | The ACL of who can submit jobs to the etl queue. 146 | 147 | 148 | 149 | yarn.scheduler.capacity.root.report.acl_submit_applications 150 | * 151 | 152 | The ACL of who can submit jobs to the report queue. 153 | 154 | 155 | 156 | yarn.scheduler.capacity.root.default.acl_administer_queue 157 | * 158 | 159 | The ACL of who can administer jobs on the default queue. 160 | 161 | 162 | 163 | yarn.scheduler.capacity.root.etl.acl_administer_queue 164 | * 165 | 166 | The ACL of who can administer jobs on the etl queue. 167 | 168 | 169 | 170 | yarn.scheduler.capacity.root.report.acl_administer_queue 171 | * 172 | 173 | The ACL of who can administer jobs on the report queue. 174 | 175 | 176 | 177 | yarn.scheduler.capacity.node-locality-delay 178 | 40 179 | 180 | Number of missed scheduling opportunities after which the CapacityScheduler 181 | attempts to schedule rack-local containers. 182 | Typically this should be set to number of nodes in the cluster, By default is setting 183 | approximately number of nodes in one rack which is 40. 
184 | 185 | 186 | 187 | 188 | yarn.scheduler.capacity.queue-mappings 189 | 190 | 191 | A list of mappings that will be used to assign jobs to queues 192 | The syntax for this list is [u|g]:[name]:[queue_name][,next mapping]* 193 | Typically this list will be used to map users to queues, 194 | for example, u:%user:%user maps all users to queues with the same name 195 | as the user. 196 | 197 | 198 | 199 | 200 | yarn.scheduler.capacity.queue-mappings-override.enable 201 | false 202 | 203 | If a queue mapping is present, will it override the value specified 204 | by the user? This can be used by administrators to place jobs in queues 205 | that are different than the one specified by the user. 206 | The default is false. 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /hadoop/conf/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | fs.defaultFS 21 | hdfs://bigdata:9000 22 | 23 | 24 | hadoop.tmp.dir 25 | /home/bigdata/hadoopdata 26 | 27 | 28 | hadoop.proxyuser.bigdata.hosts 29 | * 30 | 31 | 32 | hadoop.proxyuser.bigdata.groups 33 | * 34 | 35 | 36 | -------------------------------------------------------------------------------- /hadoop/conf/hadoop-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Set Hadoop-specific environment variables here. 18 | 19 | # The only required environment variable is JAVA_HOME. All others are 20 | # optional. When running a distributed configuration it is best to 21 | # set JAVA_HOME in this file, so that it is correctly defined on 22 | # remote nodes. 23 | 24 | # The java implementation to use. 25 | export JAVA_HOME=/usr/java/jdk1.8.0_101/ 26 | 27 | # The jsvc implementation to use. Jsvc is required to run secure datanodes 28 | # that bind to privileged ports to provide authentication of data transfer 29 | # protocol. Jsvc is not required if SASL is configured for authentication of 30 | # data transfer protocol using non-privileged ports. 31 | #export JSVC_HOME=${JSVC_HOME} 32 | 33 | export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"} 34 | 35 | # Extra Java CLASSPATH elements. Automatically insert capacity-scheduler. 36 | for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do 37 | if [ "$HADOOP_CLASSPATH" ]; then 38 | export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f 39 | else 40 | export HADOOP_CLASSPATH=$f 41 | fi 42 | done 43 | 44 | # The maximum amount of heap to use, in MB. Default is 1000. 45 | export HADOOP_HEAPSIZE=512 46 | #export HADOOP_NAMENODE_INIT_HEAPSIZE="" 47 | 48 | # Extra Java runtime options. Empty by default. 
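# -Djava.net.preferIPv4Stack=true keeps the daemons on IPv4, which avoids binding
# problems on hosts where IPv6 is only partially configured.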
49 | export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true" 50 | 51 | # Command specific options appended to HADOOP_OPTS when specified 52 | export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS" 53 | export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS" 54 | 55 | export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS" 56 | 57 | export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS" 58 | export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS" 59 | 60 | # The following applies to multiple commands (fs, dfs, fsck, distcp etc) 61 | export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS" 62 | #HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS" 63 | 64 | # On secure datanodes, user to run the datanode as after dropping privileges. 65 | # This **MUST** be uncommented to enable secure HDFS if using privileged ports 66 | # to provide authentication of data transfer protocol. This **MUST NOT** be 67 | # defined if SASL is configured for authentication of data transfer protocol 68 | # using non-privileged ports. 69 | export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER} 70 | 71 | # Where log files are stored. $HADOOP_HOME/logs by default. 72 | #export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER 73 | 74 | # Where log files are stored in the secure data environment. 75 | export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER} 76 | 77 | ### 78 | # HDFS Mover specific parameters 79 | ### 80 | # Specify the JVM options to be used when starting the HDFS Mover. 81 | # These options will be appended to the options specified as HADOOP_OPTS 82 | # and therefore may override any similar flags set in HADOOP_OPTS 83 | # 84 | # export HADOOP_MOVER_OPTS="" 85 | 86 | ### 87 | # Advanced Users Only! 88 | ### 89 | 90 | # The directory where pid files are stored. /tmp by default. 91 | # NOTE: this should be set to a directory that can only be written to by 92 | # the user that will run the hadoop daemons. Otherwise there is the 93 | # potential for a symlink attack. 94 | export HADOOP_PID_DIR=${HADOOP_PID_DIR} 95 | export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} 96 | 97 | # A string representing this instance of hadoop. $USER by default. 98 | export HADOOP_IDENT_STRING=$USER 99 | -------------------------------------------------------------------------------- /hadoop/conf/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | dfs.replication 22 | 1 23 | 24 | 25 | -------------------------------------------------------------------------------- /hadoop/conf/mapred-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # export JAVA_HOME=/home/y/libexec/jdk1.6.0/ 17 | 18 | export HADOOP_JOB_HISTORYSERVER_HEAPSIZE=512 19 | 20 | export HADOOP_MAPRED_ROOT_LOGGER=INFO,RFA 21 | 22 | #export HADOOP_JOB_HISTORYSERVER_OPTS= 23 | #export HADOOP_MAPRED_LOG_DIR="" # Where log files are stored. $HADOOP_MAPRED_HOME/logs by default. 24 | #export HADOOP_JHS_LOGGER=INFO,RFA # Hadoop JobSummary logger. 25 | #export HADOOP_MAPRED_PID_DIR= # The pid files are stored. /tmp by default. 26 | #export HADOOP_MAPRED_IDENT_STRING= #A string representing this instance of hadoop. $USER by default 27 | #export HADOOP_MAPRED_NICENESS= #The scheduling priority for daemons. Defaults to 0. 28 | -------------------------------------------------------------------------------- /hadoop/conf/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapreduce.framework.name 22 | yarn 23 | 24 | 25 | mapreduce.jobhistory.done-dir 26 | /user/history/done 27 | 28 | 29 | mapreduce.jobhistory.intermediate-done-dir 30 | /user/history/done_intermediate 31 | 32 | 33 | -------------------------------------------------------------------------------- /hadoop/conf/slaves: -------------------------------------------------------------------------------- 1 | bigdata 2 | -------------------------------------------------------------------------------- /hadoop/conf/yarn-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # User for YARN daemons 17 | export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn} 18 | 19 | # resolve links - $0 may be a softlink 20 | export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}" 21 | 22 | # some Java parameters 23 | export JAVA_HOME=/usr/java/jdk1.8.0_101/ 24 | if [ "$JAVA_HOME" != "" ]; then 25 | #echo "run java in $JAVA_HOME" 26 | JAVA_HOME=$JAVA_HOME 27 | fi 28 | 29 | if [ "$JAVA_HOME" = "" ]; then 30 | echo "Error: JAVA_HOME is not set." 
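  # Fail fast: none of the YARN daemons can start without a usable JDK.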
31 | exit 1 32 | fi 33 | 34 | JAVA=$JAVA_HOME/bin/java 35 | JAVA_HEAP_MAX=-Xmx512m 36 | 37 | # For setting YARN specific HEAP sizes please use this 38 | # Parameter and set appropriately 39 | # YARN_HEAPSIZE=1000 40 | 41 | # check envvars which might override default args 42 | if [ "$YARN_HEAPSIZE" != "" ]; then 43 | JAVA_HEAP_MAX="-Xmx""$YARN_HEAPSIZE""m" 44 | fi 45 | 46 | # Resource Manager specific parameters 47 | 48 | # Specify the max Heapsize for the ResourceManager using a numerical value 49 | # in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set 50 | # the value to 1000. 51 | # This value will be overridden by an Xmx setting specified in either YARN_OPTS 52 | # and/or YARN_RESOURCEMANAGER_OPTS. 53 | # If not specified, the default value will be picked from either YARN_HEAPMAX 54 | # or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. 55 | #export YARN_RESOURCEMANAGER_HEAPSIZE=1000 56 | 57 | # Specify the max Heapsize for the timeline server using a numerical value 58 | # in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set 59 | # the value to 1000. 60 | # This value will be overridden by an Xmx setting specified in either YARN_OPTS 61 | # and/or YARN_TIMELINESERVER_OPTS. 62 | # If not specified, the default value will be picked from either YARN_HEAPMAX 63 | # or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. 64 | #export YARN_TIMELINESERVER_HEAPSIZE=1000 65 | 66 | # Specify the JVM options to be used when starting the ResourceManager. 67 | # These options will be appended to the options specified as YARN_OPTS 68 | # and therefore may override any similar flags set in YARN_OPTS 69 | #export YARN_RESOURCEMANAGER_OPTS= 70 | 71 | # Node Manager specific parameters 72 | 73 | # Specify the max Heapsize for the NodeManager using a numerical value 74 | # in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set 75 | # the value to 1000. 76 | # This value will be overridden by an Xmx setting specified in either YARN_OPTS 77 | # and/or YARN_NODEMANAGER_OPTS. 78 | # If not specified, the default value will be picked from either YARN_HEAPMAX 79 | # or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. 80 | #export YARN_NODEMANAGER_HEAPSIZE=1000 81 | 82 | # Specify the JVM options to be used when starting the NodeManager. 
83 | # These options will be appended to the options specified as YARN_OPTS 84 | # and therefore may override any similar flags set in YARN_OPTS 85 | #export YARN_NODEMANAGER_OPTS= 86 | 87 | # so that filenames w/ spaces are handled correctly in loops below 88 | IFS= 89 | 90 | 91 | # default log directory & file 92 | if [ "$YARN_LOG_DIR" = "" ]; then 93 | YARN_LOG_DIR="$HADOOP_YARN_HOME/logs" 94 | fi 95 | if [ "$YARN_LOGFILE" = "" ]; then 96 | YARN_LOGFILE='yarn.log' 97 | fi 98 | 99 | # default policy file for service-level authorization 100 | if [ "$YARN_POLICYFILE" = "" ]; then 101 | YARN_POLICYFILE="hadoop-policy.xml" 102 | fi 103 | 104 | # restore ordinary behaviour 105 | unset IFS 106 | 107 | 108 | YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR" 109 | YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR" 110 | YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE" 111 | YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE" 112 | YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME" 113 | YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING" 114 | YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" 115 | YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" 116 | if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then 117 | YARN_OPTS="$YARN_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" 118 | fi 119 | YARN_OPTS="$YARN_OPTS -Dyarn.policy.file=$YARN_POLICYFILE" 120 | 121 | 122 | -------------------------------------------------------------------------------- /hadoop/conf/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 16 | 17 | 18 | 19 | yarn.nodemanager.aux-services 20 | mapreduce_shuffle 21 | 22 | 23 | 24 | yarn.resourcemanager.address 25 | bigdata:18040 26 | 27 | 28 | 29 | yarn.resourcemanager.scheduler.address 30 | bigdata:18030 31 | 32 | 33 | 34 | yarn.resourcemanager.resource-tracker.address 35 | bigdata:18025 36 | 37 | 38 | 39 | yarn.resourcemanager.admin.address 40 | bigdata:18141 41 | 42 | 43 | 44 | yarn.resourcemanager.webapp.address 45 | bigdata:18088 46 | 47 | 48 | yarn.resourcemanager.scheduler.class 49 | org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler 50 | 51 | 52 | -------------------------------------------------------------------------------- /hadoop/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 4.0.0 7 | 8 | hadoop-example 9 | cn.chinahadoop 10 | 0.1.0-SNAPSHOT 11 | ../pom.xml 12 | 13 | 14 | cn.chinahadoop 15 | hadoop-core 16 | 0.1.0-SNAPSHOT 17 | Hadoop Core Examples 18 | hadoop core examples 19 | jar 20 | 21 | 22 | 2.7.3 23 | 24 | 25 | 26 | 27 | org.apache.hadoop 28 | hadoop-client 29 | ${hadoop.version} 30 | 31 | 32 | org.apache.hadoop 33 | hadoop-common 34 | 2.7.3 35 | 36 | 37 | org.apache.hadoop 38 | hadoop-mapreduce-client-core 39 | 2.7.3 40 | 41 | 42 | org.apache.hadoop 43 | hadoop-mapreduce-client-common 44 | 2.7.3 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/hdfs/HdfsExample.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.hdfs; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | 7 | public class HdfsExample { 8 | 9 | public static void testMkdirPath(String path) throws Exception { 10 | FileSystem fs = null; 11 | try { 12 | 
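      // Build a Configuration from the *-site.xml files on the classpath, resolve the
      // FileSystem that owns the given path, and create the directory (mkdirs also
      // creates any missing parent directories).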
System.out.println("Creating " + path + " on hdfs..."); 13 | Configuration conf = new Configuration(); 14 | // First create a new directory with mkdirs 15 | Path myPath = new Path(path); 16 | fs = myPath.getFileSystem(conf); 17 | 18 | fs.mkdirs(myPath); 19 | System.out.println("Create " + path + " on hdfs successfully."); 20 | } catch (Exception e) { 21 | System.out.println("Exception:" + e); 22 | } finally { 23 | if(fs != null) 24 | fs.close(); 25 | } 26 | } 27 | 28 | public static void testDeletePath(String path) throws Exception { 29 | FileSystem fs = null; 30 | try { 31 | System.out.println("Deleting " + path + " on hdfs..."); 32 | Configuration conf = new Configuration(); 33 | Path myPath = new Path(path); 34 | fs = myPath.getFileSystem(conf); 35 | 36 | fs.delete(myPath, true); 37 | System.out.println("Deleting " + path + " on hdfs successfully."); 38 | } catch (Exception e) { 39 | System.out.println("Exception:" + e); 40 | } finally { 41 | if(fs != null) 42 | fs.close(); 43 | } 44 | } 45 | 46 | public static void main(String[] args) { 47 | try { 48 | //String path = "hdfs:namenodehost:8020/test/mkdirs-test"; 49 | String path = "/test/mkdirs-test"; 50 | testMkdirPath(path); 51 | //testDeletePath(path); 52 | } catch (Exception e) { 53 | System.out.println("Exceptions:" + e); 54 | } 55 | System.out.println("timestamp:" + System.currentTimeMillis()); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/Grep.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.util.Random; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.map.InverseMapper; 14 | import org.apache.hadoop.mapreduce.lib.map.RegexMapper; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 17 | import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | import org.apache.hadoop.io.Text; 21 | 22 | public class Grep extends Configured implements Tool { 23 | private Grep() { 24 | } // singleton 25 | 26 | public int run(String[] args) throws Exception { 27 | if (args.length < 3) { 28 | System.out.println("Grep []"); 29 | ToolRunner.printGenericCommandUsage(System.out); 30 | return 2; 31 | } 32 | // the temp dir between two mapreduce jobs 33 | Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); 34 | 35 | Configuration conf = getConf(); 36 | conf.set(RegexMapper.PATTERN, args[2]); 37 | if (args.length == 4) 38 | conf.set(RegexMapper.GROUP, args[3]); 39 | //the first job 40 | // word count 41 | Job grepJob = new Job(conf); 42 | 43 | try { 44 | //define the first job 45 | grepJob.setJobName("grep-search"); 46 | 47 | FileInputFormat.setInputPaths(grepJob, args[0]); 48 | 49 | grepJob.setMapperClass(RegexMapper.class); 50 | 51 | grepJob.setCombinerClass(LongSumReducer.class); 52 | 
grepJob.setReducerClass(LongSumReducer.class); 53 | // output to tempDir 54 | FileOutputFormat.setOutputPath(grepJob, tempDir); 55 | grepJob.setOutputFormatClass(SequenceFileOutputFormat.class); 56 | grepJob.setOutputKeyClass(Text.class); 57 | grepJob.setOutputValueClass(LongWritable.class); 58 | // result: word + count 59 | grepJob.waitForCompletion(true); 60 | //the second job 61 | //sort 62 | Job sortJob = new Job(conf); 63 | sortJob.setJobName("grep-sort"); 64 | //tempDir to input 65 | FileInputFormat.setInputPaths(sortJob, tempDir); 66 | sortJob.setInputFormatClass(SequenceFileInputFormat.class); 67 | 68 | sortJob.setMapperClass(InverseMapper.class); 69 | //just write the sort data out 70 | sortJob.setNumReduceTasks(1); // write a single file 71 | FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); 72 | sortJob.setSortComparatorClass( // sort by decreasing freq 73 | LongWritable.DecreasingComparator.class); 74 | 75 | FileSystem.get(conf).delete(new Path(args[1]),true); 76 | 77 | sortJob.waitForCompletion(true); 78 | } finally { 79 | FileSystem.get(conf).delete(tempDir, true); 80 | } 81 | return 0; 82 | } 83 | 84 | public static void main(String[] args) throws Exception { 85 | int res = ToolRunner.run(new Configuration(), new Grep(), args); 86 | System.exit(res); 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/InvertedIndex.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 12 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | import org.apache.hadoop.util.GenericOptionsParser; 16 | 17 | import java.io.IOException; 18 | import java.util.HashMap; 19 | import java.util.Map; 20 | import java.util.StringTokenizer; 21 | 22 | /** 23 | * Created by qianxi.zhang on 12/17/16. 
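 * The mapper emits (word, source file name) pairs; the reducer then counts how many
 * times each word occurs in each file and writes word with its {file=count, ...} map.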
24 | */ 25 | 26 | public class InvertedIndex { 27 | public static class WordToFileMapper extends Mapper { 28 | @Override 29 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 30 | // Get the name of the file using context.getInputSplit()method 31 | String fileName = ((FileSplit) context.getInputSplit()).getPath().getName(); 32 | // Split the line in words 33 | StringTokenizer itr = new StringTokenizer(value.toString()); 34 | while (itr.hasMoreTokens()) { 35 | // For each word emit word as key and file name as value 36 | context.write(new Text(itr.nextToken()), new Text(fileName)); 37 | } 38 | } 39 | } 40 | 41 | public static class WordToFileCountReducer extends Reducer { 42 | @Override 43 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 44 | // Declare the Hash Map to store File name as key to compute 45 | // and store number of times the filename is occurred for as value 46 | Map map = new HashMap(); 47 | for (Text fileText : values) { 48 | String file = fileText.toString(); 49 | if (map.containsKey(file)) { 50 | map.put(file, map.get(file) + 1); 51 | } else { 52 | map.put(file, 1); 53 | } 54 | } 55 | context.write(key, new Text(map.toString())); 56 | } 57 | } 58 | 59 | public static void main(String[] args) throws Exception { 60 | Configuration conf = new Configuration(); 61 | 62 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 63 | if (otherArgs.length < 2) { 64 | System.err.println("Usage: invertedindex [...] "); 65 | System.exit(2); 66 | } 67 | Job job = Job.getInstance(conf, "invert index"); 68 | job.setJarByClass(InvertedIndex.class); 69 | job.setMapperClass(WordToFileMapper.class); 70 | job.setReducerClass(WordToFileCountReducer.class); 71 | 72 | // Defining the output key and value class for the mapper 73 | job.setMapOutputKeyClass(Text.class); 74 | job.setMapOutputValueClass(Text.class); 75 | 76 | // Defining the output key and value class for the reducer 77 | job.setOutputKeyClass(Text.class); 78 | job.setOutputValueClass(Text.class); 79 | 80 | job.setInputFormatClass(TextInputFormat.class); 81 | job.setOutputFormatClass(TextOutputFormat.class); 82 | 83 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 84 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 85 | 86 | Path outputPath = new Path(otherArgs[1]); 87 | 88 | outputPath.getFileSystem(conf).delete(outputPath); 89 | 90 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 91 | } 92 | } -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/JobFailureTest.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.conf.Configured; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Reducer.Context; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | 21 | public class JobFailureTest extends Configured implements Tool { 22 | public static class TokenizerMapper extends Mapper { 23 | 24 | private final static IntWritable one = new IntWritable(1); 25 | private Text word = new Text(); 26 | 27 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 28 | //get the task id from context 29 | //for the first map task(task id=0), the task always fails 30 | int id = context.getTaskAttemptID().getTaskID().getId(); 31 | System.out.println("id:" + id); 32 | if (id == 0) 33 | System.exit(-1); 34 | StringTokenizer itr = new StringTokenizer(value.toString()); 35 | while (itr.hasMoreTokens()) { 36 | word.set(itr.nextToken()); 37 | context.write(word, one); 38 | } 39 | } 40 | } 41 | 42 | public static class IntSumReducer extends Reducer { 43 | private IntWritable result = new IntWritable(); 44 | 45 | public void reduce(Text key, Iterable values, Context context) 46 | throws IOException, InterruptedException { 47 | int sum = 0; 48 | for (IntWritable val : values) { 49 | sum += val.get(); 50 | } 51 | result.set(sum); 52 | context.write(key, result); 53 | } 54 | } 55 | 56 | public int run(String[] args) throws Exception { 57 | Configuration conf = getConf(); 58 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 59 | if (otherArgs.length < 2) { 60 | System.err.println("Usage: word count job failure test [...] "); 61 | System.exit(2); 62 | } 63 | Job job = new Job(conf, "word count job failure test"); 64 | job.setJarByClass(JobFailureTest.class); 65 | job.setMapperClass(TokenizerMapper.class); 66 | job.setCombinerClass(IntSumReducer.class); 67 | job.setReducerClass(IntSumReducer.class); 68 | job.setOutputKeyClass(Text.class); 69 | job.setOutputValueClass(IntWritable.class); 70 | for (int i = 0; i < otherArgs.length - 1; ++i) { 71 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 72 | } 73 | 74 | Path outputPath = new Path(otherArgs[1]); 75 | 76 | outputPath.getFileSystem(conf).delete(outputPath); 77 | 78 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 79 | return job.waitForCompletion(true) ? 
0 : 1; 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | int res = ToolRunner.run(new Configuration(), new JobFailureTest(), args); 84 | System.exit(res); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/OOMTest.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.Field; 5 | import java.util.StringTokenizer; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.conf.Configured; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | import sun.misc.Unsafe; 21 | 22 | public class OOMTest extends Configured implements Tool { 23 | 24 | public static final int on_heap_length = 1 * 100 * 1000 * 1000; 25 | public static final int off_heap_length = 1000 * 1000 * 1000; 26 | 27 | public static class TokenizerMapper extends Mapper { 28 | private final static IntWritable one = new IntWritable(1); 29 | private Text word = new Text(); 30 | 31 | // allocate on heap space 32 | private byte[] byteArray = new byte[on_heap_length]; 33 | 34 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 35 | Field f = null; 36 | try { 37 | f = Unsafe.class.getDeclaredField("theUnsafe"); 38 | } catch (NoSuchFieldException e) { 39 | e.printStackTrace(); 40 | } 41 | f.setAccessible(true); 42 | Unsafe us = null; 43 | try { 44 | us = (Unsafe) f.get(null); 45 | } catch (IllegalAccessException e) { 46 | e.printStackTrace(); 47 | } 48 | // allocate off heap space 49 | long id = us.allocateMemory(off_heap_length); 50 | 51 | StringTokenizer itr = new StringTokenizer(value.toString()); 52 | while (itr.hasMoreTokens()) { 53 | word.set(itr.nextToken()); 54 | context.write(word, one); 55 | } 56 | } 57 | } 58 | 59 | public static class IntSumReducer extends Reducer { 60 | private IntWritable result = new IntWritable(); 61 | 62 | public void reduce(Text key, Iterable values, Context context) 63 | throws IOException, InterruptedException { 64 | int sum = 0; 65 | for (IntWritable val : values) { 66 | sum += val.get(); 67 | } 68 | result.set(sum); 69 | context.write(key, result); 70 | } 71 | } 72 | 73 | public int run(String[] args) throws Exception { 74 | Configuration conf = getConf(); 75 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 76 | if (otherArgs.length < 2) { 77 | System.err.println("Usage: oom-test [...] 
"); 78 | System.exit(2); 79 | } 80 | Job job = new Job(conf, "oom test"); 81 | job.setJarByClass(OOMTest.class); 82 | job.setMapperClass(TokenizerMapper.class); 83 | job.setCombinerClass(IntSumReducer.class); 84 | job.setReducerClass(IntSumReducer.class); 85 | job.setOutputKeyClass(Text.class); 86 | job.setOutputValueClass(IntWritable.class); 87 | 88 | Path outputPath = new Path(otherArgs[1]); 89 | outputPath.getFileSystem(conf).delete(outputPath); 90 | 91 | for (int i = 0; i < otherArgs.length - 1; ++i) { 92 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 93 | } 94 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 95 | return job.waitForCompletion(true) ? 0 : 1; 96 | } 97 | 98 | public static void main(String[] args) throws Exception { 99 | int res = ToolRunner.run(new Configuration(), new OOMTest(), args); 100 | System.exit(res); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/TaskAttemptTest.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.conf.Configured; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Reducer.Context; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | import org.apache.hadoop.util.Tool; 19 | import org.apache.hadoop.util.ToolRunner; 20 | 21 | public class TaskAttemptTest extends Configured implements Tool { 22 | public static class TokenizerMapper extends Mapper { 23 | 24 | private final static IntWritable one = new IntWritable(1); 25 | private Text word = new Text(); 26 | 27 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 28 | //get the task attempt id 29 | //for the 4 previous attempt for the task, the attempt always fails. 30 | int id = context.getTaskAttemptID().getId(); 31 | System.out.println("id:" + id); 32 | if (id < 4) 33 | System.exit(-1); 34 | StringTokenizer itr = new StringTokenizer(value.toString()); 35 | while (itr.hasMoreTokens()) { 36 | word.set(itr.nextToken()); 37 | context.write(word, one); 38 | } 39 | } 40 | } 41 | 42 | public static class IntSumReducer extends Reducer { 43 | private IntWritable result = new IntWritable(); 44 | 45 | public void reduce(Text key, Iterable values, Context context) 46 | throws IOException, InterruptedException { 47 | int sum = 0; 48 | for (IntWritable val : values) { 49 | sum += val.get(); 50 | } 51 | result.set(sum); 52 | context.write(key, result); 53 | } 54 | } 55 | 56 | public int run(String[] args) throws Exception { 57 | Configuration conf = getConf(); 58 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 59 | if (otherArgs.length < 2) { 60 | System.err.println("Usage: word count attempt test [...] 
"); 61 | System.exit(2); 62 | } 63 | Job job = new Job(conf, "word count attempt test"); 64 | job.setJarByClass(TaskAttemptTest.class); 65 | job.setMapperClass(TokenizerMapper.class); 66 | job.setCombinerClass(IntSumReducer.class); 67 | job.setReducerClass(IntSumReducer.class); 68 | job.setOutputKeyClass(Text.class); 69 | job.setOutputValueClass(IntWritable.class); 70 | for (int i = 0; i < otherArgs.length - 1; ++i) { 71 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 72 | } 73 | 74 | Path outputPath = new Path(otherArgs[1]); 75 | 76 | outputPath.getFileSystem(conf).delete(outputPath); 77 | 78 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 79 | return job.waitForCompletion(true) ? 0 : 1; 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | int res = ToolRunner.run(new Configuration(), new TaskAttemptTest(), args); 84 | System.exit(res); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /hadoop/src/main/java/cn/chinahadoop/mapreduce/WordCount.java: -------------------------------------------------------------------------------- 1 | package cn.chinahadoop.mapreduce; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | import org.apache.hadoop.util.GenericOptionsParser; 16 | 17 | public class WordCount { 18 | 19 | public static class TokenizerMapper extends Mapper { 20 | 21 | private final static IntWritable one = new IntWritable(1); 22 | private Text word = new Text(); 23 | 24 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 25 | StringTokenizer itr = new StringTokenizer(value.toString()); 26 | while (itr.hasMoreTokens()) { 27 | word.set(itr.nextToken()); 28 | context.write(word, one); 29 | } 30 | } 31 | } 32 | 33 | public static class IntSumReducer extends Reducer { 34 | private IntWritable result = new IntWritable(); 35 | 36 | public void reduce(Text key, Iterable values, Context context) 37 | throws IOException, InterruptedException { 38 | int sum = 0; 39 | for (IntWritable val : values) { 40 | sum += val.get(); 41 | } 42 | result.set(sum); 43 | context.write(key, result); 44 | } 45 | } 46 | 47 | public static void main(String[] args) throws Exception { 48 | Configuration conf = new Configuration(); 49 | String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 50 | if (otherArgs.length < 2) { 51 | System.err.println("Usage: wordcount [...] 
"); 52 | System.exit(2); 53 | } 54 | Job job = new Job(conf, "word count"); 55 | job.setJarByClass(WordCount.class); 56 | job.setMapperClass(TokenizerMapper.class); 57 | job.setCombinerClass(IntSumReducer.class); 58 | job.setReducerClass(IntSumReducer.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(IntWritable.class); 61 | for (int i = 0; i < otherArgs.length - 1; ++i) { 62 | FileInputFormat.addInputPath(job, new Path(otherArgs[i])); 63 | } 64 | 65 | Path outputPath = new Path(otherArgs[1]); 66 | 67 | outputPath.getFileSystem(conf).delete(outputPath); 68 | 69 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); 70 | System.exit(job.waitForCompletion(true) ? 0 : 1); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /hadoop/src/main/resources/input/input_1.txt: -------------------------------------------------------------------------------- 1 | hello world 2 | I have a dream 3 | all over the world 4 | hello china 5 | -------------------------------------------------------------------------------- /hadoop/src/main/resources/input/input_2.txt: -------------------------------------------------------------------------------- 1 | I have a best friend 2 | She is very thin and kind 3 | She is hard-working 4 | -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crc'H݇ -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/hadoop/src/main/resources/output/_SUCCESS -------------------------------------------------------------------------------- /hadoop/src/main/resources/output/part-r-00000: -------------------------------------------------------------------------------- 1 | I 2 2 | She 2 3 | a 2 4 | all 1 5 | and 1 6 | best 1 7 | china 1 8 | dream 1 9 | friend 1 10 | hard-working 1 11 | have 2 12 | hello 2 13 | is 2 14 | kind 1 15 | over 1 16 | the 1 17 | thin 1 18 | very 1 19 | world 2 20 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper.cpp: -------------------------------------------------------------------------------- 1 | // By dongxicheng, 2 | // blog:http://dongxicheng.org/ 3 | // mapper.cpp 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | using namespace std; 10 | string charArrayToString(char *str) { 11 | stringstream ss(str); 12 | return ss.str(); 13 | } 14 | 15 | vector& split( 16 | const string &s, char delim, vector &elems) { 17 | stringstream ss(s); 18 | string item; 19 | while(getline(ss, item, delim)) { 20 | elems.push_back(item); 21 | } 22 | return elems; 23 | } 24 | 25 | int main(int argc, char *argv[], char *env[]) { 26 | int reduce_task_no = -1; 27 | int iterator = -1; 28 | vector pairs; 29 | for(int i = 0; env[i] != NULL; i++) { 30 | pairs.clear(); 31 | split(charArrayToString(env[i]), '=', pairs); 32 | if(pairs.size() < 2) continue; 33 | if(pairs[0] == 
"mapreduce_job_reduces") // number of reduce tasks 34 | reduce_task_no = atoi(pairs[1].c_str()); 35 | else if(pairs[0] == "mapreduce_iterator_no") // user-defined attribute 36 | iterator = atoi(pairs[1].c_str()); 37 | } 38 | cerr << "mapreduce.job.reduces:" << reduce_task_no 39 | << ",mapreduce.iterator.no:" << iterator << endl; 40 | 41 | string key; 42 | while(cin >> key) { 43 | cout << key << "\t" << "1" << endl; 44 | // Define counter named counter_no in group counter_group 45 | cerr << "reporter:counter:counter_group,counter_no,1\n"; 46 | // dispaly status 47 | cerr << "reporter:status:processing......\n"; 48 | // Print logs for testing 49 | cerr << "This is log, will be printed in stdout file\n"; 50 | } 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/hadoop/streaming/mapper.php -------------------------------------------------------------------------------- /hadoop/streaming/mapper.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | while read LINE; do 3 | for word in $LINE 4 | do 5 | echo "$word 1" 6 | # in streaming, we define counter by 7 | # [reporter:counter:,,] 8 | # define a counter named counter_no, in group counter_group 9 | # increase this counter by 1 10 | # counter shoule be output through stderr 11 | echo "reporter:counter:counter_group,counter_no,1" >&2 12 | echo "reporter:counter:status,processing......" >&2 13 | echo "This is log for testing, will be printed in stdout file" >&2 14 | done 15 | done 16 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper2.cpp: -------------------------------------------------------------------------------- 1 | // By dongxicheng, 2 | // blog:http://dongxicheng.org/ 3 | // mapper.cpp 4 | #include 5 | #include 6 | using namespace std; 7 | 8 | int main() { 9 | string key; 10 | while(cin >> key) { 11 | cout << key << "\t" << "1" << endl; 12 | } 13 | return 0; 14 | } 15 | -------------------------------------------------------------------------------- /hadoop/streaming/mapper2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | while read LINE; do 3 | for word in $LINE 4 | do 5 | echo "$word 1" 6 | done 7 | done 8 | -------------------------------------------------------------------------------- /hadoop/streaming/reducer.cpp: -------------------------------------------------------------------------------- 1 | // By dongxicheng, 2 | // blog:http://dongxicheng.org/ 3 | // reducer.cpp 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | int main() { 9 | string cur_key, last_key, value; 10 | cin >> cur_key >> value; 11 | last_key = cur_key; 12 | int n = 1; 13 | while(cin >> cur_key) { 14 | cin >> value; 15 | if(last_key != cur_key) { 16 | cout << last_key << "\t" << n << endl; 17 | last_key = cur_key; 18 | n = 1; 19 | } else { 20 | n++; 21 | } 22 | } 23 | cout << last_key << "\t" << n << endl; 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /hadoop/streaming/reducer.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/hadoop/streaming/reducer.php -------------------------------------------------------------------------------- /hadoop/streaming/reducer.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | count=0 3 | started=0 4 | word="" 5 | while read LINE;do 6 | newword=`echo $LINE | cut -d ' ' -f 1` 7 | if [ "$word" != "$newword" ];then 8 | [ $started -ne 0 ] && echo "$word\t$count" 9 | word=$newword 10 | count=1 11 | started=1 12 | else 13 | count=$(( $count + 1 )) 14 | fi 15 | done 16 | echo "$word\t$count" 17 | -------------------------------------------------------------------------------- /hadoop/streaming/run_cpp_mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | HADOOP_HOME=/home/hadoop/hadoop-2.7.3 3 | INPUT_PATH=/test/input 4 | OUTPUT_PATH=/test/output 5 | echo "Clearing output path: $OUTPUT_PATH" 6 | $HADOOP_HOME/bin/hadoop fs -rmr $OUTPUT_PATH 7 | 8 | ${HADOOP_HOME}/bin/hadoop jar\ 9 | ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar\ 10 | -D mapred.reduce.tasks=2\ 11 | -files mapper,reducer\ 12 | -input $INPUT_PATH\ 13 | -output $OUTPUT_PATH\ 14 | -mapper mapper\ 15 | -reducer reducer 16 | -------------------------------------------------------------------------------- /hadoop/streaming/run_php_mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | HADOOP_HOME=/home/hadoop/hadoop-2.7.3 3 | INPUT_PATH=/test/input 4 | OUTPUT_PATH=/test/output 5 | echo "Clearing output path: $OUTPUT_PATH" 6 | $HADOOP_HOME/bin/hadoop fs -rmr $OUTPUT_PATH 7 | 8 | ${HADOOP_HOME}/bin/hadoop jar\ 9 | ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar\ 10 | -files mapper.php,reducer.php\ 11 | -input $INPUT_PATH\ 12 | -output $OUTPUT_PATH\ 13 | -mapper "php mapper.php" \ 14 | -reducer "php reducer.php" \ 15 | -------------------------------------------------------------------------------- /hadoop/streaming/run_shell_mr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | HADOOP_HOME=/home/hadoop/hadoop-2.7.3 3 | INPUT_PATH=/test/input 4 | OUTPUT_PATH=/test/output 5 | echo "Clearing output path: $OUTPUT_PATH" 6 | $HADOOP_HOME/bin/hadoop fs -rmr $OUTPUT_PATH 7 | 8 | ${HADOOP_HOME}/bin/hadoop jar\ 9 | ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar\ 10 | -files 
mapper.sh,reducer.sh\ 11 | -input $INPUT_PATH\ 12 | -output $OUTPUT_PATH\ 13 | -mapper "sh mapper.sh"\ 14 | -reducer "sh reducer.sh" 15 | -------------------------------------------------------------------------------- /hadoop/streaming/test.txt: -------------------------------------------------------------------------------- 1 | i 2 | have 3 | a 4 | book 5 | you 6 | do 7 | not 8 | have 9 | one 10 | so 11 | i 12 | am 13 | better 14 | than 15 | you 16 | ha 17 | ha 18 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | bigdata.hbase 8 | ingest 9 | 1.0-SNAPSHOT 10 | 11 | 12 | aliyun 13 | http://maven.aliyun.com/nexus/content/groups/public/ 14 | 15 | 16 | 17 | 18 | org.apache.hadoop 19 | hadoop-common 20 | 2.6.0 21 | 22 | 23 | org.apache.hbase 24 | hbase-client 25 | 1.2.4 26 | 27 | 28 | 29 | 30 | 31 | maven-assembly-plugin 32 | 2.3 33 | 34 | dist 35 | true 36 | 37 | jar-with-dependencies 38 | 39 | 40 | 41 | 42 | make-assembly 43 | package 44 | 45 | single 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/Ingest.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FSDataInputStream; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.hbase.HBaseConfiguration; 8 | import org.apache.hadoop.hbase.HColumnDescriptor; 9 | import org.apache.hadoop.hbase.HTableDescriptor; 10 | import org.apache.hadoop.hbase.TableName; 11 | import org.apache.hadoop.hbase.client.*; 12 | import org.apache.hadoop.hbase.util.Bytes; 13 | 14 | import java.io.BufferedReader; 15 | import java.io.IOException; 16 | import java.io.InputStreamReader; 17 | 18 | /** 19 | * Created by qianxi.zhang on 5/1/17. 20 | */ 21 | public abstract class Ingest { 22 | public static final String TABLE_NAME = "user_behavior"; 23 | public static final String SEPARATOR = ","; 24 | //Connection to the cluster. 25 | private Connection connection = null; 26 | //A lightweight handler for a specific table. 27 | private Table table = null; 28 | public static final String FAMILY_NAME_P = "p"; 29 | public static final String FAMILY_NAME_B = "b"; 30 | 31 | public static Configuration getHBaseConfiguration() { 32 | Configuration conf = HBaseConfiguration.create(); 33 | conf.set("hbase.zookeeper.quorum", 34 | "bigdata"); 35 | conf.set("zookeeper.znode.parent", "/hbase"); 36 | 37 | return conf; 38 | } 39 | 40 | public void init() throws IOException { 41 | //establish the connection to the cluster. 
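// Note: Connection is the heavyweight, thread-safe handle to the cluster and is meant to
// be created once and shared, while the Table obtained from it is a lightweight,
// non-thread-safe wrapper; both are closed again in shutdown().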
42 | connection = ConnectionFactory.createConnection(getHBaseConfiguration()); 43 | //retrieve a handler to the target table 44 | table = connection.getTable(TableName.valueOf(TABLE_NAME)); 45 | } 46 | 47 | public void shutdown() throws IOException { 48 | if (table != null) { 49 | table.close(); 50 | } 51 | if (connection != null) { 52 | connection.close(); 53 | } 54 | } 55 | 56 | public void createTable() throws IOException { 57 | Admin admin = connection.getAdmin(); 58 | 59 | if (!admin.tableExists(TableName.valueOf(TABLE_NAME))) { 60 | HTableDescriptor tableDescriptor = new HTableDescriptor(TableName.valueOf(TABLE_NAME)); 61 | HColumnDescriptor columnDescriptor_1 = new HColumnDescriptor(Bytes.toBytes(FAMILY_NAME_P)); 62 | HColumnDescriptor columnDescriptor_2 = new HColumnDescriptor(Bytes.toBytes(FAMILY_NAME_B)); 63 | columnDescriptor_1.setMaxVersions(1); 64 | columnDescriptor_2.setMaxVersions(1000); 65 | tableDescriptor.addFamily(columnDescriptor_1); 66 | tableDescriptor.addFamily(columnDescriptor_2); 67 | admin.createTable(tableDescriptor); 68 | } 69 | } 70 | 71 | public void ingest(String path) throws IOException { 72 | init(); 73 | createTable(); 74 | FileSystem fs = null; 75 | Configuration conf = new Configuration(); 76 | Path myPath = new Path(path); 77 | fs = myPath.getFileSystem(conf); 78 | FSDataInputStream hdfsInStream = fs.open(new Path(path)); 79 | BufferedReader in = null; 80 | in = new BufferedReader(new InputStreamReader(hdfsInStream)); 81 | String line = null; 82 | while ((line = in.readLine()) != null) { 83 | System.out.println(line); 84 | Put put = process(line); 85 | //send the data 86 | table.put(put); 87 | } 88 | if (in != null) { 89 | in.close(); 90 | } 91 | shutdown(); 92 | } 93 | 94 | abstract public Put process(String line); 95 | 96 | } 97 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/ProfileIngest.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.hbase.client.Put; 4 | import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException; 5 | import org.apache.hadoop.hbase.util.Bytes; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Created by qianxi.zhang on 5/2/17. 
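 *
 * Ingests the user profile file: each CSV line is expected to hold
 * uid,name,gender,birth,province; the uid becomes the row key and the remaining
 * fields are stored in column family "p".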
11 | */ 12 | public class ProfileIngest extends Ingest { 13 | public static final String QUALIFIER_NAME_P_NAME = "name"; 14 | public static final String QUALIFIER_NAME_P_GENDER = "gender"; 15 | public static final String QUALIFIER_NAME_P_BIRTH = "birth"; 16 | public static final String QUALIFIER_NAME_P_PROVINCE = "province"; 17 | 18 | public Put process(String line) { 19 | String[] attributes = line.split(SEPARATOR); 20 | Put put = new Put(Bytes.toBytes(attributes[0])); 21 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_NAME), Bytes.toBytes(attributes[1])); 22 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_GENDER), Bytes.toBytes(attributes[2])); 23 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_BIRTH), Bytes.toBytes(attributes[3])); 24 | put.addColumn(Bytes.toBytes(FAMILY_NAME_P), Bytes.toBytes(QUALIFIER_NAME_P_PROVINCE), Bytes.toBytes(attributes[4])); 25 | 26 | return put; 27 | } 28 | 29 | public static void main(String[] args) throws IOException { 30 | if (args == null || args.length != 1) 31 | throw new IllegalArgumentIOException("path should be offered"); 32 | ProfileIngest ingest = new ProfileIngest(); 33 | ingest.ingest(args[0]); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/Query.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.HBaseConfiguration; 5 | import org.apache.hadoop.hbase.TableName; 6 | import org.apache.hadoop.hbase.client.*; 7 | import org.apache.hadoop.hbase.util.Bytes; 8 | 9 | import java.io.IOException; 10 | 11 | import static bigdata.hbase.Ingest.TABLE_NAME; 12 | 13 | /** 14 | * Created by qianxi.zhang on 5/2/17. 15 | */ 16 | public class Query { 17 | public static final String TABLE_NAME = "user_behavior"; 18 | public static final String FAMILY_NAME_P = "p"; 19 | public static final String FAMILY_NAME_B = "b"; 20 | public static final String QUALIFIER_NAME_B_RID = "rid"; 21 | 22 | public static Configuration getHBaseConfiguration() { 23 | Configuration conf = HBaseConfiguration.create(); 24 | conf.set("hbase.zookeeper.quorum", 25 | "bigdata"); 26 | conf.set("zookeeper.znode.parent", "/hbase"); 27 | 28 | return conf; 29 | } 30 | 31 | public void process() throws IOException { 32 | //establish the connection to the cluster. 33 | Connection connection = ConnectionFactory.createConnection(); 34 | //retrieve a handler to the target table 35 | Table table = connection.getTable(TableName.valueOf(TABLE_NAME)); 36 | 37 | Scan scan = new Scan(); 38 | scan.addColumn(Bytes.toBytes(FAMILY_NAME_B), Bytes.toBytes(QUALIFIER_NAME_B_RID)); 39 | scan.setMaxVersions(1000); 40 | scan.setCaching(100); 41 | ResultScanner results = table.getScanner(scan); 42 | 43 | for (Result result : results) { 44 | System.out.println(Bytes.toString(result.getRow()) + " : " + (result.isEmpty() ? 
0 : result.listCells().size())); 45 | } 46 | table.close(); 47 | connection.close(); 48 | } 49 | 50 | public static void main(String[] args) throws IOException { 51 | Query query = new Query(); 52 | query.process(); 53 | } 54 | 55 | } 56 |
-------------------------------------------------------------------------------- /hbase/hbase-ingest/src/main/java/bigdata/hbase/RecordIngest.java: -------------------------------------------------------------------------------- 1 | package bigdata.hbase; 2 | 3 | import org.apache.hadoop.hbase.client.Put; 4 | import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException; 5 | import org.apache.hadoop.hbase.util.Bytes; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Created by qianxi.zhang on 5/2/17. 11 | */ 12 | public class RecordIngest extends Ingest { 13 | public static final String QUALIFIER_NAME_B_RID = "rid"; 14 | 15 | public Put process(String line) { 16 | String[] attributes = line.split(SEPARATOR); 17 | Put put = new Put(Bytes.toBytes(attributes[1])); 18 | put.addColumn(Bytes.toBytes(FAMILY_NAME_B), Bytes.toBytes(QUALIFIER_NAME_B_RID), Long.valueOf(attributes[3]), Bytes.toBytes(attributes[0])); 19 | return put; 20 | } 21 | 22 | public static void main(String[] args) throws IOException { 23 | if (args == null || args.length != 1) 24 | throw new IllegalArgumentIOException("path should be offered"); 25 | RecordIngest ingest = new RecordIngest(); 26 | ingest.ingest(args[0]); 27 | } 28 | } 29 |
-------------------------------------------------------------------------------- /hive/README.md: -------------------------------------------------------------------------------- 1 | # Hive Lab Manual 2 | 3 | ### 0. Download the Git project required for this lab 4 | 5 | ``` 6 | cd /home/bigdata 7 | git clone https://github.com/bigdataguide/hadooptraining.git 8 | cd hadooptraining/hive 9 | ``` 10 | The hive directory contains three subdirectories: 11 | * conf: the Hive configuration files 12 | * command: commands for starting the Hive services and the SQL statements used in the lab 13 | * data: external data files that can be loaded directly into Hive tables 14 | 15 | **Note: this lab uses bigdata as the user home directory; replace it with your own home directory here and below.** 16 | 17 | 18 | ### 1. Download and install Hadoop 19 | 20 | Refer to the earlier course material. Below we assume Hadoop is installed in /home/bigdata/hadoop-2.7.3; if you installed it somewhere else, replace it with your own directory. 21 | 22 | ### 2. Configure MySQL 23 | ``` 24 | #Ubuntu 25 | sudo apt-get install mysql libmysql-java 26 | #CentOS 27 | sudo yum install mysql mysql-connector-java 28 | #Start MySQL 29 | sudo service mysqld start 30 | ``` 31 | 32 | ### 3. Install Hive 33 | 34 | #### 3.1 Download the Hive binary package 35 | ``` 36 | wget http://apache.mirrors.pair.com/hive/hive-2.1.1/apache-hive-2.1.1-bin.tar.gz -P /tmp/ 37 | #Extract Hive into the working directory 38 | tar -zxvf /tmp/apache-hive-2.1.1-bin.tar.gz -C /home/bigdata/ 39 | cd /home/bigdata/apache-hive-2.1.1-bin/ 40 | ``` 41 | 42 | #### 3.2 Configure Hive: copy the configuration files from the Git project into the Hive directory 43 | ``` 44 | cp /home/bigdata/hadooptraining/hive/conf/hive-env.sh /home/bigdata/apache-hive-2.1.1-bin/conf/ 45 | cp /home/bigdata/hadooptraining/hive/conf/hive-site.xml /home/bigdata/apache-hive-2.1.1-bin/conf/ 46 | ``` 47 | **See the appendix for an explanation of the configuration files; adjust the home directory to your actual environment.** 48 | 49 | #### 3.3 Start the Hive components 50 | ``` 51 | export HADOOP_HOME=/home/bigdata/hadoop-2.7.3 52 | export PATH=/home/bigdata/apache-hive-2.1.1-bin:$PATH 53 | #Start the MetaStore server 54 | nohup hive --service metastore >> /home/bigdata/apache-hive-2.1.0-bin/logs/metastore.log 2>&1 & 55 | #Start HiveServer2 56 | nohup hive --service hiveserver2 >> /home/bigdata/apache-hive-2.1.0-bin/logs/hive.log 2>&1 & 57 | ```
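Before moving on, it is worth checking that both services are actually listening; the ports below are the defaults set in hive-site.xml (9083 for the metastore, 10000 for HiveServer2), so adjust them if you changed the configuration:
```
# both daemons can take a few seconds to come up
ss -lnt | grep -E '9083|10000'
# if a port is missing, check the corresponding log
tail -n 50 /home/bigdata/apache-hive-2.1.0-bin/logs/metastore.log
```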
58 | ### 4. Start Hive 59 | #### 4.1 Start the Hive CLI 60 | ``` 61 | hive 62 | ``` 63 | #### 4.2 Start the Beeline CLI 64 | ``` 65 | beeline -n bigdata -pbigdata -u "jdbc:hive2://localhost:10000/default;auth=noSasl" 66 | #or 67 | beeline 68 | beeline> !connect jdbc:hive2://localhost:10000/default bigdata bigdata 69 | ``` 70 | 71 | ### Appendix: Configuration file notes 72 | In hive-env.sh we configure the HADOOP_HOME directory; replace the home directory with your own: 73 | HADOOP_HOME=/home/bigdata/hadoop-2.7.3 74 | 75 | In hive-site.xml we configure: 76 | 1) MySQL as the metastore database 77 | ```xml 78 | <property> 79 | <name>javax.jdo.option.ConnectionURL</name> 80 | <value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value> 81 | </property> 82 | <property> 83 | <name>javax.jdo.option.ConnectionDriverName</name> 84 | <value>com.mysql.jdbc.Driver</value> 85 | </property> 86 | <property> 87 | <name>javax.jdo.option.ConnectionUserName</name> 88 | <value>root</value> 89 | </property> 90 | <property> 91 | <name>javax.jdo.option.ConnectionPassword</name> 92 | <value>root</value> 93 | </property> 94 | ``` 95 | 2) The Hive warehouse location on HDFS 96 | ```xml 97 | <property> 98 | <name>hive.metastore.warehouse.dir</name> 99 | <value>/warehouse</value> 100 | </property> 101 | <property> 102 | <name>fs.defaultFS</name> 103 | <value>hdfs://bigdata:9000</value> 104 | </property> 105 | ``` 106 | 3) The metastore port 107 | ```xml 108 | <property> 109 | <name>hive.metastore.uris</name> 110 | <value>thrift://bigdata:9083</value> 111 | </property> 112 | ``` 113 | 4) The HiveServer2 port 114 | ```xml 115 | <property> 116 | <name>hive.server2.thrift.port</name> 117 | <value>10000</value> 118 | </property> 119 | <property> 120 | <name>beeline.hs2.connection.user</name> 121 | <value>bigdata</value> 122 | </property> 123 | <property> 124 | <name>beeline.hs2.connection.password</name> 125 | <value>bigdata</value> 126 | </property> 127 | ``` 128 | 5) In addition, automatic creation of the metastore database and tables 129 | ```xml 130 | <property> 131 | <name>datanucleus.autoCreateSchema</name> 132 | <value>true</value> 133 | </property> 134 | <property> 135 | <name>datanucleus.autoStartMechanism</name> 136 | <value>SchemaTable</value> 137 | </property> 138 | <property> 139 | <name>datanucleus.schema.autoCreateTables</name> 140 | <value>true</value> 141 | </property> 142 | ``` 143 |
-------------------------------------------------------------------------------- /hive/command/add_partition.sql: -------------------------------------------------------------------------------- 1 | 2 | load data inpath "hdfs://bigdata:9000/etl/record/2016-11-24/2300" overwrite into table record partition(partition_date="2016-11-24",hour_minute="2300") 3 |
-------------------------------------------------------------------------------- /hive/command/age_price_list.sql: -------------------------------------------------------------------------------- 1 | select cast(DATEDIFF(CURRENT_DATE, birth)/365 as int) as age, 2 | sum(price) as totalPrice 3 | from record join user_dimension on record.uid=user_dimension.uid 4 | group by cast(DATEDIFF(CURRENT_DATE, birth)/365 as int) 5 | order by totalPrice desc; 6 |
-------------------------------------------------------------------------------- /hive/command/brand_price_list.sql: -------------------------------------------------------------------------------- 1 | select brand,sum(price) as totalPrice 2 | from record join brand_dimension on record.bid=brand_dimension.bid 3 | group by brand_dimension.brand 4 | order by totalPrice desc; 5 |
-------------------------------------------------------------------------------- /hive/command/create_orc_table.sql: -------------------------------------------------------------------------------- 1 | create table if not exists record_orc ( 2 | rid STRING, 3 | uid STRING, 4 | bid STRING, 5 | trancation_date TIMESTAMP, 6 | price INT, 7 | source_province STRING, 8 | target_province STRING, 9 | site STRING, 10 | express_number STRING, 11 | express_company STRING 12 | ) 13 | PARTITIONED BY ( 14 | partition_date STRING, 15 | hour_minute STRING 16 | ) 17 | STORED AS ORC; 18 | -------------------------------------------------------------------------------- /hive/command/create_parquet_table.sql:
-------------------------------------------------------------------------------- 1 | create table if not exists record_parquet ( 2 | rid STRING, 3 | uid STRING, 4 | bid STRING, 5 | trancation_date TIMESTAMP, 6 | price INT, 7 | source_province STRING, 8 | target_province STRING, 9 | site STRING, 10 | express_number STRING, 11 | express_company STRING 12 | ) 13 | PARTITIONED BY ( 14 | partition_date STRING, 15 | hour_minute STRING 16 | ) 17 | STORED AS PARQUET; 18 | -------------------------------------------------------------------------------- /hive/command/create_table_brand.sql: -------------------------------------------------------------------------------- 1 | create external table if not exists brand_dimension ( 2 | bid STRING, 3 | category STRING, 4 | brand STRING 5 | )ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY ',' 7 | location 'hdfs://bigdata:9000/warehouse/brand_dimension' 8 | ; 9 | -------------------------------------------------------------------------------- /hive/command/create_table_record.sql: -------------------------------------------------------------------------------- 1 | create table if not exists record ( 2 | rid STRING, 3 | uid STRING, 4 | bid STRING, 5 | trancation_date TIMESTAMP, 6 | price INT, 7 | source_province STRING, 8 | target_province STRING, 9 | site STRING, 10 | express_number STRING, 11 | express_company STRING 12 | ) 13 | PARTITIONED BY ( 14 | partition_date STRING, 15 | hour INT 16 | ) 17 | ROW FORMAT DELIMITED 18 | FIELDS TERMINATED BY ',' 19 | -------------------------------------------------------------------------------- /hive/command/create_table_user.sql: -------------------------------------------------------------------------------- 1 | create external table if not exists user_dimension ( 2 | uid STRING, 3 | name STRING, 4 | gender STRING, 5 | birth DATE, 6 | province STRING 7 | )ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY ',' 9 | location 'hdfs://bigdata:9000/warehouse/user_dimension' 10 | ; 11 | -------------------------------------------------------------------------------- /hive/command/employees.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS employees ( 2 | name STRING, 3 | salary FLOAT, 4 | subordinates ARRAY, 5 | decutions MAP, 6 | address STRUCT 7 | ) 8 | ROW FORMAT DELIMITED 9 | FIELDS TERMINATED BY '\001' 10 | COLLECTION ITEMS TERMINATED BY '\002' 11 | MAP KEYS TERMINATED BY '\003' 12 | LINES TERMINATED BY '\n' 13 | STORED AS TEXTFILE; 14 | 15 | -- LOAD DATA LOCAL INPATH '/home/bigdata/hadooop/training/hive/data/employees.txt' OVERWRITE INTO TABLE employees; 16 | -------------------------------------------------------------------------------- /hive/command/employees_part.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS employees_part ( 2 | name STRING, 3 | salary FLOAT, 4 | subordinates ARRAY, 5 | decutions MAP, 6 | address STRUCT 7 | ) 8 | PARTITIONED BY (state STRING) 9 | ROW FORMAT DELIMITED 10 | FIELDS TERMINATED BY '\001' 11 | COLLECTION ITEMS TERMINATED BY '\002' 12 | MAP KEYS TERMINATED BY '\003' 13 | LINES TERMINATED BY '\n' 14 | STORED AS TEXTFILE; 15 | 16 | -- LOAD DATA LOCAL INPATH '/home/bigdata/hadooptraining/hive/data/employees.txt' 17 | -- OVERWRITE INTO TABLE employees_part PARTITION(state='IL'); 18 | 19 | --INSERT INTO TABLE employees_part PARTITION(state = 'IL') 20 | --SELECT * FROM employees where address.state='IL'; 21 | 22 | -- FROM employees e 23 | -- INSERT OVERWRITE TABLE 
employees_part PARTITION(state = 'IL') SELECT e.* where e.address.state='IL' 24 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'CA') SELECT e.* where e.address.state='CA' 25 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'NY') SELECT e.* where e.address.state='NY'; 26 | 27 | FROM employees e 28 | INSERT OVERWRITE TABLE employees_part PARTITION(state) SELECT e.*,e.address.state 29 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'CA') SELECT * where e.address.state='CA' 30 | -- INSERT OVERWRITE TABLE employees_part PARTITION(state = 'NY') SELECT * where e.address.state='NY'; 31 | -------------------------------------------------------------------------------- /hive/command/load_data_to_orc.sql: -------------------------------------------------------------------------------- 1 | set hive.exec.dynamic.partition.mode=nonstrict; 2 | insert into table record_orc partition(partition_date,hour_minute) select * from record; 3 | -------------------------------------------------------------------------------- /hive/command/load_data_to_parquet.sql: -------------------------------------------------------------------------------- 1 | set hive.exec.dynamic.partition.mode=nonstrict; 2 | insert into table record_parquet partition(partition_date,hour_minute) select * from record; 3 | -------------------------------------------------------------------------------- /hive/command/province_prince_list.sql: -------------------------------------------------------------------------------- 1 | select province,sum(price) as totalPrice 2 | from record join user_dimension on record.uid=user_dimension.uid 3 | group by user_dimension.province 4 | order by totalPrice desc; 5 | -------------------------------------------------------------------------------- /hive/command/skewed.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE T1(key STRING, val STRING) 2 | SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE; 3 | -------------------------------------------------------------------------------- /hive/command/start-hiveserver2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup hive --service hiveserver2 >> /home/bigdata/apache-hive-2.1.0-bin/logs/hive.log 2>&1 & 4 | -------------------------------------------------------------------------------- /hive/command/start-metastore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup hive --service metastore >> /home/bigdata/apache-hive-2.1.0-bin/logs/hive.log 2>&1 & 4 | -------------------------------------------------------------------------------- /hive/command/start-mysql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo service mysqld start 4 | -------------------------------------------------------------------------------- /hive/command/weblog.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS weblog ( 2 | user_id INT, 3 | url STRING, 4 | source_ip STRING 5 | ) PARTITIONED BY (dt STRING) 6 | CLUSTERED BY (user_id) INTO 96 BUCKETS; 7 | 8 | -------------------------------------------------------------------------------- /hive/conf/hive-env.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license 
agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Set Hive and Hadoop environment variables here. These variables can be used 18 | # to control the execution of Hive. It should be used by admins to configure 19 | # the Hive installation (so that users do not have to set environment variables 20 | # or set command line parameters to get correct behavior). 21 | # 22 | # The hive service being invoked (CLI/HWI etc.) is available via the environment 23 | # variable SERVICE 24 | 25 | 26 | # Hive Client memory usage can be an issue if a large number of clients 27 | # are running at the same time. The flags below have been useful in 28 | # reducing memory usage: 29 | # 30 | # if [ "$SERVICE" = "cli" ]; then 31 | # if [ -z "$DEBUG" ]; then 32 | # export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:+UseParNewGC -XX:-UseGCOverheadLimit" 33 | # else 34 | # export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit" 35 | # fi 36 | # fi 37 | 38 | # The heap size of the jvm stared by hive shell script can be controlled via: 39 | # 40 | export HADOOP_HEAPSIZE=512 41 | # 42 | # Larger heap size may be required when running queries over large number of files or partitions. 43 | # By default hive shell scripts use a heap size of 256 (MB). Larger heap size would also be 44 | # appropriate for hive server (hwi etc). 45 | 46 | 47 | # Set HADOOP_HOME to point to a specific hadoop install directory 48 | # HADOOP_HOME=${bin}/../../hadoop 49 | HADOOP_HOME=/home/bigdata/hadoop-2.7.3 50 | 51 | # Hive Configuration Directory can be controlled by: 52 | # export HIVE_CONF_DIR= 53 | 54 | # Folder containing extra ibraries required for hive compilation/execution can be controlled by: 55 | # export HIVE_AUX_JARS_PATH= 56 | -------------------------------------------------------------------------------- /hive/conf/hive-log4j2.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | status = INFO 18 | name = HiveLog4j2 19 | packages = org.apache.hadoop.hive.ql.log 20 | 21 | # list of properties 22 | property.hive.log.level = INFO 23 | property.hive.root.logger = DRFA 24 | property.hive.log.dir = /home/bigdata/apache-hive-2.1.0-bin/logs 25 | property.hive.log.file = hive.log 26 | property.hive.perflogger.log.level = INFO 27 | 28 | # list of all appenders 29 | appenders = console, DRFA 30 | 31 | # console appender 32 | appender.console.type = Console 33 | appender.console.name = console 34 | appender.console.target = SYSTEM_ERR 35 | appender.console.layout.type = PatternLayout 36 | appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n 37 | 38 | # daily rolling file appender 39 | appender.DRFA.type = RollingRandomAccessFile 40 | appender.DRFA.name = DRFA 41 | appender.DRFA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file} 42 | # Use %pid in the filePattern to append @ to the filename if you want separate log files for different CLI session 43 | appender.DRFA.filePattern = ${sys:hive.log.dir}/${sys:hive.log.file}.%d{yyyy-MM-dd} 44 | appender.DRFA.layout.type = PatternLayout 45 | appender.DRFA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n 46 | appender.DRFA.policies.type = Policies 47 | appender.DRFA.policies.time.type = TimeBasedTriggeringPolicy 48 | appender.DRFA.policies.time.interval = 1 49 | appender.DRFA.policies.time.modulate = true 50 | appender.DRFA.strategy.type = DefaultRolloverStrategy 51 | appender.DRFA.strategy.max = 30 52 | 53 | # list of all loggers 54 | loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, PerfLogger 55 | 56 | logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn 57 | logger.NIOServerCnxn.level = WARN 58 | 59 | logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO 60 | logger.ClientCnxnSocketNIO.level = WARN 61 | 62 | logger.DataNucleus.name = DataNucleus 63 | logger.DataNucleus.level = ERROR 64 | 65 | logger.Datastore.name = Datastore 66 | logger.Datastore.level = ERROR 67 | 68 | logger.JPOX.name = JPOX 69 | logger.JPOX.level = ERROR 70 | 71 | logger.PerfLogger.name = org.apache.hadoop.hive.ql.log.PerfLogger 72 | logger.PerfLogger.level = ${sys:hive.perflogger.log.level} 73 | 74 | # root logger 75 | rootLogger.level = ${sys:hive.log.level} 76 | rootLogger.appenderRefs = root 77 | rootLogger.appenderRef.root.ref = ${sys:hive.root.logger} 78 | -------------------------------------------------------------------------------- /hive/conf/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 |         hive.metastore.uris 21 |         thrift://localhost:9083 22 | 23 | 24 |     hive.server2.thrift.port 25 |     10000 26 | 27 | 28 | javax.jdo.option.ConnectionURL 29 | jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true 30 | 31 | 32 | javax.jdo.option.ConnectionDriverName 33 | com.mysql.jdbc.Driver 34 | 35 | 36 | javax.jdo.option.ConnectionUserName 37 | root 38 | 39 | 40 | javax.jdo.option.ConnectionPassword 41 | root 42 | 43 | 44 | hive.metastore.warehouse.dir 45 | /warehouse 46 | 47 | 48 | fs.defaultFS 49 | hdfs://localhost:9000 50 | 51 | 52 | datanucleus.autoCreateSchema 53 | true 54 | 55 | 56 | datanucleus.autoStartMechanism 57 | SchemaTable 58 | 59 | 60 | datanucleus.schema.autoCreateTables 61 | true 62 | 63 | 64 | 65 | 
beeline.hs2.connection.user 66 | bigdata 67 | 68 | 69 | beeline.hs2.connection.password 70 | bigdata 71 | 72 | 73 | -------------------------------------------------------------------------------- /hive/data/employees.txt: -------------------------------------------------------------------------------- 1 | John Doe100000.0Mary SmithTodd JonesFederal Taxes.2State Taxes.05Insurance.11 Michigan Ave.ChicagoIL60600 2 | Mary Smith80000.0Bill KingFederal Taxes.2State Taxes.05Insurance.1100 Ontario St.ChicagoIL60601 3 | Todd Jones70000.0Federal Taxes.15State Taxes.03Insurance.1200 Chicago Ave.Oak ParkIL60700 4 | Bill King60000.0Federal Taxes.15State Taxes.03Insurance.1300 Obscure Dr.ObscuriaIL60100 5 | -------------------------------------------------------------------------------- /kafka/command/start-kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/kafka_2.11-0.10.1.0/bin/kafka-server-start.sh -daemon /home/bigdata/kafka_2.11-0.10.1.0/config/server.properties 4 | -------------------------------------------------------------------------------- /kafka/command/start-zookeeper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/kafka_2.11-0.10.1.0/bin/zookeeper-server-start.sh -daemon /home/bigdata/kafka_2.11-0.10.1.0/config/zookeeper.properties 4 | -------------------------------------------------------------------------------- /mysql/create_table_brand.sql: -------------------------------------------------------------------------------- 1 | create table log.brand_dimension( 2 | bid varchar(255), 3 | category varchar(255), 4 | brand varchar(255), 5 | primary key (bid) 6 | ); 7 | -------------------------------------------------------------------------------- /mysql/create_table_user.sql: -------------------------------------------------------------------------------- 1 | create table log.user_dimension( 2 | uid varchar(255), 3 | name varchar(255), 4 | gender varchar(255), 5 | birth date, 6 | province varchar(255), 7 | primary key (uid) 8 | ) 9 | -------------------------------------------------------------------------------- /mysql/load_table_brand.sql: -------------------------------------------------------------------------------- 1 | 2 | LOAD DATA LOCAL INFILE '/home/bigdata/datasource/brand.list' INTO TABLE log.brand_dimension 3 | FIELDS TERMINATED BY ',' ENCLOSED BY '"' 4 | LINES TERMINATED BY '\n' 5 | -------------------------------------------------------------------------------- /mysql/load_table_user.sql: -------------------------------------------------------------------------------- 1 | 2 | LOAD DATA LOCAL INFILE '/home/bigdata/datasource/user.list' INTO TABLE log.user_dimension 3 | FIELDS TERMINATED BY ',' ENCLOSED BY '"' 4 | LINES TERMINATED BY '\n' 5 | -------------------------------------------------------------------------------- /mysql/start-client.txt: -------------------------------------------------------------------------------- 1 | mysql -u root 2 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 4.0.0 7 | cn.chinahadoop 8 | hadoop-example 9 | 0.1.0-SNAPSHOT 10 | Hadoop Examples 11 | hadoop examples 12 | pom 13 | 14 | 15 | hadoop 16 | 17 | 18 | 19 | 20 | Maven2 21 | http://repo1.maven.org/maven2 22 | 23 | 24 | 25 | 26 | 28 | 29 | 30 | 31 | ${project.build.directory} 32 | 33 | webapps/** 34 | 
35 | 36 | 37 | 38 | src/main/resources 39 | 40 | **/** 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /presto/command/age_price_list_presto.sql: -------------------------------------------------------------------------------- 1 | select cast((year(CURRENT_DATE)-year(birth)) as integer) as age,sum(price) as totalPrice 2 | from record join user_dimension on record.uid=user_dimension.uid 3 | group by cast((year(CURRENT_DATE)-year(birth)) as integer) 4 | order by totalPrice desc 5 | -------------------------------------------------------------------------------- /presto/command/brand_price_list_presto.sql: -------------------------------------------------------------------------------- 1 | select brand,sum(price) as totalPrice 2 | from record join brand_dimension on record.bid=brand_dimension.bid 3 | group by brand_dimension.brand 4 | order by totalPrice desc 5 | -------------------------------------------------------------------------------- /presto/command/gender_brand_rank.sql: -------------------------------------------------------------------------------- 1 | select gender, brand,count(*) as purchase_count 2 | from record_orc join user_dimension_orc on record_orc.uid=user_dimension_orc.uid 3 | join brand_dimension_orc on record_orc.bid=brand_dimension_orc.bid 4 | group by gender, brand 5 | order by gender, purchase_count DESC 6 | 7 | -------------------------------------------------------------------------------- /presto/command/start-presto-client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | presto --server localhost:8080 --catalog hive --schema default 4 | -------------------------------------------------------------------------------- /presto/command/start-presto.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/presto-server-0.157/bin/launcher start 4 | -------------------------------------------------------------------------------- /presto/command/stop-presto.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/presto-server-0.157/bin/launcher stop 4 | -------------------------------------------------------------------------------- /presto/conf/etc/catalog/hive.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore.uri=thrift://bigdata:9083 3 | hive.config.resources=/home/bigdata/hadoop-2.7.3/etc/hadoop/core-site.xml,=/home/bigdata/hadoop-2.7.3/etc/hadoop/hdfs-site.xml 4 | -------------------------------------------------------------------------------- /presto/conf/etc/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | query.max-memory=512MB 5 | query.max-memory-per-node=512MB 6 | discovery-server.enabled=true 7 | discovery.uri=http://bigdata:8080 8 | -------------------------------------------------------------------------------- /presto/conf/etc/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx1G 3 | -XX:+UseG1GC 4 | -XX:G1HeapRegionSize=32M 5 | -XX:+UseGCOverheadLimit 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+HeapDumpOnOutOfMemoryError 8 | -XX:OnOutOfMemoryError=kill -9 %p 9 | 
-------------------------------------------------------------------------------- /presto/conf/etc/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=production 2 | node.id=bigdata 3 | node.data-dir=/home/bigdata/presto-server-0.157/presto_data 4 | -------------------------------------------------------------------------------- /redis/command/start-redis-client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/redis-stable/src/redis-cli 4 | -------------------------------------------------------------------------------- /redis/command/start-redis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/bigdata/redis-stable/src/redis-server /home/bigdata/redis-stable/redis.conf 4 | -------------------------------------------------------------------------------- /sqoop/command/brand_dimension_sqoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sqoop import --connect jdbc:mysql://bigdata:3306/log --username root --password root --table brand_dimension --driver com.mysql.jdbc.Driver --m 10 --target-dir /warehouse/brand_dimension 4 | -------------------------------------------------------------------------------- /sqoop/command/user_dimension_sqoop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sqoop import --connect jdbc:mysql://bigdata:3306/log --username root --password root --table user_dimension --driver com.mysql.jdbc.Driver --m 10 --target-dir /warehouse/user_dimension 4 | -------------------------------------------------------------------------------- /storm/command/realtime_process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | storm jar /home/bigdata/real_time_process/storm-1.0-SNAPSHOT-jar-with-dependencies.jar bigdata.storm.LogProcessTopology LogProcess 4 | -------------------------------------------------------------------------------- /storm/command/start-storm-nimbus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup /home/bigdata/apache-storm-1.0.2/bin/storm nimbus >> /home/bigdata/apache-storm-1.0.2/logs/nimbus.log 2>&1 & 4 | -------------------------------------------------------------------------------- /storm/command/start-storm-supervisor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup /home/bigdata/apache-storm-1.0.2/bin/storm supervisor >> /home/bigdata/apache-storm-1.0.2/logs/supervisor.log 2>&1 & 4 | -------------------------------------------------------------------------------- /storm/command/start-storm-ui.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nohup /home/bigdata/apache-storm-1.0.2/bin/storm ui >> /home/bigdata/apache-storm-1.0.2/logs/ui.log 2>&1 & 4 | -------------------------------------------------------------------------------- /storm/conf/storm-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. 
The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Set Storm specific environment variables here. 20 | 21 | # The java implementation to use. 22 | #export JAVA_HOME=/path/to/jdk/home 23 | 24 | # export STORM_CONF_DIR="" 25 | -------------------------------------------------------------------------------- /storm/conf/storm.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
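# Summary of the values below: a single-node layout in which ZooKeeper, Nimbus and the lone supervisor all resolve to host "bigdata", with two worker slots (6700/6701). ui.port is moved from Storm's default 8080 to 9080, presumably so the UI does not collide with the Presto coordinator already listening on 8080 (presto/conf/etc/config.properties).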
16 | 17 | ########### These MUST be filled in for a storm configuration 18 | storm.zookeeper.servers: 19 | - "bigdata" 20 | 21 | storm.local.dir: "/home/bigdata/apache-storm-1.0.2/storm_data" 22 | 23 | nimbus.seeds: ["bigdata"] 24 | supervisor.slots.ports: 25 | - 6700 26 | - 6701 27 | 28 | storm.exhibitor.port: 9080 29 | ui.port: 9080 30 | # 31 | # 32 | # ##### These may optionally be filled in: 33 | # 34 | ## List of custom serializations 35 | # topology.kryo.register: 36 | # - org.mycompany.MyType 37 | # - org.mycompany.MyType2: org.mycompany.MyType2Serializer 38 | # 39 | ## List of custom kryo decorators 40 | # topology.kryo.decorators: 41 | # - org.mycompany.MyDecorator 42 | # 43 | ## Locations of the drpc servers 44 | # drpc.servers: 45 | # - "server1" 46 | # - "server2" 47 | 48 | ## Metrics Consumers 49 | # topology.metrics.consumer.register: 50 | # - class: "org.apache.storm.metric.LoggingMetricsConsumer" 51 | # parallelism.hint: 1 52 | # - class: "org.mycompany.MyMetricsConsumer" 53 | # parallelism.hint: 1 54 | # argument: 55 | # - endpoint: "metrics-collector.mycompany.org" 56 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | bigdata 8 | storm 9 | 1.0-SNAPSHOT 10 | 11 | 1.8 12 | 1.0.2 13 | 0.8.2.1 14 | 15 | 16 | 17 | 18 | org.apache.storm 19 | storm-kafka 20 | ${storm.version} 21 | 22 | 23 | org.apache.storm 24 | storm-core 25 | ${storm.version} 26 | provided 27 | 28 | 29 | org.apache.storm 30 | storm-redis 31 | ${storm.version} 32 | 33 | 34 | org.apache.kafka 35 | kafka_2.11 36 | ${kafka.version} 37 | 38 | 39 | org.apache.zookeeper 40 | zookeeper 41 | 42 | 43 | log4j 44 | log4j 45 | 46 | 47 | org.slf4j 48 | slf4j-log4j12 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | maven-assembly-plugin 58 | 2.3 59 | 60 | dist 61 | true 62 | 63 | jar-with-dependencies 64 | 65 | 66 | 67 | 68 | make-assembly 69 | package 70 | 71 | single 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/ExtractBolt.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.topology.BasicOutputCollector; 4 | import org.apache.storm.topology.OutputFieldsDeclarer; 5 | import org.apache.storm.topology.base.BaseBasicBolt; 6 | import org.apache.storm.tuple.Fields; 7 | import org.apache.storm.tuple.Tuple; 8 | import org.apache.storm.tuple.Values; 9 | 10 | import java.io.DataOutputStream; 11 | import java.io.FileNotFoundException; 12 | import java.io.FileOutputStream; 13 | import java.io.IOException; 14 | 15 | /** 16 | * Created by qianxi.zhang on 11/26/16. 
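 * Splits each raw log line read from the Kafka spout. The line is expected to be a
 * 12-field comma-separated record; index 4 is the price, index 6 the province and
 * index 7 the website (see execute() below). Records that do not have exactly 12
 * fields fall back to price "0" and "other". Two named output streams are declared
 * so that ProvinceBolt and WebsiteBolt each receive only the pair of fields they aggregate.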
17 | */ 18 | public class ExtractBolt extends BaseBasicBolt { 19 | public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) { 20 | String word = (String) tuple.getValue(0); 21 | 22 | String price = "0"; 23 | String province = "other"; 24 | String website = "other"; 25 | 26 | String[] attributes_list = word.split(","); 27 | 28 | if (attributes_list.length == 12) { 29 | price = attributes_list[4]; 30 | province = attributes_list[6]; 31 | website = attributes_list[7]; 32 | } 33 | 34 | basicOutputCollector.emit("province", new Values(province, price)); 35 | basicOutputCollector.emit("website", new Values(website, price)); 36 | } 37 | 38 | public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) { 39 | outputFieldsDeclarer.declareStream("province", new Fields("province", "price")); 40 | outputFieldsDeclarer.declareStream("website", new Fields("website", "price")); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/LogProcessTopology.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.LocalCluster; 5 | import org.apache.storm.StormSubmitter; 6 | import org.apache.storm.generated.AlreadyAliveException; 7 | import org.apache.storm.generated.AuthorizationException; 8 | import org.apache.storm.generated.InvalidTopologyException; 9 | import org.apache.storm.generated.StormTopology; 10 | import org.apache.storm.kafka.*; 11 | import org.apache.storm.redis.bolt.RedisStoreBolt; 12 | import org.apache.storm.redis.common.config.JedisPoolConfig; 13 | import org.apache.storm.redis.common.mapper.RedisStoreMapper; 14 | import org.apache.storm.spout.SchemeAsMultiScheme; 15 | import org.apache.storm.topology.TopologyBuilder; 16 | import org.apache.storm.tuple.Fields; 17 | import org.apache.storm.utils.Utils; 18 | 19 | import java.util.HashMap; 20 | import java.util.Map; 21 | 22 | /** 23 | * Created by qianxi.zhang on 11/26/16. 
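 * Wires the real-time pipeline: a KafkaSpout reads the "log" topic through ZooKeeper at
 * bigdata:2181, ExtractBolt fans each record out into "province" and "website" streams,
 * ProvinceBolt and WebsiteBolt keep running price totals, and two RedisStoreBolts write
 * those totals into the Redis hashes "province" and "website"; the "province" hash is the
 * one read back by visualization/py-echarts/query_redis.py for the map chart.
 * Run with an argument (see storm/command/realtime_process.sh) to submit to the cluster,
 * or with no arguments to run in a LocalCluster for local testing.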
24 | */ 25 | public class LogProcessTopology { 26 | 27 | public static final String brokerZkStr = "bigdata:2181"; 28 | public static final String topicName = "log"; 29 | public static final String offsetZkRoot = "/storm" + "-" + topicName; 30 | public static final String offsetZkId = "offsetZkId"; 31 | public static final String redis_hots = "bigdata"; 32 | public static final int redis_port = 6379; 33 | 34 | public static StormTopology getStormTopology() { 35 | 36 | BrokerHosts hosts = new ZkHosts(brokerZkStr); 37 | SpoutConfig spoutConfig = new SpoutConfig(hosts, topicName, offsetZkRoot, offsetZkId); 38 | spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme()); 39 | KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig); 40 | 41 | JedisPoolConfig poolConfig = 42 | new JedisPoolConfig.Builder().setHost(redis_hots).setPort(redis_port).build(); 43 | 44 | RedisStoreMapper provinceStoreMapper = new ProvinceStoreMapper(); 45 | RedisStoreBolt provinceStoreBolt = new RedisStoreBolt(poolConfig, provinceStoreMapper); 46 | 47 | RedisStoreMapper websiteStoreMapper = new WebsiteStoreMapper(); 48 | RedisStoreBolt websiteStoreBolt = new RedisStoreBolt(poolConfig, websiteStoreMapper); 49 | 50 | TopologyBuilder builder = new TopologyBuilder(); 51 | builder.setSpout("spout", kafkaSpout, 1); 52 | builder.setBolt("extractbolt", new ExtractBolt(), 1).shuffleGrouping("spout"); 53 | 54 | builder.setBolt("provincebolt", new ProvinceBolt(), 1) 55 | .fieldsGrouping("extractbolt", "province", new Fields("province")); 56 | builder.setBolt("websitebolt", new WebsiteBolt(), 1) 57 | .fieldsGrouping("extractbolt", "website", new Fields("website")); 58 | 59 | builder.setBolt("provinceredisstore", provinceStoreBolt).shuffleGrouping("provincebolt"); 60 | builder.setBolt("websiteredisstore", websiteStoreBolt).shuffleGrouping("websitebolt"); 61 | 62 | return builder.createTopology(); 63 | } 64 | 65 | public static Config getConfig() { 66 | Config conf = new Config(); 67 | return conf; 68 | } 69 | 70 | public static void main(String[] args) { 71 | 72 | Config conf = getConfig(); 73 | StormTopology topology = getStormTopology(); 74 | 75 | if (args != null && args.length > 0) { 76 | //提交到集群运行 77 | try { 78 | StormSubmitter.submitTopology(args[0], conf, topology); 79 | } catch (AlreadyAliveException e) { 80 | e.printStackTrace(); 81 | } catch (InvalidTopologyException e) { 82 | e.printStackTrace(); 83 | } catch (AuthorizationException e) { 84 | e.printStackTrace(); 85 | } 86 | } else { 87 | //本地模式运行 88 | LocalCluster cluster = new LocalCluster(); 89 | cluster.submitTopology("Topotest", conf, topology); 90 | Utils.sleep(1000000); 91 | cluster.killTopology("Topotest"); 92 | cluster.shutdown(); 93 | } 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/ProvinceBolt.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.task.TopologyContext; 4 | import org.apache.storm.topology.BasicOutputCollector; 5 | import org.apache.storm.topology.OutputFieldsDeclarer; 6 | import org.apache.storm.topology.base.BaseBasicBolt; 7 | import org.apache.storm.tuple.Fields; 8 | import org.apache.storm.tuple.Tuple; 9 | import org.apache.storm.tuple.Values; 10 | 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | /** 15 | * Created by qianxi.zhang on 11/26/16. 
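 * Accumulates the total purchase price per province in an in-memory map and emits
 * (province, runningTotal) for every input tuple. The map lives inside the bolt task,
 * so totals are not persisted across restarts; the fieldsGrouping on "province" in
 * LogProcessTopology keeps all tuples for a given province on the same task.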
16 | */ 17 | public class ProvinceBolt extends BaseBasicBolt { 18 | 19 | Map province_price = new HashMap(); 20 | 21 | public void execute(Tuple input, BasicOutputCollector collector) { 22 | String province = input.getStringByField("province"); 23 | long price = Long.valueOf(input.getStringByField("price")); 24 | long totalPrice = price; 25 | if (province_price.containsKey(province)) { 26 | totalPrice += province_price.get(province); 27 | } 28 | province_price.put(province, totalPrice); 29 | collector.emit(new Values(province, String.valueOf(totalPrice))); 30 | } 31 | 32 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 33 | declarer.declare(new Fields("province", "price")); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/ProvinceStoreMapper.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.redis.common.mapper.RedisDataTypeDescription; 4 | import org.apache.storm.redis.common.mapper.RedisStoreMapper; 5 | import org.apache.storm.tuple.ITuple; 6 | 7 | /** 8 | * Created by qianxi.zhang on 11/26/16. 9 | */ 10 | public class ProvinceStoreMapper implements RedisStoreMapper { 11 | private RedisDataTypeDescription description; 12 | private final String hashKey = "province"; 13 | 14 | public ProvinceStoreMapper() { 15 | description = 16 | new RedisDataTypeDescription(RedisDataTypeDescription.RedisDataType.HASH, hashKey); 17 | } 18 | 19 | public RedisDataTypeDescription getDataTypeDescription() { 20 | return description; 21 | } 22 | 23 | public String getKeyFromTuple(ITuple iTuple) { 24 | return iTuple.getStringByField("province"); 25 | } 26 | 27 | public String getValueFromTuple(ITuple iTuple) { 28 | return iTuple.getStringByField("price"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/WebsiteBolt.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.topology.BasicOutputCollector; 4 | import org.apache.storm.topology.OutputFieldsDeclarer; 5 | import org.apache.storm.topology.base.BaseBasicBolt; 6 | import org.apache.storm.tuple.Fields; 7 | import org.apache.storm.tuple.Tuple; 8 | import org.apache.storm.tuple.Values; 9 | 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | /** 14 | * Created by qianxi.zhang on 11/26/16. 
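 * Same pattern as ProvinceBolt, keyed by website: keeps a running total of price per
 * website in an in-memory map and emits (website, runningTotal) to the Redis store bolt.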
15 | */ 16 | public class WebsiteBolt extends BaseBasicBolt { 17 | 18 | Map website_price = new HashMap(); 19 | 20 | public void execute(Tuple input, BasicOutputCollector collector) { 21 | String website = input.getStringByField("website"); 22 | long price = Long.valueOf(input.getStringByField("price")); 23 | long totalPrice = price; 24 | if (website_price.containsKey(website)) { 25 | totalPrice += website_price.get(website); 26 | } 27 | website_price.put(website, totalPrice); 28 | collector.emit(new Values(website, String.valueOf(totalPrice))); 29 | } 30 | 31 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 32 | declarer.declare(new Fields("website", "price")); 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /storm/storm_realtime_process/src/main/java/bigdata/storm/WebsiteStoreMapper.java: -------------------------------------------------------------------------------- 1 | package bigdata.storm; 2 | 3 | import org.apache.storm.redis.common.mapper.RedisDataTypeDescription; 4 | import org.apache.storm.redis.common.mapper.RedisStoreMapper; 5 | import org.apache.storm.tuple.ITuple; 6 | 7 | /** 8 | * Created by qianxi.zhang on 11/26/16. 9 | */ 10 | public class WebsiteStoreMapper implements RedisStoreMapper { 11 | private RedisDataTypeDescription description; 12 | private final String hashKey = "website"; 13 | 14 | public WebsiteStoreMapper() { 15 | description = 16 | new RedisDataTypeDescription(RedisDataTypeDescription.RedisDataType.HASH, hashKey); 17 | } 18 | 19 | public RedisDataTypeDescription getDataTypeDescription() { 20 | return description; 21 | } 22 | 23 | public String getKeyFromTuple(ITuple iTuple) { 24 | return iTuple.getStringByField("website"); 25 | } 26 | 27 | public String getValueFromTuple(ITuple iTuple) { 28 | return iTuple.getStringByField("price"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /visualization/command/start-web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python /home/bigdata/visualization/py-echarts/main.py 4 | -------------------------------------------------------------------------------- /visualization/py-echarts/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from flask import Flask, render_template 4 | import json 5 | from models import Chart 6 | from query_presto import Presto_Query 7 | from query_redis import Redis_Query 8 | 9 | app = Flask(__name__) 10 | 11 | @app.route("/") 12 | def index(): 13 | presto=Presto_Query() 14 | age_price_tuples=presto.query_age_price() 15 | age_dict=presto.getAgeDict(age_price_tuples) 16 | chart1 = Chart().pie("饼图", data=age_dict 17 | ) 18 | 19 | tuples=presto.query_brand_price() 20 | keys=presto.getKeys(tuples) 21 | values=presto.getValues(tuples) 22 | chart2 = Chart() \ 23 | .x_axis(data=keys) \ 24 | .y_axis(formatter="{value}") \ 25 | .bar(u"Brand Price", values, show_item_label=True) 26 | 27 | redis=Redis_Query() 28 | province_price=redis.query_province() 29 | china_province_price=redis.get_province_price(province_price) 30 | print china_province_price 31 | chart3= Chart()\ 32 | .map(china_province_price) 33 | 34 | render = { 35 | "title": u"电商双十一大数据日志分析系统", 36 | "templates": [ 37 | {"type": "chart", "title":u"不同年龄消费的情况", "option": json.dumps(chart1, indent=2)}, 38 | {"type": "chart", "title":u"消费商品的情况", "option": json.dumps(chart2, indent=2)}, 39 | 
{"type": "chart", "title":u"各省购买情况", "option": json.dumps(chart3, indent=2)} 40 | ] 41 | } 42 | return render_template("main.html", **render) 43 | 44 | if __name__ == "__main__": 45 | app.run(debug=True) 46 | -------------------------------------------------------------------------------- /visualization/py-echarts/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | 5 | class Chart(dict): 6 | """ 7 | 图表模板 8 | """ 9 | def __init__(self): 10 | super(Chart, self).__init__() 11 | self["calculable"] = True 12 | self["tooltip"] = {"show": True} 13 | self["toolbox"] = { 14 | "show": True, 15 | "x": "left", 16 | "feature": { 17 | "dataView": { 18 | "show": True, 19 | "readOnly": False 20 | }, 21 | "magicType": { 22 | "show": True, 23 | "type": ["line", "bar"] 24 | }, 25 | "restore": { 26 | "show": True 27 | }, 28 | "saveAsImage": { 29 | "show": True 30 | }, 31 | "dataZoom": { 32 | "show": True, 33 | "title": { 34 | "dataZoom": u"区域缩放", 35 | "dataZoomReset": u"区域缩放后退" 36 | } 37 | } 38 | } 39 | } 40 | self["legend"] = { 41 | "show": True, 42 | "data": [] 43 | } 44 | self["series"] = [] 45 | 46 | def title(self, x="center", **kwargs): 47 | """ 48 | 设置图表标题 49 | """ 50 | self["title"].update({ 51 | "x": x 52 | }) 53 | self["title"].update(kwargs) 54 | return self 55 | 56 | def tooltip(self, show=True, trigger='axis', formatter=None, **kwargs): 57 | """ 58 | 设置提示信息 59 | """ 60 | self["tooltip"].update({ 61 | "show": show, 62 | "trigger": trigger 63 | }) 64 | if formatter is not None: 65 | self["tooltip"].update({"formatter": formatter}) 66 | self["tooltip"].update(kwargs) 67 | return self 68 | 69 | def legend(self, show=True, data=None, orient='horizontal', **kwargs): 70 | """ 71 | 设置图例 72 | `data`: [u"图例1", u"图例2", u"图例3"] 73 | `orient`: "vertical"|"horizontal" 74 | """ 75 | data = [] if data is None else data 76 | self["legend"].update({ 77 | "show": show, 78 | "data": data, 79 | "orient": orient 80 | }) 81 | self["legend"].update(kwargs) 82 | return self 83 | 84 | def toolbox(self, show=True, x='left', **kwargs): 85 | """ 86 | 设置工具箱 87 | """ 88 | self["toolbox"].update({ 89 | "show": show, 90 | "x": x 91 | }) 92 | self["toolbox"].update(kwargs) 93 | return self 94 | 95 | def pie(self, name, data=None, radius="55%", center=None, auto_legend=True, **kwargs): 96 | """ 97 | 添加一个饼图 98 | `data`: {u"名称": 100}, u"名称2": 200} 99 | """ 100 | center = ["50%", "60%"] if center is None else center 101 | data = {} if data is None else data 102 | self["series"].append(self.__merge_dict({ 103 | "type": "pie", 104 | "name": name, 105 | "radius": radius, 106 | "center": center, 107 | "data": [{"name": n, "value": v} for n, v in data.items()] 108 | }, kwargs)) 109 | if auto_legend: 110 | legend_data = self["legend"]["data"] 111 | [legend_data.append(x) for x in data if x not in legend_data] 112 | return self 113 | 114 | def bar(self, name, data=None, auto_legend=True, y_axis_index=0, **kwargs): 115 | """ 116 | 添加一个柱状图 117 | `data`: [10, 20, 30, 40] 118 | `auto_legend`: 自动生成图例 119 | """ 120 | data = [] if data is None else data 121 | self["series"].append(self.__merge_dict({ 122 | "type": "bar", 123 | "name": name, 124 | "data": data, 125 | "yAxisIndex": y_axis_index 126 | }, kwargs)) 127 | if "yAxis" not in self: 128 | self.y_axis() 129 | if name not in self["legend"]["data"] and auto_legend: 130 | self["legend"]["data"].append(name) 131 | return self 132 | 133 | def line(self, name, data=None, mark_max_point=False, 
mark_min_point=False, show_item_label=False, auto_legend=True, y_axis_index=0, **kwargs): 134 | """ 135 | 添加一个折线图 136 | `data`: [10, 20, 30, 40] 137 | """ 138 | data = [] if data is None else data 139 | mark_point = [] 140 | if mark_max_point: 141 | mark_point.append({"type": "max", "name": "最大值"}) 142 | if mark_min_point: 143 | mark_point.append({"type": "min", "name": "最小值"}) 144 | self["series"].append(self.__merge_dict({ 145 | "type": "line", 146 | "name": name, 147 | "data": data, 148 | "markPoint": { 149 | "data":mark_point 150 | }, 151 | "itemStyle": { 152 | "normal": { 153 | "label": {"show": show_item_label} 154 | } 155 | }, 156 | "yAxisIndex": y_axis_index 157 | }, kwargs)) 158 | if "yAxis" not in self: 159 | self.y_axis() 160 | if name not in self["legend"]["data"] and auto_legend: 161 | self["legend"]["data"].append(name) 162 | return self 163 | 164 | def x_axis(self, data=None, type_="category", name="", **kwargs): 165 | """ 166 | 添加X轴 167 | """ 168 | data = [] if data is None else data 169 | if "xAxis" not in self: 170 | self["xAxis"] = [] 171 | self["xAxis"].append(self.__merge_dict({ 172 | "type": type_, 173 | "name": name, 174 | "data": data 175 | }, kwargs)) 176 | return self 177 | 178 | def y_axis(self, data=None, type_="value", name="", formatter=None, **kwargs): 179 | """ 180 | 添加X轴 181 | """ 182 | if "yAxis" not in self: 183 | self["yAxis"] = [] 184 | self["yAxis"].append(self.__merge_dict({ 185 | "type": type_, 186 | "name": name, 187 | }, {"axisLabel": {"formatter": formatter}} if formatter is not None else {}, kwargs)) 188 | if data is not None: 189 | self["yAxis"] = data 190 | return self 191 | def map(self,data,**kwargs): 192 | self["legend"]={ 193 | "orient":"vertical", 194 | "left": "left", 195 | "data":['price'] 196 | } 197 | self["toolbox"]={ 198 | "show": True, 199 | "orient": "vertical", 200 | "left": "right", 201 | "top": "center", 202 | "feature": { 203 | "mark":{"show":True}, 204 | "dataView": {"show": True, "readOnly": False}, 205 | "restore": {"show":True}, 206 | "saveAsImage": {"show":True} 207 | } 208 | } 209 | #data={"name": '北京',"value": 10 } 210 | #data=json.dumps(data,ensure_ascii=False) 211 | self["series"]=[{ 212 | "name":"price", 213 | "type": "map", 214 | "mapType": "china", 215 | "roam": False, 216 | "label": { 217 | "normal": { 218 | "show": True 219 | }, 220 | "emphasis": { 221 | "show": True 222 | } 223 | }, 224 | #"data": [data] 225 | "data": [{"name": n, "value": v} for n, v in data.items()] 226 | }] 227 | self["visualMap"]={ 228 | "min": 0, 229 | "max": 2500, 230 | "left": 'left', 231 | "top": 'bottom', 232 | "text": ['高','低'], 233 | "calculable": True 234 | } 235 | self["tooltip"]={ 236 | "trigger":"item" 237 | } 238 | # self["title"]={ 239 | # "text": 'price', 240 | # "subtext": 'price', 241 | # "left": 'center' 242 | # } 243 | return self 244 | 245 | 246 | @staticmethod 247 | def __merge_dict(*args): 248 | """ 249 | 合并多个字典并返回 250 | """ 251 | return reduce(lambda x, y: dict(x.items() + y.items()), args) 252 | 253 | 254 | def main(): 255 | c = Chart().tooltip() 256 | print json.dumps(c) 257 | 258 | if __name__ == "__main__": 259 | main() 260 | -------------------------------------------------------------------------------- /visualization/py-echarts/models.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/py-echarts/models.pyc 
-------------------------------------------------------------------------------- /visualization/py-echarts/query_presto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pyhive import presto 5 | 6 | PRESTO_SERVER = {'host': 'bigdata', 'port': 8080, 'catalog': 'hive', 'schema': 'default'} 7 | BRAND_PRICE_QUERY="select brand,sum(price) as totalPrice from record join brand_dimension on record.bid=brand_dimension.bid group by brand_dimension.brand order by totalPrice desc limit 10" 8 | 9 | AGE_PRICE_QUERY="select cast((year(CURRENT_DATE)-year(birth)) as integer) as age,sum(price) as totalPrice from record join user_dimension on record.uid=user_dimension.uid group by cast((year(CURRENT_DATE)-year(birth)) as integer) order by totalPrice desc" 10 | 11 | class Presto_Query: 12 | 13 | def query_brand_price(self): 14 | conn = presto.connect(**PRESTO_SERVER) 15 | cursor = conn.cursor() 16 | cursor.execute(BRAND_PRICE_QUERY) 17 | tuples=cursor.fetchall() 18 | return tuples 19 | 20 | def getKeys(self,tuples): 21 | keys=[] 22 | for tuple in tuples: 23 | keys.append(tuple[0]) 24 | return keys 25 | 26 | def getValues(self, tuples): 27 | values=[] 28 | for tuple in tuples: 29 | values.append(tuple[1]) 30 | return values 31 | 32 | def query_age_price(self): 33 | conn = presto.connect(**PRESTO_SERVER) 34 | cursor = conn.cursor() 35 | cursor.execute(AGE_PRICE_QUERY) 36 | tuples=cursor.fetchall() 37 | return tuples 38 | 39 | def getAgeDict(self, tuples): 40 | dict={'<10':0L,'10~20':0L,'20~30':0L,'30~40':0L,'40~50':0L,'50~60':0L,'60~70':0L,'>70':0L} 41 | for tuple in tuples: 42 | age=int(tuple[0]) 43 | price=long(tuple[1]) 44 | age=age/10; 45 | if age<1: 46 | value=dict['<10'] 47 | dict['<10']=value+price 48 | elif age>=1 and age<2: 49 | value=dict['10~20'] 50 | dict['10~20']=value+price 51 | elif age>=2 and age<3: 52 | value=dict['20~30'] 53 | dict['20~30']=value+price 54 | elif age>=3 and age<4: 55 | value=dict['30~40'] 56 | dict['30~40']=value+price 57 | elif age>=4 and age<5: 58 | value=dict['40~50'] 59 | dict['40~50']=value+price 60 | elif age>=5 and age<6: 61 | value=dict['50~60'] 62 | dict['50~60']=value+price 63 | elif age>=6 and age<7: 64 | value=dict['60~70'] 65 | dict['60~70']=value+price 66 | else: 67 | value=dict['>70'] 68 | dict['>70']=value+price 69 | return dict 70 | -------------------------------------------------------------------------------- /visualization/py-echarts/query_presto.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/py-echarts/query_presto.pyc -------------------------------------------------------------------------------- /visualization/py-echarts/query_redis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import redis 6 | 7 | 
PROVINCE_MAP={"BeiJing":"北京","ShangHai":"上海","TianJin":"天津","ChongQing":"重庆","XiangGang":"香港","Aomen":"澳门","AnHui":"安徽","FuJian":"福建","GuangDong":"广东","GuangXi":"广西","GuiZhou":"贵州","GanSu":"甘肃","HaiNan":"海南","HeBei":"河北","HeNan":"河南","HeiLongJiang":"黑龙江","HuBei":"湖北","HuNan":"湖南","JiLin":"吉林","JiangSu":"江苏","JiangXi":"江西","LiaoNing":"辽宁","NeiMengGu":"内蒙古","NingXia":"宁夏","QingHai":"青海","ShanXi1":"山西","ShanXi3":"陕西","ShanDong":"山东","SiChuan":"四川","TaiWan":"台湾","XiZang":"西藏","XinJiang":"新疆","YunNan":"云南","ZheJiang":"浙江"} 8 | 9 | class Redis_Query: 10 | 11 | def query_province(self): 12 | r = redis.StrictRedis(host='127.0.0.1', port=6379) 13 | return r.hgetall('province') 14 | 15 | def get_province_price(self,dict): 16 | china_price={} 17 | for k,v in dict.items(): 18 | if k in PROVINCE_MAP: 19 | new_key=PROVINCE_MAP[k] 20 | china_price[new_key]=v 21 | return china_price 22 | -------------------------------------------------------------------------------- /visualization/py-echarts/query_redis.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/py-echarts/query_redis.pyc -------------------------------------------------------------------------------- /visualization/py-echarts/templates/chart.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 52 | -------------------------------------------------------------------------------- /visualization/py-echarts/templates/main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 电商双十一大数据日志分析系统 6 | 7 | 14 | 15 | 16 | 17 |

{{title}}

18 | {% set i = 1 %} 19 | {% for template in templates %} 20 | {% if template.type == 'chart' %} 21 |

22 | {{template.title}} 23 |

24 |
25 | 31 | 45 | 46 | {% set i = i + 1 %} 47 | {% endif %} 48 | {% endfor %} 49 | 50 | 51 | -------------------------------------------------------------------------------- /visualization/result/image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/result/image-1.png -------------------------------------------------------------------------------- /visualization/result/image-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/result/image-2.png -------------------------------------------------------------------------------- /visualization/result/image-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigdataguide/hadooptraining/f374203eefeb3d6a0f4f64c9c0b841306a197349/visualization/result/image-3.png --------------------------------------------------------------------------------