├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── images ├── index.png ├── reverseindex.jpg ├── 分区.jpg └── 简单模型.jpg ├── resources ├── comments.xml ├── friendsdata.txt ├── itemcf.csv ├── order.txt ├── people.csv ├── product.txt └── rand.sh └── src └── main └── java ├── InputOutputFormatTest └── MultiInOutput.java ├── InvertedIndex ├── InvertedCombiner.java ├── InvertedJob.java ├── InvertedMapper.java └── InvertedReducer.java ├── gradesAverage └── GradesAverage.java ├── mapReduceTest └── wordCount │ └── WordCount.java ├── mapreduceProgram ├── DateSortAsc.java ├── DateSortDesc.java ├── FlowPartition.java ├── FlowSort.java ├── FlowStatistics.java └── GroupMax.java ├── mergeMultipleFiles ├── MergeJob.java ├── MergeMapper.java ├── MyInputFormat.java └── MyRecordReader.java ├── mutualFriend ├── DecomposeFriendsMapper.java ├── DecomposeFriendsReducer.java ├── JobControlRun.java ├── JobRun.java ├── MergeFriendsMapper.java └── MergeFriendsReducer.java ├── shuffleTest ├── MonthAscTempDescSort.java └── TempSort.java ├── ssdut └── training │ └── mapreduce │ ├── counter │ └── YearCounter.java │ ├── datecount │ ├── DateCount.java │ ├── DateDistinct.java │ ├── DateFilter.java │ ├── DateGroup.java │ ├── DateGroup2.java │ ├── DatePartition.java │ ├── DatePartition2.java │ ├── DateSort.java │ ├── DateSort2.java │ └── DateSort3.java │ ├── inputformat │ ├── FixedLengthInput.java │ ├── FixedLengthInput2.java │ ├── KeyValueInput.java │ ├── MultInput.java │ ├── MultInput2.java │ ├── NLineInput.java │ └── SequenceInput.java │ ├── itemcf │ ├── StartRun.java │ ├── Step1.java │ ├── Step2.java │ ├── Step3.java │ ├── Step4.java │ ├── Step5.java │ └── Step6.java │ ├── medianstddev │ ├── MRDPUtils.java │ ├── MedianStdDevJob.java │ ├── MedianStdDevMapper.java │ ├── MedianStdDevReducer.java │ └── MedianStdDevTuple.java │ ├── minmaxcount │ ├── MRDPUtils.java │ ├── MinMaxCountJob.java │ ├── MinMaxCountMapper.java │ ├── MinMaxCountReducer.java │ └── MinMaxCountTuple.java │ ├── output │ ├── CompressOutput.java │ └── MultOutput.java │ ├── peoplerank │ ├── People.java │ ├── PeopleRank.java │ └── PeopleRank2.java │ └── topten │ ├── TopTenJob.java │ ├── TopTenMapper.java │ └── TopTenReducer.java └── weblog ├── FlowCount.java ├── IPCount.java ├── Missed.java ├── PVMinMax.java ├── PVMinMax2.java └── PVTopTen.java /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 josonle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /images/index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/index.png -------------------------------------------------------------------------------- /images/reverseindex.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/reverseindex.jpg -------------------------------------------------------------------------------- /images/分区.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/分区.jpg -------------------------------------------------------------------------------- /images/简单模型.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/简单模型.jpg -------------------------------------------------------------------------------- /resources/comments.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /resources/friendsdata.txt: -------------------------------------------------------------------------------- 1 | A:B,C,D,F,E,O 2 | B:A,C,E,K 3 | C:F,A,D,I 4 | D:A,E,F,L 5 | E:B,C,D,M,L 6 | F:A,B,C,D,E,O,M 7 | G:A,C,D,E,F 8 | H:A,C,D,E,O 9 | I:A,O 10 | J:B,O 11 | K:A,C,D 12 | L:D,E,F 13 | M:E,F,G 14 | O:A,H,I,J -------------------------------------------------------------------------------- /resources/order.txt: -------------------------------------------------------------------------------- 1 | 1001 20150710 P0001 2 2 | 1002 20150710 P0001 3 3 | 1002 20150710 P0002 3 4 | 1003 20150710 P0003 3 -------------------------------------------------------------------------------- /resources/people.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | a,c 3 | a,d 4 | b,a 5 | b,d 6 | c,a 7 | d,b 8 | d,c 9 | -------------------------------------------------------------------------------- /resources/product.txt: -------------------------------------------------------------------------------- 1 | P0001 小米5 1001 2 2 | P0002 锤子T1 1000 3 3 | P0003 锤子 1002 4 -------------------------------------------------------------------------------- /resources/rand.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | num=$1 3 | [[ -z $num ]] && num=100 4 | 5 | for ((i=1;i<=$num;i++)) 6 | do 7 | year=$(expr $RANDOM % 3 + 2015) 8 | month=$(expr $RANDOM % 12 + 1) 9 | 10 | case $month in 11 | 1 | 3 | 5 | 7 | 8 | 10 | 12) 12 | day=$(expr $RANDOM % 31 + 1) 13 | ;; 14 | 2) 15 | if [[ $year -eq 2016 && $month -eq 2 ]] 16 | then 17 | day=$(expr $RANDOM % 29 + 1) 18 | else 19 | day=$(expr $RANDOM % 28 + 1) 20 | fi 21 | ;; 22 | 4 | 6 | 9 | 11) 23 | day=$(expr $RANDOM % 30 + 1) 24 | ;; 25 | esac 26 | 27 | if [[ $month -lt 10 ]] 28 | then 29 | month=0$month 30 | fi 31 | 32 | if [[ $day -lt 10 ]] 33 | then 34 | day=0$day 35 | fi 36 | 37 | echo "$year-$month-$day:$i" 38 | done 39 | -------------------------------------------------------------------------------- /src/main/java/InputOutputFormatTest/MultiInOutput.java: -------------------------------------------------------------------------------- 1 | package InputOutputFormatTest; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 17 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 19 | 20 | import ssdut.training.mapreduce.output.MultOutput; 21 | import ssdut.training.mapreduce.output.MultOutput.MultOutputMapper; 22 | import ssdut.training.mapreduce.output.MultOutput.MultOutputReducer; 23 | 24 | import org.apache.hadoop.io.IntWritable; 25 | import org.apache.hadoop.io.Text; 26 | 27 | public class MultiInOutput { 28 | public static class TxtFileMapper extends Mapper { 29 | private final static IntWritable one = new IntWritable(1); 30 | 31 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 32 | String[] strs = value.toString().split(" "); 33 | Text date = new Text(strs[0]); 34 | context.write(date, one); 35 | } 36 | } 37 | 38 | public static class CsvFileMapper extends Mapper { 39 | private final static IntWritable one = new IntWritable(1); 40 | 41 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 42 | String[] strs = value.toString().split(";");//定义csv文件时用了;做分隔符 43 | context.write(new Text(strs[0]), one); 44 | } 45 | } 46 | 47 | public static class MultOutputReducer extends Reducer { 48 | // 通过 MultipleOutputs 类控制输出的文件名和输出路径 49 | // 定义MultipleOutput对象 50 | private MultipleOutputs mos; 51 | 52 | // 覆写MultipleOutput对象的setup()初始化和cleanup()关闭mos对象方法 53 | protected void setup(Context context) { 54 | mos = new MultipleOutputs(context); 55 | } 56 | 57 | protected void cleanup(Context context) throws IOException, InterruptedException { 58 | mos.close(); 59 | } 60 | 61 | public void reduce(Text key, Iterable values, Context context) 62 | throws IOException, InterruptedException { 63 | int sum = 0; 64 | for (IntWritable value : 
values) { 65 | sum += value.get(); 66 | } 67 | // 使用MultiOutputs对象替代Context对象输出 68 | // 1. 输出到不同文件(格式、文件名) 69 | if (key.toString().startsWith("2015")) 70 | mos.write("f2015", key, new IntWritable(sum)); 71 | else if (key.toString().startsWith("2016")) 72 | mos.write("f2016", key, new IntWritable(sum)); 73 | else 74 | mos.write("f2017", key, new IntWritable(sum)); 75 | 76 | } 77 | } 78 | 79 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 80 | // 1.设置HDFS配置信息 81 | String namenode_ip = "192.168.17.10"; 82 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 83 | Configuration conf = new Configuration(); 84 | conf.set("fs.defaultFS", hdfs); 85 | conf.set("mapreduce.app-submission.cross-platform", "true"); 86 | 87 | // 2.设置MapReduce作业配置信息 88 | String jobName = "MultInputOutput"; // 作业名称 89 | Job job = Job.getInstance(conf, jobName); 90 | job.setJarByClass(MultiInOutput.class); // 指定运行时作业类 91 | job.setJar("export\\MultiInOutput.jar"); // 指定本地jar包 92 | job.setMapOutputKeyClass(Text.class); // 设置Mapper输出Key类型 93 | job.setMapOutputValueClass(IntWritable.class); // 设置Mapper输出Value类型 94 | job.setReducerClass(MultOutputReducer.class); // 指定Reducer类 95 | // job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 96 | // job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 97 | 98 | // 3.指定作业多输入路径,及Map所使用的类 99 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multiinoutput/data/txt"), TextInputFormat.class, TxtFileMapper.class); 100 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multiinoutput/data/csv"), TextInputFormat.class, CsvFileMapper.class); 101 | 102 | // 定义多文件输出的文件名、输出格式、Reduce输出键类型,值类型 103 | MultipleOutputs.addNamedOutput(job, "f2015", TextOutputFormat.class, Text.class, IntWritable.class); 104 | MultipleOutputs.addNamedOutput(job, "f2016", SequenceFileOutputFormat.class, Text.class, IntWritable.class); 105 | MultipleOutputs.addNamedOutput(job, "f2017", MapFileOutputFormat.class, Text.class, IntWritable.class); 106 | 107 | // 设置作业输出路径 108 | String outputDir = "/expr/multiinoutput/output"; // 实验输出目录 109 | Path outPath = new Path(hdfs + outputDir); 110 | FileOutputFormat.setOutputPath(job, outPath); 111 | FileSystem fs = FileSystem.get(conf); 112 | if (fs.exists(outPath)) { 113 | fs.delete(outPath, true); 114 | } 115 | 116 | // 4.运行作业 117 | System.out.println("Job: " + jobName + " is running..."); 118 | if (job.waitForCompletion(true)) { 119 | System.out.println("success!"); 120 | System.exit(0); 121 | } else { 122 | System.out.println("failed!"); 123 | System.exit(1); 124 | } 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/InvertedIndex/InvertedCombiner.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class InvertedCombiner extends Reducer { 8 | private Text info = new Text(); 9 | 10 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 11 | int sum = 0; 12 | for (Text value : values) { 13 | sum += Integer.parseInt(value.toString()); //单词数求和 14 | } 15 | int splitIndex = key.toString().indexOf(":"); //获取key中冒号的下标 16 | //注意此处应先计算info再计算key,否则key下标会越界 17 | info.set(key.toString().substring(splitIndex + 1) + ":" + sum); //将key中冒号后的内容(文件名)与单词数总和组合成Combiner输出的value 18 | 
key.set(key.toString().substring(0, splitIndex)); //将key中冒号前的内容(单词)设置为Combiner输出的key 19 | context.write(key, info); //输出格式:"word filename:sum" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/InvertedIndex/InvertedJob.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class InvertedJob { 12 | public static void main(String[] args) throws Exception { 13 | String namenode_ip = "192.168.17.10"; 14 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 15 | Configuration conf = new Configuration(); 16 | conf.set("fs.defaultFS", hdfs); 17 | conf.set("mapreduce.app-submission.cross-platform", "true"); 18 | 19 | String jobName = "InvertedJob"; 20 | Job job = Job.getInstance(conf, jobName); 21 | job.setJarByClass(InvertedJob.class); 22 | job.setJar("export\\InvertedJob.jar"); 23 | job.setMapperClass(InvertedMapper.class); 24 | job.setMapOutputKeyClass(Text.class); 25 | job.setMapOutputValueClass(Text.class); 26 | job.setCombinerClass(InvertedCombiner.class); //此处定义Combiner类,与Reducer类不同 27 | job.setReducerClass(InvertedReducer.class); 28 | job.setOutputKeyClass(Text.class); 29 | job.setOutputValueClass(Text.class); 30 | 31 | String dataDir = "/expr/inverted/data"; 32 | String outputDir = "/expr/inverted/output"; 33 | Path inPath = new Path(hdfs + dataDir); 34 | Path outPath = new Path(hdfs + outputDir); 35 | FileInputFormat.addInputPath(job, inPath); 36 | FileOutputFormat.setOutputPath(job, outPath); 37 | FileSystem fs = FileSystem.get(conf); 38 | if(fs.exists(outPath)) { 39 | fs.delete(outPath, true); 40 | } 41 | 42 | System.out.println("Job: " + jobName + " is running..."); 43 | if(job.waitForCompletion(true)) { 44 | System.out.println("success!"); 45 | System.exit(0); 46 | } else { 47 | System.out.println("failed!"); 48 | System.exit(1); 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/InvertedIndex/InvertedMapper.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 8 | 9 | public class InvertedMapper extends Mapper { 10 | private Text keyInfo = new Text(); 11 | private Text valueInfo = new Text(); 12 | private FileSplit split; 13 | 14 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 15 | split = (FileSplit) context.getInputSplit(); //通过context获取输入分片对象,目的是获得输入文件名称 16 | StringTokenizer itr = new StringTokenizer(value.toString()); 17 | while (itr.hasMoreTokens()) { 18 | keyInfo.set(itr.nextToken() + ":" + split.getPath().getName()); //将单词及其所属文件拼接成"word:filename"格式作为key 19 | valueInfo.set("1"); 20 | context.write(keyInfo, valueInfo); //输出格式: "word:filename 1" 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- 
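The InvertedIndex classes above implement the key/value flow that the inline comments describe: InvertedMapper emits "word:filename" -> 1 for every token, InvertedCombiner sums those counts per file and re-keys them as word -> "filename:sum", and InvertedReducer (the next file below) concatenates the per-file postings into "word  file1:n1; file2:n2; ". The stand-alone sketch below is not part of this repository; it mirrors that three-step flow in plain Java (the class name and sample data are invented for illustration) so the transformation can be traced without a Hadoop cluster.

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class InvertedIndexFlowSketch {
    public static void main(String[] args) {
        // Two tiny in-memory "files" stand in for the HDFS input (illustrative data only).
        Map<String, String> files = new LinkedHashMap<>();
        files.put("a.txt", "hello world hello");
        files.put("b.txt", "hello mapreduce");

        // "Map" step, mirroring InvertedMapper: emit ("word:filename", 1) per token.
        List<Map.Entry<String, Integer>> mapped = new ArrayList<>();
        for (Map.Entry<String, String> f : files.entrySet()) {
            for (String w : f.getValue().split("\\s+")) {
                mapped.add(new AbstractMap.SimpleEntry<>(w + ":" + f.getKey(), 1));
            }
        }

        // "Combine" step, mirroring InvertedCombiner: sum counts per "word:filename".
        Map<String, Integer> perFile = new TreeMap<>();
        for (Map.Entry<String, Integer> e : mapped) {
            perFile.merge(e.getKey(), e.getValue(), Integer::sum);
        }

        // "Reduce" step, mirroring InvertedReducer: re-key to the word and
        // concatenate its per-file postings as "filename:sum; ".
        Map<String, StringBuilder> index = new TreeMap<>();
        for (Map.Entry<String, Integer> e : perFile.entrySet()) {
            int i = e.getKey().indexOf(':');
            String word = e.getKey().substring(0, i);
            String posting = e.getKey().substring(i + 1) + ":" + e.getValue();
            index.computeIfAbsent(word, k -> new StringBuilder()).append(posting).append("; ");
        }

        index.forEach((word, postings) -> System.out.println(word + "\t" + postings));
    }
}

Running the sketch prints, for example, "hello	a.txt:2; b.txt:1; ", which matches the output format noted in the comments of the reducer that follows.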
/src/main/java/InvertedIndex/InvertedReducer.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class InvertedReducer extends Reducer { 8 | private Text result = new Text(); 9 | 10 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 11 | String fileList = new String(); 12 | for (Text value : values) { 13 | fileList += value.toString() + "; "; 14 | } 15 | result.set(fileList); 16 | context.write(key, result); //输出格式:"word file1:num1; file2:num2;" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/gradesAverage/GradesAverage.java: -------------------------------------------------------------------------------- 1 | package gradesAverage; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.FloatWritable; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | 19 | import mapReduceTest.wordCount.WordCount; 20 | import mapReduceTest.wordCount.WordCount.IntSumReducer; 21 | import mapReduceTest.wordCount.WordCount.TokenizerMapper; 22 | 23 | public class GradesAverage { 24 | 25 | public static class TokenizerMapper extends Mapper { 26 | private Text student = new Text(); 27 | private IntWritable grade = new IntWritable(); 28 | 29 | /* (non-Javadoc) 30 | * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context) 31 | */ 32 | /* (non-Javadoc) 33 | * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context) 34 | */ 35 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 36 | // StringTokenizer iTokenizer = new StringTokenizer(value.toString(),"\n"); 37 | System.out.println("key is:"+key+",value is: "+value.toString()); 38 | // while (iTokenizer.hasMoreTokens()) { 39 | // 40 | // } 41 | String[] list_strs = value.toString().split(" "); 42 | // 因为每行只有一个学号和对应成绩,不需要考虑切分多个词 43 | student.set(list_strs[0]); 44 | grade.set(Integer.parseInt(list_strs[1])); 45 | context.write(student, grade); 46 | } 47 | } 48 | 49 | // public static class gradesAverageCombiner extends Reducer { 50 | // private Text gradesSum = new Text(); 51 | // 52 | // public void reduce(Text key, Iterable values, Context context) 53 | // throws IOException, InterruptedException { 54 | // int sum = 0; 55 | // int grades = 0; 56 | // for (IntWritable val : values) { 57 | // sum += 1; 58 | // grades += val.get(); 59 | // } 60 | // System.out.println("Combiner---student is:"+key.toString()+",grades is:"+grades+",sum is:"+sum); 61 | // gradesSum.set(grades+","+sum); 62 | // System.out.println(gradesSum); 63 | // context.write(key, gradesSum); 64 | // } 65 | // } 66 | public static class gradesAverageReducer extends Reducer { 67 | private FloatWritable gradesSum = new FloatWritable(); 68 | 69 | 
public void reduce(Text key, Iterable values, Context context) 70 | throws IOException, InterruptedException { 71 | int sum = 0; 72 | int grades = 0; 73 | for (IntWritable val : values) { 74 | sum += 1; 75 | grades += val.get(); 76 | } 77 | System.out.println("Reduce----student is:"+key.toString()+",grades is:"+grades+",sum is:"+sum); 78 | gradesSum.set((float)grades/sum); 79 | context.write(key, gradesSum); 80 | } 81 | } 82 | 83 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 84 | // 1.设置HDFS配置信息 85 | String namenode_ip = "192.168.17.10"; 86 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 87 | Configuration conf = new Configuration(); // Hadoop配置类 88 | conf.set("fs.defaultFS", hdfs); 89 | conf.set("mapreduce.app-submission.cross-platform", "true"); // 集群交叉提交 90 | /* 91 | * conf.set("hadoop.job.user", "hadoop"); conf.set("mapreduce.framework.name", 92 | * "yarn"); conf.set("mapreduce.jobtracker.address", namenode_ip + ":9001"); 93 | * conf.set("yarn.resourcemanager.hostname", namenode_ip); 94 | * conf.set("yarn.resourcemanager.resource-tracker.address", namenode_ip + 95 | * ":8031"); conf.set("yarn.resourcemtanager.address", namenode_ip + ":8032"); 96 | * conf.set("yarn.resourcemanager.admin.address", namenode_ip + ":8033"); 97 | * conf.set("yarn.resourcemanager.scheduler.address", namenode_ip + ":8034"); 98 | * conf.set("mapreduce.jobhistory.address", namenode_ip + ":10020"); 99 | */ 100 | 101 | // 2.设置MapReduce作业配置信息 102 | String jobName = "GradesAverage"; // 定义作业名称 103 | Job job = Job.getInstance(conf, jobName); 104 | job.setJarByClass(GradesAverage.class); // 指定作业类 105 | job.setJar("export\\GradesAverage.jar"); // 指定本地jar包 106 | job.setMapperClass(TokenizerMapper.class); 107 | // job.setCombinerClass(gradesAverageCombiner.class); // 指定Combiner类 108 | job.setReducerClass(gradesAverageReducer.class); 109 | // 输出key-value的类型 110 | job.setOutputKeyClass(Text.class); 111 | job.setMapOutputValueClass(IntWritable.class); 112 | job.setOutputValueClass(FloatWritable.class); 113 | 114 | // 3.设置作业输入和输出路径 115 | String dataDir = "/expr/studentgrades/grades"; // 实验数据目录 116 | String outputDir = "/expr/studentgrades/output"; // 实验输出目录 117 | Path inPath = new Path(hdfs + dataDir); 118 | Path outPath = new Path(hdfs + outputDir); 119 | FileInputFormat.addInputPath(job, inPath); 120 | FileOutputFormat.setOutputPath(job, outPath); 121 | // 如果输出目录已存在则删除 122 | FileSystem fs = FileSystem.get(conf); 123 | if (fs.exists(outPath)) { 124 | fs.delete(outPath, true); 125 | } 126 | 127 | // 4.运行作业 128 | System.out.println("Job: " + jobName + " is running..."); 129 | if (job.waitForCompletion(true)) { 130 | System.out.println("统计 success!"); 131 | System.exit(0); 132 | } else { 133 | System.out.println("统计 failed!"); 134 | System.exit(1); 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/mapReduceTest/wordCount/WordCount.java: -------------------------------------------------------------------------------- 1 | package mapReduceTest.wordCount; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import 
org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class WordCount { 17 | 18 | public static class TokenizerMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | private Text word = new Text(); 21 | 22 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 23 | StringTokenizer itr = new StringTokenizer(value.toString()); 24 | while (itr.hasMoreTokens()) { 25 | word.set(itr.nextToken()); 26 | context.write(word, one); 27 | } 28 | } 29 | } 30 | 31 | public static class IntSumReducer extends Reducer { 32 | private IntWritable result = new IntWritable(); 33 | 34 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 35 | int sum = 0; 36 | for (IntWritable val : values) { 37 | sum += val.get(); 38 | } 39 | result.set(sum); 40 | context.write(key, result); 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | //1.设置HDFS配置信息 46 | String namenode_ip = "192.168.17.10"; 47 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 48 | Configuration conf = new Configuration(); //Hadoop配置类 49 | conf.set("fs.defaultFS", hdfs); 50 | conf.set("mapreduce.app-submission.cross-platform", "true"); //集群交叉提交 51 | /* conf.set("hadoop.job.user", "hadoop"); 52 | conf.set("mapreduce.framework.name", "yarn"); 53 | conf.set("mapreduce.jobtracker.address", namenode_ip + ":9001"); 54 | conf.set("yarn.resourcemanager.hostname", namenode_ip); 55 | conf.set("yarn.resourcemanager.resource-tracker.address", namenode_ip + ":8031"); 56 | conf.set("yarn.resourcemtanager.address", namenode_ip + ":8032"); 57 | conf.set("yarn.resourcemanager.admin.address", namenode_ip + ":8033"); 58 | conf.set("yarn.resourcemanager.scheduler.address", namenode_ip + ":8034"); 59 | conf.set("mapreduce.jobhistory.address", namenode_ip + ":10020"); */ 60 | 61 | //2.设置MapReduce作业配置信息 62 | String jobName = "WordCount"; //定义作业名称 63 | Job job = Job.getInstance(conf, jobName); 64 | job.setJarByClass(WordCount.class); //指定作业类 65 | job.setJar("export\\WordCount.jar"); //指定本地jar包 66 | job.setMapperClass(TokenizerMapper.class); 67 | job.setCombinerClass(IntSumReducer.class); //指定Combiner类 68 | job.setReducerClass(IntSumReducer.class); 69 | job.setOutputKeyClass(Text.class); 70 | job.setOutputValueClass(IntWritable.class); 71 | 72 | //3.设置作业输入和输出路径 73 | String dataDir = "/expr/wordcount/data"; //实验数据目录 74 | String outputDir = "/expr/wordcount/output"; //实验输出目录 75 | Path inPath = new Path(hdfs + dataDir); 76 | Path outPath = new Path(hdfs + outputDir); 77 | FileInputFormat.addInputPath(job, inPath); 78 | FileOutputFormat.setOutputPath(job, outPath); 79 | //如果输出目录已存在则删除 80 | FileSystem fs = FileSystem.get(conf); 81 | if(fs.exists(outPath)) { 82 | fs.delete(outPath, true); 83 | } 84 | 85 | //4.运行作业 86 | System.out.println("Job: " + jobName + " is running..."); 87 | if(job.waitForCompletion(true)) { 88 | System.out.println("success!"); 89 | System.exit(0); 90 | } else { 91 | System.out.println("failed!"); 92 | System.exit(1); 93 | } 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/DateSortAsc.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import 
org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | 17 | public class DateSortAsc { 18 | 19 | public static class SortMapper extends Mapper { 20 | private IntWritable num = new IntWritable(); 21 | 22 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] strs = value.toString().split("\t"); 24 | num.set(Integer.parseInt(strs[1])); 25 | // 将次数作为key进行升序排序 26 | context.write(num, new Text(strs[0])); 27 | System.out.println(num.get()+","+strs[0]); 28 | } 29 | } 30 | 31 | public static class SortReducer extends Reducer { 32 | 33 | public void reduce(IntWritable key, Iterable values, Context context) 34 | throws IOException, InterruptedException { 35 | for (Text value : values) { 36 | // 排序后再次颠倒k-v,将日期作为key 37 | System.out.println(value.toString()+":"+key.get()); 38 | context.write(value, key); 39 | } 40 | } 41 | } 42 | 43 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 44 | // 1.设置HDFS配置信息 45 | String namenode_ip = "192.168.17.10"; 46 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 47 | Configuration conf = new Configuration(); 48 | conf.set("fs.defaultFS", hdfs); 49 | conf.set("mapreduce.app-submission.cross-platform", "true"); 50 | 51 | // 2.设置MapReduce作业配置信息 52 | String jobName = "DateSortAsc"; // 定义作业名称 53 | Job job = Job.getInstance(conf, jobName); 54 | job.setJarByClass(DateSortAsc.class); // 指定作业类 55 | job.setJar("export\\DateSortAsc.jar"); // 指定本地jar包 56 | 57 | job.setMapperClass(SortMapper.class); // 指定Mapper类 58 | job.setMapOutputKeyClass(IntWritable.class); // 设置Mapper输出Key类型 59 | job.setMapOutputValueClass(Text.class); // 设置Mapper输出Value类型 60 | 61 | job.setReducerClass(SortReducer.class); // 指定Reducer类 62 | job.setOutputKeyClass(Text.class); // 设置Reduce输出Key类型 63 | job.setOutputValueClass(IntWritable.class); // 设置Reduce输出Value类型 64 | 65 | // 3.设置作业输入和输出路径 66 | String dataDir = "/workspace/dateSort/data"; // 实验数据目录 67 | String outputDir = "/workspace/dateSort/output"; // 实验输出目录 68 | Path inPath = new Path(hdfs + dataDir); 69 | Path outPath = new Path(hdfs + outputDir); 70 | FileInputFormat.addInputPath(job, inPath); 71 | FileOutputFormat.setOutputPath(job, outPath); 72 | FileSystem fs = FileSystem.get(conf); 73 | if (fs.exists(outPath)) { 74 | fs.delete(outPath, true); 75 | } 76 | 77 | // 4.运行作业 78 | System.out.println("Job: " + jobName + " is running..."); 79 | if (job.waitForCompletion(true)) { 80 | System.out.println("success!"); 81 | System.exit(0); 82 | } else { 83 | System.out.println("failed!"); 84 | System.exit(1); 85 | } 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/DateSortDesc.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import 
org.apache.hadoop.io.WritableComparable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Mapper.Context; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | import mapreduceProgram.DateSortAsc.SortMapper; 19 | import mapreduceProgram.DateSortAsc.SortReducer; 20 | 21 | import org.apache.hadoop.io.WritableComparator; 22 | 23 | public class DateSortDesc { 24 | 25 | public static class MyComparator extends WritableComparator { 26 | public MyComparator() { 27 | // TODO Auto-generated constructor stub 28 | super(IntWritable.class, true); 29 | } 30 | 31 | @Override 32 | @SuppressWarnings({ "rawtypes", "unchecked" }) // 不检查类型 33 | public int compare(WritableComparable a, WritableComparable b) { 34 | // CompareTo方法,返回值为1则降序,-1则升序 35 | // 默认是a.compareTo(b),a比b小返回-1,现在反过来返回1,就变成了降序 36 | return b.compareTo(a); 37 | } 38 | 39 | public static class SortMapper extends Mapper { 40 | private IntWritable num = new IntWritable(); 41 | 42 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 43 | String[] strs = value.toString().split("\t"); 44 | num.set(Integer.parseInt(strs[1])); 45 | // 将次数作为key进行升序排序 46 | context.write(num, new Text(strs[0])); 47 | System.out.println(num.get() + "," + strs[0]); 48 | } 49 | } 50 | 51 | public static class SortReducer extends Reducer { 52 | 53 | public void reduce(IntWritable key, Iterable values, Context context) 54 | throws IOException, InterruptedException { 55 | for (Text value : values) { 56 | // 排序后再次颠倒k-v,将日期作为key 57 | System.out.println(value.toString() + ":" + key.get()); 58 | context.write(value, key); 59 | } 60 | } 61 | } 62 | } 63 | 64 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 65 | // 1.设置HDFS配置信息 66 | String namenode_ip = "192.168.17.10"; 67 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 68 | Configuration conf = new Configuration(); 69 | conf.set("fs.defaultFS", hdfs); 70 | conf.set("mapreduce.app-submission.cross-platform", "true"); 71 | 72 | // 2.设置MapReduce作业配置信息 73 | String jobName = "DateSortDesc"; // 定义作业名称 74 | Job job = Job.getInstance(conf, jobName); 75 | job.setJarByClass(DateSortAsc.class); // 指定作业类 76 | job.setJar("export\\DateSortDesc.jar"); // 指定本地jar包 77 | 78 | job.setMapperClass(SortMapper.class); 79 | job.setMapOutputKeyClass(IntWritable.class); 80 | job.setMapOutputValueClass(Text.class); 81 | job.setReducerClass(SortReducer.class); 82 | job.setOutputKeyClass(Text.class); 83 | job.setOutputValueClass(IntWritable.class); 84 | // 指定排序所使用的比较器 85 | job.setSortComparatorClass(MyComparator.class); 86 | 87 | // 3.设置作业输入和输出路径 88 | String dataDir = "/workspace/dateSort/data"; // 实验数据目录 89 | String outputDir = "/workspace/dateSort/output"; // 实验输出目录 90 | Path inPath = new Path(hdfs + dataDir); 91 | Path outPath = new Path(hdfs + outputDir); 92 | FileInputFormat.addInputPath(job, inPath); 93 | FileOutputFormat.setOutputPath(job, outPath); 94 | FileSystem fs = FileSystem.get(conf); 95 | if (fs.exists(outPath)) { 96 | fs.delete(outPath, true); 97 | } 98 | 99 | // 4.运行作业 100 | System.out.println("Job: " + jobName + " is running..."); 101 | if (job.waitForCompletion(true)) { 102 | System.out.println("success!"); 103 | System.exit(0); 104 | } else { 105 | System.out.println("failed!"); 106 | System.exit(1); 107 | } 108 
| 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/FlowPartition.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.io.Writable; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Partitioner; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | 20 | public class FlowPartition { 21 | public static class FlowPartitionMapper extends Mapper { 22 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] strs = value.toString().split("\t"); 24 | Text phone = new Text(strs[0]); 25 | FlowWritable flow = new FlowWritable(Integer.parseInt(strs[1]), Integer.parseInt(strs[2])); 26 | System.out.println("Flow is:" + flow.toString()); 27 | context.write(phone, flow); 28 | } 29 | } 30 | 31 | public static class FlowPartitionReducer extends Reducer { 32 | public void reduce(Text key, Iterable values, Context context) 33 | throws IOException, InterruptedException { 34 | int upFlow = 0; 35 | int downFlow = 0; 36 | 37 | for (FlowWritable value : values) { 38 | upFlow += value.getUpFlow(); 39 | downFlow += value.getDownFlow(); 40 | } 41 | System.out.println(key.toString() + ":" + upFlow + "," + downFlow); 42 | context.write(key, new FlowWritable(upFlow, downFlow)); 43 | } 44 | } 45 | 46 | public static class FlowWritable implements Writable { 47 | private int upFlow; 48 | private int downFlow; 49 | private int sumFlow; 50 | 51 | public FlowWritable() { 52 | } 53 | 54 | public FlowWritable(int upFlow, int downFlow) { 55 | this.upFlow = upFlow; 56 | this.downFlow = downFlow; 57 | this.sumFlow = upFlow + downFlow; 58 | } 59 | 60 | public int getDownFlow() { 61 | return downFlow; 62 | } 63 | 64 | public void setDownFlow(int downFlow) { 65 | this.downFlow = downFlow; 66 | } 67 | 68 | public int getUpFlow() { 69 | return upFlow; 70 | } 71 | 72 | public void setUpFlow(int upFlow) { 73 | this.upFlow = upFlow; 74 | } 75 | 76 | public int getSumFlow() { 77 | return sumFlow; 78 | } 79 | 80 | public void setSumFlow(int sumFlow) { 81 | this.sumFlow = sumFlow; 82 | } 83 | 84 | @Override 85 | public void write(DataOutput out) throws IOException { 86 | // TODO Auto-generated method stub 87 | out.writeInt(upFlow); 88 | out.writeInt(downFlow); 89 | out.writeInt(sumFlow); 90 | } 91 | 92 | @Override 93 | public void readFields(DataInput in) throws IOException { 94 | // TODO Auto-generated method stub 95 | upFlow = in.readInt(); 96 | downFlow = in.readInt(); 97 | sumFlow = in.readInt(); 98 | } 99 | 100 | @Override 101 | public String toString() { 102 | // TODO Auto-generated method stub 103 | return upFlow + "\t" + downFlow + "\t" + sumFlow; 104 | } 105 | } 106 | 107 | public static class PhoneNumberPartitioner extends Partitioner { 108 | private static HashMap numberDict = new HashMap<>(); 109 | static { 110 | numberDict.put("133", 0); 111 | numberDict.put("135", 1); 112 | 
numberDict.put("137", 2); 113 | numberDict.put("138", 3); 114 | } 115 | 116 | @Override 117 | public int getPartition(Text key, FlowWritable value, int numPartitions) { 118 | String num = key.toString().substring(0, 3); 119 | // 借助HashMap返回不同手机段对应的分区号 120 | // 也可以直接通过if判断,如 121 | // 根据年份对数据进行分区,返回不同分区号 122 | // if (key.toString().startsWith("133")) return 0 % numPartitions; 123 | return numberDict.getOrDefault(num, 4); 124 | } 125 | } 126 | 127 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 128 | // 设置hdfs配置信息 129 | String namenode_ip = "192.168.17.10"; 130 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 131 | Configuration conf = new Configuration(); 132 | conf.set("fs.defaultFS", hdfs); 133 | conf.set("mapreduce.app-submission.cross-platform", "true"); 134 | 135 | // 设置作业Job配置信息 136 | String jobName = "FlowPartition"; 137 | Job job = Job.getInstance(conf, jobName); 138 | job.setJarByClass(FlowPartition.class); 139 | job.setJar("export\\FlowPartition.jar"); 140 | // Map 141 | job.setMapperClass(FlowPartitionMapper.class); 142 | // Reduce 143 | job.setReducerClass(FlowPartitionReducer.class); 144 | // 输出k-v类型 145 | job.setOutputKeyClass(Text.class); 146 | job.setOutputValueClass(FlowWritable.class); 147 | // 设置分区类,及Reducer数目 148 | job.setPartitionerClass(PhoneNumberPartitioner.class); 149 | job.setNumReduceTasks(4); 150 | 151 | // 设置job输入出路径 152 | String dataDir = "/workspace/flowStatistics/data"; 153 | String outputDir = "/workspace/flowStatistics/output_partitions"; 154 | Path inPath = new Path(hdfs + dataDir); 155 | Path outPath = new Path(hdfs + outputDir); 156 | FileInputFormat.addInputPath(job, inPath); 157 | FileOutputFormat.setOutputPath(job, outPath); 158 | FileSystem fs = FileSystem.get(conf); 159 | if (fs.exists(outPath)) { 160 | fs.delete(outPath, true); 161 | } 162 | 163 | // 运行作业 164 | System.out.println("Job: " + jobName + " is running..."); 165 | if (job.waitForCompletion(true)) { 166 | System.out.println("success!"); 167 | System.exit(0); 168 | } else { 169 | System.out.println("failed!"); 170 | System.exit(1); 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/FlowSort.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.io.WritableComparable; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | 19 | public class FlowSort { 20 | 21 | public static class MySortKey implements WritableComparable { 22 | private int upFlow; 23 | private int downFlow; 24 | private int sumFlow; 25 | 26 | public MySortKey() { 27 | // TODO Auto-generated constructor stub 28 | } 29 | 30 | public MySortKey(int up, int down) { 31 | upFlow = up; 32 | downFlow = down; 33 | sumFlow = up + down; 34 | } 35 | 36 | public int getUpFlow() { 37 | return upFlow; 38 | } 39 | 40 | public void setUpFlow(int upFlow) { 41 | this.upFlow = upFlow; 42 | } 43 | 44 | public int 
getDownFlow() { 45 | return downFlow; 46 | } 47 | 48 | public void setDownFlow(int downFlow) { 49 | this.downFlow = downFlow; 50 | } 51 | 52 | public int getSumFlow() { 53 | return sumFlow; 54 | } 55 | 56 | public void setSumFlow(int sumFlow) { 57 | this.sumFlow = sumFlow; 58 | } 59 | 60 | @Override 61 | public void write(DataOutput out) throws IOException { 62 | // TODO Auto-generated method stub 63 | out.writeInt(upFlow); 64 | out.writeInt(downFlow); 65 | out.writeInt(sumFlow); 66 | } 67 | 68 | @Override 69 | public void readFields(DataInput in) throws IOException { 70 | // TODO Auto-generated method stub 71 | upFlow = in.readInt(); 72 | downFlow = in.readInt(); 73 | sumFlow = in.readInt(); 74 | } 75 | 76 | @Override 77 | public int compareTo(MySortKey o) { 78 | if ((this.upFlow - o.upFlow) == 0) {// equal upFlow, compare downFlow 79 | return o.downFlow - this.downFlow;// sort by downFlow in descending order 80 | } else { 81 | return this.upFlow - o.upFlow;// sort by upFlow in ascending order 82 | } 83 | } 84 | 85 | @Override 86 | public String toString() { 87 | // TODO Auto-generated method stub 88 | return upFlow + "\t" + downFlow + "\t" + sumFlow; 89 | } 90 | } 91 | 92 | public static class SortMapper extends Mapper<Object, Text, MySortKey, Text> { 93 | Text phone = new Text(); 94 | MySortKey mySortKey = new MySortKey(); 95 | 96 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 97 | String[] lists = value.toString().split("\t"); 98 | phone.set(lists[0]); 99 | mySortKey.setUpFlow(Integer.parseInt(lists[1])); 100 | mySortKey.setDownFlow(Integer.parseInt(lists[2])); 101 | context.write(mySortKey, phone); 102 | System.out.println(phone.toString()+":"+mySortKey.toString()+",up:"+lists[1]+"=="+mySortKey.getUpFlow()); 103 | } 104 | } 105 | 106 | public static class SortReducer extends Reducer<MySortKey, Text, Text, MySortKey> { 107 | public void reduce(MySortKey key, Iterable<Text> values, Context context) 108 | throws IOException, InterruptedException { 109 | for (Text value : values) { 110 | System.out.println(value.toString()+","+key.toString()); 111 | context.write(value, key); 112 | } 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 117 | // Set HDFS configuration 118 | String namenode_ip = "192.168.17.10"; 119 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 120 | Configuration conf = new Configuration(); 121 | conf.set("fs.defaultFS", hdfs); 122 | conf.set("mapreduce.app-submission.cross-platform", "true"); 123 | 124 | // Set job configuration 125 | Job job = Job.getInstance(conf, "FlowSort"); 126 | job.setJarByClass(FlowSort.class); 127 | job.setJar("export\\FlowSort.jar"); 128 | // Mapper 129 | job.setMapperClass(SortMapper.class); 130 | job.setMapOutputKeyClass(MySortKey.class); 131 | job.setMapOutputValueClass(Text.class); 132 | // Reducer 133 | job.setReducerClass(SortReducer.class); 134 | job.setOutputKeyClass(Text.class); 135 | job.setOutputValueClass(MySortKey.class); 136 | // Job input and output paths 137 | String dataDir = "/workspace/flowStatistics/output/part-r-00000"; // experiment data 138 | String outputDir = "/workspace/flowStatistics/output_sort"; // experiment output directory 139 | Path inPath = new Path(hdfs + dataDir); 140 | Path outPath = new Path(hdfs + outputDir); 141 | FileInputFormat.addInputPath(job, inPath); 142 | FileOutputFormat.setOutputPath(job, outPath); 143 | FileSystem fs = FileSystem.get(conf); 144 | if (fs.exists(outPath)) { 145 | fs.delete(outPath, true); 146 | } 147 | // Run the job 148 | System.out.println("Job: FlowSort is running..."); 149 | if (job.waitForCompletion(true)) { 150 | System.out.println("success!"); 151
| System.exit(0); 152 | } else { 153 | System.out.println("failed!"); 154 | System.exit(1); 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/FlowStatistics.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.io.Writable; 16 | import org.apache.hadoop.mapreduce.Job; 17 | 18 | public class FlowStatistics { 19 | 20 | public static class FlowMapper extends Mapper{ 21 | 22 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 23 | String[] strs = value.toString().split("\t"); 24 | Text phone = new Text(strs[0]); 25 | Text flow = new Text(strs[1]+"\t"+strs[2]); 26 | context.write(phone, flow); 27 | } 28 | } 29 | 30 | public static class FlowReducer extends Reducer{ 31 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 32 | int upFlow = 0; 33 | int downFlow = 0; 34 | 35 | for (Text value : values) { 36 | String[] strs = value.toString().split("\t"); 37 | upFlow += Integer.parseInt(strs[0].toString()); 38 | downFlow += Integer.parseInt(strs[1].toString()); 39 | } 40 | int sumFlow = upFlow+downFlow; 41 | 42 | context.write(key,new Text(upFlow+"\t"+downFlow+"\t"+sumFlow)); 43 | } 44 | } 45 | 46 | // 第二种写法 47 | public static class FlowWritableMapper extends Mapper { 48 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 49 | String[] strs = value.toString().split("\t"); 50 | Text phone = new Text(strs[0]); 51 | FlowWritable flow = new FlowWritable(Integer.parseInt(strs[1]),Integer.parseInt(strs[2])); 52 | System.out.println("Flow is:"+flow.toString()); 53 | context.write(phone, flow); 54 | } 55 | } 56 | public static class FlowWritableReducer extends Reducer{ 57 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 58 | int upFlow = 0; 59 | int downFlow = 0; 60 | 61 | for (FlowWritable value : values) { 62 | upFlow += value.getUpFlow(); 63 | downFlow += value.getDownFlow(); 64 | } 65 | System.out.println(key.toString()+":"+upFlow+","+downFlow); 66 | context.write(key,new FlowWritable(upFlow,downFlow)); 67 | } 68 | } 69 | 70 | public static class FlowWritable implements Writable{ 71 | private int upFlow; 72 | private int downFlow; 73 | private int sumFlow; 74 | 75 | public FlowWritable() {} 76 | 77 | public FlowWritable(int upFlow,int downFlow) { 78 | this.upFlow = upFlow; 79 | this.downFlow = downFlow; 80 | this.sumFlow = upFlow+downFlow; 81 | } 82 | 83 | public int getDownFlow() { 84 | return downFlow; 85 | } 86 | 87 | public void setDownFlow(int downFlow) { 88 | this.downFlow = downFlow; 89 | } 90 | 91 | public int getUpFlow() { 92 | return upFlow; 93 | } 94 | 95 | public void setUpFlow(int upFlow) { 96 | this.upFlow = upFlow; 97 | } 98 | 99 | public int getSumFlow() { 100 | return sumFlow; 101 | } 102 | 103 | public void 
setSumFlow(int sumFlow) { 104 | this.sumFlow = sumFlow; 105 | } 106 | 107 | @Override 108 | public void write(DataOutput out) throws IOException { 109 | // TODO Auto-generated method stub 110 | out.writeInt(upFlow); 111 | out.writeInt(downFlow); 112 | out.writeInt(sumFlow); 113 | } 114 | 115 | @Override 116 | public void readFields(DataInput in) throws IOException { 117 | // TODO Auto-generated method stub 118 | upFlow = in.readInt(); 119 | downFlow = in.readInt(); 120 | sumFlow = in.readInt(); 121 | } 122 | 123 | @Override 124 | public String toString() { 125 | // TODO Auto-generated method stub 126 | return upFlow+"\t"+downFlow+"\t"+sumFlow; 127 | } 128 | } 129 | 130 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 131 | // 设置hdfs配置信息 132 | String namenode_ip = "192.168.17.10"; 133 | String hdfs = "hdfs://"+namenode_ip+":9000"; 134 | Configuration conf = new Configuration(); 135 | conf.set("fs.defaultFS", hdfs); 136 | conf.set("mapreduce.app-submission.cross-platform", "true"); 137 | 138 | // 设置作业Job配置信息 139 | String jobName = "FlowStatistics"; 140 | Job job = Job.getInstance(conf, jobName); 141 | job.setJarByClass(FlowStatistics.class); 142 | job.setJar("export\\FlowStatistics.jar"); 143 | // Map 144 | job.setMapperClass(FlowMapper.class);// 第一种 145 | // job.setMapperClass(FlowWritableMapper.class); 146 | // 这里因为同Reducer输出类型一致,可不写 147 | // job.setMapOutputKeyClass(Text.class); 148 | // job.setMapOutputValueClass(FlowWritable.class); 149 | // Reduce 150 | job.setReducerClass(FlowReducer.class);// 第一种 151 | // job.setReducerClass(FlowWritableReducer.class); 152 | // 输出k-v类型 153 | job.setOutputKeyClass(Text.class); 154 | job.setOutputValueClass(Text.class);// 第一种 155 | // job.setOutputValueClass(FlowWritable.class); 156 | 157 | // 设置job输入出路径 158 | String dataDir = "/workspace/flowStatistics/data"; 159 | String outputDir = "/workspace/flowStatistics/output"; 160 | Path inPath = new Path(hdfs+dataDir); 161 | Path outPath = new Path(hdfs+outputDir); 162 | FileInputFormat.addInputPath(job, inPath); 163 | FileOutputFormat.setOutputPath(job, outPath); 164 | FileSystem fs = FileSystem.get(conf); 165 | if(fs.exists(outPath)) { 166 | fs.delete(outPath, true); 167 | } 168 | 169 | // 运行作业 170 | System.out.println("Job: " + jobName + " is running..."); 171 | if(job.waitForCompletion(true)) { 172 | System.out.println("success!"); 173 | System.exit(0); 174 | } else { 175 | System.out.println("failed!"); 176 | System.exit(1); 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/GroupMax.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import javax.print.attribute.standard.JobName; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.DoubleWritable; 13 | import org.apache.hadoop.io.NullWritable; 14 | import org.apache.hadoop.io.RawComparator; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.WritableComparable; 17 | import org.apache.hadoop.io.WritableComparator; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.Partitioner; 21 | import 
org.apache.hadoop.mapreduce.Reducer; 22 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 24 | 25 | import mapreduceProgram.FlowSort.MySortKey; 26 | import mapreduceProgram.FlowSort.SortMapper; 27 | import mapreduceProgram.FlowSort.SortReducer; 28 | import sun.tools.tree.SuperExpression; 29 | 30 | public class GroupMax { 31 | 32 | public static class Pair implements WritableComparable { 33 | private String order_id; 34 | private DoubleWritable amount; 35 | 36 | public Pair() { 37 | // TODO Auto-generated constructor stub 38 | } 39 | 40 | public Pair(String id, DoubleWritable amount) { 41 | this.order_id = id; 42 | this.amount = amount; 43 | } 44 | 45 | public String getOrder_id() { 46 | return order_id; 47 | } 48 | 49 | public void setOrder_id(String order_id) { 50 | this.order_id = order_id; 51 | } 52 | 53 | public DoubleWritable getAmount() { 54 | return amount; 55 | } 56 | 57 | public void setAmount(DoubleWritable amount) { 58 | this.amount = amount; 59 | } 60 | 61 | @Override 62 | public void write(DataOutput out) throws IOException { 63 | // TODO Auto-generated method stub 64 | out.writeUTF(order_id); 65 | out.writeDouble(amount.get()); 66 | } 67 | 68 | @Override 69 | public void readFields(DataInput in) throws IOException { 70 | // TODO Auto-generated method stub 71 | order_id = in.readUTF(); 72 | amount = new DoubleWritable(in.readDouble()); 73 | } 74 | 75 | @Override 76 | public int compareTo(Pair o) { 77 | if (order_id.equals(o.order_id)) {// 同一order_id,按照amount降序排序 78 | return o.amount.compareTo(amount); 79 | } else { 80 | return order_id.compareTo(o.order_id); 81 | } 82 | } 83 | 84 | } 85 | 86 | public static class MyMapper extends Mapper { 87 | Pair pair = new Pair(); 88 | 89 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 90 | String[] strs = value.toString().split(" "); 91 | pair.setOrder_id(strs[0]); 92 | pair.setAmount(new DoubleWritable(Double.parseDouble(strs[2]))); 93 | context.write(pair, NullWritable.get()); 94 | System.out.println(pair.getOrder_id()+","+pair.getAmount()); 95 | } 96 | } 97 | 98 | public static class MyReducer extends Reducer { 99 | public void reduce(Pair key, Iterable values, Context context) 100 | throws IOException, InterruptedException { 101 | context.write(new Text(key.getOrder_id()), key.getAmount()); 102 | System.out.println(key.order_id+": "+key.amount.get()); 103 | // 下面这个可以看下分组结果 104 | // for (NullWritable value : values) { 105 | // context.write(new Text(key.getOrder_id()), key.getAmount()); 106 | // System.out.println(key.order_id+": "+key.amount.get()); 107 | // } 108 | } 109 | } 110 | // 是分组不是分区,分组是组内定义一些规则由reduce去处理,分区是由多个Reduce处理,写到不同文件中 111 | // 自定义分组类 112 | public static class GroupComparator extends WritableComparator { 113 | public GroupComparator() { 114 | // TODO Auto-generated constructor stub 115 | super(Pair.class, true); 116 | } 117 | // Mapper端会对Pair排序,之后分组的规则是对Pair中的order_id比较 118 | @Override 119 | public int compare(WritableComparable a, WritableComparable b) { 120 | // TODO Auto-generated method stub 121 | Pair oa = (Pair) a; 122 | Pair ob = (Pair) b; 123 | return oa.getOrder_id().compareTo(ob.getOrder_id()); 124 | } 125 | 126 | } 127 | 128 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 129 | // 设置HDFS配置信息 130 | String namenode_ip = "192.168.17.10"; 131 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 132 | Configuration conf 
= new Configuration(); 133 | conf.set("fs.defaultFS", hdfs); 134 | conf.set("mapreduce.app-submission.cross-platform", "true"); 135 | 136 | // Set job configuration 137 | String JobName = "GroupMax"; 138 | Job job = Job.getInstance(conf, JobName); 139 | job.setJarByClass(GroupMax.class); 140 | job.setJar("export\\GroupMax.jar"); 141 | // Mapper 142 | job.setMapperClass(MyMapper.class); 143 | job.setMapOutputKeyClass(Pair.class); 144 | job.setMapOutputValueClass(NullWritable.class); 145 | // Reducer 146 | job.setReducerClass(MyReducer.class); 147 | job.setOutputKeyClass(Text.class); 148 | job.setOutputValueClass(DoubleWritable.class); 149 | // GroupComparator is the custom grouping comparator 150 | job.setGroupingComparatorClass(GroupComparator.class); 151 | // Job input and output paths 152 | String dataDir = "/workspace/data/orderDetail.txt"; // experiment data 153 | String outputDir = "/workspace/groupMax/output"; // experiment output directory 154 | Path inPath = new Path(hdfs + dataDir); 155 | Path outPath = new Path(hdfs + outputDir); 156 | FileInputFormat.addInputPath(job, inPath); 157 | FileOutputFormat.setOutputPath(job, outPath); 158 | FileSystem fs = FileSystem.get(conf); 159 | if (fs.exists(outPath)) { 160 | fs.delete(outPath, true); 161 | } 162 | // Run the job 163 | System.out.println("Job: "+JobName+" is running..."); 164 | if (job.waitForCompletion(true)) { 165 | System.out.println("success!"); 166 | System.exit(0); 167 | } else { 168 | System.out.println("failed!"); 169 | System.exit(1); 170 | } 171 | } 172 | 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MergeJob.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.BytesWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | 15 | 16 | 17 | public class MergeJob { 18 | 19 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 20 | // 1. Set HDFS configuration 21 | String namenode_ip = "192.168.17.10"; 22 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 23 | Configuration conf = new Configuration(); 24 | conf.set("fs.defaultFS", hdfs); 25 | conf.set("mapreduce.app-submission.cross-platform", "true"); 26 | 27 | // 2. Set MapReduce job configuration 28 | String jobName = "MergeMultipleFiles"; // job name 29 | Job job = Job.getInstance(conf, jobName); 30 | job.setJarByClass(MergeJob.class); // specify the job class used at runtime 31 | job.setJar("export\\MergeMultipleFiles.jar"); // specify the local jar 32 | job.setMapOutputKeyClass(Text.class); // Mapper output key type 33 | job.setMapOutputValueClass(BytesWritable.class); // Mapper output value type 34 | job.setMapperClass(MergeMapper.class); 35 | // input data format 36 | job.setInputFormatClass(MyInputFormat.class); 37 | // write the merged output as a sequence file 38 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 39 | 40 | // Set job input and output paths 41 | String inputDir = "/workspace/mergeFiles/data"; 42 | String outputDir = "/workspace/mergeFiles/output"; // output directory 43 | Path outPath = new Path(hdfs + outputDir); 44 | Path inputPath = new Path(hdfs+inputDir); 45 |
FileInputFormat.setInputPaths(job, inputPath); 46 | FileOutputFormat.setOutputPath(job, outPath); 47 | FileSystem fs = FileSystem.get(conf); 48 | if (fs.exists(outPath)) { 49 | fs.delete(outPath, true); 50 | } 51 | 52 | // 运行作业 53 | System.out.println("Job: " + jobName + " is running..."); 54 | if (job.waitForCompletion(true)) { 55 | System.out.println("success!"); 56 | System.exit(0); 57 | } else { 58 | System.out.println("failed!"); 59 | System.exit(1); 60 | } 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MergeMapper.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.BytesWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.InputSplit; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 12 | 13 | public class MergeMapper extends Mapper{ 14 | private Text fileNameKey; 15 | 16 | @Override 17 | protected void map(NullWritable key, BytesWritable value, 18 | Mapper.Context context) 19 | throws IOException, InterruptedException { 20 | // TODO Auto-generated method stub 21 | context.write(fileNameKey, value); 22 | } 23 | 24 | @Override 25 | protected void setup(Mapper.Context context) 26 | throws IOException, InterruptedException { 27 | InputSplit split = context.getInputSplit(); 28 | Path path = ((FileSplit)split).getPath();//??? 29 | fileNameKey = new Text(path.toString()); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MyInputFormat.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.BytesWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.mapreduce.InputSplit; 9 | import org.apache.hadoop.mapreduce.JobContext; 10 | import org.apache.hadoop.mapreduce.RecordReader; 11 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | 14 | 15 | public class MyInputFormat extends FileInputFormat{ 16 | 17 | 18 | @Override 19 | protected boolean isSplitable(JobContext context, Path filename) { 20 | // TODO 因为是合并小文件,设置文件不可分割,k-v的v就是文件对象 21 | return false; 22 | } 23 | 24 | @Override 25 | public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) 26 | throws IOException, InterruptedException { 27 | // TODO Auto-generated method stub 28 | MyRecordReader myRecordReader = new MyRecordReader(); 29 | myRecordReader.initialize(split, context); 30 | return myRecordReader; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MyRecordReader.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FSDataInputStream; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.BytesWritable; 10 | import 
org.apache.hadoop.io.IOUtils; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.mapred.FileSplit; 13 | import org.apache.hadoop.mapreduce.InputSplit; 14 | import org.apache.hadoop.mapreduce.RecordReader; 15 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 16 | 17 | public class MyRecordReader extends RecordReader{ 18 | private FileSplit fileSplit; 19 | private Configuration conf ; 20 | private BytesWritable value = new BytesWritable(); 21 | private boolean processed =false; 22 | @Override 23 | public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { 24 | fileSplit = (FileSplit)split; 25 | conf = context.getConfiguration(); 26 | } 27 | @Override 28 | public boolean nextKeyValue() throws IOException, InterruptedException { 29 | if (!processed) { 30 | byte[] contents = new byte[(int) fileSplit.getLength()];// 获取分片长度字节数组 31 | Path file = fileSplit.getPath();// 获取切片所在位置 32 | FileSystem fSystem = file.getFileSystem(conf); 33 | FSDataInputStream in = null; 34 | try { 35 | in = fSystem.open(file);// 打开文件 36 | IOUtils.readFully(in, contents, 0, contents.length);// 读取整个文件字节数据,写入contents 37 | value.set(contents,0,contents.length);// 将整个文件数据赋值给value 38 | } finally { 39 | IOUtils.closeStream(in); 40 | } 41 | processed = true; 42 | return true; 43 | } 44 | return false; 45 | } 46 | @Override 47 | public NullWritable getCurrentKey() throws IOException, InterruptedException { 48 | // 获取当前key,因为合并文件,我们应该将文件对象付给value,key赋空即可 49 | return NullWritable.get(); 50 | } 51 | @Override 52 | public BytesWritable getCurrentValue() throws IOException, InterruptedException { 53 | return value;// value是整个文件对象的字节数据 54 | } 55 | @Override 56 | public float getProgress() throws IOException, InterruptedException { 57 | // TODO Auto-generated method stub 58 | return processed ? 
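// NOTE (MyRecordReader): together with MyInputFormat.isSplitable() returning false, this reader treats each
// small file as exactly one record: nextKeyValue() loads the whole file into a byte[] sized by the split
// length and hands it to the mapper as a single BytesWritable, and the processed flag makes the second call
// return false. The whole file therefore has to fit in mapper memory, which is acceptable for the
// small-file-merge scenario this class is written for.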
1.0f:0.0f; 59 | } 60 | @Override 61 | public void close() throws IOException { 62 | // TODO Auto-generated method stub 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/DecomposeFriendsMapper.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import org.apache.hadoop.mapreduce.Mapper; 4 | 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Text; 8 | 9 | public class DecomposeFriendsMapper extends Mapper { 10 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 11 | String strs = value.toString(); 12 | Text uString = new Text(strs.substring(0, 1)); 13 | String[] friends = strs.substring(2).split(","); 14 | 15 | //A:B,C,D,F,E,O 16 | for (int i = 0; i < friends.length; i++) { 17 | // 以,形式输出 18 | context.write(new Text(friends[i]),uString); 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/DecomposeFriendsReducer.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import org.apache.hadoop.mapreduce.Reducer; 4 | 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Text; 8 | 9 | public class DecomposeFriendsReducer extends Reducer{ 10 | 11 | @Override 12 | protected void reduce(Text key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | String friendList = ""; 15 | for (Text value : values) { 16 | friendList += value.toString()+","; 17 | } 18 | // 输出个人所有好友,A I,K,C,B,G,F,H,O,D 19 | context.write(key, new Text(friendList.substring(0,friendList.length()-1))); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/JobControlRun.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob; 12 | import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class JobControlRun { 16 | 17 | public static void main(String[] args) throws IOException { 18 | String namenode_ip = "192.168.17.10"; 19 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 20 | Configuration conf = new Configuration(); 21 | conf.set("fs.defaultFS", hdfs); 22 | conf.set("mapreduce.app-submission.cross-platform", "true"); 23 | 24 | Job job1 = Job.getInstance(conf,"Decompose"); 25 | job1.setJarByClass(JobControlRun.class); 26 | job1.setJar("export\\mutualFriend.jar"); 27 | job1.setMapperClass(DecomposeFriendsMapper.class); 28 | job1.setReducerClass(DecomposeFriendsReducer.class); 29 | job1.setOutputKeyClass(Text.class); 30 | job1.setOutputValueClass(Text.class); 31 | 32 | Path input = new Path(hdfs+"/workspace/mutualFriends/data"); 33 | Path output1 = new Path(hdfs+"/workspace/mutualFriends/output_Dec"); 34 | FileInputFormat.addInputPath(job1, input); 35 | FileOutputFormat.setOutputPath(job1, output1); 36 | FileSystem fs = 
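// NOTE (JobControlRun): the exists/delete checks in this driver run once, before either job is submitted,
// which is why the debug print below appears only a single time even though two jobs run.
// The monitoring loop further down polls jobControl.allFinished() in a tight while(true) without sleeping,
// which spins a CPU core; a gentler variant (editor's sketch, InterruptedException must be handled):
// while (!jobControl.allFinished()) {
//     Thread.sleep(500);               // poll twice a second instead of busy-waiting
// }
// System.out.println(jobControl.getSuccessfulJobList());
// jobControl.stop();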
FileSystem.get(conf); 37 | if (fs.exists(output1)) { 38 | fs.delete(output1, true); 39 | System.out.println("我被删了");// 打印可见只被删了一次,有点怪 40 | } 41 | // ControlledJob作业控制容器 42 | ControlledJob ctrJob1=new ControlledJob(conf); 43 | ctrJob1.setJob(job1);// job1加入控制容器 44 | 45 | Job job2 = Job.getInstance(conf, "Merge"); 46 | job2.setJarByClass(JobControlRun.class); 47 | job2.setJar("export\\mutualFriend.jar"); 48 | job2.setMapperClass(MergeFriendsMapper.class); 49 | job2.setReducerClass(MergeFriendsReducer.class); 50 | job2.setOutputKeyClass(Text.class); 51 | job2.setOutputValueClass(Text.class); 52 | 53 | Path input2 = new Path(hdfs+"/workspace/mutualFriends/output_Dec"); 54 | Path output2 = new Path(hdfs+"/workspace/mutualFriends/output_Meg"); 55 | FileInputFormat.addInputPath(job2, input2); 56 | FileOutputFormat.setOutputPath(job2, output2); 57 | if (fs.exists(output2)) { 58 | fs.delete(output2, true); 59 | } 60 | ControlledJob ctrJob2 = new ControlledJob(conf); 61 | ctrJob2.setJob(job2);// job2加入作业控制容器 62 | 63 | // 添加作业依赖,表明job2依赖job1执行 64 | ctrJob2.addDependingJob(ctrJob1); 65 | 66 | // 定义作业主控制容器,监控、调度job1,job2 67 | JobControl jobControl=new JobControl("JobControl"); 68 | jobControl.addJob(ctrJob1); 69 | jobControl.addJob(ctrJob2); 70 | // 启动作业线程 71 | Thread T=new Thread(jobControl); 72 | T.start(); 73 | while(true){ 74 | if(jobControl.allFinished()){// 等待作业全部结束 75 | System.out.println(jobControl.getSuccessfulJobList());// 打印成功job信息 76 | jobControl.stop(); 77 | break; 78 | } 79 | } 80 | /** 81 | * 打印控制信息如下 82 | * [job name: Decompose 83 | job id: JobControl0 84 | job state: SUCCESS 85 | job mapred id: job_local445604445_0001 86 | job message: just initialized 87 | job has no depending job: 88 | , job name: Merge 89 | job id: JobControl1 90 | job state: SUCCESS 91 | job mapred id: job_local1897659504_0002 92 | job message: just initialized 93 | job has 1 dependeng jobs: 94 | depending job 0: Decompose 95 | ] 96 | */ 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/JobRun.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class JobRun { 14 | 15 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 16 | String namenode_ip = "192.168.17.10"; 17 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 18 | Configuration conf = new Configuration(); 19 | conf.set("fs.defaultFS", hdfs); 20 | conf.set("mapreduce.app-submission.cross-platform", "true"); 21 | 22 | // job1配置信息 23 | Job job1 = Job.getInstance(conf,"Decompose"); 24 | job1.setJarByClass(JobRun.class); 25 | job1.setJar("export\\mutualFriend.jar"); 26 | job1.setMapperClass(DecomposeFriendsMapper.class); 27 | job1.setReducerClass(DecomposeFriendsReducer.class); 28 | job1.setOutputKeyClass(Text.class); 29 | job1.setOutputValueClass(Text.class); 30 | 31 | Path input = new Path(hdfs+"/workspace/mutualFriends/data"); 32 | Path output1 = new Path(hdfs+"/workspace/mutualFriends/output_Dec"); 33 | FileInputFormat.addInputPath(job1, input); 34 | 
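// NOTE: both drivers in this package run the same two-pass mutual-friends algorithm.
// Pass 1 (Decompose): a line like "A:B,C" emits (B,A) and (C,A); the reducer then lists, for every person,
// all users who have that person as a friend, e.g. "B  A,E,F,J".
// Pass 2 (Merge): for such a line the mapper sorts the owner list and emits every pair with the shared friend
// as value, e.g. (A-E, B), (A-F, B), (E-F, B); sorting is what makes A-B and B-A land on the same key.
// The final reducer concatenates the values, giving the common friends of each pair.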
FileOutputFormat.setOutputPath(job1, output1); 35 | FileSystem fs = FileSystem.get(conf); 36 | if (fs.exists(output1)) { 37 | fs.delete(output1, true); 38 | } 39 | 40 | // job1如果运行成功则进入job2 41 | if(job1.waitForCompletion(true)) {//job2完全依赖job1的结果,所以job1成功执行就开启job2 42 | // job2配置信息 43 | Job job2 = Job.getInstance(conf, "Merge"); 44 | job2.setJarByClass(JobRun.class); 45 | job2.setJar("export\\mutualFriend.jar"); 46 | job2.setMapperClass(MergeFriendsMapper.class); 47 | job2.setReducerClass(MergeFriendsReducer.class); 48 | job2.setOutputKeyClass(Text.class); 49 | job2.setOutputValueClass(Text.class); 50 | 51 | Path output2 = new Path(hdfs+"/workspace/mutualFriends/output_Meg"); 52 | FileInputFormat.addInputPath(job2, output1);// 输入是job1的输出 53 | FileOutputFormat.setOutputPath(job2, output2); 54 | if (fs.exists(output2)) { 55 | fs.delete(output2, true); 56 | } 57 | if(job2.waitForCompletion(true)) { 58 | System.out.println("sucessed"); 59 | }else { 60 | System.out.println("failed"); 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/MergeFriendsMapper.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class MergeFriendsMapper extends Mapper{// 别写成输入key也是Text类型,这里输入的是偏移量 10 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 11 | Text uText = new Text(value.toString().substring(0, 1)); 12 | String[] lists = value.toString().substring(2).split(","); 13 | Arrays.sort(lists);// 要排好序,不然如A-B,B-A不能归并到一起 14 | //对如A B,C,E遍历输出如 15 | for (int i = 0; i < lists.length; i++) { 16 | for(int j=i+1;j{ 9 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 10 | String friends = ""; 11 | for (Text value : values) { 12 | friends += value.toString()+","; 13 | } 14 | System.out.println(key.toString()+" "+friends); 15 | context.write(key, new Text(friends)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/shuffleTest/MonthAscTempDescSort.java: -------------------------------------------------------------------------------- 1 | package shuffleTest; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Partitioner; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Mapper.Context; 15 | 16 | public class MonthAscTempDescSort { 17 | // 按年分区,每个文件中按月升序,按温度降序 18 | public static class MonthTempMapper extends Mapper { 19 | IntWritable temp = new IntWritable(); 20 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 21 | String[] strings =value.toString().split(" "); 22 | String date = strings[0].substring(0, 7); 23 | temp.set(Integer.parseInt(strings[2].substring(0, strings[2].length()-1))); 24 | context.write(new Text(date), temp); 25 | } 26 | } 27 | 28 | 29 | public static void main(String[] args) { 30 | // TODO Auto-generated method 
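// Editor's sketch (hypothetical, this driver was left as a TODO): to get "month ascending, temperature
// descending" within year-partitioned files, the usual recipe in this repo would be a composite key in the
// style of DateSort3.MyKey (compareTo: month ascending, then temperature descending), a Partitioner keyed on
// the year as in TempSort.YearPartitioner, and a job wired like the other drivers here (mapper emitting the
// composite key, setPartitionerClass, setNumReduceTasks matching the number of years).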
stub 31 | 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/shuffleTest/TempSort.java: -------------------------------------------------------------------------------- 1 | package shuffleTest; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.WritableComparable; 9 | import org.apache.hadoop.io.WritableComparator; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Partitioner; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | 19 | public class TempSort { 20 | /* 21 | * 按年输出(分区),每个文件包含每月的最高温度 22 | */ 23 | public static class TempSortMapper extends Mapper { 24 | IntWritable temp = new IntWritable(); 25 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 26 | String[] strings =value.toString().split(" "); 27 | String date = strings[0].substring(0, 7); 28 | temp.set(Integer.parseInt(strings[2].substring(0, strings[2].length()-1))); 29 | context.write(new Text(date), temp); 30 | } 31 | } 32 | 33 | public static class TempSortReducer extends Reducer{ 34 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 35 | // 气温降序排序,区第一个 36 | // IntWritable temp = values.iterator().next(); 37 | // System.out.println("气温:"+temp); 38 | // context.write(key, temp); 39 | 40 | int maxTemp = Integer.MIN_VALUE; 41 | for(IntWritable value:values) { 42 | System.out.println("年:"+key+", 气温:"+value); 43 | if (value.get()>maxTemp) { 44 | maxTemp = value.get(); 45 | } 46 | } 47 | System.out.println("Date:"+key+", MaxTemp:"+maxTemp); 48 | context.write(key, new IntWritable(maxTemp)); 49 | } 50 | } 51 | 52 | public static class YearPartitioner extends Partitioner { 53 | @Override 54 | public int getPartition(Text key, IntWritable value, int numPartitions) { 55 | //根据年份对数据进行分区,返回不同分区号 56 | if (key.toString().startsWith("1949")) 57 | return 0 % numPartitions; 58 | else if (key.toString().startsWith("1950")) 59 | return 1 % numPartitions; 60 | else 61 | return 2 % numPartitions; 62 | } 63 | } 64 | 65 | // public static class MySort extends WritableComparator { 66 | // public MySort() { 67 | // super(IntWritable.class,true); 68 | // } 69 | // 70 | // @SuppressWarnings({"rawtypes","unchecked"}) 71 | // public int compare(WritableComparable a,WritableComparable b) { 72 | // return b.compareTo(a); 73 | // } 74 | // } 75 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 76 | // TODO Auto-generated method stub 77 | String hdfs = "hdfs://192.168.17.10:9000"; 78 | Configuration conf = new Configuration(); 79 | conf.set("fs.defaultFS", hdfs); 80 | conf.set("mapreduce.app-submission.cross-platform", "true"); 81 | // 设置作业配置信息 82 | String jobName = "TempSort"; 83 | Job job = Job.getInstance(conf, jobName); 84 | job.setJarByClass(TempSort.class); 85 | job.setJar("export\\TempSort.jar"); 86 | // Map 87 | job.setMapperClass(TempSortMapper.class); 88 | job.setMapOutputKeyClass(Text.class); 89 | job.setMapOutputValueClass(IntWritable.class); 90 | // Reduce 91 | 
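// NOTE (TempSort): the three partition numbers returned by YearPartitioner (0, 1, 2) line up with the
// setNumReduceTasks(3) call below, so each year lands in its own part file. Because "max" is associative,
// the same reducer could optionally be reused map-side to shrink the shuffle (editor's sketch):
// job.setCombinerClass(TempSortReducer.class);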
job.setReducerClass(TempSortReducer.class); 92 | // 全局 93 | job.setOutputKeyClass(Text.class); 94 | job.setOutputValueClass(IntWritable.class); 95 | // Sort 96 | // job.setSortComparatorClass(MySort.class); 97 | // Partition 98 | job.setPartitionerClass(YearPartitioner.class); 99 | job.setNumReduceTasks(3); 100 | //3.设置作业输入和输出路径 101 | String dataDir = "/expr/test/data"; //实验数据目录 102 | String outputDir = "/expr/test/output"; //实验输出目录 103 | Path inPath = new Path(hdfs + dataDir); 104 | Path outPath = new Path(hdfs + outputDir); 105 | FileInputFormat.addInputPath(job, inPath); 106 | FileOutputFormat.setOutputPath(job, outPath); 107 | FileSystem fs = FileSystem.get(conf); 108 | if(fs.exists(outPath)) { 109 | fs.delete(outPath, true); 110 | } 111 | 112 | //4.运行作业 113 | System.out.println("Job: " + jobName + " is running..."); 114 | if(job.waitForCompletion(true)) { 115 | System.out.println("success!"); 116 | System.exit(0); 117 | } else { 118 | System.out.println("failed!"); 119 | System.exit(1); 120 | } 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/counter/YearCounter.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.counter; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class YearCounter { 16 | 17 | //自定义年份计数器 18 | private enum YCounter { 19 | Y2015, Y2016, Y2017 20 | } 21 | 22 | public static class YearCounterMapper extends Mapper { 23 | private final static IntWritable one = new IntWritable(1); 24 | 25 | public void map(Object key, Text value, Context context ) 26 | throws IOException, InterruptedException { 27 | String[] strs = value.toString().split(" "); //按空格分割输入 28 | Text date = new Text(strs[0]); //获取日期 29 | context.write(date, one); //将日期和常数1作为Map输出 30 | 31 | //根据KEY值不同,增加对应计数器的值 32 | if(strs[0].startsWith("2015")) { 33 | context.getCounter(YCounter.Y2015).increment(1); 34 | } else if(strs[0].startsWith("2016")) { 35 | context.getCounter(YCounter.Y2016).increment(1); 36 | } else 37 | context.getCounter(YCounter.Y2017).increment(1); 38 | } 39 | } 40 | 41 | public static class YearCounterReducer extends Reducer { 42 | public void reduce(Text key, Iterable values, Context context) 43 | throws IOException, InterruptedException { 44 | int sum = 0; 45 | for (IntWritable val : values) { 46 | sum += val.get(); 47 | } 48 | context.write(key, new IntWritable(sum)); 49 | } 50 | } 51 | 52 | public static void main(String[] args) throws Exception { 53 | //1.设置HDFS配置信息 54 | String namenode_ip = "192.168.17.10"; 55 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 56 | Configuration conf = new Configuration(); 57 | conf.set("fs.defaultFS", hdfs); 58 | conf.set("mapreduce.app-submission.cross-platform", "true"); 59 | 60 | //2.设置MapReduce作业配置信息 61 | String jobName = "YearCounter"; //作业名称 62 | Job job = Job.getInstance(conf, jobName); 63 | job.setJarByClass(YearCounter.class); //指定运行时作业类 64 | job.setJar("export\\YearCounter.jar"); 
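// NOTE (YearCounter): the custom YCounter values are aggregated by the framework and can be read back in
// this driver after waitForCompletion(true) returns, e.g. (editor's sketch):
// long y2015 = job.getCounters().findCounter(YCounter.Y2015).getValue();
// System.out.println("records from 2015: " + y2015);
// Also note the mapper's final else branch counts every record that is neither 2015 nor 2016 as Y2017.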
//指定本地jar包 65 | job.setMapperClass(YearCounterMapper.class); //指定Mapper类 66 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 67 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 68 | job.setReducerClass(YearCounterReducer.class); //指定Reducer类 69 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 70 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 71 | 72 | //3.设置作业输入和输出路径 73 | String dataDir = "/expr/datecount/data"; //实验数据目录 74 | String outputDir = "/expr/datecount/output"; //实验输出目录 75 | Path inPath = new Path(hdfs + dataDir); 76 | Path outPath = new Path(hdfs + outputDir); 77 | FileInputFormat.addInputPath(job, inPath); 78 | FileOutputFormat.setOutputPath(job, outPath); 79 | FileSystem fs = FileSystem.get(conf); 80 | if(fs.exists(outPath)) { 81 | fs.delete(outPath, true); 82 | } 83 | 84 | //4.运行作业 85 | System.out.println("Job: " + jobName + " is running..."); 86 | if(job.waitForCompletion(true)) { 87 | System.out.println("success!"); 88 | System.exit(0); 89 | } else { 90 | System.out.println("failed!"); 91 | System.exit(1); 92 | } 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateCount.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateCount { 16 | 17 | public static class DateCountMapper extends Mapper { 18 | private final static IntWritable one = new IntWritable(1); 19 | 20 | public void map(Object key, Text value, Context context ) 21 | throws IOException, InterruptedException { 22 | String[] strs = value.toString().split(" "); //按空格分割输入 23 | Text date = new Text(strs[0]); //获取日期 24 | context.write(date, one); //将日期和常数1作为Map输出 25 | } 26 | } 27 | 28 | public static class DateCountReducer extends Reducer { 29 | public void reduce(Text key, Iterable values, Context context) 30 | throws IOException, InterruptedException { 31 | int sum = 0; 32 | for (IntWritable val : values) { 33 | sum += val.get(); 34 | } 35 | context.write(key, new IntWritable(sum)); 36 | } 37 | } 38 | 39 | public static void main(String[] args) throws Exception { 40 | //1.设置HDFS配置信息 41 | String namenode_ip = "192.168.17.10"; 42 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 43 | Configuration conf = new Configuration(); 44 | conf.set("fs.defaultFS", hdfs); 45 | conf.set("mapreduce.app-submission.cross-platform", "true"); 46 | 47 | //2.设置MapReduce作业配置信息 48 | String jobName = "DateCount"; //作业名称 49 | Job job = Job.getInstance(conf, jobName); 50 | job.setJarByClass(DateCount.class); //指定运行时作业类 51 | job.setJar("export\\DateCount.jar"); //指定本地jar包 52 | job.setMapperClass(DateCountMapper.class); //指定Mapper类 53 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 54 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 55 | job.setReducerClass(DateCountReducer.class); //指定Reducer类 56 | job.setOutputKeyClass(Text.class); 
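// NOTE (DateCount): since the per-date sum is associative, the reducer can optionally double as a combiner
// to pre-aggregate counts on the map side (editor's sketch):
// job.setCombinerClass(DateCountReducer.class);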
//设置Reduce输出Key类型 57 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 58 | 59 | //3.设置作业输入和输出路径 60 | String dataDir = "/expr/datecount/data"; //实验数据目录 61 | String outputDir = "/expr/datecount/output"; //实验输出目录 62 | Path inPath = new Path(hdfs + dataDir); 63 | Path outPath = new Path(hdfs + outputDir); 64 | FileInputFormat.addInputPath(job, inPath); 65 | FileOutputFormat.setOutputPath(job, outPath); 66 | FileSystem fs = FileSystem.get(conf); 67 | if(fs.exists(outPath)) { 68 | fs.delete(outPath, true); 69 | } 70 | 71 | //4.运行作业 72 | System.out.println("Job: " + jobName + " is running..."); 73 | if(job.waitForCompletion(true)) { 74 | System.out.println("success!"); 75 | System.exit(0); 76 | } else { 77 | System.out.println("failed!"); 78 | System.exit(1); 79 | } 80 | } 81 | 82 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateDistinct.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateDistinct { 16 | 17 | public static class DateDistinctMapper extends Mapper { 18 | public void map(Object key, Text value, Context context ) 19 | throws IOException, InterruptedException { 20 | String[] strs = value.toString().split(" "); 21 | Text date = new Text(strs[0]); 22 | context.write(date, NullWritable.get()); 23 | } 24 | } 25 | 26 | public static class DateDistinctReducer extends Reducer { 27 | public void reduce(Text key, Iterable values, Context context) 28 | throws IOException, InterruptedException { 29 | context.write(key, NullWritable.get()); 30 | } 31 | } 32 | 33 | public static void main(String[] args) throws Exception { 34 | //1.设置HDFS配置信息 35 | String namenode_ip = "192.168.17.10"; 36 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 37 | Configuration conf = new Configuration(); 38 | conf.set("fs.defaultFS", hdfs); 39 | conf.set("mapreduce.app-submission.cross-platform", "true"); 40 | 41 | //2.设置MapReduce作业配置信息 42 | String jobName = "DateDistinct"; //定义作业名称 43 | Job job = Job.getInstance(conf, jobName); 44 | job.setJarByClass(DateDistinct.class); //指定运行时作业类 45 | job.setJar("export\\DateDistinct.jar"); //指定本地jar包 46 | job.setMapperClass(DateDistinctMapper.class); //指定Mapper类 47 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 48 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 49 | job.setReducerClass(DateDistinctReducer.class); //指定Reducer类 50 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 51 | job.setOutputValueClass(NullWritable.class); //设置Reduce输出Value类型 52 | 53 | //3.设置作业输入和输出路径 54 | String dataDir = "/expr/datecount/data"; //实验数据目录 55 | String outputDir = "/expr/datecount/output_distinct"; //实验输出目录 56 | Path inPath = new Path(hdfs + dataDir); 57 | Path outPath = new Path(hdfs + outputDir); 58 | FileInputFormat.addInputPath(job, inPath); 59 | FileOutputFormat.setOutputPath(job, outPath); 60 | FileSystem fs = 
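// NOTE (DateDistinct): emitting (date, NullWritable) and writing each reduce key exactly once is what gives
// the DISTINCT semantics; the shuffle brings all duplicates of a date into one reduce call. The reducer is
// also safe to reuse as a combiner to drop duplicates early (editor's sketch):
// job.setCombinerClass(DateDistinctReducer.class);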
FileSystem.get(conf); 61 | if(fs.exists(outPath)) { 62 | fs.delete(outPath, true); 63 | } 64 | 65 | //4.运行作业 66 | System.out.println("Job: " + jobName + " is running..."); 67 | if(job.waitForCompletion(true)) { 68 | System.out.println("success!"); 69 | System.exit(0); 70 | } else { 71 | System.out.println("failed!"); 72 | System.exit(1); 73 | } 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateFilter.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | 14 | public class DateFilter { 15 | 16 | public static class DateFilterMapper extends Mapper { 17 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 18 | String[] strs = value.toString().split(" "); 19 | Text date = new Text(strs[0]); 20 | context.write(date, NullWritable.get()); 21 | } 22 | } 23 | /* 24 | public static class DateFilterReducer extends Reducer { 25 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 26 | int sum = 0; 27 | for (IntWritable val : values) { 28 | sum += val.get(); 29 | } 30 | context.write(key, new IntWritable(sum)); 31 | } 32 | } 33 | */ 34 | public static void main(String[] args) throws Exception { 35 | //1.设置HDFS配置信息 36 | String namenode_ip = "192.168.17.10"; 37 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 38 | Configuration conf = new Configuration(); 39 | conf.set("fs.defaultFS", hdfs); 40 | conf.set("mapreduce.app-submission.cross-platform", "true"); 41 | 42 | //2.设置MapReduce作业配置信息 43 | String jobName = "DateFilter"; //定义作业名称 44 | Job job = Job.getInstance(conf, jobName); 45 | job.setJarByClass(DateFilter.class); //指定运行时作业类 46 | job.setJar("export\\DateFilter.jar"); //指定本地jar包 47 | job.setMapperClass(DateFilterMapper.class); 48 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 49 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 50 | //job.setReducerClass(DateCountReducer.class); //不需要设置Reducer类 51 | //job.setOutputKeyClass(Text.class); //设置Reduce输出键类型 52 | //job.setOutputValueClass(NullWritable.class); //设置Reduce输出值类型 53 | 54 | //3.设置作业输入和输出路径 55 | String dataDir = "/expr/datecount/data"; //实验数据目录 56 | String outputDir = "/expr/datecount/output_filter"; //实验输出目录 57 | Path inPath = new Path(hdfs + dataDir); 58 | Path outPath = new Path(hdfs + outputDir); 59 | FileInputFormat.addInputPath(job, inPath); 60 | FileOutputFormat.setOutputPath(job, outPath); 61 | FileSystem fs = FileSystem.get(conf); 62 | if(fs.exists(outPath)) { 63 | fs.delete(outPath, true); 64 | } 65 | 66 | //4.运行作业 67 | System.out.println("Job: " + jobName + " is running..."); 68 | if(job.waitForCompletion(true)) { 69 | System.out.println("success!"); 70 | System.exit(0); 71 | } else { 72 | System.out.println("failed!"); 73 | System.exit(1); 74 | } 75 | } 76 | 77 | } -------------------------------------------------------------------------------- 
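// NOTE on DateFilter above: as written the mapper forwards every date, so the job only projects the first
// column; an actual filter would add a condition inside DateFilterMapper.map(), for example keeping 2015
// records only (hypothetical sketch): if (strs[0].startsWith("2015")) context.write(date, NullWritable.get());
// Since no reducer is needed, job.setNumReduceTasks(0) would also make this a pure map-only job and skip the
// shuffle; with the current configuration the default identity reducer still runs and sorts the dates.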
/src/main/java/ssdut/training/mapreduce/datecount/DateGroup.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateGroup { 16 | 17 | public static class DateGroupMapper extends Mapper { 18 | public void map(Object key, Text value, Context context ) 19 | throws IOException, InterruptedException { 20 | String[] strs = value.toString().split(" "); //按空格分割输入 21 | String date = strs[0]; //获取日期 22 | int id = Integer.parseInt(strs[1]); //获取序号 23 | context.write(new Text(date), new IntWritable(id)); 24 | } 25 | } 26 | 27 | public static class DateGroupReducer extends Reducer { 28 | public void reduce(Text key, Iterable values, Context context) 29 | throws IOException, InterruptedException { 30 | StringBuilder sb = new StringBuilder(); 31 | sb.append("[ "); 32 | for (IntWritable val : values) { //将value值串联 33 | sb.append(val.toString()).append(" "); 34 | } 35 | sb.append("]"); 36 | context.write(key, new Text(sb.toString())); 37 | } 38 | } 39 | 40 | public static void main(String[] args) throws Exception { 41 | //1.设置HDFS配置信息 42 | String namenode_ip = "192.168.17.10"; 43 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 44 | Configuration conf = new Configuration(); 45 | conf.set("fs.defaultFS", hdfs); 46 | conf.set("mapreduce.app-submission.cross-platform", "true"); 47 | 48 | //2.设置MapReduce作业配置信息 49 | String jobName = "DateGroup"; //作业名称 50 | Job job = Job.getInstance(conf, jobName); 51 | job.setJarByClass(DateGroup.class); //指定运行时作业类 52 | job.setJar("export\\DateGroup.jar"); //指定本地jar包 53 | job.setMapperClass(DateGroupMapper.class); //指定Mapper类 54 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 55 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 56 | job.setReducerClass(DateGroupReducer.class); //指定Reducer类 57 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 58 | job.setOutputValueClass(Text.class); //设置Reduce输出Value类型 59 | 60 | //3.设置作业输入和输出路径 61 | String dataDir = "/expr/datecount/data"; //实验数据目录 62 | String outputDir = "/expr/datecount/output_group"; //实验输出目录 63 | Path inPath = new Path(hdfs + dataDir); 64 | Path outPath = new Path(hdfs + outputDir); 65 | FileInputFormat.addInputPath(job, inPath); 66 | FileOutputFormat.setOutputPath(job, outPath); 67 | FileSystem fs = FileSystem.get(conf); 68 | if(fs.exists(outPath)) { 69 | fs.delete(outPath, true); 70 | } 71 | 72 | //4.运行作业 73 | System.out.println("Job: " + jobName + " is running..."); 74 | if(job.waitForCompletion(true)) { 75 | System.out.println("success!"); 76 | System.exit(0); 77 | } else { 78 | System.out.println("failed!"); 79 | System.exit(1); 80 | } 81 | } 82 | 83 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateGroup2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 
4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.WritableComparable; 10 | import org.apache.hadoop.io.WritableComparator; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | public class DateGroup2 { 18 | 19 | public static class DateGroup2Mapper extends Mapper { 20 | public void map(Object key, Text value, Context context ) 21 | throws IOException, InterruptedException { 22 | String[] strs = value.toString().split(" "); //按空格分割输入 23 | String date = strs[0]; //获取日期 24 | int id = Integer.parseInt(strs[1]); //获取序号 25 | context.write(new Text(date), new IntWritable(id)); 26 | } 27 | } 28 | 29 | public static class DateGroup2Reducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | StringBuilder sb = new StringBuilder(); 33 | sb.append("[ "); 34 | for (IntWritable val : values) { //将value值串联 35 | sb.append(val.toString()).append(" "); 36 | } 37 | sb.append("]"); 38 | String year = key.toString().substring(0,4); //取年份 39 | context.write(new Text(year), new Text(sb.toString())); 40 | } 41 | } 42 | 43 | public static class MyGroup extends WritableComparator { 44 | public MyGroup() { //注册比较方法 45 | super(Text.class, true); 46 | } 47 | 48 | @SuppressWarnings("rawtypes") 49 | @Override 50 | public int compare(WritableComparable a, WritableComparable b) { 51 | String d1 = a.toString(); 52 | String d2 = b.toString(); 53 | 54 | if (d1.startsWith("2015")) 55 | d1 = "2015"; 56 | else if (d1.startsWith("2016")) 57 | d1 = "2016"; 58 | else 59 | d1 = "2017"; 60 | 61 | if (d2.startsWith("2015")) 62 | d2 = "2015"; 63 | else if (d2.startsWith("2016")) 64 | d2 = "2016"; 65 | else 66 | d2 = "2017"; 67 | 68 | return d1.compareTo(d2); //将原本KEY(年月日)的比较变成年份的比较 69 | } 70 | } 71 | 72 | public static void main(String[] args) throws Exception { 73 | //1.设置HDFS配置信息 74 | String namenode_ip = "192.168.17.10"; 75 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 76 | Configuration conf = new Configuration(); 77 | conf.set("fs.defaultFS", hdfs); 78 | conf.set("mapreduce.app-submission.cross-platform", "true"); 79 | 80 | //2.设置MapReduce作业配置信息 81 | String jobName = "DateGroup2"; //作业名称 82 | Job job = Job.getInstance(conf, jobName); 83 | job.setJarByClass(DateGroup2.class); //指定运行时作业类 84 | job.setJar("export\\DateGroup2.jar"); //指定本地jar包 85 | job.setMapperClass(DateGroup2Mapper.class); //指定Mapper类 86 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 87 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 88 | job.setReducerClass(DateGroup2Reducer.class); //指定Reducer类 89 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 90 | job.setOutputValueClass(Text.class); //设置Reduce输出Value类型 91 | job.setGroupingComparatorClass(MyGroup.class); //设置自定义分组类 92 | //3.设置作业输入和输出路径 93 | String dataDir = "/expr/datecount/data"; //实验数据目录 94 | String outputDir = "/expr/datecount/output_group2"; //实验输出目录 95 | Path inPath = new Path(hdfs + dataDir); 96 | Path outPath = new Path(hdfs + outputDir); 97 | FileInputFormat.addInputPath(job, inPath); 98 | FileOutputFormat.setOutputPath(job, outPath); 99 | FileSystem fs 
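// NOTE (DateGroup2): MyGroup only changes how reduce input is grouped, not how it is sorted, so all dates of
// one year (contiguous after the default key sort) arrive in a single reduce call and their id values are
// concatenated per year. The Text key object is advanced as the values are iterated, but every key in the
// group shares the same four-digit prefix, so key.toString().substring(0,4) stays correct.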
= FileSystem.get(conf); 100 | if(fs.exists(outPath)) { 101 | fs.delete(outPath, true); 102 | } 103 | 104 | //4.运行作业 105 | System.out.println("Job: " + jobName + " is running..."); 106 | if(job.waitForCompletion(true)) { 107 | System.out.println("success!"); 108 | System.exit(0); 109 | } else { 110 | System.out.println("failed!"); 111 | System.exit(1); 112 | } 113 | } 114 | 115 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DatePartition.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Partitioner; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class DatePartition { 17 | 18 | public static class DatePartitionMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); 24 | Text date = new Text(strs[0]); 25 | context.write(date, one); 26 | } 27 | } 28 | 29 | public static class DatePartitionReducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | int sum = 0; 33 | for (IntWritable val : values) { 34 | sum += val.get(); 35 | } 36 | context.write(key, new IntWritable(sum)); 37 | } 38 | } 39 | 40 | public static class YearPartitioner extends Partitioner { 41 | @Override 42 | public int getPartition(Text key, IntWritable value, int numPartitions) { 43 | //根据年份对数据进行分区,返回不同分区号 44 | if (key.toString().startsWith("2015")) 45 | return 0 % numPartitions; 46 | else if (key.toString().startsWith("2016")) 47 | return 1 % numPartitions; 48 | else 49 | return 2 % numPartitions; 50 | } 51 | } 52 | 53 | public static void main(String[] args) throws Exception { 54 | //1.设置HDFS配置信息 55 | String namenode_ip = "192.168.17.10"; 56 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 57 | Configuration conf = new Configuration(); 58 | conf.set("fs.defaultFS", hdfs); 59 | conf.set("mapreduce.app-submission.cross-platform", "true"); 60 | 61 | //2.设置MapReduce作业配置信息 62 | String jobName = "DatePartition"; //定义作业名称 63 | Job job = Job.getInstance(conf, jobName); 64 | job.setJarByClass(DatePartition.class); //指定运行时作业类 65 | job.setJar("export\\DatePartition.jar"); //指定本地jar包 66 | // Map 67 | job.setMapperClass(DatePartitionMapper.class); //指定Mapper类 68 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 69 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 70 | // Reduce 71 | job.setReducerClass(DatePartitionReducer.class); //指定Reducer类 72 | // 全局 73 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 74 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 75 | // Partition 76 | job.setPartitionerClass(YearPartitioner.class); //自定义分区方法 77 | job.setNumReduceTasks(10); 
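// NOTE (DatePartition): YearPartitioner only ever returns 0, 1 or 2, so with 10 reduce tasks seven of the
// part-r-000NN files come out empty; setNumReduceTasks(3) would give exactly one file per year.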
//设置reduce任务的数量,该值传递给Partitioner.getPartition()方法的numPartitions参数 78 | 79 | //3.设置作业输入和输出路径 80 | String dataDir = "/expr/datecount/data"; //实验数据目录 81 | String outputDir = "/expr/datecount/output_partition"; //实验输出目录 82 | Path inPath = new Path(hdfs + dataDir); 83 | Path outPath = new Path(hdfs + outputDir); 84 | FileInputFormat.addInputPath(job, inPath); 85 | FileOutputFormat.setOutputPath(job, outPath); 86 | FileSystem fs = FileSystem.get(conf); 87 | if(fs.exists(outPath)) { 88 | fs.delete(outPath, true); 89 | } 90 | 91 | //4.运行作业 92 | System.out.println("Job: " + jobName + " is running..."); 93 | if(job.waitForCompletion(true)) { 94 | System.out.println("success!"); 95 | System.exit(0); 96 | } else { 97 | System.out.println("failed!"); 98 | System.exit(1); 99 | } 100 | } 101 | 102 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DatePartition2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Partitioner; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class DatePartition2 { 17 | 18 | public static class DatePartition2Mapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); 24 | Text date = new Text(strs[0]); 25 | context.write(date, one); 26 | } 27 | } 28 | 29 | public static class DatePartition2Reducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | int sum = 0; 33 | for (IntWritable val : values) { 34 | sum += val.get(); 35 | } 36 | context.write(key, new IntWritable(sum)); 37 | } 38 | } 39 | 40 | public static class YearPartitioner extends Partitioner { 41 | @Override 42 | public int getPartition(Text key, IntWritable value, int numPartitions) { 43 | //根据月份对数据进行分区,返回不同分区号 44 | String month = key.toString().substring(5,7); //substring取从下标5到下标7前一个字符,即下标5-6的字符 45 | switch (month) { 46 | case "01": return 1; 47 | case "02": return 2; 48 | case "03": return 3; 49 | case "04": return 4; 50 | case "05": return 5; 51 | case "06": return 6; 52 | case "07": return 7; 53 | case "08": return 8; 54 | case "09": return 9; 55 | case "10": return 10; 56 | case "11": return 11; 57 | case "12": return 12; 58 | default : return 0; 59 | } 60 | } 61 | } 62 | 63 | public static void main(String[] args) throws Exception { 64 | //1.设置HDFS配置信息 65 | String namenode_ip = "192.168.17.10"; 66 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 67 | Configuration conf = new Configuration(); 68 | conf.set("fs.defaultFS", hdfs); 69 | conf.set("mapreduce.app-submission.cross-platform", "true"); 70 | 71 | //2.设置MapReduce作业配置信息 72 | String jobName = "DatePartition2"; //定义作业名称 73 | Job job = Job.getInstance(conf, jobName); 74 | 
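// NOTE (DatePartition2): the month partitioner below returns partition numbers 0-12, but the driver sets only
// 3 reduce tasks, so any record whose month maps to partition 3 or higher makes the map task fail with
// "Illegal partition for ...". Either use setNumReduceTasks(13), or clamp the partitioner to the actual task
// count (editor's sketch): return Integer.parseInt(month) % numPartitions;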
job.setJarByClass(DatePartition2.class); //指定运行时作业类 75 | job.setJar("export\\DatePartition2.jar"); //指定本地jar包 76 | job.setMapperClass(DatePartition2Mapper.class); //指定Mapper类 77 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 78 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 79 | job.setReducerClass(DatePartition2Reducer.class); //指定Reducer类 80 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 81 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 82 | job.setPartitionerClass(YearPartitioner.class); //自定义分区方法 83 | job.setNumReduceTasks(3); //设置reduce任务的数量,该值传递给Partitioner.getPartition()方法的numPartitions参数 84 | 85 | //3.设置作业输入和输出路径 86 | String dataDir = "/expr/datecount/data"; //实验数据目录 87 | String outputDir = "/expr/datecount/output_partition2"; //实验输出目录 88 | Path inPath = new Path(hdfs + dataDir); 89 | Path outPath = new Path(hdfs + outputDir); 90 | FileInputFormat.addInputPath(job, inPath); 91 | FileOutputFormat.setOutputPath(job, outPath); 92 | FileSystem fs = FileSystem.get(conf); 93 | if(fs.exists(outPath)) { 94 | fs.delete(outPath, true); 95 | } 96 | 97 | //4.运行作业 98 | System.out.println("Job: " + jobName + " is running..."); 99 | if(job.waitForCompletion(true)) { 100 | System.out.println("success!"); 101 | System.exit(0); 102 | } else { 103 | System.out.println("failed!"); 104 | System.exit(1); 105 | } 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateSort.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateSort { 16 | public static class DateSortMapper extends Mapper { //key-value类型不同于以往的 17 | IntWritable num = new IntWritable(); 18 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 19 | String[] strs = value.toString().split("\t"); //从DateCount运行结果读取数据,默认是用Tab分割输入 20 | String date = strs[0]; //获取日期 21 | num.set(Integer.parseInt(strs[1])); //获取次数 22 | context.write(num, new Text(date)); //以次数作为key,日期作为value输出;利用shuffle自动对key升序排序的特性 23 | } 24 | } 25 | 26 | public static class DateSortReducer extends Reducer { 27 | public void reduce(IntWritable key, Iterable values, Context context) throws IOException, InterruptedException { 28 | for (Text val : values) { 29 | context.write(val, key);//Map阶段将日期和次数反过来以实现排序,Reduce这里再次翻转key-value 30 | } 31 | } 32 | } 33 | 34 | public static void main(String[] args) throws Exception { 35 | //1.设置HDFS配置信息 36 | String namenode_ip = "192.168.17.10"; 37 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 38 | Configuration conf = new Configuration(); 39 | conf.set("fs.defaultFS", hdfs); 40 | conf.set("mapreduce.app-submission.cross-platform", "true"); 41 | 42 | //2.设置MapReduce作业配置信息 43 | String jobName = "DateSort"; //定义作业名称 44 | Job job = Job.getInstance(conf, jobName); 45 | job.setJarByClass(DateSort.class); //指定作业类 46 | 
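// NOTE (DateSort): the mapper swaps (date, count) from DateCount's output into (count, date) so that the
// shuffle's ascending sort on the IntWritable key does the ordering, and the reducer swaps them back; with
// the default single reduce task the result is one globally sorted file, e.g. counts 1, 2, 5 in that order.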
job.setJar("export\\DateSort.jar"); //指定本地jar包 47 | job.setMapperClass(DateSortMapper.class); //指定Mapper类 48 | job.setMapOutputKeyClass(IntWritable.class); //设置Mapper输出Key类型 49 | job.setMapOutputValueClass(Text.class); //设置Mapper输出Value类型 50 | job.setReducerClass(DateSortReducer.class); //指定Reducer类 51 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 52 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 53 | 54 | //3.设置作业输入和输出路径 55 | String dataDir = "/expr/datecount/output/part-r-00000"; //实验数据目录 56 | String outputDir = "/expr/datecount/output_sort"; //实验输出目录 57 | Path inPath = new Path(hdfs + dataDir); 58 | Path outPath = new Path(hdfs + outputDir); 59 | FileInputFormat.addInputPath(job, inPath); 60 | FileOutputFormat.setOutputPath(job, outPath); 61 | FileSystem fs = FileSystem.get(conf); 62 | if(fs.exists(outPath)) { 63 | fs.delete(outPath, true); 64 | } 65 | 66 | //4.运行作业 67 | System.out.println("Job: " + jobName + " is running..."); 68 | if(job.waitForCompletion(true)) { 69 | System.out.println("success!"); 70 | System.exit(0); 71 | } else { 72 | System.out.println("failed!"); 73 | System.exit(1); 74 | } 75 | } 76 | 77 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateSort2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.WritableComparable; 10 | import org.apache.hadoop.io.WritableComparator; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | public class DateSort2 { 18 | 19 | public static class DateSort2Mapper extends Mapper { 20 | IntWritable num = new IntWritable(); 21 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 22 | String[] strs = value.toString().split("\t"); //从DateCount运行结果读取数据,默认是用Tab分割输入 23 | String date = strs[0]; //获取日期 24 | num.set(Integer.parseInt(strs[1])); //获取次数 25 | context.write(num, new Text(date)); //以次数作为key,日期作为value输出 26 | } 27 | } 28 | 29 | public static class DateSort2Reducer extends Reducer { 30 | public void reduce(IntWritable key, Iterable values, Context context) throws IOException, InterruptedException { 31 | for (Text val : values) { 32 | context.write(val, key); 33 | } 34 | } 35 | } 36 | 37 | // 自定义Key排序算法 38 | public static class MySort extends WritableComparator { 39 | public MySort() { 40 | super(IntWritable.class, true); 41 | } 42 | 43 | @SuppressWarnings({ "rawtypes", "unchecked" }) 44 | public int compare(WritableComparable a, WritableComparable b) { 45 | return b.compareTo(a);// 默认升序a比b小返回-1,升序排序;现在a比b小,返回1,降序排序 46 | } 47 | } 48 | 49 | public static void main(String[] args) throws Exception { 50 | //1.设置HDFS配置信息 51 | String namenode_ip = "192.168.17.10"; 52 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 53 | Configuration conf = new Configuration(); 54 | conf.set("fs.defaultFS", hdfs); 55 | conf.set("mapreduce.app-submission.cross-platform", "true"); 56 | 57 | 
//2.设置MapReduce作业配置信息 58 | String jobName = "DateSort2"; //定义作业名称 59 | Job job = Job.getInstance(conf, jobName); 60 | job.setJarByClass(DateSort2.class); //指定作业类 61 | job.setJar("export\\DateSort2.jar"); //指定本地jar包 62 | // Map 63 | job.setMapperClass(DateSort2Mapper.class); //指定Mapper类 64 | job.setMapOutputKeyClass(IntWritable.class); //设置Mapper输出Key类型 65 | job.setMapOutputValueClass(Text.class); //设置Mapper输出Value类型 66 | // Reduce 67 | job.setReducerClass(DateSort2Reducer.class); //指定Reducer类 68 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 69 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 70 | // 自定义Sort 71 | job.setSortComparatorClass(MySort.class); //设置自定义排序类 72 | 73 | //3.设置作业输入和输出路径 74 | String dataDir = "/expr/datecount/output/part-r-00000"; //实验数据目录 75 | String outputDir = "/expr/datecount/output_sort2"; //实验输出目录 76 | Path inPath = new Path(hdfs + dataDir); 77 | Path outPath = new Path(hdfs + outputDir); 78 | FileInputFormat.addInputPath(job, inPath); 79 | FileOutputFormat.setOutputPath(job, outPath); 80 | FileSystem fs = FileSystem.get(conf); 81 | if(fs.exists(outPath)) { 82 | fs.delete(outPath, true); 83 | } 84 | 85 | //4.运行作业 86 | System.out.println("Job: " + jobName + " is running..."); 87 | if(job.waitForCompletion(true)) { 88 | System.out.println("success!"); 89 | System.exit(0); 90 | } else { 91 | System.out.println("failed!"); 92 | System.exit(1); 93 | } 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateSort3.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.NullWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.io.WritableComparable; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | 19 | public class DateSort3 { 20 | 21 | public static class MyKey implements WritableComparable { 22 | private String date; 23 | private int num; 24 | 25 | public String getDate() { 26 | return date; 27 | } 28 | 29 | public void setDate(String date) { 30 | this.date = date; 31 | } 32 | 33 | public int getNum() { 34 | return num; 35 | } 36 | 37 | public void setNum(int num) { 38 | this.num = num; 39 | } 40 | 41 | public MyKey() { 42 | } 43 | 44 | public MyKey(String date, int num) { 45 | this.date = date; 46 | this.num = num; 47 | } 48 | 49 | @Override 50 | public void write(DataOutput out) throws IOException { 51 | out.writeUTF(date); 52 | out.writeInt(num); 53 | } 54 | 55 | @Override 56 | public void readFields(DataInput in) throws IOException { 57 | date = in.readUTF(); 58 | num = in.readInt(); 59 | } 60 | 61 | @Override 62 | public int compareTo(MyKey o) { 63 | //按date升序,num降序 64 | if (!date.equals(o.date)) //相等的话,返回true,取反为false 65 | return date.compareTo(o.date); 66 | else 67 | return o.num-num; 68 | } 69 | } 70 | 71 | public static class DateSort3Mapper extends Mapper { 72 | public void map(Object key, Text value, 
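// NOTE (DateSort3.MyKey): compareTo orders by date ascending and, for equal dates, by num descending via
// "o.num - num". That subtraction is fine for small counts but can overflow for large values; the safer
// idiom would be (editor's sketch): return Integer.compare(o.num, num);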
Context context ) 73 | throws IOException, InterruptedException { 74 | String[] strs = value.toString().split(" "); 75 | MyKey myKey = new MyKey(strs[0], Integer.parseInt(strs[1])); 76 | context.write(myKey, NullWritable.get()); //将自定义的myKey作为Map KEY输出 77 | } 78 | } 79 | 80 | public static class DateSort3Reducer extends Reducer { 81 | public void reduce(MyKey key, Iterable values, Context context) 82 | throws IOException, InterruptedException { 83 | context.write(new Text(key.date), new IntWritable(key.num)); 84 | } 85 | } 86 | 87 | public static void main(String[] args) throws Exception { 88 | //1.设置HDFS配置信息 89 | String namenode_ip = "192.168.17.10"; 90 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 91 | Configuration conf = new Configuration(); 92 | conf.set("fs.defaultFS", hdfs); 93 | conf.set("mapreduce.app-submission.cross-platform", "true"); 94 | 95 | //2.设置MapReduce作业配置信息 96 | String jobName = "DateSort3"; //定义作业名称 97 | Job job = Job.getInstance(conf, jobName); 98 | job.setJarByClass(DateSort3.class); //指定运行时作业类 99 | job.setJar("export\\DateSort3.jar"); //指定本地jar包 100 | job.setMapperClass(DateSort3Mapper.class); //指定Mapper类 101 | job.setMapOutputKeyClass(MyKey.class); //设置Mapper输出Key类型 102 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 103 | job.setReducerClass(DateSort3Reducer.class); //指定Reducer类 104 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 105 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 106 | 107 | //3.设置作业输入和输出路径 108 | String dataDir = "/expr/datecount/data"; //实验数据目录 109 | String outputDir = "/expr/datecount/output_sort3"; //实验输出目录 110 | Path inPath = new Path(hdfs + dataDir); 111 | Path outPath = new Path(hdfs + outputDir); 112 | FileInputFormat.addInputPath(job, inPath); 113 | FileOutputFormat.setOutputPath(job, outPath); 114 | FileSystem fs = FileSystem.get(conf); 115 | if(fs.exists(outPath)) { 116 | fs.delete(outPath, true); 117 | } 118 | 119 | //4.运行作业 120 | System.out.println("Job: " + jobName + " is running..."); 121 | if(job.waitForCompletion(true)) { 122 | System.out.println("success!"); 123 | System.exit(0); 124 | } else { 125 | System.out.println("failed!"); 126 | System.exit(1); 127 | } 128 | } 129 | 130 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/FixedLengthInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class FixedLengthInput { 17 | public static class FixedLengthInputMapper extends Mapper { 18 | public void map(LongWritable key, BytesWritable value, Context context ) 19 | throws IOException, InterruptedException { 20 | context.write(key, value); 21 | } 22 | } 23 | 24 | public static class FixedLengthInputReducer extends Reducer { 25 | public void reduce(LongWritable key, Iterable values, Context 
context) 26 | throws IOException, InterruptedException { 27 | for (BytesWritable val : values) { 28 | context.write(key, val); 29 | } 30 | } 31 | } 32 | 33 | public static void main(String[] args) throws Exception { 34 | //1.设置HDFS配置信息 35 | String namenode_ip = "192.168.17.10"; 36 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 37 | Configuration conf = new Configuration(); 38 | conf.set("fs.defaultFS", hdfs); 39 | conf.set("mapreduce.app-submission.cross-platform", "true"); 40 | conf.setInt(FixedLengthInputFormat.FIXED_RECORD_LENGTH, 13); 41 | 42 | //2.设置MapReduce作业配置信息 43 | String jobName = "FixedLengthInput"; //作业名称 44 | Job job = Job.getInstance(conf, jobName); 45 | job.setJarByClass(FixedLengthInput.class); //指定运行时作业类 46 | job.setJar("export\\FixedLengthInput.jar"); //指定本地jar包 47 | job.setMapperClass(FixedLengthInputMapper.class); //指定Mapper类 48 | job.setMapOutputKeyClass(LongWritable.class); //设置Mapper输出Key类型 49 | job.setMapOutputValueClass(BytesWritable.class); //设置Mapper输出Value类型 50 | job.setReducerClass(FixedLengthInputReducer.class); //指定Reducer类 51 | job.setOutputKeyClass(LongWritable.class); //设置Reduce输出Key类型 52 | job.setOutputValueClass(BytesWritable.class); //设置Reduce输出Value类型 53 | 54 | job.setInputFormatClass(FixedLengthInputFormat.class); //设置输入格式化类 55 | 56 | //3.设置作业输入和输出路径 57 | String dataDir = "/expr/fixedinput/data"; //实验数据目录 58 | String outputDir = "/expr/fixedinput/output"; //实验输出目录 59 | Path inPath = new Path(hdfs + dataDir); 60 | Path outPath = new Path(hdfs + outputDir); 61 | FileInputFormat.addInputPath(job, inPath); 62 | FileOutputFormat.setOutputPath(job, outPath); 63 | FileSystem fs = FileSystem.get(conf); 64 | if(fs.exists(outPath)) { 65 | fs.delete(outPath, true); 66 | } 67 | 68 | //4.运行作业 69 | System.out.println("Job: " + jobName + " is running..."); 70 | if(job.waitForCompletion(true)) { 71 | System.out.println("success!"); 72 | System.exit(0); 73 | } else { 74 | System.out.println("failed!"); 75 | System.exit(1); 76 | } 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/FixedLengthInput2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | public class FixedLengthInput2 { 19 | public static class FixedLengthInput2Mapper extends Mapper { 20 | public void map(LongWritable key, BytesWritable value, Context context ) 21 | throws IOException, InterruptedException { 22 | String val = new String(value.getBytes(), 0, value.getLength()-1); 23 | String[] strs = val.split(" "); 24 | context.write(new Text(strs[0]), new IntWritable(Integer.parseInt(strs[1]))); 25 | } 26 | } 27 | 28 | public static class FixedLengthInput2Reducer extends Reducer { 29 | public void reduce(Text key, Iterable 
values, Context context) 30 | throws IOException, InterruptedException { 31 | for (IntWritable val : values) { 32 | context.write(key, val); 33 | } 34 | } 35 | } 36 | 37 | public static void main(String[] args) throws Exception { 38 | //1.设置HDFS配置信息 39 | String namenode_ip = "192.168.17.10"; 40 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 41 | Configuration conf = new Configuration(); 42 | conf.set("fs.defaultFS", hdfs); 43 | conf.set("mapreduce.app-submission.cross-platform", "true"); 44 | conf.setInt(FixedLengthInputFormat.FIXED_RECORD_LENGTH, 13); 45 | 46 | //2.设置MapReduce作业配置信息 47 | String jobName = "FixedLengthInput2"; //作业名称 48 | Job job = Job.getInstance(conf, jobName); 49 | job.setJarByClass(FixedLengthInput2.class); //指定运行时作业类 50 | job.setJar("export\\FixedLengthInput2.jar"); //指定本地jar包 51 | job.setMapperClass(FixedLengthInput2Mapper.class); //指定Mapper类 52 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 53 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 54 | job.setReducerClass(FixedLengthInput2Reducer.class); //指定Reducer类 55 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 56 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 57 | 58 | job.setInputFormatClass(FixedLengthInputFormat.class); //设置输入格式化类 59 | 60 | //3.设置作业输入和输出路径 61 | String dataDir = "/expr/fixedinput/data"; //实验数据目录 62 | String outputDir = "/expr/fixedinput/output"; //实验输出目录 63 | Path inPath = new Path(hdfs + dataDir); 64 | Path outPath = new Path(hdfs + outputDir); 65 | FileInputFormat.addInputPath(job, inPath); 66 | FileOutputFormat.setOutputPath(job, outPath); 67 | FileSystem fs = FileSystem.get(conf); 68 | if(fs.exists(outPath)) { 69 | fs.delete(outPath, true); 70 | } 71 | 72 | //4.运行作业 73 | System.out.println("Job: " + jobName + " is running..."); 74 | if(job.waitForCompletion(true)) { 75 | System.out.println("success!"); 76 | System.exit(0); 77 | } else { 78 | System.out.println("failed!"); 79 | System.exit(1); 80 | } 81 | } 82 | 83 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/KeyValueInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class KeyValueInput { 17 | 18 | public static class KeyValueInputMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Text key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | context.write(key, one); //Mapper的输入KEY就是日期 24 | } 25 | } 26 | 27 | public static class KeyValueInputReducer extends Reducer { 28 | public void reduce(Text key, Iterable values, Context context) 29 | throws IOException, InterruptedException { 30 | int sum = 0; 31 | for (IntWritable val : values) { 32 | sum += val.get(); 33 | } 34 | context.write(key, new 
IntWritable(sum)); 35 | } 36 | } 37 | 38 | public static void main(String[] args) throws Exception { 39 | //1.设置HDFS配置信息 40 | String namenode_ip = "192.168.17.10"; 41 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 42 | Configuration conf = new Configuration(); 43 | conf.set("fs.defaultFS", hdfs); 44 | conf.set("mapreduce.app-submission.cross-platform", "true"); 45 | conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ":"); //设置输入文件kv分隔符 46 | 47 | //2.设置MapReduce作业配置信息 48 | String jobName = "KeyValueInput"; //作业名称 49 | Job job = Job.getInstance(conf, jobName); 50 | job.setJarByClass(KeyValueInput.class); //指定运行时作业类 51 | job.setJar("export\\KeyValueInput.jar"); //指定本地jar包 52 | job.setMapperClass(KeyValueInputMapper.class); //指定Mapper类 53 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 54 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 55 | job.setReducerClass(KeyValueInputReducer.class); //指定Reducer类 56 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 57 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 58 | 59 | job.setInputFormatClass(KeyValueTextInputFormat.class); //设置输入格式化类 60 | 61 | //3.设置作业输入和输出路径 62 | String dataDir = "/expr/kvinput/data"; //实验数据目录 63 | String outputDir = "/expr/kvinput/output"; //实验输出目录 64 | Path inPath = new Path(hdfs + dataDir); 65 | Path outPath = new Path(hdfs + outputDir); 66 | FileInputFormat.addInputPath(job, inPath); 67 | FileOutputFormat.setOutputPath(job, outPath); 68 | FileSystem fs = FileSystem.get(conf); 69 | if(fs.exists(outPath)) { 70 | fs.delete(outPath, true); 71 | } 72 | 73 | //4.运行作业 74 | System.out.println("Job: " + jobName + " is running..."); 75 | if(job.waitForCompletion(true)) { 76 | System.out.println("success!"); 77 | System.exit(0); 78 | } else { 79 | System.out.println("failed!"); 80 | System.exit(1); 81 | } 82 | } 83 | 84 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/MultInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class MultInput { 16 | 17 | public static class MultInputMapper extends Mapper { 18 | private final static IntWritable one = new IntWritable(1); 19 | 20 | public void map(Object key, Text value, Context context ) 21 | throws IOException, InterruptedException { 22 | String[] strs = value.toString().split(" "); //按空格分割输入 23 | Text date = new Text(strs[0]); //获取日期 24 | context.write(date, one); //将日期和常数1作为Map输出 25 | } 26 | } 27 | 28 | public static class MultInputReducer extends Reducer { 29 | public void reduce(Text key, Iterable values, Context context) 30 | throws IOException, InterruptedException { 31 | int sum = 0; 32 | for (IntWritable val : values) { 33 | sum += val.get(); 34 | } 35 | context.write(key, new IntWritable(sum)); 36 | } 37 | } 38 | 39 | public static void main(String[] args) throws Exception { 40 | //1.设置HDFS配置信息 
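		// The settings below mirror every other driver in this repo: fs.defaultFS points the
		// client at the HDFS NameNode, and mapreduce.app-submission.cross-platform=true lets
		// the locally built jar (see job.setJar with its Windows-style "export\\" path) be
		// submitted to the Linux cluster from the development machine.
		// Section 3 below demonstrates three ways of registering the two input directories;
		// a Path-varargs overload of setInputPaths also exists and would read, as an
		// untested sketch:
		//   FileInputFormat.setInputPaths(job, new Path(hdfs + "/expr/multinput/data/txt1"),
		//                                      new Path(hdfs + "/expr/multinput/data/txt2"));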
41 | String namenode_ip = "192.168.17.10"; 42 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 43 | Configuration conf = new Configuration(); 44 | conf.set("fs.defaultFS", hdfs); 45 | conf.set("mapreduce.app-submission.cross-platform", "true"); 46 | 47 | //2.设置MapReduce作业配置信息 48 | String jobName = "MultInput"; //作业名称 49 | Job job = Job.getInstance(conf, jobName); 50 | job.setJarByClass(MultInput.class); //指定运行时作业类 51 | job.setJar("export\\MultInput.jar"); //指定本地jar包 52 | job.setMapperClass(MultInputMapper.class); //指定Mapper类 53 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 54 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 55 | job.setReducerClass(MultInputReducer.class); //指定Reducer类 56 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 57 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 58 | 59 | //3.设置作业输入和输出路径 60 | //方法一:FileInputFormat.addInputPath() 61 | FileInputFormat.addInputPath(job, new Path(hdfs+"/expr/multinput/data/txt1"));//输入目录1 62 | FileInputFormat.addInputPath(job, new Path(hdfs+"/expr/multinput/data/txt2"));//输入目录2 63 | 64 | //方法二:FileInputFormat.addInputPaths() 65 | //FileInputFormat.addInputPaths(job, String.join(",", hdfs+"/expr/multinput/data/txt1", hdfs+"/expr/multinput/data/txt2")); 66 | 67 | //方法三:FileInputFormat.setInputPaths() 68 | //FileInputFormat.setInputPaths(job, String.join(",", hdfs+"/expr/multinput/data/txt1", hdfs+"/expr/multinput/data/txt2") ); 69 | 70 | Path outPath = new Path(hdfs + "/expr/multinput/output"); //输出目录 71 | FileOutputFormat.setOutputPath(job, outPath); 72 | FileSystem fs = FileSystem.get(conf); 73 | if(fs.exists(outPath)) { 74 | fs.delete(outPath, true); 75 | } 76 | 77 | //4.运行作业 78 | System.out.println("Job: " + jobName + " is running..."); 79 | if(job.waitForCompletion(true)) { 80 | System.out.println("success!"); 81 | System.exit(0); 82 | } else { 83 | System.out.println("failed!"); 84 | System.exit(1); 85 | } 86 | } 87 | 88 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/MultInput2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class MultInput2 { 17 | 18 | public static class TxtFileMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); //按空格分割输入 24 | Text date = new Text(strs[0]); 25 | context.write(date, one); 26 | } 27 | } 28 | 29 | public static class CsvFileMapper extends Mapper { 30 | private final static IntWritable one = new IntWritable(1); 31 | 32 | public void map(Object key, Text value, Context context ) 33 | throws IOException, InterruptedException { 34 | String[] strs = 
value.toString().split(","); //按逗号分割输入 35 | Text date = new Text(strs[0]); 36 | context.write(date, one); 37 | } 38 | } 39 | 40 | public static class MultInput2Reducer extends Reducer { 41 | public void reduce(Text key, Iterable values, Context context) 42 | throws IOException, InterruptedException { 43 | int sum = 0; 44 | for (IntWritable val : values) { 45 | sum += val.get(); 46 | } 47 | context.write(key, new IntWritable(sum)); 48 | } 49 | } 50 | 51 | public static void main(String[] args) throws Exception { 52 | //1.设置HDFS配置信息 53 | String namenode_ip = "192.168.17.10"; 54 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 55 | Configuration conf = new Configuration(); 56 | conf.set("fs.defaultFS", hdfs); 57 | conf.set("mapreduce.app-submission.cross-platform", "true"); 58 | 59 | //2.设置MapReduce作业配置信息 60 | String jobName = "MultInput2"; //作业名称 61 | Job job = Job.getInstance(conf, jobName); 62 | job.setJarByClass(MultInput2.class); //指定运行时作业类 63 | job.setJar("export\\MultInput2.jar"); //指定本地jar包 64 | 65 | //job.setMapperClass(MultInput2Mapper.class); //无需指定Mapper类,而在MultipleInputs.addInputPath()方法中指定 66 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 67 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 68 | job.setReducerClass(MultInput2Reducer.class); //指定Reducer类 69 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 70 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 71 | 72 | //3.设置作业输入和输出路径 73 | //方法五:MultipleInputs.addInputPath() 74 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multinput/data/txt1"), TextInputFormat.class, TxtFileMapper.class); 75 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multinput/data/csv"), TextInputFormat.class, CsvFileMapper.class); 76 | 77 | Path outPath = new Path(hdfs + "/expr/multinput/output3"); //输出目录 78 | FileOutputFormat.setOutputPath(job, outPath); 79 | FileSystem fs = FileSystem.get(conf); 80 | if(fs.exists(outPath)) { 81 | fs.delete(outPath, true); 82 | } 83 | 84 | //4.运行作业 85 | System.out.println("Job: " + jobName + " is running..."); 86 | if(job.waitForCompletion(true)) { 87 | System.out.println("success!"); 88 | System.exit(0); 89 | } else { 90 | System.out.println("failed!"); 91 | System.exit(1); 92 | } 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/NLineInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | public class NLineInput { 18 | 19 | public static class NLineInputMapper extends Mapper { 20 | private final static IntWritable one = new IntWritable(1); 21 | 22 | public void map(LongWritable key, Text value, Context context ) 23 | throws IOException, InterruptedException { 24 | System.out.println("value: "+value.toString()); 25 | String[] 
strs = value.toString().split(" "); 26 | System.out.println("NLines strs is:"+strs); 27 | System.out.println("strs[0]"+strs[0]); 28 | Text date = new Text(strs[0]); 29 | context.write(date, one); 30 | } 31 | } 32 | 33 | public static class NLineInputReducer extends Reducer { 34 | public void reduce(Text key, Iterable values, Context context) 35 | throws IOException, InterruptedException { 36 | int sum = 0; 37 | for (IntWritable val : values) { 38 | sum += val.get(); 39 | } 40 | context.write(key, new IntWritable(sum)); 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | //1.设置HDFS配置信息 46 | String namenode_ip = "192.168.17.10"; 47 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 48 | Configuration conf = new Configuration(); 49 | conf.set("fs.defaultFS", hdfs); 50 | conf.set("mapreduce.app-submission.cross-platform", "true"); 51 | conf.setInt("mapreduce.input.lineinputformat.linespermap", 1000); //设置每个Map处理的行数 52 | 53 | //2.设置MapReduce作业配置信息 54 | String jobName = "NLineInput"; //作业名称 55 | Job job = Job.getInstance(conf, jobName); 56 | job.setJarByClass(NLineInput.class); //指定运行时作业类 57 | job.setJar("export\\NLineInput.jar"); //指定本地jar包 58 | job.setMapperClass(NLineInputMapper.class); //指定Mapper类 59 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 60 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 61 | job.setReducerClass(NLineInputReducer.class); //指定Reducer类 62 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 63 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 64 | 65 | job.setInputFormatClass(NLineInputFormat.class); //设置输入格式化类 66 | 67 | //3.设置作业输入和输出路径 68 | String dataDir = "/expr/nlineinput/data"; //实验数据目录 69 | String outputDir = "/expr/nlineinput/output"; //实验输出目录 70 | Path inPath = new Path(hdfs + dataDir); 71 | Path outPath = new Path(hdfs + outputDir); 72 | FileInputFormat.addInputPath(job, inPath); 73 | FileOutputFormat.setOutputPath(job, outPath); 74 | FileSystem fs = FileSystem.get(conf); 75 | if(fs.exists(outPath)) { 76 | fs.delete(outPath, true); 77 | } 78 | 79 | //4.运行作业 80 | System.out.println("Job: " + jobName + " is running..."); 81 | if(job.waitForCompletion(true)) { 82 | System.out.println("success!"); 83 | System.exit(0); 84 | } else { 85 | System.out.println("failed!"); 86 | System.exit(1); 87 | } 88 | } 89 | 90 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/SequenceInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class SequenceInput { 17 | 18 | public static class SequenceInputMapper extends Mapper { 19 | public void map(Text key, IntWritable value, Context context ) 20 | throws IOException, InterruptedException { 21 | int v = value.get() + 10; 22 | context.write(key, new IntWritable(v)); 23 | 
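			// No parsing is needed here: SequenceFileInputFormat already deserializes each
			// record into its stored (Text, IntWritable) key/value pair, so the mapper only
			// adds 10 to the stored count and re-emits the pair with the same types.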
} 24 | } 25 | 26 | public static class SequenceInputReducer extends Reducer { 27 | public void reduce(Text key, Iterable values, Context context) 28 | throws IOException, InterruptedException { 29 | for (IntWritable val : values) { 30 | context.write(key, val); 31 | } 32 | } 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | //1.设置HDFS配置信息 37 | String namenode_ip = "192.168.17.10"; 38 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 39 | Configuration conf = new Configuration(); 40 | conf.set("fs.defaultFS", hdfs); 41 | conf.set("mapreduce.app-submission.cross-platform", "true"); 42 | 43 | //2.设置MapReduce作业配置信息 44 | String jobName = "SequenceInput"; //作业名称 45 | Job job = Job.getInstance(conf, jobName); 46 | job.setJarByClass(SequenceInput.class); //指定运行时作业类 47 | job.setJar("export\\SequenceInput.jar"); //指定本地jar包 48 | job.setMapperClass(SequenceInputMapper.class); //指定Mapper类 49 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 50 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 51 | job.setReducerClass(SequenceInputReducer.class); //指定Reducer类 52 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 53 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 54 | 55 | job.setInputFormatClass(SequenceFileInputFormat.class); //设置输入格式化类 56 | 57 | //3.设置作业输入和输出路径 58 | String dataDir = "/expr/seqinput/data"; //实验数据目录 59 | String outputDir = "/expr/seqinput/output"; //实验输出目录 60 | Path inPath = new Path(hdfs + dataDir); 61 | Path outPath = new Path(hdfs + outputDir); 62 | FileInputFormat.addInputPath(job, inPath); 63 | FileOutputFormat.setOutputPath(job, outPath); 64 | FileSystem fs = FileSystem.get(conf); 65 | if(fs.exists(outPath)) { 66 | fs.delete(outPath, true); 67 | } 68 | 69 | //4.运行作业 70 | System.out.println("Job: " + jobName + " is running..."); 71 | if(job.waitForCompletion(true)) { 72 | System.out.println("success!"); 73 | System.exit(0); 74 | } else { 75 | System.out.println("failed!"); 76 | System.exit(1); 77 | } 78 | } 79 | 80 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/StartRun.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import org.apache.hadoop.conf.Configuration; 7 | 8 | public class StartRun { 9 | public static void main(String[] args) throws IllegalArgumentException, ClassNotFoundException, IOException, InterruptedException { 10 | String namenode_ip = "192.168.17.10"; 11 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 12 | Configuration conf = new Configuration(); 13 | conf.set("fs.defaultFS", hdfs); 14 | conf.set("mapreduce.app-submission.cross-platform", "true"); 15 | 16 | Map paths = new HashMap(); 17 | paths.put("Step1Input", "/expr/itemcf/data"); 18 | paths.put("Step1Output", "/expr/itemcf/output/output1"); 19 | 20 | paths.put("Step2Input", paths.get("Step1Output")); //后面每一步的输入路径都是前一步的输出路径 21 | paths.put("Step2Output", "/expr/itemcf/output/output2"); 22 | 23 | paths.put("Step3Input", paths.get("Step2Output")); 24 | paths.put("Step3Output", "/expr/itemcf/output/output3"); 25 | 26 | paths.put("Step4Input1", paths.get("Step2Output")); 27 | paths.put("Step4Input2", paths.get("Step3Output")); 28 | paths.put("Step4Output", "/expr/itemcf/output/output4"); 29 | 30 | paths.put("Step5Input", paths.get("Step4Output")); 31 | 
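		// (Each remaining step likewise reads the directory written by the step before it;
		// the Step1..Step6 calls further down run the whole item-CF pipeline in order.)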
paths.put("Step5Output", "/expr/itemcf/output/output5"); 32 | 33 | paths.put("Step6Input", paths.get("Step5Output")); 34 | paths.put("Step6Output", "/expr/itemcf/output/output6"); 35 | 36 | Step1.run(conf, paths); //去重 37 | Step2.run(conf, paths); //计算用户评分矩阵 38 | Step3.run(conf, paths); //计算同现矩阵 39 | Step4.run(conf, paths); //计算单项评分=同现矩阵*评分矩阵 40 | Step5.run(conf, paths); //计算评分总和 41 | Step6.run(conf, paths); //评分排序取Top10 42 | 43 | System.out.println("finished!"); 44 | } 45 | 46 | public static Map R = new HashMap(); 47 | static { 48 | R.put("click", 1); //浏览 49 | R.put("collect", 2); //收藏 50 | R.put("cart", 3); //放入购物车 51 | R.put("alipay", 4); //支付 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step1.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | //去重 18 | public class Step1 { 19 | public static boolean run(Configuration config, Map paths) 20 | throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException { 21 | String jobName = "step1"; 22 | Job job = Job.getInstance(config, jobName); 23 | job.setJarByClass(Step1.class); 24 | job.setJar("export\\ItemCF.jar"); 25 | job.setMapperClass(Step1_Mapper.class); 26 | job.setReducerClass(Step1_Reducer.class); 27 | job.setMapOutputKeyClass(Text.class); 28 | job.setMapOutputValueClass(NullWritable.class); 29 | 30 | Path inPath = new Path(paths.get("Step1Input")); 31 | Path outpath = new Path(paths.get("Step1Output")); 32 | FileInputFormat.addInputPath(job, inPath); 33 | FileOutputFormat.setOutputPath(job, outpath); 34 | FileSystem fs = FileSystem.get(config); 35 | if (fs.exists(outpath)) { 36 | fs.delete(outpath, true); 37 | } 38 | 39 | return job.waitForCompletion(true); 40 | } 41 | 42 | static class Step1_Mapper extends Mapper { 43 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 44 | if (key.get() != 0) { //过滤掉输入文件标题行 45 | context.write(value, NullWritable.get()); 46 | } 47 | } 48 | } 49 | 50 | static class Step1_Reducer extends Reducer { 51 | protected void reduce(Text key, Iterable values, Context context) 52 | throws IOException, InterruptedException { 53 | context.write(key, NullWritable.get()); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import java.util.Map.Entry; 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import 
org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | //计算用户评分矩阵 19 | public class Step2 { 20 | public static boolean run(Configuration config, Map paths) 21 | throws IOException, ClassNotFoundException, InterruptedException { 22 | String jobName = "step2"; 23 | Job job = Job.getInstance(config, jobName); 24 | job.setJarByClass(Step2.class); 25 | job.setJar("export\\ItemCF.jar"); 26 | job.setMapperClass(Step2_Mapper.class); 27 | job.setReducerClass(Step2_Reducer.class); 28 | job.setMapOutputKeyClass(Text.class); 29 | job.setMapOutputValueClass(Text.class); 30 | 31 | Path inPath = new Path(paths.get("Step2Input")); 32 | Path outpath = new Path(paths.get("Step2Output")); 33 | FileInputFormat.addInputPath(job, inPath); 34 | FileOutputFormat.setOutputPath(job, outpath); 35 | FileSystem fs = FileSystem.get(config); 36 | if (fs.exists(outpath)) { 37 | fs.delete(outpath, true); 38 | } 39 | 40 | return job.waitForCompletion(true); 41 | } 42 | 43 | static class Step2_Mapper extends Mapper { 44 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 45 | String[] strs = value.toString().split(","); 46 | String item = strs[0]; //商品id 47 | String user = strs[1]; //用户id 48 | String action = strs[2]; //用户行为 49 | Integer rv = StartRun.R.get(action); //获取行为评分 50 | Text v = new Text(item + ":" + rv.intValue()); //value格式: "i1:1" 51 | Text k = new Text(user); 52 | context.write(k, v); //map输出格式: "u2723 i1:1" 53 | } 54 | } 55 | 56 | static class Step2_Reducer extends Reducer { 57 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 58 | Map m = new HashMap(); //用于存放每种商品的行为评分之和 59 | for (Text value : values) { 60 | String[] strs = value.toString().split(":"); 61 | String item = strs[0]; //商品id 62 | Integer score = Integer.parseInt(strs[1]); //行为评分 63 | score += ((Integer) (m.get(item) == null ? 
0 : m.get(item))).intValue(); //计算用户对每件商品的行为评分和(如果Map集合中已有该商品评分,则累加) 64 | m.put(item, score); //向HashMap中存入商品及评分之和 65 | } 66 | 67 | StringBuffer sb = new StringBuffer(); 68 | for (Entry entry : m.entrySet()) { 69 | sb.append(entry.getKey() + ":" + entry.getValue().intValue() + ","); //将商品和评分串联,格式: i1:1,i2:1,...I:N, 70 | } 71 | context.write(key, new Text(sb.toString().substring(0, sb.toString().length() - 1))); //去掉最后的逗号 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step3.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | //计算用户同显矩阵 18 | public class Step3 { 19 | private final static Text K = new Text(); 20 | private final static IntWritable one = new IntWritable(1); 21 | 22 | public static boolean run(Configuration config, Map paths) throws IOException, ClassNotFoundException, InterruptedException { 23 | String jobName = "step3"; 24 | Job job = Job.getInstance(config, jobName); 25 | job.setJarByClass(Step3.class); 26 | job.setJar("export\\ItemCF.jar"); 27 | job.setMapperClass(Step3_Mapper.class); 28 | job.setReducerClass(Step3_Reducer.class); 29 | job.setCombinerClass(Step3_Reducer.class); 30 | job.setMapOutputKeyClass(Text.class); 31 | job.setMapOutputValueClass(IntWritable.class); 32 | 33 | Path inPath = new Path(paths.get("Step3Input")); 34 | Path outpath = new Path(paths.get("Step3Output")); 35 | FileInputFormat.addInputPath(job, inPath); 36 | FileOutputFormat.setOutputPath(job, outpath); 37 | FileSystem fs = FileSystem.get(config); 38 | if (fs.exists(outpath)) { 39 | fs.delete(outpath, true); 40 | } 41 | 42 | return job.waitForCompletion(true); 43 | } 44 | 45 | static class Step3_Mapper extends Mapper { 46 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 47 | // u2727 i468:2,i446:3 48 | String[] items = value.toString().split("\t")[1].split(","); //每件商品和评分列表,格式:i468:2 i446:3 49 | for (int i = 0; i < items.length; i++) { 50 | String itemA = items[i].split(":")[0]; // itemA = i468 .. i446 51 | for (int j = 0; j < items.length; j++) { 52 | String itemB = items[j].split(":")[0]; // itemB = i468 .. 
i446 53 | K.set(itemA + ":" + itemB); // i468:i468 , i468:i446, i446:i468, i446:i446 54 | context.write(K, one); 55 | } 56 | } 57 | } 58 | } 59 | 60 | static class Step3_Reducer extends Reducer { 61 | protected void reduce(Text key, Iterable values, Context context) 62 | throws IOException, InterruptedException { 63 | int sum = 0; 64 | for (IntWritable val : values) { 65 | sum += val.get(); 66 | } 67 | context.write(key, new IntWritable(sum)); 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step4.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | 20 | //同显矩阵*评分矩阵,计算评分单项 21 | public class Step4 { 22 | public static boolean run(Configuration config, Map paths) 23 | throws IOException, ClassNotFoundException, InterruptedException { 24 | String jobName = "step4"; 25 | Job job = Job.getInstance(config, jobName); 26 | job.setJarByClass(Step4.class); 27 | job.setJar("export\\ItemCF.jar"); 28 | job.setMapperClass(Step4_Mapper.class); 29 | job.setReducerClass(Step4_Reducer.class); 30 | job.setMapOutputKeyClass(Text.class); 31 | job.setMapOutputValueClass(Text.class); 32 | 33 | Path[] inPaths = new Path[] { 34 | new Path(paths.get("Step4Input1")), 35 | new Path(paths.get("Step4Input2")) }; 36 | Path outpath = new Path(paths.get("Step4Output")); 37 | FileInputFormat.setInputPaths(job, inPaths); 38 | FileOutputFormat.setOutputPath(job, outpath); 39 | FileSystem fs = FileSystem.get(config); 40 | if (fs.exists(outpath)) { 41 | fs.delete(outpath, true); 42 | } 43 | 44 | return job.waitForCompletion(true); 45 | } 46 | 47 | static class Step4_Mapper extends Mapper { 48 | private String flag; //保存Map输入数据来自于哪个目录(output2或ouput3),用于判断数据是同现矩阵还是评分矩阵 49 | 50 | protected void setup(Context context) throws IOException, InterruptedException { 51 | FileSplit split = (FileSplit) context.getInputSplit(); //根据上下文获取输入分片对象 52 | flag = split.getPath().getParent().getName(); //获取输入分片所属的目录名称 53 | } 54 | 55 | protected void map(LongWritable key, Text value, Context context) 56 | throws IOException, InterruptedException { 57 | String[] strs = Pattern.compile("[\t,]").split(value.toString()); 58 | if (flag.equals("output3")) { //输入的是同现矩阵,strs格式:"i100:i105 1" 59 | String[] items = strs[0].split(":"); 60 | String itemID1 = items[0]; //第一个商品id "i100" 61 | String itemID2 = items[1]; //第二个商品id "i105" 62 | String num = strs[1]; //两件商品的同现次数 "1" 63 | 64 | Text k = new Text(itemID1); 65 | Text v = new Text("A:" + itemID2 + "," + num); //格式:"A:i105,1" 66 | context.write(k, v); //格式:"i100 A:i105,1" 67 | 68 | } else if (flag.equals("output2")) { //输入的是评分矩阵,strs格式:"u14 i100:1 i25:1" 69 | String userID = strs[0]; 70 | for (int i = 1; i < strs.length; 
i++) { 71 | String[] vector = strs[i].split(":"); //i100:1 72 | String itemID = vector[0]; 73 | String score = vector[1]; 74 | Text k = new Text(itemID); 75 | Text v = new Text("B:" + userID + "," + score); //格式:"B:u14,1" 76 | context.write(k, v); //格式:"i100 B:u14,1" 和 "i25 B:u14,1" 77 | } 78 | } 79 | } 80 | } 81 | 82 | static class Step4_Reducer extends Reducer { 83 | protected void reduce(Text key, Iterable values, Context context) 84 | throws IOException, InterruptedException { 85 | Map mapA = new HashMap(); 86 | Map mapB = new HashMap(); 87 | //reduce输入格式:"i100 A:i105,1 A:i107,2 B:u14,1 B:u22,3" 88 | for (Text val : values) { //将AB格式的输入分别放入HashMap中 89 | String str = val.toString(); 90 | if (str.startsWith("A:")) { //str格式:"A:i105,1" 91 | String[] kv = Pattern.compile("[\t,]").split(str.substring(2)); 92 | mapA.put(kv[0], Integer.parseInt(kv[1])); 93 | } else if (str.startsWith("B:")) { //str格式:"B:u14,1" 94 | String[] kv = Pattern.compile("[\t,]").split(str.substring(2)); 95 | mapB.put(kv[0], Integer.parseInt(kv[1])); 96 | } 97 | } 98 | double result = 0; 99 | Iterator itera = mapA.keySet().iterator(); //根据mapA中key键(itemID)生成迭代器对象 100 | while (itera.hasNext()) { 101 | String mapka = itera.next(); //获得itemID 102 | int num = mapA.get(mapka).intValue(); //根据itemID从mapA获取同现次数 103 | 104 | Iterator iterb = mapB.keySet().iterator(); //根据mapB中key键生成迭代器对象 105 | while (iterb.hasNext()) { 106 | String mapkb = iterb.next(); //userID 107 | int score = mapB.get(mapkb).intValue(); //根据userID从mapB中获取用户行为评分 108 | 109 | result = num * score; //矩阵相乘,计算评分 110 | context.write(new Text(mapkb), new Text(mapka + "," + result)); //输出 key:"userID" value:"itemID,result" 111 | } 112 | } 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step5.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | 19 | //计算总和评分 20 | public class Step5 { 21 | public static boolean run(Configuration config, Map paths) 22 | throws IOException, ClassNotFoundException, InterruptedException { 23 | String jobName = "step5"; 24 | Job job = Job.getInstance(config, jobName); 25 | job.setJarByClass(Step5.class); 26 | job.setJar("export\\ItemCF.jar"); 27 | job.setMapperClass(Step5_Mapper.class); 28 | job.setReducerClass(Step5_Reducer.class); 29 | job.setMapOutputKeyClass(Text.class); 30 | job.setMapOutputValueClass(Text.class); 31 | 32 | Path inPath = new Path(paths.get("Step5Input")); 33 | Path outpath = new Path(paths.get("Step5Output")); 34 | FileInputFormat.addInputPath(job, inPath); 35 | FileOutputFormat.setOutputPath(job, outpath); 36 | FileSystem fs = FileSystem.get(config); 37 | if (fs.exists(outpath)) { 38 | fs.delete(outpath, true); 39 | } 40 | 41 | return job.waitForCompletion(true); 42 | } 43 | 44 | 
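	// Step5_Mapper re-keys Step4's output ("userID\titemID,score") by userID, and
	// Step5_Reducer then sums the partial scores per itemID in a HashMap, producing one
	// accumulated predicted score per (user, item) pair for Step6 to rank.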
static class Step5_Mapper extends Mapper { 45 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 46 | //输入格式:"u2732 i405,2.0" 47 | String[] strs = Pattern.compile("[\t,]").split(value.toString()); 48 | Text k = new Text(strs[0]); //key: userID 49 | Text v = new Text(strs[1] + "," + strs[2]); //value: "itemID,评分" 50 | context.write(k, v); 51 | } 52 | } 53 | 54 | static class Step5_Reducer extends Reducer { 55 | protected void reduce(Text key, Iterable values, Context context) 56 | throws IOException, InterruptedException { 57 | Map map = new HashMap(); //用于对商品评分累加 58 | for (Text val : values) { //val格式: "itemID,评分" 59 | String[] strs = val.toString().split(","); 60 | String itemID = strs[0]; 61 | Double score = Double.parseDouble(strs[1]); 62 | 63 | if (map.containsKey(itemID)) { //如果Map中已记录该商品,取出评分累加后重新写入Map 64 | map.put(itemID, map.get(itemID) + score); 65 | } else { 66 | map.put(itemID, score); 67 | } 68 | } 69 | 70 | //遍历Map,完成输出 71 | Iterator iter = map.keySet().iterator(); //根据itemID创建迭代器对象 72 | while (iter.hasNext()) { 73 | String itemID = iter.next(); //取出itemID 74 | double score = map.get(itemID); //根据itemID从map中取出score 75 | context.write(key, new Text(itemID + "," + score)); //格式:"userid itemID,score" 76 | } 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step6.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.io.WritableComparable; 14 | import org.apache.hadoop.io.WritableComparator; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | //评分排序取Top10 22 | public class Step6 { 23 | private final static Text K = new Text(); 24 | private final static Text V = new Text(); 25 | 26 | public static boolean run(Configuration config, Map paths) 27 | throws IOException, ClassNotFoundException, InterruptedException { 28 | String jobName = "step6"; 29 | Job job = Job.getInstance(config, jobName); 30 | job.setJarByClass(Step6.class); 31 | job.setJar("export\\ItemCF.jar"); 32 | job.setMapperClass(Step6_Mapper.class); 33 | job.setReducerClass(Step6_Reducer.class); 34 | job.setMapOutputKeyClass(PairWritable.class); 35 | job.setMapOutputValueClass(Text.class); 36 | //job.setSortComparatorClass(ScoreSort.class); //自定义排序 37 | job.setGroupingComparatorClass(UserGroup.class); //自定义分组 38 | 39 | Path inPath = new Path(paths.get("Step6Input")); 40 | Path outpath = new Path(paths.get("Step6Output")); 41 | FileInputFormat.addInputPath(job, inPath); 42 | FileOutputFormat.setOutputPath(job, outpath); 43 | FileSystem fs = FileSystem.get(config); 44 | if (fs.exists(outpath)) { 45 | fs.delete(outpath, true); 46 | } 47 | 48 | return job.waitForCompletion(true); 49 | } 50 | 51 | static class Step6_Mapper extends Mapper { 52 | protected void 
map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 53 | String[] strs = Pattern.compile("[\t,]").split(value.toString()); //输入格式:"u13 i524,3.0" 54 | String user = strs[0]; 55 | String item = strs[1]; 56 | String score = strs[2]; 57 | 58 | PairWritable k = new PairWritable(); //将uid和score封装到PairWritable对象中,作为MapKey输出 59 | k.setUid(user); 60 | k.setScore(Double.parseDouble(score)); 61 | 62 | V.set(item + ":" + score); //将item和score组合,作为MapValue输出 63 | context.write(k, V); //输出格式:key:"u13 3.0" value:"i524:3.0" 64 | } 65 | } 66 | 67 | static class Step6_Reducer extends Reducer { 68 | protected void reduce(PairWritable key, Iterable values, Context context) 69 | throws IOException, InterruptedException { 70 | int i = 0; 71 | StringBuffer sb = new StringBuffer(); 72 | for (Text v : values) { 73 | if (i == 10) 74 | break; 75 | sb.append(v.toString() + ","); //将评分数前10项串联 76 | i++; 77 | } 78 | K.set(key.getUid()); //获取自定义key中的uid 79 | V.set(sb.toString().substring(0,sb.toString().length()-1)); //去掉最后的逗号 80 | context.write(K, V); 81 | } 82 | } 83 | 84 | static class PairWritable implements WritableComparable { 85 | private String uid; 86 | private double score; 87 | 88 | public String getUid() { 89 | return uid; 90 | } 91 | 92 | public void setUid(String uid) { 93 | this.uid = uid; 94 | } 95 | 96 | public double getScore() { 97 | return score; 98 | } 99 | 100 | public void setScore(double score) { 101 | this.score = score; 102 | } 103 | 104 | @Override 105 | public void write(DataOutput out) throws IOException { 106 | out.writeUTF(uid); 107 | out.writeDouble(score); 108 | } 109 | 110 | @Override 111 | public void readFields(DataInput in) throws IOException { 112 | this.uid = in.readUTF(); 113 | this.score = in.readDouble(); 114 | } 115 | 116 | @Override 117 | public int compareTo(PairWritable o) { 118 | int r = this.uid.compareTo(o.getUid()); //按uid升序排列 119 | if (r == 0) { 120 | return -Double.compare(this.score, o.getScore()); //uid相同,则按score降序排列 121 | } 122 | return r; 123 | } 124 | } 125 | 126 | //自定义排序:先按uid升序,再按score降序 127 | /*static class ScoreSort extends WritableComparator { 128 | public ScoreSort() { 129 | super(PairWritable.class, true); 130 | } 131 | 132 | @SuppressWarnings("rawtypes") 133 | public int compare(WritableComparable a, WritableComparable b) { 134 | PairWritable o1 = (PairWritable) a; 135 | PairWritable o2 = (PairWritable) b; 136 | int r = o1.getUid().compareTo(o2.getUid()); //按uid升序排列 137 | if (r == 0) { 138 | return -Double.compare(o1.getScore(), o2.getScore()); //按num降序排列 139 | } 140 | return r; 141 | } 142 | }*/ 143 | 144 | //自定义分组,Map输出key(PairWritable)中uid相同的记录设为同组 145 | static class UserGroup extends WritableComparator { 146 | public UserGroup() { 147 | super(PairWritable.class, true); 148 | } 149 | 150 | @SuppressWarnings("rawtypes") 151 | public int compare(WritableComparable a, WritableComparable b) { 152 | PairWritable o1 = (PairWritable) a; 153 | PairWritable o2 = (PairWritable) b; 154 | return o1.getUid().compareTo(o2.getUid()); 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MRDPUtils.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | 6 | public class MRDPUtils { 7 | public static Map transformXmlToMap(String xml) { 8 | Map map = new HashMap(); 9 | try { 10 | 
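			// Presumably each input line is a single self-closing XML element in the style of
			// the StackOverflow comment dumps, e.g. <row Id="1" Text="..." /> (an assumption
			// about resources/comments.xml): substring(5, length-3) strips the "<row " prefix
			// and the " />" suffix, the split on '"' then alternates attribute names and
			// values, and the trailing '=' is trimmed from each name before it is stored.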
String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\""); 11 | for (int i = 0; i < tokens.length - 1; i += 2) { 12 | String key = tokens[i].trim(); 13 | String val = tokens[i + 1]; 14 | map.put(key.substring(0, key.length() - 1), val); 15 | } 16 | } catch (StringIndexOutOfBoundsException e) { 17 | System.err.println(xml); 18 | } 19 | return map; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevJob.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class MedianStdDevJob { 12 | public static void main(String[] args) throws Exception { 13 | //1.设置HDFS配置信息 14 | String namenode_ip = "192.168.17.10"; 15 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 16 | Configuration conf = new Configuration(); 17 | conf.set("fs.defaultFS", hdfs); 18 | conf.set("mapreduce.app-submission.cross-platform", "true"); 19 | 20 | //2.设置MapReduce作业配置信息 21 | String jobName = "MedianStdDevJob"; //作业名称 22 | Job job = Job.getInstance(conf, jobName); 23 | job.setJarByClass(MedianStdDevJob.class); //指定运行时作业类 24 | job.setJar("export\\MedianStdDevJob.jar"); //指定本地jar包 25 | job.setMapperClass(MedianStdDevMapper.class); //指定Mapper类 26 | job.setMapOutputKeyClass(IntWritable.class); //设置Mapper输出Key类型 27 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 28 | job.setReducerClass(MedianStdDevReducer.class); //指定Reducer类 29 | job.setOutputKeyClass(IntWritable.class); //设置Reduce输出Key类型 30 | job.setOutputValueClass(MedianStdDevTuple.class); //设置Reduce输出Value类型 31 | 32 | //3.设置作业输入和输出路径 33 | String dataDir = "/expr/medianstddev/data"; //实验数据目录 34 | String outputDir = "/expr/medianstddev/output"; //实验输出目录 35 | Path inPath = new Path(hdfs + dataDir); 36 | Path outPath = new Path(hdfs + outputDir); 37 | FileInputFormat.addInputPath(job, inPath); 38 | FileOutputFormat.setOutputPath(job, outPath); 39 | FileSystem fs = FileSystem.get(conf); 40 | if(fs.exists(outPath)) { 41 | fs.delete(outPath, true); 42 | } 43 | 44 | //4.运行作业 45 | System.out.println("Job: " + jobName + " is running..."); 46 | if(job.waitForCompletion(true)) { 47 | System.out.println("success!"); 48 | System.exit(0); 49 | } else { 50 | System.out.println("failed!"); 51 | System.exit(1); 52 | } 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevMapper.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Calendar; 7 | import java.util.Date; 8 | import java.util.Map; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | 13 | public class MedianStdDevMapper extends Mapper { 14 | private IntWritable outHour= new IntWritable(); 15 | private IntWritable 
outCommentLength= new IntWritable(); 16 | private final static SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); 17 | 18 | @SuppressWarnings("deprecation") 19 | @Override 20 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 21 | Map map = MRDPUtils.transformXmlToMap(value.toString()); 22 | String strDate = map.get("CreationDate"); //获取评论日期 23 | String text = map.get("Text"); //获取评论内容 24 | if (strDate == null || text == null) { 25 | return; 26 | } 27 | try { 28 | Date creationDate = frmt.parse(strDate); //转换日期格式 29 | outHour.set(creationDate.getHours()); //从日期中获取小时值 30 | outCommentLength.set(text.length()); //设置评论内容的长度 31 | context.write(outHour, outCommentLength); //将小时和评论长度作为Map输出 32 | } catch (ParseException e) { 33 | System.err.println(e.getMessage()); 34 | return; 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevReducer.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class MedianStdDevReducer extends Reducer { 10 | private MedianStdDevTuple result = new MedianStdDevTuple(); //记录评论长度中位数和标准差 11 | private ArrayList commentLengths = new ArrayList(); //用列表记录每条评论的长度 12 | 13 | @Override 14 | public void reduce(IntWritable key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | float sum = 0; //评论长度总和 17 | float count = 0; //评论数 18 | commentLengths.clear(); //清空评论数列表 19 | result.setStddev(0); //标准差默认值设为0 20 | for (IntWritable val : values) { 21 | commentLengths.add((float) val.get()); //将评论长度保存到列表 22 | sum += val.get(); //计算评论长度总和 23 | count++; //评论总数 24 | } 25 | 26 | //计算中位数:集合数量如为偶数,取中间两位的均值;如为奇数,则直接取中值 27 | Collections.sort(commentLengths); //对集合中评论字数排序 28 | if (count % 2 == 0) {//偶 29 | result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths.get((int) count / 2)) / 2.0f); 30 | } else {//奇 31 | result.setMedian(commentLengths.get((int) count / 2)); 32 | } 33 | 34 | //计算标准差 35 | float mean = sum / count; //计算评论的平均字数 36 | float sumOfSquares = 0.0f; //平方和 37 | for (Float f : commentLengths) { 38 | sumOfSquares += (f - mean) * (f - mean); 39 | } 40 | result.setStddev((float) Math.sqrt(sumOfSquares / (count - 1))); //计算标准差 41 | context.write(key, result); 42 | } 43 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevTuple.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | public class MedianStdDevTuple implements Writable { 9 | private float median = 0f; 10 | private float stddev = 0f; 11 | 12 | public float getMedian() { 13 | return median; 14 | } 15 | 16 | public void setMedian(float median) { 17 | this.median = median; 18 | } 19 | 20 | public float getStddev() { 21 | return stddev; 22 | } 23 | 24 | public void setStddev(float stddev) { 25 | this.stddev = stddev; 26 | } 27 | 28 | @Override 29 | public void readFields(DataInput 
in) throws IOException { 30 | median = in.readFloat(); 31 | stddev = in.readFloat(); 32 | } 33 | 34 | @Override 35 | public void write(DataOutput out) throws IOException { 36 | out.writeFloat(median); 37 | out.writeFloat(stddev); 38 | } 39 | 40 | @Override 41 | public String toString() { 42 | return median + "\t" + stddev; 43 | } 44 | 45 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MRDPUtils.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | 6 | public class MRDPUtils { 7 | public static Map transformXmlToMap(String xml) { 8 | Map map = new HashMap(); 9 | try { 10 | String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\""); 11 | for (int i = 0; i < tokens.length - 1; i += 2) { 12 | String key = tokens[i].trim(); 13 | String val = tokens[i + 1]; 14 | map.put(key.substring(0, key.length() - 1), val); 15 | } 16 | } catch (StringIndexOutOfBoundsException e) { 17 | System.err.println(xml); 18 | } 19 | return map; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountJob.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class MinMaxCountJob { 12 | public static void main(String[] args) throws Exception { 13 | //1.设置HDFS配置信息 14 | String namenode_ip = "192.168.17.10"; 15 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 16 | Configuration conf = new Configuration(); 17 | conf.set("fs.defaultFS", hdfs); 18 | conf.set("mapreduce.app-submission.cross-platform", "true"); 19 | 20 | //2.设置MapReduce作业配置信息 21 | String jobName = "MinMaxCountJob"; //作业名称 22 | Job job = Job.getInstance(conf, jobName); 23 | job.setJarByClass(MinMaxCountJob.class); //指定运行时作业类 24 | job.setJar("export\\MinMaxCountJob.jar"); //指定本地jar包 25 | job.setMapperClass(MinMaxCountMapper.class); //指定Mapper类 26 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 27 | job.setMapOutputValueClass(MinMaxCountTuple.class); //设置Mapper输出Value类型 28 | job.setCombinerClass(MinMaxCountReducer.class); //指定Combiner类 29 | job.setReducerClass(MinMaxCountReducer.class); //指定Reducer类 30 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 31 | job.setOutputValueClass(MinMaxCountTuple.class); //设置Reduce输出Value类型 32 | 33 | //3.设置作业输入和输出路径 34 | String dataDir = "/expr/minmaxcount/data"; //实验数据目录 35 | String outputDir = "/expr/minmaxcount/output"; //实验输出目录 36 | Path inPath = new Path(hdfs + dataDir); 37 | Path outPath = new Path(hdfs + outputDir); 38 | FileInputFormat.addInputPath(job, inPath); 39 | FileOutputFormat.setOutputPath(job, outPath); 40 | FileSystem fs = FileSystem.get(conf); 41 | if(fs.exists(outPath)) { 42 | fs.delete(outPath, true); 43 | } 44 | 45 | //4.运行作业 46 | System.out.println("Job: " + jobName + " is running..."); 47 | if(job.waitForCompletion(true)) { 48 | System.out.println("success!"); 49 | System.exit(0); 50 | } else { 
51 | System.out.println("failed!"); 52 | System.exit(1); 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountMapper.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Date; 7 | import java.util.Map; 8 | 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | 12 | public class MinMaxCountMapper extends Mapper { 13 | private Text outUserId = new Text(); //用户ID 14 | private MinMaxCountTuple outTuple = new MinMaxCountTuple(); //日期最小值、日期最大值、评论数的组合 15 | private final SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); 16 | 17 | @Override 18 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 19 | Map map = MRDPUtils.transformXmlToMap(value.toString()); //分解每条评论,保存每对KV到Map对象 20 | String userId = map.get("UserId"); //从Map对象中获取用户ID 21 | String strDate = map.get("CreationDate"); //从Map对象中获取评论时间 22 | 23 | if (strDate == null || userId == null) { //过滤掉不含统计数据的记录 24 | return; 25 | } 26 | try { 27 | Date creationDate = frmt.parse(strDate); 28 | // 因为还没有MinMax,只有把当前数据中日期作为MinMax 29 | outTuple.setMin(creationDate); 30 | outTuple.setMax(creationDate); 31 | outTuple.setCount(1); 32 | outUserId.set(userId); 33 | context.write(outUserId, outTuple); 34 | } catch (ParseException e) { 35 | System.err.println(e.getMessage()); 36 | return; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountReducer.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class MinMaxCountReducer extends Reducer { 8 | private MinMaxCountTuple result = new MinMaxCountTuple(); 9 | 10 | @Override 11 | public void reduce(Text key, Iterable values, Context context) 12 | throws IOException, InterruptedException { 13 | result.setMin(null); 14 | result.setMax(null); 15 | int sum = 0; 16 | for (MinMaxCountTuple val : values) { 17 | if (result.getMin() == null || val.getMin().compareTo(result.getMin()) < 0) { 18 | result.setMin(val.getMin()); 19 | } 20 | if (result.getMax() == null || val.getMax().compareTo(result.getMax()) > 0) { 21 | result.setMax(val.getMax()); 22 | } 23 | sum += val.getCount(); 24 | } 25 | result.setCount(sum); 26 | context.write(key, result); 27 | } 28 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountTuple.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Date; 8 | 9 | import org.apache.hadoop.io.Writable; 10 | 11 | public class MinMaxCountTuple implements Writable { 12 | 13 | private Date min = new Date(); //第一次评论时间 14 | private Date max = new Date(); //最后一次评论时间 15 | private long count = 0; //评论总数 16 | private 
final SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); 17 | 18 | public Date getMin() { 19 | return min; 20 | } 21 | 22 | public void setMin(Date min) { 23 | this.min = min; 24 | } 25 | 26 | public Date getMax() { 27 | return max; 28 | } 29 | 30 | public void setMax(Date max) { 31 | this.max = max; 32 | } 33 | 34 | public long getCount() { 35 | return count; 36 | } 37 | 38 | public void setCount(long count) { 39 | this.count = count; 40 | } 41 | 42 | @Override 43 | public void readFields(DataInput in) throws IOException { 44 | min = new Date(in.readLong()); 45 | max = new Date(in.readLong()); 46 | count = in.readLong(); 47 | } 48 | 49 | @Override 50 | public void write(DataOutput out) throws IOException { 51 | out.writeLong(min.getTime()); 52 | out.writeLong(max.getTime()); 53 | out.writeLong(count); 54 | } 55 | 56 | @Override 57 | public String toString() { 58 | return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count; 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/output/CompressOutput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.compress.GzipCodec; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class CompressOutput { 17 | 18 | public static class CompressOutputMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); //按空格分割输入 24 | Text date = new Text(strs[0]); //获取日期 25 | context.write(date, one); //将日期和常数1作为Map输出 26 | } 27 | } 28 | 29 | public static class CompressOutputReducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | int sum = 0; 33 | for (IntWritable val : values) { 34 | sum += val.get(); 35 | } 36 | context.write(key, new IntWritable(sum)); 37 | } 38 | } 39 | 40 | public static void main(String[] args) throws Exception { 41 | //1.设置HDFS配置信息 42 | String namenode_ip = "192.168.17.10"; 43 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 44 | Configuration conf = new Configuration(); 45 | conf.set("fs.defaultFS", hdfs); 46 | conf.set("mapreduce.app-submission.cross-platform", "true"); 47 | 48 | //2.设置MapReduce作业配置信息 49 | String jobName = "CompressOutput"; //作业名称 50 | Job job = Job.getInstance(conf, jobName); 51 | job.setJarByClass(CompressOutput.class); //指定运行时作业类 52 | job.setJar("export\\CompressOutput.jar"); //指定本地jar包 53 | job.setMapperClass(CompressOutputMapper.class); //指定Mapper类 54 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 55 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 56 | job.setReducerClass(CompressOutputReducer.class); //指定Reducer类 57 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 58 | 
job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 59 | 60 | //设置对输出结果进行压缩,指定压缩编码方式 61 | FileOutputFormat.setCompressOutput(job, true); 62 | FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); 63 | 64 | //3.设置作业输入和输出路径 65 | String dataDir = "/expr/compress/data"; //实验数据目录 66 | String outputDir = "/expr/compress/output"; //实验输出目录 67 | Path inPath = new Path(hdfs + dataDir); 68 | Path outPath = new Path(hdfs + outputDir); 69 | FileInputFormat.addInputPath(job, inPath); 70 | FileOutputFormat.setOutputPath(job, outPath); 71 | FileSystem fs = FileSystem.get(conf); 72 | if(fs.exists(outPath)) { 73 | fs.delete(outPath, true); 74 | } 75 | 76 | //4.运行作业 77 | System.out.println("Job: " + jobName + " is running..."); 78 | if(job.waitForCompletion(true)) { 79 | System.out.println("success!"); 80 | System.exit(0); 81 | } else { 82 | System.out.println("failed!"); 83 | System.exit(1); 84 | } 85 | } 86 | 87 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/output/MultOutput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 16 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 18 | 19 | public class MultOutput { 20 | 21 | public static class MultOutputMapper extends Mapper { 22 | private final static IntWritable one = new IntWritable(1); 23 | 24 | public void map(Object key, Text value, Context context ) 25 | throws IOException, InterruptedException { 26 | String[] strs = value.toString().split(" "); //按空格分割输入 27 | Text date = new Text(strs[0]); //获取日期 28 | context.write(date, one); //将日期和常数1作为Map输出 29 | } 30 | } 31 | 32 | public static class MultOutputReducer extends Reducer { 33 | //定义MultiOutputs对象 34 | private MultipleOutputs mos; 35 | 36 | //初始化MultiOutputs对象 37 | protected void setup(Context context) throws IOException, InterruptedException { 38 | mos = new MultipleOutputs(context); 39 | } 40 | 41 | //关闭MultiOutputs对象 42 | protected void cleanup(Context context) throws IOException, InterruptedException { 43 | mos.close(); 44 | } 45 | 46 | public void reduce(Text key, Iterable values, Context context) 47 | throws IOException, InterruptedException { 48 | int sum = 0; 49 | for (IntWritable val : values) { 50 | sum += val.get(); 51 | } 52 | //context.write(key, new IntWritable(sum)); 53 | 54 | //使用MultiOutputs对象替代Context对象输出 55 | //1. 输出到不同文件(格式、文件名) 56 | if (key.toString().startsWith("2015")) 57 | mos.write("f2015", key, new IntWritable(sum)); 58 | else if (key.toString().startsWith("2016")) 59 | mos.write("f2016", key, new IntWritable(sum)); 60 | else 61 | mos.write("f2017", key, new IntWritable(sum)); 62 | 63 | //2. 
输出到以年分类的子目录,只需指定输出子目录+文件名,不需要在驱动类中定义文件名 64 | //mos.write(key, new IntWritable(sum), key.toString().substring(0,4)+"/result"); 65 | 66 | } 67 | } 68 | 69 | public static void main(String[] args) throws Exception { 70 | //1.设置HDFS配置信息 71 | String namenode_ip = "192.168.17.10"; 72 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 73 | Configuration conf = new Configuration(); 74 | conf.set("fs.defaultFS", hdfs); 75 | conf.set("mapreduce.app-submission.cross-platform", "true"); 76 | 77 | //2.设置MapReduce作业配置信息 78 | String jobName = "MultOutput"; //作业名称 79 | Job job = Job.getInstance(conf, jobName); 80 | job.setJarByClass(MultOutput.class); //指定运行时作业类 81 | job.setJar("export\\MultOutput.jar"); //指定本地jar包 82 | job.setMapperClass(MultOutputMapper.class); //指定Mapper类 83 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 84 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 85 | job.setReducerClass(MultOutputReducer.class); //指定Reducer类 86 | //job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 87 | //job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 88 | 89 | //定义多文件输出的文件名、输出格式、键类型、值类型 90 | MultipleOutputs.addNamedOutput(job, "f2015", TextOutputFormat.class, Text.class, IntWritable.class); 91 | MultipleOutputs.addNamedOutput(job, "f2016", SequenceFileOutputFormat.class, Text.class, IntWritable.class); 92 | MultipleOutputs.addNamedOutput(job, "f2017", MapFileOutputFormat.class, Text.class, IntWritable.class); 93 | 94 | //3.设置作业输入和输出路径 95 | String dataDir = "/expr/multoutput/data"; //实验数据目录 96 | String outputDir = "/expr/multoutput/output"; //实验输出目录 97 | Path inPath = new Path(hdfs + dataDir); 98 | Path outPath = new Path(hdfs + outputDir); 99 | FileInputFormat.addInputPath(job, inPath); 100 | FileOutputFormat.setOutputPath(job, outPath); 101 | FileSystem fs = FileSystem.get(conf); 102 | if(fs.exists(outPath)) { 103 | fs.delete(outPath, true); 104 | } 105 | 106 | //4.运行作业 107 | System.out.println("Job: " + jobName + " is running..."); 108 | if(job.waitForCompletion(true)) { 109 | System.out.println("success!"); 110 | System.exit(0); 111 | } else { 112 | System.out.println("failed!"); 113 | System.exit(1); 114 | } 115 | } 116 | 117 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/peoplerank/People.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.peoplerank; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | import org.apache.commons.lang.StringUtils; 6 | 7 | public class People { 8 | private double peopleRank = 1.0; //存储 PR值,初值默认1.0 9 | private String[] attentionPeoples; //关注的人 10 | public static final char fieldSeparator = '\t'; //多处使用分隔符\t,定义为常量 11 | 12 | public double getPeopleRank() { 13 | return peopleRank; 14 | } 15 | 16 | public People setPeopleRank(double pageRank) { 17 | this.peopleRank = pageRank; 18 | return this; 19 | } 20 | 21 | public String[] getAttentionPeoples() { 22 | return attentionPeoples; 23 | } 24 | 25 | public People setAttentionPeoples(String[] attentionPeoples) { 26 | this.attentionPeoples = attentionPeoples; 27 | return this; 28 | } 29 | 30 | //判断是否包含关注用户 31 | public boolean containsAttentionPeoples() { 32 | return attentionPeoples != null && attentionPeoples.length > 0; 33 | } 34 | 35 | @Override 36 | //People对象转成字符串 37 | public String toString() { 38 | StringBuilder sb = new StringBuilder(); 39 | sb.append(peopleRank); 40 | if (attentionPeoples != null) { 41 | 
sb.append(fieldSeparator).append(StringUtils.join(attentionPeoples, fieldSeparator)); 42 | } 43 | return sb.toString(); //返回String格式:"PeopleRand值 u1 u2..." 44 | } 45 | 46 | //字符串转成People对象 47 | public static People fromMR(String str) throws IOException { //参数String格式:"PeopleRand值 u1 u2..." 48 | People people = new People(); 49 | String[] strs = StringUtils.splitPreserveAllTokens(str, fieldSeparator); //将字符串按分隔符分割成字符串数组 50 | people.setPeopleRank(Double.valueOf(strs[0])); //处理第一个元素 51 | if (strs.length > 1) {// 设置关注的人,从strs下标为1的位置开始(因为传进来类似"1.0 b c d"的字符串) 52 | people.setAttentionPeoples(Arrays.copyOfRange(strs, 1, strs.length)); //处理其它元素 53 | } 54 | return people; //返回People对象 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/peoplerank/PeopleRank.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.peoplerank; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class PeopleRank { 16 | 17 | public static class PeopleRankMapper extends Mapper { 18 | @Override 19 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | String[] strs = value.toString().split(","); 21 | context.write(new Text(strs[0]), new Text(strs[1])); 22 | } 23 | } 24 | 25 | public static class PeopleRankReducer extends Reducer { 26 | @Override 27 | protected void reduce(Text key, Iterable values, Context context) 28 | throws IOException, InterruptedException { 29 | StringBuilder sb = new StringBuilder(); 30 | for (Text v : values) { 31 | sb.append("\t" + v.toString()); 32 | } 33 | context.write(key, new Text(sb.toString().replaceFirst("\t", ""))); //将开头的制表符去掉 34 | } 35 | } 36 | 37 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 38 | String namenode_ip = "192.168.17.10"; 39 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 40 | Configuration conf = new Configuration(); 41 | conf.set("fs.defaultFS", hdfs); 42 | conf.set("mapreduce.app-submission.cross-platform", "true"); 43 | 44 | String jobName = "PeopleRank"; 45 | Job job = Job.getInstance(conf, jobName); 46 | job.setJarByClass(PeopleRank.class); 47 | job.setJar("export\\PeopleRank.jar"); 48 | job.setMapperClass(PeopleRankMapper.class); 49 | job.setMapOutputKeyClass(Text.class); 50 | job.setMapOutputValueClass(Text.class); 51 | job.setReducerClass(PeopleRankReducer.class); 52 | job.setOutputKeyClass(Text.class); 53 | job.setOutputValueClass(Text.class); 54 | 55 | String dataDir = "/expr/peoplerank/data"; 56 | String outputDir = "/expr/peoplerank/output/adjacent"; 57 | Path inPath = new Path(hdfs + dataDir); 58 | Path outPath = new Path(hdfs + outputDir); 59 | FileInputFormat.addInputPath(job, inPath); 60 | FileOutputFormat.setOutputPath(job, outPath); 61 | FileSystem fs = FileSystem.get(conf); 62 | if(fs.exists(outPath)) { 63 | fs.delete(outPath, true); 64 | } 65 | 66 | System.out.println( "Job: " + jobName + " is 
running..."); 67 | if(job.waitForCompletion(true)) { 68 | System.out.println("success!"); 69 | System.exit(0); 70 | } else { 71 | System.out.println("failed!"); 72 | System.exit(1); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/topten/TopTenJob.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.topten; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | public class TopTenJob { 13 | public static void main(String[] args) throws Exception { 14 | String namenode_ip = "192.168.17.10"; 15 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 16 | Configuration conf = new Configuration(); 17 | conf.set("fs.defaultFS", hdfs); 18 | conf.set("mapreduce.app-submission.cross-platform", "true"); 19 | 20 | String jobName = "TopTenJob"; 21 | Job job = Job.getInstance(conf, jobName); 22 | job.setJarByClass(TopTenJob.class); 23 | job.setJar("export\\TopTen.jar"); 24 | job.setMapperClass(TopTenMapper.class); 25 | job.setMapOutputKeyClass(NullWritable.class); 26 | job.setMapOutputValueClass(Text.class); 27 | job.setReducerClass(TopTenReducer.class); 28 | job.setOutputKeyClass(NullWritable.class); 29 | job.setOutputValueClass(Text.class); 30 | job.setNumReduceTasks(1); //计算最终TopN,只能运行一个Reduce任务 31 | 32 | String dataDir = "/expr/topten/data"; 33 | String outputDir = "/expr/topten/output"; 34 | Path inPath = new Path(hdfs + dataDir); 35 | Path outPath = new Path(hdfs + outputDir); 36 | FileInputFormat.addInputPath(job, inPath); 37 | FileOutputFormat.setOutputPath(job, outPath); 38 | FileSystem fs = FileSystem.get(conf); 39 | if(fs.exists(outPath)) { 40 | fs.delete(outPath, true); 41 | } 42 | 43 | System.out.println( "Job: " + jobName + " is running..."); 44 | if(job.waitForCompletion(true)) { 45 | System.out.println("success!"); 46 | System.exit(0); 47 | } else { 48 | System.out.println("failed!"); 49 | System.exit(1); 50 | } 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/topten/TopTenMapper.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.topten; 2 | 3 | import java.io.IOException; 4 | import java.util.TreeMap; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class TopTenMapper extends Mapper { 10 | private TreeMap visittimesMap = new TreeMap(); //TreeMap是有序KV集合 11 | 12 | @Override 13 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 14 | if (value == null) { 15 | return; 16 | } 17 | String[] strs = value.toString().split(" "); 18 | String tId = strs[0]; 19 | String tVisittimes = strs[1]; 20 | if (tId == null || tVisittimes == null) { 21 | return; 22 | } 23 | visittimesMap.put(Integer.parseInt(tVisittimes), new Text(value)); //将访问次数(KEY)和行记录(VALUE)放入TreeMap中自动排序 24 | if (visittimesMap.size() > 10) { //如果TreeMap中元素超过N个,将第一个(KEY最小的)元素删除 25 | 
visittimesMap.remove(visittimesMap.firstKey()); 26 | } 27 | } 28 | 29 | @Override 30 | protected void cleanup(Context context) throws IOException, InterruptedException { 31 | for (Text t : visittimesMap.values()) { 32 | context.write(NullWritable.get(), t); //在clean()中完成Map输出 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/topten/TopTenReducer.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.topten; 2 | 3 | import java.io.IOException; 4 | import java.util.NavigableMap; 5 | import java.util.TreeMap; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | public class TopTenReducer extends Reducer { 11 | private TreeMap visittimesMap = new TreeMap(); 12 | 13 | @Override 14 | public void reduce(NullWritable key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | for (Text val : values) { 17 | String[] strs = val.toString().split(" "); 18 | visittimesMap.put(Integer.parseInt(strs[1]), new Text(val)); //将访问次数(KEY)和行记录(VALUE)放入TreeMap中自动排序 19 | if (visittimesMap.size() > 10) { //如果TreeMap中元素超过N个,将第一个(KEY最小的)元素删除 20 | visittimesMap.remove(visittimesMap.firstKey()); 21 | } 22 | } 23 | } 24 | 25 | public void cleanup(Context context) throws IOException, InterruptedException { 26 | //将TreeMap反序处理,降序输出top10 27 | NavigableMap reverMap = visittimesMap.descendingMap(); //获得TreeMap反序 28 | for (Text t : reverMap.values()) { 29 | context.write(NullWritable.get(), t); 30 | } 31 | } 32 | } 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/main/java/weblog/FlowCount.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Locale; 7 | import java.util.regex.Pattern; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.LongWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | //1. 
计算网站全天产生的流量 22 | public class FlowCount { 23 | 24 | public static class FlowCountMapper extends Mapper<Object, Text, Text, IntWritable> { 25 | private SimpleDateFormat SDFIN = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); 26 | private SimpleDateFormat SDFOUT = new SimpleDateFormat("yyyy-MM-dd"); 27 | private Text date = new Text(); //Map输出key 28 | private IntWritable flow = new IntWritable(); //Map输出value 29 | 30 | public void map(Object key, Text value, Context context ) 31 | throws IOException, InterruptedException { 32 | String[] strs = value.toString().split(" "); 33 | String strFlow = strs[strs.length-1]; //获取流量字符串 34 | String strTime = strs[3].substring(1); //获取时间字符串 35 | String strDate = null; //定义日期字符串 36 | try { 37 | strDate = SDFOUT.format(SDFIN.parse(strTime)); //时间格式转成日期格式 38 | } catch (ParseException e) { 39 | e.printStackTrace(); 40 | } 41 | 42 | //利用正则表达式判断strFlow是否是数字 43 | if ( Pattern.compile("[0-9]+").matcher(strFlow).matches() ) { 44 | flow.set(Integer.parseInt(strFlow)); 45 | date.set(strDate); 46 | context.write(date, flow); 47 | } 48 | } 49 | } 50 | 51 | public static class FlowCountReducer extends Reducer<Text, IntWritable, Text, LongWritable> { 52 | public void reduce(Text key, Iterable<IntWritable> values, Context context) 53 | throws IOException, InterruptedException { 54 | long sum = 0; 55 | for (IntWritable val : values) { 56 | sum += val.get(); 57 | } 58 | context.write(key, new LongWritable(sum)); 59 | /* 60 | for (IntWritable val : values) { 61 | context.write(key, val); 62 | } 63 | */ 64 | } 65 | } 66 | 67 | public static void main(String[] args) throws Exception { 68 | //1.设置HDFS配置信息 69 | String namenode_ip = "192.168.17.10"; 70 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 71 | Configuration conf = new Configuration(); 72 | conf.set("fs.defaultFS", hdfs); 73 | conf.set("mapreduce.app-submission.cross-platform", "true"); 74 | 75 | //2.设置MapReduce作业配置信息 76 | String jobName = "FlowCount"; //作业名称 77 | Job job = Job.getInstance(conf, jobName); 78 | job.setJarByClass(FlowCount.class); //指定运行时作业类 79 | job.setJar("export\\FlowCount.jar"); //指定本地jar包 80 | job.setMapperClass(FlowCountMapper.class); //指定Mapper类 81 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 82 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 83 | job.setReducerClass(FlowCountReducer.class); //指定Reducer类 84 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 85 | job.setOutputValueClass(LongWritable.class); //设置Reduce输出Value类型 86 | 87 | //3.设置作业输入和输出路径 88 | String dataDir = "/expr/weblog/data"; //实验数据目录 89 | String outputDir = "/expr/weblog/output1"; //实验输出目录 90 | Path inPath = new Path(hdfs + dataDir); 91 | Path outPath = new Path(hdfs + outputDir); 92 | FileInputFormat.addInputPath(job, inPath); 93 | FileOutputFormat.setOutputPath(job, outPath); 94 | FileSystem fs = FileSystem.get(conf); 95 | if(fs.exists(outPath)) { 96 | fs.delete(outPath, true); 97 | } 98 | 99 | //4.运行作业 100 | System.out.println("Job: " + jobName + " is running..."); 101 | if(job.waitForCompletion(true)) { 102 | System.out.println("success!"); 103 | System.exit(0); 104 | } else { 105 | System.out.println("failed!"); 106 | System.exit(1); 107 | } 108 | } 109 | 110 | } -------------------------------------------------------------------------------- /src/main/java/weblog/IPCount.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import
java.text.SimpleDateFormat; 8 | import java.util.Locale; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.FileSystem; 12 | import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.io.WritableComparable; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.Reducer; 19 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 21 | 22 | //4. 计算每天访问该网站的独立IP数 23 | public class IPCount { 24 | 25 | public enum IpCounter { 26 | ipnum1, ipnum2 27 | } 28 | 29 | public static class IPCountMapper extends Mapper { 30 | private SimpleDateFormat SDFIN = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); 31 | private SimpleDateFormat SDFOUT = new SimpleDateFormat("yyyy-MM-dd"); 32 | private DayAndIp k = new DayAndIp(); //Map输出Key:日期+IP 33 | private final static IntWritable one = new IntWritable(1); //Map输出Value 34 | 35 | public void map(Object key, Text value, Context context ) 36 | throws IOException, InterruptedException { 37 | String[] strs = value.toString().split(" "); 38 | String strIP = strs[0]; //获取IP字符串 39 | String strTime = strs[3].substring(1); //获取时间字符串 40 | String strDate = null; //定义日期字符串 41 | try { 42 | strDate = SDFOUT.format(SDFIN.parse(strTime)); //时间格式转成日期格式 43 | } catch (ParseException e) { 44 | e.printStackTrace(); 45 | } 46 | k.setDate(strDate); 47 | k.setIp(strIP); 48 | context.write(k, one); 49 | } 50 | } 51 | 52 | public static class IPCountReducer extends Reducer { 53 | public void reduce(DayAndIp key, Iterable values, Context context) 54 | throws IOException, InterruptedException { 55 | int sum = 0; 56 | for (IntWritable val : values) { 57 | sum += val.get(); 58 | } 59 | context.write(key, new IntWritable(sum)); 60 | String[] strs = key.toString().split("\t"); 61 | if ( strs[0].equals("2013-05-30") ) { 62 | context.getCounter(IpCounter.ipnum1).increment(1); //使用计数器统计某天访问的IP数 63 | } else { 64 | context.getCounter(IpCounter.ipnum2).increment(1); 65 | } 66 | } 67 | } 68 | 69 | public static void main(String[] args) throws Exception { 70 | //1.设置HDFS配置信息 71 | String namenode_ip = "192.168.17.10"; 72 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 73 | Configuration conf = new Configuration(); 74 | conf.set("fs.defaultFS", hdfs); 75 | conf.set("mapreduce.app-submission.cross-platform", "true"); 76 | 77 | //2.设置MapReduce作业配置信息 78 | String jobName = "IPCount"; //作业名称 79 | Job job = Job.getInstance(conf, jobName); 80 | job.setJarByClass(IPCount.class); //指定运行时作业类 81 | job.setJar("export\\IPCount.jar"); //指定本地jar包 82 | job.setMapperClass(IPCountMapper.class); //指定Mapper类 83 | job.setMapOutputKeyClass(DayAndIp.class); //设置Mapper输出Key类型 84 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 85 | job.setReducerClass(IPCountReducer.class); //指定Reducer类 86 | job.setOutputKeyClass(DayAndIp.class); //设置Reduce输出Key类型 87 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 88 | 89 | //3.设置作业输入和输出路径 90 | String dataDir = "/expr/weblog/data"; //实验数据目录 91 | String outputDir = "/expr/weblog/output4"; //实验输出目录 92 | Path inPath = new Path(hdfs + dataDir); 93 | Path outPath = new Path(hdfs + outputDir); 94 | FileInputFormat.addInputPath(job, inPath); 95 | FileOutputFormat.setOutputPath(job, outPath); 96 | FileSystem fs = FileSystem.get(conf); 97 | if(fs.exists(outPath)) { 98 | 
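// FileOutputFormat fails a job whose output directory already exists, so any previous result is removed first (every driver class in this repository follows the same pattern).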
fs.delete(outPath, true); 99 | } 100 | 101 | //4.运行作业 102 | System.out.println("Job: " + jobName + " is running..."); 103 | if(job.waitForCompletion(true)) { 104 | System.out.println("success!"); 105 | System.exit(0); 106 | } else { 107 | System.out.println("failed!"); 108 | System.exit(1); 109 | } 110 | } 111 | 112 | //自定义KEY类,封装日期和IP 113 | public static class DayAndIp implements WritableComparable { 114 | private String date; 115 | private String ip; 116 | 117 | public String getDate() { 118 | return date; 119 | } 120 | public void setDate(String date) { 121 | this.date = date; 122 | } 123 | public String getIp() { 124 | return ip; 125 | } 126 | public void setIp(String ip) { 127 | this.ip = ip; 128 | } 129 | 130 | @Override 131 | public void write(DataOutput out) throws IOException { 132 | out.writeUTF(date); 133 | out.writeUTF(ip); 134 | } 135 | 136 | @Override 137 | public void readFields(DataInput in) throws IOException { 138 | date = in.readUTF(); 139 | ip = in.readUTF(); 140 | } 141 | 142 | @Override 143 | public int compareTo(DayAndIp o) { 144 | int r = date.compareTo(o.getDate()); 145 | if ( r == 0 ) { 146 | return ip.compareTo(o.getIp()); 147 | } 148 | return r; 149 | } 150 | 151 | @Override 152 | public String toString() { 153 | return date + "\t" + ip; 154 | } 155 | } 156 | } -------------------------------------------------------------------------------- /src/main/java/weblog/Missed.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 15 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 16 | 17 | //2. 
将所有状态为404的记录输出到文件:missed 18 | public class Missed { 19 | 20 | public static class MissedMapper extends Mapper { 21 | private Text k = new Text(); //Map输出key 22 | 23 | public void map(Object key, Text value, Context context ) 24 | throws IOException, InterruptedException { 25 | String[] strs = value.toString().split(" "); 26 | String status = strs[strs.length-2]; //获取状态码 27 | if (status.equals("404")) { 28 | //context.write(value, NullWritable.get()); 29 | String reqResource = strs[6]; //获取被请求的资源 30 | int index = reqResource.indexOf("?"); 31 | if ( index > 0 ) { 32 | reqResource = reqResource.substring(0, index); //截取问号前的请求资源名称(去掉请求参数) 33 | } 34 | k.set(reqResource); 35 | context.write(k, NullWritable.get()); 36 | } 37 | } 38 | } 39 | 40 | public static class MissedReducer extends Reducer { 41 | //定义MultiOutputs对象 42 | private MultipleOutputs mos; 43 | 44 | //初始化MultiOutputs对象 45 | protected void setup(Context context) throws IOException, InterruptedException { 46 | mos = new MultipleOutputs(context); 47 | } 48 | 49 | //关闭MultiOutputs对象 50 | protected void cleanup(Context context) throws IOException, InterruptedException { 51 | mos.close(); 52 | } 53 | 54 | public void reduce(Text key, Iterable values, Context context) 55 | throws IOException, InterruptedException { 56 | mos.write("missed", key, NullWritable.get()); 57 | } 58 | } 59 | 60 | public static void main(String[] args) throws Exception { 61 | //1.设置HDFS配置信息 62 | String namenode_ip = "192.168.17.10"; 63 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 64 | Configuration conf = new Configuration(); 65 | conf.set("fs.defaultFS", hdfs); 66 | conf.set("mapreduce.app-submission.cross-platform", "true"); 67 | 68 | //2.设置MapReduce作业配置信息 69 | String jobName = "Missed"; //作业名称 70 | Job job = Job.getInstance(conf, jobName); 71 | job.setJarByClass(Missed.class); //指定运行时作业类 72 | job.setJar("export\\Missed.jar"); //指定本地jar包 73 | job.setMapperClass(MissedMapper.class); //指定Mapper类 74 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 75 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 76 | job.setReducerClass(MissedReducer.class); //指定Reducer类 77 | //定义多文件输出的文件名、输出格式、键类型、值类型 78 | MultipleOutputs.addNamedOutput(job, "missed", TextOutputFormat.class, Text.class, NullWritable.class); 79 | 80 | //3.设置作业输入和输出路径 81 | String dataDir = "/expr/weblog/data"; //实验数据目录 82 | String outputDir = "/expr/weblog/output2"; //实验输出目录 83 | Path inPath = new Path(hdfs + dataDir); 84 | Path outPath = new Path(hdfs + outputDir); 85 | FileInputFormat.addInputPath(job, inPath); 86 | FileOutputFormat.setOutputPath(job, outPath); 87 | FileSystem fs = FileSystem.get(conf); 88 | if(fs.exists(outPath)) { 89 | fs.delete(outPath, true); 90 | } 91 | 92 | //4.运行作业 93 | System.out.println("Job: " + jobName + " is running..."); 94 | if(job.waitForCompletion(true)) { 95 | System.out.println("success!"); 96 | System.exit(0); 97 | } else { 98 | System.out.println("failed!"); 99 | System.exit(1); 100 | } 101 | } 102 | 103 | } -------------------------------------------------------------------------------- /src/main/java/weblog/PVMinMax.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.HashMap; 7 | import java.util.Locale; 8 | import java.util.Map; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.FileSystem; 12 
| import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | //5.1 统计网站每分钟的访问量 22 | // 访问量是每一条记录 23 | public class PVMinMax { 24 | 25 | public static class PVMinMaxMapper extends Mapper { 26 | private SimpleDateFormat SDFIN = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); 27 | private SimpleDateFormat SDFOUT = new SimpleDateFormat("yyyy-MM-dd HH:mm"); 28 | private Text minute = new Text(); //Map输出key 29 | private final static IntWritable one = new IntWritable(1); 30 | 31 | public void map(Object key, Text value, Context context ) 32 | throws IOException, InterruptedException { 33 | String[] strs = value.toString().split(" "); 34 | String strTime = strs[3].substring(1); //获取时间字符串 35 | String strMinute = null; 36 | try { 37 | strMinute = SDFOUT.format(SDFIN.parse(strTime)); //时间格式转成日期格式 38 | } catch (ParseException e) { 39 | e.printStackTrace(); 40 | } 41 | minute.set(strMinute); 42 | context.write(minute, one); 43 | } 44 | } 45 | 46 | public static class PVMinMaxReducer extends Reducer { 47 | Map map = new HashMap(); 48 | public void reduce(Text key, Iterable values, Context context) 49 | throws IOException, InterruptedException { 50 | int sum = 0; 51 | for (IntWritable val : values) { 52 | sum += val.get(); 53 | } 54 | context.write(key, new IntWritable(sum)); 55 | } 56 | } 57 | 58 | public static void main(String[] args) throws Exception { 59 | //1.设置HDFS配置信息 60 | String namenode_ip = "192.168.17.10"; 61 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 62 | Configuration conf = new Configuration(); 63 | conf.set("fs.defaultFS", hdfs); 64 | conf.set("mapreduce.app-submission.cross-platform", "true"); 65 | 66 | //2.设置MapReduce作业配置信息 67 | String jobName = "PVMinMax"; //作业名称 68 | Job job = Job.getInstance(conf, jobName); 69 | job.setJarByClass(PVMinMax.class); //指定运行时作业类 70 | job.setJar("export\\PVMinMax.jar"); //指定本地jar包 71 | job.setMapperClass(PVMinMaxMapper.class); //指定Mapper类 72 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 73 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 74 | job.setReducerClass(PVMinMaxReducer.class); //指定Reducer类 75 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 76 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 77 | 78 | //3.设置作业输入和输出路径 79 | String dataDir = "/expr/weblog/data"; //实验数据目录 80 | String outputDir = "/expr/weblog/output5_1"; //实验输出目录 81 | Path inPath = new Path(hdfs + dataDir); 82 | Path outPath = new Path(hdfs + outputDir); 83 | FileInputFormat.addInputPath(job, inPath); 84 | FileOutputFormat.setOutputPath(job, outPath); 85 | FileSystem fs = FileSystem.get(conf); 86 | if(fs.exists(outPath)) { 87 | fs.delete(outPath, true); 88 | } 89 | 90 | //4.运行作业 91 | System.out.println("Job: " + jobName + " is running..."); 92 | if(job.waitForCompletion(true)) { 93 | System.out.println("success!"); 94 | System.exit(0); 95 | } else { 96 | System.out.println("failed!"); 97 | System.exit(1); 98 | } 99 | } 100 | 101 | } -------------------------------------------------------------------------------- /src/main/java/weblog/PVMinMax2.java: -------------------------------------------------------------------------------- 1 | package 
ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | //5.2 计算网站每分钟访问量的峰值(最大、最小值) 18 | public class PVMinMax2 { 19 | 20 | public static class PVMinMax2Mapper extends Mapper<Object, Text, Text, Text> { 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | // 传入数据类似2014-12-12 18:06 1234,前面通过空格分开,后面是制表符分隔 24 | String[] strs = value.toString().split(" "); 25 | // key是2014-12-12这样的时间 26 | context.write(new Text(strs[0]), new Text(strs[1])); 27 | } 28 | } 29 | 30 | public static class PVMinMax2Reducer extends Reducer<Text, Text, Text, Text> { 31 | // Map map = new HashMap(); 32 | int maxVisit = 0; //默认最大值设为0 33 | int minVisit = Integer.MAX_VALUE; //默认最小值设为最大整数 34 | String maxMinute = null;// 最大访问量的所在时间 35 | String minMinute = null; 36 | public void reduce(Text key, Iterable<Text> values, Context context) 37 | throws IOException, InterruptedException { /* reset per key (date) so results from one date do not leak into the next */ maxVisit = 0; minVisit = Integer.MAX_VALUE; maxMinute = null; minMinute = null; 38 | for (Text val : values) { 39 | String[] strs = val.toString().split("\t"); 40 | String minute = strs[0]; //minute:访问时间,如:17:38 41 | int visit = Integer.parseInt(strs[1]); //visit:访问次数,如:813 42 | if (visit > maxVisit) { 43 | maxVisit = visit; 44 | maxMinute = minute; 45 | } 46 | if (visit < minVisit) { 47 | minVisit = visit; 48 | minMinute = minute; 49 | } 50 | } 51 | 52 | String strMaxTime = key.toString() + " " + maxMinute; //将日期和分钟合并 53 | String strMinTime = key.toString() + " " + minMinute; 54 | context.write(new Text(strMaxTime), new Text(String.valueOf(maxVisit))); 55 | context.write(new Text(strMinTime), new Text(String.valueOf(minVisit))); 56 | 57 | /* 58 | *或者这样写 59 | String value = maxMinute + " " + maxVisit + "\t" + minMinute + " " + minVisit; 60 | context.write(key, new Text(value)); 61 | */ 62 | } 63 | } 64 | 65 | public static void main(String[] args) throws Exception { 66 | //1.设置HDFS配置信息 67 | String namenode_ip = "192.168.17.10"; 68 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 69 | Configuration conf = new Configuration(); 70 | conf.set("fs.defaultFS", hdfs); 71 | conf.set("mapreduce.app-submission.cross-platform", "true"); 72 | 73 | //2.设置MapReduce作业配置信息 74 | String jobName = "PVMinMax2"; //作业名称 75 | Job job = Job.getInstance(conf, jobName); 76 | job.setJarByClass(PVMinMax2.class); //指定运行时作业类 77 | job.setJar("export\\PVMinMax2.jar"); //指定本地jar包 78 | job.setMapperClass(PVMinMax2Mapper.class); //指定Mapper类 79 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 80 | job.setMapOutputValueClass(Text.class); //设置Mapper输出Value类型 81 | job.setReducerClass(PVMinMax2Reducer.class); //指定Reducer类 82 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 83 | job.setOutputValueClass(Text.class); //设置Reduce输出Value类型 84 | 85 | //3.设置作业输入和输出路径 86 | String dataDir = "/expr/weblog/output5_1"; //实验数据目录 87 | String outputDir = "/expr/weblog/output5_2"; //实验输出目录 88 | Path inPath = new Path(hdfs + dataDir); 89 | Path outPath = new Path(hdfs + outputDir); 90 | FileInputFormat.addInputPath(job, inPath); 91 | FileOutputFormat.setOutputPath(job, outPath); 92 | FileSystem fs = FileSystem.get(conf); 93 |
if(fs.exists(outPath)) { 94 | fs.delete(outPath, true); 95 | } 96 | 97 | //4.运行作业 98 | System.out.println("Job: " + jobName + " is running..."); 99 | if(job.waitForCompletion(true)) { 100 | System.out.println("success!"); 101 | System.exit(0); 102 | } else { 103 | System.out.println("failed!"); 104 | System.exit(1); 105 | } 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /src/main/java/weblog/PVTopTen.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.util.Map.Entry; 5 | import java.util.NavigableMap; 6 | import java.util.TreeMap; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.NullWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | 20 | //3. 找到访问量最高的10个页面(按访问量降序输出) 21 | public class PVTopTen { 22 | public static class PVTopTenMapper extends Mapper { 23 | private Text k = new Text(); 24 | private final static IntWritable one = new IntWritable(1); 25 | 26 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 27 | String[] strs = value.toString().split(" "); 28 | String reqResource = strs[6]; //获取请求资源字符串 29 | int index = reqResource.indexOf("?"); 30 | if ( index > 0 ) { 31 | reqResource = reqResource.substring(0, index); //截取问号前的请求资源名称(去掉请求参数) 32 | } 33 | if ( reqResource.endsWith(".html") || reqResource.contains(".php") ) { 34 | k.set(reqResource); 35 | context.write(k, one); 36 | } 37 | } 38 | } 39 | 40 | public static class PVTopTenReducer extends Reducer { 41 | public TreeMap map = new TreeMap(); 42 | 43 | public void reduce(Text key, Iterable values, Context context) 44 | throws IOException, InterruptedException { 45 | int sum = 0; 46 | for (IntWritable val : values) { 47 | sum += val.get(); //计算被请求页面的访问量 48 | } 49 | String str = String.valueOf(sum) + "\t" + key.toString() ; 50 | map.put(sum, new Text(str)); //将页面访问量和被请求页面名称放入TreeMap中,TreeMap按KEY键(访问量)自动排序 51 | if (map.size() > 10) { //如果TreeMap中元素超过N个,则将第一个(KEY最小的)元素删除 52 | map.remove(map.firstKey()); 53 | } 54 | } 55 | 56 | public void cleanup(Context context) throws IOException, InterruptedException { 57 | //将TreeMap反序处理(降序),遍历输出top10 58 | NavigableMap reverseMap = map.descendingMap(); 59 | for ( Entry entry : reverseMap.entrySet() ) { 60 | context.write(entry.getValue(), NullWritable.get()); 61 | } 62 | } 63 | } 64 | 65 | public static void main(String[] args) throws Exception { 66 | String namenode_ip = "192.168.17.10"; 67 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 68 | Configuration conf = new Configuration(); 69 | conf.set("fs.defaultFS", hdfs); 70 | conf.set("mapreduce.app-submission.cross-platform", "true"); 71 | 72 | String jobName = "PVTopTenJob"; 73 | Job job = Job.getInstance(conf, jobName); 74 | job.setJarByClass(PVTopTen.class); 75 | job.setJar("export\\PVTopTen.jar"); 76 | job.setMapperClass(PVTopTenMapper.class); 77 | job.setMapOutputKeyClass(Text.class); 78 | job.setMapOutputValueClass(IntWritable.class); 79 | 
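// The map output types (Text/IntWritable) differ from the final reduce output (Text/NullWritable), so both pairs of classes are declared explicitly.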
job.setReducerClass(PVTopTenReducer.class); 80 | job.setOutputKeyClass(Text.class); 81 | job.setOutputValueClass(NullWritable.class); 82 | job.setNumReduceTasks(1); //计算最终TopN,只能运行一个Reduce任务 83 | 84 | String dataDir = "/expr/weblog/data"; 85 | String outputDir = "/expr/weblog/output3"; 86 | Path inPath = new Path(hdfs + dataDir); 87 | Path outPath = new Path(hdfs + outputDir); 88 | FileInputFormat.addInputPath(job, inPath); 89 | FileOutputFormat.setOutputPath(job, outPath); 90 | FileSystem fs = FileSystem.get(conf); 91 | if(fs.exists(outPath)) { 92 | fs.delete(outPath, true); 93 | } 94 | 95 | System.out.println( "Job: " + jobName + " is running..."); 96 | if(job.waitForCompletion(true)) { 97 | System.out.println("success!"); 98 | System.exit(0); 99 | } else { 100 | System.out.println("failed!"); 101 | System.exit(1); 102 | } 103 | } 104 | 105 | } --------------------------------------------------------------------------------
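The two weblog jobs PVMinMax and PVMinMax2 above are chained only implicitly: PVMinMax2 reads /expr/weblog/output5_1, which PVMinMax writes. A minimal sketch of declaring that dependency explicitly with Hadoop's org.apache.hadoop.mapreduce.lib.jobcontrol package (the approach the repository already uses in mutualFriend/JobControlRun.java) might look as follows; the driver name PVMinMaxChain and the exact wiring are illustrative assumptions, not code from this repository.

package ssdut.training.mapreduce.weblog;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Illustrative sketch only: chains PVMinMax -> PVMinMax2 via JobControl.
public class PVMinMaxChain {
	public static void main(String[] args) throws Exception {
		String hdfs = "hdfs://192.168.17.10:9000";
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfs);
		conf.set("mapreduce.app-submission.cross-platform", "true");

		// Step 1: per-minute page views, reusing the classes from PVMinMax.
		// When submitting from an IDE as the drivers above do, job1.setJar(...) would also be needed.
		Job job1 = Job.getInstance(conf, "PVMinMax");
		job1.setJarByClass(PVMinMax.class);
		job1.setMapperClass(PVMinMax.PVMinMaxMapper.class);
		job1.setReducerClass(PVMinMax.PVMinMaxReducer.class);
		job1.setMapOutputKeyClass(Text.class);
		job1.setMapOutputValueClass(IntWritable.class);
		job1.setOutputKeyClass(Text.class);
		job1.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job1, new Path(hdfs + "/expr/weblog/data"));
		Path out1 = new Path(hdfs + "/expr/weblog/output5_1");
		FileOutputFormat.setOutputPath(job1, out1);

		// Step 2: daily peak/valley of those per-minute counts, reusing the classes from PVMinMax2.
		Job job2 = Job.getInstance(conf, "PVMinMax2");
		job2.setJarByClass(PVMinMax2.class);
		job2.setMapperClass(PVMinMax2.PVMinMax2Mapper.class);
		job2.setReducerClass(PVMinMax2.PVMinMax2Reducer.class);
		job2.setMapOutputKeyClass(Text.class);
		job2.setMapOutputValueClass(Text.class);
		job2.setOutputKeyClass(Text.class);
		job2.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job2, out1);
		Path out2 = new Path(hdfs + "/expr/weblog/output5_2");
		FileOutputFormat.setOutputPath(job2, out2);

		// Clear old output directories, as the individual drivers do.
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(out1)) fs.delete(out1, true);
		if (fs.exists(out2)) fs.delete(out2, true);

		// Declare the dependency: job2 is submitted only after job1 succeeds.
		ControlledJob cj1 = new ControlledJob(conf);
		cj1.setJob(job1);
		ControlledJob cj2 = new ControlledJob(conf);
		cj2.setJob(job2);
		cj2.addDependingJob(cj1);

		JobControl jc = new JobControl("PVMinMaxChain");
		jc.addJob(cj1);
		jc.addJob(cj2);
		new Thread(jc).start();     // JobControl implements Runnable
		while (!jc.allFinished()) {
			Thread.sleep(1000);
		}
		jc.stop();                  // failed jobs, if any, via jc.getFailedJobList()
	}
}

Compared with launching the two drivers by hand, the only difference is that the ordering is declared once and enforced by JobControl.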