├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── images ├── index.png ├── reverseindex.jpg ├── 分区.jpg └── 简单模型.jpg ├── resources ├── comments.xml ├── friendsdata.txt ├── itemcf.csv ├── order.txt ├── people.csv ├── product.txt └── rand.sh └── src └── main └── java ├── InputOutputFormatTest └── MultiInOutput.java ├── InvertedIndex ├── InvertedCombiner.java ├── InvertedJob.java ├── InvertedMapper.java └── InvertedReducer.java ├── gradesAverage └── GradesAverage.java ├── mapReduceTest └── wordCount │ └── WordCount.java ├── mapreduceProgram ├── DateSortAsc.java ├── DateSortDesc.java ├── FlowPartition.java ├── FlowSort.java ├── FlowStatistics.java └── GroupMax.java ├── mergeMultipleFiles ├── MergeJob.java ├── MergeMapper.java ├── MyInputFormat.java └── MyRecordReader.java ├── mutualFriend ├── DecomposeFriendsMapper.java ├── DecomposeFriendsReducer.java ├── JobControlRun.java ├── JobRun.java ├── MergeFriendsMapper.java └── MergeFriendsReducer.java ├── shuffleTest ├── MonthAscTempDescSort.java └── TempSort.java ├── ssdut └── training │ └── mapreduce │ ├── counter │ └── YearCounter.java │ ├── datecount │ ├── DateCount.java │ ├── DateDistinct.java │ ├── DateFilter.java │ ├── DateGroup.java │ ├── DateGroup2.java │ ├── DatePartition.java │ ├── DatePartition2.java │ ├── DateSort.java │ ├── DateSort2.java │ └── DateSort3.java │ ├── inputformat │ ├── FixedLengthInput.java │ ├── FixedLengthInput2.java │ ├── KeyValueInput.java │ ├── MultInput.java │ ├── MultInput2.java │ ├── NLineInput.java │ └── SequenceInput.java │ ├── itemcf │ ├── StartRun.java │ ├── Step1.java │ ├── Step2.java │ ├── Step3.java │ ├── Step4.java │ ├── Step5.java │ └── Step6.java │ ├── medianstddev │ ├── MRDPUtils.java │ ├── MedianStdDevJob.java │ ├── MedianStdDevMapper.java │ ├── MedianStdDevReducer.java │ └── MedianStdDevTuple.java │ ├── minmaxcount │ ├── MRDPUtils.java │ ├── MinMaxCountJob.java │ ├── MinMaxCountMapper.java │ ├── MinMaxCountReducer.java │ └── MinMaxCountTuple.java │ ├── output │ ├── CompressOutput.java │ └── MultOutput.java │ ├── peoplerank │ ├── People.java │ ├── PeopleRank.java │ └── PeopleRank2.java │ └── topten │ ├── TopTenJob.java │ ├── TopTenMapper.java │ └── TopTenReducer.java └── weblog ├── FlowCount.java ├── IPCount.java ├── Missed.java ├── PVMinMax.java ├── PVMinMax2.java └── PVTopTen.java /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 josonle 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /images/index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/index.png -------------------------------------------------------------------------------- /images/reverseindex.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/reverseindex.jpg -------------------------------------------------------------------------------- /images/分区.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/分区.jpg -------------------------------------------------------------------------------- /images/简单模型.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josonle/MapReduce-Demo/2f057a5add4f623804f7c102a8ac16c7a52ad946/images/简单模型.jpg -------------------------------------------------------------------------------- /resources/comments.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /resources/friendsdata.txt: -------------------------------------------------------------------------------- 1 | A:B,C,D,F,E,O 2 | B:A,C,E,K 3 | C:F,A,D,I 4 | D:A,E,F,L 5 | E:B,C,D,M,L 6 | F:A,B,C,D,E,O,M 7 | G:A,C,D,E,F 8 | H:A,C,D,E,O 9 | I:A,O 10 | J:B,O 11 | K:A,C,D 12 | L:D,E,F 13 | M:E,F,G 14 | O:A,H,I,J -------------------------------------------------------------------------------- /resources/order.txt: -------------------------------------------------------------------------------- 1 | 1001 20150710 P0001 2 2 | 1002 20150710 P0001 3 3 | 1002 20150710 P0002 3 4 | 1003 20150710 P0003 3 -------------------------------------------------------------------------------- /resources/people.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | a,c 3 | a,d 4 | b,a 5 | b,d 6 | c,a 7 | d,b 8 | d,c 9 | -------------------------------------------------------------------------------- /resources/product.txt: -------------------------------------------------------------------------------- 1 | P0001 小米5 1001 2 2 | P0002 锤子T1 1000 3 3 | P0003 锤子 1002 4 -------------------------------------------------------------------------------- /resources/rand.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | num=$1 3 | [[ -z $num ]] && num=100 4 | 5 | for ((i=1;i<=$num;i++)) 6 | do 7 | year=$(expr $RANDOM % 3 + 2015) 8 | month=$(expr $RANDOM % 12 + 1) 9 | 10 | case $month in 11 | 1 | 3 | 5 | 7 | 8 | 10 | 12) 12 | day=$(expr $RANDOM % 31 + 1) 13 | ;; 14 | 2) 15 | if [[ $year -eq 2016 && $month -eq 2 ]] 16 | then 17 | day=$(expr $RANDOM % 29 + 1) 18 | else 19 | day=$(expr $RANDOM % 28 + 1) 20 | fi 21 | ;; 22 | 4 | 6 | 9 | 11) 23 | day=$(expr $RANDOM % 30 + 1) 24 | ;; 25 | esac 26 | 27 | if [[ $month -lt 10 ]] 28 | then 29 | month=0$month 30 | fi 31 | 32 | if [[ $day -lt 10 ]] 33 | then 34 | day=0$day 35 | fi 36 | 37 | echo "$year-$month-$day:$i" 38 | done 39 | -------------------------------------------------------------------------------- /src/main/java/InputOutputFormatTest/MultiInOutput.java: -------------------------------------------------------------------------------- 1 | package InputOutputFormatTest; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 17 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 19 | 20 | import ssdut.training.mapreduce.output.MultOutput; 21 | import ssdut.training.mapreduce.output.MultOutput.MultOutputMapper; 22 | import ssdut.training.mapreduce.output.MultOutput.MultOutputReducer; 23 | 24 | import org.apache.hadoop.io.IntWritable; 25 | import org.apache.hadoop.io.Text; 26 | 27 | public class MultiInOutput { 28 | public static class TxtFileMapper extends Mapper { 29 | private final static IntWritable one = new IntWritable(1); 30 | 31 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 32 | String[] strs = value.toString().split(" "); 33 | Text date = new Text(strs[0]); 34 | context.write(date, one); 35 | } 36 | } 37 | 38 | public static class CsvFileMapper extends Mapper { 39 | private final static IntWritable one = new IntWritable(1); 40 | 41 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 42 | String[] strs = value.toString().split(";");//定义csv文件时用了;做分隔符 43 | context.write(new Text(strs[0]), one); 44 | } 45 | } 46 | 47 | public static class MultOutputReducer extends Reducer { 48 | // 通过 MultipleOutputs 类控制输出的文件名和输出路径 49 | // 定义MultipleOutput对象 50 | private MultipleOutputs mos; 51 | 52 | // 覆写MultipleOutput对象的setup()初始化和cleanup()关闭mos对象方法 53 | protected void setup(Context context) { 54 | mos = new MultipleOutputs(context); 55 | } 56 | 57 | protected void cleanup(Context context) throws IOException, InterruptedException { 58 | mos.close(); 59 | } 60 | 61 | public void reduce(Text key, Iterable values, Context context) 62 | throws IOException, InterruptedException { 63 | int sum = 0; 64 | for (IntWritable value : 
values) { 65 | sum += value.get(); 66 | } 67 | // 使用MultiOutputs对象替代Context对象输出 68 | // 1. 输出到不同文件(格式、文件名) 69 | if (key.toString().startsWith("2015")) 70 | mos.write("f2015", key, new IntWritable(sum)); 71 | else if (key.toString().startsWith("2016")) 72 | mos.write("f2016", key, new IntWritable(sum)); 73 | else 74 | mos.write("f2017", key, new IntWritable(sum)); 75 | 76 | } 77 | } 78 | 79 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 80 | // 1.设置HDFS配置信息 81 | String namenode_ip = "192.168.17.10"; 82 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 83 | Configuration conf = new Configuration(); 84 | conf.set("fs.defaultFS", hdfs); 85 | conf.set("mapreduce.app-submission.cross-platform", "true"); 86 | 87 | // 2.设置MapReduce作业配置信息 88 | String jobName = "MultInputOutput"; // 作业名称 89 | Job job = Job.getInstance(conf, jobName); 90 | job.setJarByClass(MultiInOutput.class); // 指定运行时作业类 91 | job.setJar("export\\MultiInOutput.jar"); // 指定本地jar包 92 | job.setMapOutputKeyClass(Text.class); // 设置Mapper输出Key类型 93 | job.setMapOutputValueClass(IntWritable.class); // 设置Mapper输出Value类型 94 | job.setReducerClass(MultOutputReducer.class); // 指定Reducer类 95 | // job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 96 | // job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 97 | 98 | // 3.指定作业多输入路径,及Map所使用的类 99 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multiinoutput/data/txt"), TextInputFormat.class, TxtFileMapper.class); 100 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multiinoutput/data/csv"), TextInputFormat.class, CsvFileMapper.class); 101 | 102 | // 定义多文件输出的文件名、输出格式、Reduce输出键类型,值类型 103 | MultipleOutputs.addNamedOutput(job, "f2015", TextOutputFormat.class, Text.class, IntWritable.class); 104 | MultipleOutputs.addNamedOutput(job, "f2016", SequenceFileOutputFormat.class, Text.class, IntWritable.class); 105 | MultipleOutputs.addNamedOutput(job, "f2017", MapFileOutputFormat.class, Text.class, IntWritable.class); 106 | 107 | // 设置作业输出路径 108 | String outputDir = "/expr/multiinoutput/output"; // 实验输出目录 109 | Path outPath = new Path(hdfs + outputDir); 110 | FileOutputFormat.setOutputPath(job, outPath); 111 | FileSystem fs = FileSystem.get(conf); 112 | if (fs.exists(outPath)) { 113 | fs.delete(outPath, true); 114 | } 115 | 116 | // 4.运行作业 117 | System.out.println("Job: " + jobName + " is running..."); 118 | if (job.waitForCompletion(true)) { 119 | System.out.println("success!"); 120 | System.exit(0); 121 | } else { 122 | System.out.println("failed!"); 123 | System.exit(1); 124 | } 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/InvertedIndex/InvertedCombiner.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class InvertedCombiner extends Reducer { 8 | private Text info = new Text(); 9 | 10 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 11 | int sum = 0; 12 | for (Text value : values) { 13 | sum += Integer.parseInt(value.toString()); //单词数求和 14 | } 15 | int splitIndex = key.toString().indexOf(":"); //获取key中冒号的下标 16 | //注意此处应先计算info再计算key,否则key下标会越界 17 | info.set(key.toString().substring(splitIndex + 1) + ":" + sum); //将key中冒号后的内容(文件名)与单词数总和组合成Combiner输出的value 18 | 
key.set(key.toString().substring(0, splitIndex)); //将key中冒号前的内容(单词)设置为Combiner输出的key 19 | context.write(key, info); //输出格式:"word filename:sum" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/InvertedIndex/InvertedJob.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class InvertedJob { 12 | public static void main(String[] args) throws Exception { 13 | String namenode_ip = "192.168.17.10"; 14 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 15 | Configuration conf = new Configuration(); 16 | conf.set("fs.defaultFS", hdfs); 17 | conf.set("mapreduce.app-submission.cross-platform", "true"); 18 | 19 | String jobName = "InvertedJob"; 20 | Job job = Job.getInstance(conf, jobName); 21 | job.setJarByClass(InvertedJob.class); 22 | job.setJar("export\\InvertedJob.jar"); 23 | job.setMapperClass(InvertedMapper.class); 24 | job.setMapOutputKeyClass(Text.class); 25 | job.setMapOutputValueClass(Text.class); 26 | job.setCombinerClass(InvertedCombiner.class); //此处定义Combiner类,与Reducer类不同 27 | job.setReducerClass(InvertedReducer.class); 28 | job.setOutputKeyClass(Text.class); 29 | job.setOutputValueClass(Text.class); 30 | 31 | String dataDir = "/expr/inverted/data"; 32 | String outputDir = "/expr/inverted/output"; 33 | Path inPath = new Path(hdfs + dataDir); 34 | Path outPath = new Path(hdfs + outputDir); 35 | FileInputFormat.addInputPath(job, inPath); 36 | FileOutputFormat.setOutputPath(job, outPath); 37 | FileSystem fs = FileSystem.get(conf); 38 | if(fs.exists(outPath)) { 39 | fs.delete(outPath, true); 40 | } 41 | 42 | System.out.println("Job: " + jobName + " is running..."); 43 | if(job.waitForCompletion(true)) { 44 | System.out.println("success!"); 45 | System.exit(0); 46 | } else { 47 | System.out.println("failed!"); 48 | System.exit(1); 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/InvertedIndex/InvertedMapper.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 8 | 9 | public class InvertedMapper extends Mapper { 10 | private Text keyInfo = new Text(); 11 | private Text valueInfo = new Text(); 12 | private FileSplit split; 13 | 14 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 15 | split = (FileSplit) context.getInputSplit(); //通过context获取输入分片对象,目的是获得输入文件名称 16 | StringTokenizer itr = new StringTokenizer(value.toString()); 17 | while (itr.hasMoreTokens()) { 18 | keyInfo.set(itr.nextToken() + ":" + split.getPath().getName()); //将单词及其所属文件拼接成"word:filename"格式作为key 19 | valueInfo.set("1"); 20 | context.write(keyInfo, valueInfo); //输出格式: "word:filename 1" 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- 
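The InvertedIndex classes above implement the key/value flow that the inline comments describe: InvertedMapper emits "word:filename" -> 1 for every token, InvertedCombiner sums those counts per file and re-keys them as word -> "filename:sum", and InvertedReducer (the next file below) concatenates the per-file postings into "word  file1:n1; file2:n2; ". The stand-alone sketch below is not part of this repository; it mirrors that three-step flow in plain Java (the class name and sample data are invented for illustration) so the transformation can be traced without a Hadoop cluster.

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class InvertedIndexFlowSketch {
    public static void main(String[] args) {
        // Two tiny in-memory "files" stand in for the HDFS input (illustrative data only).
        Map<String, String> files = new LinkedHashMap<>();
        files.put("a.txt", "hello world hello");
        files.put("b.txt", "hello mapreduce");

        // "Map" step, mirroring InvertedMapper: emit ("word:filename", 1) per token.
        List<Map.Entry<String, Integer>> mapped = new ArrayList<>();
        for (Map.Entry<String, String> f : files.entrySet()) {
            for (String w : f.getValue().split("\\s+")) {
                mapped.add(new AbstractMap.SimpleEntry<>(w + ":" + f.getKey(), 1));
            }
        }

        // "Combine" step, mirroring InvertedCombiner: sum counts per "word:filename".
        Map<String, Integer> perFile = new TreeMap<>();
        for (Map.Entry<String, Integer> e : mapped) {
            perFile.merge(e.getKey(), e.getValue(), Integer::sum);
        }

        // "Reduce" step, mirroring InvertedReducer: re-key to the word and
        // concatenate its per-file postings as "filename:sum; ".
        Map<String, StringBuilder> index = new TreeMap<>();
        for (Map.Entry<String, Integer> e : perFile.entrySet()) {
            int i = e.getKey().indexOf(':');
            String word = e.getKey().substring(0, i);
            String posting = e.getKey().substring(i + 1) + ":" + e.getValue();
            index.computeIfAbsent(word, k -> new StringBuilder()).append(posting).append("; ");
        }

        index.forEach((word, postings) -> System.out.println(word + "\t" + postings));
    }
}

Running the sketch prints, for example, "hello	a.txt:2; b.txt:1; ", which matches the output format noted in the comments of the reducer that follows.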
/src/main/java/InvertedIndex/InvertedReducer.java: -------------------------------------------------------------------------------- 1 | package InvertedIndex; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class InvertedReducer extends Reducer { 8 | private Text result = new Text(); 9 | 10 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 11 | String fileList = new String(); 12 | for (Text value : values) { 13 | fileList += value.toString() + "; "; 14 | } 15 | result.set(fileList); 16 | context.write(key, result); //输出格式:"word file1:num1; file2:num2;" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/gradesAverage/GradesAverage.java: -------------------------------------------------------------------------------- 1 | package gradesAverage; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.FloatWritable; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | 19 | import mapReduceTest.wordCount.WordCount; 20 | import mapReduceTest.wordCount.WordCount.IntSumReducer; 21 | import mapReduceTest.wordCount.WordCount.TokenizerMapper; 22 | 23 | public class GradesAverage { 24 | 25 | public static class TokenizerMapper extends Mapper { 26 | private Text student = new Text(); 27 | private IntWritable grade = new IntWritable(); 28 | 29 | /* (non-Javadoc) 30 | * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context) 31 | */ 32 | /* (non-Javadoc) 33 | * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context) 34 | */ 35 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 36 | // StringTokenizer iTokenizer = new StringTokenizer(value.toString(),"\n"); 37 | System.out.println("key is:"+key+",value is: "+value.toString()); 38 | // while (iTokenizer.hasMoreTokens()) { 39 | // 40 | // } 41 | String[] list_strs = value.toString().split(" "); 42 | // 因为每行只有一个学号和对应成绩,不需要考虑切分多个词 43 | student.set(list_strs[0]); 44 | grade.set(Integer.parseInt(list_strs[1])); 45 | context.write(student, grade); 46 | } 47 | } 48 | 49 | // public static class gradesAverageCombiner extends Reducer { 50 | // private Text gradesSum = new Text(); 51 | // 52 | // public void reduce(Text key, Iterable values, Context context) 53 | // throws IOException, InterruptedException { 54 | // int sum = 0; 55 | // int grades = 0; 56 | // for (IntWritable val : values) { 57 | // sum += 1; 58 | // grades += val.get(); 59 | // } 60 | // System.out.println("Combiner---student is:"+key.toString()+",grades is:"+grades+",sum is:"+sum); 61 | // gradesSum.set(grades+","+sum); 62 | // System.out.println(gradesSum); 63 | // context.write(key, gradesSum); 64 | // } 65 | // } 66 | public static class gradesAverageReducer extends Reducer { 67 | private FloatWritable gradesSum = new FloatWritable(); 68 | 69 | 
public void reduce(Text key, Iterable values, Context context) 70 | throws IOException, InterruptedException { 71 | int sum = 0; 72 | int grades = 0; 73 | for (IntWritable val : values) { 74 | sum += 1; 75 | grades += val.get(); 76 | } 77 | System.out.println("Reduce----student is:"+key.toString()+",grades is:"+grades+",sum is:"+sum); 78 | gradesSum.set((float)grades/sum); 79 | context.write(key, gradesSum); 80 | } 81 | } 82 | 83 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 84 | // 1.设置HDFS配置信息 85 | String namenode_ip = "192.168.17.10"; 86 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 87 | Configuration conf = new Configuration(); // Hadoop配置类 88 | conf.set("fs.defaultFS", hdfs); 89 | conf.set("mapreduce.app-submission.cross-platform", "true"); // 集群交叉提交 90 | /* 91 | * conf.set("hadoop.job.user", "hadoop"); conf.set("mapreduce.framework.name", 92 | * "yarn"); conf.set("mapreduce.jobtracker.address", namenode_ip + ":9001"); 93 | * conf.set("yarn.resourcemanager.hostname", namenode_ip); 94 | * conf.set("yarn.resourcemanager.resource-tracker.address", namenode_ip + 95 | * ":8031"); conf.set("yarn.resourcemtanager.address", namenode_ip + ":8032"); 96 | * conf.set("yarn.resourcemanager.admin.address", namenode_ip + ":8033"); 97 | * conf.set("yarn.resourcemanager.scheduler.address", namenode_ip + ":8034"); 98 | * conf.set("mapreduce.jobhistory.address", namenode_ip + ":10020"); 99 | */ 100 | 101 | // 2.设置MapReduce作业配置信息 102 | String jobName = "GradesAverage"; // 定义作业名称 103 | Job job = Job.getInstance(conf, jobName); 104 | job.setJarByClass(GradesAverage.class); // 指定作业类 105 | job.setJar("export\\GradesAverage.jar"); // 指定本地jar包 106 | job.setMapperClass(TokenizerMapper.class); 107 | // job.setCombinerClass(gradesAverageCombiner.class); // 指定Combiner类 108 | job.setReducerClass(gradesAverageReducer.class); 109 | // 输出key-value的类型 110 | job.setOutputKeyClass(Text.class); 111 | job.setMapOutputValueClass(IntWritable.class); 112 | job.setOutputValueClass(FloatWritable.class); 113 | 114 | // 3.设置作业输入和输出路径 115 | String dataDir = "/expr/studentgrades/grades"; // 实验数据目录 116 | String outputDir = "/expr/studentgrades/output"; // 实验输出目录 117 | Path inPath = new Path(hdfs + dataDir); 118 | Path outPath = new Path(hdfs + outputDir); 119 | FileInputFormat.addInputPath(job, inPath); 120 | FileOutputFormat.setOutputPath(job, outPath); 121 | // 如果输出目录已存在则删除 122 | FileSystem fs = FileSystem.get(conf); 123 | if (fs.exists(outPath)) { 124 | fs.delete(outPath, true); 125 | } 126 | 127 | // 4.运行作业 128 | System.out.println("Job: " + jobName + " is running..."); 129 | if (job.waitForCompletion(true)) { 130 | System.out.println("统计 success!"); 131 | System.exit(0); 132 | } else { 133 | System.out.println("统计 failed!"); 134 | System.exit(1); 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/mapReduceTest/wordCount/WordCount.java: -------------------------------------------------------------------------------- 1 | package mapReduceTest.wordCount; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import 
org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class WordCount { 17 | 18 | public static class TokenizerMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | private Text word = new Text(); 21 | 22 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 23 | StringTokenizer itr = new StringTokenizer(value.toString()); 24 | while (itr.hasMoreTokens()) { 25 | word.set(itr.nextToken()); 26 | context.write(word, one); 27 | } 28 | } 29 | } 30 | 31 | public static class IntSumReducer extends Reducer { 32 | private IntWritable result = new IntWritable(); 33 | 34 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 35 | int sum = 0; 36 | for (IntWritable val : values) { 37 | sum += val.get(); 38 | } 39 | result.set(sum); 40 | context.write(key, result); 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | //1.设置HDFS配置信息 46 | String namenode_ip = "192.168.17.10"; 47 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 48 | Configuration conf = new Configuration(); //Hadoop配置类 49 | conf.set("fs.defaultFS", hdfs); 50 | conf.set("mapreduce.app-submission.cross-platform", "true"); //集群交叉提交 51 | /* conf.set("hadoop.job.user", "hadoop"); 52 | conf.set("mapreduce.framework.name", "yarn"); 53 | conf.set("mapreduce.jobtracker.address", namenode_ip + ":9001"); 54 | conf.set("yarn.resourcemanager.hostname", namenode_ip); 55 | conf.set("yarn.resourcemanager.resource-tracker.address", namenode_ip + ":8031"); 56 | conf.set("yarn.resourcemtanager.address", namenode_ip + ":8032"); 57 | conf.set("yarn.resourcemanager.admin.address", namenode_ip + ":8033"); 58 | conf.set("yarn.resourcemanager.scheduler.address", namenode_ip + ":8034"); 59 | conf.set("mapreduce.jobhistory.address", namenode_ip + ":10020"); */ 60 | 61 | //2.设置MapReduce作业配置信息 62 | String jobName = "WordCount"; //定义作业名称 63 | Job job = Job.getInstance(conf, jobName); 64 | job.setJarByClass(WordCount.class); //指定作业类 65 | job.setJar("export\\WordCount.jar"); //指定本地jar包 66 | job.setMapperClass(TokenizerMapper.class); 67 | job.setCombinerClass(IntSumReducer.class); //指定Combiner类 68 | job.setReducerClass(IntSumReducer.class); 69 | job.setOutputKeyClass(Text.class); 70 | job.setOutputValueClass(IntWritable.class); 71 | 72 | //3.设置作业输入和输出路径 73 | String dataDir = "/expr/wordcount/data"; //实验数据目录 74 | String outputDir = "/expr/wordcount/output"; //实验输出目录 75 | Path inPath = new Path(hdfs + dataDir); 76 | Path outPath = new Path(hdfs + outputDir); 77 | FileInputFormat.addInputPath(job, inPath); 78 | FileOutputFormat.setOutputPath(job, outPath); 79 | //如果输出目录已存在则删除 80 | FileSystem fs = FileSystem.get(conf); 81 | if(fs.exists(outPath)) { 82 | fs.delete(outPath, true); 83 | } 84 | 85 | //4.运行作业 86 | System.out.println("Job: " + jobName + " is running..."); 87 | if(job.waitForCompletion(true)) { 88 | System.out.println("success!"); 89 | System.exit(0); 90 | } else { 91 | System.out.println("failed!"); 92 | System.exit(1); 93 | } 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/DateSortAsc.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import 
org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | 17 | public class DateSortAsc { 18 | 19 | public static class SortMapper extends Mapper { 20 | private IntWritable num = new IntWritable(); 21 | 22 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] strs = value.toString().split("\t"); 24 | num.set(Integer.parseInt(strs[1])); 25 | // 将次数作为key进行升序排序 26 | context.write(num, new Text(strs[0])); 27 | System.out.println(num.get()+","+strs[0]); 28 | } 29 | } 30 | 31 | public static class SortReducer extends Reducer { 32 | 33 | public void reduce(IntWritable key, Iterable values, Context context) 34 | throws IOException, InterruptedException { 35 | for (Text value : values) { 36 | // 排序后再次颠倒k-v,将日期作为key 37 | System.out.println(value.toString()+":"+key.get()); 38 | context.write(value, key); 39 | } 40 | } 41 | } 42 | 43 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 44 | // 1.设置HDFS配置信息 45 | String namenode_ip = "192.168.17.10"; 46 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 47 | Configuration conf = new Configuration(); 48 | conf.set("fs.defaultFS", hdfs); 49 | conf.set("mapreduce.app-submission.cross-platform", "true"); 50 | 51 | // 2.设置MapReduce作业配置信息 52 | String jobName = "DateSortAsc"; // 定义作业名称 53 | Job job = Job.getInstance(conf, jobName); 54 | job.setJarByClass(DateSortAsc.class); // 指定作业类 55 | job.setJar("export\\DateSortAsc.jar"); // 指定本地jar包 56 | 57 | job.setMapperClass(SortMapper.class); // 指定Mapper类 58 | job.setMapOutputKeyClass(IntWritable.class); // 设置Mapper输出Key类型 59 | job.setMapOutputValueClass(Text.class); // 设置Mapper输出Value类型 60 | 61 | job.setReducerClass(SortReducer.class); // 指定Reducer类 62 | job.setOutputKeyClass(Text.class); // 设置Reduce输出Key类型 63 | job.setOutputValueClass(IntWritable.class); // 设置Reduce输出Value类型 64 | 65 | // 3.设置作业输入和输出路径 66 | String dataDir = "/workspace/dateSort/data"; // 实验数据目录 67 | String outputDir = "/workspace/dateSort/output"; // 实验输出目录 68 | Path inPath = new Path(hdfs + dataDir); 69 | Path outPath = new Path(hdfs + outputDir); 70 | FileInputFormat.addInputPath(job, inPath); 71 | FileOutputFormat.setOutputPath(job, outPath); 72 | FileSystem fs = FileSystem.get(conf); 73 | if (fs.exists(outPath)) { 74 | fs.delete(outPath, true); 75 | } 76 | 77 | // 4.运行作业 78 | System.out.println("Job: " + jobName + " is running..."); 79 | if (job.waitForCompletion(true)) { 80 | System.out.println("success!"); 81 | System.exit(0); 82 | } else { 83 | System.out.println("failed!"); 84 | System.exit(1); 85 | } 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/DateSortDesc.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import 
org.apache.hadoop.io.WritableComparable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Mapper.Context; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | import mapreduceProgram.DateSortAsc.SortMapper; 19 | import mapreduceProgram.DateSortAsc.SortReducer; 20 | 21 | import org.apache.hadoop.io.WritableComparator; 22 | 23 | public class DateSortDesc { 24 | 25 | public static class MyComparator extends WritableComparator { 26 | public MyComparator() { 27 | // TODO Auto-generated constructor stub 28 | super(IntWritable.class, true); 29 | } 30 | 31 | @Override 32 | @SuppressWarnings({ "rawtypes", "unchecked" }) // 不检查类型 33 | public int compare(WritableComparable a, WritableComparable b) { 34 | // CompareTo方法,返回值为1则降序,-1则升序 35 | // 默认是a.compareTo(b),a比b小返回-1,现在反过来返回1,就变成了降序 36 | return b.compareTo(a); 37 | } 38 | 39 | public static class SortMapper extends Mapper { 40 | private IntWritable num = new IntWritable(); 41 | 42 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 43 | String[] strs = value.toString().split("\t"); 44 | num.set(Integer.parseInt(strs[1])); 45 | // 将次数作为key进行升序排序 46 | context.write(num, new Text(strs[0])); 47 | System.out.println(num.get() + "," + strs[0]); 48 | } 49 | } 50 | 51 | public static class SortReducer extends Reducer { 52 | 53 | public void reduce(IntWritable key, Iterable values, Context context) 54 | throws IOException, InterruptedException { 55 | for (Text value : values) { 56 | // 排序后再次颠倒k-v,将日期作为key 57 | System.out.println(value.toString() + ":" + key.get()); 58 | context.write(value, key); 59 | } 60 | } 61 | } 62 | } 63 | 64 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 65 | // 1.设置HDFS配置信息 66 | String namenode_ip = "192.168.17.10"; 67 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 68 | Configuration conf = new Configuration(); 69 | conf.set("fs.defaultFS", hdfs); 70 | conf.set("mapreduce.app-submission.cross-platform", "true"); 71 | 72 | // 2.设置MapReduce作业配置信息 73 | String jobName = "DateSortDesc"; // 定义作业名称 74 | Job job = Job.getInstance(conf, jobName); 75 | job.setJarByClass(DateSortAsc.class); // 指定作业类 76 | job.setJar("export\\DateSortDesc.jar"); // 指定本地jar包 77 | 78 | job.setMapperClass(SortMapper.class); 79 | job.setMapOutputKeyClass(IntWritable.class); 80 | job.setMapOutputValueClass(Text.class); 81 | job.setReducerClass(SortReducer.class); 82 | job.setOutputKeyClass(Text.class); 83 | job.setOutputValueClass(IntWritable.class); 84 | // 指定排序所使用的比较器 85 | job.setSortComparatorClass(MyComparator.class); 86 | 87 | // 3.设置作业输入和输出路径 88 | String dataDir = "/workspace/dateSort/data"; // 实验数据目录 89 | String outputDir = "/workspace/dateSort/output"; // 实验输出目录 90 | Path inPath = new Path(hdfs + dataDir); 91 | Path outPath = new Path(hdfs + outputDir); 92 | FileInputFormat.addInputPath(job, inPath); 93 | FileOutputFormat.setOutputPath(job, outPath); 94 | FileSystem fs = FileSystem.get(conf); 95 | if (fs.exists(outPath)) { 96 | fs.delete(outPath, true); 97 | } 98 | 99 | // 4.运行作业 100 | System.out.println("Job: " + jobName + " is running..."); 101 | if (job.waitForCompletion(true)) { 102 | System.out.println("success!"); 103 | System.exit(0); 104 | } else { 105 | System.out.println("failed!"); 106 | System.exit(1); 107 | } 108 
| 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/FlowPartition.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.io.Writable; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Partitioner; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | 20 | public class FlowPartition { 21 | public static class FlowPartitionMapper extends Mapper { 22 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] strs = value.toString().split("\t"); 24 | Text phone = new Text(strs[0]); 25 | FlowWritable flow = new FlowWritable(Integer.parseInt(strs[1]), Integer.parseInt(strs[2])); 26 | System.out.println("Flow is:" + flow.toString()); 27 | context.write(phone, flow); 28 | } 29 | } 30 | 31 | public static class FlowPartitionReducer extends Reducer { 32 | public void reduce(Text key, Iterable values, Context context) 33 | throws IOException, InterruptedException { 34 | int upFlow = 0; 35 | int downFlow = 0; 36 | 37 | for (FlowWritable value : values) { 38 | upFlow += value.getUpFlow(); 39 | downFlow += value.getDownFlow(); 40 | } 41 | System.out.println(key.toString() + ":" + upFlow + "," + downFlow); 42 | context.write(key, new FlowWritable(upFlow, downFlow)); 43 | } 44 | } 45 | 46 | public static class FlowWritable implements Writable { 47 | private int upFlow; 48 | private int downFlow; 49 | private int sumFlow; 50 | 51 | public FlowWritable() { 52 | } 53 | 54 | public FlowWritable(int upFlow, int downFlow) { 55 | this.upFlow = upFlow; 56 | this.downFlow = downFlow; 57 | this.sumFlow = upFlow + downFlow; 58 | } 59 | 60 | public int getDownFlow() { 61 | return downFlow; 62 | } 63 | 64 | public void setDownFlow(int downFlow) { 65 | this.downFlow = downFlow; 66 | } 67 | 68 | public int getUpFlow() { 69 | return upFlow; 70 | } 71 | 72 | public void setUpFlow(int upFlow) { 73 | this.upFlow = upFlow; 74 | } 75 | 76 | public int getSumFlow() { 77 | return sumFlow; 78 | } 79 | 80 | public void setSumFlow(int sumFlow) { 81 | this.sumFlow = sumFlow; 82 | } 83 | 84 | @Override 85 | public void write(DataOutput out) throws IOException { 86 | // TODO Auto-generated method stub 87 | out.writeInt(upFlow); 88 | out.writeInt(downFlow); 89 | out.writeInt(sumFlow); 90 | } 91 | 92 | @Override 93 | public void readFields(DataInput in) throws IOException { 94 | // TODO Auto-generated method stub 95 | upFlow = in.readInt(); 96 | downFlow = in.readInt(); 97 | sumFlow = in.readInt(); 98 | } 99 | 100 | @Override 101 | public String toString() { 102 | // TODO Auto-generated method stub 103 | return upFlow + "\t" + downFlow + "\t" + sumFlow; 104 | } 105 | } 106 | 107 | public static class PhoneNumberPartitioner extends Partitioner { 108 | private static HashMap numberDict = new HashMap<>(); 109 | static { 110 | numberDict.put("133", 0); 111 | numberDict.put("135", 1); 112 | 
numberDict.put("137", 2); 113 | numberDict.put("138", 3); 114 | } 115 | 116 | @Override 117 | public int getPartition(Text key, FlowWritable value, int numPartitions) { 118 | String num = key.toString().substring(0, 3); 119 | // 借助HashMap返回不同手机段对应的分区号 120 | // 也可以直接通过if判断,如 121 | // 根据年份对数据进行分区,返回不同分区号 122 | // if (key.toString().startsWith("133")) return 0 % numPartitions; 123 | return numberDict.getOrDefault(num, 4); 124 | } 125 | } 126 | 127 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 128 | // 设置hdfs配置信息 129 | String namenode_ip = "192.168.17.10"; 130 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 131 | Configuration conf = new Configuration(); 132 | conf.set("fs.defaultFS", hdfs); 133 | conf.set("mapreduce.app-submission.cross-platform", "true"); 134 | 135 | // 设置作业Job配置信息 136 | String jobName = "FlowPartition"; 137 | Job job = Job.getInstance(conf, jobName); 138 | job.setJarByClass(FlowPartition.class); 139 | job.setJar("export\\FlowPartition.jar"); 140 | // Map 141 | job.setMapperClass(FlowPartitionMapper.class); 142 | // Reduce 143 | job.setReducerClass(FlowPartitionReducer.class); 144 | // 输出k-v类型 145 | job.setOutputKeyClass(Text.class); 146 | job.setOutputValueClass(FlowWritable.class); 147 | // 设置分区类,及Reducer数目 148 | job.setPartitionerClass(PhoneNumberPartitioner.class); 149 | job.setNumReduceTasks(4); 150 | 151 | // 设置job输入出路径 152 | String dataDir = "/workspace/flowStatistics/data"; 153 | String outputDir = "/workspace/flowStatistics/output_partitions"; 154 | Path inPath = new Path(hdfs + dataDir); 155 | Path outPath = new Path(hdfs + outputDir); 156 | FileInputFormat.addInputPath(job, inPath); 157 | FileOutputFormat.setOutputPath(job, outPath); 158 | FileSystem fs = FileSystem.get(conf); 159 | if (fs.exists(outPath)) { 160 | fs.delete(outPath, true); 161 | } 162 | 163 | // 运行作业 164 | System.out.println("Job: " + jobName + " is running..."); 165 | if (job.waitForCompletion(true)) { 166 | System.out.println("success!"); 167 | System.exit(0); 168 | } else { 169 | System.out.println("failed!"); 170 | System.exit(1); 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/FlowSort.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.io.WritableComparable; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | 19 | public class FlowSort { 20 | 21 | public static class MySortKey implements WritableComparable { 22 | private int upFlow; 23 | private int downFlow; 24 | private int sumFlow; 25 | 26 | public MySortKey() { 27 | // TODO Auto-generated constructor stub 28 | } 29 | 30 | public MySortKey(int up, int down) { 31 | upFlow = up; 32 | downFlow = down; 33 | sumFlow = up + down; 34 | } 35 | 36 | public int getUpFlow() { 37 | return upFlow; 38 | } 39 | 40 | public void setUpFlow(int upFlow) { 41 | this.upFlow = upFlow; 42 | } 43 | 44 | public int 
getDownFlow() { 45 | return downFlow; 46 | } 47 | 48 | public void setDownFlow(int downFlow) { 49 | this.downFlow = downFlow; 50 | } 51 | 52 | public int getSumFlow() { 53 | return sumFlow; 54 | } 55 | 56 | public void setSumFlow(int sumFlow) { 57 | this.sumFlow = sumFlow; 58 | } 59 | 60 | @Override 61 | public void write(DataOutput out) throws IOException { 62 | // TODO Auto-generated method stub 63 | out.writeInt(upFlow); 64 | out.writeInt(downFlow); 65 | out.writeInt(sumFlow); 66 | } 67 | 68 | @Override 69 | public void readFields(DataInput in) throws IOException { 70 | // TODO Auto-generated method stub 71 | upFlow = in.readInt(); 72 | downFlow = in.readInt(); 73 | sumFlow = in.readInt(); 74 | } 75 | 76 | @Override 77 | public int compareTo(MySortKey o) { 78 | if ((this.upFlow - o.upFlow) == 0) {// equal upFlow, compare downFlow 79 | return o.downFlow - this.downFlow;// sort by downFlow in descending order 80 | } else { 81 | return this.upFlow - o.upFlow;// sort by upFlow in ascending order 82 | } 83 | } 84 | 85 | @Override 86 | public String toString() { 87 | // TODO Auto-generated method stub 88 | return upFlow + "\t" + downFlow + "\t" + sumFlow; 89 | } 90 | } 91 | 92 | public static class SortMapper extends Mapper<Object, Text, MySortKey, Text> { 93 | Text phone = new Text(); 94 | MySortKey mySortKey = new MySortKey(); 95 | 96 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 97 | String[] lists = value.toString().split("\t"); 98 | phone.set(lists[0]); 99 | mySortKey.setUpFlow(Integer.parseInt(lists[1])); 100 | mySortKey.setDownFlow(Integer.parseInt(lists[2])); 101 | context.write(mySortKey, phone); 102 | System.out.println(phone.toString()+":"+mySortKey.toString()+",up:"+lists[1]+"=="+mySortKey.getUpFlow()); 103 | } 104 | } 105 | 106 | public static class SortReducer extends Reducer<MySortKey, Text, Text, MySortKey> { 107 | public void reduce(MySortKey key, Iterable<Text> values, Context context) 108 | throws IOException, InterruptedException { 109 | for (Text value : values) { 110 | System.out.println(value.toString()+","+key.toString()); 111 | context.write(value, key); 112 | } 113 | } 114 | } 115 | 116 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 117 | // Set HDFS configuration 118 | String namenode_ip = "192.168.17.10"; 119 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 120 | Configuration conf = new Configuration(); 121 | conf.set("fs.defaultFS", hdfs); 122 | conf.set("mapreduce.app-submission.cross-platform", "true"); 123 | 124 | // Set job configuration 125 | Job job = Job.getInstance(conf, "FlowSort"); 126 | job.setJarByClass(FlowSort.class); 127 | job.setJar("export\\FlowSort.jar"); 128 | // Mapper 129 | job.setMapperClass(SortMapper.class); 130 | job.setMapOutputKeyClass(MySortKey.class); 131 | job.setMapOutputValueClass(Text.class); 132 | // Reducer 133 | job.setReducerClass(SortReducer.class); 134 | job.setOutputKeyClass(Text.class); 135 | job.setOutputValueClass(MySortKey.class); 136 | // Job input and output paths 137 | String dataDir = "/workspace/flowStatistics/output/part-r-00000"; // experiment data 138 | String outputDir = "/workspace/flowStatistics/output_sort"; // experiment output directory 139 | Path inPath = new Path(hdfs + dataDir); 140 | Path outPath = new Path(hdfs + outputDir); 141 | FileInputFormat.addInputPath(job, inPath); 142 | FileOutputFormat.setOutputPath(job, outPath); 143 | FileSystem fs = FileSystem.get(conf); 144 | if (fs.exists(outPath)) { 145 | fs.delete(outPath, true); 146 | } 147 | // Run the job 148 | System.out.println("Job: FlowSort is running..."); 149 | if (job.waitForCompletion(true)) { 150 | System.out.println("success!"); 151
| System.exit(0); 152 | } else { 153 | System.out.println("failed!"); 154 | System.exit(1); 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/FlowStatistics.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.io.Writable; 16 | import org.apache.hadoop.mapreduce.Job; 17 | 18 | public class FlowStatistics { 19 | 20 | public static class FlowMapper extends Mapper{ 21 | 22 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 23 | String[] strs = value.toString().split("\t"); 24 | Text phone = new Text(strs[0]); 25 | Text flow = new Text(strs[1]+"\t"+strs[2]); 26 | context.write(phone, flow); 27 | } 28 | } 29 | 30 | public static class FlowReducer extends Reducer{ 31 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 32 | int upFlow = 0; 33 | int downFlow = 0; 34 | 35 | for (Text value : values) { 36 | String[] strs = value.toString().split("\t"); 37 | upFlow += Integer.parseInt(strs[0].toString()); 38 | downFlow += Integer.parseInt(strs[1].toString()); 39 | } 40 | int sumFlow = upFlow+downFlow; 41 | 42 | context.write(key,new Text(upFlow+"\t"+downFlow+"\t"+sumFlow)); 43 | } 44 | } 45 | 46 | // 第二种写法 47 | public static class FlowWritableMapper extends Mapper { 48 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 49 | String[] strs = value.toString().split("\t"); 50 | Text phone = new Text(strs[0]); 51 | FlowWritable flow = new FlowWritable(Integer.parseInt(strs[1]),Integer.parseInt(strs[2])); 52 | System.out.println("Flow is:"+flow.toString()); 53 | context.write(phone, flow); 54 | } 55 | } 56 | public static class FlowWritableReducer extends Reducer{ 57 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 58 | int upFlow = 0; 59 | int downFlow = 0; 60 | 61 | for (FlowWritable value : values) { 62 | upFlow += value.getUpFlow(); 63 | downFlow += value.getDownFlow(); 64 | } 65 | System.out.println(key.toString()+":"+upFlow+","+downFlow); 66 | context.write(key,new FlowWritable(upFlow,downFlow)); 67 | } 68 | } 69 | 70 | public static class FlowWritable implements Writable{ 71 | private int upFlow; 72 | private int downFlow; 73 | private int sumFlow; 74 | 75 | public FlowWritable() {} 76 | 77 | public FlowWritable(int upFlow,int downFlow) { 78 | this.upFlow = upFlow; 79 | this.downFlow = downFlow; 80 | this.sumFlow = upFlow+downFlow; 81 | } 82 | 83 | public int getDownFlow() { 84 | return downFlow; 85 | } 86 | 87 | public void setDownFlow(int downFlow) { 88 | this.downFlow = downFlow; 89 | } 90 | 91 | public int getUpFlow() { 92 | return upFlow; 93 | } 94 | 95 | public void setUpFlow(int upFlow) { 96 | this.upFlow = upFlow; 97 | } 98 | 99 | public int getSumFlow() { 100 | return sumFlow; 101 | } 102 | 103 | public void 
setSumFlow(int sumFlow) { 104 | this.sumFlow = sumFlow; 105 | } 106 | 107 | @Override 108 | public void write(DataOutput out) throws IOException { 109 | // TODO Auto-generated method stub 110 | out.writeInt(upFlow); 111 | out.writeInt(downFlow); 112 | out.writeInt(sumFlow); 113 | } 114 | 115 | @Override 116 | public void readFields(DataInput in) throws IOException { 117 | // TODO Auto-generated method stub 118 | upFlow = in.readInt(); 119 | downFlow = in.readInt(); 120 | sumFlow = in.readInt(); 121 | } 122 | 123 | @Override 124 | public String toString() { 125 | // TODO Auto-generated method stub 126 | return upFlow+"\t"+downFlow+"\t"+sumFlow; 127 | } 128 | } 129 | 130 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 131 | // 设置hdfs配置信息 132 | String namenode_ip = "192.168.17.10"; 133 | String hdfs = "hdfs://"+namenode_ip+":9000"; 134 | Configuration conf = new Configuration(); 135 | conf.set("fs.defaultFS", hdfs); 136 | conf.set("mapreduce.app-submission.cross-platform", "true"); 137 | 138 | // 设置作业Job配置信息 139 | String jobName = "FlowStatistics"; 140 | Job job = Job.getInstance(conf, jobName); 141 | job.setJarByClass(FlowStatistics.class); 142 | job.setJar("export\\FlowStatistics.jar"); 143 | // Map 144 | job.setMapperClass(FlowMapper.class);// 第一种 145 | // job.setMapperClass(FlowWritableMapper.class); 146 | // 这里因为同Reducer输出类型一致,可不写 147 | // job.setMapOutputKeyClass(Text.class); 148 | // job.setMapOutputValueClass(FlowWritable.class); 149 | // Reduce 150 | job.setReducerClass(FlowReducer.class);// 第一种 151 | // job.setReducerClass(FlowWritableReducer.class); 152 | // 输出k-v类型 153 | job.setOutputKeyClass(Text.class); 154 | job.setOutputValueClass(Text.class);// 第一种 155 | // job.setOutputValueClass(FlowWritable.class); 156 | 157 | // 设置job输入出路径 158 | String dataDir = "/workspace/flowStatistics/data"; 159 | String outputDir = "/workspace/flowStatistics/output"; 160 | Path inPath = new Path(hdfs+dataDir); 161 | Path outPath = new Path(hdfs+outputDir); 162 | FileInputFormat.addInputPath(job, inPath); 163 | FileOutputFormat.setOutputPath(job, outPath); 164 | FileSystem fs = FileSystem.get(conf); 165 | if(fs.exists(outPath)) { 166 | fs.delete(outPath, true); 167 | } 168 | 169 | // 运行作业 170 | System.out.println("Job: " + jobName + " is running..."); 171 | if(job.waitForCompletion(true)) { 172 | System.out.println("success!"); 173 | System.exit(0); 174 | } else { 175 | System.out.println("failed!"); 176 | System.exit(1); 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/main/java/mapreduceProgram/GroupMax.java: -------------------------------------------------------------------------------- 1 | package mapreduceProgram; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import javax.print.attribute.standard.JobName; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.DoubleWritable; 13 | import org.apache.hadoop.io.NullWritable; 14 | import org.apache.hadoop.io.RawComparator; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.WritableComparable; 17 | import org.apache.hadoop.io.WritableComparator; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.Partitioner; 21 | import 
org.apache.hadoop.mapreduce.Reducer; 22 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 24 | 25 | import mapreduceProgram.FlowSort.MySortKey; 26 | import mapreduceProgram.FlowSort.SortMapper; 27 | import mapreduceProgram.FlowSort.SortReducer; 28 | import sun.tools.tree.SuperExpression; 29 | 30 | public class GroupMax { 31 | 32 | public static class Pair implements WritableComparable { 33 | private String order_id; 34 | private DoubleWritable amount; 35 | 36 | public Pair() { 37 | // TODO Auto-generated constructor stub 38 | } 39 | 40 | public Pair(String id, DoubleWritable amount) { 41 | this.order_id = id; 42 | this.amount = amount; 43 | } 44 | 45 | public String getOrder_id() { 46 | return order_id; 47 | } 48 | 49 | public void setOrder_id(String order_id) { 50 | this.order_id = order_id; 51 | } 52 | 53 | public DoubleWritable getAmount() { 54 | return amount; 55 | } 56 | 57 | public void setAmount(DoubleWritable amount) { 58 | this.amount = amount; 59 | } 60 | 61 | @Override 62 | public void write(DataOutput out) throws IOException { 63 | // TODO Auto-generated method stub 64 | out.writeUTF(order_id); 65 | out.writeDouble(amount.get()); 66 | } 67 | 68 | @Override 69 | public void readFields(DataInput in) throws IOException { 70 | // TODO Auto-generated method stub 71 | order_id = in.readUTF(); 72 | amount = new DoubleWritable(in.readDouble()); 73 | } 74 | 75 | @Override 76 | public int compareTo(Pair o) { 77 | if (order_id.equals(o.order_id)) {// 同一order_id,按照amount降序排序 78 | return o.amount.compareTo(amount); 79 | } else { 80 | return order_id.compareTo(o.order_id); 81 | } 82 | } 83 | 84 | } 85 | 86 | public static class MyMapper extends Mapper { 87 | Pair pair = new Pair(); 88 | 89 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 90 | String[] strs = value.toString().split(" "); 91 | pair.setOrder_id(strs[0]); 92 | pair.setAmount(new DoubleWritable(Double.parseDouble(strs[2]))); 93 | context.write(pair, NullWritable.get()); 94 | System.out.println(pair.getOrder_id()+","+pair.getAmount()); 95 | } 96 | } 97 | 98 | public static class MyReducer extends Reducer { 99 | public void reduce(Pair key, Iterable values, Context context) 100 | throws IOException, InterruptedException { 101 | context.write(new Text(key.getOrder_id()), key.getAmount()); 102 | System.out.println(key.order_id+": "+key.amount.get()); 103 | // 下面这个可以看下分组结果 104 | // for (NullWritable value : values) { 105 | // context.write(new Text(key.getOrder_id()), key.getAmount()); 106 | // System.out.println(key.order_id+": "+key.amount.get()); 107 | // } 108 | } 109 | } 110 | // 是分组不是分区,分组是组内定义一些规则由reduce去处理,分区是由多个Reduce处理,写到不同文件中 111 | // 自定义分组类 112 | public static class GroupComparator extends WritableComparator { 113 | public GroupComparator() { 114 | // TODO Auto-generated constructor stub 115 | super(Pair.class, true); 116 | } 117 | // Mapper端会对Pair排序,之后分组的规则是对Pair中的order_id比较 118 | @Override 119 | public int compare(WritableComparable a, WritableComparable b) { 120 | // TODO Auto-generated method stub 121 | Pair oa = (Pair) a; 122 | Pair ob = (Pair) b; 123 | return oa.getOrder_id().compareTo(ob.getOrder_id()); 124 | } 125 | 126 | } 127 | 128 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 129 | // 设置HDFS配置信息 130 | String namenode_ip = "192.168.17.10"; 131 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 132 | Configuration conf 
= new Configuration(); 133 | conf.set("fs.defaultFS", hdfs); 134 | conf.set("mapreduce.app-submission.cross-platform", "true"); 135 | 136 | // Set job configuration 137 | String JobName = "GroupMax"; 138 | Job job = Job.getInstance(conf, JobName); 139 | job.setJarByClass(GroupMax.class); 140 | job.setJar("export\\GroupMax.jar"); 141 | // Mapper 142 | job.setMapperClass(MyMapper.class); 143 | job.setMapOutputKeyClass(Pair.class); 144 | job.setMapOutputValueClass(NullWritable.class); 145 | // Reducer 146 | job.setReducerClass(MyReducer.class); 147 | job.setOutputKeyClass(Text.class); 148 | job.setOutputValueClass(DoubleWritable.class); 149 | // GroupComparator is the custom grouping comparator 150 | job.setGroupingComparatorClass(GroupComparator.class); 151 | // Job input and output paths 152 | String dataDir = "/workspace/data/orderDetail.txt"; // experiment data 153 | String outputDir = "/workspace/groupMax/output"; // experiment output directory 154 | Path inPath = new Path(hdfs + dataDir); 155 | Path outPath = new Path(hdfs + outputDir); 156 | FileInputFormat.addInputPath(job, inPath); 157 | FileOutputFormat.setOutputPath(job, outPath); 158 | FileSystem fs = FileSystem.get(conf); 159 | if (fs.exists(outPath)) { 160 | fs.delete(outPath, true); 161 | } 162 | // Run the job 163 | System.out.println("Job: "+JobName+" is running..."); 164 | if (job.waitForCompletion(true)) { 165 | System.out.println("success!"); 166 | System.exit(0); 167 | } else { 168 | System.out.println("failed!"); 169 | System.exit(1); 170 | } 171 | } 172 | 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MergeJob.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.BytesWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 14 | 15 | 16 | 17 | public class MergeJob { 18 | 19 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 20 | // 1. Set HDFS configuration 21 | String namenode_ip = "192.168.17.10"; 22 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 23 | Configuration conf = new Configuration(); 24 | conf.set("fs.defaultFS", hdfs); 25 | conf.set("mapreduce.app-submission.cross-platform", "true"); 26 | 27 | // 2. Set MapReduce job configuration 28 | String jobName = "MergeMultipleFiles"; // job name 29 | Job job = Job.getInstance(conf, jobName); 30 | job.setJarByClass(MergeJob.class); // specify the job class used at runtime 31 | job.setJar("export\\MergeMultipleFiles.jar"); // specify the local jar 32 | job.setMapOutputKeyClass(Text.class); // Mapper output key type 33 | job.setMapOutputValueClass(BytesWritable.class); // Mapper output value type 34 | job.setMapperClass(MergeMapper.class); 35 | // input data format 36 | job.setInputFormatClass(MyInputFormat.class); 37 | // write the merged output as a sequence file 38 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 39 | 40 | // Set job input and output paths 41 | String inputDir = "/workspace/mergeFiles/data"; 42 | String outputDir = "/workspace/mergeFiles/output"; // output directory 43 | Path outPath = new Path(hdfs + outputDir); 44 | Path inputPath = new Path(hdfs+inputDir); 45 |
FileInputFormat.setInputPaths(job, inputPath); 46 | FileOutputFormat.setOutputPath(job, outPath); 47 | FileSystem fs = FileSystem.get(conf); 48 | if (fs.exists(outPath)) { 49 | fs.delete(outPath, true); 50 | } 51 | 52 | // 运行作业 53 | System.out.println("Job: " + jobName + " is running..."); 54 | if (job.waitForCompletion(true)) { 55 | System.out.println("success!"); 56 | System.exit(0); 57 | } else { 58 | System.out.println("failed!"); 59 | System.exit(1); 60 | } 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MergeMapper.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.BytesWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.InputSplit; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 12 | 13 | public class MergeMapper extends Mapper{ 14 | private Text fileNameKey; 15 | 16 | @Override 17 | protected void map(NullWritable key, BytesWritable value, 18 | Mapper.Context context) 19 | throws IOException, InterruptedException { 20 | // TODO Auto-generated method stub 21 | context.write(fileNameKey, value); 22 | } 23 | 24 | @Override 25 | protected void setup(Mapper.Context context) 26 | throws IOException, InterruptedException { 27 | InputSplit split = context.getInputSplit(); 28 | Path path = ((FileSplit)split).getPath();//??? 29 | fileNameKey = new Text(path.toString()); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MyInputFormat.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.BytesWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.mapreduce.InputSplit; 9 | import org.apache.hadoop.mapreduce.JobContext; 10 | import org.apache.hadoop.mapreduce.RecordReader; 11 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | 14 | 15 | public class MyInputFormat extends FileInputFormat{ 16 | 17 | 18 | @Override 19 | protected boolean isSplitable(JobContext context, Path filename) { 20 | // TODO 因为是合并小文件,设置文件不可分割,k-v的v就是文件对象 21 | return false; 22 | } 23 | 24 | @Override 25 | public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) 26 | throws IOException, InterruptedException { 27 | // TODO Auto-generated method stub 28 | MyRecordReader myRecordReader = new MyRecordReader(); 29 | myRecordReader.initialize(split, context); 30 | return myRecordReader; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/mergeMultipleFiles/MyRecordReader.java: -------------------------------------------------------------------------------- 1 | package mergeMultipleFiles; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FSDataInputStream; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.BytesWritable; 10 | import 
org.apache.hadoop.io.IOUtils; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.mapred.FileSplit; 13 | import org.apache.hadoop.mapreduce.InputSplit; 14 | import org.apache.hadoop.mapreduce.RecordReader; 15 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 16 | 17 | public class MyRecordReader extends RecordReader{ 18 | private FileSplit fileSplit; 19 | private Configuration conf ; 20 | private BytesWritable value = new BytesWritable(); 21 | private boolean processed =false; 22 | @Override 23 | public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { 24 | fileSplit = (FileSplit)split; 25 | conf = context.getConfiguration(); 26 | } 27 | @Override 28 | public boolean nextKeyValue() throws IOException, InterruptedException { 29 | if (!processed) { 30 | byte[] contents = new byte[(int) fileSplit.getLength()];// 获取分片长度字节数组 31 | Path file = fileSplit.getPath();// 获取切片所在位置 32 | FileSystem fSystem = file.getFileSystem(conf); 33 | FSDataInputStream in = null; 34 | try { 35 | in = fSystem.open(file);// 打开文件 36 | IOUtils.readFully(in, contents, 0, contents.length);// 读取整个文件字节数据,写入contents 37 | value.set(contents,0,contents.length);// 将整个文件数据赋值给value 38 | } finally { 39 | IOUtils.closeStream(in); 40 | } 41 | processed = true; 42 | return true; 43 | } 44 | return false; 45 | } 46 | @Override 47 | public NullWritable getCurrentKey() throws IOException, InterruptedException { 48 | // 获取当前key,因为合并文件,我们应该将文件对象付给value,key赋空即可 49 | return NullWritable.get(); 50 | } 51 | @Override 52 | public BytesWritable getCurrentValue() throws IOException, InterruptedException { 53 | return value;// value是整个文件对象的字节数据 54 | } 55 | @Override 56 | public float getProgress() throws IOException, InterruptedException { 57 | // TODO Auto-generated method stub 58 | return processed ? 
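// NOTE (MyRecordReader): together with MyInputFormat.isSplitable() returning false, this reader treats each
// small file as exactly one record: nextKeyValue() loads the whole file into a byte[] sized by the split
// length and hands it to the mapper as a single BytesWritable, and the processed flag makes the second call
// return false. The whole file therefore has to fit in mapper memory, which is acceptable for the
// small-file-merge scenario this class is written for.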
1.0f:0.0f; 59 | } 60 | @Override 61 | public void close() throws IOException { 62 | // TODO Auto-generated method stub 63 | 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/DecomposeFriendsMapper.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import org.apache.hadoop.mapreduce.Mapper; 4 | 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Text; 8 | 9 | public class DecomposeFriendsMapper extends Mapper { 10 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 11 | String strs = value.toString(); 12 | Text uString = new Text(strs.substring(0, 1)); 13 | String[] friends = strs.substring(2).split(","); 14 | 15 | //A:B,C,D,F,E,O 16 | for (int i = 0; i < friends.length; i++) { 17 | // 以,形式输出 18 | context.write(new Text(friends[i]),uString); 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/DecomposeFriendsReducer.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import org.apache.hadoop.mapreduce.Reducer; 4 | 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Text; 8 | 9 | public class DecomposeFriendsReducer extends Reducer{ 10 | 11 | @Override 12 | protected void reduce(Text key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | String friendList = ""; 15 | for (Text value : values) { 16 | friendList += value.toString()+","; 17 | } 18 | // 输出个人所有好友,A I,K,C,B,G,F,H,O,D 19 | context.write(key, new Text(friendList.substring(0,friendList.length()-1))); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/JobControlRun.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob; 12 | import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class JobControlRun { 16 | 17 | public static void main(String[] args) throws IOException { 18 | String namenode_ip = "192.168.17.10"; 19 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 20 | Configuration conf = new Configuration(); 21 | conf.set("fs.defaultFS", hdfs); 22 | conf.set("mapreduce.app-submission.cross-platform", "true"); 23 | 24 | Job job1 = Job.getInstance(conf,"Decompose"); 25 | job1.setJarByClass(JobControlRun.class); 26 | job1.setJar("export\\mutualFriend.jar"); 27 | job1.setMapperClass(DecomposeFriendsMapper.class); 28 | job1.setReducerClass(DecomposeFriendsReducer.class); 29 | job1.setOutputKeyClass(Text.class); 30 | job1.setOutputValueClass(Text.class); 31 | 32 | Path input = new Path(hdfs+"/workspace/mutualFriends/data"); 33 | Path output1 = new Path(hdfs+"/workspace/mutualFriends/output_Dec"); 34 | FileInputFormat.addInputPath(job1, input); 35 | FileOutputFormat.setOutputPath(job1, output1); 36 | FileSystem fs = 
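// NOTE (JobControlRun): the exists/delete checks in this driver run once, before either job is submitted,
// which is why the debug print below appears only a single time even though two jobs run.
// The monitoring loop further down polls jobControl.allFinished() in a tight while(true) without sleeping,
// which spins a CPU core; a gentler variant (editor's sketch, InterruptedException must be handled):
// while (!jobControl.allFinished()) {
//     Thread.sleep(500);               // poll twice a second instead of busy-waiting
// }
// System.out.println(jobControl.getSuccessfulJobList());
// jobControl.stop();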
FileSystem.get(conf); 37 | if (fs.exists(output1)) { 38 | fs.delete(output1, true); 39 | System.out.println("我被删了");// 打印可见只被删了一次,有点怪 40 | } 41 | // ControlledJob作业控制容器 42 | ControlledJob ctrJob1=new ControlledJob(conf); 43 | ctrJob1.setJob(job1);// job1加入控制容器 44 | 45 | Job job2 = Job.getInstance(conf, "Merge"); 46 | job2.setJarByClass(JobControlRun.class); 47 | job2.setJar("export\\mutualFriend.jar"); 48 | job2.setMapperClass(MergeFriendsMapper.class); 49 | job2.setReducerClass(MergeFriendsReducer.class); 50 | job2.setOutputKeyClass(Text.class); 51 | job2.setOutputValueClass(Text.class); 52 | 53 | Path input2 = new Path(hdfs+"/workspace/mutualFriends/output_Dec"); 54 | Path output2 = new Path(hdfs+"/workspace/mutualFriends/output_Meg"); 55 | FileInputFormat.addInputPath(job2, input2); 56 | FileOutputFormat.setOutputPath(job2, output2); 57 | if (fs.exists(output2)) { 58 | fs.delete(output2, true); 59 | } 60 | ControlledJob ctrJob2 = new ControlledJob(conf); 61 | ctrJob2.setJob(job2);// job2加入作业控制容器 62 | 63 | // 添加作业依赖,表明job2依赖job1执行 64 | ctrJob2.addDependingJob(ctrJob1); 65 | 66 | // 定义作业主控制容器,监控、调度job1,job2 67 | JobControl jobControl=new JobControl("JobControl"); 68 | jobControl.addJob(ctrJob1); 69 | jobControl.addJob(ctrJob2); 70 | // 启动作业线程 71 | Thread T=new Thread(jobControl); 72 | T.start(); 73 | while(true){ 74 | if(jobControl.allFinished()){// 等待作业全部结束 75 | System.out.println(jobControl.getSuccessfulJobList());// 打印成功job信息 76 | jobControl.stop(); 77 | break; 78 | } 79 | } 80 | /** 81 | * 打印控制信息如下 82 | * [job name: Decompose 83 | job id: JobControl0 84 | job state: SUCCESS 85 | job mapred id: job_local445604445_0001 86 | job message: just initialized 87 | job has no depending job: 88 | , job name: Merge 89 | job id: JobControl1 90 | job state: SUCCESS 91 | job mapred id: job_local1897659504_0002 92 | job message: just initialized 93 | job has 1 dependeng jobs: 94 | depending job 0: Decompose 95 | ] 96 | */ 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/JobRun.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class JobRun { 14 | 15 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 16 | String namenode_ip = "192.168.17.10"; 17 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 18 | Configuration conf = new Configuration(); 19 | conf.set("fs.defaultFS", hdfs); 20 | conf.set("mapreduce.app-submission.cross-platform", "true"); 21 | 22 | // job1配置信息 23 | Job job1 = Job.getInstance(conf,"Decompose"); 24 | job1.setJarByClass(JobRun.class); 25 | job1.setJar("export\\mutualFriend.jar"); 26 | job1.setMapperClass(DecomposeFriendsMapper.class); 27 | job1.setReducerClass(DecomposeFriendsReducer.class); 28 | job1.setOutputKeyClass(Text.class); 29 | job1.setOutputValueClass(Text.class); 30 | 31 | Path input = new Path(hdfs+"/workspace/mutualFriends/data"); 32 | Path output1 = new Path(hdfs+"/workspace/mutualFriends/output_Dec"); 33 | FileInputFormat.addInputPath(job1, input); 34 | 
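// NOTE: both drivers in this package run the same two-pass mutual-friends algorithm.
// Pass 1 (Decompose): a line like "A:B,C" emits (B,A) and (C,A); the reducer then lists, for every person,
// all users who have that person as a friend, e.g. "B  A,E,F,J".
// Pass 2 (Merge): for such a line the mapper sorts the owner list and emits every pair with the shared friend
// as value, e.g. (A-E, B), (A-F, B), (E-F, B); sorting is what makes A-B and B-A land on the same key.
// The final reducer concatenates the values, giving the common friends of each pair.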
FileOutputFormat.setOutputPath(job1, output1); 35 | FileSystem fs = FileSystem.get(conf); 36 | if (fs.exists(output1)) { 37 | fs.delete(output1, true); 38 | } 39 | 40 | // job1如果运行成功则进入job2 41 | if(job1.waitForCompletion(true)) {//job2完全依赖job1的结果,所以job1成功执行就开启job2 42 | // job2配置信息 43 | Job job2 = Job.getInstance(conf, "Merge"); 44 | job2.setJarByClass(JobRun.class); 45 | job2.setJar("export\\mutualFriend.jar"); 46 | job2.setMapperClass(MergeFriendsMapper.class); 47 | job2.setReducerClass(MergeFriendsReducer.class); 48 | job2.setOutputKeyClass(Text.class); 49 | job2.setOutputValueClass(Text.class); 50 | 51 | Path output2 = new Path(hdfs+"/workspace/mutualFriends/output_Meg"); 52 | FileInputFormat.addInputPath(job2, output1);// 输入是job1的输出 53 | FileOutputFormat.setOutputPath(job2, output2); 54 | if (fs.exists(output2)) { 55 | fs.delete(output2, true); 56 | } 57 | if(job2.waitForCompletion(true)) { 58 | System.out.println("sucessed"); 59 | }else { 60 | System.out.println("failed"); 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/mutualFriend/MergeFriendsMapper.java: -------------------------------------------------------------------------------- 1 | package mutualFriend; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class MergeFriendsMapper extends Mapper{// 别写成输入key也是Text类型,这里输入的是偏移量 10 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 11 | Text uText = new Text(value.toString().substring(0, 1)); 12 | String[] lists = value.toString().substring(2).split(","); 13 | Arrays.sort(lists);// 要排好序,不然如A-B,B-A不能归并到一起 14 | //对如A B,C,E遍历输出如 15 | for (int i = 0; i < lists.length; i++) { 16 | for(int j=i+1;j{ 9 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 10 | String friends = ""; 11 | for (Text value : values) { 12 | friends += value.toString()+","; 13 | } 14 | System.out.println(key.toString()+" "+friends); 15 | context.write(key, new Text(friends)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/shuffleTest/MonthAscTempDescSort.java: -------------------------------------------------------------------------------- 1 | package shuffleTest; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Partitioner; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.Mapper.Context; 15 | 16 | public class MonthAscTempDescSort { 17 | // 按年分区,每个文件中按月升序,按温度降序 18 | public static class MonthTempMapper extends Mapper { 19 | IntWritable temp = new IntWritable(); 20 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 21 | String[] strings =value.toString().split(" "); 22 | String date = strings[0].substring(0, 7); 23 | temp.set(Integer.parseInt(strings[2].substring(0, strings[2].length()-1))); 24 | context.write(new Text(date), temp); 25 | } 26 | } 27 | 28 | 29 | public static void main(String[] args) { 30 | // TODO Auto-generated method 
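// Editor's sketch (hypothetical, this driver was left as a TODO): to get "month ascending, temperature
// descending" within year-partitioned files, the usual recipe in this repo would be a composite key in the
// style of DateSort3.MyKey (compareTo: month ascending, then temperature descending), a Partitioner keyed on
// the year as in TempSort.YearPartitioner, and a job wired like the other drivers here (mapper emitting the
// composite key, setPartitionerClass, setNumReduceTasks matching the number of years).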
stub 31 | 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/shuffleTest/TempSort.java: -------------------------------------------------------------------------------- 1 | package shuffleTest; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.WritableComparable; 9 | import org.apache.hadoop.io.WritableComparator; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Partitioner; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | 19 | public class TempSort { 20 | /* 21 | * 按年输出(分区),每个文件包含每月的最高温度 22 | */ 23 | public static class TempSortMapper extends Mapper { 24 | IntWritable temp = new IntWritable(); 25 | public void map(Object key,Text value,Context context) throws IOException, InterruptedException { 26 | String[] strings =value.toString().split(" "); 27 | String date = strings[0].substring(0, 7); 28 | temp.set(Integer.parseInt(strings[2].substring(0, strings[2].length()-1))); 29 | context.write(new Text(date), temp); 30 | } 31 | } 32 | 33 | public static class TempSortReducer extends Reducer{ 34 | public void reduce(Text key,Iterable values,Context context) throws IOException, InterruptedException { 35 | // 气温降序排序,区第一个 36 | // IntWritable temp = values.iterator().next(); 37 | // System.out.println("气温:"+temp); 38 | // context.write(key, temp); 39 | 40 | int maxTemp = Integer.MIN_VALUE; 41 | for(IntWritable value:values) { 42 | System.out.println("年:"+key+", 气温:"+value); 43 | if (value.get()>maxTemp) { 44 | maxTemp = value.get(); 45 | } 46 | } 47 | System.out.println("Date:"+key+", MaxTemp:"+maxTemp); 48 | context.write(key, new IntWritable(maxTemp)); 49 | } 50 | } 51 | 52 | public static class YearPartitioner extends Partitioner { 53 | @Override 54 | public int getPartition(Text key, IntWritable value, int numPartitions) { 55 | //根据年份对数据进行分区,返回不同分区号 56 | if (key.toString().startsWith("1949")) 57 | return 0 % numPartitions; 58 | else if (key.toString().startsWith("1950")) 59 | return 1 % numPartitions; 60 | else 61 | return 2 % numPartitions; 62 | } 63 | } 64 | 65 | // public static class MySort extends WritableComparator { 66 | // public MySort() { 67 | // super(IntWritable.class,true); 68 | // } 69 | // 70 | // @SuppressWarnings({"rawtypes","unchecked"}) 71 | // public int compare(WritableComparable a,WritableComparable b) { 72 | // return b.compareTo(a); 73 | // } 74 | // } 75 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 76 | // TODO Auto-generated method stub 77 | String hdfs = "hdfs://192.168.17.10:9000"; 78 | Configuration conf = new Configuration(); 79 | conf.set("fs.defaultFS", hdfs); 80 | conf.set("mapreduce.app-submission.cross-platform", "true"); 81 | // 设置作业配置信息 82 | String jobName = "TempSort"; 83 | Job job = Job.getInstance(conf, jobName); 84 | job.setJarByClass(TempSort.class); 85 | job.setJar("export\\TempSort.jar"); 86 | // Map 87 | job.setMapperClass(TempSortMapper.class); 88 | job.setMapOutputKeyClass(Text.class); 89 | job.setMapOutputValueClass(IntWritable.class); 90 | // Reduce 91 | 
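// NOTE (TempSort): the three partition numbers returned by YearPartitioner (0, 1, 2) line up with the
// setNumReduceTasks(3) call below, so each year lands in its own part file. Because "max" is associative,
// the same reducer could optionally be reused map-side to shrink the shuffle (editor's sketch):
// job.setCombinerClass(TempSortReducer.class);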
job.setReducerClass(TempSortReducer.class); 92 | // 全局 93 | job.setOutputKeyClass(Text.class); 94 | job.setOutputValueClass(IntWritable.class); 95 | // Sort 96 | // job.setSortComparatorClass(MySort.class); 97 | // Partition 98 | job.setPartitionerClass(YearPartitioner.class); 99 | job.setNumReduceTasks(3); 100 | //3.设置作业输入和输出路径 101 | String dataDir = "/expr/test/data"; //实验数据目录 102 | String outputDir = "/expr/test/output"; //实验输出目录 103 | Path inPath = new Path(hdfs + dataDir); 104 | Path outPath = new Path(hdfs + outputDir); 105 | FileInputFormat.addInputPath(job, inPath); 106 | FileOutputFormat.setOutputPath(job, outPath); 107 | FileSystem fs = FileSystem.get(conf); 108 | if(fs.exists(outPath)) { 109 | fs.delete(outPath, true); 110 | } 111 | 112 | //4.运行作业 113 | System.out.println("Job: " + jobName + " is running..."); 114 | if(job.waitForCompletion(true)) { 115 | System.out.println("success!"); 116 | System.exit(0); 117 | } else { 118 | System.out.println("failed!"); 119 | System.exit(1); 120 | } 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/counter/YearCounter.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.counter; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class YearCounter { 16 | 17 | //自定义年份计数器 18 | private enum YCounter { 19 | Y2015, Y2016, Y2017 20 | } 21 | 22 | public static class YearCounterMapper extends Mapper { 23 | private final static IntWritable one = new IntWritable(1); 24 | 25 | public void map(Object key, Text value, Context context ) 26 | throws IOException, InterruptedException { 27 | String[] strs = value.toString().split(" "); //按空格分割输入 28 | Text date = new Text(strs[0]); //获取日期 29 | context.write(date, one); //将日期和常数1作为Map输出 30 | 31 | //根据KEY值不同,增加对应计数器的值 32 | if(strs[0].startsWith("2015")) { 33 | context.getCounter(YCounter.Y2015).increment(1); 34 | } else if(strs[0].startsWith("2016")) { 35 | context.getCounter(YCounter.Y2016).increment(1); 36 | } else 37 | context.getCounter(YCounter.Y2017).increment(1); 38 | } 39 | } 40 | 41 | public static class YearCounterReducer extends Reducer { 42 | public void reduce(Text key, Iterable values, Context context) 43 | throws IOException, InterruptedException { 44 | int sum = 0; 45 | for (IntWritable val : values) { 46 | sum += val.get(); 47 | } 48 | context.write(key, new IntWritable(sum)); 49 | } 50 | } 51 | 52 | public static void main(String[] args) throws Exception { 53 | //1.设置HDFS配置信息 54 | String namenode_ip = "192.168.17.10"; 55 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 56 | Configuration conf = new Configuration(); 57 | conf.set("fs.defaultFS", hdfs); 58 | conf.set("mapreduce.app-submission.cross-platform", "true"); 59 | 60 | //2.设置MapReduce作业配置信息 61 | String jobName = "YearCounter"; //作业名称 62 | Job job = Job.getInstance(conf, jobName); 63 | job.setJarByClass(YearCounter.class); //指定运行时作业类 64 | job.setJar("export\\YearCounter.jar"); 
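// NOTE (YearCounter): the custom YCounter values are aggregated by the framework and can be read back in
// this driver after waitForCompletion(true) returns, e.g. (editor's sketch):
// long y2015 = job.getCounters().findCounter(YCounter.Y2015).getValue();
// System.out.println("records from 2015: " + y2015);
// Also note the mapper's final else branch counts every record that is neither 2015 nor 2016 as Y2017.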
//指定本地jar包 65 | job.setMapperClass(YearCounterMapper.class); //指定Mapper类 66 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 67 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 68 | job.setReducerClass(YearCounterReducer.class); //指定Reducer类 69 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 70 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 71 | 72 | //3.设置作业输入和输出路径 73 | String dataDir = "/expr/datecount/data"; //实验数据目录 74 | String outputDir = "/expr/datecount/output"; //实验输出目录 75 | Path inPath = new Path(hdfs + dataDir); 76 | Path outPath = new Path(hdfs + outputDir); 77 | FileInputFormat.addInputPath(job, inPath); 78 | FileOutputFormat.setOutputPath(job, outPath); 79 | FileSystem fs = FileSystem.get(conf); 80 | if(fs.exists(outPath)) { 81 | fs.delete(outPath, true); 82 | } 83 | 84 | //4.运行作业 85 | System.out.println("Job: " + jobName + " is running..."); 86 | if(job.waitForCompletion(true)) { 87 | System.out.println("success!"); 88 | System.exit(0); 89 | } else { 90 | System.out.println("failed!"); 91 | System.exit(1); 92 | } 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateCount.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateCount { 16 | 17 | public static class DateCountMapper extends Mapper { 18 | private final static IntWritable one = new IntWritable(1); 19 | 20 | public void map(Object key, Text value, Context context ) 21 | throws IOException, InterruptedException { 22 | String[] strs = value.toString().split(" "); //按空格分割输入 23 | Text date = new Text(strs[0]); //获取日期 24 | context.write(date, one); //将日期和常数1作为Map输出 25 | } 26 | } 27 | 28 | public static class DateCountReducer extends Reducer { 29 | public void reduce(Text key, Iterable values, Context context) 30 | throws IOException, InterruptedException { 31 | int sum = 0; 32 | for (IntWritable val : values) { 33 | sum += val.get(); 34 | } 35 | context.write(key, new IntWritable(sum)); 36 | } 37 | } 38 | 39 | public static void main(String[] args) throws Exception { 40 | //1.设置HDFS配置信息 41 | String namenode_ip = "192.168.17.10"; 42 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 43 | Configuration conf = new Configuration(); 44 | conf.set("fs.defaultFS", hdfs); 45 | conf.set("mapreduce.app-submission.cross-platform", "true"); 46 | 47 | //2.设置MapReduce作业配置信息 48 | String jobName = "DateCount"; //作业名称 49 | Job job = Job.getInstance(conf, jobName); 50 | job.setJarByClass(DateCount.class); //指定运行时作业类 51 | job.setJar("export\\DateCount.jar"); //指定本地jar包 52 | job.setMapperClass(DateCountMapper.class); //指定Mapper类 53 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 54 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 55 | job.setReducerClass(DateCountReducer.class); //指定Reducer类 56 | job.setOutputKeyClass(Text.class); 
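// NOTE (DateCount): since the per-date sum is associative, the reducer can optionally double as a combiner
// to pre-aggregate counts on the map side (editor's sketch):
// job.setCombinerClass(DateCountReducer.class);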
//设置Reduce输出Key类型 57 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 58 | 59 | //3.设置作业输入和输出路径 60 | String dataDir = "/expr/datecount/data"; //实验数据目录 61 | String outputDir = "/expr/datecount/output"; //实验输出目录 62 | Path inPath = new Path(hdfs + dataDir); 63 | Path outPath = new Path(hdfs + outputDir); 64 | FileInputFormat.addInputPath(job, inPath); 65 | FileOutputFormat.setOutputPath(job, outPath); 66 | FileSystem fs = FileSystem.get(conf); 67 | if(fs.exists(outPath)) { 68 | fs.delete(outPath, true); 69 | } 70 | 71 | //4.运行作业 72 | System.out.println("Job: " + jobName + " is running..."); 73 | if(job.waitForCompletion(true)) { 74 | System.out.println("success!"); 75 | System.exit(0); 76 | } else { 77 | System.out.println("failed!"); 78 | System.exit(1); 79 | } 80 | } 81 | 82 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateDistinct.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateDistinct { 16 | 17 | public static class DateDistinctMapper extends Mapper { 18 | public void map(Object key, Text value, Context context ) 19 | throws IOException, InterruptedException { 20 | String[] strs = value.toString().split(" "); 21 | Text date = new Text(strs[0]); 22 | context.write(date, NullWritable.get()); 23 | } 24 | } 25 | 26 | public static class DateDistinctReducer extends Reducer { 27 | public void reduce(Text key, Iterable values, Context context) 28 | throws IOException, InterruptedException { 29 | context.write(key, NullWritable.get()); 30 | } 31 | } 32 | 33 | public static void main(String[] args) throws Exception { 34 | //1.设置HDFS配置信息 35 | String namenode_ip = "192.168.17.10"; 36 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 37 | Configuration conf = new Configuration(); 38 | conf.set("fs.defaultFS", hdfs); 39 | conf.set("mapreduce.app-submission.cross-platform", "true"); 40 | 41 | //2.设置MapReduce作业配置信息 42 | String jobName = "DateDistinct"; //定义作业名称 43 | Job job = Job.getInstance(conf, jobName); 44 | job.setJarByClass(DateDistinct.class); //指定运行时作业类 45 | job.setJar("export\\DateDistinct.jar"); //指定本地jar包 46 | job.setMapperClass(DateDistinctMapper.class); //指定Mapper类 47 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 48 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 49 | job.setReducerClass(DateDistinctReducer.class); //指定Reducer类 50 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 51 | job.setOutputValueClass(NullWritable.class); //设置Reduce输出Value类型 52 | 53 | //3.设置作业输入和输出路径 54 | String dataDir = "/expr/datecount/data"; //实验数据目录 55 | String outputDir = "/expr/datecount/output_distinct"; //实验输出目录 56 | Path inPath = new Path(hdfs + dataDir); 57 | Path outPath = new Path(hdfs + outputDir); 58 | FileInputFormat.addInputPath(job, inPath); 59 | FileOutputFormat.setOutputPath(job, outPath); 60 | FileSystem fs = 
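// NOTE (DateDistinct): emitting (date, NullWritable) and writing each reduce key exactly once is what gives
// the DISTINCT semantics; the shuffle brings all duplicates of a date into one reduce call. The reducer is
// also safe to reuse as a combiner to drop duplicates early (editor's sketch):
// job.setCombinerClass(DateDistinctReducer.class);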
FileSystem.get(conf); 61 | if(fs.exists(outPath)) { 62 | fs.delete(outPath, true); 63 | } 64 | 65 | //4.运行作业 66 | System.out.println("Job: " + jobName + " is running..."); 67 | if(job.waitForCompletion(true)) { 68 | System.out.println("success!"); 69 | System.exit(0); 70 | } else { 71 | System.out.println("failed!"); 72 | System.exit(1); 73 | } 74 | } 75 | 76 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateFilter.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | 14 | public class DateFilter { 15 | 16 | public static class DateFilterMapper extends Mapper { 17 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 18 | String[] strs = value.toString().split(" "); 19 | Text date = new Text(strs[0]); 20 | context.write(date, NullWritable.get()); 21 | } 22 | } 23 | /* 24 | public static class DateFilterReducer extends Reducer { 25 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 26 | int sum = 0; 27 | for (IntWritable val : values) { 28 | sum += val.get(); 29 | } 30 | context.write(key, new IntWritable(sum)); 31 | } 32 | } 33 | */ 34 | public static void main(String[] args) throws Exception { 35 | //1.设置HDFS配置信息 36 | String namenode_ip = "192.168.17.10"; 37 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 38 | Configuration conf = new Configuration(); 39 | conf.set("fs.defaultFS", hdfs); 40 | conf.set("mapreduce.app-submission.cross-platform", "true"); 41 | 42 | //2.设置MapReduce作业配置信息 43 | String jobName = "DateFilter"; //定义作业名称 44 | Job job = Job.getInstance(conf, jobName); 45 | job.setJarByClass(DateFilter.class); //指定运行时作业类 46 | job.setJar("export\\DateFilter.jar"); //指定本地jar包 47 | job.setMapperClass(DateFilterMapper.class); 48 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 49 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 50 | //job.setReducerClass(DateCountReducer.class); //不需要设置Reducer类 51 | //job.setOutputKeyClass(Text.class); //设置Reduce输出键类型 52 | //job.setOutputValueClass(NullWritable.class); //设置Reduce输出值类型 53 | 54 | //3.设置作业输入和输出路径 55 | String dataDir = "/expr/datecount/data"; //实验数据目录 56 | String outputDir = "/expr/datecount/output_filter"; //实验输出目录 57 | Path inPath = new Path(hdfs + dataDir); 58 | Path outPath = new Path(hdfs + outputDir); 59 | FileInputFormat.addInputPath(job, inPath); 60 | FileOutputFormat.setOutputPath(job, outPath); 61 | FileSystem fs = FileSystem.get(conf); 62 | if(fs.exists(outPath)) { 63 | fs.delete(outPath, true); 64 | } 65 | 66 | //4.运行作业 67 | System.out.println("Job: " + jobName + " is running..."); 68 | if(job.waitForCompletion(true)) { 69 | System.out.println("success!"); 70 | System.exit(0); 71 | } else { 72 | System.out.println("failed!"); 73 | System.exit(1); 74 | } 75 | } 76 | 77 | } -------------------------------------------------------------------------------- 
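// NOTE on DateFilter above: as written the mapper forwards every date, so the job only projects the first
// column; an actual filter would add a condition inside DateFilterMapper.map(), for example keeping 2015
// records only (hypothetical sketch): if (strs[0].startsWith("2015")) context.write(date, NullWritable.get());
// Since no reducer is needed, job.setNumReduceTasks(0) would also make this a pure map-only job and skip the
// shuffle; with the current configuration the default identity reducer still runs and sorts the dates.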
/src/main/java/ssdut/training/mapreduce/datecount/DateGroup.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateGroup { 16 | 17 | public static class DateGroupMapper extends Mapper { 18 | public void map(Object key, Text value, Context context ) 19 | throws IOException, InterruptedException { 20 | String[] strs = value.toString().split(" "); //按空格分割输入 21 | String date = strs[0]; //获取日期 22 | int id = Integer.parseInt(strs[1]); //获取序号 23 | context.write(new Text(date), new IntWritable(id)); 24 | } 25 | } 26 | 27 | public static class DateGroupReducer extends Reducer { 28 | public void reduce(Text key, Iterable values, Context context) 29 | throws IOException, InterruptedException { 30 | StringBuilder sb = new StringBuilder(); 31 | sb.append("[ "); 32 | for (IntWritable val : values) { //将value值串联 33 | sb.append(val.toString()).append(" "); 34 | } 35 | sb.append("]"); 36 | context.write(key, new Text(sb.toString())); 37 | } 38 | } 39 | 40 | public static void main(String[] args) throws Exception { 41 | //1.设置HDFS配置信息 42 | String namenode_ip = "192.168.17.10"; 43 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 44 | Configuration conf = new Configuration(); 45 | conf.set("fs.defaultFS", hdfs); 46 | conf.set("mapreduce.app-submission.cross-platform", "true"); 47 | 48 | //2.设置MapReduce作业配置信息 49 | String jobName = "DateGroup"; //作业名称 50 | Job job = Job.getInstance(conf, jobName); 51 | job.setJarByClass(DateGroup.class); //指定运行时作业类 52 | job.setJar("export\\DateGroup.jar"); //指定本地jar包 53 | job.setMapperClass(DateGroupMapper.class); //指定Mapper类 54 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 55 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 56 | job.setReducerClass(DateGroupReducer.class); //指定Reducer类 57 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 58 | job.setOutputValueClass(Text.class); //设置Reduce输出Value类型 59 | 60 | //3.设置作业输入和输出路径 61 | String dataDir = "/expr/datecount/data"; //实验数据目录 62 | String outputDir = "/expr/datecount/output_group"; //实验输出目录 63 | Path inPath = new Path(hdfs + dataDir); 64 | Path outPath = new Path(hdfs + outputDir); 65 | FileInputFormat.addInputPath(job, inPath); 66 | FileOutputFormat.setOutputPath(job, outPath); 67 | FileSystem fs = FileSystem.get(conf); 68 | if(fs.exists(outPath)) { 69 | fs.delete(outPath, true); 70 | } 71 | 72 | //4.运行作业 73 | System.out.println("Job: " + jobName + " is running..."); 74 | if(job.waitForCompletion(true)) { 75 | System.out.println("success!"); 76 | System.exit(0); 77 | } else { 78 | System.out.println("failed!"); 79 | System.exit(1); 80 | } 81 | } 82 | 83 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateGroup2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 
4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.WritableComparable; 10 | import org.apache.hadoop.io.WritableComparator; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | public class DateGroup2 { 18 | 19 | public static class DateGroup2Mapper extends Mapper { 20 | public void map(Object key, Text value, Context context ) 21 | throws IOException, InterruptedException { 22 | String[] strs = value.toString().split(" "); //按空格分割输入 23 | String date = strs[0]; //获取日期 24 | int id = Integer.parseInt(strs[1]); //获取序号 25 | context.write(new Text(date), new IntWritable(id)); 26 | } 27 | } 28 | 29 | public static class DateGroup2Reducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | StringBuilder sb = new StringBuilder(); 33 | sb.append("[ "); 34 | for (IntWritable val : values) { //将value值串联 35 | sb.append(val.toString()).append(" "); 36 | } 37 | sb.append("]"); 38 | String year = key.toString().substring(0,4); //取年份 39 | context.write(new Text(year), new Text(sb.toString())); 40 | } 41 | } 42 | 43 | public static class MyGroup extends WritableComparator { 44 | public MyGroup() { //注册比较方法 45 | super(Text.class, true); 46 | } 47 | 48 | @SuppressWarnings("rawtypes") 49 | @Override 50 | public int compare(WritableComparable a, WritableComparable b) { 51 | String d1 = a.toString(); 52 | String d2 = b.toString(); 53 | 54 | if (d1.startsWith("2015")) 55 | d1 = "2015"; 56 | else if (d1.startsWith("2016")) 57 | d1 = "2016"; 58 | else 59 | d1 = "2017"; 60 | 61 | if (d2.startsWith("2015")) 62 | d2 = "2015"; 63 | else if (d2.startsWith("2016")) 64 | d2 = "2016"; 65 | else 66 | d2 = "2017"; 67 | 68 | return d1.compareTo(d2); //将原本KEY(年月日)的比较变成年份的比较 69 | } 70 | } 71 | 72 | public static void main(String[] args) throws Exception { 73 | //1.设置HDFS配置信息 74 | String namenode_ip = "192.168.17.10"; 75 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 76 | Configuration conf = new Configuration(); 77 | conf.set("fs.defaultFS", hdfs); 78 | conf.set("mapreduce.app-submission.cross-platform", "true"); 79 | 80 | //2.设置MapReduce作业配置信息 81 | String jobName = "DateGroup2"; //作业名称 82 | Job job = Job.getInstance(conf, jobName); 83 | job.setJarByClass(DateGroup2.class); //指定运行时作业类 84 | job.setJar("export\\DateGroup2.jar"); //指定本地jar包 85 | job.setMapperClass(DateGroup2Mapper.class); //指定Mapper类 86 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 87 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 88 | job.setReducerClass(DateGroup2Reducer.class); //指定Reducer类 89 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 90 | job.setOutputValueClass(Text.class); //设置Reduce输出Value类型 91 | job.setGroupingComparatorClass(MyGroup.class); //设置自定义分组类 92 | //3.设置作业输入和输出路径 93 | String dataDir = "/expr/datecount/data"; //实验数据目录 94 | String outputDir = "/expr/datecount/output_group2"; //实验输出目录 95 | Path inPath = new Path(hdfs + dataDir); 96 | Path outPath = new Path(hdfs + outputDir); 97 | FileInputFormat.addInputPath(job, inPath); 98 | FileOutputFormat.setOutputPath(job, outPath); 99 | FileSystem fs 
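// NOTE (DateGroup2): MyGroup only changes how reduce input is grouped, not how it is sorted, so all dates of
// one year (contiguous after the default key sort) arrive in a single reduce call and their id values are
// concatenated per year. The Text key object is advanced as the values are iterated, but every key in the
// group shares the same four-digit prefix, so key.toString().substring(0,4) stays correct.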
= FileSystem.get(conf); 100 | if(fs.exists(outPath)) { 101 | fs.delete(outPath, true); 102 | } 103 | 104 | //4.运行作业 105 | System.out.println("Job: " + jobName + " is running..."); 106 | if(job.waitForCompletion(true)) { 107 | System.out.println("success!"); 108 | System.exit(0); 109 | } else { 110 | System.out.println("failed!"); 111 | System.exit(1); 112 | } 113 | } 114 | 115 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DatePartition.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Partitioner; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class DatePartition { 17 | 18 | public static class DatePartitionMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); 24 | Text date = new Text(strs[0]); 25 | context.write(date, one); 26 | } 27 | } 28 | 29 | public static class DatePartitionReducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | int sum = 0; 33 | for (IntWritable val : values) { 34 | sum += val.get(); 35 | } 36 | context.write(key, new IntWritable(sum)); 37 | } 38 | } 39 | 40 | public static class YearPartitioner extends Partitioner { 41 | @Override 42 | public int getPartition(Text key, IntWritable value, int numPartitions) { 43 | //根据年份对数据进行分区,返回不同分区号 44 | if (key.toString().startsWith("2015")) 45 | return 0 % numPartitions; 46 | else if (key.toString().startsWith("2016")) 47 | return 1 % numPartitions; 48 | else 49 | return 2 % numPartitions; 50 | } 51 | } 52 | 53 | public static void main(String[] args) throws Exception { 54 | //1.设置HDFS配置信息 55 | String namenode_ip = "192.168.17.10"; 56 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 57 | Configuration conf = new Configuration(); 58 | conf.set("fs.defaultFS", hdfs); 59 | conf.set("mapreduce.app-submission.cross-platform", "true"); 60 | 61 | //2.设置MapReduce作业配置信息 62 | String jobName = "DatePartition"; //定义作业名称 63 | Job job = Job.getInstance(conf, jobName); 64 | job.setJarByClass(DatePartition.class); //指定运行时作业类 65 | job.setJar("export\\DatePartition.jar"); //指定本地jar包 66 | // Map 67 | job.setMapperClass(DatePartitionMapper.class); //指定Mapper类 68 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 69 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 70 | // Reduce 71 | job.setReducerClass(DatePartitionReducer.class); //指定Reducer类 72 | // 全局 73 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 74 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 75 | // Partition 76 | job.setPartitionerClass(YearPartitioner.class); //自定义分区方法 77 | job.setNumReduceTasks(10); 
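// NOTE (DatePartition): YearPartitioner only ever returns 0, 1 or 2, so with 10 reduce tasks seven of the
// part-r-000NN files come out empty; setNumReduceTasks(3) would give exactly one file per year.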
//设置reduce任务的数量,该值传递给Partitioner.getPartition()方法的numPartitions参数 78 | 79 | //3.设置作业输入和输出路径 80 | String dataDir = "/expr/datecount/data"; //实验数据目录 81 | String outputDir = "/expr/datecount/output_partition"; //实验输出目录 82 | Path inPath = new Path(hdfs + dataDir); 83 | Path outPath = new Path(hdfs + outputDir); 84 | FileInputFormat.addInputPath(job, inPath); 85 | FileOutputFormat.setOutputPath(job, outPath); 86 | FileSystem fs = FileSystem.get(conf); 87 | if(fs.exists(outPath)) { 88 | fs.delete(outPath, true); 89 | } 90 | 91 | //4.运行作业 92 | System.out.println("Job: " + jobName + " is running..."); 93 | if(job.waitForCompletion(true)) { 94 | System.out.println("success!"); 95 | System.exit(0); 96 | } else { 97 | System.out.println("failed!"); 98 | System.exit(1); 99 | } 100 | } 101 | 102 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DatePartition2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Partitioner; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class DatePartition2 { 17 | 18 | public static class DatePartition2Mapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); 24 | Text date = new Text(strs[0]); 25 | context.write(date, one); 26 | } 27 | } 28 | 29 | public static class DatePartition2Reducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | int sum = 0; 33 | for (IntWritable val : values) { 34 | sum += val.get(); 35 | } 36 | context.write(key, new IntWritable(sum)); 37 | } 38 | } 39 | 40 | public static class YearPartitioner extends Partitioner { 41 | @Override 42 | public int getPartition(Text key, IntWritable value, int numPartitions) { 43 | //根据月份对数据进行分区,返回不同分区号 44 | String month = key.toString().substring(5,7); //substring取从下标5到下标7前一个字符,即下标5-6的字符 45 | switch (month) { 46 | case "01": return 1; 47 | case "02": return 2; 48 | case "03": return 3; 49 | case "04": return 4; 50 | case "05": return 5; 51 | case "06": return 6; 52 | case "07": return 7; 53 | case "08": return 8; 54 | case "09": return 9; 55 | case "10": return 10; 56 | case "11": return 11; 57 | case "12": return 12; 58 | default : return 0; 59 | } 60 | } 61 | } 62 | 63 | public static void main(String[] args) throws Exception { 64 | //1.设置HDFS配置信息 65 | String namenode_ip = "192.168.17.10"; 66 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 67 | Configuration conf = new Configuration(); 68 | conf.set("fs.defaultFS", hdfs); 69 | conf.set("mapreduce.app-submission.cross-platform", "true"); 70 | 71 | //2.设置MapReduce作业配置信息 72 | String jobName = "DatePartition2"; //定义作业名称 73 | Job job = Job.getInstance(conf, jobName); 74 | 
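// NOTE (DatePartition2): the month partitioner below returns partition numbers 0-12, but the driver sets only
// 3 reduce tasks, so any record whose month maps to partition 3 or higher makes the map task fail with
// "Illegal partition for ...". Either use setNumReduceTasks(13), or clamp the partitioner to the actual task
// count (editor's sketch): return Integer.parseInt(month) % numPartitions;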
job.setJarByClass(DatePartition2.class); //指定运行时作业类 75 | job.setJar("export\\DatePartition2.jar"); //指定本地jar包 76 | job.setMapperClass(DatePartition2Mapper.class); //指定Mapper类 77 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 78 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 79 | job.setReducerClass(DatePartition2Reducer.class); //指定Reducer类 80 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 81 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 82 | job.setPartitionerClass(YearPartitioner.class); //自定义分区方法 83 | job.setNumReduceTasks(3); //设置reduce任务的数量,该值传递给Partitioner.getPartition()方法的numPartitions参数 84 | 85 | //3.设置作业输入和输出路径 86 | String dataDir = "/expr/datecount/data"; //实验数据目录 87 | String outputDir = "/expr/datecount/output_partition2"; //实验输出目录 88 | Path inPath = new Path(hdfs + dataDir); 89 | Path outPath = new Path(hdfs + outputDir); 90 | FileInputFormat.addInputPath(job, inPath); 91 | FileOutputFormat.setOutputPath(job, outPath); 92 | FileSystem fs = FileSystem.get(conf); 93 | if(fs.exists(outPath)) { 94 | fs.delete(outPath, true); 95 | } 96 | 97 | //4.运行作业 98 | System.out.println("Job: " + jobName + " is running..."); 99 | if(job.waitForCompletion(true)) { 100 | System.out.println("success!"); 101 | System.exit(0); 102 | } else { 103 | System.out.println("failed!"); 104 | System.exit(1); 105 | } 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateSort.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class DateSort { 16 | public static class DateSortMapper extends Mapper { //key-value类型不同于以往的 17 | IntWritable num = new IntWritable(); 18 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 19 | String[] strs = value.toString().split("\t"); //从DateCount运行结果读取数据,默认是用Tab分割输入 20 | String date = strs[0]; //获取日期 21 | num.set(Integer.parseInt(strs[1])); //获取次数 22 | context.write(num, new Text(date)); //以次数作为key,日期作为value输出;利用shuffle自动对key升序排序的特性 23 | } 24 | } 25 | 26 | public static class DateSortReducer extends Reducer { 27 | public void reduce(IntWritable key, Iterable values, Context context) throws IOException, InterruptedException { 28 | for (Text val : values) { 29 | context.write(val, key);//Map阶段将日期和次数反过来以实现排序,Reduce这里再次翻转key-value 30 | } 31 | } 32 | } 33 | 34 | public static void main(String[] args) throws Exception { 35 | //1.设置HDFS配置信息 36 | String namenode_ip = "192.168.17.10"; 37 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 38 | Configuration conf = new Configuration(); 39 | conf.set("fs.defaultFS", hdfs); 40 | conf.set("mapreduce.app-submission.cross-platform", "true"); 41 | 42 | //2.设置MapReduce作业配置信息 43 | String jobName = "DateSort"; //定义作业名称 44 | Job job = Job.getInstance(conf, jobName); 45 | job.setJarByClass(DateSort.class); //指定作业类 46 | 
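// NOTE (DateSort): the mapper swaps (date, count) from DateCount's output into (count, date) so that the
// shuffle's ascending sort on the IntWritable key does the ordering, and the reducer swaps them back; with
// the default single reduce task the result is one globally sorted file, e.g. counts 1, 2, 5 in that order.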
job.setJar("export\\DateSort.jar"); //指定本地jar包 47 | job.setMapperClass(DateSortMapper.class); //指定Mapper类 48 | job.setMapOutputKeyClass(IntWritable.class); //设置Mapper输出Key类型 49 | job.setMapOutputValueClass(Text.class); //设置Mapper输出Value类型 50 | job.setReducerClass(DateSortReducer.class); //指定Reducer类 51 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 52 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 53 | 54 | //3.设置作业输入和输出路径 55 | String dataDir = "/expr/datecount/output/part-r-00000"; //实验数据目录 56 | String outputDir = "/expr/datecount/output_sort"; //实验输出目录 57 | Path inPath = new Path(hdfs + dataDir); 58 | Path outPath = new Path(hdfs + outputDir); 59 | FileInputFormat.addInputPath(job, inPath); 60 | FileOutputFormat.setOutputPath(job, outPath); 61 | FileSystem fs = FileSystem.get(conf); 62 | if(fs.exists(outPath)) { 63 | fs.delete(outPath, true); 64 | } 65 | 66 | //4.运行作业 67 | System.out.println("Job: " + jobName + " is running..."); 68 | if(job.waitForCompletion(true)) { 69 | System.out.println("success!"); 70 | System.exit(0); 71 | } else { 72 | System.out.println("failed!"); 73 | System.exit(1); 74 | } 75 | } 76 | 77 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateSort2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.WritableComparable; 10 | import org.apache.hadoop.io.WritableComparator; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | public class DateSort2 { 18 | 19 | public static class DateSort2Mapper extends Mapper { 20 | IntWritable num = new IntWritable(); 21 | public void map(Object key, Text value, Context context ) throws IOException, InterruptedException { 22 | String[] strs = value.toString().split("\t"); //从DateCount运行结果读取数据,默认是用Tab分割输入 23 | String date = strs[0]; //获取日期 24 | num.set(Integer.parseInt(strs[1])); //获取次数 25 | context.write(num, new Text(date)); //以次数作为key,日期作为value输出 26 | } 27 | } 28 | 29 | public static class DateSort2Reducer extends Reducer { 30 | public void reduce(IntWritable key, Iterable values, Context context) throws IOException, InterruptedException { 31 | for (Text val : values) { 32 | context.write(val, key); 33 | } 34 | } 35 | } 36 | 37 | // 自定义Key排序算法 38 | public static class MySort extends WritableComparator { 39 | public MySort() { 40 | super(IntWritable.class, true); 41 | } 42 | 43 | @SuppressWarnings({ "rawtypes", "unchecked" }) 44 | public int compare(WritableComparable a, WritableComparable b) { 45 | return b.compareTo(a);// 默认升序a比b小返回-1,升序排序;现在a比b小,返回1,降序排序 46 | } 47 | } 48 | 49 | public static void main(String[] args) throws Exception { 50 | //1.设置HDFS配置信息 51 | String namenode_ip = "192.168.17.10"; 52 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 53 | Configuration conf = new Configuration(); 54 | conf.set("fs.defaultFS", hdfs); 55 | conf.set("mapreduce.app-submission.cross-platform", "true"); 56 | 57 | 
//2.设置MapReduce作业配置信息 58 | String jobName = "DateSort2"; //定义作业名称 59 | Job job = Job.getInstance(conf, jobName); 60 | job.setJarByClass(DateSort2.class); //指定作业类 61 | job.setJar("export\\DateSort2.jar"); //指定本地jar包 62 | // Map 63 | job.setMapperClass(DateSort2Mapper.class); //指定Mapper类 64 | job.setMapOutputKeyClass(IntWritable.class); //设置Mapper输出Key类型 65 | job.setMapOutputValueClass(Text.class); //设置Mapper输出Value类型 66 | // Reduce 67 | job.setReducerClass(DateSort2Reducer.class); //指定Reducer类 68 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 69 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 70 | // 自定义Sort 71 | job.setSortComparatorClass(MySort.class); //设置自定义排序类 72 | 73 | //3.设置作业输入和输出路径 74 | String dataDir = "/expr/datecount/output/part-r-00000"; //实验数据目录 75 | String outputDir = "/expr/datecount/output_sort2"; //实验输出目录 76 | Path inPath = new Path(hdfs + dataDir); 77 | Path outPath = new Path(hdfs + outputDir); 78 | FileInputFormat.addInputPath(job, inPath); 79 | FileOutputFormat.setOutputPath(job, outPath); 80 | FileSystem fs = FileSystem.get(conf); 81 | if(fs.exists(outPath)) { 82 | fs.delete(outPath, true); 83 | } 84 | 85 | //4.运行作业 86 | System.out.println("Job: " + jobName + " is running..."); 87 | if(job.waitForCompletion(true)) { 88 | System.out.println("success!"); 89 | System.exit(0); 90 | } else { 91 | System.out.println("failed!"); 92 | System.exit(1); 93 | } 94 | } 95 | 96 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/datecount/DateSort3.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.datecount; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.NullWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.io.WritableComparable; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | 19 | public class DateSort3 { 20 | 21 | public static class MyKey implements WritableComparable { 22 | private String date; 23 | private int num; 24 | 25 | public String getDate() { 26 | return date; 27 | } 28 | 29 | public void setDate(String date) { 30 | this.date = date; 31 | } 32 | 33 | public int getNum() { 34 | return num; 35 | } 36 | 37 | public void setNum(int num) { 38 | this.num = num; 39 | } 40 | 41 | public MyKey() { 42 | } 43 | 44 | public MyKey(String date, int num) { 45 | this.date = date; 46 | this.num = num; 47 | } 48 | 49 | @Override 50 | public void write(DataOutput out) throws IOException { 51 | out.writeUTF(date); 52 | out.writeInt(num); 53 | } 54 | 55 | @Override 56 | public void readFields(DataInput in) throws IOException { 57 | date = in.readUTF(); 58 | num = in.readInt(); 59 | } 60 | 61 | @Override 62 | public int compareTo(MyKey o) { 63 | //按date升序,num降序 64 | if (!date.equals(o.date)) //相等的话,返回true,取反为false 65 | return date.compareTo(o.date); 66 | else 67 | return o.num-num; 68 | } 69 | } 70 | 71 | public static class DateSort3Mapper extends Mapper { 72 | public void map(Object key, Text value, 
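// NOTE (DateSort3.MyKey): compareTo orders by date ascending and, for equal dates, by num descending via
// "o.num - num". That subtraction is fine for small counts but can overflow for large values; the safer
// idiom would be (editor's sketch): return Integer.compare(o.num, num);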
Context context ) 73 | throws IOException, InterruptedException { 74 | String[] strs = value.toString().split(" "); 75 | MyKey myKey = new MyKey(strs[0], Integer.parseInt(strs[1])); 76 | context.write(myKey, NullWritable.get()); //将自定义的myKey作为Map KEY输出 77 | } 78 | } 79 | 80 | public static class DateSort3Reducer extends Reducer { 81 | public void reduce(MyKey key, Iterable values, Context context) 82 | throws IOException, InterruptedException { 83 | context.write(new Text(key.date), new IntWritable(key.num)); 84 | } 85 | } 86 | 87 | public static void main(String[] args) throws Exception { 88 | //1.设置HDFS配置信息 89 | String namenode_ip = "192.168.17.10"; 90 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 91 | Configuration conf = new Configuration(); 92 | conf.set("fs.defaultFS", hdfs); 93 | conf.set("mapreduce.app-submission.cross-platform", "true"); 94 | 95 | //2.设置MapReduce作业配置信息 96 | String jobName = "DateSort3"; //定义作业名称 97 | Job job = Job.getInstance(conf, jobName); 98 | job.setJarByClass(DateSort3.class); //指定运行时作业类 99 | job.setJar("export\\DateSort3.jar"); //指定本地jar包 100 | job.setMapperClass(DateSort3Mapper.class); //指定Mapper类 101 | job.setMapOutputKeyClass(MyKey.class); //设置Mapper输出Key类型 102 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 103 | job.setReducerClass(DateSort3Reducer.class); //指定Reducer类 104 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 105 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 106 | 107 | //3.设置作业输入和输出路径 108 | String dataDir = "/expr/datecount/data"; //实验数据目录 109 | String outputDir = "/expr/datecount/output_sort3"; //实验输出目录 110 | Path inPath = new Path(hdfs + dataDir); 111 | Path outPath = new Path(hdfs + outputDir); 112 | FileInputFormat.addInputPath(job, inPath); 113 | FileOutputFormat.setOutputPath(job, outPath); 114 | FileSystem fs = FileSystem.get(conf); 115 | if(fs.exists(outPath)) { 116 | fs.delete(outPath, true); 117 | } 118 | 119 | //4.运行作业 120 | System.out.println("Job: " + jobName + " is running..."); 121 | if(job.waitForCompletion(true)) { 122 | System.out.println("success!"); 123 | System.exit(0); 124 | } else { 125 | System.out.println("failed!"); 126 | System.exit(1); 127 | } 128 | } 129 | 130 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/FixedLengthInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class FixedLengthInput { 17 | public static class FixedLengthInputMapper extends Mapper { 18 | public void map(LongWritable key, BytesWritable value, Context context ) 19 | throws IOException, InterruptedException { 20 | context.write(key, value); 21 | } 22 | } 23 | 24 | public static class FixedLengthInputReducer extends Reducer { 25 | public void reduce(LongWritable key, Iterable values, Context 
context) 26 | throws IOException, InterruptedException { 27 | for (BytesWritable val : values) { 28 | context.write(key, val); 29 | } 30 | } 31 | } 32 | 33 | public static void main(String[] args) throws Exception { 34 | //1.设置HDFS配置信息 35 | String namenode_ip = "192.168.17.10"; 36 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 37 | Configuration conf = new Configuration(); 38 | conf.set("fs.defaultFS", hdfs); 39 | conf.set("mapreduce.app-submission.cross-platform", "true"); 40 | conf.setInt(FixedLengthInputFormat.FIXED_RECORD_LENGTH, 13); 41 | 42 | //2.设置MapReduce作业配置信息 43 | String jobName = "FixedLengthInput"; //作业名称 44 | Job job = Job.getInstance(conf, jobName); 45 | job.setJarByClass(FixedLengthInput.class); //指定运行时作业类 46 | job.setJar("export\\FixedLengthInput.jar"); //指定本地jar包 47 | job.setMapperClass(FixedLengthInputMapper.class); //指定Mapper类 48 | job.setMapOutputKeyClass(LongWritable.class); //设置Mapper输出Key类型 49 | job.setMapOutputValueClass(BytesWritable.class); //设置Mapper输出Value类型 50 | job.setReducerClass(FixedLengthInputReducer.class); //指定Reducer类 51 | job.setOutputKeyClass(LongWritable.class); //设置Reduce输出Key类型 52 | job.setOutputValueClass(BytesWritable.class); //设置Reduce输出Value类型 53 | 54 | job.setInputFormatClass(FixedLengthInputFormat.class); //设置输入格式化类 55 | 56 | //3.设置作业输入和输出路径 57 | String dataDir = "/expr/fixedinput/data"; //实验数据目录 58 | String outputDir = "/expr/fixedinput/output"; //实验输出目录 59 | Path inPath = new Path(hdfs + dataDir); 60 | Path outPath = new Path(hdfs + outputDir); 61 | FileInputFormat.addInputPath(job, inPath); 62 | FileOutputFormat.setOutputPath(job, outPath); 63 | FileSystem fs = FileSystem.get(conf); 64 | if(fs.exists(outPath)) { 65 | fs.delete(outPath, true); 66 | } 67 | 68 | //4.运行作业 69 | System.out.println("Job: " + jobName + " is running..."); 70 | if(job.waitForCompletion(true)) { 71 | System.out.println("success!"); 72 | System.exit(0); 73 | } else { 74 | System.out.println("failed!"); 75 | System.exit(1); 76 | } 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/FixedLengthInput2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | public class FixedLengthInput2 { 19 | public static class FixedLengthInput2Mapper extends Mapper { 20 | public void map(LongWritable key, BytesWritable value, Context context ) 21 | throws IOException, InterruptedException { 22 | String val = new String(value.getBytes(), 0, value.getLength()-1); 23 | String[] strs = val.split(" "); 24 | context.write(new Text(strs[0]), new IntWritable(Integer.parseInt(strs[1]))); 25 | } 26 | } 27 | 28 | public static class FixedLengthInput2Reducer extends Reducer { 29 | public void reduce(Text key, Iterable 
values, Context context) 30 | throws IOException, InterruptedException { 31 | for (IntWritable val : values) { 32 | context.write(key, val); 33 | } 34 | } 35 | } 36 | 37 | public static void main(String[] args) throws Exception { 38 | //1.设置HDFS配置信息 39 | String namenode_ip = "192.168.17.10"; 40 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 41 | Configuration conf = new Configuration(); 42 | conf.set("fs.defaultFS", hdfs); 43 | conf.set("mapreduce.app-submission.cross-platform", "true"); 44 | conf.setInt(FixedLengthInputFormat.FIXED_RECORD_LENGTH, 13); 45 | 46 | //2.设置MapReduce作业配置信息 47 | String jobName = "FixedLengthInput2"; //作业名称 48 | Job job = Job.getInstance(conf, jobName); 49 | job.setJarByClass(FixedLengthInput2.class); //指定运行时作业类 50 | job.setJar("export\\FixedLengthInput2.jar"); //指定本地jar包 51 | job.setMapperClass(FixedLengthInput2Mapper.class); //指定Mapper类 52 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 53 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 54 | job.setReducerClass(FixedLengthInput2Reducer.class); //指定Reducer类 55 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 56 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 57 | 58 | job.setInputFormatClass(FixedLengthInputFormat.class); //设置输入格式化类 59 | 60 | //3.设置作业输入和输出路径 61 | String dataDir = "/expr/fixedinput/data"; //实验数据目录 62 | String outputDir = "/expr/fixedinput/output"; //实验输出目录 63 | Path inPath = new Path(hdfs + dataDir); 64 | Path outPath = new Path(hdfs + outputDir); 65 | FileInputFormat.addInputPath(job, inPath); 66 | FileOutputFormat.setOutputPath(job, outPath); 67 | FileSystem fs = FileSystem.get(conf); 68 | if(fs.exists(outPath)) { 69 | fs.delete(outPath, true); 70 | } 71 | 72 | //4.运行作业 73 | System.out.println("Job: " + jobName + " is running..."); 74 | if(job.waitForCompletion(true)) { 75 | System.out.println("success!"); 76 | System.exit(0); 77 | } else { 78 | System.out.println("failed!"); 79 | System.exit(1); 80 | } 81 | } 82 | 83 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/KeyValueInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class KeyValueInput { 17 | 18 | public static class KeyValueInputMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Text key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | context.write(key, one); //Mapper的输入KEY就是日期 24 | } 25 | } 26 | 27 | public static class KeyValueInputReducer extends Reducer { 28 | public void reduce(Text key, Iterable values, Context context) 29 | throws IOException, InterruptedException { 30 | int sum = 0; 31 | for (IntWritable val : values) { 32 | sum += val.get(); 33 | } 34 | context.write(key, new 
IntWritable(sum)); 35 | } 36 | } 37 | 38 | public static void main(String[] args) throws Exception { 39 | //1.设置HDFS配置信息 40 | String namenode_ip = "192.168.17.10"; 41 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 42 | Configuration conf = new Configuration(); 43 | conf.set("fs.defaultFS", hdfs); 44 | conf.set("mapreduce.app-submission.cross-platform", "true"); 45 | conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ":"); //设置输入文件kv分隔符 46 | 47 | //2.设置MapReduce作业配置信息 48 | String jobName = "KeyValueInput"; //作业名称 49 | Job job = Job.getInstance(conf, jobName); 50 | job.setJarByClass(KeyValueInput.class); //指定运行时作业类 51 | job.setJar("export\\KeyValueInput.jar"); //指定本地jar包 52 | job.setMapperClass(KeyValueInputMapper.class); //指定Mapper类 53 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 54 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 55 | job.setReducerClass(KeyValueInputReducer.class); //指定Reducer类 56 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 57 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 58 | 59 | job.setInputFormatClass(KeyValueTextInputFormat.class); //设置输入格式化类 60 | 61 | //3.设置作业输入和输出路径 62 | String dataDir = "/expr/kvinput/data"; //实验数据目录 63 | String outputDir = "/expr/kvinput/output"; //实验输出目录 64 | Path inPath = new Path(hdfs + dataDir); 65 | Path outPath = new Path(hdfs + outputDir); 66 | FileInputFormat.addInputPath(job, inPath); 67 | FileOutputFormat.setOutputPath(job, outPath); 68 | FileSystem fs = FileSystem.get(conf); 69 | if(fs.exists(outPath)) { 70 | fs.delete(outPath, true); 71 | } 72 | 73 | //4.运行作业 74 | System.out.println("Job: " + jobName + " is running..."); 75 | if(job.waitForCompletion(true)) { 76 | System.out.println("success!"); 77 | System.exit(0); 78 | } else { 79 | System.out.println("failed!"); 80 | System.exit(1); 81 | } 82 | } 83 | 84 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/MultInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class MultInput { 16 | 17 | public static class MultInputMapper extends Mapper { 18 | private final static IntWritable one = new IntWritable(1); 19 | 20 | public void map(Object key, Text value, Context context ) 21 | throws IOException, InterruptedException { 22 | String[] strs = value.toString().split(" "); //按空格分割输入 23 | Text date = new Text(strs[0]); //获取日期 24 | context.write(date, one); //将日期和常数1作为Map输出 25 | } 26 | } 27 | 28 | public static class MultInputReducer extends Reducer { 29 | public void reduce(Text key, Iterable values, Context context) 30 | throws IOException, InterruptedException { 31 | int sum = 0; 32 | for (IntWritable val : values) { 33 | sum += val.get(); 34 | } 35 | context.write(key, new IntWritable(sum)); 36 | } 37 | } 38 | 39 | public static void main(String[] args) throws Exception { 40 | //1.设置HDFS配置信息 
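		// The settings below mirror every other driver in this repo: fs.defaultFS points the
		// client at the HDFS NameNode, and mapreduce.app-submission.cross-platform=true lets
		// the locally built jar (see job.setJar with its Windows-style "export\\" path) be
		// submitted to the Linux cluster from the development machine.
		// Section 3 below demonstrates three ways of registering the two input directories;
		// a Path-varargs overload of setInputPaths also exists and would read, as an
		// untested sketch:
		//   FileInputFormat.setInputPaths(job, new Path(hdfs + "/expr/multinput/data/txt1"),
		//                                      new Path(hdfs + "/expr/multinput/data/txt2"));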
41 | String namenode_ip = "192.168.17.10"; 42 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 43 | Configuration conf = new Configuration(); 44 | conf.set("fs.defaultFS", hdfs); 45 | conf.set("mapreduce.app-submission.cross-platform", "true"); 46 | 47 | //2.设置MapReduce作业配置信息 48 | String jobName = "MultInput"; //作业名称 49 | Job job = Job.getInstance(conf, jobName); 50 | job.setJarByClass(MultInput.class); //指定运行时作业类 51 | job.setJar("export\\MultInput.jar"); //指定本地jar包 52 | job.setMapperClass(MultInputMapper.class); //指定Mapper类 53 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 54 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 55 | job.setReducerClass(MultInputReducer.class); //指定Reducer类 56 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 57 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 58 | 59 | //3.设置作业输入和输出路径 60 | //方法一:FileInputFormat.addInputPath() 61 | FileInputFormat.addInputPath(job, new Path(hdfs+"/expr/multinput/data/txt1"));//输入目录1 62 | FileInputFormat.addInputPath(job, new Path(hdfs+"/expr/multinput/data/txt2"));//输入目录2 63 | 64 | //方法二:FileInputFormat.addInputPaths() 65 | //FileInputFormat.addInputPaths(job, String.join(",", hdfs+"/expr/multinput/data/txt1", hdfs+"/expr/multinput/data/txt2")); 66 | 67 | //方法三:FileInputFormat.setInputPaths() 68 | //FileInputFormat.setInputPaths(job, String.join(",", hdfs+"/expr/multinput/data/txt1", hdfs+"/expr/multinput/data/txt2") ); 69 | 70 | Path outPath = new Path(hdfs + "/expr/multinput/output"); //输出目录 71 | FileOutputFormat.setOutputPath(job, outPath); 72 | FileSystem fs = FileSystem.get(conf); 73 | if(fs.exists(outPath)) { 74 | fs.delete(outPath, true); 75 | } 76 | 77 | //4.运行作业 78 | System.out.println("Job: " + jobName + " is running..."); 79 | if(job.waitForCompletion(true)) { 80 | System.out.println("success!"); 81 | System.exit(0); 82 | } else { 83 | System.out.println("failed!"); 84 | System.exit(1); 85 | } 86 | } 87 | 88 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/MultInput2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class MultInput2 { 17 | 18 | public static class TxtFileMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); //按空格分割输入 24 | Text date = new Text(strs[0]); 25 | context.write(date, one); 26 | } 27 | } 28 | 29 | public static class CsvFileMapper extends Mapper { 30 | private final static IntWritable one = new IntWritable(1); 31 | 32 | public void map(Object key, Text value, Context context ) 33 | throws IOException, InterruptedException { 34 | String[] strs = 
value.toString().split(","); //按逗号分割输入 35 | Text date = new Text(strs[0]); 36 | context.write(date, one); 37 | } 38 | } 39 | 40 | public static class MultInput2Reducer extends Reducer { 41 | public void reduce(Text key, Iterable values, Context context) 42 | throws IOException, InterruptedException { 43 | int sum = 0; 44 | for (IntWritable val : values) { 45 | sum += val.get(); 46 | } 47 | context.write(key, new IntWritable(sum)); 48 | } 49 | } 50 | 51 | public static void main(String[] args) throws Exception { 52 | //1.设置HDFS配置信息 53 | String namenode_ip = "192.168.17.10"; 54 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 55 | Configuration conf = new Configuration(); 56 | conf.set("fs.defaultFS", hdfs); 57 | conf.set("mapreduce.app-submission.cross-platform", "true"); 58 | 59 | //2.设置MapReduce作业配置信息 60 | String jobName = "MultInput2"; //作业名称 61 | Job job = Job.getInstance(conf, jobName); 62 | job.setJarByClass(MultInput2.class); //指定运行时作业类 63 | job.setJar("export\\MultInput2.jar"); //指定本地jar包 64 | 65 | //job.setMapperClass(MultInput2Mapper.class); //无需指定Mapper类,而在MultipleInputs.addInputPath()方法中指定 66 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 67 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 68 | job.setReducerClass(MultInput2Reducer.class); //指定Reducer类 69 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 70 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 71 | 72 | //3.设置作业输入和输出路径 73 | //方法五:MultipleInputs.addInputPath() 74 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multinput/data/txt1"), TextInputFormat.class, TxtFileMapper.class); 75 | MultipleInputs.addInputPath(job, new Path(hdfs+"/expr/multinput/data/csv"), TextInputFormat.class, CsvFileMapper.class); 76 | 77 | Path outPath = new Path(hdfs + "/expr/multinput/output3"); //输出目录 78 | FileOutputFormat.setOutputPath(job, outPath); 79 | FileSystem fs = FileSystem.get(conf); 80 | if(fs.exists(outPath)) { 81 | fs.delete(outPath, true); 82 | } 83 | 84 | //4.运行作业 85 | System.out.println("Job: " + jobName + " is running..."); 86 | if(job.waitForCompletion(true)) { 87 | System.out.println("success!"); 88 | System.exit(0); 89 | } else { 90 | System.out.println("failed!"); 91 | System.exit(1); 92 | } 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/NLineInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | public class NLineInput { 18 | 19 | public static class NLineInputMapper extends Mapper { 20 | private final static IntWritable one = new IntWritable(1); 21 | 22 | public void map(LongWritable key, Text value, Context context ) 23 | throws IOException, InterruptedException { 24 | System.out.println("value: "+value.toString()); 25 | String[] 
strs = value.toString().split(" "); 26 | System.out.println("NLines strs is:"+strs); 27 | System.out.println("strs[0]"+strs[0]); 28 | Text date = new Text(strs[0]); 29 | context.write(date, one); 30 | } 31 | } 32 | 33 | public static class NLineInputReducer extends Reducer { 34 | public void reduce(Text key, Iterable values, Context context) 35 | throws IOException, InterruptedException { 36 | int sum = 0; 37 | for (IntWritable val : values) { 38 | sum += val.get(); 39 | } 40 | context.write(key, new IntWritable(sum)); 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | //1.设置HDFS配置信息 46 | String namenode_ip = "192.168.17.10"; 47 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 48 | Configuration conf = new Configuration(); 49 | conf.set("fs.defaultFS", hdfs); 50 | conf.set("mapreduce.app-submission.cross-platform", "true"); 51 | conf.setInt("mapreduce.input.lineinputformat.linespermap", 1000); //设置每个Map处理的行数 52 | 53 | //2.设置MapReduce作业配置信息 54 | String jobName = "NLineInput"; //作业名称 55 | Job job = Job.getInstance(conf, jobName); 56 | job.setJarByClass(NLineInput.class); //指定运行时作业类 57 | job.setJar("export\\NLineInput.jar"); //指定本地jar包 58 | job.setMapperClass(NLineInputMapper.class); //指定Mapper类 59 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 60 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 61 | job.setReducerClass(NLineInputReducer.class); //指定Reducer类 62 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 63 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 64 | 65 | job.setInputFormatClass(NLineInputFormat.class); //设置输入格式化类 66 | 67 | //3.设置作业输入和输出路径 68 | String dataDir = "/expr/nlineinput/data"; //实验数据目录 69 | String outputDir = "/expr/nlineinput/output"; //实验输出目录 70 | Path inPath = new Path(hdfs + dataDir); 71 | Path outPath = new Path(hdfs + outputDir); 72 | FileInputFormat.addInputPath(job, inPath); 73 | FileOutputFormat.setOutputPath(job, outPath); 74 | FileSystem fs = FileSystem.get(conf); 75 | if(fs.exists(outPath)) { 76 | fs.delete(outPath, true); 77 | } 78 | 79 | //4.运行作业 80 | System.out.println("Job: " + jobName + " is running..."); 81 | if(job.waitForCompletion(true)) { 82 | System.out.println("success!"); 83 | System.exit(0); 84 | } else { 85 | System.out.println("failed!"); 86 | System.exit(1); 87 | } 88 | } 89 | 90 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/inputformat/SequenceInput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class SequenceInput { 17 | 18 | public static class SequenceInputMapper extends Mapper { 19 | public void map(Text key, IntWritable value, Context context ) 20 | throws IOException, InterruptedException { 21 | int v = value.get() + 10; 22 | context.write(key, new IntWritable(v)); 23 | 
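			// No parsing is needed here: SequenceFileInputFormat already deserializes each
			// record into its stored (Text, IntWritable) key/value pair, so the mapper only
			// adds 10 to the stored count and re-emits the pair with the same types.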
} 24 | } 25 | 26 | public static class SequenceInputReducer extends Reducer { 27 | public void reduce(Text key, Iterable values, Context context) 28 | throws IOException, InterruptedException { 29 | for (IntWritable val : values) { 30 | context.write(key, val); 31 | } 32 | } 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | //1.设置HDFS配置信息 37 | String namenode_ip = "192.168.17.10"; 38 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 39 | Configuration conf = new Configuration(); 40 | conf.set("fs.defaultFS", hdfs); 41 | conf.set("mapreduce.app-submission.cross-platform", "true"); 42 | 43 | //2.设置MapReduce作业配置信息 44 | String jobName = "SequenceInput"; //作业名称 45 | Job job = Job.getInstance(conf, jobName); 46 | job.setJarByClass(SequenceInput.class); //指定运行时作业类 47 | job.setJar("export\\SequenceInput.jar"); //指定本地jar包 48 | job.setMapperClass(SequenceInputMapper.class); //指定Mapper类 49 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 50 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 51 | job.setReducerClass(SequenceInputReducer.class); //指定Reducer类 52 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 53 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 54 | 55 | job.setInputFormatClass(SequenceFileInputFormat.class); //设置输入格式化类 56 | 57 | //3.设置作业输入和输出路径 58 | String dataDir = "/expr/seqinput/data"; //实验数据目录 59 | String outputDir = "/expr/seqinput/output"; //实验输出目录 60 | Path inPath = new Path(hdfs + dataDir); 61 | Path outPath = new Path(hdfs + outputDir); 62 | FileInputFormat.addInputPath(job, inPath); 63 | FileOutputFormat.setOutputPath(job, outPath); 64 | FileSystem fs = FileSystem.get(conf); 65 | if(fs.exists(outPath)) { 66 | fs.delete(outPath, true); 67 | } 68 | 69 | //4.运行作业 70 | System.out.println("Job: " + jobName + " is running..."); 71 | if(job.waitForCompletion(true)) { 72 | System.out.println("success!"); 73 | System.exit(0); 74 | } else { 75 | System.out.println("failed!"); 76 | System.exit(1); 77 | } 78 | } 79 | 80 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/StartRun.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import org.apache.hadoop.conf.Configuration; 7 | 8 | public class StartRun { 9 | public static void main(String[] args) throws IllegalArgumentException, ClassNotFoundException, IOException, InterruptedException { 10 | String namenode_ip = "192.168.17.10"; 11 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 12 | Configuration conf = new Configuration(); 13 | conf.set("fs.defaultFS", hdfs); 14 | conf.set("mapreduce.app-submission.cross-platform", "true"); 15 | 16 | Map paths = new HashMap(); 17 | paths.put("Step1Input", "/expr/itemcf/data"); 18 | paths.put("Step1Output", "/expr/itemcf/output/output1"); 19 | 20 | paths.put("Step2Input", paths.get("Step1Output")); //后面每一步的输入路径都是前一步的输出路径 21 | paths.put("Step2Output", "/expr/itemcf/output/output2"); 22 | 23 | paths.put("Step3Input", paths.get("Step2Output")); 24 | paths.put("Step3Output", "/expr/itemcf/output/output3"); 25 | 26 | paths.put("Step4Input1", paths.get("Step2Output")); 27 | paths.put("Step4Input2", paths.get("Step3Output")); 28 | paths.put("Step4Output", "/expr/itemcf/output/output4"); 29 | 30 | paths.put("Step5Input", paths.get("Step4Output")); 31 | 
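		// (Each remaining step likewise reads the directory written by the step before it;
		// the Step1..Step6 calls further down run the whole item-CF pipeline in order.)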
paths.put("Step5Output", "/expr/itemcf/output/output5"); 32 | 33 | paths.put("Step6Input", paths.get("Step5Output")); 34 | paths.put("Step6Output", "/expr/itemcf/output/output6"); 35 | 36 | Step1.run(conf, paths); //去重 37 | Step2.run(conf, paths); //计算用户评分矩阵 38 | Step3.run(conf, paths); //计算同现矩阵 39 | Step4.run(conf, paths); //计算单项评分=同现矩阵*评分矩阵 40 | Step5.run(conf, paths); //计算评分总和 41 | Step6.run(conf, paths); //评分排序取Top10 42 | 43 | System.out.println("finished!"); 44 | } 45 | 46 | public static Map R = new HashMap(); 47 | static { 48 | R.put("click", 1); //浏览 49 | R.put("collect", 2); //收藏 50 | R.put("cart", 3); //放入购物车 51 | R.put("alipay", 4); //支付 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step1.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | //去重 18 | public class Step1 { 19 | public static boolean run(Configuration config, Map paths) 20 | throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException { 21 | String jobName = "step1"; 22 | Job job = Job.getInstance(config, jobName); 23 | job.setJarByClass(Step1.class); 24 | job.setJar("export\\ItemCF.jar"); 25 | job.setMapperClass(Step1_Mapper.class); 26 | job.setReducerClass(Step1_Reducer.class); 27 | job.setMapOutputKeyClass(Text.class); 28 | job.setMapOutputValueClass(NullWritable.class); 29 | 30 | Path inPath = new Path(paths.get("Step1Input")); 31 | Path outpath = new Path(paths.get("Step1Output")); 32 | FileInputFormat.addInputPath(job, inPath); 33 | FileOutputFormat.setOutputPath(job, outpath); 34 | FileSystem fs = FileSystem.get(config); 35 | if (fs.exists(outpath)) { 36 | fs.delete(outpath, true); 37 | } 38 | 39 | return job.waitForCompletion(true); 40 | } 41 | 42 | static class Step1_Mapper extends Mapper { 43 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 44 | if (key.get() != 0) { //过滤掉输入文件标题行 45 | context.write(value, NullWritable.get()); 46 | } 47 | } 48 | } 49 | 50 | static class Step1_Reducer extends Reducer { 51 | protected void reduce(Text key, Iterable values, Context context) 52 | throws IOException, InterruptedException { 53 | context.write(key, NullWritable.get()); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step2.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import java.util.Map.Entry; 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import 
org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | 18 | //计算用户评分矩阵 19 | public class Step2 { 20 | public static boolean run(Configuration config, Map paths) 21 | throws IOException, ClassNotFoundException, InterruptedException { 22 | String jobName = "step2"; 23 | Job job = Job.getInstance(config, jobName); 24 | job.setJarByClass(Step2.class); 25 | job.setJar("export\\ItemCF.jar"); 26 | job.setMapperClass(Step2_Mapper.class); 27 | job.setReducerClass(Step2_Reducer.class); 28 | job.setMapOutputKeyClass(Text.class); 29 | job.setMapOutputValueClass(Text.class); 30 | 31 | Path inPath = new Path(paths.get("Step2Input")); 32 | Path outpath = new Path(paths.get("Step2Output")); 33 | FileInputFormat.addInputPath(job, inPath); 34 | FileOutputFormat.setOutputPath(job, outpath); 35 | FileSystem fs = FileSystem.get(config); 36 | if (fs.exists(outpath)) { 37 | fs.delete(outpath, true); 38 | } 39 | 40 | return job.waitForCompletion(true); 41 | } 42 | 43 | static class Step2_Mapper extends Mapper { 44 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 45 | String[] strs = value.toString().split(","); 46 | String item = strs[0]; //商品id 47 | String user = strs[1]; //用户id 48 | String action = strs[2]; //用户行为 49 | Integer rv = StartRun.R.get(action); //获取行为评分 50 | Text v = new Text(item + ":" + rv.intValue()); //value格式: "i1:1" 51 | Text k = new Text(user); 52 | context.write(k, v); //map输出格式: "u2723 i1:1" 53 | } 54 | } 55 | 56 | static class Step2_Reducer extends Reducer { 57 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 58 | Map m = new HashMap(); //用于存放每种商品的行为评分之和 59 | for (Text value : values) { 60 | String[] strs = value.toString().split(":"); 61 | String item = strs[0]; //商品id 62 | Integer score = Integer.parseInt(strs[1]); //行为评分 63 | score += ((Integer) (m.get(item) == null ? 
0 : m.get(item))).intValue(); //计算用户对每件商品的行为评分和(如果Map集合中已有该商品评分,则累加) 64 | m.put(item, score); //向HashMap中存入商品及评分之和 65 | } 66 | 67 | StringBuffer sb = new StringBuffer(); 68 | for (Entry entry : m.entrySet()) { 69 | sb.append(entry.getKey() + ":" + entry.getValue().intValue() + ","); //将商品和评分串联,格式: i1:1,i2:1,...I:N, 70 | } 71 | context.write(key, new Text(sb.toString().substring(0, sb.toString().length() - 1))); //去掉最后的逗号 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step3.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.LongWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | //计算用户同显矩阵 18 | public class Step3 { 19 | private final static Text K = new Text(); 20 | private final static IntWritable one = new IntWritable(1); 21 | 22 | public static boolean run(Configuration config, Map paths) throws IOException, ClassNotFoundException, InterruptedException { 23 | String jobName = "step3"; 24 | Job job = Job.getInstance(config, jobName); 25 | job.setJarByClass(Step3.class); 26 | job.setJar("export\\ItemCF.jar"); 27 | job.setMapperClass(Step3_Mapper.class); 28 | job.setReducerClass(Step3_Reducer.class); 29 | job.setCombinerClass(Step3_Reducer.class); 30 | job.setMapOutputKeyClass(Text.class); 31 | job.setMapOutputValueClass(IntWritable.class); 32 | 33 | Path inPath = new Path(paths.get("Step3Input")); 34 | Path outpath = new Path(paths.get("Step3Output")); 35 | FileInputFormat.addInputPath(job, inPath); 36 | FileOutputFormat.setOutputPath(job, outpath); 37 | FileSystem fs = FileSystem.get(config); 38 | if (fs.exists(outpath)) { 39 | fs.delete(outpath, true); 40 | } 41 | 42 | return job.waitForCompletion(true); 43 | } 44 | 45 | static class Step3_Mapper extends Mapper { 46 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 47 | // u2727 i468:2,i446:3 48 | String[] items = value.toString().split("\t")[1].split(","); //每件商品和评分列表,格式:i468:2 i446:3 49 | for (int i = 0; i < items.length; i++) { 50 | String itemA = items[i].split(":")[0]; // itemA = i468 .. i446 51 | for (int j = 0; j < items.length; j++) { 52 | String itemB = items[j].split(":")[0]; // itemB = i468 .. 
i446 53 | K.set(itemA + ":" + itemB); // i468:i468 , i468:i446, i446:i468, i446:i446 54 | context.write(K, one); 55 | } 56 | } 57 | } 58 | } 59 | 60 | static class Step3_Reducer extends Reducer { 61 | protected void reduce(Text key, Iterable values, Context context) 62 | throws IOException, InterruptedException { 63 | int sum = 0; 64 | for (IntWritable val : values) { 65 | sum += val.get(); 66 | } 67 | context.write(key, new IntWritable(sum)); 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step4.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | 20 | //同显矩阵*评分矩阵,计算评分单项 21 | public class Step4 { 22 | public static boolean run(Configuration config, Map paths) 23 | throws IOException, ClassNotFoundException, InterruptedException { 24 | String jobName = "step4"; 25 | Job job = Job.getInstance(config, jobName); 26 | job.setJarByClass(Step4.class); 27 | job.setJar("export\\ItemCF.jar"); 28 | job.setMapperClass(Step4_Mapper.class); 29 | job.setReducerClass(Step4_Reducer.class); 30 | job.setMapOutputKeyClass(Text.class); 31 | job.setMapOutputValueClass(Text.class); 32 | 33 | Path[] inPaths = new Path[] { 34 | new Path(paths.get("Step4Input1")), 35 | new Path(paths.get("Step4Input2")) }; 36 | Path outpath = new Path(paths.get("Step4Output")); 37 | FileInputFormat.setInputPaths(job, inPaths); 38 | FileOutputFormat.setOutputPath(job, outpath); 39 | FileSystem fs = FileSystem.get(config); 40 | if (fs.exists(outpath)) { 41 | fs.delete(outpath, true); 42 | } 43 | 44 | return job.waitForCompletion(true); 45 | } 46 | 47 | static class Step4_Mapper extends Mapper { 48 | private String flag; //保存Map输入数据来自于哪个目录(output2或ouput3),用于判断数据是同现矩阵还是评分矩阵 49 | 50 | protected void setup(Context context) throws IOException, InterruptedException { 51 | FileSplit split = (FileSplit) context.getInputSplit(); //根据上下文获取输入分片对象 52 | flag = split.getPath().getParent().getName(); //获取输入分片所属的目录名称 53 | } 54 | 55 | protected void map(LongWritable key, Text value, Context context) 56 | throws IOException, InterruptedException { 57 | String[] strs = Pattern.compile("[\t,]").split(value.toString()); 58 | if (flag.equals("output3")) { //输入的是同现矩阵,strs格式:"i100:i105 1" 59 | String[] items = strs[0].split(":"); 60 | String itemID1 = items[0]; //第一个商品id "i100" 61 | String itemID2 = items[1]; //第二个商品id "i105" 62 | String num = strs[1]; //两件商品的同现次数 "1" 63 | 64 | Text k = new Text(itemID1); 65 | Text v = new Text("A:" + itemID2 + "," + num); //格式:"A:i105,1" 66 | context.write(k, v); //格式:"i100 A:i105,1" 67 | 68 | } else if (flag.equals("output2")) { //输入的是评分矩阵,strs格式:"u14 i100:1 i25:1" 69 | String userID = strs[0]; 70 | for (int i = 1; i < strs.length; 
i++) { 71 | String[] vector = strs[i].split(":"); //i100:1 72 | String itemID = vector[0]; 73 | String score = vector[1]; 74 | Text k = new Text(itemID); 75 | Text v = new Text("B:" + userID + "," + score); //格式:"B:u14,1" 76 | context.write(k, v); //格式:"i100 B:u14,1" 和 "i25 B:u14,1" 77 | } 78 | } 79 | } 80 | } 81 | 82 | static class Step4_Reducer extends Reducer { 83 | protected void reduce(Text key, Iterable values, Context context) 84 | throws IOException, InterruptedException { 85 | Map mapA = new HashMap(); 86 | Map mapB = new HashMap(); 87 | //reduce输入格式:"i100 A:i105,1 A:i107,2 B:u14,1 B:u22,3" 88 | for (Text val : values) { //将AB格式的输入分别放入HashMap中 89 | String str = val.toString(); 90 | if (str.startsWith("A:")) { //str格式:"A:i105,1" 91 | String[] kv = Pattern.compile("[\t,]").split(str.substring(2)); 92 | mapA.put(kv[0], Integer.parseInt(kv[1])); 93 | } else if (str.startsWith("B:")) { //str格式:"B:u14,1" 94 | String[] kv = Pattern.compile("[\t,]").split(str.substring(2)); 95 | mapB.put(kv[0], Integer.parseInt(kv[1])); 96 | } 97 | } 98 | double result = 0; 99 | Iterator itera = mapA.keySet().iterator(); //根据mapA中key键(itemID)生成迭代器对象 100 | while (itera.hasNext()) { 101 | String mapka = itera.next(); //获得itemID 102 | int num = mapA.get(mapka).intValue(); //根据itemID从mapA获取同现次数 103 | 104 | Iterator iterb = mapB.keySet().iterator(); //根据mapB中key键生成迭代器对象 105 | while (iterb.hasNext()) { 106 | String mapkb = iterb.next(); //userID 107 | int score = mapB.get(mapkb).intValue(); //根据userID从mapB中获取用户行为评分 108 | 109 | result = num * score; //矩阵相乘,计算评分 110 | context.write(new Text(mapkb), new Text(mapka + "," + result)); //输出 key:"userID" value:"itemID,result" 111 | } 112 | } 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step5.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | 19 | //计算总和评分 20 | public class Step5 { 21 | public static boolean run(Configuration config, Map paths) 22 | throws IOException, ClassNotFoundException, InterruptedException { 23 | String jobName = "step5"; 24 | Job job = Job.getInstance(config, jobName); 25 | job.setJarByClass(Step5.class); 26 | job.setJar("export\\ItemCF.jar"); 27 | job.setMapperClass(Step5_Mapper.class); 28 | job.setReducerClass(Step5_Reducer.class); 29 | job.setMapOutputKeyClass(Text.class); 30 | job.setMapOutputValueClass(Text.class); 31 | 32 | Path inPath = new Path(paths.get("Step5Input")); 33 | Path outpath = new Path(paths.get("Step5Output")); 34 | FileInputFormat.addInputPath(job, inPath); 35 | FileOutputFormat.setOutputPath(job, outpath); 36 | FileSystem fs = FileSystem.get(config); 37 | if (fs.exists(outpath)) { 38 | fs.delete(outpath, true); 39 | } 40 | 41 | return job.waitForCompletion(true); 42 | } 43 | 44 | 
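	// Step5_Mapper re-keys Step4's output ("userID\titemID,score") by userID, and
	// Step5_Reducer then sums the partial scores per itemID in a HashMap, producing one
	// accumulated predicted score per (user, item) pair for Step6 to rank.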
static class Step5_Mapper extends Mapper { 45 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 46 | //输入格式:"u2732 i405,2.0" 47 | String[] strs = Pattern.compile("[\t,]").split(value.toString()); 48 | Text k = new Text(strs[0]); //key: userID 49 | Text v = new Text(strs[1] + "," + strs[2]); //value: "itemID,评分" 50 | context.write(k, v); 51 | } 52 | } 53 | 54 | static class Step5_Reducer extends Reducer { 55 | protected void reduce(Text key, Iterable values, Context context) 56 | throws IOException, InterruptedException { 57 | Map map = new HashMap(); //用于对商品评分累加 58 | for (Text val : values) { //val格式: "itemID,评分" 59 | String[] strs = val.toString().split(","); 60 | String itemID = strs[0]; 61 | Double score = Double.parseDouble(strs[1]); 62 | 63 | if (map.containsKey(itemID)) { //如果Map中已记录该商品,取出评分累加后重新写入Map 64 | map.put(itemID, map.get(itemID) + score); 65 | } else { 66 | map.put(itemID, score); 67 | } 68 | } 69 | 70 | //遍历Map,完成输出 71 | Iterator iter = map.keySet().iterator(); //根据itemID创建迭代器对象 72 | while (iter.hasNext()) { 73 | String itemID = iter.next(); //取出itemID 74 | double score = map.get(itemID); //根据itemID从map中取出score 75 | context.write(key, new Text(itemID + "," + score)); //格式:"userid itemID,score" 76 | } 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/itemcf/Step6.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.itemcf; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.regex.Pattern; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.LongWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.io.WritableComparable; 14 | import org.apache.hadoop.io.WritableComparator; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | //评分排序取Top10 22 | public class Step6 { 23 | private final static Text K = new Text(); 24 | private final static Text V = new Text(); 25 | 26 | public static boolean run(Configuration config, Map paths) 27 | throws IOException, ClassNotFoundException, InterruptedException { 28 | String jobName = "step6"; 29 | Job job = Job.getInstance(config, jobName); 30 | job.setJarByClass(Step6.class); 31 | job.setJar("export\\ItemCF.jar"); 32 | job.setMapperClass(Step6_Mapper.class); 33 | job.setReducerClass(Step6_Reducer.class); 34 | job.setMapOutputKeyClass(PairWritable.class); 35 | job.setMapOutputValueClass(Text.class); 36 | //job.setSortComparatorClass(ScoreSort.class); //自定义排序 37 | job.setGroupingComparatorClass(UserGroup.class); //自定义分组 38 | 39 | Path inPath = new Path(paths.get("Step6Input")); 40 | Path outpath = new Path(paths.get("Step6Output")); 41 | FileInputFormat.addInputPath(job, inPath); 42 | FileOutputFormat.setOutputPath(job, outpath); 43 | FileSystem fs = FileSystem.get(config); 44 | if (fs.exists(outpath)) { 45 | fs.delete(outpath, true); 46 | } 47 | 48 | return job.waitForCompletion(true); 49 | } 50 | 51 | static class Step6_Mapper extends Mapper { 52 | protected void 
map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 53 | String[] strs = Pattern.compile("[\t,]").split(value.toString()); //输入格式:"u13 i524,3.0" 54 | String user = strs[0]; 55 | String item = strs[1]; 56 | String score = strs[2]; 57 | 58 | PairWritable k = new PairWritable(); //将uid和score封装到PairWritable对象中,作为MapKey输出 59 | k.setUid(user); 60 | k.setScore(Double.parseDouble(score)); 61 | 62 | V.set(item + ":" + score); //将item和score组合,作为MapValue输出 63 | context.write(k, V); //输出格式:key:"u13 3.0" value:"i524:3.0" 64 | } 65 | } 66 | 67 | static class Step6_Reducer extends Reducer { 68 | protected void reduce(PairWritable key, Iterable values, Context context) 69 | throws IOException, InterruptedException { 70 | int i = 0; 71 | StringBuffer sb = new StringBuffer(); 72 | for (Text v : values) { 73 | if (i == 10) 74 | break; 75 | sb.append(v.toString() + ","); //将评分数前10项串联 76 | i++; 77 | } 78 | K.set(key.getUid()); //获取自定义key中的uid 79 | V.set(sb.toString().substring(0,sb.toString().length()-1)); //去掉最后的逗号 80 | context.write(K, V); 81 | } 82 | } 83 | 84 | static class PairWritable implements WritableComparable { 85 | private String uid; 86 | private double score; 87 | 88 | public String getUid() { 89 | return uid; 90 | } 91 | 92 | public void setUid(String uid) { 93 | this.uid = uid; 94 | } 95 | 96 | public double getScore() { 97 | return score; 98 | } 99 | 100 | public void setScore(double score) { 101 | this.score = score; 102 | } 103 | 104 | @Override 105 | public void write(DataOutput out) throws IOException { 106 | out.writeUTF(uid); 107 | out.writeDouble(score); 108 | } 109 | 110 | @Override 111 | public void readFields(DataInput in) throws IOException { 112 | this.uid = in.readUTF(); 113 | this.score = in.readDouble(); 114 | } 115 | 116 | @Override 117 | public int compareTo(PairWritable o) { 118 | int r = this.uid.compareTo(o.getUid()); //按uid升序排列 119 | if (r == 0) { 120 | return -Double.compare(this.score, o.getScore()); //uid相同,则按score降序排列 121 | } 122 | return r; 123 | } 124 | } 125 | 126 | //自定义排序:先按uid升序,再按score降序 127 | /*static class ScoreSort extends WritableComparator { 128 | public ScoreSort() { 129 | super(PairWritable.class, true); 130 | } 131 | 132 | @SuppressWarnings("rawtypes") 133 | public int compare(WritableComparable a, WritableComparable b) { 134 | PairWritable o1 = (PairWritable) a; 135 | PairWritable o2 = (PairWritable) b; 136 | int r = o1.getUid().compareTo(o2.getUid()); //按uid升序排列 137 | if (r == 0) { 138 | return -Double.compare(o1.getScore(), o2.getScore()); //按num降序排列 139 | } 140 | return r; 141 | } 142 | }*/ 143 | 144 | //自定义分组,Map输出key(PairWritable)中uid相同的记录设为同组 145 | static class UserGroup extends WritableComparator { 146 | public UserGroup() { 147 | super(PairWritable.class, true); 148 | } 149 | 150 | @SuppressWarnings("rawtypes") 151 | public int compare(WritableComparable a, WritableComparable b) { 152 | PairWritable o1 = (PairWritable) a; 153 | PairWritable o2 = (PairWritable) b; 154 | return o1.getUid().compareTo(o2.getUid()); 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MRDPUtils.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | 6 | public class MRDPUtils { 7 | public static Map transformXmlToMap(String xml) { 8 | Map map = new HashMap(); 9 | try { 10 | 
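			// Presumably each input line is a single self-closing XML element in the style of
			// the StackOverflow comment dumps, e.g. <row Id="1" Text="..." /> (an assumption
			// about resources/comments.xml): substring(5, length-3) strips the "<row " prefix
			// and the " />" suffix, the split on '"' then alternates attribute names and
			// values, and the trailing '=' is trimmed from each name before it is stored.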
String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\""); 11 | for (int i = 0; i < tokens.length - 1; i += 2) { 12 | String key = tokens[i].trim(); 13 | String val = tokens[i + 1]; 14 | map.put(key.substring(0, key.length() - 1), val); 15 | } 16 | } catch (StringIndexOutOfBoundsException e) { 17 | System.err.println(xml); 18 | } 19 | return map; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevJob.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class MedianStdDevJob { 12 | public static void main(String[] args) throws Exception { 13 | //1.设置HDFS配置信息 14 | String namenode_ip = "192.168.17.10"; 15 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 16 | Configuration conf = new Configuration(); 17 | conf.set("fs.defaultFS", hdfs); 18 | conf.set("mapreduce.app-submission.cross-platform", "true"); 19 | 20 | //2.设置MapReduce作业配置信息 21 | String jobName = "MedianStdDevJob"; //作业名称 22 | Job job = Job.getInstance(conf, jobName); 23 | job.setJarByClass(MedianStdDevJob.class); //指定运行时作业类 24 | job.setJar("export\\MedianStdDevJob.jar"); //指定本地jar包 25 | job.setMapperClass(MedianStdDevMapper.class); //指定Mapper类 26 | job.setMapOutputKeyClass(IntWritable.class); //设置Mapper输出Key类型 27 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 28 | job.setReducerClass(MedianStdDevReducer.class); //指定Reducer类 29 | job.setOutputKeyClass(IntWritable.class); //设置Reduce输出Key类型 30 | job.setOutputValueClass(MedianStdDevTuple.class); //设置Reduce输出Value类型 31 | 32 | //3.设置作业输入和输出路径 33 | String dataDir = "/expr/medianstddev/data"; //实验数据目录 34 | String outputDir = "/expr/medianstddev/output"; //实验输出目录 35 | Path inPath = new Path(hdfs + dataDir); 36 | Path outPath = new Path(hdfs + outputDir); 37 | FileInputFormat.addInputPath(job, inPath); 38 | FileOutputFormat.setOutputPath(job, outPath); 39 | FileSystem fs = FileSystem.get(conf); 40 | if(fs.exists(outPath)) { 41 | fs.delete(outPath, true); 42 | } 43 | 44 | //4.运行作业 45 | System.out.println("Job: " + jobName + " is running..."); 46 | if(job.waitForCompletion(true)) { 47 | System.out.println("success!"); 48 | System.exit(0); 49 | } else { 50 | System.out.println("failed!"); 51 | System.exit(1); 52 | } 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevMapper.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Calendar; 7 | import java.util.Date; 8 | import java.util.Map; 9 | import org.apache.hadoop.io.IntWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | 13 | public class MedianStdDevMapper extends Mapper { 14 | private IntWritable outHour= new IntWritable(); 15 | private IntWritable 
outCommentLength= new IntWritable(); 16 | private final static SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); 17 | 18 | @SuppressWarnings("deprecation") 19 | @Override 20 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 21 | Map map = MRDPUtils.transformXmlToMap(value.toString()); 22 | String strDate = map.get("CreationDate"); //获取评论日期 23 | String text = map.get("Text"); //获取评论内容 24 | if (strDate == null || text == null) { 25 | return; 26 | } 27 | try { 28 | Date creationDate = frmt.parse(strDate); //转换日期格式 29 | outHour.set(creationDate.getHours()); //从日期中获取小时值 30 | outCommentLength.set(text.length()); //设置评论内容的长度 31 | context.write(outHour, outCommentLength); //将小时和评论长度作为Map输出 32 | } catch (ParseException e) { 33 | System.err.println(e.getMessage()); 34 | return; 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevReducer.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class MedianStdDevReducer extends Reducer { 10 | private MedianStdDevTuple result = new MedianStdDevTuple(); //记录评论长度中位数和标准差 11 | private ArrayList commentLengths = new ArrayList(); //用列表记录每条评论的长度 12 | 13 | @Override 14 | public void reduce(IntWritable key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | float sum = 0; //评论长度总和 17 | float count = 0; //评论数 18 | commentLengths.clear(); //清空评论数列表 19 | result.setStddev(0); //标准差默认值设为0 20 | for (IntWritable val : values) { 21 | commentLengths.add((float) val.get()); //将评论长度保存到列表 22 | sum += val.get(); //计算评论长度总和 23 | count++; //评论总数 24 | } 25 | 26 | //计算中位数:集合数量如为偶数,取中间两位的均值;如为奇数,则直接取中值 27 | Collections.sort(commentLengths); //对集合中评论字数排序 28 | if (count % 2 == 0) {//偶 29 | result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths.get((int) count / 2)) / 2.0f); 30 | } else {//奇 31 | result.setMedian(commentLengths.get((int) count / 2)); 32 | } 33 | 34 | //计算标准差 35 | float mean = sum / count; //计算评论的平均字数 36 | float sumOfSquares = 0.0f; //平方和 37 | for (Float f : commentLengths) { 38 | sumOfSquares += (f - mean) * (f - mean); 39 | } 40 | result.setStddev((float) Math.sqrt(sumOfSquares / (count - 1))); //计算标准差 41 | context.write(key, result); 42 | } 43 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/medianstddev/MedianStdDevTuple.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.medianstddev; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | public class MedianStdDevTuple implements Writable { 9 | private float median = 0f; 10 | private float stddev = 0f; 11 | 12 | public float getMedian() { 13 | return median; 14 | } 15 | 16 | public void setMedian(float median) { 17 | this.median = median; 18 | } 19 | 20 | public float getStddev() { 21 | return stddev; 22 | } 23 | 24 | public void setStddev(float stddev) { 25 | this.stddev = stddev; 26 | } 27 | 28 | @Override 29 | public void readFields(DataInput 
in) throws IOException { 30 | median = in.readFloat(); 31 | stddev = in.readFloat(); 32 | } 33 | 34 | @Override 35 | public void write(DataOutput out) throws IOException { 36 | out.writeFloat(median); 37 | out.writeFloat(stddev); 38 | } 39 | 40 | @Override 41 | public String toString() { 42 | return median + "\t" + stddev; 43 | } 44 | 45 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MRDPUtils.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | 6 | public class MRDPUtils { 7 | public static Map transformXmlToMap(String xml) { 8 | Map map = new HashMap(); 9 | try { 10 | String[] tokens = xml.trim().substring(5, xml.trim().length() - 3).split("\""); 11 | for (int i = 0; i < tokens.length - 1; i += 2) { 12 | String key = tokens[i].trim(); 13 | String val = tokens[i + 1]; 14 | map.put(key.substring(0, key.length() - 1), val); 15 | } 16 | } catch (StringIndexOutOfBoundsException e) { 17 | System.err.println(xml); 18 | } 19 | return map; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountJob.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class MinMaxCountJob { 12 | public static void main(String[] args) throws Exception { 13 | //1.设置HDFS配置信息 14 | String namenode_ip = "192.168.17.10"; 15 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 16 | Configuration conf = new Configuration(); 17 | conf.set("fs.defaultFS", hdfs); 18 | conf.set("mapreduce.app-submission.cross-platform", "true"); 19 | 20 | //2.设置MapReduce作业配置信息 21 | String jobName = "MinMaxCountJob"; //作业名称 22 | Job job = Job.getInstance(conf, jobName); 23 | job.setJarByClass(MinMaxCountJob.class); //指定运行时作业类 24 | job.setJar("export\\MinMaxCountJob.jar"); //指定本地jar包 25 | job.setMapperClass(MinMaxCountMapper.class); //指定Mapper类 26 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 27 | job.setMapOutputValueClass(MinMaxCountTuple.class); //设置Mapper输出Value类型 28 | job.setCombinerClass(MinMaxCountReducer.class); //指定Combiner类 29 | job.setReducerClass(MinMaxCountReducer.class); //指定Reducer类 30 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 31 | job.setOutputValueClass(MinMaxCountTuple.class); //设置Reduce输出Value类型 32 | 33 | //3.设置作业输入和输出路径 34 | String dataDir = "/expr/minmaxcount/data"; //实验数据目录 35 | String outputDir = "/expr/minmaxcount/output"; //实验输出目录 36 | Path inPath = new Path(hdfs + dataDir); 37 | Path outPath = new Path(hdfs + outputDir); 38 | FileInputFormat.addInputPath(job, inPath); 39 | FileOutputFormat.setOutputPath(job, outPath); 40 | FileSystem fs = FileSystem.get(conf); 41 | if(fs.exists(outPath)) { 42 | fs.delete(outPath, true); 43 | } 44 | 45 | //4.运行作业 46 | System.out.println("Job: " + jobName + " is running..."); 47 | if(job.waitForCompletion(true)) { 48 | System.out.println("success!"); 49 | System.exit(0); 50 | } else { 
51 | System.out.println("failed!"); 52 | System.exit(1); 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountMapper.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Date; 7 | import java.util.Map; 8 | 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | 12 | public class MinMaxCountMapper extends Mapper { 13 | private Text outUserId = new Text(); //用户ID 14 | private MinMaxCountTuple outTuple = new MinMaxCountTuple(); //日期最小值、日期最大值、评论数的组合 15 | private final SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); 16 | 17 | @Override 18 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 19 | Map map = MRDPUtils.transformXmlToMap(value.toString()); //分解每条评论,保存每对KV到Map对象 20 | String userId = map.get("UserId"); //从Map对象中获取用户ID 21 | String strDate = map.get("CreationDate"); //从Map对象中获取评论时间 22 | 23 | if (strDate == null || userId == null) { //过滤掉不含统计数据的记录 24 | return; 25 | } 26 | try { 27 | Date creationDate = frmt.parse(strDate); 28 | // 因为还没有MinMax,只有把当前数据中日期作为MinMax 29 | outTuple.setMin(creationDate); 30 | outTuple.setMax(creationDate); 31 | outTuple.setCount(1); 32 | outUserId.set(userId); 33 | context.write(outUserId, outTuple); 34 | } catch (ParseException e) { 35 | System.err.println(e.getMessage()); 36 | return; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountReducer.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class MinMaxCountReducer extends Reducer { 8 | private MinMaxCountTuple result = new MinMaxCountTuple(); 9 | 10 | @Override 11 | public void reduce(Text key, Iterable values, Context context) 12 | throws IOException, InterruptedException { 13 | result.setMin(null); 14 | result.setMax(null); 15 | int sum = 0; 16 | for (MinMaxCountTuple val : values) { 17 | if (result.getMin() == null || val.getMin().compareTo(result.getMin()) < 0) { 18 | result.setMin(val.getMin()); 19 | } 20 | if (result.getMax() == null || val.getMax().compareTo(result.getMax()) > 0) { 21 | result.setMax(val.getMax()); 22 | } 23 | sum += val.getCount(); 24 | } 25 | result.setCount(sum); 26 | context.write(key, result); 27 | } 28 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/minmaxcount/MinMaxCountTuple.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.minmaxcount; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.SimpleDateFormat; 7 | import java.util.Date; 8 | 9 | import org.apache.hadoop.io.Writable; 10 | 11 | public class MinMaxCountTuple implements Writable { 12 | 13 | private Date min = new Date(); //第一次评论时间 14 | private Date max = new Date(); //最后一次评论时间 15 | private long count = 0; //评论总数 16 | private 
final SimpleDateFormat frmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS"); 17 | 18 | public Date getMin() { 19 | return min; 20 | } 21 | 22 | public void setMin(Date min) { 23 | this.min = min; 24 | } 25 | 26 | public Date getMax() { 27 | return max; 28 | } 29 | 30 | public void setMax(Date max) { 31 | this.max = max; 32 | } 33 | 34 | public long getCount() { 35 | return count; 36 | } 37 | 38 | public void setCount(long count) { 39 | this.count = count; 40 | } 41 | 42 | @Override 43 | public void readFields(DataInput in) throws IOException { 44 | min = new Date(in.readLong()); 45 | max = new Date(in.readLong()); 46 | count = in.readLong(); 47 | } 48 | 49 | @Override 50 | public void write(DataOutput out) throws IOException { 51 | out.writeLong(min.getTime()); 52 | out.writeLong(max.getTime()); 53 | out.writeLong(count); 54 | } 55 | 56 | @Override 57 | public String toString() { 58 | return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count; 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/output/CompressOutput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.compress.GzipCodec; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.Mapper; 12 | import org.apache.hadoop.mapreduce.Reducer; 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 15 | 16 | public class CompressOutput { 17 | 18 | public static class CompressOutputMapper extends Mapper { 19 | private final static IntWritable one = new IntWritable(1); 20 | 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | String[] strs = value.toString().split(" "); //按空格分割输入 24 | Text date = new Text(strs[0]); //获取日期 25 | context.write(date, one); //将日期和常数1作为Map输出 26 | } 27 | } 28 | 29 | public static class CompressOutputReducer extends Reducer { 30 | public void reduce(Text key, Iterable values, Context context) 31 | throws IOException, InterruptedException { 32 | int sum = 0; 33 | for (IntWritable val : values) { 34 | sum += val.get(); 35 | } 36 | context.write(key, new IntWritable(sum)); 37 | } 38 | } 39 | 40 | public static void main(String[] args) throws Exception { 41 | //1.设置HDFS配置信息 42 | String namenode_ip = "192.168.17.10"; 43 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 44 | Configuration conf = new Configuration(); 45 | conf.set("fs.defaultFS", hdfs); 46 | conf.set("mapreduce.app-submission.cross-platform", "true"); 47 | 48 | //2.设置MapReduce作业配置信息 49 | String jobName = "CompressOutput"; //作业名称 50 | Job job = Job.getInstance(conf, jobName); 51 | job.setJarByClass(CompressOutput.class); //指定运行时作业类 52 | job.setJar("export\\CompressOutput.jar"); //指定本地jar包 53 | job.setMapperClass(CompressOutputMapper.class); //指定Mapper类 54 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 55 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 56 | job.setReducerClass(CompressOutputReducer.class); //指定Reducer类 57 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 58 | 
job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 59 | 60 | //设置对输出结果进行压缩,指定压缩编码方式 61 | FileOutputFormat.setCompressOutput(job, true); 62 | FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); 63 | 64 | //3.设置作业输入和输出路径 65 | String dataDir = "/expr/compress/data"; //实验数据目录 66 | String outputDir = "/expr/compress/output"; //实验输出目录 67 | Path inPath = new Path(hdfs + dataDir); 68 | Path outPath = new Path(hdfs + outputDir); 69 | FileInputFormat.addInputPath(job, inPath); 70 | FileOutputFormat.setOutputPath(job, outPath); 71 | FileSystem fs = FileSystem.get(conf); 72 | if(fs.exists(outPath)) { 73 | fs.delete(outPath, true); 74 | } 75 | 76 | //4.运行作业 77 | System.out.println("Job: " + jobName + " is running..."); 78 | if(job.waitForCompletion(true)) { 79 | System.out.println("success!"); 80 | System.exit(0); 81 | } else { 82 | System.out.println("failed!"); 83 | System.exit(1); 84 | } 85 | } 86 | 87 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/output/MultOutput.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.output; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 16 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 18 | 19 | public class MultOutput { 20 | 21 | public static class MultOutputMapper extends Mapper { 22 | private final static IntWritable one = new IntWritable(1); 23 | 24 | public void map(Object key, Text value, Context context ) 25 | throws IOException, InterruptedException { 26 | String[] strs = value.toString().split(" "); //按空格分割输入 27 | Text date = new Text(strs[0]); //获取日期 28 | context.write(date, one); //将日期和常数1作为Map输出 29 | } 30 | } 31 | 32 | public static class MultOutputReducer extends Reducer { 33 | //定义MultiOutputs对象 34 | private MultipleOutputs mos; 35 | 36 | //初始化MultiOutputs对象 37 | protected void setup(Context context) throws IOException, InterruptedException { 38 | mos = new MultipleOutputs(context); 39 | } 40 | 41 | //关闭MultiOutputs对象 42 | protected void cleanup(Context context) throws IOException, InterruptedException { 43 | mos.close(); 44 | } 45 | 46 | public void reduce(Text key, Iterable values, Context context) 47 | throws IOException, InterruptedException { 48 | int sum = 0; 49 | for (IntWritable val : values) { 50 | sum += val.get(); 51 | } 52 | //context.write(key, new IntWritable(sum)); 53 | 54 | //使用MultiOutputs对象替代Context对象输出 55 | //1. 输出到不同文件(格式、文件名) 56 | if (key.toString().startsWith("2015")) 57 | mos.write("f2015", key, new IntWritable(sum)); 58 | else if (key.toString().startsWith("2016")) 59 | mos.write("f2016", key, new IntWritable(sum)); 60 | else 61 | mos.write("f2017", key, new IntWritable(sum)); 62 | 63 | //2. 
输出到以年分类的子目录,只需指定输出子目录+文件名,不需要在驱动类中定义文件名 64 | //mos.write(key, new IntWritable(sum), key.toString().substring(0,4)+"/result"); 65 | 66 | } 67 | } 68 | 69 | public static void main(String[] args) throws Exception { 70 | //1.设置HDFS配置信息 71 | String namenode_ip = "192.168.17.10"; 72 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 73 | Configuration conf = new Configuration(); 74 | conf.set("fs.defaultFS", hdfs); 75 | conf.set("mapreduce.app-submission.cross-platform", "true"); 76 | 77 | //2.设置MapReduce作业配置信息 78 | String jobName = "MultOutput"; //作业名称 79 | Job job = Job.getInstance(conf, jobName); 80 | job.setJarByClass(MultOutput.class); //指定运行时作业类 81 | job.setJar("export\\MultOutput.jar"); //指定本地jar包 82 | job.setMapperClass(MultOutputMapper.class); //指定Mapper类 83 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 84 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 85 | job.setReducerClass(MultOutputReducer.class); //指定Reducer类 86 | //job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 87 | //job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 88 | 89 | //定义多文件输出的文件名、输出格式、键类型、值类型 90 | MultipleOutputs.addNamedOutput(job, "f2015", TextOutputFormat.class, Text.class, IntWritable.class); 91 | MultipleOutputs.addNamedOutput(job, "f2016", SequenceFileOutputFormat.class, Text.class, IntWritable.class); 92 | MultipleOutputs.addNamedOutput(job, "f2017", MapFileOutputFormat.class, Text.class, IntWritable.class); 93 | 94 | //3.设置作业输入和输出路径 95 | String dataDir = "/expr/multoutput/data"; //实验数据目录 96 | String outputDir = "/expr/multoutput/output"; //实验输出目录 97 | Path inPath = new Path(hdfs + dataDir); 98 | Path outPath = new Path(hdfs + outputDir); 99 | FileInputFormat.addInputPath(job, inPath); 100 | FileOutputFormat.setOutputPath(job, outPath); 101 | FileSystem fs = FileSystem.get(conf); 102 | if(fs.exists(outPath)) { 103 | fs.delete(outPath, true); 104 | } 105 | 106 | //4.运行作业 107 | System.out.println("Job: " + jobName + " is running..."); 108 | if(job.waitForCompletion(true)) { 109 | System.out.println("success!"); 110 | System.exit(0); 111 | } else { 112 | System.out.println("failed!"); 113 | System.exit(1); 114 | } 115 | } 116 | 117 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/peoplerank/People.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.peoplerank; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | import org.apache.commons.lang.StringUtils; 6 | 7 | public class People { 8 | private double peopleRank = 1.0; //存储 PR值,初值默认1.0 9 | private String[] attentionPeoples; //关注的人 10 | public static final char fieldSeparator = '\t'; //多处使用分隔符\t,定义为常量 11 | 12 | public double getPeopleRank() { 13 | return peopleRank; 14 | } 15 | 16 | public People setPeopleRank(double pageRank) { 17 | this.peopleRank = pageRank; 18 | return this; 19 | } 20 | 21 | public String[] getAttentionPeoples() { 22 | return attentionPeoples; 23 | } 24 | 25 | public People setAttentionPeoples(String[] attentionPeoples) { 26 | this.attentionPeoples = attentionPeoples; 27 | return this; 28 | } 29 | 30 | //判断是否包含关注用户 31 | public boolean containsAttentionPeoples() { 32 | return attentionPeoples != null && attentionPeoples.length > 0; 33 | } 34 | 35 | @Override 36 | //People对象转成字符串 37 | public String toString() { 38 | StringBuilder sb = new StringBuilder(); 39 | sb.append(peopleRank); 40 | if (attentionPeoples != null) { 41 | 
sb.append(fieldSeparator).append(StringUtils.join(attentionPeoples, fieldSeparator)); 42 | } 43 | return sb.toString(); //返回String格式:"PeopleRand值 u1 u2..." 44 | } 45 | 46 | //字符串转成People对象 47 | public static People fromMR(String str) throws IOException { //参数String格式:"PeopleRand值 u1 u2..." 48 | People people = new People(); 49 | String[] strs = StringUtils.splitPreserveAllTokens(str, fieldSeparator); //将字符串按分隔符分割成字符串数组 50 | people.setPeopleRank(Double.valueOf(strs[0])); //处理第一个元素 51 | if (strs.length > 1) {// 设置关注的人,从strs下标为1的位置开始(因为传进来类似"1.0 b c d"的字符串) 52 | people.setAttentionPeoples(Arrays.copyOfRange(strs, 1, strs.length)); //处理其它元素 53 | } 54 | return people; //返回People对象 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/peoplerank/PeopleRank.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.peoplerank; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class PeopleRank { 16 | 17 | public static class PeopleRankMapper extends Mapper { 18 | @Override 19 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | String[] strs = value.toString().split(","); 21 | context.write(new Text(strs[0]), new Text(strs[1])); 22 | } 23 | } 24 | 25 | public static class PeopleRankReducer extends Reducer { 26 | @Override 27 | protected void reduce(Text key, Iterable values, Context context) 28 | throws IOException, InterruptedException { 29 | StringBuilder sb = new StringBuilder(); 30 | for (Text v : values) { 31 | sb.append("\t" + v.toString()); 32 | } 33 | context.write(key, new Text(sb.toString().replaceFirst("\t", ""))); //将开头的制表符去掉 34 | } 35 | } 36 | 37 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 38 | String namenode_ip = "192.168.17.10"; 39 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 40 | Configuration conf = new Configuration(); 41 | conf.set("fs.defaultFS", hdfs); 42 | conf.set("mapreduce.app-submission.cross-platform", "true"); 43 | 44 | String jobName = "PeopleRank"; 45 | Job job = Job.getInstance(conf, jobName); 46 | job.setJarByClass(PeopleRank.class); 47 | job.setJar("export\\PeopleRank.jar"); 48 | job.setMapperClass(PeopleRankMapper.class); 49 | job.setMapOutputKeyClass(Text.class); 50 | job.setMapOutputValueClass(Text.class); 51 | job.setReducerClass(PeopleRankReducer.class); 52 | job.setOutputKeyClass(Text.class); 53 | job.setOutputValueClass(Text.class); 54 | 55 | String dataDir = "/expr/peoplerank/data"; 56 | String outputDir = "/expr/peoplerank/output/adjacent"; 57 | Path inPath = new Path(hdfs + dataDir); 58 | Path outPath = new Path(hdfs + outputDir); 59 | FileInputFormat.addInputPath(job, inPath); 60 | FileOutputFormat.setOutputPath(job, outPath); 61 | FileSystem fs = FileSystem.get(conf); 62 | if(fs.exists(outPath)) { 63 | fs.delete(outPath, true); 64 | } 65 | 66 | System.out.println( "Job: " + jobName + " is 
running..."); 67 | if(job.waitForCompletion(true)) { 68 | System.out.println("success!"); 69 | System.exit(0); 70 | } else { 71 | System.out.println("failed!"); 72 | System.exit(1); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/topten/TopTenJob.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.topten; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | public class TopTenJob { 13 | public static void main(String[] args) throws Exception { 14 | String namenode_ip = "192.168.17.10"; 15 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 16 | Configuration conf = new Configuration(); 17 | conf.set("fs.defaultFS", hdfs); 18 | conf.set("mapreduce.app-submission.cross-platform", "true"); 19 | 20 | String jobName = "TopTenJob"; 21 | Job job = Job.getInstance(conf, jobName); 22 | job.setJarByClass(TopTenJob.class); 23 | job.setJar("export\\TopTen.jar"); 24 | job.setMapperClass(TopTenMapper.class); 25 | job.setMapOutputKeyClass(NullWritable.class); 26 | job.setMapOutputValueClass(Text.class); 27 | job.setReducerClass(TopTenReducer.class); 28 | job.setOutputKeyClass(NullWritable.class); 29 | job.setOutputValueClass(Text.class); 30 | job.setNumReduceTasks(1); //计算最终TopN,只能运行一个Reduce任务 31 | 32 | String dataDir = "/expr/topten/data"; 33 | String outputDir = "/expr/topten/output"; 34 | Path inPath = new Path(hdfs + dataDir); 35 | Path outPath = new Path(hdfs + outputDir); 36 | FileInputFormat.addInputPath(job, inPath); 37 | FileOutputFormat.setOutputPath(job, outPath); 38 | FileSystem fs = FileSystem.get(conf); 39 | if(fs.exists(outPath)) { 40 | fs.delete(outPath, true); 41 | } 42 | 43 | System.out.println( "Job: " + jobName + " is running..."); 44 | if(job.waitForCompletion(true)) { 45 | System.out.println("success!"); 46 | System.exit(0); 47 | } else { 48 | System.out.println("failed!"); 49 | System.exit(1); 50 | } 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/topten/TopTenMapper.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.topten; 2 | 3 | import java.io.IOException; 4 | import java.util.TreeMap; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class TopTenMapper extends Mapper { 10 | private TreeMap visittimesMap = new TreeMap(); //TreeMap是有序KV集合 11 | 12 | @Override 13 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 14 | if (value == null) { 15 | return; 16 | } 17 | String[] strs = value.toString().split(" "); 18 | String tId = strs[0]; 19 | String tVisittimes = strs[1]; 20 | if (tId == null || tVisittimes == null) { 21 | return; 22 | } 23 | visittimesMap.put(Integer.parseInt(tVisittimes), new Text(value)); //将访问次数(KEY)和行记录(VALUE)放入TreeMap中自动排序 24 | if (visittimesMap.size() > 10) { //如果TreeMap中元素超过N个,将第一个(KEY最小的)元素删除 25 | 
visittimesMap.remove(visittimesMap.firstKey()); 26 | } 27 | } 28 | 29 | @Override 30 | protected void cleanup(Context context) throws IOException, InterruptedException { 31 | for (Text t : visittimesMap.values()) { 32 | context.write(NullWritable.get(), t); //在clean()中完成Map输出 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/ssdut/training/mapreduce/topten/TopTenReducer.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.topten; 2 | 3 | import java.io.IOException; 4 | import java.util.NavigableMap; 5 | import java.util.TreeMap; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Reducer; 9 | 10 | public class TopTenReducer extends Reducer { 11 | private TreeMap visittimesMap = new TreeMap(); 12 | 13 | @Override 14 | public void reduce(NullWritable key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | for (Text val : values) { 17 | String[] strs = val.toString().split(" "); 18 | visittimesMap.put(Integer.parseInt(strs[1]), new Text(val)); //将访问次数(KEY)和行记录(VALUE)放入TreeMap中自动排序 19 | if (visittimesMap.size() > 10) { //如果TreeMap中元素超过N个,将第一个(KEY最小的)元素删除 20 | visittimesMap.remove(visittimesMap.firstKey()); 21 | } 22 | } 23 | } 24 | 25 | public void cleanup(Context context) throws IOException, InterruptedException { 26 | //将TreeMap反序处理,降序输出top10 27 | NavigableMap reverMap = visittimesMap.descendingMap(); //获得TreeMap反序 28 | for (Text t : reverMap.values()) { 29 | context.write(NullWritable.get(), t); 30 | } 31 | } 32 | } 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/main/java/weblog/FlowCount.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Locale; 7 | import java.util.regex.Pattern; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.LongWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | //1. 
计算网站全天产生的流量 22 | public class FlowCount { 23 | 24 | public static class FlowCountMapper extends Mapper<Object, Text, Text, IntWritable> { 25 | private SimpleDateFormat SDFIN = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); 26 | private SimpleDateFormat SDFOUT = new SimpleDateFormat("yyyy-MM-dd"); 27 | private Text date = new Text(); //Map输出key 28 | private IntWritable flow = new IntWritable(); //Map输出value 29 | 30 | public void map(Object key, Text value, Context context ) 31 | throws IOException, InterruptedException { 32 | String[] strs = value.toString().split(" "); 33 | String strFlow = strs[strs.length-1]; //获取流量字符串 34 | String strTime = strs[3].substring(1); //获取时间字符串 35 | String strDate = null; //定义日期字符串 36 | try { 37 | strDate = SDFOUT.format(SDFIN.parse(strTime)); //时间格式转成日期格式 38 | } catch (ParseException e) { 39 | e.printStackTrace(); 40 | } 41 | 42 | //利用正则表达式判断strFlow是否是数字 43 | if ( Pattern.compile("[0-9]+").matcher(strFlow).matches() ) { 44 | flow.set(Integer.parseInt(strFlow)); 45 | date.set(strDate); 46 | context.write(date, flow); 47 | } 48 | } 49 | } 50 | 51 | public static class FlowCountReducer extends Reducer<Text, IntWritable, Text, LongWritable> { 52 | public void reduce(Text key, Iterable<IntWritable> values, Context context) 53 | throws IOException, InterruptedException { 54 | long sum = 0; 55 | for (IntWritable val : values) { 56 | sum += val.get(); 57 | } 58 | context.write(key, new LongWritable(sum)); 59 | /* 60 | for (IntWritable val : values) { 61 | context.write(key, val); 62 | } 63 | */ 64 | } 65 | } 66 | 67 | public static void main(String[] args) throws Exception { 68 | //1.设置HDFS配置信息 69 | String namenode_ip = "192.168.17.10"; 70 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 71 | Configuration conf = new Configuration(); 72 | conf.set("fs.defaultFS", hdfs); 73 | conf.set("mapreduce.app-submission.cross-platform", "true"); 74 | 75 | //2.设置MapReduce作业配置信息 76 | String jobName = "FlowCount"; //作业名称 77 | Job job = Job.getInstance(conf, jobName); 78 | job.setJarByClass(FlowCount.class); //指定运行时作业类 79 | job.setJar("export\\FlowCount.jar"); //指定本地jar包 80 | job.setMapperClass(FlowCountMapper.class); //指定Mapper类 81 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 82 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 83 | job.setReducerClass(FlowCountReducer.class); //指定Reducer类 84 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 85 | job.setOutputValueClass(LongWritable.class); //设置Reduce输出Value类型 86 | 87 | //3.设置作业输入和输出路径 88 | String dataDir = "/expr/weblog/data"; //实验数据目录 89 | String outputDir = "/expr/weblog/output1"; //实验输出目录 90 | Path inPath = new Path(hdfs + dataDir); 91 | Path outPath = new Path(hdfs + outputDir); 92 | FileInputFormat.addInputPath(job, inPath); 93 | FileOutputFormat.setOutputPath(job, outPath); 94 | FileSystem fs = FileSystem.get(conf); 95 | if(fs.exists(outPath)) { 96 | fs.delete(outPath, true); 97 | } 98 | 99 | //4.运行作业 100 | System.out.println("Job: " + jobName + " is running..."); 101 | if(job.waitForCompletion(true)) { 102 | System.out.println("success!"); 103 | System.exit(0); 104 | } else { 105 | System.out.println("failed!"); 106 | System.exit(1); 107 | } 108 | } 109 | 110 | } -------------------------------------------------------------------------------- /src/main/java/weblog/IPCount.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import
java.text.SimpleDateFormat; 8 | import java.util.Locale; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.FileSystem; 12 | import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.io.WritableComparable; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.Reducer; 19 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 21 | 22 | //4. 计算每天访问该网站的独立IP数 23 | public class IPCount { 24 | 25 | public enum IpCounter { 26 | ipnum1, ipnum2 27 | } 28 | 29 | public static class IPCountMapper extends Mapper { 30 | private SimpleDateFormat SDFIN = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); 31 | private SimpleDateFormat SDFOUT = new SimpleDateFormat("yyyy-MM-dd"); 32 | private DayAndIp k = new DayAndIp(); //Map输出Key:日期+IP 33 | private final static IntWritable one = new IntWritable(1); //Map输出Value 34 | 35 | public void map(Object key, Text value, Context context ) 36 | throws IOException, InterruptedException { 37 | String[] strs = value.toString().split(" "); 38 | String strIP = strs[0]; //获取IP字符串 39 | String strTime = strs[3].substring(1); //获取时间字符串 40 | String strDate = null; //定义日期字符串 41 | try { 42 | strDate = SDFOUT.format(SDFIN.parse(strTime)); //时间格式转成日期格式 43 | } catch (ParseException e) { 44 | e.printStackTrace(); 45 | } 46 | k.setDate(strDate); 47 | k.setIp(strIP); 48 | context.write(k, one); 49 | } 50 | } 51 | 52 | public static class IPCountReducer extends Reducer { 53 | public void reduce(DayAndIp key, Iterable values, Context context) 54 | throws IOException, InterruptedException { 55 | int sum = 0; 56 | for (IntWritable val : values) { 57 | sum += val.get(); 58 | } 59 | context.write(key, new IntWritable(sum)); 60 | String[] strs = key.toString().split("\t"); 61 | if ( strs[0].equals("2013-05-30") ) { 62 | context.getCounter(IpCounter.ipnum1).increment(1); //使用计数器统计某天访问的IP数 63 | } else { 64 | context.getCounter(IpCounter.ipnum2).increment(1); 65 | } 66 | } 67 | } 68 | 69 | public static void main(String[] args) throws Exception { 70 | //1.设置HDFS配置信息 71 | String namenode_ip = "192.168.17.10"; 72 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 73 | Configuration conf = new Configuration(); 74 | conf.set("fs.defaultFS", hdfs); 75 | conf.set("mapreduce.app-submission.cross-platform", "true"); 76 | 77 | //2.设置MapReduce作业配置信息 78 | String jobName = "IPCount"; //作业名称 79 | Job job = Job.getInstance(conf, jobName); 80 | job.setJarByClass(IPCount.class); //指定运行时作业类 81 | job.setJar("export\\IPCount.jar"); //指定本地jar包 82 | job.setMapperClass(IPCountMapper.class); //指定Mapper类 83 | job.setMapOutputKeyClass(DayAndIp.class); //设置Mapper输出Key类型 84 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 85 | job.setReducerClass(IPCountReducer.class); //指定Reducer类 86 | job.setOutputKeyClass(DayAndIp.class); //设置Reduce输出Key类型 87 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 88 | 89 | //3.设置作业输入和输出路径 90 | String dataDir = "/expr/weblog/data"; //实验数据目录 91 | String outputDir = "/expr/weblog/output4"; //实验输出目录 92 | Path inPath = new Path(hdfs + dataDir); 93 | Path outPath = new Path(hdfs + outputDir); 94 | FileInputFormat.addInputPath(job, inPath); 95 | FileOutputFormat.setOutputPath(job, outPath); 96 | FileSystem fs = FileSystem.get(conf); 97 | if(fs.exists(outPath)) { 98 | 
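// FileOutputFormat fails a job whose output directory already exists, so any previous result is removed first (every driver class in this repository follows the same pattern).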
fs.delete(outPath, true); 99 | } 100 | 101 | //4.运行作业 102 | System.out.println("Job: " + jobName + " is running..."); 103 | if(job.waitForCompletion(true)) { 104 | System.out.println("success!"); 105 | System.exit(0); 106 | } else { 107 | System.out.println("failed!"); 108 | System.exit(1); 109 | } 110 | } 111 | 112 | //自定义KEY类,封装日期和IP 113 | public static class DayAndIp implements WritableComparable { 114 | private String date; 115 | private String ip; 116 | 117 | public String getDate() { 118 | return date; 119 | } 120 | public void setDate(String date) { 121 | this.date = date; 122 | } 123 | public String getIp() { 124 | return ip; 125 | } 126 | public void setIp(String ip) { 127 | this.ip = ip; 128 | } 129 | 130 | @Override 131 | public void write(DataOutput out) throws IOException { 132 | out.writeUTF(date); 133 | out.writeUTF(ip); 134 | } 135 | 136 | @Override 137 | public void readFields(DataInput in) throws IOException { 138 | date = in.readUTF(); 139 | ip = in.readUTF(); 140 | } 141 | 142 | @Override 143 | public int compareTo(DayAndIp o) { 144 | int r = date.compareTo(o.getDate()); 145 | if ( r == 0 ) { 146 | return ip.compareTo(o.getIp()); 147 | } 148 | return r; 149 | } 150 | 151 | @Override 152 | public String toString() { 153 | return date + "\t" + ip; 154 | } 155 | } 156 | } -------------------------------------------------------------------------------- /src/main/java/weblog/Missed.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 15 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 16 | 17 | //2. 
将所有状态为404的记录输出到文件:missed 18 | public class Missed { 19 | 20 | public static class MissedMapper extends Mapper { 21 | private Text k = new Text(); //Map输出key 22 | 23 | public void map(Object key, Text value, Context context ) 24 | throws IOException, InterruptedException { 25 | String[] strs = value.toString().split(" "); 26 | String status = strs[strs.length-2]; //获取状态码 27 | if (status.equals("404")) { 28 | //context.write(value, NullWritable.get()); 29 | String reqResource = strs[6]; //获取被请求的资源 30 | int index = reqResource.indexOf("?"); 31 | if ( index > 0 ) { 32 | reqResource = reqResource.substring(0, index); //截取问号前的请求资源名称(去掉请求参数) 33 | } 34 | k.set(reqResource); 35 | context.write(k, NullWritable.get()); 36 | } 37 | } 38 | } 39 | 40 | public static class MissedReducer extends Reducer { 41 | //定义MultiOutputs对象 42 | private MultipleOutputs mos; 43 | 44 | //初始化MultiOutputs对象 45 | protected void setup(Context context) throws IOException, InterruptedException { 46 | mos = new MultipleOutputs(context); 47 | } 48 | 49 | //关闭MultiOutputs对象 50 | protected void cleanup(Context context) throws IOException, InterruptedException { 51 | mos.close(); 52 | } 53 | 54 | public void reduce(Text key, Iterable values, Context context) 55 | throws IOException, InterruptedException { 56 | mos.write("missed", key, NullWritable.get()); 57 | } 58 | } 59 | 60 | public static void main(String[] args) throws Exception { 61 | //1.设置HDFS配置信息 62 | String namenode_ip = "192.168.17.10"; 63 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 64 | Configuration conf = new Configuration(); 65 | conf.set("fs.defaultFS", hdfs); 66 | conf.set("mapreduce.app-submission.cross-platform", "true"); 67 | 68 | //2.设置MapReduce作业配置信息 69 | String jobName = "Missed"; //作业名称 70 | Job job = Job.getInstance(conf, jobName); 71 | job.setJarByClass(Missed.class); //指定运行时作业类 72 | job.setJar("export\\Missed.jar"); //指定本地jar包 73 | job.setMapperClass(MissedMapper.class); //指定Mapper类 74 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 75 | job.setMapOutputValueClass(NullWritable.class); //设置Mapper输出Value类型 76 | job.setReducerClass(MissedReducer.class); //指定Reducer类 77 | //定义多文件输出的文件名、输出格式、键类型、值类型 78 | MultipleOutputs.addNamedOutput(job, "missed", TextOutputFormat.class, Text.class, NullWritable.class); 79 | 80 | //3.设置作业输入和输出路径 81 | String dataDir = "/expr/weblog/data"; //实验数据目录 82 | String outputDir = "/expr/weblog/output2"; //实验输出目录 83 | Path inPath = new Path(hdfs + dataDir); 84 | Path outPath = new Path(hdfs + outputDir); 85 | FileInputFormat.addInputPath(job, inPath); 86 | FileOutputFormat.setOutputPath(job, outPath); 87 | FileSystem fs = FileSystem.get(conf); 88 | if(fs.exists(outPath)) { 89 | fs.delete(outPath, true); 90 | } 91 | 92 | //4.运行作业 93 | System.out.println("Job: " + jobName + " is running..."); 94 | if(job.waitForCompletion(true)) { 95 | System.out.println("success!"); 96 | System.exit(0); 97 | } else { 98 | System.out.println("failed!"); 99 | System.exit(1); 100 | } 101 | } 102 | 103 | } -------------------------------------------------------------------------------- /src/main/java/weblog/PVMinMax.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.HashMap; 7 | import java.util.Locale; 8 | import java.util.Map; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.FileSystem; 12 
| import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.IntWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | 21 | //5.1 统计网站每分钟的访问量 22 | // 访问量是每一条记录 23 | public class PVMinMax { 24 | 25 | public static class PVMinMaxMapper extends Mapper { 26 | private SimpleDateFormat SDFIN = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH); 27 | private SimpleDateFormat SDFOUT = new SimpleDateFormat("yyyy-MM-dd HH:mm"); 28 | private Text minute = new Text(); //Map输出key 29 | private final static IntWritable one = new IntWritable(1); 30 | 31 | public void map(Object key, Text value, Context context ) 32 | throws IOException, InterruptedException { 33 | String[] strs = value.toString().split(" "); 34 | String strTime = strs[3].substring(1); //获取时间字符串 35 | String strMinute = null; 36 | try { 37 | strMinute = SDFOUT.format(SDFIN.parse(strTime)); //时间格式转成日期格式 38 | } catch (ParseException e) { 39 | e.printStackTrace(); 40 | } 41 | minute.set(strMinute); 42 | context.write(minute, one); 43 | } 44 | } 45 | 46 | public static class PVMinMaxReducer extends Reducer { 47 | Map map = new HashMap(); 48 | public void reduce(Text key, Iterable values, Context context) 49 | throws IOException, InterruptedException { 50 | int sum = 0; 51 | for (IntWritable val : values) { 52 | sum += val.get(); 53 | } 54 | context.write(key, new IntWritable(sum)); 55 | } 56 | } 57 | 58 | public static void main(String[] args) throws Exception { 59 | //1.设置HDFS配置信息 60 | String namenode_ip = "192.168.17.10"; 61 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 62 | Configuration conf = new Configuration(); 63 | conf.set("fs.defaultFS", hdfs); 64 | conf.set("mapreduce.app-submission.cross-platform", "true"); 65 | 66 | //2.设置MapReduce作业配置信息 67 | String jobName = "PVMinMax"; //作业名称 68 | Job job = Job.getInstance(conf, jobName); 69 | job.setJarByClass(PVMinMax.class); //指定运行时作业类 70 | job.setJar("export\\PVMinMax.jar"); //指定本地jar包 71 | job.setMapperClass(PVMinMaxMapper.class); //指定Mapper类 72 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 73 | job.setMapOutputValueClass(IntWritable.class); //设置Mapper输出Value类型 74 | job.setReducerClass(PVMinMaxReducer.class); //指定Reducer类 75 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 76 | job.setOutputValueClass(IntWritable.class); //设置Reduce输出Value类型 77 | 78 | //3.设置作业输入和输出路径 79 | String dataDir = "/expr/weblog/data"; //实验数据目录 80 | String outputDir = "/expr/weblog/output5_1"; //实验输出目录 81 | Path inPath = new Path(hdfs + dataDir); 82 | Path outPath = new Path(hdfs + outputDir); 83 | FileInputFormat.addInputPath(job, inPath); 84 | FileOutputFormat.setOutputPath(job, outPath); 85 | FileSystem fs = FileSystem.get(conf); 86 | if(fs.exists(outPath)) { 87 | fs.delete(outPath, true); 88 | } 89 | 90 | //4.运行作业 91 | System.out.println("Job: " + jobName + " is running..."); 92 | if(job.waitForCompletion(true)) { 93 | System.out.println("success!"); 94 | System.exit(0); 95 | } else { 96 | System.out.println("failed!"); 97 | System.exit(1); 98 | } 99 | } 100 | 101 | } -------------------------------------------------------------------------------- /src/main/java/weblog/PVMinMax2.java: -------------------------------------------------------------------------------- 1 | package 
ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | 17 | //5.2 计算网站每分钟访问量的峰值(最大、最小值) 18 | public class PVMinMax2 { 19 | 20 | public static class PVMinMax2Mapper extends Mapper<Object, Text, Text, Text> { 21 | public void map(Object key, Text value, Context context ) 22 | throws IOException, InterruptedException { 23 | // 传入数据类似2014-12-12 18:06 1234,前面通过空格分开,后面是制表符分隔 24 | String[] strs = value.toString().split(" "); 25 | // key是2014-12-12这样的时间 26 | context.write(new Text(strs[0]), new Text(strs[1])); 27 | } 28 | } 29 | 30 | public static class PVMinMax2Reducer extends Reducer<Text, Text, Text, Text> { 31 | // Map map = new HashMap(); 32 | int maxVisit = 0; //默认最大值设为0 33 | int minVisit = Integer.MAX_VALUE; //默认最小值设为最大整数 34 | String maxMinute = null;// 最大访问量的所在时间 35 | String minMinute = null; 36 | public void reduce(Text key, Iterable<Text> values, Context context) 37 | throws IOException, InterruptedException { /* reset per key (date) so results from one date do not leak into the next */ maxVisit = 0; minVisit = Integer.MAX_VALUE; maxMinute = null; minMinute = null; 38 | for (Text val : values) { 39 | String[] strs = val.toString().split("\t"); 40 | String minute = strs[0]; //minute:访问时间,如:17:38 41 | int visit = Integer.parseInt(strs[1]); //visit:访问次数,如:813 42 | if (visit > maxVisit) { 43 | maxVisit = visit; 44 | maxMinute = minute; 45 | } 46 | if (visit < minVisit) { 47 | minVisit = visit; 48 | minMinute = minute; 49 | } 50 | } 51 | 52 | String strMaxTime = key.toString() + " " + maxMinute; //将日期和分钟合并 53 | String strMinTime = key.toString() + " " + minMinute; 54 | context.write(new Text(strMaxTime), new Text(String.valueOf(maxVisit))); 55 | context.write(new Text(strMinTime), new Text(String.valueOf(minVisit))); 56 | 57 | /* 58 | *或者这样写 59 | String value = maxMinute + " " + maxVisit + "\t" + minMinute + " " + minVisit; 60 | context.write(key, new Text(value)); 61 | */ 62 | } 63 | } 64 | 65 | public static void main(String[] args) throws Exception { 66 | //1.设置HDFS配置信息 67 | String namenode_ip = "192.168.17.10"; 68 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 69 | Configuration conf = new Configuration(); 70 | conf.set("fs.defaultFS", hdfs); 71 | conf.set("mapreduce.app-submission.cross-platform", "true"); 72 | 73 | //2.设置MapReduce作业配置信息 74 | String jobName = "PVMinMax2"; //作业名称 75 | Job job = Job.getInstance(conf, jobName); 76 | job.setJarByClass(PVMinMax2.class); //指定运行时作业类 77 | job.setJar("export\\PVMinMax2.jar"); //指定本地jar包 78 | job.setMapperClass(PVMinMax2Mapper.class); //指定Mapper类 79 | job.setMapOutputKeyClass(Text.class); //设置Mapper输出Key类型 80 | job.setMapOutputValueClass(Text.class); //设置Mapper输出Value类型 81 | job.setReducerClass(PVMinMax2Reducer.class); //指定Reducer类 82 | job.setOutputKeyClass(Text.class); //设置Reduce输出Key类型 83 | job.setOutputValueClass(Text.class); //设置Reduce输出Value类型 84 | 85 | //3.设置作业输入和输出路径 86 | String dataDir = "/expr/weblog/output5_1"; //实验数据目录 87 | String outputDir = "/expr/weblog/output5_2"; //实验输出目录 88 | Path inPath = new Path(hdfs + dataDir); 89 | Path outPath = new Path(hdfs + outputDir); 90 | FileInputFormat.addInputPath(job, inPath); 91 | FileOutputFormat.setOutputPath(job, outPath); 92 | FileSystem fs = FileSystem.get(conf); 93 |
if(fs.exists(outPath)) { 94 | fs.delete(outPath, true); 95 | } 96 | 97 | //4.运行作业 98 | System.out.println("Job: " + jobName + " is running..."); 99 | if(job.waitForCompletion(true)) { 100 | System.out.println("success!"); 101 | System.exit(0); 102 | } else { 103 | System.out.println("failed!"); 104 | System.exit(1); 105 | } 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /src/main/java/weblog/PVTopTen.java: -------------------------------------------------------------------------------- 1 | package ssdut.training.mapreduce.weblog; 2 | 3 | import java.io.IOException; 4 | import java.util.Map.Entry; 5 | import java.util.NavigableMap; 6 | import java.util.TreeMap; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.NullWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 19 | 20 | //3. 找到访问量最高的10个页面(按访问量降序输出) 21 | public class PVTopTen { 22 | public static class PVTopTenMapper extends Mapper { 23 | private Text k = new Text(); 24 | private final static IntWritable one = new IntWritable(1); 25 | 26 | public void map(Object key, Text value, Context context) throws IOException, InterruptedException { 27 | String[] strs = value.toString().split(" "); 28 | String reqResource = strs[6]; //获取请求资源字符串 29 | int index = reqResource.indexOf("?"); 30 | if ( index > 0 ) { 31 | reqResource = reqResource.substring(0, index); //截取问号前的请求资源名称(去掉请求参数) 32 | } 33 | if ( reqResource.endsWith(".html") || reqResource.contains(".php") ) { 34 | k.set(reqResource); 35 | context.write(k, one); 36 | } 37 | } 38 | } 39 | 40 | public static class PVTopTenReducer extends Reducer { 41 | public TreeMap map = new TreeMap(); 42 | 43 | public void reduce(Text key, Iterable values, Context context) 44 | throws IOException, InterruptedException { 45 | int sum = 0; 46 | for (IntWritable val : values) { 47 | sum += val.get(); //计算被请求页面的访问量 48 | } 49 | String str = String.valueOf(sum) + "\t" + key.toString() ; 50 | map.put(sum, new Text(str)); //将页面访问量和被请求页面名称放入TreeMap中,TreeMap按KEY键(访问量)自动排序 51 | if (map.size() > 10) { //如果TreeMap中元素超过N个,则将第一个(KEY最小的)元素删除 52 | map.remove(map.firstKey()); 53 | } 54 | } 55 | 56 | public void cleanup(Context context) throws IOException, InterruptedException { 57 | //将TreeMap反序处理(降序),遍历输出top10 58 | NavigableMap reverseMap = map.descendingMap(); 59 | for ( Entry entry : reverseMap.entrySet() ) { 60 | context.write(entry.getValue(), NullWritable.get()); 61 | } 62 | } 63 | } 64 | 65 | public static void main(String[] args) throws Exception { 66 | String namenode_ip = "192.168.17.10"; 67 | String hdfs = "hdfs://" + namenode_ip + ":9000"; 68 | Configuration conf = new Configuration(); 69 | conf.set("fs.defaultFS", hdfs); 70 | conf.set("mapreduce.app-submission.cross-platform", "true"); 71 | 72 | String jobName = "PVTopTenJob"; 73 | Job job = Job.getInstance(conf, jobName); 74 | job.setJarByClass(PVTopTen.class); 75 | job.setJar("export\\PVTopTen.jar"); 76 | job.setMapperClass(PVTopTenMapper.class); 77 | job.setMapOutputKeyClass(Text.class); 78 | job.setMapOutputValueClass(IntWritable.class); 79 | 
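// The map output types (Text/IntWritable) differ from the final reduce output (Text/NullWritable), so both pairs of classes are declared explicitly.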
job.setReducerClass(PVTopTenReducer.class); 80 | job.setOutputKeyClass(Text.class); 81 | job.setOutputValueClass(NullWritable.class); 82 | job.setNumReduceTasks(1); //计算最终TopN,只能运行一个Reduce任务 83 | 84 | String dataDir = "/expr/weblog/data"; 85 | String outputDir = "/expr/weblog/output3"; 86 | Path inPath = new Path(hdfs + dataDir); 87 | Path outPath = new Path(hdfs + outputDir); 88 | FileInputFormat.addInputPath(job, inPath); 89 | FileOutputFormat.setOutputPath(job, outPath); 90 | FileSystem fs = FileSystem.get(conf); 91 | if(fs.exists(outPath)) { 92 | fs.delete(outPath, true); 93 | } 94 | 95 | System.out.println( "Job: " + jobName + " is running..."); 96 | if(job.waitForCompletion(true)) { 97 | System.out.println("success!"); 98 | System.exit(0); 99 | } else { 100 | System.out.println("failed!"); 101 | System.exit(1); 102 | } 103 | } 104 | 105 | } --------------------------------------------------------------------------------
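The two weblog jobs PVMinMax and PVMinMax2 above are chained only implicitly: PVMinMax2 reads /expr/weblog/output5_1, which PVMinMax writes. A minimal sketch of declaring that dependency explicitly with Hadoop's org.apache.hadoop.mapreduce.lib.jobcontrol package (the approach the repository already uses in mutualFriend/JobControlRun.java) might look as follows; the driver name PVMinMaxChain and the exact wiring are illustrative assumptions, not code from this repository.

package ssdut.training.mapreduce.weblog;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Illustrative sketch only: chains PVMinMax -> PVMinMax2 via JobControl.
public class PVMinMaxChain {
	public static void main(String[] args) throws Exception {
		String hdfs = "hdfs://192.168.17.10:9000";
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfs);
		conf.set("mapreduce.app-submission.cross-platform", "true");

		// Step 1: per-minute page views, reusing the classes from PVMinMax.
		// When submitting from an IDE as the drivers above do, job1.setJar(...) would also be needed.
		Job job1 = Job.getInstance(conf, "PVMinMax");
		job1.setJarByClass(PVMinMax.class);
		job1.setMapperClass(PVMinMax.PVMinMaxMapper.class);
		job1.setReducerClass(PVMinMax.PVMinMaxReducer.class);
		job1.setMapOutputKeyClass(Text.class);
		job1.setMapOutputValueClass(IntWritable.class);
		job1.setOutputKeyClass(Text.class);
		job1.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job1, new Path(hdfs + "/expr/weblog/data"));
		Path out1 = new Path(hdfs + "/expr/weblog/output5_1");
		FileOutputFormat.setOutputPath(job1, out1);

		// Step 2: daily peak/valley of those per-minute counts, reusing the classes from PVMinMax2.
		Job job2 = Job.getInstance(conf, "PVMinMax2");
		job2.setJarByClass(PVMinMax2.class);
		job2.setMapperClass(PVMinMax2.PVMinMax2Mapper.class);
		job2.setReducerClass(PVMinMax2.PVMinMax2Reducer.class);
		job2.setMapOutputKeyClass(Text.class);
		job2.setMapOutputValueClass(Text.class);
		job2.setOutputKeyClass(Text.class);
		job2.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job2, out1);
		Path out2 = new Path(hdfs + "/expr/weblog/output5_2");
		FileOutputFormat.setOutputPath(job2, out2);

		// Clear old output directories, as the individual drivers do.
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(out1)) fs.delete(out1, true);
		if (fs.exists(out2)) fs.delete(out2, true);

		// Declare the dependency: job2 is submitted only after job1 succeeds.
		ControlledJob cj1 = new ControlledJob(conf);
		cj1.setJob(job1);
		ControlledJob cj2 = new ControlledJob(conf);
		cj2.setJob(job2);
		cj2.addDependingJob(cj1);

		JobControl jc = new JobControl("PVMinMaxChain");
		jc.addJob(cj1);
		jc.addJob(cj2);
		new Thread(jc).start();     // JobControl implements Runnable
		while (!jc.allFinished()) {
			Thread.sleep(1000);
		}
		jc.stop();                  // failed jobs, if any, via jc.getFailedJobList()
	}
}

Compared with launching the two drivers by hand, the only difference is that the ordering is declared once and enforced by JobControl.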