├── Bayes
│   ├── CalcDocNumClass.java
│   ├── CalcWordNumClass.java
│   ├── Evaluation.java
│   ├── Main.java
│   ├── TestPrediction.java
│   ├── TestPreparation.java
│   ├── Utils.java
│   └── pom.xml
├── Hadoop报告.pdf
├── README.md
├── data
│   └── NBCorpus.rar
└── first.jmx

--------------------------------------------------------------------------------
/Bayes/CalcDocNumClass.java:
--------------------------------------------------------------------------------
package com.Bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;

public class CalcDocNumClass extends Configured implements Tool {
    /*
     * The first MapReduce job counts the number of documents in each class,
     * in preparation for computing the prior probabilities.
     */
    public static class CalcDocNumClassMap extends Mapper<Text, BytesWritable, Text, IntWritable> {
        // private Text newKey = new Text();
        private final static IntWritable one = new IntWritable(1);
        public void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
            context.write(key, one);
        }
    }

    public static class CalcDocNumClassReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem hdfs = FileSystem.get(conf);

        Path outputPath1 = new Path(Utils.DocNumClass);
        if (hdfs.exists(outputPath1))
            hdfs.delete(outputPath1, true);

        Job job1 = Job.getInstance(conf, "CalcDocNum");
        job1.setJarByClass(CalcDocNumClass.class);
        // set the input format: one whole file per record
        job1.setInputFormatClass(WholeFileInputFormat.class);
        job1.setMapperClass(CalcDocNumClassMap.class);
        job1.setCombinerClass(CalcDocNumClassReduce.class);
        job1.setReducerClass(CalcDocNumClassReduce.class);

        FileInputFormat.setInputDirRecursive(job1, true);
        job1.setOutputKeyClass(Text.class);          // key type of the reduce output
        job1.setOutputValueClass(IntWritable.class); // value type of the reduce output
        FileInputFormat.addInputPath(job1, new Path(Utils.TRAIN_DATA_PATH));
        FileOutputFormat.setOutputPath(job1, new Path(Utils.DocNumClass));
        return job1.waitForCompletion(true) ? 0 : 1;
    }

    public static class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {

        @Override
        protected boolean isSplitable(JobContext context, Path filename) {
            return false; // never split an input file
        }

        @Override
        public RecordReader<Text, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            WholeFileRecordReader reader = new WholeFileRecordReader();
            reader.initialize(inputSplit, taskAttemptContext);
            return reader;
        }
    }

    public static class WholeFileRecordReader extends RecordReader<Text, BytesWritable> {
        private FileSplit fileSplit;                       // the input split, converted into a single (key, value) record
        private Configuration conf;                        // configuration object
        private Text key = new Text();                     // key object, initially empty
        private BytesWritable value = new BytesWritable(); // value object, initially empty
        private boolean isRead = false;                    // whether the record has already been processed

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            this.fileSplit = (FileSplit) split;     // cast the input split to a FileSplit
            this.conf = context.getConfiguration(); // get the configuration from the context
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (!isRead) { // the record has not been processed yet
                // buffer for the file contents
                byte[] contents = new byte[(int) fileSplit.getLength()];
                FileSystem fs = null;
                FSDataInputStream fis = null;
                try {
                    // get the file system
                    Path path = fileSplit.getPath();
                    fs = path.getFileSystem(conf);
                    // open and read the whole file
                    fis = fs.open(path);
                    IOUtils.readFully(fis, contents, 0, contents.length);
                    // the value is the file contents
                    value.set(contents, 0, contents.length);
                    // the class name is the name of the file's parent directory
                    String classname = fileSplit.getPath().getParent().getName();
                    key.set(classname);
                } catch (Exception e) {
                    System.out.println(e);
                } finally {
                    IOUtils.closeStream(fis);
                }
                isRead = true; // mark as processed so the next call returns false
                return true;
            } else {
                return false; // already processed: the split is finished
            }
        }

        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        @Override
        public BytesWritable getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException {
            return isRead ? 1.0f : 0.0f;
        }

        @Override
        public void close() throws IOException {
        }
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new CalcDocNumClass(), args);
        System.exit(res);
    }
}
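
[Note] The output of this job, /output/output1/part-r-00000, holds one class<TAB>documentCount line per class. A minimal self-contained sketch of how such counts become priors, the way TestPrediction later uses them (the class names, counts, and the PriorSketch class itself are hypothetical, not part of the repository):

import java.util.HashMap;
import java.util.Map;

public class PriorSketch {
    public static void main(String[] args) {
        // hypothetical per-class document counts, as produced by CalcDocNumClass
        Map<String, Double> docCount = new HashMap<>();
        docCount.put("AUSTR", 120.0);
        docCount.put("CANA", 80.0);
        double total = docCount.values().stream().mapToDouble(Double::doubleValue).sum();
        // prior of a class c: P(c) = docCount(c) / totalDocs
        for (Map.Entry<String, Double> e : docCount.entrySet()) {
            System.out.println(e.getKey() + "\tP = " + (e.getValue() / total)); // AUSTR 0.6, CANA 0.4
        }
    }
}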
--------------------------------------------------------------------------------
/Bayes/CalcWordNumClass.java:
--------------------------------------------------------------------------------
package com.Bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;

public class CalcWordNumClass extends Configured implements Tool {
    /*
     * The second MapReduce job counts the occurrences of each word in each class.
     */
    public static class CalcWordNum_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text key_out = new Text();
        private IntWritable one = new IntWritable(1);
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            InputSplit inputSplit = context.getInputSplit();
            String className = ((FileSplit) inputSplit).getPath().getParent().getName();
            String line_context = value.toString(); // each input line is treated as a single token
            key_out.set(className + '\t' + line_context);
            context.write(key_out, one);
        }
    }

    public static class CalcWordNum_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int num = 0;
            for (IntWritable value : values) {
                num += value.get();
            }
            result.set(num);
            context.write(key, result);
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job2 = Job.getInstance(conf, "CalcWordNum");
        FileSystem fileSystem = FileSystem.get(conf);
        Path outputPath2 = new Path(Utils.WordNumClass);
        if (fileSystem.exists(outputPath2))
            fileSystem.delete(outputPath2, true);
        // set the jar
        job2.setJarByClass(CalcWordNumClass.class);
        // set the mapper and reducer classes
        job2.setMapperClass(CalcWordNum_Mapper.class);
        job2.setCombinerClass(CalcWordNum_Reducer.class);
        job2.setReducerClass(CalcWordNum_Reducer.class);
        // set the map and reduce output types
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(IntWritable.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(IntWritable.class);
        // set the input and output paths
        FileInputFormat.setInputDirRecursive(job2, true);
        FileInputFormat.addInputPath(job2, new Path(Utils.TRAIN_DATA_PATH));
        // FileInputFormat.setInputPaths(job2, new Path(Utils.TRAIN_DATA_PATH));
        FileOutputFormat.setOutputPath(job2, new Path(Utils.WordNumClass));
        boolean result = job2.waitForCompletion(true);
        return (result ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new CalcWordNumClass(), args);
        System.exit(res);
    }
}
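
[Note] Each line of /output/output2/part-r-00000 therefore has the form class<TAB>word<TAB>count. A hypothetical excerpt (values invented for illustration); TestPrediction reads these lines both to build the per-class smoothing denominators and to compute each word's conditional probability:

AUSTR	trade	42
AUSTR	wheat	17
CANA	trade	35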
--------------------------------------------------------------------------------
/Bayes/Evaluation.java:
--------------------------------------------------------------------------------
package com.Bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;

public class Evaluation extends Configured implements Tool {
    public static void GetEvaluation(Configuration conf) throws IOException {
        // read the file written by TestPrediction
        String classFilePath = Utils.Test_Prediction + "/part-r-00000";
        FileSystem fs = FileSystem.get(URI.create(classFilePath), conf);
        FSDataInputStream fsr = fs.open(new Path(classFilePath));
        ArrayList<String> ClassNames = new ArrayList<>();     // class names to evaluate
        ArrayList<Integer> TruePositive = new ArrayList<>();  // TP: documents of the class assigned to the class
        ArrayList<Integer> FalseNegative = new ArrayList<>(); // FN: documents of the class assigned elsewhere
        ArrayList<Integer> FalsePositive = new ArrayList<>(); // FP: documents of other classes assigned to the class
        ArrayList<Double> precision = new ArrayList<>();      // precision: P = TP / (TP + FP)
        ArrayList<Double> recall = new ArrayList<>();         // recall: R = TP / (TP + FN)
        ArrayList<Double> F1 = new ArrayList<>();             // harmonic mean of P and R: F1 = 2PR / (P + R)
        BufferedReader reader = null;
        Integer temp = 0; // scratch value used in the updates below
        try {
            reader = new BufferedReader(new InputStreamReader(fsr));
            String lineValue = null;
            while ((lineValue = reader.readLine()) != null) {
                // split on whitespace; the first three fields are: true class, file name, predicted class
                String[] values = lineValue.split("\\s");
                if (!ClassNames.contains(values[0])) {
                    ClassNames.add(values[0]);
                    TruePositive.add(0);
                    FalseNegative.add(0);
                    FalsePositive.add(0);
                }
                if (!ClassNames.contains(values[2])) {
                    ClassNames.add(values[2]);
                    TruePositive.add(0);
                    FalseNegative.add(0);
                    FalsePositive.add(0);
                }
                if (values[0].equals(values[2])) {
                    temp = TruePositive.get(ClassNames.indexOf(values[2])) + 1;
                    TruePositive.set(ClassNames.indexOf(values[2]), temp);
                } else {
                    temp = FalseNegative.get(ClassNames.indexOf(values[0])) + 1;
                    FalseNegative.set(ClassNames.indexOf(values[0]), temp);
                    temp = FalsePositive.get(ClassNames.indexOf(values[2])) + 1;
                    FalsePositive.set(ClassNames.indexOf(values[2]), temp);
                }
            }
            for (int i = 0; i < ClassNames.size(); i++) {
                int TP = TruePositive.get(i);
                int FP = FalsePositive.get(i);
                int FN = FalseNegative.get(i);
                double p = TP * 1.0 / (TP + FP);
                double r = TP * 1.0 / (TP + FN);
                double F = 2 * p * r / (p + r);
                precision.add(p);
                recall.add(r);
                F1.add(F);
            }
            /*
             * Compute the macro-averages and micro-averages.
             * Taking precision as an example:
             *   macro-averaged precision: (p1 + p2 + ... + pN) / N
             *   micro-averaged precision: pool the per-class TP/FP counts, then compute precision
             */
            double p_Sum_Ma = 0.0;
            double r_Sum_Ma = 0.0;
            double F1_Sum_Ma = 0.0;
            Integer TP_Sum_Mi = 0;
            Integer FN_Sum_Mi = 0;
            Integer FP_Sum_Mi = 0;
            int n = ClassNames.size(); // number of classes
            for (int i = 0; i < n; i++) {
                p_Sum_Ma += precision.get(i);
                r_Sum_Ma += recall.get(i);
                F1_Sum_Ma += F1.get(i);
                TP_Sum_Mi += TruePositive.get(i);
                FN_Sum_Mi += FalseNegative.get(i);
                FP_Sum_Mi += FalsePositive.get(i);
            }
            // macro-averages
            double p_Ma = p_Sum_Ma / n;
            double r_Ma = r_Sum_Ma / n;
            double F1_Ma = F1_Sum_Ma / n;
            // micro-averages
            double p_Mi = TP_Sum_Mi * 1.0 / (TP_Sum_Mi + FP_Sum_Mi);
            double r_Mi = TP_Sum_Mi * 1.0 / (TP_Sum_Mi + FN_Sum_Mi);
            double F1_Mi = 2 * p_Mi * r_Mi / (p_Mi + r_Mi);
            for (int i = 0; i < n; i++) {
                System.out.println(ClassNames.get(i) + "\tprecision: " + precision.get(i).toString());
                System.out.println(ClassNames.get(i) + "\trecall: " + recall.get(i).toString());
                System.out.println(ClassNames.get(i) + "\tF1: " + F1.get(i).toString());
            }
            System.out.println("Macroaveraged precision: " + p_Ma);
            System.out.println("Macroaveraged recall: " + r_Ma);
            System.out.println("Macroaveraged F1: " + F1_Ma);
            System.out.println("Microaveraged precision: " + p_Mi);
            System.out.println("Microaveraged recall: " + r_Mi);
            System.out.println("Microaveraged F1: " + F1_Mi);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        GetEvaluation(conf);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new Evaluation(), args);
        System.exit(res);
    }
}
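
[Note] A tiny worked example of how the two averages can differ (counts are hypothetical; AverageSketch is not part of the repository):

public class AverageSketch {
    public static void main(String[] args) {
        // class A: TP = 9, FP = 1; class B: TP = 1, FP = 4
        double pA = 9.0 / (9 + 1);                    // precision of A = 0.9
        double pB = 1.0 / (1 + 4);                    // precision of B = 0.2
        double macro = (pA + pB) / 2;                 // macro-average = 0.55: every class weighted equally
        double micro = (9.0 + 1.0) / (9 + 1 + 1 + 4); // micro-average ≈ 0.667: dominated by the larger class
        System.out.println("macro = " + macro + ", micro = " + micro);
    }
}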
--------------------------------------------------------------------------------
/Bayes/Main.java:
--------------------------------------------------------------------------------
package com.Bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class Main {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // count the documents in each class
        CalcDocNumClass computeDocNumInClass = new CalcDocNumClass();
        ToolRunner.run(configuration, computeDocNumInClass, args);
        // count the word occurrences in each class
        CalcWordNumClass calcWordNumClass = new CalcWordNumClass();
        ToolRunner.run(configuration, calcWordNumClass, args);
        // preprocess the test set
        TestPreparation testPreparation = new TestPreparation();
        ToolRunner.run(configuration, testPreparation, args);
        // predict the class of each test document
        TestPrediction testPrediction = new TestPrediction();
        ToolRunner.run(configuration, testPrediction, args);
        // evaluate the predictions: precision, recall, F1
        Evaluation evaluation = new Evaluation();
        ToolRunner.run(configuration, evaluation, args);
    }
}
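
[Note] One possible way to run the pipeline, sketched under the assumption of a standard Hadoop setup (exact commands vary by cluster): build with `mvn package` (the pom's artifactId and version yield test1-1.0-SNAPSHOT.jar), upload the training and test sets to the HDFS paths hard-coded in Utils.java (/liu_data/train and /liu_data/test), then run `hadoop jar test1-1.0-SNAPSHOT.jar com.Bayes.Main`. The jobs write to /output/output1 through /output/output4, and Evaluation prints the metrics to standard output.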
--------------------------------------------------------------------------------
/Bayes/TestPrediction.java:
--------------------------------------------------------------------------------
package com.Bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;

public class TestPrediction extends Configured implements Tool {
    private static HashMap<String, Double> priorProbability = new HashMap<>();       // prior probability of each class
    private static HashMap<String, Double> conditionalProbability = new HashMap<>(); // conditional probability of each word given a class

    // compute the prior probability of each class
    public static void Get_PriorProbability() throws IOException {
        Configuration conf = new Configuration();
        FSDataInputStream fsr = null;
        BufferedReader bufferedReader = null;
        String lineValue = null;
        HashMap<String, Double> temp = new HashMap<>(); // <class name, document count>
        double sum = 0; // total number of documents
        try {
            FileSystem fs = FileSystem.get(URI.create(Utils.DocNumClass + "/part-r-00000"), conf);
            fsr = fs.open(new Path(Utils.DocNumClass + "/part-r-00000"));
            bufferedReader = new BufferedReader(new InputStreamReader(fsr));
            while ((lineValue = bufferedReader.readLine()) != null) { // read line by line
                // tokenize on " \t\n\r\f" (space, tab, newline, carriage return, form feed)
                StringTokenizer tokenizer = new StringTokenizer(lineValue);
                String className = tokenizer.nextToken(); // class name
                String num_C_Tmp = tokenizer.nextToken(); // document count
                double numC = Double.parseDouble(num_C_Tmp);
                temp.put(className, numC);
                sum = sum + numC; // accumulate the total number of documents
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                bufferedReader.close(); // release the resource
            }
        }

        Iterator<Map.Entry<String, Double>> it = temp.entrySet().iterator();
        while (it.hasNext()) { // compute each prior
            Map.Entry<String, Double> val = it.next();
            String key = val.getKey();
            double value = val.getValue();
            value /= sum;
            priorProbability.put(key, value);
        }
    }
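
    // Note: Get_PriorProbability above and Get_ConditionProbability below are
    // invoked from the mapper's setup(), i.e. once per map task, and read the
    // earlier jobs' output directly from HDFS rather than through the
    // distributed cache.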
    // compute the conditional probability of each word given each class
    public static void Get_ConditionProbability() throws IOException {
        String filePath = Utils.WordNumClass + "/part-r-00000";
        Configuration conf = new Configuration();
        FSDataInputStream fsr = null;
        BufferedReader bufferedReader = null;
        String lineValue = null;
        HashMap<String, Double> wordSum = new HashMap<>(); // <class name, total word count>

        try {
            FileSystem fs = FileSystem.get(URI.create(filePath), conf);
            fsr = fs.open(new Path(filePath));
            bufferedReader = new BufferedReader(new InputStreamReader(fsr));
            while ((lineValue = bufferedReader.readLine()) != null) { // read line by line
                // tokenize on " \t\n\r\f" (space, tab, newline, carriage return, form feed)
                StringTokenizer tokenizer = new StringTokenizer(lineValue);
                String className = tokenizer.nextToken();
                String word = tokenizer.nextToken();
                String numWordTmp = tokenizer.nextToken();
                double numWord = Double.parseDouble(numWordTmp);
                if (wordSum.containsKey(className))
                    wordSum.put(className, wordSum.get(className) + numWord + 1.0); // +1.0 because each line is a distinct word (Laplace smoothing)
                else
                    wordSum.put(className, numWord + 1.0);
            }
            fsr.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // now compute the conditional probabilities
        try {
            FileSystem fs = FileSystem.get(URI.create(filePath), conf);
            fsr = fs.open(new Path(filePath));
            bufferedReader = new BufferedReader(new InputStreamReader(fsr));
            while ((lineValue = bufferedReader.readLine()) != null) { // read line by line
                StringTokenizer tokenizer = new StringTokenizer(lineValue);
                String className = tokenizer.nextToken();
                String word = tokenizer.nextToken();
                String numWordTmp = tokenizer.nextToken();
                double numWord = Double.parseDouble(numWordTmp);
                String key = className + "\t" + word;
                conditionalProbability.put(key, (numWord + 1.0) / wordSum.get(className));
                // System.out.println(className + "\t" + word + "\t" + conditionalProbability.get(key));
            }
            fsr.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // assign a probability to words that appear only in the test set:
        // keyed by the bare class name, valued 1.0 / (smoothed word total of the class)
        Iterator<Map.Entry<String, Double>> iterator = wordSum.entrySet().iterator();
        while (iterator.hasNext()) {
            Map.Entry<String, Double> entry = iterator.next();
            conditionalProbability.put(entry.getKey(), 1.0 / entry.getValue());
        }
    }
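
    // Note: the loader above amounts to Laplace (add-one) smoothing. With
    // count(w,c) the occurrences of word w in class c, and D(c) the denominator
    // accumulated in wordSum, i.e. D(c) = sum over distinct w of (count(w,c) + 1):
    //     P(w|c) = (count(w,c) + 1) / D(c)  for words seen in training,
    //     P(w|c) = 1 / D(c)                 as the fallback for unseen words.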
    public static class Prediction_Mapper extends Mapper<LongWritable, Text, Text, Text> {
        public void setup(Context context) throws IOException {
            Get_PriorProbability();     // load the priors
            Get_ConditionProbability(); // load the conditional probabilities
        }
        private Text newKey = new Text();
        private Text newValue = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] lineValues = value.toString().split("\\s"); // split on whitespace
            String class_Name = lineValues[0]; // the true class name
            String fileName = lineValues[1];   // the file name
            for (Map.Entry<String, Double> entry : priorProbability.entrySet()) {
                String className = entry.getKey();
                newKey.set(class_Name + "\t" + fileName); // the output key is <true class \t file name>
                double tempValue = Math.log(entry.getValue()); // multiply the probabilities by summing their logarithms, starting from the prior
                for (int i = 2; i < lineValues.length; i++) {
                    String tempKey = className + "\t" + lineValues[i]; // build <class \t word> and look it up in conditionalProbability
                    if (conditionalProbability.containsKey(tempKey)) {
                        // the word was seen in the training set: add its precomputed log-probability
                        tempValue += Math.log(conditionalProbability.get(tempKey));
                    } else {
                        // a word unseen in training: add the precomputed fallback probability of the class
                        tempValue += Math.log(conditionalProbability.get(className));
                    }
                }
                newValue.set(className + "\t" + tempValue); // the output value is <candidate class \t log-probability>
                context.write(newKey, newValue); // one record per (document, candidate class): <<true class \t file>, <class \t logP>>
                System.out.println(newKey + "\t" + newValue);
            }
        }
    }

    public static class Prediction_Reduce extends Reducer<Text, Text, Text, Text> {
        Text newValue = new Text();
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            boolean flag = false; // on the first iteration just record; afterwards keep the larger probability
            String tempClass = null;
            double tempProbably = 0.0;
            for (Text value : values) {
                System.out.println("value......." + value.toString());
                String[] result = value.toString().split("\\s");
                String className = result[0];
                String probably = result[1];
                if (!flag) { // first iteration
                    tempClass = className;
                    tempProbably = Double.parseDouble(probably);
                    flag = true;
                } else { // otherwise keep the class with the larger log-probability
                    if (Double.parseDouble(probably) > tempProbably) {
                        tempClass = className;
                        tempProbably = Double.parseDouble(probably);
                    }
                }
            }
            newValue.set(tempClass + "\t" + tempProbably);
            // newValue.set(tempClass + ":" + values.iterator().next());
            context.write(key, newValue);
            System.out.println(key + "\t" + newValue);
        }
    }
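
    // Note: scoring runs in log space. A document's score for class c is
    //     log P(c) + sum_i log P(w_i | c),
    // the log of the (unnormalized) naive Bayes posterior. Summing logarithms
    // avoids the floating-point underflow that multiplying hundreds of small
    // probabilities would cause, and argmax is unchanged since log is monotone.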
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        FileSystem hdfs = FileSystem.get(conf);
        Path outputPath2 = new Path(Utils.Test_Prediction);
        if (hdfs.exists(outputPath2))
            hdfs.delete(outputPath2, true);
        Job job4 = Job.getInstance(conf, "Prediction");
        job4.setJarByClass(TestPrediction.class);
        job4.setMapperClass(Prediction_Mapper.class);
        job4.setCombinerClass(Prediction_Reduce.class);
        job4.setReducerClass(Prediction_Reduce.class);
        FileInputFormat.setInputDirRecursive(job4, true);
        job4.setOutputKeyClass(Text.class);   // key type of the reduce output
        job4.setOutputValueClass(Text.class); // value type of the reduce output
        FileInputFormat.addInputPath(job4, new Path(Utils.Test_Preparation));
        FileOutputFormat.setOutputPath(job4, new Path(Utils.Test_Prediction));
        boolean result = job4.waitForCompletion(true);
        return (result ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new TestPrediction(), args);
        System.exit(res);
    }
}
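
[Note] A self-contained sketch of the complete scoring rule with concrete numbers (all names and values hypothetical; ScoreSketch is not part of the repository):

import java.util.HashMap;
import java.util.Map;

public class ScoreSketch {
    public static void main(String[] args) {
        // hypothetical model for a single candidate class
        Map<String, Double> prior = new HashMap<>();
        prior.put("AUSTR", 0.6);
        Map<String, Double> cond = new HashMap<>();
        cond.put("AUSTR\ttrade", 0.02);    // smoothed P(trade | AUSTR)
        double fallback = 0.001;           // 1 / D(AUSTR), used for words unseen in training

        String[] doc = {"trade", "koala"}; // tokens of one test document
        double score = Math.log(prior.get("AUSTR"));
        for (String w : doc) {
            Double p = cond.get("AUSTR\t" + w);
            score += Math.log(p != null ? p : fallback);
        }
        // the predicted class is the one with the largest such score
        System.out.println("log-score for AUSTR = " + score);
    }
}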
--------------------------------------------------------------------------------
/Bayes/TestPreparation.java:
--------------------------------------------------------------------------------
package com.Bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;

public class TestPreparation extends Configured implements Tool {
    public static class TestPreparation_Mapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text key_out = new Text();
        private Text value_out = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            InputSplit inputsplit = context.getInputSplit();
            // the class name is the parent directory name
            String className = ((FileSplit) inputsplit).getPath().getParent().getName();
            // the document name
            String fileName = ((FileSplit) inputsplit).getPath().getName();

            key_out.set(className + "\t" + fileName);
            value_out.set(value.toString());
            context.write(key_out, value_out);
        }
    }

    public static class TestPreparation_Reducer extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();
        private StringBuffer stringBuffer;
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            stringBuffer = new StringBuffer();
            for (Text value : values) {
                stringBuffer = stringBuffer.append(value.toString()).append(" ");
            }
            result.set(stringBuffer.toString());
            context.write(key, result); // one line per test document: <class \t file \t all its words>
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();
        Job job3 = Job.getInstance(conf, "TestPreparation");

        FileSystem fileSystem = FileSystem.get(conf);
        Path outputPath3 = new Path(Utils.Test_Preparation);
        if (fileSystem.exists(outputPath3))
            fileSystem.delete(outputPath3, true);
        // set the jar
        job3.setJarByClass(TestPreparation.class);
        // set the mapper and reducer classes
        job3.setMapperClass(TestPreparation_Mapper.class);
        job3.setCombinerClass(TestPreparation_Reducer.class);
        job3.setReducerClass(TestPreparation_Reducer.class);
        // set the map and reduce output types
        job3.setMapOutputKeyClass(Text.class);
        job3.setMapOutputValueClass(Text.class);
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(Text.class);
        // set the input and output paths
        FileInputFormat.setInputDirRecursive(job3, true);
        FileInputFormat.addInputPath(job3, new Path(Utils.TEST_DATA_PATH));
        // FileInputFormat.setInputPaths(job2, new Path(Utils.TRAIN_DATA_PATH));
        FileOutputFormat.setOutputPath(job3, new Path(Utils.Test_Preparation));
        boolean result = job3.waitForCompletion(true);
        return (result ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(),
                new TestPreparation(), args);
        System.exit(res);
    }
}
--------------------------------------------------------------------------------
/Bayes/Utils.java:
--------------------------------------------------------------------------------
package com.Bayes;

public class Utils {
    public static final String BASE_PATH = "/output/";

    public static final String TRAIN_DATA_PATH = "/liu_data/train";
    public static final String TEST_DATA_PATH = "/liu_data/test";

    public static final String DocNumClass = BASE_PATH + "output1";
    public static final String WordNumClass = BASE_PATH + "output2";

    public static final String Test_Preparation = BASE_PATH + "output3";
    public static final String Test_Prediction = BASE_PATH + "output4";
}
--------------------------------------------------------------------------------
/Bayes/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>test1</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.compiler.source>9</maven.compiler.source>
        <maven.compiler.target>9</maven.compiler.target>
        <maven.compiler.release>9</maven.compiler.release>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.10.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>2.3.2</version>
        </dependency>
    </dependencies>
</project>
--------------------------------------------------------------------------------
/Hadoop报告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hutushanren/hadoopTask/9633ecf6292d320217a0066635fa5e6267c28fec/Hadoop报告.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# hadoopTask

A Hadoop course assignment: a naive Bayes text classifier implemented with MapReduce.

The dataset is in the `data` directory.
--------------------------------------------------------------------------------
/data/NBCorpus.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hutushanren/hadoopTask/9633ecf6292d320217a0066635fa5e6267c28fec/data/NBCorpus.rar
--------------------------------------------------------------------------------
/first.jmx:
--------------------------------------------------------------------------------
[JMeter test plan; the XML markup did not survive extraction. Surviving values indicate: a thread group (apparently 200 threads, ramp-up 0, 10 loops) driving an HTTP POST sampler to /seckill/doSeckill with HTTP defaults of localhost:8080; a CSV Data Set Config reading C:/Users/LMC/Desktop/config.txt (UTF-8, comma-delimited) into the variables mobile and userTicket; an HTTP Cookie Manager setting a cookie from ${userTicket} for domain localhost; and two result collectors.]
--------------------------------------------------------------------------------