├── Hadoop报告.pdf
├── bayes_classify.iml
├── src
│   └── main
│       └── java
│           ├── META-INF
│           │   └── MANIFEST.MF
│           └── com
│               └── loring
│                   └── bayes
│                       ├── BayesClassify.java
│                       ├── Evaluation.java
│                       └── Prediction.java
├── README.md
├── .idea
│   ├── misc.xml
│   ├── compiler.xml
│   ├── uiDesigner.xml
│   ├── artifacts
│   │   └── bayes_classify_jar.xml
│   └── workspace.xml
└── pom.xml

--------------------------------------------------------------------------------
/Hadoop报告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loringfor/bayes_classify/HEAD/Hadoop报告.pdf

--------------------------------------------------------------------------------
/bayes_classify.iml:
--------------------------------------------------------------------------------
(IntelliJ module descriptor; its XML content was not preserved in this dump.)

--------------------------------------------------------------------------------
/src/main/java/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
Manifest-Version: 1.0
Main-Class: com.loring.bayes.WordCount1

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1. Implement the training phase of a Naive Bayes classifier as a MapReduce job and write out the trained model.

2. Use the resulting model to classify the documents of the test set. The classification step may be a standalone Java program or another MapReduce job. Output the predicted class of every test document.

3. Using the true classes of the test documents, compute the Precision, Recall and F1 score of the classification model.

--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(IDE project settings; XML content not preserved in this dump.)

--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
(IDE compiler settings; XML content not preserved in this dump.)

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.loring</groupId>
    <artifactId>bayes_classify</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.4</version>
        </dependency>
    </dependencies>
</project>

--------------------------------------------------------------------------------
/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
(IDE UI-designer palette settings; XML content not preserved in this dump.)

--------------------------------------------------------------------------------
/src/main/java/com/loring/bayes/BayesClassify.java:
--------------------------------------------------------------------------------
package com.loring.bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;

/**
 * @author XYL
 * @date 2018.12.28
 * Naive Bayes classifier: the two training jobs
 */
public class BayesClassify {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);

        // Job 1: count the documents per class; delete the output directory if it already exists
        Path outputPath1 = new Path(args[1]);
        if (hdfs.exists(outputPath1))
            hdfs.delete(outputPath1, true);

        Job job1 = Job.getInstance(conf, "DocNumbers");
        job1.setJarByClass(BayesClassify.class);
        // each training document is read as a single whole-file record
        job1.setInputFormatClass(WholeFileInputFormat.class);
        job1.setMapperClass(DocNums_Map.class);
        job1.setCombinerClass(DocNums_Reduce.class);
        job1.setReducerClass(DocNums_Reduce.class);

        FileInputFormat.setInputDirRecursive(job1, true);
        job1.setOutputKeyClass(Text.class);          // reduce-side output key
        job1.setOutputValueClass(IntWritable.class); // reduce-side output value
        FileInputFormat.addInputPath(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));
        boolean isSuccess = job1.waitForCompletion(true);
        if (!isSuccess) {
            System.exit(1);
        }

        // Job 2: count word occurrences per class
        Path outputPath2 = new Path(args[2]);
        if (hdfs.exists(outputPath2))
            hdfs.delete(outputPath2, true);
        Job job2 = Job.getInstance(conf, "WordCount");
        job2.setJarByClass(BayesClassify.class);

        job2.setMapperClass(WordCount_Map.class);
        job2.setCombinerClass(WordCount_Reduce.class);
        job2.setReducerClass(WordCount_Reduce.class);

        FileInputFormat.setInputDirRecursive(job2, true);
        job2.setOutputKeyClass(Text.class);          // reduce-side output key
        job2.setOutputValueClass(IntWritable.class); // reduce-side output value
        FileInputFormat.addInputPath(job2, new Path(args[0]));
        FileOutputFormat.setOutputPath(job2, new Path(args[2]));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
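
    // The two outputs together form the trained model: job1 writes one line per class,
    //     <class>\t<number of documents in the class>
    // and job2 writes one line per (class, word) pair,
    //     <class>\t<word>\t<total occurrences of the word in that class>.
    // These are the files that Prediction later reads back as prior and conditional counts.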

    /*
     * The first MapReduce counts how many documents belong to each class,
     * in preparation for the prior probabilities.
     * Input:  args[0], the training set
     * Output: args[1], key = class name, value = number of documents in that class
     */
    public static class DocNums_Map extends Mapper<NullWritable, BytesWritable, Text, IntWritable> {
        private Text newKey = new Text();
        private final static IntWritable one = new IntWritable(1);

        public void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
            // get the split currently being processed
            InputSplit inputsplit = context.getInputSplit();
            // the directory structure of the split's path gives the class name (parent directory) and the document name (file name)
            String className = ((FileSplit) inputsplit).getPath().getParent().getName();
            // an earlier version combined class name and document name with a tab; here the class name alone is the key
            //String classAndDoc = className;
            newKey.set(className);
            context.write(newKey, one);
        }
    }

    public static class DocNums_Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

        @Override
        protected boolean isSplitable(JobContext context, Path filename) {
            return false;
        }

        @Override
        public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            WholeFileRecordReader reader = new WholeFileRecordReader();
            reader.initialize(inputSplit, taskAttemptContext);
            return reader;
        }
    }

    public static class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
        private FileSplit fileSplit;                       // the input split, turned into a single (key, value) record
        private Configuration conf;                        // configuration object
        private BytesWritable value = new BytesWritable(); // value object, initially empty
        private boolean processed = false;                 // whether the record has already been emitted

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            this.fileSplit = (FileSplit) split;     // cast the input split to a FileSplit
            this.conf = context.getConfiguration(); // read the configuration from the context
        }

        @Override
        public NullWritable getCurrentKey() throws IOException, InterruptedException {
            return NullWritable.get();
        }

        @Override
        public BytesWritable getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (!processed) { // the record has not been emitted yet
                // allocate a byte array the size of the split
                byte[] contents = new byte[(int) fileSplit.getLength()];
                Path file = fileSplit.getPath();          // input file path of this split
                FileSystem fs = file.getFileSystem(conf); // file system holding the file
                FSDataInputStream in = null;              // input stream
                try {
                    in = fs.open(file);                                  // open the file
                    IOUtils.readFully(in, contents, 0, contents.length); // read the whole file into contents
                    value.set(contents, 0, contents.length);             // hand the bytes to the value object
                } finally {
                    IOUtils.closeStream(in);                             // close the input stream
                }
                processed = true; // the next call will return false
                return true;
            }
            return false; // the single record was already emitted; this split is done
        }

        @Override
        public float getProgress() throws IOException {
            return processed ? 1.0f : 0.0f;
        }

        @Override
        public void close() throws IOException {
            // do nothing
        }
    }
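
    // Added note: the mappers in this file rely on the training set being laid out as one
    // directory per class (an assumed example: /train/sports/doc1.txt, /train/finance/doc2.txt),
    // so that
    //     ((FileSplit) context.getInputSplit()).getPath().getParent().getName()
    // yields the class label ("sports", "finance", ...) of the document being read.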

    /*
     * The second MapReduce counts the occurrences of every word within each class.
     * Input:  args[0], the training set, read as <line offset, line of words>
     * Output: args[2], records of the form <class \t word, count>
     */
    public static class WordCount_Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text nameAndWord = new Text();                      // KEYOUT
        private final static IntWritable one = new IntWritable(1);  // VALUEOUT

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            InputSplit inputsplit = context.getInputSplit();
            String className = ((FileSplit) inputsplit).getPath().getParent().getName();
            String cAndTValue;
            String lineValue = value.toString();
            // tokenize the line on " \t\n\r\f" (space, tab, newline, carriage return, form feed)
            StringTokenizer tokenizer = new StringTokenizer(lineValue);
            while (tokenizer.hasMoreTokens()) {
                // take the next word
                String wordValue = tokenizer.nextToken();
                // the map output key is the class name and the word joined by a tab
                cAndTValue = className + '\t' + wordValue;
                nameAndWord.set(cAndTValue);
                // emit <class \t word, 1>
                context.write(nameAndWord, one);
            }
        }
    }

    // sum the counts of identical <class \t word> keys
    public static class WordCount_Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

}
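A note on usage: BayesClassify takes three arguments, the training-set directory (args[0]), the per-class document-count output (args[1]) and the per-class word-count output (args[2]). Prediction.GetPriorProbably and Prediction.GetConditionProbably later read the model back from the hard-coded paths /output1/part-r-00000 and /output2/part-r-00000, so a run consistent with the rest of the pipeline might look like the minimal sketch below; the /train path and the TrainDriver wrapper are assumptions for illustration, not part of the project.

package com.loring.bayes;

public class TrainDriver {
    public static void main(String[] args) throws Exception {
        // roughly equivalent to:
        //   hadoop jar bayes_classify.jar com.loring.bayes.BayesClassify /train /output1 /output2
        BayesClassify.main(new String[]{"/train", "/output1", "/output2"});
    }
}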
--------------------------------------------------------------------------------
/src/main/java/com/loring/bayes/Evaluation.java:
--------------------------------------------------------------------------------
package com.loring.bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;

public class Evaluation {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);

        Path outputPath1 = new Path(args[1]);
        if (hdfs.exists(outputPath1))
            hdfs.delete(outputPath1, true);

        // Job 1: collect, per class, the documents that truly belong to it
        Job job1 = Job.getInstance(conf, "OriginalDocOfClass");
        job1.setJarByClass(Evaluation.class);
        job1.setMapperClass(Evaluation.OriginalDocOfClassMap.class);
        job1.setCombinerClass(Evaluation.Reduce.class);
        job1.setReducerClass(Evaluation.Reduce.class);
        FileInputFormat.setInputDirRecursive(job1, true);
        job1.setOutputKeyClass(Text.class);   // reduce-side output key
        job1.setOutputValueClass(Text.class); // reduce-side output value
        FileInputFormat.addInputPath(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));
        boolean isSuccess = job1.waitForCompletion(true);
        if (!isSuccess) {
            System.exit(1);
        }

        // Job 2: collect, per class, the documents that the classifier assigned to it
        Path outputPath2 = new Path(args[3]);
        if (hdfs.exists(outputPath2))
            hdfs.delete(outputPath2, true);
        Job job2 = Job.getInstance(conf, "ClassifiedDocOfClass");
        job2.setJarByClass(Evaluation.class);
        job2.setMapperClass(Evaluation.ClassifiedDocOfClassMap.class);
        job2.setCombinerClass(Evaluation.Reduce.class);
        job2.setReducerClass(Evaluation.Reduce.class);
        FileInputFormat.setInputDirRecursive(job2, true);
        job2.setOutputKeyClass(Text.class);   // reduce-side output key
        job2.setOutputValueClass(Text.class); // reduce-side output value
        FileInputFormat.addInputPath(job2, new Path(args[2]));
        FileOutputFormat.setOutputPath(job2, new Path(args[3]));
        //System.exit(job2.waitForCompletion(true) ? 0 : 1);
        isSuccess = job2.waitForCompletion(true);
        if (!isSuccess) {
            System.exit(1);
        }

        GetEvaluation(conf, args[1] + "/part-r-00000", args[3] + "/part-r-00000");
    }

    /**
     * Recover the true class of every document.
     * Input:  the labelled test data, one line per document in the form <class \t docname \t word1 word2 ...>
     * Output: the true assignment <class, docname>
     */
    public static class OriginalDocOfClassMap extends Mapper<LongWritable, Text, Text, Text> {
        private Text newKey = new Text();
        private Text newValue = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // split the line on whitespace (" \t\n\r\f")
            String[] result = value.toString().split("\\s");
            String className = result[0]; // class name
            String docName = result[1];   // document name
            newKey.set(className);
            newValue.set(docName);
            context.write(newKey, newValue);
            System.out.println(newKey + "\t" + newValue);
        }
    }

    /**
     * Recover the class assigned by the Naive Bayes classifier.
     * Reads the classifier's output and turns it into <class, docname> pairs.
     */
    public static class ClassifiedDocOfClassMap extends Mapper<LongWritable, Text, Text, Text> {
        private Text newKey = new Text();
        private Text newValue = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // split the line on whitespace (" \t\n\r\f")
            String[] result = value.toString().split("\\s");
            String docName = result[0];   // document name
            String className = result[1]; // class name
            newKey.set(className);
            newValue.set(docName);
            context.write(newKey, newValue);
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // build the list of documents belonging to this class
            StringBuffer fileList = new StringBuffer();
            for (Text value : values) {
                fileList.append(value + "\t");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    /**
     * The first MapReduce produces, per class, the documents that truly belong to it; the second produces,
     * per class, the documents the Naive Bayes classifier assigned to it.
     * This function counts, for each class, the documents the two lists have in common, i.e. the correctly
     * classified documents (TP).
     * The true size of a class minus the correct count gives the documents that belong to the class but were
     * classified elsewhere (FN = OriginalCounts - TP).
     * The classified size of a class minus the correct count gives the documents that were put into the class
     * although they do not belong to it (FP = ClassifiedCounts - TP).
     */
    // Precision: P = TP / (TP + FP)
    // Recall:    R = TP / (TP + FN)
    // Harmonic mean of P and R: F1 = 2PR / (P + R)
    // Across all classes:
    // macro-averaged precision: (p1 + p2 + ... + pN) / N
    // micro-averaged precision: sum the per-class counts first, then compute the overall P and R
    public static void GetEvaluation(Configuration conf, String ClassifiedDocOfClassFilePath, String OriginalDocOfClassFilePath) throws IOException {

        // the file with the true classes
        FileSystem fs1 = FileSystem.get(URI.create(OriginalDocOfClassFilePath), conf);
        FSDataInputStream fsr1 = fs1.open(new Path(OriginalDocOfClassFilePath));
        BufferedReader reader1 = new BufferedReader(new InputStreamReader(fsr1));
        // the file with the predicted classes
        FileSystem fs2 = FileSystem.get(URI.create(ClassifiedDocOfClassFilePath), conf);
        FSDataInputStream fsr2 = fs2.open(new Path(ClassifiedDocOfClassFilePath));
        BufferedReader reader2 = new BufferedReader(new InputStreamReader(fsr2));

        ArrayList<String> ClassNames = new ArrayList<String>();      // class names, in file order
        ArrayList<Integer> TruePositive = new ArrayList<Integer>();  // TP: documents classified into their true class
        ArrayList<Integer> FalseNegative = new ArrayList<Integer>(); // FN: documents of the class not assigned to it
        ArrayList<Integer> FalsePositive = new ArrayList<Integer>(); // FP: documents assigned to the class that do not belong to it
        ArrayList<Double> precision = new ArrayList<Double>();
        ArrayList<Double> recall = new ArrayList<Double>();
        ArrayList<Double> F1 = new ArrayList<Double>();

        try {
            reader1 = new BufferedReader(new InputStreamReader(fsr1)); // reader for the first file
            String lineValue1 = null;
            String lineValue2 = null;
            while ((lineValue1 = reader1.readLine()) != null && (lineValue2 = reader2.readLine()) != null) { // read line by line
                // split each line on whitespace (" \t\n\r\f")
                String[] result1 = lineValue1.split("\\s");
                String[] result2 = lineValue2.split("\\s");
                // the two files list the classes in the same order, so the lines can be processed pairwise
                //System.out.println(result1[0]+"\t"+result2[0]);
                String className = result1[0];
                ClassNames.add(className);

                int TP = 0;
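                // The remainder of GetEvaluation (the per-class counting loop, the macro/micro
                // averages and the final printout) did not survive this dump. Based on the comment
                // block above, the missing part presumably resembled the following sketch; every
                // identifier not declared above is an assumption:
                //
                //     for (int i = 1; i < result1.length; i++)       // documents listed for this class in the first file
                //         for (int j = 1; j < result2.length; j++)   // documents listed for this class in the second file
                //             if (result1[i].equals(result2[j]))
                //                 TP++;                              // present in both lists: correctly classified
                //     int FN = (result1.length - 1) - TP;            // FN = OriginalCounts - TP
                //     int FP = (result2.length - 1) - TP;            // FP = ClassifiedCounts - TP
                //     TruePositive.add(TP); FalseNegative.add(FN); FalsePositive.add(FP);
                //     double P = TP / (double) (TP + FP);
                //     double R = TP / (double) (TP + FN);
                //     precision.add(P); recall.add(R); F1.add(2 * P * R / (P + R));
                //
                // followed, after the while loop, by averaging the per-class values (macro-average)
                // and by recomputing P and R from the summed TP/FP/FN counts (micro-average).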
--------------------------------------------------------------------------------
/.idea/artifacts/bayes_classify_jar.xml:
--------------------------------------------------------------------------------
(IDE artifact definition for building the project jar; apart from the output path
$PROJECT_DIR$/out/artifacts/bayes_classify_jar, its XML content was not preserved in this dump.)

--------------------------------------------------------------------------------
/src/main/java/com/loring/bayes/Prediction.java:
--------------------------------------------------------------------------------
package com.loring.bayes;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;

public class Prediction {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);

        Path outputPath1 = new Path(args[1]);
        if (hdfs.exists(outputPath1))
            hdfs.delete(outputPath1, true);
        // Job 1: gather each test document into a single line of words
        Job job1 = Job.getInstance(conf, "Word");
        job1.setJarByClass(Prediction.class);
        job1.setMapperClass(Prediction.WordMapper.class);
        job1.setCombinerClass(Prediction.WordReducer.class);
        job1.setReducerClass(Prediction.WordReducer.class);
        FileInputFormat.setInputDirRecursive(job1, true);
        job1.setOutputKeyClass(Text.class);   // reduce-side output key
        job1.setOutputValueClass(Text.class); // reduce-side output value
        FileInputFormat.addInputPath(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));
        boolean isSuccess = job1.waitForCompletion(true);
        if (!isSuccess) {
            System.exit(1);
        }

        // Job 2: score every document against every class and keep the best one
        Path outputPath2 = new Path(args[2]);
        if (hdfs.exists(outputPath2))
            hdfs.delete(outputPath2, true);
        Job job2 = Job.getInstance(conf, "Prediction");
        job2.setJarByClass(Prediction.class);
        job2.setMapperClass(Prediction.DocOfClassMap.class);
        job2.setCombinerClass(Prediction.DocOfClassReduce.class);
        job2.setReducerClass(Prediction.DocOfClassReduce.class);
        FileInputFormat.setInputDirRecursive(job2, true);
        job2.setOutputKeyClass(Text.class);   // reduce-side output key
        job2.setOutputValueClass(Text.class); // reduce-side output value
        FileInputFormat.addInputPath(job2, new Path(args[1]));
        FileOutputFormat.setOutputPath(job2, new Path(args[2]));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }

    public static class WordMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text newKey = new Text();
        private Text newValue = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            InputSplit inputsplit = context.getInputSplit();
            // class name (parent directory of the document)
            String className = ((FileSplit) inputsplit).getPath().getParent().getName();
            // document name
            String docName = ((FileSplit) inputsplit).getPath().getName();
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                newKey.set(className + "\t" + docName);
                newValue.set(itr.nextToken());
                context.write(newKey, newValue);
            }
        }
    }

    public static class WordReducer extends Reducer<Text, Text, Text, Text> {
        private Text result = new Text();
        private StringBuffer stringBuffer;

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            stringBuffer = new StringBuffer();
            for (Text word : values) {
                stringBuffer = stringBuffer.append(word.toString() + " ");
            }
            result.set(stringBuffer.toString());
            System.out.println("key===>" + key);
            System.out.println("value===>" + result.toString());
            context.write(key, result);
        }
    }

    /*
     * The third MapReduce of the pipeline performs the actual Naive Bayes classification.
     * Input:  the preprocessed test data written by the "Word" job above (args[1] here),
     *         one line per document in the form <class \t docname \t word1 word2 ...>
     * HashMap<String, Double> classProbably holds the prior probabilities,
     * HashMap<String, Double> wordsProbably holds the conditional probabilities.
     * Output: args[2], the class assigned to each document, as <docname, class \t log-probability>
     */
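    // For a document d with words t1..tn the mapper below computes, for every class c,
    //     score(c, d) = log P(c) + sum_k log P(tk | c)
    // and the reducer keeps the class with the highest score; the logarithms replace the
    // product of probabilities, which would underflow for long documents.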
    public static class DocOfClassMap extends Mapper<LongWritable, Text, Text, Text> {
        public void setup(Context context) throws IOException {
            GetPriorProbably();     // prior probabilities
            GetConditionProbably(); // conditional probabilities
        }

        private Text newKey = new Text();
        private Text newValue = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // split the line on whitespace (" \t\n\r\f")
            String[] result = value.toString().split("\\s");
            String docName = result[1]; // the second token; the first is the class name
            for (Map.Entry<String, Double> entry : classProbably.entrySet()) { // outer loop over all classes
                String myKey = entry.getKey(); // class name
                newKey.set(docName);           // output key: <document name>
                // start from the prior; the product of probabilities is replaced by a sum of logarithms
                double tempValue = Math.log(entry.getValue());

                for (int i = 2; i < result.length; i++) {
                    // build <class \t word> and look up its probability in the wordsProbably table
                    String tempKey = myKey + "\t" + result[i];
                    if (wordsProbably.containsKey(tempKey)) {
                        // the word occurred in the training data of this class: add its precomputed log-probability
                        tempValue += Math.log(wordsProbably.get(tempKey));
                    } else {
                        // a word unseen in training: add the precomputed new-word probability of this class
                        tempValue += Math.log(wordsProbably.get(myKey));
                    }
                }
                newValue.set(myKey + "\t" + tempValue); // output value: <class \t log-probability>
                // one record per (document, class) pair, i.e. <docname, <class, log-probability>>
                context.write(newKey, newValue);
                System.out.println(newKey + "\t" + newValue);
            }
        }
    }

    public static class DocOfClassReduce extends Reducer<Text, Text, Text, Text> {
        Text newValue = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            boolean flag = false; // false until the first value is seen; afterwards keep the larger score
            String tempClass = null;
            double tempProbably = 0.0;
            for (Text value : values) {
                System.out.println("value......." + value.toString());
                String[] result = value.toString().split("\\s");
                String className = result[0];
                String probably = result[1];

                if (!flag) { // first iteration: take the first class and score
                    tempClass = className;
                    tempProbably = Double.parseDouble(probably);
                    flag = true;
                } else {     // otherwise update tempClass and tempProbably when the score is larger
                    if (Double.parseDouble(probably) > tempProbably) {
                        tempClass = className;
                        tempProbably = Double.parseDouble(probably);
                    }
                }
            }

            newValue.set(tempClass + "\t" + tempProbably);
            //newValue.set(tempClass+":"+values.iterator().next());
            context.write(key, newValue);
            System.out.println(key + "\t" + newValue);
        }
    }

    /*
     * Prior probabilities:
     * this static function computes the share of each class among all training documents,
     * i.e. P(c) = number of documents in class c / total number of training documents.
     * Input:  the output of the first training MapReduce (args[1] of BayesClassify)
     * Output: a HashMap holding <class name, probability>
     */
    private static HashMap<String, Double> classProbably = new HashMap<String, Double>(); // <class, probability>

    public static HashMap<String, Double> GetPriorProbably() throws IOException {
        Configuration conf = new Configuration();
        String filePath = "/output1/part-r-00000";
        FSDataInputStream fsr = null;
        BufferedReader bufferedReader = null;
        String lineValue = null;
        double sum = 0; // total number of documents

        try {
            FileSystem fs = FileSystem.get(URI.create(filePath), conf);
            fsr = fs.open(new Path(filePath));
            bufferedReader = new BufferedReader(new InputStreamReader(fsr));
            while ((lineValue = bufferedReader.readLine()) != null) { // read line by line
                // tokenize the line on " \t\n\r\f"
                StringTokenizer tokenizer = new StringTokenizer(lineValue);
                String className = tokenizer.nextToken(); // class name
                String num_C_Tmp = tokenizer.nextToken(); // number of documents
                double numC = Double.parseDouble(num_C_Tmp);
                classProbably.put(className, numC);
                sum = sum + numC; // accumulate the total number of documents
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // normalize the per-class counts into probabilities
        Iterator iterator = classProbably.entrySet().iterator();
        while (iterator.hasNext()) {
            Map.Entry entry = (Map.Entry) iterator.next();
            Object key = entry.getKey();
            double val = Double.parseDouble(entry.getValue().toString()) / sum;
            classProbably.put(key.toString(), val);
            System.out.println(classProbably.get(key));
        }
        return classProbably;
    }
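
    // Worked example with assumed numbers: if /output1/part-r-00000 contains the two lines
    //     economy  40
    //     sports   60
    // then sum = 100 and classProbably becomes {economy=0.4, sports=0.6}, i.e. P(c) = Nc / N.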

    /*
     * Conditional probabilities:
     * P(tk|c) = (number of occurrences of word tk in documents of class c + 1)
     *           / (total number of words in class c + number of distinct words in class c)
     * Input:  the output of the second training MapReduce, records of the form <class \t word, count>
     * Output: a HashMap holding <class \t word, probability>
     */
    private static HashMap<String, Double> wordsProbably = new HashMap<String, Double>();

    public static HashMap<String, Double> GetConditionProbably() throws IOException {
        String filePath = "/output2/part-r-00000";
        Configuration conf = new Configuration();
        FSDataInputStream fsr = null;
        BufferedReader bufferedReader = null;
        String lineValue = null;
        HashMap<String, Double> wordSum = new HashMap<String, Double>(); // <class name, smoothed word total>

        try {
            FileSystem fs = FileSystem.get(URI.create(filePath), conf);
            fsr = fs.open(new Path(filePath));
            bufferedReader = new BufferedReader(new InputStreamReader(fsr));
            while ((lineValue = bufferedReader.readLine()) != null) { // read line by line
                // tokenize the line on " \t\n\r\f"
                StringTokenizer tokenizer = new StringTokenizer(lineValue);
                String className = tokenizer.nextToken();
                String word = tokenizer.nextToken();
                String numWordTmp = tokenizer.nextToken();
                double numWord = Double.parseDouble(numWordTmp);
                if (wordSum.containsKey(className))
                    // add 1.0 per line because every line is a distinct word of the class,
                    // which accumulates the vocabulary term of the smoothing denominator
                    wordSum.put(className, wordSum.get(className) + numWord + 1.0);
                else
                    wordSum.put(className, numWord + 1.0);
            }
            fsr.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // second pass: compute the conditional probabilities themselves
        try {
            FileSystem fs = FileSystem.get(URI.create(filePath), conf);
            fsr = fs.open(new Path(filePath));
            bufferedReader = new BufferedReader(new InputStreamReader(fsr));
            while ((lineValue = bufferedReader.readLine()) != null) { // read line by line
                // tokenize the line on " \t\n\r\f"
                StringTokenizer tokenizer = new StringTokenizer(lineValue);
                String className = tokenizer.nextToken();
                String word = tokenizer.nextToken();
                String numWordTmp = tokenizer.nextToken();
                double numWord = Double.parseDouble(numWordTmp);
                String key = className + "\t" + word;
                wordsProbably.put(key, (numWord + 1.0) / wordSum.get(className));
                //System.out.println(className+"\t"+word+"\t"+wordsProbably.get(key));
            }
            fsr.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        // assign a probability to words that appear only in the test set,
        // stored under the class name itself as the key
        Iterator iterator = wordSum.entrySet().iterator();
        while (iterator.hasNext()) {
            Map.Entry entry = (Map.Entry) iterator.next(); // iterate over the <class, total> entries
            Object key = entry.getKey();                   // class name
            wordsProbably.put(key.toString(), 1.0 / Double.parseDouble(entry.getValue().toString()));
        }

        return wordsProbably;
    }

}
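The two passes over /output2/part-r-00000 implement Laplace (add-one) smoothing: for a class c the denominator accumulated in wordSum is the total number of word occurrences in c plus the number of distinct words seen in c, each seen word gets (count + 1) / denominator, and a word unseen in training falls back to 1 / denominator, stored under the class name. A self-contained toy sketch of that arithmetic; the class and word names and counts are assumptions, not data from the project:

import java.util.HashMap;
import java.util.Map;

// Toy illustration of the smoothing used in GetConditionProbably.
public class SmoothingExample {
    public static void main(String[] args) {
        // made-up training counts for one class, "sports": football seen 3 times, match once
        Map<String, Double> counts = new HashMap<>();
        counts.put("football", 3.0);
        counts.put("match", 1.0);

        // denominator = total occurrences (4) + distinct words (2) = 6
        double denom = 0.0;
        for (double c : counts.values()) denom += c + 1.0;

        Map<String, Double> prob = new HashMap<>();
        for (Map.Entry<String, Double> e : counts.entrySet())
            prob.put(e.getKey(), (e.getValue() + 1.0) / denom); // (count + 1) / denom
        double unseen = 1.0 / denom;                            // fallback for new words

        System.out.println(prob);   // football = 4/6 = 0.666..., match = 2/6 = 0.333...
        System.out.println(unseen); // 1/6 = 0.1666...
    }
}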
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace state — window layout, run configurations and local search history;
its XML content was not preserved in this dump.)