├── Decision_Tree_ID3_MapReduce ├── ReadMe.txt ├── input │ └── 1.txt ├── output │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ └── part-r-00000 └── src │ ├── BuildTree.java │ ├── DT_ID3_Driver.java │ ├── DT_ID3_Map.java │ └── DT_ID3_Reduce.java ├── KMeansClustering_MapReduce ├── Readme.txt ├── input │ └── 1.txt ├── output │ └── 1.txt └── src │ ├── KMeansCentroidCalculationDriver_ClassificationDriver.java │ ├── KMeansCentroidCalculationMap.java │ ├── KMeansCentroidCalculationReduce.java │ └── KMeansClassificationReduce.java ├── KNN_MapReduce ├── Readme.txt ├── input │ ├── input_to_be_classified.txt │ └── iris_training_data.txt └── src │ ├── Driver.java │ ├── Map.java │ └── Reduce.java ├── LICENSE ├── LUDecomposition ├── .classpath ├── .project ├── README.md ├── input │ └── test_input_4x4.txt ├── output │ ├── .nth.crc │ ├── LU_Components │ │ ├── L │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-r-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-r-00000 │ │ └── U │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-r-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-r-00000 │ ├── after-2-runs │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 │ ├── after-3-runs │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 │ ├── nth │ └── total_records │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 └── src │ └── lud │ ├── Utils.java │ ├── io │ ├── LongAndTextWritable.java │ ├── NaturalKeyGroupingComparator.java │ ├── TextPair.java │ ├── TextPairComparator.java │ └── TextPairPartitioner.java │ └── naiveGaussian │ ├── initial_input_mapper.java │ ├── lud_driver.java │ ├── lud_mapper.java │ ├── lud_reducer.java │ ├── mergeResults │ ├── merge_results_driver.java │ ├── merge_results_mapper.java │ └── merge_results_reducer.java │ └── totalRecords │ ├── total_records_driver.java │ ├── total_records_mapper.java │ └── total_records_reducer.java ├── LinearRegression_MapReduce ├── Readme.txt ├── input │ └── linear.txt └── src │ ├── Driver.java │ ├── thetaMAP.java │ └── thetaREDUCE.java ├── LogisticRegression_MapReduce ├── Readme.txt ├── input │ └── diabetes.txt └── src │ ├── Driver.java │ ├── thetaMAP.java │ └── thetaREDUCE.java ├── Market-Basket-Analysis_MapReduce ├── ReadMe.txt ├── input │ └── in.txt.txt ├── output │ └── part-r-00000 └── src │ ├── MBA_Driver.java │ ├── MBA_Mapper.java │ └── MBA_Reducer.java ├── MatrixMultiplication_MapReduce ├── Readme.txt ├── input │ └── 1.txt └── src │ ├── MatMulDriver.java │ ├── MatMulMap.java │ └── MatMulReduce.java ├── Mutual-Friends_MapReduce ├── ReadMe.txt ├── input │ └── in.txt.txt ├── output │ └── part-r-00000 └── src │ ├── MF_Driver.java │ ├── MF_Mapper.java │ ├── MF_Reducer.java │ └── gen_mutual_friends_matrix.java ├── Naive_Bayes_Classifier_MapReduce ├── Readme.txt ├── input │ └── 1.txt ├── output │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ └── part-r-00000 └── src │ ├── NBCDriver.java │ ├── NBCMap.java │ └── NBCReduce.java ├── README.md ├── Recommendation_Collaborative_Filtering_MapReduce ├── Readme.txt ├── input │ └── recommendation.txt ├── outputs │ ├── Intermediate_output │ │ └── part-r-00000 │ ├── final_output │ │ └── part-r-00000 │ └── n.txt └── src │ ├── FinalMap.java │ ├── FinalReduce.java │ ├── RecDriver.java │ ├── RecMap.java │ ├── RecReduce.java │ ├── get_co_oc_mat.java │ ├── get_unique_items.java │ └── get_unique_users.java ├── Top_N_MapReduce ├── ReadMe.txt ├── in │ ├── 1.txt │ ├── 2.txt │ └── 3.txt ├── out │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── 
part-r-00000 └── src │ ├── Top_N_Driver.java │ ├── Top_N_Mapper.java │ └── Top_N_Reducer.java └── lu_decomposition ├── .classpath ├── .project ├── README.md ├── input └── test_input_4x4.txt ├── output-merged ├── lower │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 └── upper │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 ├── output-run-0 ├── ._SUCCESS.crc ├── .part-m-00000.crc ├── _SUCCESS └── part-m-00000 ├── output-run-1 ├── ._SUCCESS.crc ├── .part-m-00000.crc ├── _SUCCESS └── part-m-00000 ├── output-run-2 ├── ._SUCCESS.crc ├── .part-m-00000.crc ├── _SUCCESS └── part-m-00000 ├── output ├── nth │ ├── ._SUCCESS.crc │ ├── .part-m-00000.crc │ ├── _SUCCESS │ └── part-m-00000 └── total_records │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 └── src └── lu_decomposition └── naive_gausssian ├── FindNthRow ├── find_nth_driver.java └── find_nth_mapper.java ├── MergeResults ├── merge_results_driver.java ├── merge_results_mapper.java └── merge_results_reducer.java ├── TotalRecords ├── total_records_driver.java ├── total_records_mapper.java └── total_records_reducer.java ├── io ├── LongAndTextWritable.java ├── NaturalKeyGroupingComparator.java ├── TextPair.java ├── TextPairComparator.java └── TextPairPartitioner.java ├── lud_driver.java └── lud_mapper.java /Decision_Tree_ID3_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This is a MapReduce implementation of the ID3 (Iterative Dichotomiser 3) Decision Tree algorithm. 2 | 3 | This program accepts 2 user inputs: 4 | 5 | 1. The input path 6 | 7 | 2. The output path 8 | 9 | The example input that I have used in this project is a file which tells us about a student's activity given certain factors. 10 | So the factors are (in order): Deadline?, Is there a Party?, Is he/she lazy? and finally the output is Activity. 11 | 12 | So the 0,1,2 in the output file are nothing but Deadline?, Is there a Party? and Is he/she lazy? respectively. 13 | 14 | Although I get the correct outputs from this algorithm, there are still a lot of shortcomings in the code which will be fixed in time. 15 | 16 | This algorithm computes a decision tree on each data block and then merges (roughly averages) all the trees at the reducer, which is computationally expensive for a single reducer and calls for a slightly different approach. 17 | 18 | The output of this algorithm is a simple string which represents a graph (of sorts, since I did not get time to write a Graph API). 19 | Every line starts with an integer which is the node, followed by the connected nodes as values. 20 | On every line, outputs are separated by "|". If an output doesn't have a ";"-separated value attached to it, then it is a node 21 | which will be split further, and its contents are written on the next line. 22 | 23 | I use a LinkedHashMap to store the graph, since it preserves the insertion order of all the entries in the map.
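To make that format concrete, here is a minimal, hypothetical parsing sketch (not part of this repository). It reads one line of the sample `part-r-00000` output shown below, where the leading integer is the feature index and each `|`-separated token is either `value;class` (a leaf) or a bare `value` (a branch that is split on a later line):

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class ParseTreeLine {
  public static void main(String[] args) {
    // Sample line taken from the part-r-00000 output shown below.
    String line = "0,Urgent;Study|Near|None;Pub";
    int comma = line.indexOf(',');
    String node = line.substring(0, comma);              // index of the feature used for the split
    Map<String, String> branches = new LinkedHashMap<>();
    for (String token : line.substring(comma + 1).split("\\|")) {
      String[] parts = token.split(";", 2);
      // parts[0] is a feature value; parts[1] (if present) is the predicted class for that
      // value, otherwise the branch is split further on a following output line.
      branches.put(parts[0], parts.length == 2 ? parts[1] : "<split on a later line>");
    }
    System.out.println("feature " + node + " -> " + branches);
    // Prints: feature 0 -> {Urgent=Study, Near=<split on a later line>, None=Pub}
  }
}
```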
24 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | Urgent,Yes,Yes,Party 2 | Urgent,No,Yes,Study 3 | Near,Yes,Yes,Party 4 | None,Yes,No,Party 5 | None,No,Yes,Pub 6 | None,Yes,No,Party 7 | Near,No,No,Study 8 | Near,No,Yes,TV 9 | Near,Yes,Yes,Party 10 | Urgent,No,No,Study 11 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/output/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Decision_Tree_ID3_MapReduce/output/.part-r-00000.crc -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1,Yes;Party|No 2 | 0,Urgent;Study|Near|None;Pub 3 | 2,Yes;TV|No;Study 4 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/BuildTree.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.HashMap; 3 | import java.util.Iterator; 4 | import java.util.LinkedHashMap; 5 | import java.util.Map; 6 | import java.util.Map.Entry; 7 | 8 | public class BuildTree{ 9 | public static int feat_count=0; 10 | public static LinkedHashMap p=new LinkedHashMap(); 11 | public static HashMap nodes=new HashMap(); 12 | public static HashMap gain= new HashMap(); 13 | public static HashMap intermediate= new HashMap(); 14 | public static HashMap feature_count= new HashMap(); 15 | public static HashMap outcome_count= new HashMap(); 16 | 17 | public static String[] getMax_m(HashMap x){ 18 | String maxKey=""; 19 | Double maxValue=Double.NEGATIVE_INFINITY; 20 | for(Entry e:x.entrySet()){ 21 | if(e.getValue()>maxValue){ 22 | maxKey=e.getKey(); 23 | maxValue=e.getValue(); 24 | } 25 | } 26 | String[] s=new String[2]; 27 | s[0]=String.valueOf(maxKey); 28 | s[1]=String.valueOf(maxValue); 29 | return s; 30 | } 31 | 32 | public static String[] getMax(HashMap x){ 33 | int maxKey=-1; 34 | Double maxValue=Double.NEGATIVE_INFINITY; 35 | for(Entry e:x.entrySet()){ 36 | if(e.getValue()>maxValue && !nodes.containsKey(String.valueOf(e.getKey()))){ 37 | maxKey=e.getKey(); 38 | maxValue=e.getValue(); 39 | } 40 | } 41 | String[] s=new String[2]; 42 | s[0]=String.valueOf(maxKey); 43 | s[1]=String.valueOf(maxValue); 44 | return s; 45 | } 46 | 47 | public static LinkedHashMap build(LinkedHashMap g,ArrayList data,int size){ 48 | if(p.size()==0) 49 | p.putAll(g); 50 | if(feat_count==0) 51 | feat_count=data.get(0).split("\\,").length-1; 52 | 53 | for(int i=0;i e:feature_count.entrySet()){ 72 | String[] key=e.getKey().split("\\,"); 73 | if(intermediate.containsKey(Integer.parseInt(key[0]))) 74 | intermediate.put(Integer.parseInt(key[0]), intermediate.get(Integer.parseInt(key[0]))+","+key[1]+":"+(e.getValue())); 75 | else 76 | intermediate.put(Integer.parseInt(e.getKey().split("\\,")[0]), String.valueOf(key[1]+":"+e.getValue())); 77 | } 78 | // Calculating the entropy of the whole Set. 
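// (Explanatory comment added to this listing, not part of the original source.)
// The code below computes the standard ID3 quantities. For a set S with class proportions p_c:
//   entropy(S) = - sum over classes c of  p_c * log2(p_c)
//   gain(S, A) = entropy(S) - sum over values v of A of  (|S_v| / |S|) * entropy(S_v)
// where S_v is the subset of S in which feature A takes the value v. The feature with the
// highest gain (accumulated in the `gain` map) is chosen as the next node to split on.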
79 | double entropy=0.0; 80 | for(Entry e:outcome_count.entrySet()){ 81 | double p=((e.getValue()/size)); 82 | entropy+=-(p*(Math.log(p)/Math.log(2))); 83 | } 84 | 85 | // Initialising the gain Map with all the keys 86 | // and the initial information gain which is ofcourse 87 | // the entropy of whole Set. 88 | for(int i=0;i e:intermediate.entrySet()){ 92 | if(gain.containsKey(e.getKey())){ 93 | double info_gain_except_the_entropy=0.0; 94 | String[] counts=e.getValue().split("\\,"); 95 | HashMap feat=new HashMap(); 96 | for(int j=0;j r:feat.entrySet()){ 103 | String[] c=r.getValue().split("\\,"); 104 | int num=0; 105 | for(int x=0;x test=new HashMap(); 122 | for(Entry z:feature_count.entrySet()){ 123 | String[] parts=z.getKey().split("\\,"); 124 | if(parts[0].contentEquals(key)){ 125 | if(test.containsKey(parts[1]+";"+parts[2])) 126 | test.put(parts[1]+";"+parts[2], test.get(parts[1]+";"+parts[2])+1); 127 | else 128 | test.put(parts[1]+";"+parts[2], (double) 1); 129 | } 130 | } 131 | String return_value=(key+","+getMax_m(test)[0]); 132 | HashMap ret=new HashMap(); 133 | ret.put(key, getMax_m(test)[0]); 134 | if(p.containsKey(key)) 135 | p.put(key, p.get(key)+"|"+getMax_m(test)[0]); 136 | else 137 | p.put(key, getMax_m(test)[0]); 138 | ArrayList indices=new ArrayList(); 139 | for(int i=0;i test2=new HashMap(); 164 | for(Entry E:test.entrySet()){ 165 | if(test2.containsKey(E.getKey().split("\\;")[0])) 166 | test2.put(E.getKey().split("\\;")[0], test2.get(E.getKey().split("\\;")[0])+1); 167 | else 168 | test2.put(E.getKey().split("\\;")[0], (double) 1); 169 | } 170 | Iterator> it1=test.entrySet().iterator(),it2=test2.entrySet().iterator(); 171 | while (it1.hasNext() && it2.hasNext()){ 172 | Map.Entry pairs1=(Entry) it1.next(); 173 | Map.Entry pairs2=(Entry) it2.next(); 174 | 175 | if(p.containsKey(key)) 176 | if(pairs2.getValue()==(double) 1) 177 | p.put(key, p.get(key)+"|"+pairs1.getKey()); 178 | else 179 | p.put(key, p.get(key)+"|"+pairs2.getKey()); 180 | else 181 | if(pairs2.getValue()==(double) 1) 182 | p.put(key, pairs1.getKey()); 183 | else 184 | p.put(key, pairs2.getKey()); 185 | } 186 | int r=0; 187 | String vl=""; 188 | for(Entry n:p.entrySet()){ 189 | ++r; 190 | if(r==p.size()){ 191 | String[] i=n.getValue().split("\\|"); 192 | int count=i.length-1; 193 | for(int v=0;v test2=new HashMap(); 208 | for(Entry E:test.entrySet()){ 209 | if(test2.containsKey(E.getKey().split("\\;")[0])) 210 | test2.put(E.getKey().split("\\;")[0], test2.get(E.getKey().split("\\;")[0])+1); 211 | else 212 | test2.put(E.getKey().split("\\;")[0], (double) 1); 213 | } 214 | Iterator> it1=test.entrySet().iterator(),it2=test2.entrySet().iterator(); 215 | while (it1.hasNext() && it2.hasNext()){ 216 | Map.Entry pairs1=(Entry) it1.next(); 217 | Map.Entry pairs2=(Entry) it2.next(); 218 | 219 | if(p.containsKey(key)) 220 | if(pairs2.getValue()==(double) 1) 221 | p.put(key, p.get(key)+"|"+pairs1.getKey()); 222 | else 223 | p.put(key, p.get(key)+"|"+pairs2.getKey()); 224 | else 225 | if(pairs2.getValue()==(double) 1) 226 | p.put(key, pairs1.getKey()); 227 | else 228 | p.put(key, pairs2.getKey()); 229 | } 230 | return build(p,data,data.size()); 231 | } 232 | } 233 | } -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/DT_ID3_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import 
org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | 9 | public class DT_ID3_Driver { 10 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 11 | Configuration conf=new Configuration(); 12 | Job job = new Job(conf); 13 | job.setJarByClass(DT_ID3_Driver.class); 14 | job.setJobName("Decision_Tree_Algorithm_on_Hadoop"); 15 | FileInputFormat.setInputPaths(job, new Path(args[0])); 16 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 17 | //job.setNumReduceTasks(0); 18 | job.setMapperClass(DT_ID3_Map.class); 19 | job.setReducerClass(DT_ID3_Reduce.class); 20 | job.setMapOutputKeyClass(Text.class); 21 | job.setMapOutputValueClass(Text.class); 22 | job.setOutputKeyClass(Text.class); 23 | job.setOutputValueClass(Text.class); 24 | boolean success = job.waitForCompletion(true); 25 | System.exit(success ? 0 : 1); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/DT_ID3_Map.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.ArrayList; 3 | import java.util.LinkedHashMap; 4 | import java.util.Map.Entry; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class DT_ID3_Map extends Mapper{ 10 | public static int count=0; 11 | public static ArrayList input=new ArrayList(); 12 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{ 13 | input.add(value.toString()); 14 | ++count; 15 | } 16 | 17 | @Override 18 | public void cleanup(Context context) throws IOException, InterruptedException{ 19 | LinkedHashMap g = new LinkedHashMap(); 20 | LinkedHashMap t=BuildTree.build(g,input, count); 21 | String key=""; 22 | int c=0; 23 | for(Entry T:t.entrySet()){ 24 | ++c; 25 | key=T.getKey()+","+T.getValue(); 26 | System.out.println("key: "+key+" c: "+c); 27 | context.write(new Text(String.valueOf(c)), new Text(key)); 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/DT_ID3_Reduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class DT_ID3_Reduce extends Reducer{ 8 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 9 | HashMap counts=new HashMap(); 10 | String maxKey=""; 11 | int maxValue=-1; 12 | for(Text value:values){ 13 | if(counts.containsKey(value.toString())) 14 | counts.put(value.toString(), counts.get(value.toString())+1); 15 | else 16 | counts.put(value.toString(), 1); 17 | } 18 | for(Entry e:counts.entrySet()){ 19 | if(e.getValue()>maxValue){ 20 | maxKey=e.getKey(); 21 | maxValue=e.getValue(); 22 | } 23 | } 24 | context.write(null, new Text(maxKey)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input file is just filled with delimited 2D 
points and this algorithm tries to cluster them. 2 | 3 | Initially, the output folder needs to be created and must be filled with a file which contains the initial cluster centers, which are all zero in this example. 4 | 5 | The old and new centers are tab separated. The centers on the left are old and the centers on the right are new. Initially both are zero. 6 | 7 | This algorithm takes in 4 arguments as follows: 8 | 9 | 1. Number of centroids 10 | 2. The dimension of the input points 11 | 3. The input data 12 | 4. The output data 13 | -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | 1,1|2,2|3,3|4,4 2 | 10,10|20,20|30,30|40,40 3 | 60,60|70,70|80,80|90,90 -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/output/1.txt: -------------------------------------------------------------------------------- 1 | 0.0,0.0 0.0,0.0 2 | 0.0,0.0 0.0,0.0 3 | 0.0,0.0 0.0,0.0 -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/src/KMeansCentroidCalculationDriver_ClassificationDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import java.util.ArrayList; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class KMeansCentroidCalculationDriver_ClassificationDriver{ 14 | public static boolean isdone=false; 15 | public static String num_centers; 16 | public static String dimension; 17 | public static void main(String[] args) throws Exception{ 18 | Configuration conf= new Configuration(); 19 | //args[0] is the number of centers to be used. 20 | num_centers=args[0]; 21 | //args[1] is the dimension of the input. 22 | dimension=args[1]; 23 | conf.setInt("noc", Integer.parseInt(num_centers)); 24 | conf.setInt("dimension", Integer.parseInt(dimension)); 25 | int iter=0; 26 | FileSystem hdfs=FileSystem.get(conf); 27 | ArrayList centers=new ArrayList(); 28 | //args[3] is the output path. Initially it will contain a single file 29 | //in which old and new centroids will be assigned to 0.0.
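// (Explanatory comment added to this listing, not part of the original source.)
// Each line of the file read below has the tab-separated form described in the Readme:
//     oldX,oldY <TAB> newX,newY
// i.e. the previous centroid on the left and the re-computed centroid on the right.
// The driver loads the current centroids, publishes them to the mappers through the job
// Configuration as c0, c1, ... (x and y values interleaved), runs one clustering pass,
// and then run() repeats the whole procedure, presumably until the old and new centroids
// stop changing (at which point isdone is set to true and the while loop exits).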
30 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(args[3])))); 31 | String line=null; 32 | while((line=br.readLine())!=null){ 33 | ++iter; 34 | String[] tok=line.split("\\\t"); 35 | String[] centroids_new= tok[1].split("\\,"); 36 | centers.add(Float.parseFloat(centroids_new[0])); 37 | centers.add(Float.parseFloat(centroids_new[1])); 38 | } 39 | br.close(); 40 | for(int i=1;i<=Integer.parseInt(num_centers);i++){ 41 | if(iter==i){ 42 | for(int j=0;j<((Integer.parseInt(num_centers)-iter)*2);j++){ 43 | centers.add((float) 0); 44 | } 45 | } 46 | } 47 | if(hdfs.exists(new Path(args[3]))){ 48 | hdfs.delete(new Path(args[3]),true); 49 | } 50 | hdfs.close(); 51 | for(int i=0;i<(Integer.parseInt(num_centers)*2);i++){ 52 | conf.setFloat("c".concat(String.valueOf(i)) , centers.get(i)); 53 | } 54 | Job job = new Job(conf,"K-Means Clustering MapReduce"); 55 | job.setJarByClass(KMeansCentroidCalculationDriver_ClassificationDriver.class); 56 | //args[2] is the input path. 57 | FileInputFormat.setInputPaths(job, new Path(args[2])); 58 | FileOutputFormat.setOutputPath(job, new Path(args[3])); 59 | job.setMapperClass(KMeansCentroidCalculationMap.class); 60 | job.setCombinerClass(KMeansCentroidCalculationReduce.class); 61 | job.setReducerClass(KMeansCentroidCalculationReduce.class); 62 | job.setMapOutputKeyClass(Text.class); 63 | job.setMapOutputValueClass(Text.class); 64 | job.setOutputKeyClass(Text.class); 65 | job.setOutputValueClass(Text.class); 66 | job.waitForCompletion(true); 67 | while(isdone==false){ 68 | run(args); 69 | } 70 | } 71 | public static void run(String[] args) throws IOException, ClassNotFoundException, InterruptedException{ 72 | Configuration conf_med=new Configuration(); 73 | conf_med.setInt("noc", Integer.parseInt(num_centers)); 74 | conf_med.setInt("dimension", Integer.parseInt(dimension)); 75 | int iter_med=0; 76 | ArrayList centers_old=new ArrayList(); 77 | ArrayList centers_new=new ArrayList(); 78 | FileSystem hdfs_med=FileSystem.get(conf_med); 79 | BufferedReader br_med = new BufferedReader(new InputStreamReader(hdfs_med.open(new Path(args[3])))); 80 | String line_med=null; 81 | while((line_med=br_med.readLine())!=null){ 82 | ++iter_med; 83 | String[] tok= line_med.split("\\\t"); 84 | String[] centroids_old= tok[0].split("\\,"); 85 | String[] centroids_new= tok[1].split("\\,"); 86 | centers_old.add(Float.parseFloat(centroids_old[0])); 87 | centers_old.add(Float.parseFloat(centroids_old[1])); 88 | centers_new.add(Float.parseFloat(centroids_new[0])); 89 | centers_new.add(Float.parseFloat(centroids_new[1])); 90 | } 91 | br_med.close(); 92 | for(int i=1;i<=Integer.parseInt(num_centers);i++){ 93 | if(iter_med==i){ 94 | for(int j=0;j<((Integer.parseInt(num_centers)-iter_med)*2);j++){ 95 | centers_old.add((float) 0); 96 | centers_new.add((float) 0); 97 | } 98 | } 99 | } 100 | if(hdfs_med.exists(new Path(args[3]))){ 101 | hdfs_med.delete(new Path(args[3]),true); 102 | } 103 | hdfs_med.close(); 104 | ArrayList ond = new ArrayList(); 105 | for(int i=0;i{ 10 | public static HashMap map=new HashMap(); 11 | public static Double minkey=(double) 0; 12 | public static int noc=0, dimension=0; 13 | public static ArrayList centers=new ArrayList(); 14 | public static Double minvalue=Double.POSITIVE_INFINITY; 15 | public static float euc_dist(Float[] a, Float[] b,int num){ 16 | float distance=0; 17 | float val=0; 18 | for(int i=0;i entry: map.entrySet()){ 46 | if(entry.getValue(){ 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, 
InterruptedException{ 8 | int count=0; 9 | float x_sum=0, y_sum=0; 10 | for(Text val:values){ 11 | StringTokenizer xy_points=new StringTokenizer(val.toString(),","); 12 | float x_point=Float.parseFloat(xy_points.nextToken()); 13 | float y_point=Float.parseFloat(xy_points.nextToken()); 14 | x_sum+=x_point; 15 | y_sum+=y_point; 16 | count++; 17 | } 18 | context.write(key,new Text((x_sum/count)+","+(y_sum/count))); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/src/KMeansClassificationReduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.io.Text; 3 | import org.apache.hadoop.mapreduce.Reducer; 4 | 5 | public class KMeansClassificationReduce extends Reducer{ 6 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 7 | for(Text val:values){ 8 | context.write(key,val); 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /KNN_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The Iris dataset has been used as the input. 2 | 3 | In this example, the 3 nearest neighbours are used in the map phase. In the reduce phase, the dominant class is selected from all the classes sent by the mappers. 4 | 5 | If the data size is very large, you can disable the reduce phase by setting the number of reduce tasks to zero, run a new MapReduce job on the data that would have been the input to the old job's reducer, and include the logic of the old job's reducer in the mapper and reducer of the new job to find the dominant class of the input. 6 | 7 | This algorithm takes in 4 arguments as follows: 8 | 9 | 1. The input file waiting to be classified (features separated by whitespaces in a single file) 10 | 2. The name of the input entity to be classified 11 | 3. The input (training) dataset 12 | 4.
The output 13 | -------------------------------------------------------------------------------- /KNN_MapReduce/input/input_to_be_classified.txt: -------------------------------------------------------------------------------- 1 | 5.1 3.5 1.4 0.2 -------------------------------------------------------------------------------- /KNN_MapReduce/input/iris_training_data.txt: -------------------------------------------------------------------------------- 1 | 5.1 3.5 1.4 0.2 "setosa" 2 | 4.9 3 1.4 0.2 "setosa" 3 | 4.7 3.2 1.3 0.2 "setosa" 4 | 4.6 3.1 1.5 0.2 "setosa" 5 | 5 3.6 1.4 0.2 "setosa" 6 | 5.4 3.9 1.7 0.4 "setosa" 7 | 4.6 3.4 1.4 0.3 "setosa" 8 | 5 3.4 1.5 0.2 "setosa" 9 | 4.4 2.9 1.4 0.2 "setosa" 10 | 4.9 3.1 1.5 0.1 "setosa" 11 | 5.4 3.7 1.5 0.2 "setosa" 12 | 4.8 3.4 1.6 0.2 "setosa" 13 | 4.8 3 1.4 0.1 "setosa" 14 | 4.3 3 1.1 0.1 "setosa" 15 | 5.8 4 1.2 0.2 "setosa" 16 | 5.7 4.4 1.5 0.4 "setosa" 17 | 5.4 3.9 1.3 0.4 "setosa" 18 | 5.1 3.5 1.4 0.3 "setosa" 19 | 5.7 3.8 1.7 0.3 "setosa" 20 | 5.1 3.8 1.5 0.3 "setosa" 21 | 5.4 3.4 1.7 0.2 "setosa" 22 | 5.1 3.7 1.5 0.4 "setosa" 23 | 4.6 3.6 1 0.2 "setosa" 24 | 5.1 3.3 1.7 0.5 "setosa" 25 | 4.8 3.4 1.9 0.2 "setosa" 26 | 5 3 1.6 0.2 "setosa" 27 | 5 3.4 1.6 0.4 "setosa" 28 | 5.2 3.5 1.5 0.2 "setosa" 29 | 5.2 3.4 1.4 0.2 "setosa" 30 | 4.7 3.2 1.6 0.2 "setosa" 31 | 4.8 3.1 1.6 0.2 "setosa" 32 | 5.4 3.4 1.5 0.4 "setosa" 33 | 5.2 4.1 1.5 0.1 "setosa" 34 | 5.5 4.2 1.4 0.2 "setosa" 35 | 4.9 3.1 1.5 0.2 "setosa" 36 | 5 3.2 1.2 0.2 "setosa" 37 | 5.5 3.5 1.3 0.2 "setosa" 38 | 4.9 3.6 1.4 0.1 "setosa" 39 | 4.4 3 1.3 0.2 "setosa" 40 | 5.1 3.4 1.5 0.2 "setosa" 41 | 5 3.5 1.3 0.3 "setosa" 42 | 4.5 2.3 1.3 0.3 "setosa" 43 | 4.4 3.2 1.3 0.2 "setosa" 44 | 5 3.5 1.6 0.6 "setosa" 45 | 5.1 3.8 1.9 0.4 "setosa" 46 | 4.8 3 1.4 0.3 "setosa" 47 | 5.1 3.8 1.6 0.2 "setosa" 48 | 4.6 3.2 1.4 0.2 "setosa" 49 | 5.3 3.7 1.5 0.2 "setosa" 50 | 5 3.3 1.4 0.2 "setosa" 51 | 7 3.2 4.7 1.4 "versicolor" 52 | 6.4 3.2 4.5 1.5 "versicolor" 53 | 6.9 3.1 4.9 1.5 "versicolor" 54 | 5.5 2.3 4 1.3 "versicolor" 55 | 6.5 2.8 4.6 1.5 "versicolor" 56 | 5.7 2.8 4.5 1.3 "versicolor" 57 | 6.3 3.3 4.7 1.6 "versicolor" 58 | 4.9 2.4 3.3 1 "versicolor" 59 | 6.6 2.9 4.6 1.3 "versicolor" 60 | 5.2 2.7 3.9 1.4 "versicolor" 61 | 5 2 3.5 1 "versicolor" 62 | 5.9 3 4.2 1.5 "versicolor" 63 | 6 2.2 4 1 "versicolor" 64 | 6.1 2.9 4.7 1.4 "versicolor" 65 | 5.6 2.9 3.6 1.3 "versicolor" 66 | 6.7 3.1 4.4 1.4 "versicolor" 67 | 5.6 3 4.5 1.5 "versicolor" 68 | 5.8 2.7 4.1 1 "versicolor" 69 | 6.2 2.2 4.5 1.5 "versicolor" 70 | 5.6 2.5 3.9 1.1 "versicolor" 71 | 5.9 3.2 4.8 1.8 "versicolor" 72 | 6.1 2.8 4 1.3 "versicolor" 73 | 6.3 2.5 4.9 1.5 "versicolor" 74 | 6.1 2.8 4.7 1.2 "versicolor" 75 | 6.4 2.9 4.3 1.3 "versicolor" 76 | 6.6 3 4.4 1.4 "versicolor" 77 | 6.8 2.8 4.8 1.4 "versicolor" 78 | 6.7 3 5 1.7 "versicolor" 79 | 6 2.9 4.5 1.5 "versicolor" 80 | 5.7 2.6 3.5 1 "versicolor" 81 | 5.5 2.4 3.8 1.1 "versicolor" 82 | 5.5 2.4 3.7 1 "versicolor" 83 | 5.8 2.7 3.9 1.2 "versicolor" 84 | 6 2.7 5.1 1.6 "versicolor" 85 | 5.4 3 4.5 1.5 "versicolor" 86 | 6 3.4 4.5 1.6 "versicolor" 87 | 6.7 3.1 4.7 1.5 "versicolor" 88 | 6.3 2.3 4.4 1.3 "versicolor" 89 | 5.6 3 4.1 1.3 "versicolor" 90 | 5.5 2.5 4 1.3 "versicolor" 91 | 5.5 2.6 4.4 1.2 "versicolor" 92 | 6.1 3 4.6 1.4 "versicolor" 93 | 5.8 2.6 4 1.2 "versicolor" 94 | 5 2.3 3.3 1 "versicolor" 95 | 5.6 2.7 4.2 1.3 "versicolor" 96 | 5.7 3 4.2 1.2 "versicolor" 97 | 5.7 2.9 4.2 1.3 "versicolor" 98 | 6.2 2.9 4.3 1.3 "versicolor" 99 | 5.1 2.5 3 1.1 "versicolor" 100 | 5.7 2.8 4.1 1.3 "versicolor" 
101 | 6.3 3.3 6 2.5 "virginica" 102 | 5.8 2.7 5.1 1.9 "virginica" 103 | 7.1 3 5.9 2.1 "virginica" 104 | 6.3 2.9 5.6 1.8 "virginica" 105 | 6.5 3 5.8 2.2 "virginica" 106 | 7.6 3 6.6 2.1 "virginica" 107 | 4.9 2.5 4.5 1.7 "virginica" 108 | 7.3 2.9 6.3 1.8 "virginica" 109 | 6.7 2.5 5.8 1.8 "virginica" 110 | 7.2 3.6 6.1 2.5 "virginica" 111 | 6.5 3.2 5.1 2 "virginica" 112 | 6.4 2.7 5.3 1.9 "virginica" 113 | 6.8 3 5.5 2.1 "virginica" 114 | 5.7 2.5 5 2 "virginica" 115 | 5.8 2.8 5.1 2.4 "virginica" 116 | 6.4 3.2 5.3 2.3 "virginica" 117 | 6.5 3 5.5 1.8 "virginica" 118 | 7.7 3.8 6.7 2.2 "virginica" 119 | 7.7 2.6 6.9 2.3 "virginica" 120 | 6 2.2 5 1.5 "virginica" 121 | 6.9 3.2 5.7 2.3 "virginica" 122 | 5.6 2.8 4.9 2 "virginica" 123 | 7.7 2.8 6.7 2 "virginica" 124 | 6.3 2.7 4.9 1.8 "virginica" 125 | 6.7 3.3 5.7 2.1 "virginica" 126 | 7.2 3.2 6 1.8 "virginica" 127 | 6.2 2.8 4.8 1.8 "virginica" 128 | 6.1 3 4.9 1.8 "virginica" 129 | 6.4 2.8 5.6 2.1 "virginica" 130 | 7.2 3 5.8 1.6 "virginica" 131 | 7.4 2.8 6.1 1.9 "virginica" 132 | 7.9 3.8 6.4 2 "virginica" 133 | 6.4 2.8 5.6 2.2 "virginica" 134 | 6.3 2.8 5.1 1.5 "virginica" 135 | 6.1 2.6 5.6 1.4 "virginica" 136 | 7.7 3 6.1 2.3 "virginica" 137 | 6.3 3.4 5.6 2.4 "virginica" 138 | 6.4 3.1 5.5 1.8 "virginica" 139 | 6 3 4.8 1.8 "virginica" 140 | 6.9 3.1 5.4 2.1 "virginica" 141 | 6.7 3.1 5.6 2.4 "virginica" 142 | 6.9 3.1 5.1 2.3 "virginica" 143 | 5.8 2.7 5.1 1.9 "virginica" 144 | 6.8 3.2 5.9 2.3 "virginica" 145 | 6.7 3.3 5.7 2.5 "virginica" 146 | 6.7 3 5.2 2.3 "virginica" 147 | 6.3 2.5 5 1.9 "virginica" 148 | 6.5 3 5.2 2 "virginica" 149 | 6.2 3.4 5.4 2.3 "virginica" 150 | 5.9 3 5.1 1.8 "virginica" 151 | -------------------------------------------------------------------------------- /KNN_MapReduce/src/Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | public class Driver { 13 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 14 | int num_features=0; 15 | Configuration conf=new Configuration(); 16 | FileSystem hdfs=FileSystem.get(conf); 17 | //args[0] is the path to the file which has features of the input waiting to be classified. 
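// (Explanatory comment added to this listing, not part of the original source.)
// The driver below reads the query point's features from args[0] and counts them; the
// features and the display name (args[1]) are presumably handed to the tasks via the job
// Configuration (the reducer reads "name" from it in its setup()). Each mapper computes
// the Euclidean distance from every training record in its split to the query point and
// emits the classes of its 3 nearest neighbours; the single reducer then takes a majority
// vote over all emitted classes to decide the species.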
18 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(args[0])))); 19 | String line=null; 20 | while((line=br.readLine())!=null){ 21 | String[] feat=line.toString().split("\\ "); 22 | for(int i=0;i { 11 | public static long byteoffset=0; 12 | public static Float[] feat=null; 13 | public static String species=null; 14 | public static ArrayList dists=new ArrayList(); 15 | public static float min_dist=0; 16 | public static int num_features=0; 17 | public static float euc_dist(Float[] feat, Float[] test,int num){ 18 | float distance=0; 19 | float val=0; 20 | for(int i=0;i { 8 | String flower_name=null; 9 | @Override 10 | public void setup(Context context){ 11 | flower_name=String.valueOf(context.getConfiguration().get("name")); 12 | } 13 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 14 | HashMap map=new HashMap(); 15 | String maxkey=null;int maxvalue=-1; 16 | for(Text value:values){ 17 | if(!map.containsKey(value.toString())){ 18 | map.put(value.toString(), 1); 19 | } 20 | else{ 21 | map.put(value.toString(), map.get(value.toString())+1); 22 | } 23 | } 24 | for(Entry entry: map.entrySet()){ 25 | if(entry.getValue()>maxvalue){ 26 | maxkey=entry.getKey(); 27 | maxvalue=entry.getValue(); 28 | } 29 | } 30 | context.write(null, new Text(flower_name+" belongs to the species of "+maxkey)); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /LUDecomposition/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /LUDecomposition/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | LUDecomposition 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.apache.hdt.mrnature 16 | org.eclipse.jdt.core.javanature 17 | 18 | 19 | -------------------------------------------------------------------------------- /LUDecomposition/README.md: -------------------------------------------------------------------------------- 1 | # LU Decomposition 2 | 3 | This MapReduce algorithm splits a massively large matrix into its `L` and `U` components. It uses the Naive Gaussian Elimination technique to do so. 4 | 5 | # Program Execution Arguments 6 | 7 | This program expects only two arguments: 8 | 9 | 1. An input path 10 | 2. An output path 11 | 12 | # Input and Output data shape 13 | 14 | Both the input and output matrix shapes are the **SAME**. This program expects and produces the textual form of matrices in the following manner: 15 | 16 | `row_number + "\t" + elem-1 + "," + elem-2 + "," + elem-3 ...` 17 | 18 | The text files should be a tab-separated list of `row_number`s and comma-separated row elements. 19 | 20 | # Final Output Location 21 | 22 | This program produces **only one** intermediate output, but the actual outputs (the `L` and `U` matrices) are present in the paths `<output path> + "LU_Components/L"` and `<output path> + "LU_Components/U"`. 23 | 24 | Their shapes will correspond to the shape defined above. 25 | 26 | **NOTE**: I have provided the input and all the output (intermediate and actual) folders; you can use them to verify your outputs.
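As a quick sanity check, added here for illustration only and using the repository's own `test_input_4x4.txt` and the `LU_Components` outputs that appear further down in this listing: `L` holds the Gaussian-elimination multipliers below a unit diagonal, `U` is the eliminated (upper-triangular) matrix, and multiplying them row by row reproduces the input exactly.

```
A (input)          L                  U
1  5  0  0         1  0  0  0         1  5  0  0
2 12  5  0         2  1  0  0         0  2  5  0
0  4 13  5         0  2  1  0         0  0  3  5
0  0  6 11         0  0  2  1         0  0  0  1

row 0 of L·U = 1·(1,5,0,0)                = (1, 5, 0, 0)   ✓
row 1 of L·U = 2·(1,5,0,0) + 1·(0,2,5,0)  = (2, 12, 5, 0)  ✓
row 2 of L·U = 2·(0,2,5,0) + 1·(0,0,3,5)  = (0, 4, 13, 5)  ✓
row 3 of L·U = 2·(0,0,3,5) + 1·(0,0,0,1)  = (0, 0, 6, 11)  ✓
```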
27 | -------------------------------------------------------------------------------- /LUDecomposition/input/test_input_4x4.txt: -------------------------------------------------------------------------------- 1 | 0 1,5,0,0 2 | 1 2,12,5,0 3 | 2 0,4,13,5 4 | 3 0,0,6,11 5 | -------------------------------------------------------------------------------- /LUDecomposition/output/.nth.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/.nth.crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/LU_Components/L/.part-r-00000.crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/LU_Components/L/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,0.0,0.0,0.0 2 | 1 2.0,1.0,0.0,0.0 3 | 2 0.0,2.0,1.0,0.0 4 | 3 0.0,0.0,2.0,1.0 5 | -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crc5)} -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/LU_Components/U/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1,5,0,0 2 | 1 0.0,2.0,5.0,0.0 3 | 2 0.0,0.0,3.0,5.0 4 | 3 0.0,0.0,0.0,1.0 5 | -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crcѪ ( -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/after-2-runs/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/part-r-00000: -------------------------------------------------------------------------------- 1 | 1,0 2.56 2 | 2,0 5.76 3 | 0 25,5,1 4 | 1 0.0,-4.800000000000001,-1.56 5 | 2,1 3.499999999999999 6 | 2 0.0,0.0,0.6999999999999993 7 | -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crcDM=] -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/after-3-runs/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/part-r-00000: -------------------------------------------------------------------------------- 1 | 1,0 2.0 2 | 2,0 0.0 3 | 3,0 0.0 4 | 2,1 2.0 5 | 3,1 0.0 6 | 0 1,5,0,0 7 | 1 0.0,2.0,5.0,0.0 8 | 2 0.0,0.0,3.0,5.0 9 | 3,2 2.0 10 | 3 0.0,0.0,0.0,1.0 11 | -------------------------------------------------------------------------------- /LUDecomposition/output/nth: -------------------------------------------------------------------------------- 1 | 0.0,0.0,0.0,1.0 -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/total_records/.part-r-00000.crc -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/total_records/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 4 2 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/Utils.java: -------------------------------------------------------------------------------- 1 | package lud; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.DataOutputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import 
java.io.OutputStreamWriter; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.FileSystem; 12 | import org.apache.hadoop.fs.Path; 13 | 14 | public class Utils { 15 | 16 | public static String arrayToCSV(Double[] nVal2) { 17 | String result = ""; 18 | 19 | if (nVal2.length > 0) { 20 | StringBuilder sb = new StringBuilder(); 21 | 22 | for (Double s : nVal2) { 23 | sb.append(s).append(","); 24 | } 25 | 26 | result = sb.deleteCharAt(sb.length() - 1).toString(); 27 | } 28 | return result; 29 | } 30 | 31 | public static Double[] stringToDoubleArray(String[] a) { 32 | Double[] x = new Double[a.length]; 33 | for(int i = 0; i < a.length ; i++) 34 | x[i] = Double.valueOf(a[i]); 35 | return x; 36 | } 37 | 38 | public static void storeToHDFS(String data, String output, Configuration conf) throws IOException { 39 | 40 | FileSystem hdfs=FileSystem.get(conf); 41 | Path find_nth_row_output_path = new Path(conf.get("find_nth_row_output")); 42 | try { 43 | if (hdfs.exists(find_nth_row_output_path)) { 44 | hdfs.delete(find_nth_row_output_path, true); 45 | } 46 | DataOutputStream outStream = hdfs.create(find_nth_row_output_path); 47 | BufferedWriter bw = new BufferedWriter( new OutputStreamWriter(outStream, "UTF-8" ) ); 48 | bw.write(data); 49 | bw.close(); 50 | hdfs.close(); 51 | outStream.close(); 52 | } 53 | catch (Exception e) { 54 | System.out.println(e.getMessage()); 55 | } 56 | } 57 | 58 | public static String readFromHDFS(String path, Configuration conf) throws IOException { 59 | 60 | FileSystem hdfs=FileSystem.get(conf); 61 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path)))); 62 | String records = br.readLine().trim(); 63 | br.close(); 64 | hdfs.close(); 65 | 66 | return records; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/LongAndTextWritable.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.Writable; 10 | 11 | public class LongAndTextWritable implements Writable { 12 | 13 | private LongWritable rowKey; 14 | private Text rowValue; 15 | 16 | public LongAndTextWritable() { 17 | this.rowKey = new LongWritable(0); 18 | this.rowValue = new Text(""); 19 | } 20 | 21 | public LongAndTextWritable(LongWritable k, Text v) { 22 | this.rowKey = k; 23 | this.rowValue = v; 24 | } 25 | 26 | public LongWritable getKey() { 27 | return rowKey; 28 | } 29 | 30 | public Text getValue() { 31 | return rowValue; 32 | } 33 | 34 | @Override 35 | public void readFields(DataInput in) throws IOException { 36 | 37 | rowKey.readFields(in); 38 | rowValue.readFields(in); 39 | 40 | } 41 | 42 | @Override 43 | public void write(DataOutput out) throws IOException { 44 | 45 | rowKey.write(out); 46 | rowValue.write(out); 47 | } 48 | 49 | @Override 50 | public String toString() { 51 | return rowKey.toString() + "\t" + rowValue.toString(); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/NaturalKeyGroupingComparator.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | 
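// (Explanatory comment added to this listing, not part of the original source.)
// This class is one piece of Hadoop's usual "secondary sort" setup used by this package:
//   - TextPair is the composite key holding two text parts;
//   - TextPairComparator orders keys by both parts, so a reducer sees its values sorted;
//   - TextPairPartitioner routes keys with the same first part to the same reducer;
//   - NaturalKeyGroupingComparator (below) compares only the first part, so all records
//     that share it are grouped into a single reduce() call.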
public class NaturalKeyGroupingComparator extends WritableComparator { 7 | protected NaturalKeyGroupingComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | return tp1.getFirst().compareTo(tp2.getFirst()); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/TextPair.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | public class TextPair implements WritableComparable { 10 | 11 | private String t1; 12 | private String t2; 13 | 14 | public String getFirst() { 15 | return this.t1; 16 | } 17 | 18 | public String getSecond() { 19 | return this.t2; 20 | } 21 | 22 | @Override 23 | public void readFields(DataInput in) throws IOException { 24 | this.t1 = in.readUTF(); 25 | this.t2 = in.readUTF(); 26 | } 27 | 28 | @Override 29 | public void write(DataOutput out) throws IOException { 30 | out.writeUTF(this.t1); 31 | out.writeUTF(this.t2); 32 | } 33 | 34 | public TextPair() { 35 | this.t1 = new String(); 36 | this.t2 = new String(); 37 | } 38 | 39 | public TextPair(String t1, String t2) { 40 | this.t1 = new String(t1); 41 | this.t2 = new String(t2); 42 | } 43 | 44 | public int compareTo(TextPair tp) { 45 | int sortKey = this.t1.compareTo(tp.getFirst()); 46 | if (sortKey == 0) { 47 | sortKey = this.t2.compareTo(tp.getSecond()); 48 | } 49 | return sortKey; 50 | } 51 | 52 | public String toString () { 53 | String s = ""; 54 | if (this.t2.compareTo("") == 0) { 55 | s += this.t1; 56 | } 57 | else { 58 | s += this.t1 + "," + this.t2; 59 | } 60 | return s; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/TextPairComparator.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class TextPairComparator extends WritableComparator { 7 | protected TextPairComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | int result = tp1.getFirst().compareTo(tp2.getFirst()); 17 | if(0 == result) { 18 | result = tp1.getSecond().compareTo(tp2.getSecond()); 19 | } 20 | return result; 21 | } 22 | } -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/TextPairPartitioner.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | public class TextPairPartitioner extends Partitioner{ 7 | @Override 8 | public int getPartition(TextPair tp, Text t, int numPartitions) { 9 | return tp.getFirst().hashCode() % numPartitions; 10 | } 11 | } -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/initial_input_mapper.java: 
-------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import lud.Utils; 12 | import lud.io.TextPair; 13 | 14 | public class initial_input_mapper extends Mapper { 15 | 16 | private int n; 17 | private String nVal = null; 18 | 19 | private long counter = 0; 20 | private long[] input_range = new long[2]; 21 | 22 | private List toBeSent = new ArrayList(); 23 | 24 | public static Double[] readNthRow (Configuration conf) throws IOException { 25 | try { 26 | 27 | String path = conf.get("find_nth_row_output"); 28 | String[] nValArr = Utils.readFromHDFS(path, conf).split(","); 29 | return Utils.stringToDoubleArray(nValArr); 30 | 31 | } 32 | catch (Exception e) { 33 | System.out.println("Can't read nth value! " + e.getMessage()); 34 | return null; 35 | } 36 | } 37 | 38 | @Override 39 | public void setup (Context context) throws IOException, InterruptedException { 40 | Configuration conf = context.getConfiguration(); 41 | this.n = (int) conf.getLong("n", 0); 42 | // The below code should have worked, but it does not 43 | //if (n>0) 44 | // this.nVal = Utils.arrayToCSV(readNthRow(context.getConfiguration())); 45 | } 46 | 47 | public void map(Text key, Text value, Context context) throws IOException, InterruptedException { 48 | 49 | if (counter == 0 && !key.toString().contains(",")) 50 | this.input_range[0] = Long.valueOf(key.toString()); 51 | 52 | if (!key.toString().contains(",")) { 53 | int row = Integer.parseInt(key.toString()); 54 | 55 | if (row == this.n) 56 | this.nVal = value.toString(); 57 | 58 | if (n > 0) 59 | //context.write(key, new Text(value.toString()+";"+this.nVal)); 60 | toBeSent.add(new TextPair(key.toString(), value.toString())); 61 | else 62 | context.write(key, value); 63 | } 64 | else 65 | context.write(key, value); 66 | 67 | counter++; 68 | } 69 | 70 | @Override 71 | public void cleanup (Context context) throws IOException, InterruptedException { 72 | 73 | // The code block below will run if the nth row is not in the split. 74 | // It will read it from HDFS, which was stored there from the previous mapper/reducer 75 | if (this.nVal == null && n>0) 76 | this.nVal = Utils.arrayToCSV(readNthRow(context.getConfiguration())); 77 | 78 | if (n == 0 && n>=input_range[0] && n<=input_range[1]) { 79 | input_range[1] = input_range[0] + counter - 1; 80 | // Sending Nth Row to all reducers 81 | for (long i = 0 ; i <= input_range[1] ; i++) 82 | context.write(new Text(String.valueOf(i)), new Text("Nth Row->"+this.nVal)); 83 | } 84 | 85 | // Have to do this because there is a mapper after this map phase, and not a reducer. 86 | // Otherwise I would have used the same logic like in the if block above. 
87 | else 88 | for (TextPair tp:toBeSent) 89 | context.write(new Text(tp.getFirst()), new Text(tp.getSecond()+";"+this.nVal)); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/lud_driver.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.chain.ChainMapper; 10 | import org.apache.hadoop.mapreduce.lib.chain.ChainReducer; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | import lud.naiveGaussian.mergeResults.merge_results_driver; 16 | import lud.naiveGaussian.totalRecords.total_records_driver; 17 | 18 | public class lud_driver { 19 | 20 | public static void prepareJobWithConf (Job jobPrep, Configuration confPrep) throws IOException, InterruptedException, ClassNotFoundException { 21 | 22 | long n = confPrep.getLong("n", 0); 23 | 24 | // Chaining MR Jobs 25 | if (n == 0) { 26 | ChainMapper.addMapper(jobPrep, initial_input_mapper.class, 27 | Text.class, Text.class, 28 | Text.class, Text.class, 29 | confPrep); 30 | ChainReducer.setReducer(jobPrep, lud_reducer.class, Text.class, Text.class, Text.class, Text.class, confPrep); 31 | } 32 | else { 33 | ChainReducer.addMapper(jobPrep, initial_input_mapper.class, 34 | Text.class, Text.class, 35 | Text.class, Text.class, 36 | confPrep); 37 | ChainReducer.addMapper(jobPrep, lud_mapper.class, 38 | Text.class, Text.class, 39 | Text.class, Text.class, 40 | confPrep); 41 | } 42 | } 43 | 44 | @SuppressWarnings("deprecation") 45 | public static void main (String[] args) throws IOException, ClassNotFoundException, InterruptedException { 46 | 47 | String input = args[0]; 48 | String output = args[1]; 49 | String find_nth_row_output = output + "/nth"; 50 | 51 | // MR Job: Finding Total Records 52 | 53 | String[] total_records_args = {input, output + "/total_records"}; 54 | long total_records = total_records_driver.run(total_records_args); 55 | 56 | Configuration conf = new Configuration(); 57 | conf.set("find_nth_row_output", find_nth_row_output); 58 | conf.set("mapreduce.job.reduce.slowstart.completedmaps", "1.00"); 59 | conf.setLong("total_records", total_records); 60 | Job job = new Job(conf); 61 | 62 | for(int n = 0 ; n < total_records - 1 ; n++) { 63 | 64 | Configuration confLoop = conf; 65 | confLoop.set("mapreduce.job.reduce.slowstart.completedmaps", "1.00"); 66 | confLoop.unset("n"); 67 | confLoop.setLong("n", n); 68 | 69 | prepareJobWithConf(job, confLoop); 70 | } 71 | 72 | String lud_output_path = output+"/after-"+(total_records-1)+"-runs"; 73 | job.setJarByClass(lud_driver.class); 74 | job.setJobName("Split a matrix into it's LU decomposed components using the Naive Gaussian Elimination method"); 75 | FileInputFormat.addInputPath(job, new Path(input)); 76 | FileOutputFormat.setOutputPath(job, new Path(lud_output_path)); 77 | job.setInputFormatClass(KeyValueTextInputFormat.class); 78 | job.setOutputKeyClass(Text.class); 79 | job.setOutputValueClass(Text.class); 80 | job.waitForCompletion(true); 81 | 82 | // MR Job(s): Merging Outputs 83 | 84 | Path merge_results_input_path = new 
Path(lud_output_path); 85 | conf.setBoolean("upper", false); 86 | job = new Job(conf); 87 | FileInputFormat.addInputPath(job, merge_results_input_path); 88 | String l_output_path = output+"/LU_Components/L"; 89 | merge_results_driver.runWithJob(job, l_output_path); 90 | 91 | conf.setBoolean("upper", true); 92 | job = new Job(conf); 93 | String u_output_path = output+"/LU_Components/U"; 94 | FileInputFormat.addInputPath(job, merge_results_input_path); 95 | merge_results_driver.runWithJob(job, u_output_path); 96 | 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/lud_mapper.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import lud.Utils; 9 | 10 | public class lud_mapper extends Mapper { 11 | 12 | private static long total_records; 13 | private long n; 14 | private Double[] nVal = null; 15 | 16 | @Override 17 | public void setup (Context context) throws IOException, InterruptedException { 18 | lud_mapper.total_records = context.getConfiguration().getLong("total_records", 0); 19 | this.n = context.getConfiguration().getLong("n", 0); 20 | this.nVal = initial_input_mapper.readNthRow(context.getConfiguration()); 21 | } 22 | 23 | public void map(Text key, Text value, Context context) throws IOException, InterruptedException { 24 | 25 | String[] parts = new String[2]; 26 | parts[0] = key.toString(); 27 | String[] rowAndNVal = value.toString().split(";"); 28 | parts[1] = parts[0].contains(",")?value.toString():rowAndNVal[0]; 29 | 30 | if (this.nVal == null && !parts[0].contains(",")) 31 | this.nVal = Utils.stringToDoubleArray(rowAndNVal[1].split(",")); 32 | 33 | if(!parts[0].contains(",")) { 34 | 35 | long row = Long.valueOf(parts[0]); 36 | 37 | if (row > this.n) { 38 | 39 | Double[] rowElements = Utils.stringToDoubleArray(parts[1].split(",")); 40 | Double multiplier = (double) (rowElements[(int) this.n]/this.nVal[(int) this.n]); 41 | // Sending lower triangular matrix elements 42 | context.write(new Text(row+","+this.n), new Text(String.valueOf(multiplier))); 43 | Double[] rowElementsModified = new Double[(int) lud_mapper.total_records]; 44 | 45 | for (int i = 0; i< lud_mapper.total_records; i++) { 46 | rowElementsModified[i] = (Double) (rowElements[i] - this.nVal[i]*multiplier); 47 | } 48 | 49 | // Doing this so that N+1th row is stored before any KV pair is generated 50 | if (row==(this.n+1)) 51 | Utils.storeToHDFS(Utils.arrayToCSV(rowElementsModified), context.getConfiguration().get("find_nth_row_output"), context.getConfiguration()); 52 | 53 | context.write(new Text(String.valueOf(row)), new Text(Utils.arrayToCSV(rowElementsModified))); 54 | } 55 | else 56 | context.write(new Text(parts[0]), new Text(parts[1].split(";")[0])); 57 | } 58 | else 59 | context.write(new Text(parts[0]), new Text(parts[1].split(";")[0])); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/lud_reducer.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import lud.Utils; 9 | 10 | public class lud_reducer extends Reducer { 11 | 12 | private long n; 13 
| private Double[] nVal = null; 14 | 15 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 16 | 17 | // Fetching Nth Row from values 18 | for (Text value:values) { 19 | 20 | String[] parts = new String[2]; 21 | parts[0] = key.toString(); 22 | parts[1] = value.toString(); 23 | 24 | if (parts[1].contains("Nth Row->")) { 25 | this.nVal = Utils.stringToDoubleArray(parts[1].split("->")[1].split(",")); 26 | break; 27 | } 28 | } 29 | 30 | 31 | // Processing rest of the rows 32 | for (Text value:values) { 33 | 34 | String[] parts = new String[2]; 35 | parts[0] = key.toString(); 36 | parts[1] = value.toString(); 37 | 38 | if (parts[1].contains("Nth Row->")) 39 | continue; 40 | 41 | else { 42 | if(!parts[0].contains(",")) { 43 | 44 | long row = Long.valueOf(parts[0]); 45 | 46 | if (row > this.n) { 47 | Double[] rowElements = Utils.stringToDoubleArray(parts[1].split(",")); 48 | Double multiplier = (double) (rowElements[(int) this.n]/this.nVal[(int) this.n]); 49 | 50 | context.write(new Text(row+","+this.n), new Text(String.valueOf(multiplier))); 51 | 52 | Double[] rowElementsModified = new Double[(int) rowElements.length]; 53 | for (int i = 0; i< rowElementsModified.length; i++) { 54 | rowElementsModified[i] = (Double) (rowElements[i] - this.nVal[i]*multiplier); 55 | } 56 | 57 | // Doing this so that N+1th row is stored before any KV pair is generated 58 | if (row==(this.n+1)) 59 | Utils.storeToHDFS(Utils.arrayToCSV(rowElementsModified), context.getConfiguration().get("find_nth_row_output"), context.getConfiguration()); 60 | 61 | context.write(new Text(String.valueOf(row)), new Text(Utils.arrayToCSV(rowElementsModified))); 62 | } 63 | else 64 | context.write(new Text(parts[0]), new Text(parts[1])); 65 | } 66 | else 67 | context.write(new Text(parts[0]), new Text(parts[1])); 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/mergeResults/merge_results_driver.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.mergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | import lud.io.NaturalKeyGroupingComparator; 11 | import lud.io.TextPair; 12 | import lud.io.TextPairComparator; 13 | import lud.io.TextPairPartitioner; 14 | 15 | public class merge_results_driver { 16 | 17 | public static boolean runWithJob(Job job, String out_path) throws IOException, InterruptedException, ClassNotFoundException { 18 | job.setJarByClass(merge_results_driver.class); 19 | 20 | job.setJobName("Final Step: Merging results and creating separate LU decomposed components of input matrix"); 21 | 22 | FileOutputFormat.setOutputPath(job, new Path(out_path)); 23 | 24 | job.setMapperClass(lud.naiveGaussian.mergeResults.merge_results_mapper.class); 25 | job.setReducerClass(lud.naiveGaussian.mergeResults.merge_results_reducer.class); 26 | job.setMapOutputKeyClass(TextPair.class); 27 | job.setMapOutputValueClass(Text.class); 28 | job.setOutputKeyClass(TextPair.class); 29 | job.setOutputValueClass(Text.class); 30 | job.setPartitionerClass(TextPairPartitioner.class); 31 | job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); 32 | job.setSortComparatorClass(TextPairComparator.class); 33 | 34 | boolean success = 
job.waitForCompletion(true); 35 | return success; 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/mergeResults/merge_results_mapper.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.mergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | import lud.io.TextPair; 10 | 11 | public class merge_results_mapper extends Mapper { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] parts = value.toString().split("\\t"); 24 | // Processing Upper Triangular Matrix's rows 25 | if (this.upper && !parts[0].contains(",")) { 26 | context.write(new TextPair(parts[0],""), new Text(parts[1])); 27 | } 28 | // Processing Lower Triangular Matrix's rows 29 | if (!this.upper && parts[0].contains(",")) { 30 | 31 | String[] rowCol = parts[0].split(","); 32 | String row = rowCol[0]; 33 | // Sending first row of Lower Triangular Matrix to the reducer 34 | if (Integer.valueOf(row)-1 == 0) { 35 | for (int i = 0; i < this.total_records; i++) { 36 | context.write(new TextPair("0",String.valueOf(i)), new Text(i+","+((i == 0) ? 1 : 0))); 37 | } 38 | } 39 | String column = rowCol[1]; 40 | String element = parts[1]; 41 | context.write(new TextPair(row, column), new Text(column+","+element)); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/mergeResults/merge_results_reducer.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.mergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import lud.Utils; 9 | import lud.io.TextPair; 10 | 11 | public class merge_results_reducer extends Reducer { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public static String arrayToCSV(String[] a) { 23 | String result = ""; 24 | if (a.length > 0) { 25 | StringBuilder sb = new StringBuilder(); 26 | for (String s : a) { 27 | sb.append(s).append(","); 28 | } 29 | result = sb.deleteCharAt(sb.length() - 1).toString(); 30 | } 31 | return result; 32 | } 33 | 34 | public void reduce(TextPair key, Iterable values, Context context) 35 | throws IOException, InterruptedException { 36 | if (this.upper) { 37 | for (Text val:values) { 38 | context.write(new TextPair(key.getFirst(),""), val); 39 | } 40 | } 41 | else { 42 | Double[] rowElements = new Double[this.total_records]; 43 | int row = Integer.valueOf(key.getFirst()); 44 | for (Text val:values) { 45 | String[] parts = val.toString().split(","); 46 | int j = Integer.valueOf(parts[0]); 47 | rowElements[j] = Double.valueOf(parts[1]); 48 | } 49 | 
// Setting Diagonal Elements as `1` in the lower triangular matrix rows 50 | rowElements[row] = (double) 1; 51 | 52 | for(int j = 0; j< this.total_records; j++) { 53 | if (rowElements[j] == null) { 54 | rowElements[j] = (double) 0; 55 | } 56 | } 57 | context.write(new TextPair(key.getFirst(),""), new Text(Utils.arrayToCSV(rowElements))); 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/totalRecords/total_records_driver.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.totalRecords; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class total_records_driver { 16 | 17 | public static long readTotalRecords (String path, Configuration conf) throws IOException { 18 | FileSystem hdfs=FileSystem.get(conf); 19 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path+"/part-r-00000")))); 20 | Long records = (long) 0; 21 | records = Long.valueOf(br.readLine().split("\\t")[1]); 22 | br.close(); 23 | hdfs.close(); 24 | return records; 25 | } 26 | 27 | @SuppressWarnings("deprecation") 28 | public static long run(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 29 | Configuration conf = new Configuration(); 30 | Job job = new Job(conf); 31 | 32 | job.setJarByClass(total_records_driver.class); 33 | 34 | job.setJobName("Just counting total rows of the HDFS input"); 35 | 36 | FileInputFormat.setInputPaths(job, new Path(args[0])); 37 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 38 | 39 | job.setMapperClass(total_records_mapper.class); 40 | 41 | job.setReducerClass(total_records_reducer.class); 42 | job.setCombinerClass(total_records_reducer.class); 43 | 44 | job.setOutputKeyClass(LongWritable.class); 45 | job.setOutputValueClass(LongWritable.class); 46 | 47 | //job.setInputFormatClass(TextInputFormat.class); 48 | //job.setOutputFormatClass(TextOutputFormat.class); 49 | 50 | job.waitForCompletion(true); 51 | 52 | return readTotalRecords(args[1], conf); 53 | }; 54 | } 55 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/totalRecords/total_records_mapper.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.totalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class total_records_mapper extends Mapper { 10 | 11 | private Long countRows = (long) 0; 12 | 13 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 14 | this.countRows++; 15 | } 16 | 17 | @Override 18 | public void cleanup(Context context) throws IOException, InterruptedException{ 19 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 20 | } 21 | 22 | } 23 | 
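For context: each total_records_mapper instance counts the rows of its input split and emits a single (0, count) pair from cleanup(); the reducer in the next file sums these counts and writes one tab-separated line, which readTotalRecords() above parses. A minimal sketch of that read-back, assuming the 4x4 test input so that total_records/part-r-00000 holds the single line "0<TAB>4":

// Assumed contents of total_records/part-r-00000 for the 4x4 test input: "0\t4"
String line = "0\t4";
long totalRecords = Long.valueOf(line.split("\\t")[1]); // 4, the same parse readTotalRecords() performs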
-------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/totalRecords/total_records_reducer.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.totalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | public class total_records_reducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> { 9 | 10 | private Long countRows = (long) 0; 11 | 12 | public void reduce(LongWritable key, Iterable<LongWritable> values, Context context) 13 | throws IOException, InterruptedException { 14 | for(LongWritable val:values){ 15 | this.countRows += val.get(); 16 | } 17 | } 18 | 19 | @Override 20 | public void cleanup(Context context) throws IOException, InterruptedException{ 21 | 22 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /LinearRegression_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input contains files which have comma-separated values. Each line is an input point, with the last comma-separated value being the output. 2 | 3 | In this case, I have just used a straight line as input and examined whether my algorithm recovers the same straight line from it, which it does. 4 | 5 | This code can easily be converted into LWR (Locally Weighted Regression), a technique which down-weights the less relevant input points, by simply multiplying the weighting function, which is given by: 6 | 7 | w(i) = exp(-(X[i]-X)^2/(2T^2)) ;; "exp" is "e^" 8 | 9 | X[i] is the input point 10 | X is the query point (The input for which you want to predict the output) 11 | T (Tau) is a constant like alpha. The higher the value of Tau, the wider the weighting function and the larger the range of input points used (chosen) for prediction, and vice-versa. 12 | 13 | with the term "(alpha/number_inputs)*(Yi-h_theta)*(Xi[i]))" in the map function of the code. 14 | 15 | This algorithm takes in 5 arguments as follows: 16 | 17 | 1. The number of features each input point has 18 | 2. The value of alpha 19 | 3. The number of times you want your algorithm to iterate 20 | 4. The input path 21 | 5.
The output path 22 | -------------------------------------------------------------------------------- /LinearRegression_MapReduce/input/linear.txt: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 3,3 4 | 4,4 5 | 5,5 6 | 6,6 7 | 7,7 8 | 8,8 9 | 9,9 10 | 10,10 -------------------------------------------------------------------------------- /LinearRegression_MapReduce/src/Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.FloatWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class Driver { 14 | public static int num_features; // needs to be set 15 | public static float alpha; // needs to be set 16 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 17 | //args[0] is the number of features each input has. 18 | num_features=Integer.parseInt(args[0]); 19 | ++num_features; 20 | //args[1] is the value of alpha that you want to use. 21 | alpha=Float.parseFloat(args[1]); 22 | Configuration conf=new Configuration(); 23 | FileSystem hdfs=FileSystem.get(conf); 24 | Float[] theta=new Float[num_features]; 25 | //args[2] is the number of times you want to iterate over your training set. 26 | for(int i=0;i { 9 | public static int count=0; 10 | public static long number_inputs=(long) 0; 11 | public static float alpha=0.0f; 12 | public static Float[] Xi=null; 13 | public static ArrayList theta_i=new ArrayList(); 14 | @Override 15 | public void setup(Context context) throws IOException, InterruptedException{ 16 | alpha=context.getConfiguration().getFloat("alpha",0), 17 | number_inputs=context.getCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue(); 18 | } 19 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | ++count; 21 | float h_theta=0; 22 | String[] tok=value.toString().split("\\,"); 23 | if(count==1){ 24 | for(int i=0;i{ 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 8 | float sum=0; 9 | int count=0; 10 | for(FloatWritable value:values){ 11 | sum+=value.get(); 12 | count++; 13 | } 14 | context.write(key, new FloatWritable(sum/count)); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /LogisticRegression_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input is a record of women who were diagnosed for Diabetes. Each line is an input point with the last value being the output. 2 | 3 | In this case, the output is either 0 or 1. 0 means tested negative for Diabetes and 1 means otherwise. 
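For reference, the prediction computed in thetaMAP.java further below is the logistic (sigmoid) hypothesis applied to a weighted sum of the features (with x[0] fixed to 1 as the intercept term, mirroring Xi[0] in the code). A minimal sketch, where theta and x are placeholder arrays rather than names from this repo:

double z = 0.0;
for (int i = 0; i < theta.length; i++)
    z += theta[i] * x[i];                     // weighted sum of the features, intercept included
double h_theta = 1.0 / (1.0 + Math.exp(-z));  // logistic hypothesis, in (0,1); values >= 0.5 correspond to class 1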
4 | 5 | This code can easily be converted into LWR (Locally Weighted Regression), a technique which discards the less critical (irrelevant) input points, by simply multiplying the weighting function which is given by; 6 | 7 | w(i) = exp(-(X[i]-X)^2/(2T^2)) ;; "exp" is "e^" 8 | 9 | X[i] is the input point 10 | X is the query point (The input for which you want to predict the output) 11 | T (Tao) is a constant like alpha. The higher the value of Tao, the higher is the range of the input points used (chosen) for prediction or wider is the weighting function and vice-versa. 12 | 13 | to the term "(alpha/number_inputs)*(Yi-h_theta)*(Xi[i]))" in the map function of the code. 14 | 15 | This algorithm takes in 5 arguments as follows: 16 | 17 | 1. The number of features each input point has 18 | 2. The value of alpha 19 | 3. The number of times you want your algorithm to iterate 20 | 4. The input path 21 | 5. The output path -------------------------------------------------------------------------------- /LogisticRegression_MapReduce/src/Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.FloatWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | 14 | public class Driver { 15 | public static int num_features; // needs to be set 16 | public static float alpha; // needs to be set 17 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 18 | //args[0] is the number of features each input has. 19 | num_features=Integer.parseInt(args[0]); 20 | ++num_features; 21 | //args[1] is the value of alpha that you want to use. 22 | alpha=Float.parseFloat(args[1]); 23 | Configuration conf=new Configuration(); 24 | FileSystem hdfs=FileSystem.get(conf); 25 | Float[] theta=new Float[num_features]; 26 | //args[2] is the number of times you want to iterate over your training set. 
27 | for(int i=0;i { 9 | public static int count=0; 10 | public static long number_inputs=(long) 0; 11 | public static float alpha=0.0f; 12 | public static Float[] Xi=null; 13 | public static ArrayList theta_i=new ArrayList(); 14 | @Override 15 | public void setup(Context context) throws IOException, InterruptedException{ 16 | alpha=context.getConfiguration().getFloat("alpha",0); 17 | number_inputs=context.getCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue(); 18 | } 19 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | ++count; 21 | float h_theta=0; 22 | String[] tok=value.toString().split("\\,"); 23 | if(count==1){ 24 | for(int i=0;i=0){ 43 | h_theta=1; 44 | } 45 | else{ 46 | h_theta=0; 47 | } 48 | }*/ 49 | //If you choose to use the Logistic Function for learning 50 | if(i==(Xi.length-1)){ 51 | h_theta=(float) (1/(1+(Math.exp(-(exp))))); 52 | } 53 | } 54 | float Yi=Float.parseFloat(tok[tok.length-1]); 55 | for(int i=0;i{ 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 8 | float sum=0; 9 | int count=0; 10 | for(FloatWritable value:values){ 11 | sum+=value.get(); 12 | count++; 13 | } 14 | context.write(key, new FloatWritable(sum/count)); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This is a MapReduce implementation of one of the simplest algorithms called Market Basket Analysis. 2 | 3 | This algorithm helps the user to determine which items have been occuring together. 4 | In marketing terms, this algorihtm can help the vendor (online or local) to determine which items to be clubbed together on the shelf. 5 | For example, many customer might have bought butter along with bread. So naturally it would be a wise choice to juxtapose them on the shelf and this algorithm helps the vendors to do the same. 6 | 7 | The sample input data contains transactions of all the customers. 8 | It has a comma separated list of items bought by a customer. 9 | 10 | The sample output is the frequency of occurence of groups of items. 11 | 12 | This algorithm takes in three arguments: 13 | 14 | 1. The input path 15 | 2. The output path 16 | 3. Number of groupings i.e. How many items shoould be grouped together. Set this carefully as a the value of number of groupings should be always less than or equal to the number of items purchased by every customer. 
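To illustrate the grouping described above, here is a minimal standalone sketch (PairDemo is a hypothetical class, not part of this repo) of how one transaction line from the sample input expands into sorted item pairs when the number of groupings is 2; compare with the sample output that follows:

import java.util.Arrays;

public class PairDemo {
    public static void main(String[] args) {
        String[] items = "crackers,bread,banana".split(",");    // first transaction of the sample input
        Arrays.sort(items);                                      // banana, bread, crackers
        for (int i = 0; i < items.length - 1; i++)
            for (int j = i + 1; j < items.length; j++)
                System.out.println(items[i] + ", " + items[j]);  // each pair is later counted with a value of 1
        // prints: banana, bread / banana, crackers / bread, crackers
    }
}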
-------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/input/in.txt.txt: -------------------------------------------------------------------------------- 1 | crackers,bread,banana 2 | crackers,coke,butter,coffee 3 | crackers,bread 4 | crackers,bread 5 | crackers,bread,coffee 6 | butter,coke 7 | butter,coke,bread,crackers 8 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | bread, coffee 1 2 | butter, coffee 1 3 | banana, crackers 1 4 | butter, coke 3 5 | coffee, crackers 2 6 | bread, butter 1 7 | banana, bread 1 8 | bread, crackers 5 9 | coke, crackers 2 10 | bread, coke 1 11 | coffee, coke 1 12 | butter, crackers 2 13 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/src/MBA_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class MBA_Driver { 11 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 12 | Configuration conf=new Configuration(); 13 | conf.set("group_num", args[2]); 14 | Job job = new Job(conf); 15 | job.setJarByClass(MBA_Driver.class); 16 | job.setJobName("Market Basket Analysis"); 17 | FileInputFormat.setInputPaths(job, new Path(args[0])); 18 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 19 | job.setMapperClass(MBA_Mapper.class); 20 | job.setCombinerClass(MBA_Reducer.class); 21 | job.setReducerClass(MBA_Reducer.class); 22 | job.setOutputKeyClass(Text.class); 23 | job.setOutputValueClass(IntWritable.class); 24 | boolean success = job.waitForCompletion(true); 25 | System.exit(success ? 
0 : 1); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/src/MBA_Mapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Arrays; 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | public class MBA_Mapper extends Mapper { 9 | public static int group_num = 2; 10 | @Override 11 | public void setup(Context context){ 12 | group_num=Integer.parseInt(context.getConfiguration().get("group_num")); 13 | } 14 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 15 | String[] vals=value.toString().split("\\,"); 16 | Arrays.sort(vals); 17 | if(vals.length>=group_num){ 18 | for(int i=0;i { 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 8 | int sum=0; 9 | for(IntWritable value:values){ 10 | sum=sum+value.get(); 11 | } 12 | if(sum>1) 13 | context.write(key, new IntWritable(sum)); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input stores the two matrices to be multiplied in a file. 2 | 3 | Each line is a row of a matrix. The first value of each row names the matrix and the last value is the row number of the matrix (starts from 0). 4 | 5 | In this case, the first matrix is stored normally and the second matrix is stored as the transpose. 6 | 7 | This way of storing reduces the filesize and helps the algorithm complete at a faster rate. -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | A,1,2,3,0 2 | A,3,4,5,1 3 | A,5,6,7,2 4 | B,2,12,4,0 5 | B,3,1,4,1 -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/src/MatMulDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | 9 | 10 | public class MatMulDriver { 11 | 12 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 13 | Configuration conf = new Configuration(); 14 | // A is an m-by-n matrix; B is an n-by-p matrix. 15 | conf.set("m", args[0]); 16 | conf.set("n", args[1]); 17 | conf.set("p", args[2]); 18 | Job job = new Job(conf, "Matrix_Multiplication"); 19 | job.setJarByClass(MatMulDriver.class); 20 | job.setOutputKeyClass(Text.class); 21 | job.setOutputValueClass(Text.class); 22 | job.setMapperClass(MatMulMap.class); 23 | //Don't use combiner if there is no scope of combining the output. Otherwise the job will get stuck. 24 | //job.setCombinerClass(MatMulModGenReduce.class); 25 | job.setReducerClass(MatMulReduce.class); 26 | //args[3] is the input path. 
27 | FileInputFormat.addInputPath(job, new Path(args[3])); 28 | //args[4] is the output path. 29 | FileOutputFormat.setOutputPath(job, new Path(args[4])); 30 | System.exit(job.waitForCompletion(true)?0:1); 31 | } 32 | } -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/src/MatMulMap.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.io.LongWritable; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Mapper; 5 | 6 | 7 | public class MatMulMap extends Mapper { 8 | public static int m=0,n=0,p=0; 9 | @Override 10 | public void setup(Context context) throws IOException, InterruptedException{ 11 | m = Integer.parseInt(context.getConfiguration().get("m")); 12 | n = Integer.parseInt(context.getConfiguration().get("n")); 13 | p = Integer.parseInt(context.getConfiguration().get("p")); 14 | } 15 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 16 | Text Key = new Text(); 17 | Text Value = new Text(); 18 | String line = value.toString(); 19 | String[] val = line.split("\\,"); 20 | if(val[0].contentEquals("A")){ 21 | for(int x=0;x{ 7 | int n=0; 8 | @Override 9 | public void setup(Context context){ 10 | n=Integer.parseInt(context.getConfiguration().get("n")); 11 | } 12 | public void reduce(Text key, Iterable values, Context context)throws IOException, InterruptedException{ 13 | String[] value; 14 | HashMap hashA = new HashMap(); 15 | HashMap hashB = new HashMap(); 16 | for (Text val : values) { 17 | value = val.toString().split(","); 18 | if (value[0].equals("A")) { 19 | for(int z=1;z<=n;z++){ 20 | hashA.put(z, Float.parseFloat(value[z]));} 21 | } else{ 22 | for(int a=1;a<=n;a++){ 23 | hashB.put(a, Float.parseFloat(value[a]));} 24 | } 25 | } 26 | float result = 0.0f; 27 | float a_ij; 28 | float b_jk; 29 | for (int j=1;j<=n;j++) { 30 | a_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f; 31 | b_jk = hashB.containsKey(j) ? hashB.get(j) : 0.0f; 32 | result +=a_ij*b_jk; 33 | } 34 | context.write(null, new Text(key.toString() + "," + Float.toString(result))); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This algorithm is called "Common Friends" algorithm. 2 | As the name suggests, it helps to find common items between entities. 3 | 4 | In this case, the sample input file is a file which stores the user_id of a person and user_ids of all its friends in the fllowing format: 5 | 6 | , .... 7 | 8 | Each person's user_id is separated by a comma from the friends' user_ids and friends' user_ids are separated by spaces. 9 | 10 | The sample output stores the user_ids of two persons and their mutual friends in the following fashion: 11 | 12 | , ,...| 13 | 14 | The two persons' user_ids are separated by a comma and from the friends' user_ids and counts by a tab. 15 | The mutual friends' user_ids are separated by commas and from count of the mutual friends by a "|" 16 | 17 | 18 | This algorithm takes in only two arguments: 19 | 20 | 1. The input path 21 | 2. 
The output path -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/input/in.txt.txt: -------------------------------------------------------------------------------- 1 | 100,200 300 400 500 600 2 | 200,100 300 400 3 | 300,100 200 400 500 4 | 400,100 200 300 5 | 500,100 300 6 | 600,100 7 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 100,200 300,400|2 2 | 100,300 200,400,500|3 3 | 100,400 300,200|2 4 | 100,500 300|1 5 | 100,600 null 6 | 200,300 400,100|2 7 | 200,400 300,100|2 8 | 300,400 200,100|2 9 | 300,500 100|1 10 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/MF_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class MF_Driver { 11 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 12 | Configuration conf=new Configuration(); 13 | Job job = new Job(conf); 14 | job.setJarByClass(MF_Driver.class); 15 | job.setJobName("Mutual Friend Calculator"); 16 | FileInputFormat.setInputPaths(job, new Path(args[0])); 17 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 18 | job.setMapperClass(MF_Mapper.class); 19 | job.setCombinerClass(MF_Reducer.class); 20 | job.setReducerClass(MF_Reducer.class); 21 | job.setMapOutputKeyClass(Text.class); 22 | job.setMapOutputValueClass(Text.class); 23 | job.setOutputKeyClass(Text.class); 24 | job.setOutputValueClass(Text.class); 25 | boolean success = job.waitForCompletion(true); 26 | System.exit(success ? 
0 : 1); 27 | }; 28 | 29 | } 30 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/MF_Mapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.ArrayList; 3 | import java.util.Collections; 4 | import java.util.Map.Entry; 5 | import java.util.TreeMap; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | public class MF_Mapper extends Mapper { 11 | public static TreeMap> Friends=new TreeMap>(); 12 | public static ArrayList ArrToList (ArrayList l, String[] a){ 13 | for(String i:a) 14 | l.add(i); 15 | return l; 16 | } 17 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 18 | String[] vals=value.toString().split("\\,"); 19 | ArrayList al=ArrToList(new ArrayList(),vals[1].split(" ")); 20 | Collections.sort(al); 21 | Friends.put(vals[0],al); 22 | } 23 | @Override 24 | public void cleanup(Context context) throws IOException, InterruptedException{ 25 | for(Entry> s:new gen_mutual_friends_matrix().generate(Friends).entrySet()) 26 | context.write(new Text(s.getKey()), new Text(s.getValue().toString().replaceAll("\\[", "").replaceAll("\\]", "").replaceAll(" ", ""))); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/MF_Reducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.ArrayList; 3 | import java.util.HashSet; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class MF_Reducer extends Reducer { 8 | public static ArrayList ArrToList (ArrayList l, String[] a){ 9 | for(String i:a) 10 | l.add(i); 11 | return l; 12 | } 13 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 14 | ArrayList mutual_friends=new ArrayList(); 15 | for(Text value:values){ 16 | if(!value.toString().contentEquals("")){ 17 | String[] vals=value.toString().split("\\,"); 18 | ArrToList(mutual_friends,vals); 19 | } 20 | } 21 | HashSet hs=new HashSet(mutual_friends); 22 | if(hs.size()>0) 23 | context.write(key, new Text(hs.toString().replaceAll("\\[", "").replaceAll("\\]", "").replaceAll(" ", "")+"|"+hs.size())); 24 | else 25 | context.write(key, new Text("null")); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/gen_mutual_friends_matrix.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.Map.Entry; 3 | import java.util.TreeMap; 4 | 5 | public class gen_mutual_friends_matrix { 6 | public static TreeMap> list=new TreeMap>(); 7 | public TreeMap> generate(TreeMap> x){ 8 | for(Entry> s1:x.entrySet()){ 9 | for(Entry> s2:x.entrySet()){ 10 | if(!s1.getKey().contentEquals(s2.getKey()) && Integer.parseInt(s2.getKey())>Integer.parseInt(s1.getKey())){ 11 | ArrayList mutual=s1.getValue(); 12 | mutual.retainAll(s2.getValue()); 13 | list.put(s1.getKey()+","+s2.getKey(), mutual); 14 | } 15 | } 16 | } 17 | return list; 18 | } 19 | } -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/Readme.txt: 
-------------------------------------------------------------------------------- 1 | This is implementation of Naive Bayes Classifier on Hadoop using MapReduce. 2 | 3 | The example input that I have used in this project is file which tells us about a student's activity given certain factors. 4 | So the factors are (in order): Deadline?, Is there a Party?, Is he/she lazy? and finally the output is Activity. 5 | 6 | The correct output for this example has also been uploaded. 7 | 8 | The algorihm takes in three arguments: 9 | 10 | 1. The test input as a string of comma separated values for which you want to predict the activity for that particular person. 11 | 12 | 2. The path for the input. 13 | 14 | 3. The path for the output. 15 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | Urgent,Yes,Yes,Party 2 | Urgent,No,Yes,Study 3 | Near,Yes,Yes,Party 4 | None,Yes,No,Party 5 | None,No,Yes,Pub 6 | None,Yes,No,Party 7 | Near,No,No,Study 8 | Near,No,Yes,TV 9 | Near,Yes,Yes,Party 10 | Urgent,No,No,Study 11 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/output/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Naive_Bayes_Classifier_MapReduce/output/.part-r-00000.crc -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | TV 2 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/src/NBCDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class NBCDriver { 12 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 13 | Configuration conf=new Configuration(); 14 | // The test input for which you want to find the acitivity that the Person should be doing 15 | conf.set("test_input", args[0]); 16 | Job job = new Job(conf); 17 | job.setJarByClass(NBCDriver.class); 18 | job.setJobName("Naive_Bayes_calssifier using Hadoop"); 19 | FileInputFormat.setInputPaths(job, new Path(args[1])); 20 | FileOutputFormat.setOutputPath(job, new Path(args[2])); 21 | job.setMapperClass(NBCMap.class); 22 | job.setReducerClass(NBCReduce.class); 23 | job.setMapOutputKeyClass(IntWritable.class); 24 | job.setMapOutputValueClass(Text.class); 25 | job.setOutputKeyClass(IntWritable.class); 26 | job.setOutputValueClass(Text.class); 27 | boolean success = job.waitForCompletion(true); 28 | System.exit(success ? 
0 : 1); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/src/NBCMap.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | public class NBCMap extends Mapper{ 11 | public static String output_key; 12 | public static String[] test_input=null; 13 | public static int count=0; 14 | public static HashMap inputs=new HashMap(); 15 | public static double output_value=Double.NEGATIVE_INFINITY; 16 | public static HashMap output= new HashMap(); 17 | public static HashMap outcome_count= new HashMap(); 18 | public static HashMap features_count= new HashMap(); 19 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | if(test_input==null) 21 | test_input=context.getConfiguration().get("test_input").split("\\,"); 22 | String[] input=value.toString().split("\\,"); 23 | for(int j=0;j o_c:outcome_count.entrySet()){ 46 | String output_class=o_c.getKey(); 47 | for(Entry i:inputs.entrySet()){ 48 | if(!features_count.containsKey(i.getKey()+"|"+output_class)) 49 | features_count.put(i.getKey()+"|"+output_class, (double) 0); 50 | } 51 | double output_class_count=o_c.getValue(); 52 | double probability=output_class_count/count; 53 | for(Entry f_c:features_count.entrySet()){ 54 | if(f_c.getKey().split("\\|")[1].contentEquals(output_class)) 55 | probability=probability*(f_c.getValue()/output_class_count); 56 | } 57 | output.put(output_class, probability); 58 | } 59 | for(Entry o:output.entrySet()){ 60 | if(o.getValue()>output_value){ 61 | output_value=o.getValue(); 62 | output_key=o.getKey(); 63 | } 64 | } 65 | context.write(new IntWritable(1),new Text(output_key)); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/src/NBCReduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class NBCReduce extends Reducer{ 10 | public void reduce(IntWritable key, Iterable values, Context context) throws IOException, InterruptedException{ 11 | Double out_value=Double.NEGATIVE_INFINITY; 12 | String out_key=null; 13 | HashMap final_output=new HashMap(); 14 | for(Text value:values){ 15 | if(final_output.containsKey(value.toString())) 16 | final_output.put(value.toString(), final_output.get(value.toString())+1); 17 | else 18 | final_output.put(value.toString(), 1); 19 | } 20 | for(Entry output:final_output.entrySet()){ 21 | if(output.getValue()>out_value){ 22 | out_value=(double) output.getValue(); 23 | out_key=output.getKey(); 24 | } 25 | } 26 | context.write(null, new Text(out_key)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLHadoop 2 | This repository contains Machine-Learning MapReduce codes for Hadoop which are written from scratch (without using any package or 
library). So you'll find codes written right from the basic Mathematics required for all of these Algorithms. 3 | e.g. Prediction Algorithms (Linear and Logistic Regression - Iterative Version), Clustering Algorithm (K-Means Clustering), Classification Algorithm (KNN Classifier), MBA, Common Friends etc. 4 | 5 | NOTE: I think some of the algorithms implemented here can be improved in time as well as space by controlling the shuffle-sort phase between a MapReduce job i.e by writing and implementing your own custom Secondary Sort class as the shuffle-sort phase takes up a lot of time. If you have a sort order of key-value pairs in mind and if you are running multiple jobs or extra sorting methods inside mappers and reducers just to get the correct sort order, then, secondary sorting might come in handy as it will speed up the jobs and will use lesser RAM. 6 | 7 | Language used: Java 8 | 9 | IDE used: Eclipse IDE with [HDT (Hadoop Development Tools)](https://archive.apache.org/dist/incubator/hdt/hdt-0.0.2.incubating/hdt-0.0.2.incubating-bin.tar.gz) plugin installed. 10 | 11 | Hadoop version used: 1.2.1 12 | 13 | I wrote these codes when I was just a novice (in terms of MapReduce programming as well as programming in general) and therefore I am certain the code is very inefficient and there are a lot of optimisations yet to be done in this. So feel free to point out the mistakes or create PRs if you are interested. 14 | 15 | License 16 | Copyright © 2023 [Punit Naik](https://github.com/punit-naik) 17 | 18 | This program and the accompanying materials are made available under the terms of the Eclipse Public License 2.0 which is available at http://www.eclipse.org/legal/epl-2.0. 19 | 20 | This Source Code may also be made available under the following Secondary Licenses when the conditions for such availability set forth in the Eclipse Public License, v. 2.0 are satisfied: GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version, with the GNU Classpath Exception which is available at https://www.gnu.org/software/classpath/license.html. 21 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | This is an Algorithm which generates Recommendations for users by using the Collaborative Filtering technique. 2 | 3 | This algorithm takes in four arguments, namely: 4 | 5 | 1. args[0]: The path which will store the value "n" for a particular task_id. It also the "n" part of matrices co_oc_mat and user_scoring_mat where co_oc_mat has dimensions of m x n and sorted_user_scoring_mat has dimensions n x p. 6 | 7 | 2. args[1]: The path to the input. 8 | 9 | 3. args[2]: The intermediate output of the program which is also the input to the final MR Job. 10 | 11 | 4. args[3]: The final output path which will contain recommendations for users. Each group of users will be identified by their task_IDs. 
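For illustration, a hedged invocation sketch using the paths that ship with this repo (RunRecommendation is a hypothetical wrapper class, and it assumes RecDriver exposes the usual main(String[]) entry point taking the four arguments in the order described above):

public class RunRecommendation {
    public static void main(String[] args) throws Exception {
        String[] jobArgs = {
            "outputs/n.txt",                // args[0]: where the value "n" is stored per task_id
            "input/recommendation.txt",     // args[1]: input ratings in user,item,score form
            "outputs/Intermediate_output",  // args[2]: intermediate output, fed to the final MR job
            "outputs/final_output"          // args[3]: final recommendations for each user
        };
        RecDriver.main(jobArgs);            // assumption: standard Hadoop driver entry point
    }
}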
-------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/input/recommendation.txt: -------------------------------------------------------------------------------- 1 | 1,101,5.0 2 | 1,102,3.0 3 | 1,103,2.5 4 | 2,101,2.0 5 | 2,102,2.5 6 | 2,103,5.0 7 | 2,104,2.0 8 | 3,101,2.0 9 | 3,104,4.0 10 | 3,105,4.5 11 | 3,107,5.0 12 | 4,101,5.0 13 | 4,103,3.0 14 | 4,104,4.5 15 | 4,106,4.0 16 | 5,101,4.0 17 | 5,102,3.0 18 | 5,103,2.0 19 | 5,104,4.0 20 | 5,105,3.5 21 | 5,106,4.0 -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/outputs/Intermediate_output/part-r-00000: -------------------------------------------------------------------------------- 1 | file:/home/punit/recommendation.txt:0+209-->102,3;16.0 2 | file:/home/punit/recommendation.txt:0+209-->102,4;32.5 3 | file:/home/punit/recommendation.txt:0+209-->103,3;10.0 4 | file:/home/punit/recommendation.txt:0+209-->104,1;24.0 5 | file:/home/punit/recommendation.txt:0+209-->105,1;0.0 6 | file:/home/punit/recommendation.txt:0+209-->105,2;2.0 7 | file:/home/punit/recommendation.txt:0+209-->105,4;4.5 8 | file:/home/punit/recommendation.txt:0+209-->106,1;0.0 9 | file:/home/punit/recommendation.txt:0+209-->106,2;2.0 10 | file:/home/punit/recommendation.txt:0+209-->106,3;4.0 11 | file:/home/punit/recommendation.txt:0+209-->107,1;5.0 12 | file:/home/punit/recommendation.txt:0+209-->107,2;4.0 13 | file:/home/punit/recommendation.txt:0+209-->107,4;9.5 14 | file:/home/punit/recommendation.txt:0+209-->107,5;11.5 15 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/outputs/final_output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1 104,24.0 2 | 2 105,2.0 3 | 3 102,16.0 4 | 4 102,32.5 5 | 5 107,11.5 6 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/outputs/n.txt: -------------------------------------------------------------------------------- 1 | file:/home/punit/recommendation.txt:0+209-->7 2 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/FinalMap.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | 10 | public class FinalMap extends Mapper { 11 | public static String delimiter=null; 12 | public static HashMap map=new HashMap(); 13 | @Override 14 | public void setup(Context context){ 15 | delimiter=context.getConfiguration().get("delimiter"); 16 | } 17 | @Override 18 | public void map(LongWritable key, Text value, Context context) 19 | throws IOException, InterruptedException { 20 | String[] parts=value.toString().split("\\;"); 21 | String score=parts[1]; 22 | String[] parts2=parts[0].split(delimiter); 23 | String[] parts3=parts2[1].split("\\,"); 24 | String user=parts3[1]; 25 | String item=parts3[0]; 26 | if(!map.containsKey(user)){ 27 | map.put(user, item+","+score); 28 | } 29 | else{ 30 | String[] old=map.get(user).split(","); 31 | if(Double.parseDouble(score)>Double.parseDouble(old[0])){ 32 | map.put(user, item+","+score); 33 | } 34 | 
} 35 | } 36 | @Override 37 | public void cleanup(Context context) throws IOException, InterruptedException{ 38 | for(Entry entry:map.entrySet()){ 39 | context.write(new Text(entry.getKey()), new Text(entry.getValue())); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/FinalReduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.io.Text; 3 | import org.apache.hadoop.mapreduce.Reducer; 4 | 5 | public class FinalReduce extends Reducer{ 6 | String delimiter=null,identifier=null; 7 | @Override 8 | public void setup(Context context){ 9 | delimiter=context.getConfiguration().get("delimiter"); 10 | identifier=context.getTaskAttemptID().getTaskID().getId()+delimiter; 11 | } 12 | @Override 13 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 14 | for(Text val: values){ 15 | context.write(new Text(/*identifier+*/key.toString()), val);//new Text(val.toString().split("\\,")[1])); 16 | } 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/RecDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | 9 | 10 | public class RecDriver { 11 | public static String delimiter="-->"; 12 | public static String outFile=null; 13 | public static String rec_in=null; 14 | public static String mid_out=null; 15 | public static String final_out=null; 16 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 17 | // args[0] is the path of the file which stores the number of unique items "n" and its identification 18 | // which is the task_id. 19 | // It also the "n" part of matrices co_oc_mat and user_scoring_mat 20 | // where co_oc_mat has dimensions of m x n 21 | // and sorted_user_scoring_mat has dimensions n x p 22 | String a=String.valueOf(args[0].charAt(args[0].length()-1)); 23 | if(!"/".contentEquals(a)){ 24 | args[0]=args[0]+"/"; 25 | } 26 | outFile=args[0]+"n.txt"; 27 | 28 | //args[1] is the input file. 29 | rec_in=args[1]; 30 | 31 | //args[2] is the intermediate output which is also the input to final recommendation job. 32 | mid_out=args[2]; 33 | 34 | //args[3] is the final output. 
35 | final_out=args[3]; 36 | 37 | run1(args); 38 | run2(args); 39 | } 40 | public static void run1(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 41 | Configuration conf = new Configuration(); 42 | conf.set("outFile", outFile); 43 | conf.set("delimiter", delimiter); 44 | Job job = new Job(conf, "Recommendations_CollaborativeFiltering_Prepare"); 45 | job.setJarByClass(RecDriver.class); 46 | job.setOutputKeyClass(Text.class); 47 | job.setOutputValueClass(Text.class); 48 | job.setMapperClass(RecMap.class); 49 | job.setReducerClass(RecReduce.class); 50 | FileInputFormat.addInputPath(job, new Path(rec_in)); 51 | FileOutputFormat.setOutputPath(job, new Path(mid_out)); 52 | job.waitForCompletion(true); 53 | } 54 | public static void run2(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 55 | Configuration conf = new Configuration(); 56 | conf.set("delimiter", delimiter); 57 | Job job = new Job(conf, "Recommendations_CollaborativeFiltering_Final"); 58 | job.setJarByClass(RecDriver.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(Text.class); 61 | job.setMapperClass(FinalMap.class); 62 | job.setReducerClass(FinalReduce.class); 63 | FileInputFormat.addInputPath(job, new Path(mid_out)); 64 | FileOutputFormat.setOutputPath(job, new Path(final_out)); 65 | System.exit(job.waitForCompletion(true)?0:1); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/RecMap.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.BufferedWriter; 3 | import java.io.IOException; 4 | import java.io.InputStreamReader; 5 | import java.io.OutputStream; 6 | import java.io.OutputStreamWriter; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.Map.Entry; 10 | import java.util.TreeMap; 11 | 12 | import org.apache.hadoop.fs.FileSystem; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.LongWritable; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | 18 | 19 | public class RecMap extends Mapper { 20 | public static String delimiter=null; 21 | public static String identifier=null; 22 | public static TreeMap co_oc_mat=new TreeMap(); 23 | public static HashMap user_scoring_mat=new HashMap(); 24 | public static TreeMap sorted_user_scoring_mat=new TreeMap(); 25 | public static ArrayList vals=new ArrayList(); 26 | public static ArrayList unique_items=new ArrayList(); 27 | public static ArrayList unique_users=new ArrayList(); 28 | public static int a=0; 29 | @Override 30 | public void setup(Context context){ 31 | delimiter=context.getConfiguration().get("delimiter"); 32 | identifier=context.getInputSplit()+delimiter; 33 | } 34 | @Override 35 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 36 | ++a; 37 | String b=value.toString(); 38 | vals.add(b); 39 | String[] parts=b.split("\\,"); 40 | user_scoring_mat.put(parts[0]+","+parts[1], Float.parseFloat(parts[2])); 41 | } 42 | @Override 43 | public void cleanup(Context context) throws IOException, InterruptedException{ 44 | co_oc_mat.putAll(new get_co_oc_mat().get(vals, a)); 45 | unique_users.addAll(new get_unique_users().get(vals, a)); 46 | unique_items.addAll(new get_unique_items().get(vals, a)); 47 | FileSystem hdfs = FileSystem.get(context.getConfiguration()); 48 
| Path outFile=new Path(context.getConfiguration().get("outFile")); 49 | String line1=""; 50 | if (!hdfs.exists(outFile)){ 51 | OutputStream out = hdfs.create(outFile); 52 | BufferedWriter br = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); 53 | br.write(identifier+unique_items.size()+"\n"); 54 | br.close(); 55 | hdfs.close(); 56 | } 57 | else{ 58 | String line2=null; 59 | BufferedReader br1 = new BufferedReader(new InputStreamReader(hdfs.open(outFile))); 60 | while((line2=br1.readLine())!=null){ 61 | line1=line1.concat(line2)+"\n"; 62 | } 63 | br1.close(); 64 | hdfs.delete(outFile, true); 65 | OutputStream out = hdfs.create(outFile); 66 | BufferedWriter br2 = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); 67 | br2.write(line1+identifier+unique_items.size()+"\n"); 68 | br2.close(); 69 | hdfs.close(); 70 | } 71 | for(int i=0;i entry: co_oc_mat.entrySet()){ 84 | String check_val=entry.getKey().split("\\,")[0]; 85 | if(!prev.contentEquals(check_val)){ 86 | // If code enters this block, it will mean that the row has changed 87 | // We have to transmit the aggregated values of the previous row and re-initialise the values. 88 | if(row_num==-1){ 89 | prev=check_val; 90 | //++row_num; 91 | row_num=Integer.parseInt(check_val); 92 | } 93 | else{ 94 | for(int i=0;i entry: sorted_user_scoring_mat.entrySet()){ 119 | String check_val=entry.getKey().split("\\,")[0]; 120 | if(!prev2.contentEquals(check_val)){ 121 | // If code enters this block, it will mean that the column has changed 122 | // We have to transmit the aggregated values of the previous column and re-initialise the values. 123 | if(col_num==-1){ 124 | prev2=check_val; 125 | //++col_num; 126 | col_num=Integer.parseInt(check_val); 127 | } 128 | else{ 129 | for(int i=0;i{ 12 | public static String delimiter=null; 13 | @Override 14 | public void setup(Context context){ 15 | delimiter=context.getConfiguration().get("delimiter"); 16 | } 17 | @Override 18 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 19 | int n=0; 20 | if(n==0){ 21 | FileSystem hdfs= FileSystem.get(context.getConfiguration()); 22 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(context.getConfiguration().get("outFile"))))); 23 | String line=null; 24 | while((line=br.readLine())!=null){ 25 | String[] parts=line.replaceAll("\n", "").split(delimiter); 26 | if((key.toString().split(delimiter)[0]).contentEquals(parts[0])){ 27 | n=Integer.parseInt(parts[1]); 28 | break; 29 | } 30 | } 31 | br.close(); 32 | hdfs.close(); 33 | } 34 | String[] value=null; 35 | double pref=0; 36 | HashMap hashA = new HashMap(); 37 | HashMap hashB = new HashMap(); 38 | for (Text val : values) { 39 | if(val.toString().contains(",")){ 40 | value = val.toString().split(","); 41 | if (value[0].equals("A")) { 42 | for(int z=1;z<=n;z++){ 43 | hashA.put(z, Float.parseFloat(value[z]));} 44 | } else{ 45 | for(int a=1;a<=n;a++){ 46 | hashB.put(a, Float.parseFloat(value[a]));} 47 | } 48 | } 49 | else{ 50 | pref=Double.parseDouble(val.toString()); 51 | } 52 | } 53 | float result = 0.0f; 54 | float a_ij; 55 | float b_jk; 56 | for (int j=1;j<=n;j++) { 57 | a_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f; 58 | b_jk = hashB.containsKey(j) ? 
hashB.get(j) : 0.0f; 59 | result +=a_ij*b_jk; 60 | } 61 | if(pref==0.0){ 62 | context.write(null, new Text(key.toString() + ";" + Float.toString(result))); 63 | } 64 | //delimiter=null; 65 | n=0; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/get_co_oc_mat.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.Collections; 3 | import java.util.HashMap; 4 | import java.util.LinkedHashSet; 5 | 6 | public class get_co_oc_mat{ 7 | public HashMap get(ArrayList vals, int a){ 8 | HashMap co_oc_mat=new HashMap(); 9 | ArrayList items=new ArrayList(); 10 | ArrayList unique_items=null; 11 | ArrayList users=new ArrayList(); 12 | ArrayList unique_users=null; 13 | for(int i=0;i(new LinkedHashSet(users)); 19 | Collections.sort(unique_users); 20 | unique_items=new ArrayList(new LinkedHashSet(items)); 21 | Collections.sort(unique_items); 22 | 23 | // Updating Diagonal Elements of co_oc_mat; 24 | for(int i=0;i1){ 55 | break; 56 | } 57 | else{ 58 | i++; 59 | j++; 60 | } 61 | } 62 | } 63 | } 64 | 65 | // remaining elements are assigned to 0 66 | for(int i=0;i get(ArrayList vals, int a){ 6 | ArrayList items=new ArrayList(); 7 | ArrayList unique_items=new ArrayList(); 8 | for(int i=0;i(new LinkedHashSet(items)); 13 | return unique_items; 14 | } 15 | } -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/get_unique_users.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.LinkedHashSet; 3 | 4 | public class get_unique_users{ 5 | public ArrayList get(ArrayList vals, int a){ 6 | ArrayList users=new ArrayList(); 7 | ArrayList unique_users=new ArrayList(); 8 | for(int i=0;i(new LinkedHashSet(users)); 13 | return unique_users; 14 | } 15 | } -------------------------------------------------------------------------------- /Top_N_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This is a MapReduce implementation of the 'Top N' algorithm. It finds top 'N' items based on their corresponding value. 2 | 3 | This algorithm expects 3 arguments: 4 | 5 | 1. N i.e. the 'N' part of the 'Top N' algorithm 6 | 2. The input path 7 | 3. The output path 8 | 9 | The input provided in this example is just a csv file with two comma-separated values which are the item and its value respectively. The top 'N' items here are found based on their aggregated values. 
10 | 11 | NOTE: In the example output, I have set n = 5 -------------------------------------------------------------------------------- /Top_N_MapReduce/in/1.txt: -------------------------------------------------------------------------------- 1 | A,2 2 | B,2 3 | C,3 4 | D,2 5 | E,1 6 | G,2 7 | A,3 8 | B,4 9 | Z,100 10 | Z,1 -------------------------------------------------------------------------------- /Top_N_MapReduce/in/2.txt: -------------------------------------------------------------------------------- 1 | A,1 2 | B,1 3 | C,3 4 | E,1 5 | F,1 6 | G,2 7 | A,65 8 | A,3 9 | -------------------------------------------------------------------------------- /Top_N_MapReduce/in/3.txt: -------------------------------------------------------------------------------- 1 | A,2 2 | B,2 3 | C,1 4 | D,2 5 | E,1 6 | F,1 7 | G,2 8 | -------------------------------------------------------------------------------- /Top_N_MapReduce/out/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /Top_N_MapReduce/out/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Top_N_MapReduce/out/.part-r-00000.crc -------------------------------------------------------------------------------- /Top_N_MapReduce/out/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Top_N_MapReduce/out/_SUCCESS -------------------------------------------------------------------------------- /Top_N_MapReduce/out/part-r-00000: -------------------------------------------------------------------------------- 1 | Z,101 2 | A,76 3 | B,9 4 | C,7 5 | G,6 6 | -------------------------------------------------------------------------------- /Top_N_MapReduce/src/Top_N_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class Top_N_Driver { 11 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 12 | Configuration conf=new Configuration(); 13 | conf.set("N", args[0]); 14 | Job job = new Job(conf); 15 | job.setJarByClass(Top_N_Driver.class); 16 | job.setJobName("Top_N_Driver"); 17 | FileInputFormat.setInputPaths(job, new Path(args[1])); 18 | FileOutputFormat.setOutputPath(job, new Path(args[2])); 19 | job.setMapperClass(Top_N_Mapper.class); 20 | job.setReducerClass(Top_N_Reducer.class); 21 | job.setOutputKeyClass(Text.class); 22 | job.setOutputValueClass(Text.class); 23 | boolean success = job.waitForCompletion(true); 24 | System.exit(success ? 
0 : 1); 25 | }; 26 | 27 | } 28 | -------------------------------------------------------------------------------- /Top_N_MapReduce/src/Top_N_Mapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Collections; 3 | import java.util.Comparator; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.LinkedHashMap; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import org.apache.hadoop.io.LongWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | 16 | public class Top_N_Mapper extends Mapper { 17 | public static Map sortByComparator(Map m){ 18 | List> list=new LinkedList>(m.entrySet()); 19 | Collections.sort(list, new Comparator>(){ 20 | @Override 21 | public int compare(Entry o1, Entry o2) { 22 | return -(o1.getValue().compareTo(o2.getValue())); 23 | } 24 | }); 25 | Map sortedMap=new LinkedHashMap(); 26 | for(Iterator> it=list.iterator(); it.hasNext();){ 27 | Entry e=it.next(); 28 | sortedMap.put(e.getKey(), e.getValue()); 29 | } 30 | return sortedMap; 31 | } 32 | public static Map sm=new HashMap(); 33 | public static int N=0; 34 | @Override 35 | public void setup(Context context){ 36 | N=Integer.parseInt(context.getConfiguration().get("N")); 37 | } 38 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 39 | String[] values=value.toString().split(","); 40 | if(sm.containsKey(values[0])) 41 | sm.put(values[0], sm.get(values[0])+Integer.parseInt(values[1])); 42 | else 43 | sm.put(values[0], Integer.parseInt(values[1])); 44 | } 45 | @Override 46 | public void cleanup(Context context) throws IOException, InterruptedException{ 47 | int count=0; 48 | // Sorting based on values descendingly 49 | Map p=sortByComparator(sm); 50 | Map x=new LinkedHashMap(); 51 | for(Entry e:p.entrySet()){ 52 | if(count<=N){ 53 | x.put(e.getKey(), e.getValue()); 54 | count++; 55 | } 56 | else 57 | break; 58 | } 59 | context.write(new Text("1"), new Text(x.toString())); 60 | sm.clear(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Top_N_MapReduce/src/Top_N_Reducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Collections; 3 | import java.util.Comparator; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.LinkedHashMap; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | 16 | public class Top_N_Reducer extends Reducer { 17 | public static Map m=new HashMap(); 18 | public static int N=0; 19 | public static Map sortByComparator(Map m){ 20 | List> list=new LinkedList>(m.entrySet()); 21 | Collections.sort(list, new Comparator>(){ 22 | @Override 23 | public int compare(Entry o1, Entry o2) { 24 | return -(o1.getValue().compareTo(o2.getValue())); 25 | } 26 | }); 27 | Map sortedMap=new LinkedHashMap(); 28 | for(Iterator> it=list.iterator(); it.hasNext();){ 29 | Entry e=it.next(); 30 | sortedMap.put(e.getKey(), e.getValue()); 31 | } 32 | return sortedMap; 33 | } 34 | @Override 35 | public void setup(Context context){ 36 | 
N=Integer.parseInt(context.getConfiguration().get("N")); 37 | } 38 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 39 | for(Text value:values){ 40 | String val=StringUtils.substringBetween(value.toString(),"{","}"); 41 | String[] key_val=val.split(","); 42 | for(String pair:key_val){ 43 | String[] entry=pair.split("="); 44 | if(m.containsKey(entry[0].trim())) 45 | m.put(entry[0].trim(), m.get(entry[0].trim())+Integer.parseInt(entry[1].trim())); 46 | else 47 | m.put(entry[0].trim(), Integer.parseInt(entry[1].trim())); 48 | } 49 | } 50 | } 51 | @Override 52 | public void cleanup(Context context) throws IOException, InterruptedException{ 53 | // Sorting based on values descendingly 54 | Map x=sortByComparator(m); 55 | int count=0; 56 | for(Entry e:x.entrySet()){ 57 | if(count 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /lu_decomposition/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | lu_decomposition 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.apache.hdt.mrnature 16 | org.eclipse.jdt.core.javanature 17 | 18 | 19 | -------------------------------------------------------------------------------- /lu_decomposition/README.md: -------------------------------------------------------------------------------- 1 | # LU Decomposition 2 | 3 | This mapreduce algorithm splits massively large matrix into it's `L` and `U` components. It uses the Naive Gaussian Elimination technique to do so. 4 | 5 | # Program Execution Arguments 6 | 7 | This programs only expects two arguments: 8 | 9 | 1. An input path 10 | 2. An output path 11 | 12 | # Input and Output data shape 13 | 14 | Both the input and output matrix shapes are the **SAME**. This program expects and produces the textual input of matrices in the following manner: 15 | 16 | `row_number + "\t" + elem-1 + "," + elem-2 + "," + elem-3 ...` 17 | 18 | The text files should be a tab-separated list of `row_number`s and comma-separated row elements 19 | 20 | # Final Output Location 21 | 22 | This program produces various intermediate outputs. But the actual output (`L` and `U` matrices) are present in the paths ` + "-merged/lower"` and ` + "-merged/upper"`. 23 | 24 | It's shapes will correspond to the shapes defined above. 25 | 26 | **NOTE**: I have provided the input and all the output (intermediate and actual) folders, you can use them to verify your outputs. 27 | 28 | # Limitations 29 | 30 | This program uses Naive Gaussian Elimination method as mentioned eariler which produces a lot of intermediate outputs. This is fine for large datasets but as the daaset grows (with the number of input rows), this program will produce a lot of intermediate outputs which might cause a bottleneck on the I/O. 31 | 32 | **NOTE**: Disk I/O can be significantly improved using Chained mappers and reducers in the MR job. 
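Chaining is not implemented in this repository, but a rough sketch of what the NOTE above could look like with Hadoop's new-API chaining classes follows. It assumes a Hadoop release that ships `org.apache.hadoop.mapreduce.lib.chain` (2.x and later; this repository targets 1.2.1), and the mapper/reducer classes here are pass-through placeholders, not code from this project:

```java
// Sketch only: two map stages and one reduce stage inside a single job, so
// the intermediate records are handed between map stages in memory instead
// of being written to HDFS between separate jobs.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;

public class ChainedJobSketch {

  // Placeholder first map stage: passes each input row through as a key.
  public static class FirstMapper extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value, Context ctx) throws IOException, InterruptedException {
      ctx.write(value, new Text(""));
    }
  }

  // Placeholder second map stage: consumes the first stage's output directly.
  public static class SecondMapper extends Mapper<Text, Text, Text, Text> {
    public void map(Text key, Text value, Context ctx) throws IOException, InterruptedException {
      ctx.write(key, value);
    }
  }

  // Placeholder reducer at the end of the chain.
  public static class PassThroughReducer extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context ctx) throws IOException, InterruptedException {
      ctx.write(key, new Text(""));
    }
  }

  public static Job buildJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "chained-map-stages-sketch");
    job.setJarByClass(ChainedJobSketch.class);
    // Two map stages run back to back in the same map task.
    ChainMapper.addMapper(job, FirstMapper.class, LongWritable.class, Text.class, Text.class, Text.class, new Configuration(false));
    ChainMapper.addMapper(job, SecondMapper.class, Text.class, Text.class, Text.class, Text.class, new Configuration(false));
    // A single reduce stage closes the chain.
    ChainReducer.setReducer(job, PassThroughReducer.class, Text.class, Text.class, Text.class, Text.class, new Configuration(false));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    return job;
  }
}
```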
33 | 34 | # Deprecated 35 | 36 | This version is now depricated and you can find the newer, improved, low disk I/O version of this code at [LUDecomposition](https://github.com/punit-naik/MLHadoop/tree/master/LUDecomposition) 37 | -------------------------------------------------------------------------------- /lu_decomposition/input/test_input_4x4.txt: -------------------------------------------------------------------------------- 1 | 0 1,5,0,0 2 | 1 2,12,5,0 3 | 2 0,4,13,5 4 | 3 0,0,6,11 5 | -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-merged/lower/.part-r-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-merged/lower/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,0.0,0.0,0.0 2 | 1 2.0,1.0,0.0,0.0 3 | 2 0.0,2.0,1.0,0.0 4 | 3 0.0,0.0,2.0,1.0 5 | -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crcRR� -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-merged/upper/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,5.0,0.0,0.0 2 | 1 0.0,2.0,5.0,0.0 3 | 2 0.0,0.0,3.0,5.0 4 | 3 0.0,0.0,0.0,1.0 5 | -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/.part-m-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-0/.part-m-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-0/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/part-m-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,5.0,0.0,0.0 2 | 1,0 2.0 3 | 1 0.0,2.0,5.0,0.0 4 | 2,0 0.0 5 | 2 0.0,4.0,13.0,5.0 6 | 3,0 0.0 7 | 3 0.0,0.0,6.0,11.0 8 | -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/.part-m-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-1/.part-m-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-1/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/part-m-00000: -------------------------------------------------------------------------------- 1 | 1 0.0,2.0,5.0,0.0 2 | 0 1.0,5.0,0.0,0.0 3 | 2,1 2.0 4 | 2 0.0,0.0,3.0,5.0 5 | 3,1 0.0 6 | 3 0.0,0.0,6.0,11.0 7 | -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/.part-m-00000.crc: -------------------------------------------------------------------------------- 1 | crcЇغ -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-2/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/part-m-00000: -------------------------------------------------------------------------------- 1 | 2 0.0,0.0,3.0,5.0 2 | 1 0.0,2.0,5.0,0.0 3 | 0 1.0,5.0,0.0,0.0 4 | 3,2 2.0 5 | 3 0.0,0.0,0.0,1.0 6 | -------------------------------------------------------------------------------- /lu_decomposition/output/nth/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output/nth/.part-m-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/nth/.part-m-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output/nth/_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/nth/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output/nth/part-m-00000: -------------------------------------------------------------------------------- 1 | 2 0.0,0.0,3.0,5.0 2 | -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/total_records/.part-r-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/total_records/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 4 2 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/FindNthRow/find_nth_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.FindNthRow; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.NullWritable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | import lu_decomposition.naive_gausssian.io.LongAndTextWritable; 16 | 17 | public class find_nth_driver { 18 | 19 | public static String readNthRow (String path, Configuration conf) throws IOException { 20 | FileSystem hdfs=FileSystem.get(conf); 21 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path+"/part-m-00000")))); 22 | String records = br.readLine().split("\\s")[1]; 23 | br.close(); 24 | return records; 25 | } 26 | 27 | public static String run (String[] args, long n, long total_records) throws IOException, InterruptedException, ClassNotFoundException { 28 | return (n <= total_records-1) ? 
runSafely(args, n) :"fail"; 29 | } 30 | 31 | @SuppressWarnings("deprecation") 32 | public static String runSafely (String[] args, long n) throws IOException, InterruptedException, ClassNotFoundException { 33 | Configuration conf= new Configuration(); 34 | FileSystem hdfs=FileSystem.get(conf); 35 | // Deleting previous stored nth row 36 | hdfs.delete(new Path(args[1])); 37 | conf.setLong("n", n); 38 | Job job = new Job(conf); 39 | 40 | job.setJarByClass(find_nth_driver.class); 41 | 42 | job.setJobName("Finds the nth row of the HDFS file"); 43 | 44 | FileInputFormat.setInputPaths(job, new Path(args[0])); 45 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 46 | 47 | job.setMapperClass(find_nth_mapper.class); 48 | job.setNumReduceTasks(0); 49 | job.setOutputKeyClass(NullWritable.class); 50 | job.setOutputValueClass(LongAndTextWritable.class); 51 | 52 | job.waitForCompletion(true); 53 | 54 | return readNthRow(args[1], conf); 55 | }; 56 | 57 | } 58 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/FindNthRow/find_nth_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.FindNthRow; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | import lu_decomposition.naive_gausssian.io.LongAndTextWritable; 11 | 12 | public class find_nth_mapper extends Mapper { 13 | 14 | private LongWritable nthKey; 15 | private Text nthValue = null; 16 | 17 | @Override 18 | public void setup (Context context) throws IOException, InterruptedException { 19 | this.nthKey = new LongWritable(context.getConfiguration().getLong("n", 0)); 20 | } 21 | 22 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] input = value.toString().split("\\t"); 24 | if (!input[0].contains(",")) { 25 | LongWritable rowKey = new LongWritable(Long.valueOf(input[0])); 26 | if (rowKey.compareTo(this.nthKey) == 0) { 27 | this.nthValue = new Text(input[1]); 28 | } 29 | } 30 | } 31 | 32 | @Override 33 | public void cleanup(Context context) throws IOException, InterruptedException{ 34 | 35 | if (this.nthValue != null) 36 | context.write(null, new LongAndTextWritable(this.nthKey, this.nthValue)); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/MergeResults/merge_results_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.MergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | import lu_decomposition.naive_gausssian.io.NaturalKeyGroupingComparator; 11 | import lu_decomposition.naive_gausssian.io.TextPair; 12 | import lu_decomposition.naive_gausssian.io.TextPairComparator; 13 | import lu_decomposition.naive_gausssian.io.TextPairPartitioner; 14 | 15 | public class merge_results_driver { 16 | 17 | public static boolean runWithJob(Job job, String out_path) throws IOException, InterruptedException, ClassNotFoundException { 18 | 
job.setJarByClass(merge_results_driver.class); 19 | 20 | job.setJobName("Final Step: Merging results and creating separate LU decomposed components of input matrix"); 21 | 22 | FileOutputFormat.setOutputPath(job, new Path(out_path)); 23 | 24 | job.setMapperClass(lu_decomposition.naive_gausssian.MergeResults.merge_results_mapper.class); 25 | job.setReducerClass(lu_decomposition.naive_gausssian.MergeResults.merge_results_reducer.class); 26 | job.setMapOutputKeyClass(TextPair.class); 27 | job.setMapOutputValueClass(Text.class); 28 | job.setOutputKeyClass(TextPair.class); 29 | job.setOutputValueClass(Text.class); 30 | job.setPartitionerClass(TextPairPartitioner.class); 31 | job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); 32 | job.setSortComparatorClass(TextPairComparator.class); 33 | 34 | boolean success = job.waitForCompletion(true); 35 | return success; 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/MergeResults/merge_results_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.MergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | import lu_decomposition.naive_gausssian.io.TextPair; 10 | 11 | public class merge_results_mapper extends Mapper { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] parts = value.toString().split("\\t"); 24 | // Processing Upper Triangular Matrix's rows 25 | if (this.upper && !parts[0].contains(",")) { 26 | context.write(new TextPair(parts[0],""), new Text(parts[1])); 27 | } 28 | // Processing Lower Triangular Matrix's rows 29 | if (!this.upper && parts[0].contains(",")) { 30 | 31 | String[] rowCol = parts[0].split(","); 32 | String row = rowCol[0]; 33 | // Sending first row of Lower Triangular Matrix to the reducer 34 | if (Integer.valueOf(row)-1 == 0) { 35 | for (int i = 0; i < this.total_records; i++) { 36 | context.write(new TextPair("0",String.valueOf(i)), new Text(i+","+((i == 0) ? 
1 : 0))); 37 | } 38 | } 39 | String column = rowCol[1]; 40 | String element = parts[1]; 41 | context.write(new TextPair(row, column), new Text(column+","+element)); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/MergeResults/merge_results_reducer.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.MergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import lu_decomposition.naive_gausssian.lud_mapper; 9 | import lu_decomposition.naive_gausssian.io.TextPair; 10 | 11 | public class merge_results_reducer extends Reducer { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public static String arrayToCSV(String[] a) { 23 | String result = ""; 24 | if (a.length > 0) { 25 | StringBuilder sb = new StringBuilder(); 26 | for (String s : a) { 27 | sb.append(s).append(","); 28 | } 29 | result = sb.deleteCharAt(sb.length() - 1).toString(); 30 | } 31 | return result; 32 | } 33 | 34 | public void reduce(TextPair key, Iterable values, Context context) 35 | throws IOException, InterruptedException { 36 | if (this.upper) { 37 | for (Text val:values) { 38 | context.write(new TextPair(key.getFirst(),""), val); 39 | } 40 | } 41 | else { 42 | Double[] rowElements = new Double[this.total_records]; 43 | int row = Integer.valueOf(key.getFirst()); 44 | for (Text val:values) { 45 | String[] parts = val.toString().split(","); 46 | int j = Integer.valueOf(parts[0]); 47 | rowElements[j] = Double.valueOf(parts[1]); 48 | } 49 | // Setting Diagonal Elements as `1` in the lower triangular matrix rows 50 | rowElements[row] = (double) 1; 51 | 52 | for(int j = 0; j< this.total_records; j++) { 53 | if (rowElements[j] == null) { 54 | rowElements[j] = (double) 0; 55 | } 56 | } 57 | context.write(new TextPair(key.getFirst(),""), new Text(lud_mapper.arrayToCSV(rowElements))); 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/TotalRecords/total_records_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.TotalRecords; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class total_records_driver { 16 | 17 | public static long readTotalRecords (String path, Configuration conf) throws IOException { 18 | FileSystem hdfs=FileSystem.get(conf); 19 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path+"/part-r-00000")))); 20 | Long records = (long) 0; 21 | records = Long.valueOf(br.readLine().split("\\t")[1]); 22 | br.close(); 23 | return 
records; 24 | } 25 | 26 | @SuppressWarnings("deprecation") 27 | public static long run(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 28 | Configuration conf = new Configuration(); 29 | Job job = new Job(conf); 30 | 31 | job.setJarByClass(total_records_driver.class); 32 | 33 | job.setJobName("Just counting total rows of the HDFS input"); 34 | 35 | FileInputFormat.setInputPaths(job, new Path(args[0])); 36 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 37 | 38 | job.setMapperClass(total_records_mapper.class); 39 | 40 | job.setReducerClass(total_records_reducer.class); 41 | job.setCombinerClass(total_records_reducer.class); 42 | 43 | job.setOutputKeyClass(LongWritable.class); 44 | job.setOutputValueClass(LongWritable.class); 45 | 46 | //job.setInputFormatClass(TextInputFormat.class); 47 | //job.setOutputFormatClass(TextOutputFormat.class); 48 | 49 | job.waitForCompletion(true); 50 | 51 | return readTotalRecords(args[1], conf); 52 | }; 53 | } 54 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/TotalRecords/total_records_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.TotalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class total_records_mapper extends Mapper { 10 | 11 | private Long countRows = (long) 0; 12 | 13 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 14 | this.countRows++; 15 | } 16 | 17 | @Override 18 | public void cleanup(Context context) throws IOException, InterruptedException{ 19 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/TotalRecords/total_records_reducer.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.TotalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | public class total_records_reducer extends Reducer { 9 | 10 | private Long countRows = (long) 0; 11 | 12 | public void reduce(LongWritable key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | for(LongWritable val:values){ 15 | this.countRows += val.get(); 16 | } 17 | } 18 | 19 | @Override 20 | public void cleanup(Context context) throws IOException, InterruptedException{ 21 | 22 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/LongAndTextWritable.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.Writable; 10 | 11 | public class LongAndTextWritable implements Writable { 12 | 13 | private LongWritable rowKey; 14 | 
private Text rowValue; 15 | 16 | public LongAndTextWritable() { 17 | this.rowKey = new LongWritable(0); 18 | this.rowValue = new Text(""); 19 | } 20 | 21 | public LongAndTextWritable(LongWritable k, Text v) { 22 | this.rowKey = k; 23 | this.rowValue = v; 24 | } 25 | 26 | public LongWritable getKey() { 27 | return rowKey; 28 | } 29 | 30 | public Text getValue() { 31 | return rowValue; 32 | } 33 | 34 | @Override 35 | public void readFields(DataInput in) throws IOException { 36 | 37 | rowKey.readFields(in); 38 | rowValue.readFields(in); 39 | 40 | } 41 | 42 | @Override 43 | public void write(DataOutput out) throws IOException { 44 | 45 | rowKey.write(out); 46 | rowValue.write(out); 47 | } 48 | 49 | @Override 50 | public String toString() { 51 | return rowKey.toString() + "\t" + rowValue.toString(); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/NaturalKeyGroupingComparator.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class NaturalKeyGroupingComparator extends WritableComparator { 7 | protected NaturalKeyGroupingComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | return tp1.getFirst().compareTo(tp2.getFirst()); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/TextPair.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | public class TextPair implements WritableComparable { 10 | 11 | private String t1; 12 | private String t2; 13 | 14 | public String getFirst() { 15 | return this.t1; 16 | } 17 | 18 | public String getSecond() { 19 | return this.t2; 20 | } 21 | 22 | @Override 23 | public void readFields(DataInput in) throws IOException { 24 | this.t1 = in.readUTF(); 25 | this.t2 = in.readUTF(); 26 | } 27 | 28 | @Override 29 | public void write(DataOutput out) throws IOException { 30 | out.writeUTF(this.t1); 31 | out.writeUTF(this.t2); 32 | } 33 | 34 | public TextPair() { 35 | this.t1 = new String(); 36 | this.t2 = new String(); 37 | } 38 | 39 | public TextPair(String t1, String t2) { 40 | this.t1 = new String(t1); 41 | this.t2 = new String(t2); 42 | } 43 | 44 | public int compareTo(TextPair tp) { 45 | int sortKey = this.t1.compareTo(tp.getFirst()); 46 | if (sortKey == 0) { 47 | sortKey = this.t2.compareTo(tp.getSecond()); 48 | } 49 | return sortKey; 50 | } 51 | 52 | public String toString () { 53 | String s = ""; 54 | if (this.t2.compareTo("") == 0) { 55 | s += this.t1; 56 | } 57 | else { 58 | s += this.t1 + "," + this.t2; 59 | } 60 | return s; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/TextPairComparator.java: -------------------------------------------------------------------------------- 1 | package 
lu_decomposition.naive_gausssian.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class TextPairComparator extends WritableComparator { 7 | protected TextPairComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | int result = tp1.getFirst().compareTo(tp2.getFirst()); 17 | if(0 == result) { 18 | result = tp1.getSecond().compareTo(tp2.getSecond()); 19 | } 20 | return result; 21 | } 22 | } -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/TextPairPartitioner.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | public class TextPairPartitioner extends Partitioner{ 7 | @Override 8 | public int getPartition(TextPair tp, Text t, int numPartitions) { 9 | return tp.getFirst().hashCode() % numPartitions; 10 | } 11 | } -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/lud_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | import lu_decomposition.naive_gausssian.FindNthRow.find_nth_driver; 13 | import lu_decomposition.naive_gausssian.MergeResults.merge_results_driver; 14 | import lu_decomposition.naive_gausssian.TotalRecords.total_records_driver; 15 | 16 | public class lud_driver { 17 | 18 | public static String arrayToCSV(String[] a) { 19 | String result = ""; 20 | if (a.length > 0) { 21 | StringBuilder sb = new StringBuilder(); 22 | for (String s : a) { 23 | sb.append(s).append(","); 24 | } 25 | result = sb.deleteCharAt(sb.length() - 1).toString(); 26 | } 27 | return result; 28 | } 29 | 30 | @SuppressWarnings("deprecation") 31 | public static boolean runWithConf (String[] args, Configuration conf) throws IOException, InterruptedException, ClassNotFoundException { 32 | 33 | Job job = new Job(conf); 34 | 35 | job.setJarByClass(lud_driver.class); 36 | 37 | job.setJobName("Split a matrix into it's LU decomposed components using the Naive Gaussian Elimination method"); 38 | long n = conf.getLong("n", 0); 39 | FileInputFormat.setInputPaths(job, new Path((n==0)?args[0]:(args[1]+"-run-"+(n-1)))); 40 | FileOutputFormat.setOutputPath(job, new Path(args[1]+"-run-"+n)); 41 | job.setNumReduceTasks(0); 42 | job.setMapperClass(lud_mapper.class); 43 | job.setOutputKeyClass(Text.class); 44 | job.setOutputValueClass(Text.class); 45 | 46 | boolean success = job.waitForCompletion(true); 47 | 48 | return success; 49 | }; 50 | 51 | @SuppressWarnings("deprecation") 52 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException { 53 | String input = args[0]; 54 | String output = args[1]; 55 | String 
total_records_output = output + "/total_records"; 56 | String[] total_records_args = {input, total_records_output}; 57 | String find_nth_row_output = output + "/nth"; 58 | // MR Job: Finding Total Records 59 | long total_records = total_records_driver.run(total_records_args); 60 | String[] lud_args = {input, output}; 61 | Configuration conf = new Configuration(); 62 | 63 | for(long n = 0; n < total_records-1; n++) { 64 | String find_nth_row_input = (n==0) ? input : output+"-run-"+(n-1); 65 | String[] find_nth_row_args = {find_nth_row_input, find_nth_row_output}; 66 | // MR Job: Finding Nth Record 67 | String nVal = find_nth_driver.run(find_nth_row_args, n, total_records); 68 | conf.setLong("n", n); 69 | conf.setLong("total_records", total_records); 70 | conf.set("nVal", nVal); 71 | // MR Job: Running LU Decomposition on the input 72 | runWithConf(lud_args, conf); 73 | } 74 | 75 | // MR Job(s): Merging Outputs 76 | conf.setBoolean("upper", false); 77 | Job job = new Job(conf); 78 | String[] path = new String[(int) (total_records-1)]; 79 | for(long n = 0; n < total_records-1; n++) { 80 | path[(int) n] = (output+"-run-"+n); 81 | } 82 | FileInputFormat.setInputPaths(job, arrayToCSV(path)); 83 | merge_results_driver.runWithJob(job, output+"-merged/lower"); 84 | conf.setBoolean("upper", true); 85 | job = new Job(conf); 86 | FileInputFormat.addInputPath(job, new Path(output+"-run-"+(total_records-2))); 87 | merge_results_driver.runWithJob(job, output+"-merged/upper"); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/lud_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class lud_mapper extends Mapper { 10 | 11 | private int n; 12 | private Double[] nVal; 13 | private int total_records; 14 | 15 | public Double[] stringToDoubleArray(String[] a) { 16 | 17 | Double[] x = new Double[a.length]; 18 | 19 | for(int i = 0; i < this.total_records; i++) { 20 | x[i] = Double.valueOf(a[i]); 21 | } 22 | 23 | return x; 24 | 25 | } 26 | 27 | public static String arrayToCSV(Double[] nVal2) { 28 | String result = ""; 29 | 30 | if (nVal2.length > 0) { 31 | StringBuilder sb = new StringBuilder(); 32 | 33 | for (Double s : nVal2) { 34 | sb.append(s).append(","); 35 | } 36 | 37 | result = sb.deleteCharAt(sb.length() - 1).toString(); 38 | } 39 | return result; 40 | } 41 | 42 | @Override 43 | public void setup (Context context) throws IOException, InterruptedException { 44 | this.n = (int) context.getConfiguration().getLong("n", 0); 45 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 46 | this.nVal = stringToDoubleArray(context.getConfiguration().get("nVal").split(",")); 47 | 48 | context.write(new Text(String.valueOf(this.n)), new Text(arrayToCSV(this.nVal))); 49 | } 50 | 51 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 52 | String[] parts = value.toString().split("\\t"); 53 | if(!parts[0].contains(",")) { 54 | long row = Long.valueOf(parts[0]); 55 | if (row > this.n) { 56 | Double[] rowElements = stringToDoubleArray(parts[1].split(",")); 57 | Double multiplier = (double) (rowElements[this.n]/this.nVal[this.n]); 58 | context.write(new 
Text(row+","+this.n), new Text(String.valueOf(multiplier))); 59 | Double[] rowElementsModified = new Double[this.total_records]; 60 | for (int i = 0; i< this.total_records; i++) { 61 | rowElementsModified[i] = (Double) (rowElements[i] - this.nVal[i]*multiplier); 62 | } 63 | if (row != 0) 64 | context.write(new Text(String.valueOf(row)), new Text(arrayToCSV(rowElementsModified))); 65 | } 66 | else { 67 | if (Long.valueOf(parts[0]) != this.n) 68 | context.write(new Text(parts[0]), new Text(parts[1])); 69 | } 70 | } 71 | } 72 | 73 | } 74 | --------------------------------------------------------------------------------
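To make the iterative flow above easier to follow (count the rows, pick the nth pivot row, run one elimination pass per pivot with lud_mapper, then merge the L and U components), here is a minimal single-machine sketch of the same Naive Gaussian elimination. It uses the 4x4 matrix from test_input_4x4.txt, and the printed L and U match the merged lower/upper outputs shown earlier; the class itself is illustrative and not part of the repository.

```java
// Sketch only: what the repeated lud_mapper runs compute, done in memory.
// For each pivot row p, every row i below it stores the multiplier
// u[i][p] / u[p][p] into L and subtracts multiplier * (pivot row) from
// itself, which leaves U in place. The diagonal of L is 1.
public class NaiveGaussianLuSketch {
  public static void main(String[] args) {
    double[][] u = {          // input matrix from test_input_4x4.txt
      {1, 5, 0, 0},
      {2, 12, 5, 0},
      {0, 4, 13, 5},
      {0, 0, 6, 11}
    };
    int n = u.length;
    double[][] l = new double[n][n];
    for (int i = 0; i < n; i++) l[i][i] = 1.0;   // unit diagonal of L

    for (int p = 0; p < n - 1; p++) {            // one MR "run" per pivot p
      for (int i = p + 1; i < n; i++) {
        double multiplier = u[i][p] / u[p][p];   // the "row,col <tab> multiplier" records
        l[i][p] = multiplier;
        for (int j = 0; j < n; j++) {
          u[i][j] -= multiplier * u[p][j];       // eliminate column p from row i
        }
      }
    }
    print("L", l);   // expected rows: 1,0,0,0 / 2,1,0,0 / 0,2,1,0 / 0,0,2,1
    print("U", u);   // expected rows: 1,5,0,0 / 0,2,5,0 / 0,0,3,5 / 0,0,0,1
  }

  private static void print(String name, double[][] m) {
    System.out.println(name);
    for (double[] row : m) {
      StringBuilder sb = new StringBuilder();
      for (double x : row) sb.append(x).append(",");
      System.out.println(sb.substring(0, sb.length() - 1));
    }
  }
}
```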