├── Decision_Tree_ID3_MapReduce ├── ReadMe.txt ├── input │ └── 1.txt ├── output │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ └── part-r-00000 └── src │ ├── BuildTree.java │ ├── DT_ID3_Driver.java │ ├── DT_ID3_Map.java │ └── DT_ID3_Reduce.java ├── KMeansClustering_MapReduce ├── Readme.txt ├── input │ └── 1.txt ├── output │ └── 1.txt └── src │ ├── KMeansCentroidCalculationDriver_ClassificationDriver.java │ ├── KMeansCentroidCalculationMap.java │ ├── KMeansCentroidCalculationReduce.java │ └── KMeansClassificationReduce.java ├── KNN_MapReduce ├── Readme.txt ├── input │ ├── input_to_be_classified.txt │ └── iris_training_data.txt └── src │ ├── Driver.java │ ├── Map.java │ └── Reduce.java ├── LICENSE ├── LUDecomposition ├── .classpath ├── .project ├── README.md ├── input │ └── test_input_4x4.txt ├── output │ ├── .nth.crc │ ├── LU_Components │ │ ├── L │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-r-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-r-00000 │ │ └── U │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-r-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-r-00000 │ ├── after-2-runs │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 │ ├── after-3-runs │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 │ ├── nth │ └── total_records │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000.crc │ │ ├── _SUCCESS │ │ └── part-r-00000 └── src │ └── lud │ ├── Utils.java │ ├── io │ ├── LongAndTextWritable.java │ ├── NaturalKeyGroupingComparator.java │ ├── TextPair.java │ ├── TextPairComparator.java │ └── TextPairPartitioner.java │ └── naiveGaussian │ ├── initial_input_mapper.java │ ├── lud_driver.java │ ├── lud_mapper.java │ ├── lud_reducer.java │ ├── mergeResults │ ├── merge_results_driver.java │ ├── merge_results_mapper.java │ └── merge_results_reducer.java │ └── totalRecords │ ├── total_records_driver.java │ ├── total_records_mapper.java │ └── total_records_reducer.java ├── LinearRegression_MapReduce ├── Readme.txt ├── input │ └── linear.txt └── src │ ├── Driver.java │ ├── thetaMAP.java │ └── thetaREDUCE.java ├── LogisticRegression_MapReduce ├── Readme.txt ├── input │ └── diabetes.txt └── src │ ├── Driver.java │ ├── thetaMAP.java │ └── thetaREDUCE.java ├── Market-Basket-Analysis_MapReduce ├── ReadMe.txt ├── input │ └── in.txt.txt ├── output │ └── part-r-00000 └── src │ ├── MBA_Driver.java │ ├── MBA_Mapper.java │ └── MBA_Reducer.java ├── MatrixMultiplication_MapReduce ├── Readme.txt ├── input │ └── 1.txt └── src │ ├── MatMulDriver.java │ ├── MatMulMap.java │ └── MatMulReduce.java ├── Mutual-Friends_MapReduce ├── ReadMe.txt ├── input │ └── in.txt.txt ├── output │ └── part-r-00000 └── src │ ├── MF_Driver.java │ ├── MF_Mapper.java │ ├── MF_Reducer.java │ └── gen_mutual_friends_matrix.java ├── Naive_Bayes_Classifier_MapReduce ├── Readme.txt ├── input │ └── 1.txt ├── output │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ └── part-r-00000 └── src │ ├── NBCDriver.java │ ├── NBCMap.java │ └── NBCReduce.java ├── README.md ├── Recommendation_Collaborative_Filtering_MapReduce ├── Readme.txt ├── input │ └── recommendation.txt ├── outputs │ ├── Intermediate_output │ │ └── part-r-00000 │ ├── final_output │ │ └── part-r-00000 │ └── n.txt └── src │ ├── FinalMap.java │ ├── FinalReduce.java │ ├── RecDriver.java │ ├── RecMap.java │ ├── RecReduce.java │ ├── get_co_oc_mat.java │ ├── get_unique_items.java │ └── get_unique_users.java ├── Top_N_MapReduce ├── ReadMe.txt ├── in │ ├── 1.txt │ ├── 2.txt │ └── 3.txt ├── out │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── 
part-r-00000 └── src │ ├── Top_N_Driver.java │ ├── Top_N_Mapper.java │ └── Top_N_Reducer.java └── lu_decomposition ├── .classpath ├── .project ├── README.md ├── input └── test_input_4x4.txt ├── output-merged ├── lower │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 └── upper │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 ├── output-run-0 ├── ._SUCCESS.crc ├── .part-m-00000.crc ├── _SUCCESS └── part-m-00000 ├── output-run-1 ├── ._SUCCESS.crc ├── .part-m-00000.crc ├── _SUCCESS └── part-m-00000 ├── output-run-2 ├── ._SUCCESS.crc ├── .part-m-00000.crc ├── _SUCCESS └── part-m-00000 ├── output ├── nth │ ├── ._SUCCESS.crc │ ├── .part-m-00000.crc │ ├── _SUCCESS │ └── part-m-00000 └── total_records │ ├── ._SUCCESS.crc │ ├── .part-r-00000.crc │ ├── _SUCCESS │ └── part-r-00000 └── src └── lu_decomposition └── naive_gausssian ├── FindNthRow ├── find_nth_driver.java └── find_nth_mapper.java ├── MergeResults ├── merge_results_driver.java ├── merge_results_mapper.java └── merge_results_reducer.java ├── TotalRecords ├── total_records_driver.java ├── total_records_mapper.java └── total_records_reducer.java ├── io ├── LongAndTextWritable.java ├── NaturalKeyGroupingComparator.java ├── TextPair.java ├── TextPairComparator.java └── TextPairPartitioner.java ├── lud_driver.java └── lud_mapper.java /Decision_Tree_ID3_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This is a MapReduce implementation of the ID3 (Iterative Dichotomiser 3) Decision Tree algorithm. 2 | 3 | This program accepts 2 user inputs: 4 | 5 | 1. The input path 6 | 7 | 2. The output path 8 | 9 | The example input that I have used in this project is a file which tells us about a student's activity given certain factors. 10 | So the factors are (in order): Deadline?, Is there a Party?, Is he/she lazy? and finally the output is Activity. 11 | 12 | So the 0,1,2 in the output file are nothing but Deadline?, Is there a Party? and Is he/she lazy? respectively. 13 | 14 | Although I get the correct outputs from this algorithm, there are still a lot of shortcomings in the code which will be fixed in time. 15 | 16 | This algorithm computes a decision tree on each data block and then merges (roughly averages) all the trees at the reducer, which is computationally expensive for a single reducer and calls for a slightly different approach. 17 | 18 | The output of this algorithm is a simple string which represents a graph (of sorts, since I did not get time to write a Graph API). 19 | Every line starts with an integer which is the node, followed by the connected nodes as values. 20 | On every line, outputs are separated by "|". If an output doesn't have a ";"-separated value attached to it, then it is a node 21 | which will be split further, and its contents are written on the next line. 22 | 23 | I use a LinkedHashMap to store the graph, since it preserves the insertion order of all the entries in the map.
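To make that format concrete, here is a minimal, hypothetical parsing sketch (not part of this repository). It reads one line of the sample `part-r-00000` output shown below, where the leading integer is the feature index and each `|`-separated token is either `value;class` (a leaf) or a bare `value` (a branch that is split on a later line):

```java
import java.util.LinkedHashMap;
import java.util.Map;

public class ParseTreeLine {
  public static void main(String[] args) {
    // Sample line taken from the part-r-00000 output shown below.
    String line = "0,Urgent;Study|Near|None;Pub";
    int comma = line.indexOf(',');
    String node = line.substring(0, comma);              // index of the feature used for the split
    Map<String, String> branches = new LinkedHashMap<>();
    for (String token : line.substring(comma + 1).split("\\|")) {
      String[] parts = token.split(";", 2);
      // parts[0] is a feature value; parts[1] (if present) is the predicted class for that
      // value, otherwise the branch is split further on a following output line.
      branches.put(parts[0], parts.length == 2 ? parts[1] : "<split on a later line>");
    }
    System.out.println("feature " + node + " -> " + branches);
    // Prints: feature 0 -> {Urgent=Study, Near=<split on a later line>, None=Pub}
  }
}
```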
24 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | Urgent,Yes,Yes,Party 2 | Urgent,No,Yes,Study 3 | Near,Yes,Yes,Party 4 | None,Yes,No,Party 5 | None,No,Yes,Pub 6 | None,Yes,No,Party 7 | Near,No,No,Study 8 | Near,No,Yes,TV 9 | Near,Yes,Yes,Party 10 | Urgent,No,No,Study 11 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/output/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Decision_Tree_ID3_MapReduce/output/.part-r-00000.crc -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1,Yes;Party|No 2 | 0,Urgent;Study|Near|None;Pub 3 | 2,Yes;TV|No;Study 4 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/BuildTree.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.HashMap; 3 | import java.util.Iterator; 4 | import java.util.LinkedHashMap; 5 | import java.util.Map; 6 | import java.util.Map.Entry; 7 | 8 | public class BuildTree{ 9 | public static int feat_count=0; 10 | public static LinkedHashMap p=new LinkedHashMap(); 11 | public static HashMap nodes=new HashMap(); 12 | public static HashMap gain= new HashMap(); 13 | public static HashMap intermediate= new HashMap(); 14 | public static HashMap feature_count= new HashMap(); 15 | public static HashMap outcome_count= new HashMap(); 16 | 17 | public static String[] getMax_m(HashMap x){ 18 | String maxKey=""; 19 | Double maxValue=Double.NEGATIVE_INFINITY; 20 | for(Entry e:x.entrySet()){ 21 | if(e.getValue()>maxValue){ 22 | maxKey=e.getKey(); 23 | maxValue=e.getValue(); 24 | } 25 | } 26 | String[] s=new String[2]; 27 | s[0]=String.valueOf(maxKey); 28 | s[1]=String.valueOf(maxValue); 29 | return s; 30 | } 31 | 32 | public static String[] getMax(HashMap x){ 33 | int maxKey=-1; 34 | Double maxValue=Double.NEGATIVE_INFINITY; 35 | for(Entry e:x.entrySet()){ 36 | if(e.getValue()>maxValue && !nodes.containsKey(String.valueOf(e.getKey()))){ 37 | maxKey=e.getKey(); 38 | maxValue=e.getValue(); 39 | } 40 | } 41 | String[] s=new String[2]; 42 | s[0]=String.valueOf(maxKey); 43 | s[1]=String.valueOf(maxValue); 44 | return s; 45 | } 46 | 47 | public static LinkedHashMap build(LinkedHashMap g,ArrayList data,int size){ 48 | if(p.size()==0) 49 | p.putAll(g); 50 | if(feat_count==0) 51 | feat_count=data.get(0).split("\\,").length-1; 52 | 53 | for(int i=0;i e:feature_count.entrySet()){ 72 | String[] key=e.getKey().split("\\,"); 73 | if(intermediate.containsKey(Integer.parseInt(key[0]))) 74 | intermediate.put(Integer.parseInt(key[0]), intermediate.get(Integer.parseInt(key[0]))+","+key[1]+":"+(e.getValue())); 75 | else 76 | intermediate.put(Integer.parseInt(e.getKey().split("\\,")[0]), String.valueOf(key[1]+":"+e.getValue())); 77 | } 78 | // Calculating the entropy of the whole Set. 
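// (Explanatory comment added to this listing, not part of the original source.)
// The code below computes the standard ID3 quantities. For a set S with class proportions p_c:
//   entropy(S) = - sum over classes c of  p_c * log2(p_c)
//   gain(S, A) = entropy(S) - sum over values v of A of  (|S_v| / |S|) * entropy(S_v)
// where S_v is the subset of S in which feature A takes the value v. The feature with the
// highest gain (accumulated in the `gain` map) is chosen as the next node to split on.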
79 | double entropy=0.0; 80 | for(Entry e:outcome_count.entrySet()){ 81 | double p=((e.getValue()/size)); 82 | entropy+=-(p*(Math.log(p)/Math.log(2))); 83 | } 84 | 85 | // Initialising the gain Map with all the keys 86 | // and the initial information gain which is ofcourse 87 | // the entropy of whole Set. 88 | for(int i=0;i e:intermediate.entrySet()){ 92 | if(gain.containsKey(e.getKey())){ 93 | double info_gain_except_the_entropy=0.0; 94 | String[] counts=e.getValue().split("\\,"); 95 | HashMap feat=new HashMap(); 96 | for(int j=0;j r:feat.entrySet()){ 103 | String[] c=r.getValue().split("\\,"); 104 | int num=0; 105 | for(int x=0;x test=new HashMap(); 122 | for(Entry z:feature_count.entrySet()){ 123 | String[] parts=z.getKey().split("\\,"); 124 | if(parts[0].contentEquals(key)){ 125 | if(test.containsKey(parts[1]+";"+parts[2])) 126 | test.put(parts[1]+";"+parts[2], test.get(parts[1]+";"+parts[2])+1); 127 | else 128 | test.put(parts[1]+";"+parts[2], (double) 1); 129 | } 130 | } 131 | String return_value=(key+","+getMax_m(test)[0]); 132 | HashMap ret=new HashMap(); 133 | ret.put(key, getMax_m(test)[0]); 134 | if(p.containsKey(key)) 135 | p.put(key, p.get(key)+"|"+getMax_m(test)[0]); 136 | else 137 | p.put(key, getMax_m(test)[0]); 138 | ArrayList indices=new ArrayList(); 139 | for(int i=0;i test2=new HashMap(); 164 | for(Entry E:test.entrySet()){ 165 | if(test2.containsKey(E.getKey().split("\\;")[0])) 166 | test2.put(E.getKey().split("\\;")[0], test2.get(E.getKey().split("\\;")[0])+1); 167 | else 168 | test2.put(E.getKey().split("\\;")[0], (double) 1); 169 | } 170 | Iterator> it1=test.entrySet().iterator(),it2=test2.entrySet().iterator(); 171 | while (it1.hasNext() && it2.hasNext()){ 172 | Map.Entry pairs1=(Entry) it1.next(); 173 | Map.Entry pairs2=(Entry) it2.next(); 174 | 175 | if(p.containsKey(key)) 176 | if(pairs2.getValue()==(double) 1) 177 | p.put(key, p.get(key)+"|"+pairs1.getKey()); 178 | else 179 | p.put(key, p.get(key)+"|"+pairs2.getKey()); 180 | else 181 | if(pairs2.getValue()==(double) 1) 182 | p.put(key, pairs1.getKey()); 183 | else 184 | p.put(key, pairs2.getKey()); 185 | } 186 | int r=0; 187 | String vl=""; 188 | for(Entry n:p.entrySet()){ 189 | ++r; 190 | if(r==p.size()){ 191 | String[] i=n.getValue().split("\\|"); 192 | int count=i.length-1; 193 | for(int v=0;v test2=new HashMap(); 208 | for(Entry E:test.entrySet()){ 209 | if(test2.containsKey(E.getKey().split("\\;")[0])) 210 | test2.put(E.getKey().split("\\;")[0], test2.get(E.getKey().split("\\;")[0])+1); 211 | else 212 | test2.put(E.getKey().split("\\;")[0], (double) 1); 213 | } 214 | Iterator> it1=test.entrySet().iterator(),it2=test2.entrySet().iterator(); 215 | while (it1.hasNext() && it2.hasNext()){ 216 | Map.Entry pairs1=(Entry) it1.next(); 217 | Map.Entry pairs2=(Entry) it2.next(); 218 | 219 | if(p.containsKey(key)) 220 | if(pairs2.getValue()==(double) 1) 221 | p.put(key, p.get(key)+"|"+pairs1.getKey()); 222 | else 223 | p.put(key, p.get(key)+"|"+pairs2.getKey()); 224 | else 225 | if(pairs2.getValue()==(double) 1) 226 | p.put(key, pairs1.getKey()); 227 | else 228 | p.put(key, pairs2.getKey()); 229 | } 230 | return build(p,data,data.size()); 231 | } 232 | } 233 | } -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/DT_ID3_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import 
org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | 9 | public class DT_ID3_Driver { 10 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 11 | Configuration conf=new Configuration(); 12 | Job job = new Job(conf); 13 | job.setJarByClass(DT_ID3_Driver.class); 14 | job.setJobName("Decision_Tree_Algorithm_on_Hadoop"); 15 | FileInputFormat.setInputPaths(job, new Path(args[0])); 16 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 17 | //job.setNumReduceTasks(0); 18 | job.setMapperClass(DT_ID3_Map.class); 19 | job.setReducerClass(DT_ID3_Reduce.class); 20 | job.setMapOutputKeyClass(Text.class); 21 | job.setMapOutputValueClass(Text.class); 22 | job.setOutputKeyClass(Text.class); 23 | job.setOutputValueClass(Text.class); 24 | boolean success = job.waitForCompletion(true); 25 | System.exit(success ? 0 : 1); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/DT_ID3_Map.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.ArrayList; 3 | import java.util.LinkedHashMap; 4 | import java.util.Map.Entry; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class DT_ID3_Map extends Mapper{ 10 | public static int count=0; 11 | public static ArrayList input=new ArrayList(); 12 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{ 13 | input.add(value.toString()); 14 | ++count; 15 | } 16 | 17 | @Override 18 | public void cleanup(Context context) throws IOException, InterruptedException{ 19 | LinkedHashMap g = new LinkedHashMap(); 20 | LinkedHashMap t=BuildTree.build(g,input, count); 21 | String key=""; 22 | int c=0; 23 | for(Entry T:t.entrySet()){ 24 | ++c; 25 | key=T.getKey()+","+T.getValue(); 26 | System.out.println("key: "+key+" c: "+c); 27 | context.write(new Text(String.valueOf(c)), new Text(key)); 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /Decision_Tree_ID3_MapReduce/src/DT_ID3_Reduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class DT_ID3_Reduce extends Reducer{ 8 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 9 | HashMap counts=new HashMap(); 10 | String maxKey=""; 11 | int maxValue=-1; 12 | for(Text value:values){ 13 | if(counts.containsKey(value.toString())) 14 | counts.put(value.toString(), counts.get(value.toString())+1); 15 | else 16 | counts.put(value.toString(), 1); 17 | } 18 | for(Entry e:counts.entrySet()){ 19 | if(e.getValue()>maxValue){ 20 | maxKey=e.getKey(); 21 | maxValue=e.getValue(); 22 | } 23 | } 24 | context.write(null, new Text(maxKey)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input file is just filled with delimited 2D 
points and this algorithm tries to cluster them. 2 | 3 | Initially, the output folder needs to be created and must be filled with a file which contains the initial cluster centers, which are all zero in this example. 4 | 5 | The old and new centers are tab separated. The centers on the left are old and the centers on the right are new. Initially both are zero. 6 | 7 | This algorithm takes in 4 arguments as follows: 8 | 9 | 1. Number of centroids 10 | 2. The dimension of the input points 11 | 3. The input data 12 | 4. The output data 13 | -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | 1,1|2,2|3,3|4,4 2 | 10,10|20,20|30,30|40,40 3 | 60,60|70,70|80,80|90,90 -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/output/1.txt: -------------------------------------------------------------------------------- 1 | 0.0,0.0 0.0,0.0 2 | 0.0,0.0 0.0,0.0 3 | 0.0,0.0 0.0,0.0 -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/src/KMeansCentroidCalculationDriver_ClassificationDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import java.util.ArrayList; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class KMeansCentroidCalculationDriver_ClassificationDriver{ 14 | public static boolean isdone=false; 15 | public static String num_centers; 16 | public static String dimension; 17 | public static void main(String[] args) throws Exception{ 18 | Configuration conf= new Configuration(); 19 | //args[0] is the number of centers to be used. 20 | num_centers=args[0]; 21 | //args[1] is the dimension of the input. 22 | dimension=args[1]; 23 | conf.setInt("noc", Integer.parseInt(num_centers)); 24 | conf.setInt("dimension", Integer.parseInt(dimension)); 25 | int iter=0; 26 | FileSystem hdfs=FileSystem.get(conf); 27 | ArrayList centers=new ArrayList(); 28 | //args[3] is the output path. Initially it will contain a single file 29 | //in which old and new centroids will be assigned to 0.0.
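// (Explanatory comment added to this listing, not part of the original source.)
// Each line of the file read below has the tab-separated form described in the Readme:
//     oldX,oldY <TAB> newX,newY
// i.e. the previous centroid on the left and the re-computed centroid on the right.
// The driver loads the current centroids, publishes them to the mappers through the job
// Configuration as c0, c1, ... (x and y values interleaved), runs one clustering pass,
// and then run() repeats the whole procedure, presumably until the old and new centroids
// stop changing (at which point isdone is set to true and the while loop exits).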
30 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(args[3])))); 31 | String line=null; 32 | while((line=br.readLine())!=null){ 33 | ++iter; 34 | String[] tok=line.split("\\\t"); 35 | String[] centroids_new= tok[1].split("\\,"); 36 | centers.add(Float.parseFloat(centroids_new[0])); 37 | centers.add(Float.parseFloat(centroids_new[1])); 38 | } 39 | br.close(); 40 | for(int i=1;i<=Integer.parseInt(num_centers);i++){ 41 | if(iter==i){ 42 | for(int j=0;j<((Integer.parseInt(num_centers)-iter)*2);j++){ 43 | centers.add((float) 0); 44 | } 45 | } 46 | } 47 | if(hdfs.exists(new Path(args[3]))){ 48 | hdfs.delete(new Path(args[3]),true); 49 | } 50 | hdfs.close(); 51 | for(int i=0;i<(Integer.parseInt(num_centers)*2);i++){ 52 | conf.setFloat("c".concat(String.valueOf(i)) , centers.get(i)); 53 | } 54 | Job job = new Job(conf,"K-Means Clustering MapReduce"); 55 | job.setJarByClass(KMeansCentroidCalculationDriver_ClassificationDriver.class); 56 | //args[2] is the input path. 57 | FileInputFormat.setInputPaths(job, new Path(args[2])); 58 | FileOutputFormat.setOutputPath(job, new Path(args[3])); 59 | job.setMapperClass(KMeansCentroidCalculationMap.class); 60 | job.setCombinerClass(KMeansCentroidCalculationReduce.class); 61 | job.setReducerClass(KMeansCentroidCalculationReduce.class); 62 | job.setMapOutputKeyClass(Text.class); 63 | job.setMapOutputValueClass(Text.class); 64 | job.setOutputKeyClass(Text.class); 65 | job.setOutputValueClass(Text.class); 66 | job.waitForCompletion(true); 67 | while(isdone==false){ 68 | run(args); 69 | } 70 | } 71 | public static void run(String[] args) throws IOException, ClassNotFoundException, InterruptedException{ 72 | Configuration conf_med=new Configuration(); 73 | conf_med.setInt("noc", Integer.parseInt(num_centers)); 74 | conf_med.setInt("dimension", Integer.parseInt(dimension)); 75 | int iter_med=0; 76 | ArrayList centers_old=new ArrayList(); 77 | ArrayList centers_new=new ArrayList(); 78 | FileSystem hdfs_med=FileSystem.get(conf_med); 79 | BufferedReader br_med = new BufferedReader(new InputStreamReader(hdfs_med.open(new Path(args[3])))); 80 | String line_med=null; 81 | while((line_med=br_med.readLine())!=null){ 82 | ++iter_med; 83 | String[] tok= line_med.split("\\\t"); 84 | String[] centroids_old= tok[0].split("\\,"); 85 | String[] centroids_new= tok[1].split("\\,"); 86 | centers_old.add(Float.parseFloat(centroids_old[0])); 87 | centers_old.add(Float.parseFloat(centroids_old[1])); 88 | centers_new.add(Float.parseFloat(centroids_new[0])); 89 | centers_new.add(Float.parseFloat(centroids_new[1])); 90 | } 91 | br_med.close(); 92 | for(int i=1;i<=Integer.parseInt(num_centers);i++){ 93 | if(iter_med==i){ 94 | for(int j=0;j<((Integer.parseInt(num_centers)-iter_med)*2);j++){ 95 | centers_old.add((float) 0); 96 | centers_new.add((float) 0); 97 | } 98 | } 99 | } 100 | if(hdfs_med.exists(new Path(args[3]))){ 101 | hdfs_med.delete(new Path(args[3]),true); 102 | } 103 | hdfs_med.close(); 104 | ArrayList ond = new ArrayList(); 105 | for(int i=0;i{ 10 | public static HashMap map=new HashMap(); 11 | public static Double minkey=(double) 0; 12 | public static int noc=0, dimension=0; 13 | public static ArrayList centers=new ArrayList(); 14 | public static Double minvalue=Double.POSITIVE_INFINITY; 15 | public static float euc_dist(Float[] a, Float[] b,int num){ 16 | float distance=0; 17 | float val=0; 18 | for(int i=0;i entry: map.entrySet()){ 46 | if(entry.getValue(){ 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, 
InterruptedException{ 8 | int count=0; 9 | float x_sum=0, y_sum=0; 10 | for(Text val:values){ 11 | StringTokenizer xy_points=new StringTokenizer(val.toString(),","); 12 | float x_point=Float.parseFloat(xy_points.nextToken()); 13 | float y_point=Float.parseFloat(xy_points.nextToken()); 14 | x_sum+=x_point; 15 | y_sum+=y_point; 16 | count++; 17 | } 18 | context.write(key,new Text((x_sum/count)+","+(y_sum/count))); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /KMeansClustering_MapReduce/src/KMeansClassificationReduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.io.Text; 3 | import org.apache.hadoop.mapreduce.Reducer; 4 | 5 | public class KMeansClassificationReduce extends Reducer{ 6 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 7 | for(Text val:values){ 8 | context.write(key,val); 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /KNN_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The Iris dataset has been used as the input. 2 | 3 | In this example, the 3 nearest neighbours are used in the map phase. In the reduce phase, the dominant class is selected from all the classes sent by the mappers. 4 | 5 | If the data size is very large, you can disable the reduce phase by setting the number of reduce tasks to zero, run a new MapReduce job on the data that would have been the input to the old job's reducer, and include the logic of the old job's reducer in the mapper and reducer of the new job to find the dominant class of the input. 6 | 7 | This algorithm takes in 4 arguments as follows: 8 | 9 | 1. The input file waiting to be classified (features separated by whitespaces in a single file) 10 | 2. The name of the input entity to be classified 11 | 3. The input (training) dataset 12 | 4.
The output 13 | -------------------------------------------------------------------------------- /KNN_MapReduce/input/input_to_be_classified.txt: -------------------------------------------------------------------------------- 1 | 5.1 3.5 1.4 0.2 -------------------------------------------------------------------------------- /KNN_MapReduce/input/iris_training_data.txt: -------------------------------------------------------------------------------- 1 | 5.1 3.5 1.4 0.2 "setosa" 2 | 4.9 3 1.4 0.2 "setosa" 3 | 4.7 3.2 1.3 0.2 "setosa" 4 | 4.6 3.1 1.5 0.2 "setosa" 5 | 5 3.6 1.4 0.2 "setosa" 6 | 5.4 3.9 1.7 0.4 "setosa" 7 | 4.6 3.4 1.4 0.3 "setosa" 8 | 5 3.4 1.5 0.2 "setosa" 9 | 4.4 2.9 1.4 0.2 "setosa" 10 | 4.9 3.1 1.5 0.1 "setosa" 11 | 5.4 3.7 1.5 0.2 "setosa" 12 | 4.8 3.4 1.6 0.2 "setosa" 13 | 4.8 3 1.4 0.1 "setosa" 14 | 4.3 3 1.1 0.1 "setosa" 15 | 5.8 4 1.2 0.2 "setosa" 16 | 5.7 4.4 1.5 0.4 "setosa" 17 | 5.4 3.9 1.3 0.4 "setosa" 18 | 5.1 3.5 1.4 0.3 "setosa" 19 | 5.7 3.8 1.7 0.3 "setosa" 20 | 5.1 3.8 1.5 0.3 "setosa" 21 | 5.4 3.4 1.7 0.2 "setosa" 22 | 5.1 3.7 1.5 0.4 "setosa" 23 | 4.6 3.6 1 0.2 "setosa" 24 | 5.1 3.3 1.7 0.5 "setosa" 25 | 4.8 3.4 1.9 0.2 "setosa" 26 | 5 3 1.6 0.2 "setosa" 27 | 5 3.4 1.6 0.4 "setosa" 28 | 5.2 3.5 1.5 0.2 "setosa" 29 | 5.2 3.4 1.4 0.2 "setosa" 30 | 4.7 3.2 1.6 0.2 "setosa" 31 | 4.8 3.1 1.6 0.2 "setosa" 32 | 5.4 3.4 1.5 0.4 "setosa" 33 | 5.2 4.1 1.5 0.1 "setosa" 34 | 5.5 4.2 1.4 0.2 "setosa" 35 | 4.9 3.1 1.5 0.2 "setosa" 36 | 5 3.2 1.2 0.2 "setosa" 37 | 5.5 3.5 1.3 0.2 "setosa" 38 | 4.9 3.6 1.4 0.1 "setosa" 39 | 4.4 3 1.3 0.2 "setosa" 40 | 5.1 3.4 1.5 0.2 "setosa" 41 | 5 3.5 1.3 0.3 "setosa" 42 | 4.5 2.3 1.3 0.3 "setosa" 43 | 4.4 3.2 1.3 0.2 "setosa" 44 | 5 3.5 1.6 0.6 "setosa" 45 | 5.1 3.8 1.9 0.4 "setosa" 46 | 4.8 3 1.4 0.3 "setosa" 47 | 5.1 3.8 1.6 0.2 "setosa" 48 | 4.6 3.2 1.4 0.2 "setosa" 49 | 5.3 3.7 1.5 0.2 "setosa" 50 | 5 3.3 1.4 0.2 "setosa" 51 | 7 3.2 4.7 1.4 "versicolor" 52 | 6.4 3.2 4.5 1.5 "versicolor" 53 | 6.9 3.1 4.9 1.5 "versicolor" 54 | 5.5 2.3 4 1.3 "versicolor" 55 | 6.5 2.8 4.6 1.5 "versicolor" 56 | 5.7 2.8 4.5 1.3 "versicolor" 57 | 6.3 3.3 4.7 1.6 "versicolor" 58 | 4.9 2.4 3.3 1 "versicolor" 59 | 6.6 2.9 4.6 1.3 "versicolor" 60 | 5.2 2.7 3.9 1.4 "versicolor" 61 | 5 2 3.5 1 "versicolor" 62 | 5.9 3 4.2 1.5 "versicolor" 63 | 6 2.2 4 1 "versicolor" 64 | 6.1 2.9 4.7 1.4 "versicolor" 65 | 5.6 2.9 3.6 1.3 "versicolor" 66 | 6.7 3.1 4.4 1.4 "versicolor" 67 | 5.6 3 4.5 1.5 "versicolor" 68 | 5.8 2.7 4.1 1 "versicolor" 69 | 6.2 2.2 4.5 1.5 "versicolor" 70 | 5.6 2.5 3.9 1.1 "versicolor" 71 | 5.9 3.2 4.8 1.8 "versicolor" 72 | 6.1 2.8 4 1.3 "versicolor" 73 | 6.3 2.5 4.9 1.5 "versicolor" 74 | 6.1 2.8 4.7 1.2 "versicolor" 75 | 6.4 2.9 4.3 1.3 "versicolor" 76 | 6.6 3 4.4 1.4 "versicolor" 77 | 6.8 2.8 4.8 1.4 "versicolor" 78 | 6.7 3 5 1.7 "versicolor" 79 | 6 2.9 4.5 1.5 "versicolor" 80 | 5.7 2.6 3.5 1 "versicolor" 81 | 5.5 2.4 3.8 1.1 "versicolor" 82 | 5.5 2.4 3.7 1 "versicolor" 83 | 5.8 2.7 3.9 1.2 "versicolor" 84 | 6 2.7 5.1 1.6 "versicolor" 85 | 5.4 3 4.5 1.5 "versicolor" 86 | 6 3.4 4.5 1.6 "versicolor" 87 | 6.7 3.1 4.7 1.5 "versicolor" 88 | 6.3 2.3 4.4 1.3 "versicolor" 89 | 5.6 3 4.1 1.3 "versicolor" 90 | 5.5 2.5 4 1.3 "versicolor" 91 | 5.5 2.6 4.4 1.2 "versicolor" 92 | 6.1 3 4.6 1.4 "versicolor" 93 | 5.8 2.6 4 1.2 "versicolor" 94 | 5 2.3 3.3 1 "versicolor" 95 | 5.6 2.7 4.2 1.3 "versicolor" 96 | 5.7 3 4.2 1.2 "versicolor" 97 | 5.7 2.9 4.2 1.3 "versicolor" 98 | 6.2 2.9 4.3 1.3 "versicolor" 99 | 5.1 2.5 3 1.1 "versicolor" 100 | 5.7 2.8 4.1 1.3 "versicolor" 
101 | 6.3 3.3 6 2.5 "virginica" 102 | 5.8 2.7 5.1 1.9 "virginica" 103 | 7.1 3 5.9 2.1 "virginica" 104 | 6.3 2.9 5.6 1.8 "virginica" 105 | 6.5 3 5.8 2.2 "virginica" 106 | 7.6 3 6.6 2.1 "virginica" 107 | 4.9 2.5 4.5 1.7 "virginica" 108 | 7.3 2.9 6.3 1.8 "virginica" 109 | 6.7 2.5 5.8 1.8 "virginica" 110 | 7.2 3.6 6.1 2.5 "virginica" 111 | 6.5 3.2 5.1 2 "virginica" 112 | 6.4 2.7 5.3 1.9 "virginica" 113 | 6.8 3 5.5 2.1 "virginica" 114 | 5.7 2.5 5 2 "virginica" 115 | 5.8 2.8 5.1 2.4 "virginica" 116 | 6.4 3.2 5.3 2.3 "virginica" 117 | 6.5 3 5.5 1.8 "virginica" 118 | 7.7 3.8 6.7 2.2 "virginica" 119 | 7.7 2.6 6.9 2.3 "virginica" 120 | 6 2.2 5 1.5 "virginica" 121 | 6.9 3.2 5.7 2.3 "virginica" 122 | 5.6 2.8 4.9 2 "virginica" 123 | 7.7 2.8 6.7 2 "virginica" 124 | 6.3 2.7 4.9 1.8 "virginica" 125 | 6.7 3.3 5.7 2.1 "virginica" 126 | 7.2 3.2 6 1.8 "virginica" 127 | 6.2 2.8 4.8 1.8 "virginica" 128 | 6.1 3 4.9 1.8 "virginica" 129 | 6.4 2.8 5.6 2.1 "virginica" 130 | 7.2 3 5.8 1.6 "virginica" 131 | 7.4 2.8 6.1 1.9 "virginica" 132 | 7.9 3.8 6.4 2 "virginica" 133 | 6.4 2.8 5.6 2.2 "virginica" 134 | 6.3 2.8 5.1 1.5 "virginica" 135 | 6.1 2.6 5.6 1.4 "virginica" 136 | 7.7 3 6.1 2.3 "virginica" 137 | 6.3 3.4 5.6 2.4 "virginica" 138 | 6.4 3.1 5.5 1.8 "virginica" 139 | 6 3 4.8 1.8 "virginica" 140 | 6.9 3.1 5.4 2.1 "virginica" 141 | 6.7 3.1 5.6 2.4 "virginica" 142 | 6.9 3.1 5.1 2.3 "virginica" 143 | 5.8 2.7 5.1 1.9 "virginica" 144 | 6.8 3.2 5.9 2.3 "virginica" 145 | 6.7 3.3 5.7 2.5 "virginica" 146 | 6.7 3 5.2 2.3 "virginica" 147 | 6.3 2.5 5 1.9 "virginica" 148 | 6.5 3 5.2 2 "virginica" 149 | 6.2 3.4 5.4 2.3 "virginica" 150 | 5.9 3 5.1 1.8 "virginica" 151 | -------------------------------------------------------------------------------- /KNN_MapReduce/src/Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | public class Driver { 13 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 14 | int num_features=0; 15 | Configuration conf=new Configuration(); 16 | FileSystem hdfs=FileSystem.get(conf); 17 | //args[0] is the path to the file which has features of the input waiting to be classified. 
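// (Explanatory comment added to this listing, not part of the original source.)
// The driver below reads the query point's features from args[0] and counts them; the
// features and the display name (args[1]) are presumably handed to the tasks via the job
// Configuration (the reducer reads "name" from it in its setup()). Each mapper computes
// the Euclidean distance from every training record in its split to the query point and
// emits the classes of its 3 nearest neighbours; the single reducer then takes a majority
// vote over all emitted classes to decide the species.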
18 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(args[0])))); 19 | String line=null; 20 | while((line=br.readLine())!=null){ 21 | String[] feat=line.toString().split("\\ "); 22 | for(int i=0;i { 11 | public static long byteoffset=0; 12 | public static Float[] feat=null; 13 | public static String species=null; 14 | public static ArrayList dists=new ArrayList(); 15 | public static float min_dist=0; 16 | public static int num_features=0; 17 | public static float euc_dist(Float[] feat, Float[] test,int num){ 18 | float distance=0; 19 | float val=0; 20 | for(int i=0;i { 8 | String flower_name=null; 9 | @Override 10 | public void setup(Context context){ 11 | flower_name=String.valueOf(context.getConfiguration().get("name")); 12 | } 13 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 14 | HashMap map=new HashMap(); 15 | String maxkey=null;int maxvalue=-1; 16 | for(Text value:values){ 17 | if(!map.containsKey(value.toString())){ 18 | map.put(value.toString(), 1); 19 | } 20 | else{ 21 | map.put(value.toString(), map.get(value.toString())+1); 22 | } 23 | } 24 | for(Entry entry: map.entrySet()){ 25 | if(entry.getValue()>maxvalue){ 26 | maxkey=entry.getKey(); 27 | maxvalue=entry.getValue(); 28 | } 29 | } 30 | context.write(null, new Text(flower_name+" belongs to the species of "+maxkey)); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /LUDecomposition/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /LUDecomposition/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | LUDecomposition 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.apache.hdt.mrnature 16 | org.eclipse.jdt.core.javanature 17 | 18 | 19 | -------------------------------------------------------------------------------- /LUDecomposition/README.md: -------------------------------------------------------------------------------- 1 | # LU Decomposition 2 | 3 | This MapReduce algorithm splits a massively large matrix into its `L` and `U` components. It uses the Naive Gaussian Elimination technique to do so. 4 | 5 | # Program Execution Arguments 6 | 7 | This program expects only two arguments: 8 | 9 | 1. An input path 10 | 2. An output path 11 | 12 | # Input and Output data shape 13 | 14 | Both the input and output matrix shapes are the **SAME**. This program expects and produces the textual form of matrices in the following manner: 15 | 16 | `row_number + "\t" + elem-1 + "," + elem-2 + "," + elem-3 ...` 17 | 18 | The text files should be a tab-separated list of `row_number`s and comma-separated row elements. 19 | 20 | # Final Output Location 21 | 22 | This program produces **only one** intermediate output, but the actual outputs (the `L` and `U` matrices) are present in the paths `<output path> + "LU_Components/L"` and `<output path> + "LU_Components/U"`. 23 | 24 | Their shapes will correspond to the shape defined above. 25 | 26 | **NOTE**: I have provided the input and all the output (intermediate and actual) folders; you can use them to verify your outputs.
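As a quick sanity check, added here for illustration only and using the repository's own `test_input_4x4.txt` and the `LU_Components` outputs that appear further down in this listing: `L` holds the Gaussian-elimination multipliers below a unit diagonal, `U` is the eliminated (upper-triangular) matrix, and multiplying them row by row reproduces the input exactly.

```
A (input)          L                  U
1  5  0  0         1  0  0  0         1  5  0  0
2 12  5  0         2  1  0  0         0  2  5  0
0  4 13  5         0  2  1  0         0  0  3  5
0  0  6 11         0  0  2  1         0  0  0  1

row 0 of L·U = 1·(1,5,0,0)                = (1, 5, 0, 0)   ✓
row 1 of L·U = 2·(1,5,0,0) + 1·(0,2,5,0)  = (2, 12, 5, 0)  ✓
row 2 of L·U = 2·(0,2,5,0) + 1·(0,0,3,5)  = (0, 4, 13, 5)  ✓
row 3 of L·U = 2·(0,0,3,5) + 1·(0,0,0,1)  = (0, 0, 6, 11)  ✓
```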
27 | -------------------------------------------------------------------------------- /LUDecomposition/input/test_input_4x4.txt: -------------------------------------------------------------------------------- 1 | 0 1,5,0,0 2 | 1 2,12,5,0 3 | 2 0,4,13,5 4 | 3 0,0,6,11 5 | -------------------------------------------------------------------------------- /LUDecomposition/output/.nth.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/.nth.crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/LU_Components/L/.part-r-00000.crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/LU_Components/L/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/L/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,0.0,0.0,0.0 2 | 1 2.0,1.0,0.0,0.0 3 | 2 0.0,2.0,1.0,0.0 4 | 3 0.0,0.0,2.0,1.0 5 | -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crc5)} -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/LU_Components/U/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/LU_Components/U/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1,5,0,0 2 | 1 0.0,2.0,5.0,0.0 3 | 2 0.0,0.0,3.0,5.0 4 | 3 0.0,0.0,0.0,1.0 5 | -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crcѪ ( -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/after-2-runs/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/after-2-runs/part-r-00000: -------------------------------------------------------------------------------- 1 | 1,0 2.56 2 | 2,0 5.76 3 | 0 25,5,1 4 | 1 0.0,-4.800000000000001,-1.56 5 | 2,1 3.499999999999999 6 | 2 0.0,0.0,0.6999999999999993 7 | -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crcDM=] -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/after-3-runs/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/after-3-runs/part-r-00000: -------------------------------------------------------------------------------- 1 | 1,0 2.0 2 | 2,0 0.0 3 | 3,0 0.0 4 | 2,1 2.0 5 | 3,1 0.0 6 | 0 1,5,0,0 7 | 1 0.0,2.0,5.0,0.0 8 | 2 0.0,0.0,3.0,5.0 9 | 3,2 2.0 10 | 3 0.0,0.0,0.0,1.0 11 | -------------------------------------------------------------------------------- /LUDecomposition/output/nth: -------------------------------------------------------------------------------- 1 | 0.0,0.0,0.0,1.0 -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/total_records/.part-r-00000.crc -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/LUDecomposition/output/total_records/_SUCCESS -------------------------------------------------------------------------------- /LUDecomposition/output/total_records/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 4 2 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/Utils.java: -------------------------------------------------------------------------------- 1 | package lud; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.DataOutputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import 
java.io.OutputStreamWriter; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.FileSystem; 12 | import org.apache.hadoop.fs.Path; 13 | 14 | public class Utils { 15 | 16 | public static String arrayToCSV(Double[] nVal2) { 17 | String result = ""; 18 | 19 | if (nVal2.length > 0) { 20 | StringBuilder sb = new StringBuilder(); 21 | 22 | for (Double s : nVal2) { 23 | sb.append(s).append(","); 24 | } 25 | 26 | result = sb.deleteCharAt(sb.length() - 1).toString(); 27 | } 28 | return result; 29 | } 30 | 31 | public static Double[] stringToDoubleArray(String[] a) { 32 | Double[] x = new Double[a.length]; 33 | for(int i = 0; i < a.length ; i++) 34 | x[i] = Double.valueOf(a[i]); 35 | return x; 36 | } 37 | 38 | public static void storeToHDFS(String data, String output, Configuration conf) throws IOException { 39 | 40 | FileSystem hdfs=FileSystem.get(conf); 41 | Path find_nth_row_output_path = new Path(conf.get("find_nth_row_output")); 42 | try { 43 | if (hdfs.exists(find_nth_row_output_path)) { 44 | hdfs.delete(find_nth_row_output_path, true); 45 | } 46 | DataOutputStream outStream = hdfs.create(find_nth_row_output_path); 47 | BufferedWriter bw = new BufferedWriter( new OutputStreamWriter(outStream, "UTF-8" ) ); 48 | bw.write(data); 49 | bw.close(); 50 | hdfs.close(); 51 | outStream.close(); 52 | } 53 | catch (Exception e) { 54 | System.out.println(e.getMessage()); 55 | } 56 | } 57 | 58 | public static String readFromHDFS(String path, Configuration conf) throws IOException { 59 | 60 | FileSystem hdfs=FileSystem.get(conf); 61 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path)))); 62 | String records = br.readLine().trim(); 63 | br.close(); 64 | hdfs.close(); 65 | 66 | return records; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/LongAndTextWritable.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.Writable; 10 | 11 | public class LongAndTextWritable implements Writable { 12 | 13 | private LongWritable rowKey; 14 | private Text rowValue; 15 | 16 | public LongAndTextWritable() { 17 | this.rowKey = new LongWritable(0); 18 | this.rowValue = new Text(""); 19 | } 20 | 21 | public LongAndTextWritable(LongWritable k, Text v) { 22 | this.rowKey = k; 23 | this.rowValue = v; 24 | } 25 | 26 | public LongWritable getKey() { 27 | return rowKey; 28 | } 29 | 30 | public Text getValue() { 31 | return rowValue; 32 | } 33 | 34 | @Override 35 | public void readFields(DataInput in) throws IOException { 36 | 37 | rowKey.readFields(in); 38 | rowValue.readFields(in); 39 | 40 | } 41 | 42 | @Override 43 | public void write(DataOutput out) throws IOException { 44 | 45 | rowKey.write(out); 46 | rowValue.write(out); 47 | } 48 | 49 | @Override 50 | public String toString() { 51 | return rowKey.toString() + "\t" + rowValue.toString(); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/NaturalKeyGroupingComparator.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | 
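// (Explanatory comment added to this listing, not part of the original source.)
// This class is one piece of Hadoop's usual "secondary sort" setup used by this package:
//   - TextPair is the composite key holding two text parts;
//   - TextPairComparator orders keys by both parts, so a reducer sees its values sorted;
//   - TextPairPartitioner routes keys with the same first part to the same reducer;
//   - NaturalKeyGroupingComparator (below) compares only the first part, so all records
//     that share it are grouped into a single reduce() call.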
public class NaturalKeyGroupingComparator extends WritableComparator { 7 | protected NaturalKeyGroupingComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | return tp1.getFirst().compareTo(tp2.getFirst()); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/TextPair.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | public class TextPair implements WritableComparable { 10 | 11 | private String t1; 12 | private String t2; 13 | 14 | public String getFirst() { 15 | return this.t1; 16 | } 17 | 18 | public String getSecond() { 19 | return this.t2; 20 | } 21 | 22 | @Override 23 | public void readFields(DataInput in) throws IOException { 24 | this.t1 = in.readUTF(); 25 | this.t2 = in.readUTF(); 26 | } 27 | 28 | @Override 29 | public void write(DataOutput out) throws IOException { 30 | out.writeUTF(this.t1); 31 | out.writeUTF(this.t2); 32 | } 33 | 34 | public TextPair() { 35 | this.t1 = new String(); 36 | this.t2 = new String(); 37 | } 38 | 39 | public TextPair(String t1, String t2) { 40 | this.t1 = new String(t1); 41 | this.t2 = new String(t2); 42 | } 43 | 44 | public int compareTo(TextPair tp) { 45 | int sortKey = this.t1.compareTo(tp.getFirst()); 46 | if (sortKey == 0) { 47 | sortKey = this.t2.compareTo(tp.getSecond()); 48 | } 49 | return sortKey; 50 | } 51 | 52 | public String toString () { 53 | String s = ""; 54 | if (this.t2.compareTo("") == 0) { 55 | s += this.t1; 56 | } 57 | else { 58 | s += this.t1 + "," + this.t2; 59 | } 60 | return s; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/TextPairComparator.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class TextPairComparator extends WritableComparator { 7 | protected TextPairComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | int result = tp1.getFirst().compareTo(tp2.getFirst()); 17 | if(0 == result) { 18 | result = tp1.getSecond().compareTo(tp2.getSecond()); 19 | } 20 | return result; 21 | } 22 | } -------------------------------------------------------------------------------- /LUDecomposition/src/lud/io/TextPairPartitioner.java: -------------------------------------------------------------------------------- 1 | package lud.io; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | public class TextPairPartitioner extends Partitioner{ 7 | @Override 8 | public int getPartition(TextPair tp, Text t, int numPartitions) { 9 | return tp.getFirst().hashCode() % numPartitions; 10 | } 11 | } -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/initial_input_mapper.java: 
-------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import lud.Utils; 12 | import lud.io.TextPair; 13 | 14 | public class initial_input_mapper extends Mapper { 15 | 16 | private int n; 17 | private String nVal = null; 18 | 19 | private long counter = 0; 20 | private long[] input_range = new long[2]; 21 | 22 | private List toBeSent = new ArrayList(); 23 | 24 | public static Double[] readNthRow (Configuration conf) throws IOException { 25 | try { 26 | 27 | String path = conf.get("find_nth_row_output"); 28 | String[] nValArr = Utils.readFromHDFS(path, conf).split(","); 29 | return Utils.stringToDoubleArray(nValArr); 30 | 31 | } 32 | catch (Exception e) { 33 | System.out.println("Can't read nth value! " + e.getMessage()); 34 | return null; 35 | } 36 | } 37 | 38 | @Override 39 | public void setup (Context context) throws IOException, InterruptedException { 40 | Configuration conf = context.getConfiguration(); 41 | this.n = (int) conf.getLong("n", 0); 42 | // The below code should have worked, but it does not 43 | //if (n>0) 44 | // this.nVal = Utils.arrayToCSV(readNthRow(context.getConfiguration())); 45 | } 46 | 47 | public void map(Text key, Text value, Context context) throws IOException, InterruptedException { 48 | 49 | if (counter == 0 && !key.toString().contains(",")) 50 | this.input_range[0] = Long.valueOf(key.toString()); 51 | 52 | if (!key.toString().contains(",")) { 53 | int row = Integer.parseInt(key.toString()); 54 | 55 | if (row == this.n) 56 | this.nVal = value.toString(); 57 | 58 | if (n > 0) 59 | //context.write(key, new Text(value.toString()+";"+this.nVal)); 60 | toBeSent.add(new TextPair(key.toString(), value.toString())); 61 | else 62 | context.write(key, value); 63 | } 64 | else 65 | context.write(key, value); 66 | 67 | counter++; 68 | } 69 | 70 | @Override 71 | public void cleanup (Context context) throws IOException, InterruptedException { 72 | 73 | // The code block below will run if the nth row is not in the split. 74 | // It will read it from HDFS, which was stored there from the previous mapper/reducer 75 | if (this.nVal == null && n>0) 76 | this.nVal = Utils.arrayToCSV(readNthRow(context.getConfiguration())); 77 | 78 | if (n == 0 && n>=input_range[0] && n<=input_range[1]) { 79 | input_range[1] = input_range[0] + counter - 1; 80 | // Sending Nth Row to all reducers 81 | for (long i = 0 ; i <= input_range[1] ; i++) 82 | context.write(new Text(String.valueOf(i)), new Text("Nth Row->"+this.nVal)); 83 | } 84 | 85 | // Have to do this because there is a mapper after this map phase, and not a reducer. 86 | // Otherwise I would have used the same logic like in the if block above. 
87 | else 88 | for (TextPair tp:toBeSent) 89 | context.write(new Text(tp.getFirst()), new Text(tp.getSecond()+";"+this.nVal)); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/lud_driver.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.chain.ChainMapper; 10 | import org.apache.hadoop.mapreduce.lib.chain.ChainReducer; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | import lud.naiveGaussian.mergeResults.merge_results_driver; 16 | import lud.naiveGaussian.totalRecords.total_records_driver; 17 | 18 | public class lud_driver { 19 | 20 | public static void prepareJobWithConf (Job jobPrep, Configuration confPrep) throws IOException, InterruptedException, ClassNotFoundException { 21 | 22 | long n = confPrep.getLong("n", 0); 23 | 24 | // Chaining MR Jobs 25 | if (n == 0) { 26 | ChainMapper.addMapper(jobPrep, initial_input_mapper.class, 27 | Text.class, Text.class, 28 | Text.class, Text.class, 29 | confPrep); 30 | ChainReducer.setReducer(jobPrep, lud_reducer.class, Text.class, Text.class, Text.class, Text.class, confPrep); 31 | } 32 | else { 33 | ChainReducer.addMapper(jobPrep, initial_input_mapper.class, 34 | Text.class, Text.class, 35 | Text.class, Text.class, 36 | confPrep); 37 | ChainReducer.addMapper(jobPrep, lud_mapper.class, 38 | Text.class, Text.class, 39 | Text.class, Text.class, 40 | confPrep); 41 | } 42 | } 43 | 44 | @SuppressWarnings("deprecation") 45 | public static void main (String[] args) throws IOException, ClassNotFoundException, InterruptedException { 46 | 47 | String input = args[0]; 48 | String output = args[1]; 49 | String find_nth_row_output = output + "/nth"; 50 | 51 | // MR Job: Finding Total Records 52 | 53 | String[] total_records_args = {input, output + "/total_records"}; 54 | long total_records = total_records_driver.run(total_records_args); 55 | 56 | Configuration conf = new Configuration(); 57 | conf.set("find_nth_row_output", find_nth_row_output); 58 | conf.set("mapreduce.job.reduce.slowstart.completedmaps", "1.00"); 59 | conf.setLong("total_records", total_records); 60 | Job job = new Job(conf); 61 | 62 | for(int n = 0 ; n < total_records - 1 ; n++) { 63 | 64 | Configuration confLoop = conf; 65 | confLoop.set("mapreduce.job.reduce.slowstart.completedmaps", "1.00"); 66 | confLoop.unset("n"); 67 | confLoop.setLong("n", n); 68 | 69 | prepareJobWithConf(job, confLoop); 70 | } 71 | 72 | String lud_output_path = output+"/after-"+(total_records-1)+"-runs"; 73 | job.setJarByClass(lud_driver.class); 74 | job.setJobName("Split a matrix into it's LU decomposed components using the Naive Gaussian Elimination method"); 75 | FileInputFormat.addInputPath(job, new Path(input)); 76 | FileOutputFormat.setOutputPath(job, new Path(lud_output_path)); 77 | job.setInputFormatClass(KeyValueTextInputFormat.class); 78 | job.setOutputKeyClass(Text.class); 79 | job.setOutputValueClass(Text.class); 80 | job.waitForCompletion(true); 81 | 82 | // MR Job(s): Merging Outputs 83 | 84 | Path merge_results_input_path = new 
Path(lud_output_path); 85 | conf.setBoolean("upper", false); 86 | job = new Job(conf); 87 | FileInputFormat.addInputPath(job, merge_results_input_path); 88 | String l_output_path = output+"/LU_Components/L"; 89 | merge_results_driver.runWithJob(job, l_output_path); 90 | 91 | conf.setBoolean("upper", true); 92 | job = new Job(conf); 93 | String u_output_path = output+"/LU_Components/U"; 94 | FileInputFormat.addInputPath(job, merge_results_input_path); 95 | merge_results_driver.runWithJob(job, u_output_path); 96 | 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/lud_mapper.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import lud.Utils; 9 | 10 | public class lud_mapper extends Mapper { 11 | 12 | private static long total_records; 13 | private long n; 14 | private Double[] nVal = null; 15 | 16 | @Override 17 | public void setup (Context context) throws IOException, InterruptedException { 18 | lud_mapper.total_records = context.getConfiguration().getLong("total_records", 0); 19 | this.n = context.getConfiguration().getLong("n", 0); 20 | this.nVal = initial_input_mapper.readNthRow(context.getConfiguration()); 21 | } 22 | 23 | public void map(Text key, Text value, Context context) throws IOException, InterruptedException { 24 | 25 | String[] parts = new String[2]; 26 | parts[0] = key.toString(); 27 | String[] rowAndNVal = value.toString().split(";"); 28 | parts[1] = parts[0].contains(",")?value.toString():rowAndNVal[0]; 29 | 30 | if (this.nVal == null && !parts[0].contains(",")) 31 | this.nVal = Utils.stringToDoubleArray(rowAndNVal[1].split(",")); 32 | 33 | if(!parts[0].contains(",")) { 34 | 35 | long row = Long.valueOf(parts[0]); 36 | 37 | if (row > this.n) { 38 | 39 | Double[] rowElements = Utils.stringToDoubleArray(parts[1].split(",")); 40 | Double multiplier = (double) (rowElements[(int) this.n]/this.nVal[(int) this.n]); 41 | // Sending lower triangular matrix elements 42 | context.write(new Text(row+","+this.n), new Text(String.valueOf(multiplier))); 43 | Double[] rowElementsModified = new Double[(int) lud_mapper.total_records]; 44 | 45 | for (int i = 0; i< lud_mapper.total_records; i++) { 46 | rowElementsModified[i] = (Double) (rowElements[i] - this.nVal[i]*multiplier); 47 | } 48 | 49 | // Doing this so that N+1th row is stored before any KV pair is generated 50 | if (row==(this.n+1)) 51 | Utils.storeToHDFS(Utils.arrayToCSV(rowElementsModified), context.getConfiguration().get("find_nth_row_output"), context.getConfiguration()); 52 | 53 | context.write(new Text(String.valueOf(row)), new Text(Utils.arrayToCSV(rowElementsModified))); 54 | } 55 | else 56 | context.write(new Text(parts[0]), new Text(parts[1].split(";")[0])); 57 | } 58 | else 59 | context.write(new Text(parts[0]), new Text(parts[1].split(";")[0])); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/lud_reducer.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import lud.Utils; 9 | 10 | public class lud_reducer extends Reducer { 11 | 12 | private long n; 13 
| private Double[] nVal = null; 14 | 15 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 16 | 17 | // Fetching Nth Row from values 18 | for (Text value:values) { 19 | 20 | String[] parts = new String[2]; 21 | parts[0] = key.toString(); 22 | parts[1] = value.toString(); 23 | 24 | if (parts[1].contains("Nth Row->")) { 25 | this.nVal = Utils.stringToDoubleArray(parts[1].split("->")[1].split(",")); 26 | break; 27 | } 28 | } 29 | 30 | 31 | // Processing rest of the rows 32 | for (Text value:values) { 33 | 34 | String[] parts = new String[2]; 35 | parts[0] = key.toString(); 36 | parts[1] = value.toString(); 37 | 38 | if (parts[1].contains("Nth Row->")) 39 | continue; 40 | 41 | else { 42 | if(!parts[0].contains(",")) { 43 | 44 | long row = Long.valueOf(parts[0]); 45 | 46 | if (row > this.n) { 47 | Double[] rowElements = Utils.stringToDoubleArray(parts[1].split(",")); 48 | Double multiplier = (double) (rowElements[(int) this.n]/this.nVal[(int) this.n]); 49 | 50 | context.write(new Text(row+","+this.n), new Text(String.valueOf(multiplier))); 51 | 52 | Double[] rowElementsModified = new Double[(int) rowElements.length]; 53 | for (int i = 0; i< rowElementsModified.length; i++) { 54 | rowElementsModified[i] = (Double) (rowElements[i] - this.nVal[i]*multiplier); 55 | } 56 | 57 | // Doing this so that N+1th row is stored before any KV pair is generated 58 | if (row==(this.n+1)) 59 | Utils.storeToHDFS(Utils.arrayToCSV(rowElementsModified), context.getConfiguration().get("find_nth_row_output"), context.getConfiguration()); 60 | 61 | context.write(new Text(String.valueOf(row)), new Text(Utils.arrayToCSV(rowElementsModified))); 62 | } 63 | else 64 | context.write(new Text(parts[0]), new Text(parts[1])); 65 | } 66 | else 67 | context.write(new Text(parts[0]), new Text(parts[1])); 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/mergeResults/merge_results_driver.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.mergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | import lud.io.NaturalKeyGroupingComparator; 11 | import lud.io.TextPair; 12 | import lud.io.TextPairComparator; 13 | import lud.io.TextPairPartitioner; 14 | 15 | public class merge_results_driver { 16 | 17 | public static boolean runWithJob(Job job, String out_path) throws IOException, InterruptedException, ClassNotFoundException { 18 | job.setJarByClass(merge_results_driver.class); 19 | 20 | job.setJobName("Final Step: Merging results and creating separate LU decomposed components of input matrix"); 21 | 22 | FileOutputFormat.setOutputPath(job, new Path(out_path)); 23 | 24 | job.setMapperClass(lud.naiveGaussian.mergeResults.merge_results_mapper.class); 25 | job.setReducerClass(lud.naiveGaussian.mergeResults.merge_results_reducer.class); 26 | job.setMapOutputKeyClass(TextPair.class); 27 | job.setMapOutputValueClass(Text.class); 28 | job.setOutputKeyClass(TextPair.class); 29 | job.setOutputValueClass(Text.class); 30 | job.setPartitionerClass(TextPairPartitioner.class); 31 | job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); 32 | job.setSortComparatorClass(TextPairComparator.class); 33 | 34 | boolean success = 
job.waitForCompletion(true); 35 | return success; 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/mergeResults/merge_results_mapper.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.mergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | import lud.io.TextPair; 10 | 11 | public class merge_results_mapper extends Mapper { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] parts = value.toString().split("\\t"); 24 | // Processing Upper Triangular Matrix's rows 25 | if (this.upper && !parts[0].contains(",")) { 26 | context.write(new TextPair(parts[0],""), new Text(parts[1])); 27 | } 28 | // Processing Lower Triangular Matrix's rows 29 | if (!this.upper && parts[0].contains(",")) { 30 | 31 | String[] rowCol = parts[0].split(","); 32 | String row = rowCol[0]; 33 | // Sending first row of Lower Triangular Matrix to the reducer 34 | if (Integer.valueOf(row)-1 == 0) { 35 | for (int i = 0; i < this.total_records; i++) { 36 | context.write(new TextPair("0",String.valueOf(i)), new Text(i+","+((i == 0) ? 1 : 0))); 37 | } 38 | } 39 | String column = rowCol[1]; 40 | String element = parts[1]; 41 | context.write(new TextPair(row, column), new Text(column+","+element)); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/mergeResults/merge_results_reducer.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.mergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import lud.Utils; 9 | import lud.io.TextPair; 10 | 11 | public class merge_results_reducer extends Reducer { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public static String arrayToCSV(String[] a) { 23 | String result = ""; 24 | if (a.length > 0) { 25 | StringBuilder sb = new StringBuilder(); 26 | for (String s : a) { 27 | sb.append(s).append(","); 28 | } 29 | result = sb.deleteCharAt(sb.length() - 1).toString(); 30 | } 31 | return result; 32 | } 33 | 34 | public void reduce(TextPair key, Iterable values, Context context) 35 | throws IOException, InterruptedException { 36 | if (this.upper) { 37 | for (Text val:values) { 38 | context.write(new TextPair(key.getFirst(),""), val); 39 | } 40 | } 41 | else { 42 | Double[] rowElements = new Double[this.total_records]; 43 | int row = Integer.valueOf(key.getFirst()); 44 | for (Text val:values) { 45 | String[] parts = val.toString().split(","); 46 | int j = Integer.valueOf(parts[0]); 47 | rowElements[j] = Double.valueOf(parts[1]); 48 | } 49 | 
// Setting Diagonal Elements as `1` in the lower triangular matrix rows 50 | rowElements[row] = (double) 1; 51 | 52 | for(int j = 0; j< this.total_records; j++) { 53 | if (rowElements[j] == null) { 54 | rowElements[j] = (double) 0; 55 | } 56 | } 57 | context.write(new TextPair(key.getFirst(),""), new Text(Utils.arrayToCSV(rowElements))); 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/totalRecords/total_records_driver.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.totalRecords; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class total_records_driver { 16 | 17 | public static long readTotalRecords (String path, Configuration conf) throws IOException { 18 | FileSystem hdfs=FileSystem.get(conf); 19 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path+"/part-r-00000")))); 20 | Long records = (long) 0; 21 | records = Long.valueOf(br.readLine().split("\\t")[1]); 22 | br.close(); 23 | hdfs.close(); 24 | return records; 25 | } 26 | 27 | @SuppressWarnings("deprecation") 28 | public static long run(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 29 | Configuration conf = new Configuration(); 30 | Job job = new Job(conf); 31 | 32 | job.setJarByClass(total_records_driver.class); 33 | 34 | job.setJobName("Just counting total rows of the HDFS input"); 35 | 36 | FileInputFormat.setInputPaths(job, new Path(args[0])); 37 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 38 | 39 | job.setMapperClass(total_records_mapper.class); 40 | 41 | job.setReducerClass(total_records_reducer.class); 42 | job.setCombinerClass(total_records_reducer.class); 43 | 44 | job.setOutputKeyClass(LongWritable.class); 45 | job.setOutputValueClass(LongWritable.class); 46 | 47 | //job.setInputFormatClass(TextInputFormat.class); 48 | //job.setOutputFormatClass(TextOutputFormat.class); 49 | 50 | job.waitForCompletion(true); 51 | 52 | return readTotalRecords(args[1], conf); 53 | }; 54 | } 55 | -------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/totalRecords/total_records_mapper.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.totalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class total_records_mapper extends Mapper { 10 | 11 | private Long countRows = (long) 0; 12 | 13 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 14 | this.countRows++; 15 | } 16 | 17 | @Override 18 | public void cleanup(Context context) throws IOException, InterruptedException{ 19 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 20 | } 21 | 22 | } 23 | 
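For context: each total_records_mapper instance counts the rows of its input split and emits a single (0, count) pair from cleanup(); the reducer in the next file sums these counts and writes one tab-separated line, which readTotalRecords() above parses. A minimal sketch of that read-back, assuming the 4x4 test input so that total_records/part-r-00000 holds the single line "0<TAB>4":

// Assumed contents of total_records/part-r-00000 for the 4x4 test input: "0\t4"
String line = "0\t4";
long totalRecords = Long.valueOf(line.split("\\t")[1]); // 4, the same parse readTotalRecords() performs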
-------------------------------------------------------------------------------- /LUDecomposition/src/lud/naiveGaussian/totalRecords/total_records_reducer.java: -------------------------------------------------------------------------------- 1 | package lud.naiveGaussian.totalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | public class total_records_reducer extends Reducer<LongWritable, LongWritable, LongWritable, LongWritable> { 9 | 10 | private Long countRows = (long) 0; 11 | 12 | public void reduce(LongWritable key, Iterable<LongWritable> values, Context context) 13 | throws IOException, InterruptedException { 14 | for(LongWritable val:values){ 15 | this.countRows += val.get(); 16 | } 17 | } 18 | 19 | @Override 20 | public void cleanup(Context context) throws IOException, InterruptedException{ 21 | 22 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /LinearRegression_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input contains files which have comma-separated values. Each line is an input point, with the last comma-separated value being the output. 2 | 3 | In this case, I have just used a straight line as input and examined whether my algorithm recovers the same straight line from it, which it does. 4 | 5 | This code can easily be converted into LWR (Locally Weighted Regression), a technique which down-weights the less relevant input points, by simply multiplying the weighting function, which is given by: 6 | 7 | w(i) = exp(-(X[i]-X)^2/(2T^2)) ;; "exp" is "e^" 8 | 9 | X[i] is the input point 10 | X is the query point (The input for which you want to predict the output) 11 | T (Tau) is a constant like alpha. The higher the value of Tau, the wider the weighting function and the larger the range of input points used (chosen) for prediction, and vice-versa. 12 | 13 | with the term "(alpha/number_inputs)*(Yi-h_theta)*(Xi[i]))" in the map function of the code. 14 | 15 | This algorithm takes in 5 arguments as follows: 16 | 17 | 1. The number of features each input point has 18 | 2. The value of alpha 19 | 3. The number of times you want your algorithm to iterate 20 | 4. The input path 21 | 5.
The output path 22 | -------------------------------------------------------------------------------- /LinearRegression_MapReduce/input/linear.txt: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 3,3 4 | 4,4 5 | 5,5 6 | 6,6 7 | 7,7 8 | 8,8 9 | 9,9 10 | 10,10 -------------------------------------------------------------------------------- /LinearRegression_MapReduce/src/Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.FloatWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | public class Driver { 14 | public static int num_features; // needs to be set 15 | public static float alpha; // needs to be set 16 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 17 | //args[0] is the number of features each input has. 18 | num_features=Integer.parseInt(args[0]); 19 | ++num_features; 20 | //args[1] is the value of alpha that you want to use. 21 | alpha=Float.parseFloat(args[1]); 22 | Configuration conf=new Configuration(); 23 | FileSystem hdfs=FileSystem.get(conf); 24 | Float[] theta=new Float[num_features]; 25 | //args[2] is the number of times you want to iterate over your training set. 26 | for(int i=0;i { 9 | public static int count=0; 10 | public static long number_inputs=(long) 0; 11 | public static float alpha=0.0f; 12 | public static Float[] Xi=null; 13 | public static ArrayList theta_i=new ArrayList(); 14 | @Override 15 | public void setup(Context context) throws IOException, InterruptedException{ 16 | alpha=context.getConfiguration().getFloat("alpha",0), 17 | number_inputs=context.getCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue(); 18 | } 19 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | ++count; 21 | float h_theta=0; 22 | String[] tok=value.toString().split("\\,"); 23 | if(count==1){ 24 | for(int i=0;i{ 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 8 | float sum=0; 9 | int count=0; 10 | for(FloatWritable value:values){ 11 | sum+=value.get(); 12 | count++; 13 | } 14 | context.write(key, new FloatWritable(sum/count)); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /LogisticRegression_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input is a record of women who were diagnosed for Diabetes. Each line is an input point with the last value being the output. 2 | 3 | In this case, the output is either 0 or 1. 0 means tested negative for Diabetes and 1 means otherwise. 
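For reference, the prediction computed in thetaMAP.java further below is the logistic (sigmoid) hypothesis applied to a weighted sum of the features (with x[0] fixed to 1 as the intercept term, mirroring Xi[0] in the code). A minimal sketch, where theta and x are placeholder arrays rather than names from this repo:

double z = 0.0;
for (int i = 0; i < theta.length; i++)
    z += theta[i] * x[i];                     // weighted sum of the features, intercept included
double h_theta = 1.0 / (1.0 + Math.exp(-z));  // logistic hypothesis, in (0,1); values >= 0.5 correspond to class 1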
4 | 5 | This code can easily be converted into LWR (Locally Weighted Regression), a technique which discards the less critical (irrelevant) input points, by simply multiplying the weighting function which is given by; 6 | 7 | w(i) = exp(-(X[i]-X)^2/(2T^2)) ;; "exp" is "e^" 8 | 9 | X[i] is the input point 10 | X is the query point (The input for which you want to predict the output) 11 | T (Tao) is a constant like alpha. The higher the value of Tao, the higher is the range of the input points used (chosen) for prediction or wider is the weighting function and vice-versa. 12 | 13 | to the term "(alpha/number_inputs)*(Yi-h_theta)*(Xi[i]))" in the map function of the code. 14 | 15 | This algorithm takes in 5 arguments as follows: 16 | 17 | 1. The number of features each input point has 18 | 2. The value of alpha 19 | 3. The number of times you want your algorithm to iterate 20 | 4. The input path 21 | 5. The output path -------------------------------------------------------------------------------- /LogisticRegression_MapReduce/src/Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.IOException; 3 | import java.io.InputStreamReader; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.FloatWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | 14 | public class Driver { 15 | public static int num_features; // needs to be set 16 | public static float alpha; // needs to be set 17 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 18 | //args[0] is the number of features each input has. 19 | num_features=Integer.parseInt(args[0]); 20 | ++num_features; 21 | //args[1] is the value of alpha that you want to use. 22 | alpha=Float.parseFloat(args[1]); 23 | Configuration conf=new Configuration(); 24 | FileSystem hdfs=FileSystem.get(conf); 25 | Float[] theta=new Float[num_features]; 26 | //args[2] is the number of times you want to iterate over your training set. 
27 | for(int i=0;i { 9 | public static int count=0; 10 | public static long number_inputs=(long) 0; 11 | public static float alpha=0.0f; 12 | public static Float[] Xi=null; 13 | public static ArrayList theta_i=new ArrayList(); 14 | @Override 15 | public void setup(Context context) throws IOException, InterruptedException{ 16 | alpha=context.getConfiguration().getFloat("alpha",0); 17 | number_inputs=context.getCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS).getValue(); 18 | } 19 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | ++count; 21 | float h_theta=0; 22 | String[] tok=value.toString().split("\\,"); 23 | if(count==1){ 24 | for(int i=0;i=0){ 43 | h_theta=1; 44 | } 45 | else{ 46 | h_theta=0; 47 | } 48 | }*/ 49 | //If you choose to use the Logistic Function for learning 50 | if(i==(Xi.length-1)){ 51 | h_theta=(float) (1/(1+(Math.exp(-(exp))))); 52 | } 53 | } 54 | float Yi=Float.parseFloat(tok[tok.length-1]); 55 | for(int i=0;i{ 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 8 | float sum=0; 9 | int count=0; 10 | for(FloatWritable value:values){ 11 | sum+=value.get(); 12 | count++; 13 | } 14 | context.write(key, new FloatWritable(sum/count)); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This is a MapReduce implementation of one of the simplest algorithms called Market Basket Analysis. 2 | 3 | This algorithm helps the user to determine which items have been occuring together. 4 | In marketing terms, this algorihtm can help the vendor (online or local) to determine which items to be clubbed together on the shelf. 5 | For example, many customer might have bought butter along with bread. So naturally it would be a wise choice to juxtapose them on the shelf and this algorithm helps the vendors to do the same. 6 | 7 | The sample input data contains transactions of all the customers. 8 | It has a comma separated list of items bought by a customer. 9 | 10 | The sample output is the frequency of occurence of groups of items. 11 | 12 | This algorithm takes in three arguments: 13 | 14 | 1. The input path 15 | 2. The output path 16 | 3. Number of groupings i.e. How many items shoould be grouped together. Set this carefully as a the value of number of groupings should be always less than or equal to the number of items purchased by every customer. 
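To illustrate the grouping described above, here is a minimal standalone sketch (PairDemo is a hypothetical class, not part of this repo) of how one transaction line from the sample input expands into sorted item pairs when the number of groupings is 2; compare with the sample output that follows:

import java.util.Arrays;

public class PairDemo {
    public static void main(String[] args) {
        String[] items = "crackers,bread,banana".split(",");    // first transaction of the sample input
        Arrays.sort(items);                                      // banana, bread, crackers
        for (int i = 0; i < items.length - 1; i++)
            for (int j = i + 1; j < items.length; j++)
                System.out.println(items[i] + ", " + items[j]);  // each pair is later counted with a value of 1
        // prints: banana, bread / banana, crackers / bread, crackers
    }
}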
-------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/input/in.txt.txt: -------------------------------------------------------------------------------- 1 | crackers,bread,banana 2 | crackers,coke,butter,coffee 3 | crackers,bread 4 | crackers,bread 5 | crackers,bread,coffee 6 | butter,coke 7 | butter,coke,bread,crackers 8 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | bread, coffee 1 2 | butter, coffee 1 3 | banana, crackers 1 4 | butter, coke 3 5 | coffee, crackers 2 6 | bread, butter 1 7 | banana, bread 1 8 | bread, crackers 5 9 | coke, crackers 2 10 | bread, coke 1 11 | coffee, coke 1 12 | butter, crackers 2 13 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/src/MBA_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class MBA_Driver { 11 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 12 | Configuration conf=new Configuration(); 13 | conf.set("group_num", args[2]); 14 | Job job = new Job(conf); 15 | job.setJarByClass(MBA_Driver.class); 16 | job.setJobName("Market Basket Analysis"); 17 | FileInputFormat.setInputPaths(job, new Path(args[0])); 18 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 19 | job.setMapperClass(MBA_Mapper.class); 20 | job.setCombinerClass(MBA_Reducer.class); 21 | job.setReducerClass(MBA_Reducer.class); 22 | job.setOutputKeyClass(Text.class); 23 | job.setOutputValueClass(IntWritable.class); 24 | boolean success = job.waitForCompletion(true); 25 | System.exit(success ? 
0 : 1); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Market-Basket-Analysis_MapReduce/src/MBA_Mapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Arrays; 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | public class MBA_Mapper extends Mapper { 9 | public static int group_num = 2; 10 | @Override 11 | public void setup(Context context){ 12 | group_num=Integer.parseInt(context.getConfiguration().get("group_num")); 13 | } 14 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 15 | String[] vals=value.toString().split("\\,"); 16 | Arrays.sort(vals); 17 | if(vals.length>=group_num){ 18 | for(int i=0;i { 7 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 8 | int sum=0; 9 | for(IntWritable value:values){ 10 | sum=sum+value.get(); 11 | } 12 | if(sum>1) 13 | context.write(key, new IntWritable(sum)); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | The input stores the two matrices to be multiplied in a file. 2 | 3 | Each line is a row of a matrix. The first value of each row names the matrix and the last value is the row number of the matrix (starts from 0). 4 | 5 | In this case, the first matrix is stored normally and the second matrix is stored as the transpose. 6 | 7 | This way of storing reduces the filesize and helps the algorithm complete at a faster rate. -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | A,1,2,3,0 2 | A,3,4,5,1 3 | A,5,6,7,2 4 | B,2,12,4,0 5 | B,3,1,4,1 -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/src/MatMulDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | 9 | 10 | public class MatMulDriver { 11 | 12 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 13 | Configuration conf = new Configuration(); 14 | // A is an m-by-n matrix; B is an n-by-p matrix. 15 | conf.set("m", args[0]); 16 | conf.set("n", args[1]); 17 | conf.set("p", args[2]); 18 | Job job = new Job(conf, "Matrix_Multiplication"); 19 | job.setJarByClass(MatMulDriver.class); 20 | job.setOutputKeyClass(Text.class); 21 | job.setOutputValueClass(Text.class); 22 | job.setMapperClass(MatMulMap.class); 23 | //Don't use combiner if there is no scope of combining the output. Otherwise the job will get stuck. 24 | //job.setCombinerClass(MatMulModGenReduce.class); 25 | job.setReducerClass(MatMulReduce.class); 26 | //args[3] is the input path. 
27 | FileInputFormat.addInputPath(job, new Path(args[3])); 28 | //args[4] is the output path. 29 | FileOutputFormat.setOutputPath(job, new Path(args[4])); 30 | System.exit(job.waitForCompletion(true)?0:1); 31 | } 32 | } -------------------------------------------------------------------------------- /MatrixMultiplication_MapReduce/src/MatMulMap.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.io.LongWritable; 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Mapper; 5 | 6 | 7 | public class MatMulMap extends Mapper { 8 | public static int m=0,n=0,p=0; 9 | @Override 10 | public void setup(Context context) throws IOException, InterruptedException{ 11 | m = Integer.parseInt(context.getConfiguration().get("m")); 12 | n = Integer.parseInt(context.getConfiguration().get("n")); 13 | p = Integer.parseInt(context.getConfiguration().get("p")); 14 | } 15 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 16 | Text Key = new Text(); 17 | Text Value = new Text(); 18 | String line = value.toString(); 19 | String[] val = line.split("\\,"); 20 | if(val[0].contentEquals("A")){ 21 | for(int x=0;x{ 7 | int n=0; 8 | @Override 9 | public void setup(Context context){ 10 | n=Integer.parseInt(context.getConfiguration().get("n")); 11 | } 12 | public void reduce(Text key, Iterable values, Context context)throws IOException, InterruptedException{ 13 | String[] value; 14 | HashMap hashA = new HashMap(); 15 | HashMap hashB = new HashMap(); 16 | for (Text val : values) { 17 | value = val.toString().split(","); 18 | if (value[0].equals("A")) { 19 | for(int z=1;z<=n;z++){ 20 | hashA.put(z, Float.parseFloat(value[z]));} 21 | } else{ 22 | for(int a=1;a<=n;a++){ 23 | hashB.put(a, Float.parseFloat(value[a]));} 24 | } 25 | } 26 | float result = 0.0f; 27 | float a_ij; 28 | float b_jk; 29 | for (int j=1;j<=n;j++) { 30 | a_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f; 31 | b_jk = hashB.containsKey(j) ? hashB.get(j) : 0.0f; 32 | result +=a_ij*b_jk; 33 | } 34 | context.write(null, new Text(key.toString() + "," + Float.toString(result))); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This algorithm is called "Common Friends" algorithm. 2 | As the name suggests, it helps to find common items between entities. 3 | 4 | In this case, the sample input file is a file which stores the user_id of a person and user_ids of all its friends in the fllowing format: 5 | 6 | , .... 7 | 8 | Each person's user_id is separated by a comma from the friends' user_ids and friends' user_ids are separated by spaces. 9 | 10 | The sample output stores the user_ids of two persons and their mutual friends in the following fashion: 11 | 12 | , ,...| 13 | 14 | The two persons' user_ids are separated by a comma and from the friends' user_ids and counts by a tab. 15 | The mutual friends' user_ids are separated by commas and from count of the mutual friends by a "|" 16 | 17 | 18 | This algorithm takes in only two arguments: 19 | 20 | 1. The input path 21 | 2. 
The output path -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/input/in.txt.txt: -------------------------------------------------------------------------------- 1 | 100,200 300 400 500 600 2 | 200,100 300 400 3 | 300,100 200 400 500 4 | 400,100 200 300 5 | 500,100 300 6 | 600,100 7 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | 100,200 300,400|2 2 | 100,300 200,400,500|3 3 | 100,400 300,200|2 4 | 100,500 300|1 5 | 100,600 null 6 | 200,300 400,100|2 7 | 200,400 300,100|2 8 | 300,400 200,100|2 9 | 300,500 100|1 10 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/MF_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class MF_Driver { 11 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 12 | Configuration conf=new Configuration(); 13 | Job job = new Job(conf); 14 | job.setJarByClass(MF_Driver.class); 15 | job.setJobName("Mutual Friend Calculator"); 16 | FileInputFormat.setInputPaths(job, new Path(args[0])); 17 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 18 | job.setMapperClass(MF_Mapper.class); 19 | job.setCombinerClass(MF_Reducer.class); 20 | job.setReducerClass(MF_Reducer.class); 21 | job.setMapOutputKeyClass(Text.class); 22 | job.setMapOutputValueClass(Text.class); 23 | job.setOutputKeyClass(Text.class); 24 | job.setOutputValueClass(Text.class); 25 | boolean success = job.waitForCompletion(true); 26 | System.exit(success ? 
0 : 1); 27 | }; 28 | 29 | } 30 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/MF_Mapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.ArrayList; 3 | import java.util.Collections; 4 | import java.util.Map.Entry; 5 | import java.util.TreeMap; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | public class MF_Mapper extends Mapper { 11 | public static TreeMap> Friends=new TreeMap>(); 12 | public static ArrayList ArrToList (ArrayList l, String[] a){ 13 | for(String i:a) 14 | l.add(i); 15 | return l; 16 | } 17 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 18 | String[] vals=value.toString().split("\\,"); 19 | ArrayList al=ArrToList(new ArrayList(),vals[1].split(" ")); 20 | Collections.sort(al); 21 | Friends.put(vals[0],al); 22 | } 23 | @Override 24 | public void cleanup(Context context) throws IOException, InterruptedException{ 25 | for(Entry> s:new gen_mutual_friends_matrix().generate(Friends).entrySet()) 26 | context.write(new Text(s.getKey()), new Text(s.getValue().toString().replaceAll("\\[", "").replaceAll("\\]", "").replaceAll(" ", ""))); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/MF_Reducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.ArrayList; 3 | import java.util.HashSet; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | public class MF_Reducer extends Reducer { 8 | public static ArrayList ArrToList (ArrayList l, String[] a){ 9 | for(String i:a) 10 | l.add(i); 11 | return l; 12 | } 13 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 14 | ArrayList mutual_friends=new ArrayList(); 15 | for(Text value:values){ 16 | if(!value.toString().contentEquals("")){ 17 | String[] vals=value.toString().split("\\,"); 18 | ArrToList(mutual_friends,vals); 19 | } 20 | } 21 | HashSet hs=new HashSet(mutual_friends); 22 | if(hs.size()>0) 23 | context.write(key, new Text(hs.toString().replaceAll("\\[", "").replaceAll("\\]", "").replaceAll(" ", "")+"|"+hs.size())); 24 | else 25 | context.write(key, new Text("null")); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Mutual-Friends_MapReduce/src/gen_mutual_friends_matrix.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.Map.Entry; 3 | import java.util.TreeMap; 4 | 5 | public class gen_mutual_friends_matrix { 6 | public static TreeMap> list=new TreeMap>(); 7 | public TreeMap> generate(TreeMap> x){ 8 | for(Entry> s1:x.entrySet()){ 9 | for(Entry> s2:x.entrySet()){ 10 | if(!s1.getKey().contentEquals(s2.getKey()) && Integer.parseInt(s2.getKey())>Integer.parseInt(s1.getKey())){ 11 | ArrayList mutual=s1.getValue(); 12 | mutual.retainAll(s2.getValue()); 13 | list.put(s1.getKey()+","+s2.getKey(), mutual); 14 | } 15 | } 16 | } 17 | return list; 18 | } 19 | } -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/Readme.txt: 
-------------------------------------------------------------------------------- 1 | This is implementation of Naive Bayes Classifier on Hadoop using MapReduce. 2 | 3 | The example input that I have used in this project is file which tells us about a student's activity given certain factors. 4 | So the factors are (in order): Deadline?, Is there a Party?, Is he/she lazy? and finally the output is Activity. 5 | 6 | The correct output for this example has also been uploaded. 7 | 8 | The algorihm takes in three arguments: 9 | 10 | 1. The test input as a string of comma separated values for which you want to predict the activity for that particular person. 11 | 12 | 2. The path for the input. 13 | 14 | 3. The path for the output. 15 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/input/1.txt: -------------------------------------------------------------------------------- 1 | Urgent,Yes,Yes,Party 2 | Urgent,No,Yes,Study 3 | Near,Yes,Yes,Party 4 | None,Yes,No,Party 5 | None,No,Yes,Pub 6 | None,Yes,No,Party 7 | Near,No,No,Study 8 | Near,No,Yes,TV 9 | Near,Yes,Yes,Party 10 | Urgent,No,No,Study 11 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/output/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/output/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Naive_Bayes_Classifier_MapReduce/output/.part-r-00000.crc -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/output/part-r-00000: -------------------------------------------------------------------------------- 1 | TV 2 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/src/NBCDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | 11 | public class NBCDriver { 12 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 13 | Configuration conf=new Configuration(); 14 | // The test input for which you want to find the acitivity that the Person should be doing 15 | conf.set("test_input", args[0]); 16 | Job job = new Job(conf); 17 | job.setJarByClass(NBCDriver.class); 18 | job.setJobName("Naive_Bayes_calssifier using Hadoop"); 19 | FileInputFormat.setInputPaths(job, new Path(args[1])); 20 | FileOutputFormat.setOutputPath(job, new Path(args[2])); 21 | job.setMapperClass(NBCMap.class); 22 | job.setReducerClass(NBCReduce.class); 23 | job.setMapOutputKeyClass(IntWritable.class); 24 | job.setMapOutputValueClass(Text.class); 25 | job.setOutputKeyClass(IntWritable.class); 26 | job.setOutputValueClass(Text.class); 27 | boolean success = job.waitForCompletion(true); 28 | System.exit(success ? 
0 : 1); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/src/NBCMap.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | public class NBCMap extends Mapper{ 11 | public static String output_key; 12 | public static String[] test_input=null; 13 | public static int count=0; 14 | public static HashMap inputs=new HashMap(); 15 | public static double output_value=Double.NEGATIVE_INFINITY; 16 | public static HashMap output= new HashMap(); 17 | public static HashMap outcome_count= new HashMap(); 18 | public static HashMap features_count= new HashMap(); 19 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 20 | if(test_input==null) 21 | test_input=context.getConfiguration().get("test_input").split("\\,"); 22 | String[] input=value.toString().split("\\,"); 23 | for(int j=0;j o_c:outcome_count.entrySet()){ 46 | String output_class=o_c.getKey(); 47 | for(Entry i:inputs.entrySet()){ 48 | if(!features_count.containsKey(i.getKey()+"|"+output_class)) 49 | features_count.put(i.getKey()+"|"+output_class, (double) 0); 50 | } 51 | double output_class_count=o_c.getValue(); 52 | double probability=output_class_count/count; 53 | for(Entry f_c:features_count.entrySet()){ 54 | if(f_c.getKey().split("\\|")[1].contentEquals(output_class)) 55 | probability=probability*(f_c.getValue()/output_class_count); 56 | } 57 | output.put(output_class, probability); 58 | } 59 | for(Entry o:output.entrySet()){ 60 | if(o.getValue()>output_value){ 61 | output_value=o.getValue(); 62 | output_key=o.getKey(); 63 | } 64 | } 65 | context.write(new IntWritable(1),new Text(output_key)); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /Naive_Bayes_Classifier_MapReduce/src/NBCReduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class NBCReduce extends Reducer{ 10 | public void reduce(IntWritable key, Iterable values, Context context) throws IOException, InterruptedException{ 11 | Double out_value=Double.NEGATIVE_INFINITY; 12 | String out_key=null; 13 | HashMap final_output=new HashMap(); 14 | for(Text value:values){ 15 | if(final_output.containsKey(value.toString())) 16 | final_output.put(value.toString(), final_output.get(value.toString())+1); 17 | else 18 | final_output.put(value.toString(), 1); 19 | } 20 | for(Entry output:final_output.entrySet()){ 21 | if(output.getValue()>out_value){ 22 | out_value=(double) output.getValue(); 23 | out_key=output.getKey(); 24 | } 25 | } 26 | context.write(null, new Text(out_key)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLHadoop 2 | This repository contains Machine-Learning MapReduce codes for Hadoop which are written from scratch (without using any package or 
library). So you'll find codes written right from the basic Mathematics required for all of these Algorithms. 3 | e.g. Prediction Algorithms (Linear and Logistic Regression - Iterative Version), Clustering Algorithm (K-Means Clustering), Classification Algorithm (KNN Classifier), MBA, Common Friends etc. 4 | 5 | NOTE: I think some of the algorithms implemented here can be improved in time as well as space by controlling the shuffle-sort phase between a MapReduce job i.e by writing and implementing your own custom Secondary Sort class as the shuffle-sort phase takes up a lot of time. If you have a sort order of key-value pairs in mind and if you are running multiple jobs or extra sorting methods inside mappers and reducers just to get the correct sort order, then, secondary sorting might come in handy as it will speed up the jobs and will use lesser RAM. 6 | 7 | Language used: Java 8 | 9 | IDE used: Eclipse IDE with [HDT (Hadoop Development Tools)](https://archive.apache.org/dist/incubator/hdt/hdt-0.0.2.incubating/hdt-0.0.2.incubating-bin.tar.gz) plugin installed. 10 | 11 | Hadoop version used: 1.2.1 12 | 13 | I wrote these codes when I was just a novice (in terms of MapReduce programming as well as programming in general) and therefore I am certain the code is very inefficient and there are a lot of optimisations yet to be done in this. So feel free to point out the mistakes or create PRs if you are interested. 14 | 15 | License 16 | Copyright © 2023 [Punit Naik](https://github.com/punit-naik) 17 | 18 | This program and the accompanying materials are made available under the terms of the Eclipse Public License 2.0 which is available at http://www.eclipse.org/legal/epl-2.0. 19 | 20 | This Source Code may also be made available under the following Secondary Licenses when the conditions for such availability set forth in the Eclipse Public License, v. 2.0 are satisfied: GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version, with the GNU Classpath Exception which is available at https://www.gnu.org/software/classpath/license.html. 21 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/Readme.txt: -------------------------------------------------------------------------------- 1 | This is an Algorithm which generates Recommendations for users by using the Collaborative Filtering technique. 2 | 3 | This algorithm takes in four arguments, namely: 4 | 5 | 1. args[0]: The path which will store the value "n" for a particular task_id. It also the "n" part of matrices co_oc_mat and user_scoring_mat where co_oc_mat has dimensions of m x n and sorted_user_scoring_mat has dimensions n x p. 6 | 7 | 2. args[1]: The path to the input. 8 | 9 | 3. args[2]: The intermediate output of the program which is also the input to the final MR Job. 10 | 11 | 4. args[3]: The final output path which will contain recommendations for users. Each group of users will be identified by their task_IDs. 
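For illustration, a hedged invocation sketch using the paths that ship with this repo (RunRecommendation is a hypothetical wrapper class, and it assumes RecDriver exposes the usual main(String[]) entry point taking the four arguments in the order described above):

public class RunRecommendation {
    public static void main(String[] args) throws Exception {
        String[] jobArgs = {
            "outputs/n.txt",                // args[0]: where the value "n" is stored per task_id
            "input/recommendation.txt",     // args[1]: input ratings in user,item,score form
            "outputs/Intermediate_output",  // args[2]: intermediate output, fed to the final MR job
            "outputs/final_output"          // args[3]: final recommendations for each user
        };
        RecDriver.main(jobArgs);            // assumption: standard Hadoop driver entry point
    }
}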
-------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/input/recommendation.txt: -------------------------------------------------------------------------------- 1 | 1,101,5.0 2 | 1,102,3.0 3 | 1,103,2.5 4 | 2,101,2.0 5 | 2,102,2.5 6 | 2,103,5.0 7 | 2,104,2.0 8 | 3,101,2.0 9 | 3,104,4.0 10 | 3,105,4.5 11 | 3,107,5.0 12 | 4,101,5.0 13 | 4,103,3.0 14 | 4,104,4.5 15 | 4,106,4.0 16 | 5,101,4.0 17 | 5,102,3.0 18 | 5,103,2.0 19 | 5,104,4.0 20 | 5,105,3.5 21 | 5,106,4.0 -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/outputs/Intermediate_output/part-r-00000: -------------------------------------------------------------------------------- 1 | file:/home/punit/recommendation.txt:0+209-->102,3;16.0 2 | file:/home/punit/recommendation.txt:0+209-->102,4;32.5 3 | file:/home/punit/recommendation.txt:0+209-->103,3;10.0 4 | file:/home/punit/recommendation.txt:0+209-->104,1;24.0 5 | file:/home/punit/recommendation.txt:0+209-->105,1;0.0 6 | file:/home/punit/recommendation.txt:0+209-->105,2;2.0 7 | file:/home/punit/recommendation.txt:0+209-->105,4;4.5 8 | file:/home/punit/recommendation.txt:0+209-->106,1;0.0 9 | file:/home/punit/recommendation.txt:0+209-->106,2;2.0 10 | file:/home/punit/recommendation.txt:0+209-->106,3;4.0 11 | file:/home/punit/recommendation.txt:0+209-->107,1;5.0 12 | file:/home/punit/recommendation.txt:0+209-->107,2;4.0 13 | file:/home/punit/recommendation.txt:0+209-->107,4;9.5 14 | file:/home/punit/recommendation.txt:0+209-->107,5;11.5 15 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/outputs/final_output/part-r-00000: -------------------------------------------------------------------------------- 1 | 1 104,24.0 2 | 2 105,2.0 3 | 3 102,16.0 4 | 4 102,32.5 5 | 5 107,11.5 6 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/outputs/n.txt: -------------------------------------------------------------------------------- 1 | file:/home/punit/recommendation.txt:0+209-->7 2 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/FinalMap.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.HashMap; 3 | import java.util.Map.Entry; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | 10 | public class FinalMap extends Mapper { 11 | public static String delimiter=null; 12 | public static HashMap map=new HashMap(); 13 | @Override 14 | public void setup(Context context){ 15 | delimiter=context.getConfiguration().get("delimiter"); 16 | } 17 | @Override 18 | public void map(LongWritable key, Text value, Context context) 19 | throws IOException, InterruptedException { 20 | String[] parts=value.toString().split("\\;"); 21 | String score=parts[1]; 22 | String[] parts2=parts[0].split(delimiter); 23 | String[] parts3=parts2[1].split("\\,"); 24 | String user=parts3[1]; 25 | String item=parts3[0]; 26 | if(!map.containsKey(user)){ 27 | map.put(user, item+","+score); 28 | } 29 | else{ 30 | String[] old=map.get(user).split(","); 31 | if(Double.parseDouble(score)>Double.parseDouble(old[0])){ 32 | map.put(user, item+","+score); 33 | } 34 | 
} 35 | } 36 | @Override 37 | public void cleanup(Context context) throws IOException, InterruptedException{ 38 | for(Entry entry:map.entrySet()){ 39 | context.write(new Text(entry.getKey()), new Text(entry.getValue())); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/FinalReduce.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.io.Text; 3 | import org.apache.hadoop.mapreduce.Reducer; 4 | 5 | public class FinalReduce extends Reducer{ 6 | String delimiter=null,identifier=null; 7 | @Override 8 | public void setup(Context context){ 9 | delimiter=context.getConfiguration().get("delimiter"); 10 | identifier=context.getTaskAttemptID().getTaskID().getId()+delimiter; 11 | } 12 | @Override 13 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 14 | for(Text val: values){ 15 | context.write(new Text(/*identifier+*/key.toString()), val);//new Text(val.toString().split("\\,")[1])); 16 | } 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/RecDriver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import org.apache.hadoop.conf.Configuration; 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Job; 6 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 7 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 8 | 9 | 10 | public class RecDriver { 11 | public static String delimiter="-->"; 12 | public static String outFile=null; 13 | public static String rec_in=null; 14 | public static String mid_out=null; 15 | public static String final_out=null; 16 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 17 | // args[0] is the path of the file which stores the number of unique items "n" and its identification 18 | // which is the task_id. 19 | // It also the "n" part of matrices co_oc_mat and user_scoring_mat 20 | // where co_oc_mat has dimensions of m x n 21 | // and sorted_user_scoring_mat has dimensions n x p 22 | String a=String.valueOf(args[0].charAt(args[0].length()-1)); 23 | if(!"/".contentEquals(a)){ 24 | args[0]=args[0]+"/"; 25 | } 26 | outFile=args[0]+"n.txt"; 27 | 28 | //args[1] is the input file. 29 | rec_in=args[1]; 30 | 31 | //args[2] is the intermediate output which is also the input to final recommendation job. 32 | mid_out=args[2]; 33 | 34 | //args[3] is the final output. 
35 | final_out=args[3]; 36 | 37 | run1(args); 38 | run2(args); 39 | } 40 | public static void run1(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 41 | Configuration conf = new Configuration(); 42 | conf.set("outFile", outFile); 43 | conf.set("delimiter", delimiter); 44 | Job job = new Job(conf, "Recommendations_CollaborativeFiltering_Prepare"); 45 | job.setJarByClass(RecDriver.class); 46 | job.setOutputKeyClass(Text.class); 47 | job.setOutputValueClass(Text.class); 48 | job.setMapperClass(RecMap.class); 49 | job.setReducerClass(RecReduce.class); 50 | FileInputFormat.addInputPath(job, new Path(rec_in)); 51 | FileOutputFormat.setOutputPath(job, new Path(mid_out)); 52 | job.waitForCompletion(true); 53 | } 54 | public static void run2(String[] args) throws IOException, InterruptedException, ClassNotFoundException{ 55 | Configuration conf = new Configuration(); 56 | conf.set("delimiter", delimiter); 57 | Job job = new Job(conf, "Recommendations_CollaborativeFiltering_Final"); 58 | job.setJarByClass(RecDriver.class); 59 | job.setOutputKeyClass(Text.class); 60 | job.setOutputValueClass(Text.class); 61 | job.setMapperClass(FinalMap.class); 62 | job.setReducerClass(FinalReduce.class); 63 | FileInputFormat.addInputPath(job, new Path(mid_out)); 64 | FileOutputFormat.setOutputPath(job, new Path(final_out)); 65 | System.exit(job.waitForCompletion(true)?0:1); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/RecMap.java: -------------------------------------------------------------------------------- 1 | import java.io.BufferedReader; 2 | import java.io.BufferedWriter; 3 | import java.io.IOException; 4 | import java.io.InputStreamReader; 5 | import java.io.OutputStream; 6 | import java.io.OutputStreamWriter; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.Map.Entry; 10 | import java.util.TreeMap; 11 | 12 | import org.apache.hadoop.fs.FileSystem; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.LongWritable; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | 18 | 19 | public class RecMap extends Mapper { 20 | public static String delimiter=null; 21 | public static String identifier=null; 22 | public static TreeMap co_oc_mat=new TreeMap(); 23 | public static HashMap user_scoring_mat=new HashMap(); 24 | public static TreeMap sorted_user_scoring_mat=new TreeMap(); 25 | public static ArrayList vals=new ArrayList(); 26 | public static ArrayList unique_items=new ArrayList(); 27 | public static ArrayList unique_users=new ArrayList(); 28 | public static int a=0; 29 | @Override 30 | public void setup(Context context){ 31 | delimiter=context.getConfiguration().get("delimiter"); 32 | identifier=context.getInputSplit()+delimiter; 33 | } 34 | @Override 35 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 36 | ++a; 37 | String b=value.toString(); 38 | vals.add(b); 39 | String[] parts=b.split("\\,"); 40 | user_scoring_mat.put(parts[0]+","+parts[1], Float.parseFloat(parts[2])); 41 | } 42 | @Override 43 | public void cleanup(Context context) throws IOException, InterruptedException{ 44 | co_oc_mat.putAll(new get_co_oc_mat().get(vals, a)); 45 | unique_users.addAll(new get_unique_users().get(vals, a)); 46 | unique_items.addAll(new get_unique_items().get(vals, a)); 47 | FileSystem hdfs = FileSystem.get(context.getConfiguration()); 48 
| Path outFile=new Path(context.getConfiguration().get("outFile")); 49 | String line1=""; 50 | if (!hdfs.exists(outFile)){ 51 | OutputStream out = hdfs.create(outFile); 52 | BufferedWriter br = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); 53 | br.write(identifier+unique_items.size()+"\n"); 54 | br.close(); 55 | hdfs.close(); 56 | } 57 | else{ 58 | String line2=null; 59 | BufferedReader br1 = new BufferedReader(new InputStreamReader(hdfs.open(outFile))); 60 | while((line2=br1.readLine())!=null){ 61 | line1=line1.concat(line2)+"\n"; 62 | } 63 | br1.close(); 64 | hdfs.delete(outFile, true); 65 | OutputStream out = hdfs.create(outFile); 66 | BufferedWriter br2 = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); 67 | br2.write(line1+identifier+unique_items.size()+"\n"); 68 | br2.close(); 69 | hdfs.close(); 70 | } 71 | for(int i=0;i entry: co_oc_mat.entrySet()){ 84 | String check_val=entry.getKey().split("\\,")[0]; 85 | if(!prev.contentEquals(check_val)){ 86 | // If code enters this block, it will mean that the row has changed 87 | // We have to transmit the aggregated values of the previous row and re-initialise the values. 88 | if(row_num==-1){ 89 | prev=check_val; 90 | //++row_num; 91 | row_num=Integer.parseInt(check_val); 92 | } 93 | else{ 94 | for(int i=0;i entry: sorted_user_scoring_mat.entrySet()){ 119 | String check_val=entry.getKey().split("\\,")[0]; 120 | if(!prev2.contentEquals(check_val)){ 121 | // If code enters this block, it will mean that the column has changed 122 | // We have to transmit the aggregated values of the previous column and re-initialise the values. 123 | if(col_num==-1){ 124 | prev2=check_val; 125 | //++col_num; 126 | col_num=Integer.parseInt(check_val); 127 | } 128 | else{ 129 | for(int i=0;i{ 12 | public static String delimiter=null; 13 | @Override 14 | public void setup(Context context){ 15 | delimiter=context.getConfiguration().get("delimiter"); 16 | } 17 | @Override 18 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException{ 19 | int n=0; 20 | if(n==0){ 21 | FileSystem hdfs= FileSystem.get(context.getConfiguration()); 22 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(context.getConfiguration().get("outFile"))))); 23 | String line=null; 24 | while((line=br.readLine())!=null){ 25 | String[] parts=line.replaceAll("\n", "").split(delimiter); 26 | if((key.toString().split(delimiter)[0]).contentEquals(parts[0])){ 27 | n=Integer.parseInt(parts[1]); 28 | break; 29 | } 30 | } 31 | br.close(); 32 | hdfs.close(); 33 | } 34 | String[] value=null; 35 | double pref=0; 36 | HashMap hashA = new HashMap(); 37 | HashMap hashB = new HashMap(); 38 | for (Text val : values) { 39 | if(val.toString().contains(",")){ 40 | value = val.toString().split(","); 41 | if (value[0].equals("A")) { 42 | for(int z=1;z<=n;z++){ 43 | hashA.put(z, Float.parseFloat(value[z]));} 44 | } else{ 45 | for(int a=1;a<=n;a++){ 46 | hashB.put(a, Float.parseFloat(value[a]));} 47 | } 48 | } 49 | else{ 50 | pref=Double.parseDouble(val.toString()); 51 | } 52 | } 53 | float result = 0.0f; 54 | float a_ij; 55 | float b_jk; 56 | for (int j=1;j<=n;j++) { 57 | a_ij = hashA.containsKey(j) ? hashA.get(j) : 0.0f; 58 | b_jk = hashB.containsKey(j) ? 
hashB.get(j) : 0.0f; 59 | result +=a_ij*b_jk; 60 | } 61 | if(pref==0.0){ 62 | context.write(null, new Text(key.toString() + ";" + Float.toString(result))); 63 | } 64 | //delimiter=null; 65 | n=0; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/get_co_oc_mat.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.Collections; 3 | import java.util.HashMap; 4 | import java.util.LinkedHashSet; 5 | 6 | public class get_co_oc_mat{ 7 | public HashMap get(ArrayList vals, int a){ 8 | HashMap co_oc_mat=new HashMap(); 9 | ArrayList items=new ArrayList(); 10 | ArrayList unique_items=null; 11 | ArrayList users=new ArrayList(); 12 | ArrayList unique_users=null; 13 | for(int i=0;i(new LinkedHashSet(users)); 19 | Collections.sort(unique_users); 20 | unique_items=new ArrayList(new LinkedHashSet(items)); 21 | Collections.sort(unique_items); 22 | 23 | // Updating Diagonal Elements of co_oc_mat; 24 | for(int i=0;i1){ 55 | break; 56 | } 57 | else{ 58 | i++; 59 | j++; 60 | } 61 | } 62 | } 63 | } 64 | 65 | // remaining elements are assigned to 0 66 | for(int i=0;i get(ArrayList vals, int a){ 6 | ArrayList items=new ArrayList(); 7 | ArrayList unique_items=new ArrayList(); 8 | for(int i=0;i(new LinkedHashSet(items)); 13 | return unique_items; 14 | } 15 | } -------------------------------------------------------------------------------- /Recommendation_Collaborative_Filtering_MapReduce/src/get_unique_users.java: -------------------------------------------------------------------------------- 1 | import java.util.ArrayList; 2 | import java.util.LinkedHashSet; 3 | 4 | public class get_unique_users{ 5 | public ArrayList get(ArrayList vals, int a){ 6 | ArrayList users=new ArrayList(); 7 | ArrayList unique_users=new ArrayList(); 8 | for(int i=0;i(new LinkedHashSet(users)); 13 | return unique_users; 14 | } 15 | } -------------------------------------------------------------------------------- /Top_N_MapReduce/ReadMe.txt: -------------------------------------------------------------------------------- 1 | This is a MapReduce implementation of the 'Top N' algorithm. It finds top 'N' items based on their corresponding value. 2 | 3 | This algorithm expects 3 arguments: 4 | 5 | 1. N i.e. the 'N' part of the 'Top N' algorithm 6 | 2. The input path 7 | 3. The output path 8 | 9 | The input provided in this example is just a csv file with two comma-separated values which are the item and its value respectively. The top 'N' items here are found based on their aggregated values. 
10 | 11 | NOTE: In the example output, I have set n = 5 -------------------------------------------------------------------------------- /Top_N_MapReduce/in/1.txt: -------------------------------------------------------------------------------- 1 | A,2 2 | B,2 3 | C,3 4 | D,2 5 | E,1 6 | G,2 7 | A,3 8 | B,4 9 | Z,100 10 | Z,1 -------------------------------------------------------------------------------- /Top_N_MapReduce/in/2.txt: -------------------------------------------------------------------------------- 1 | A,1 2 | B,1 3 | C,3 4 | E,1 5 | F,1 6 | G,2 7 | A,65 8 | A,3 9 | -------------------------------------------------------------------------------- /Top_N_MapReduce/in/3.txt: -------------------------------------------------------------------------------- 1 | A,2 2 | B,2 3 | C,1 4 | D,2 5 | E,1 6 | F,1 7 | G,2 8 | -------------------------------------------------------------------------------- /Top_N_MapReduce/out/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /Top_N_MapReduce/out/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Top_N_MapReduce/out/.part-r-00000.crc -------------------------------------------------------------------------------- /Top_N_MapReduce/out/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/Top_N_MapReduce/out/_SUCCESS -------------------------------------------------------------------------------- /Top_N_MapReduce/out/part-r-00000: -------------------------------------------------------------------------------- 1 | Z,101 2 | A,76 3 | B,9 4 | C,7 5 | G,6 6 | -------------------------------------------------------------------------------- /Top_N_MapReduce/src/Top_N_Driver.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Job; 7 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | public class Top_N_Driver { 11 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 12 | Configuration conf=new Configuration(); 13 | conf.set("N", args[0]); 14 | Job job = new Job(conf); 15 | job.setJarByClass(Top_N_Driver.class); 16 | job.setJobName("Top_N_Driver"); 17 | FileInputFormat.setInputPaths(job, new Path(args[1])); 18 | FileOutputFormat.setOutputPath(job, new Path(args[2])); 19 | job.setMapperClass(Top_N_Mapper.class); 20 | job.setReducerClass(Top_N_Reducer.class); 21 | job.setOutputKeyClass(Text.class); 22 | job.setOutputValueClass(Text.class); 23 | boolean success = job.waitForCompletion(true); 24 | System.exit(success ? 
0 : 1); 25 | }; 26 | 27 | } 28 | -------------------------------------------------------------------------------- /Top_N_MapReduce/src/Top_N_Mapper.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Collections; 3 | import java.util.Comparator; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.LinkedHashMap; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import org.apache.hadoop.io.LongWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | 16 | public class Top_N_Mapper extends Mapper { 17 | public static Map sortByComparator(Map m){ 18 | List> list=new LinkedList>(m.entrySet()); 19 | Collections.sort(list, new Comparator>(){ 20 | @Override 21 | public int compare(Entry o1, Entry o2) { 22 | return -(o1.getValue().compareTo(o2.getValue())); 23 | } 24 | }); 25 | Map sortedMap=new LinkedHashMap(); 26 | for(Iterator> it=list.iterator(); it.hasNext();){ 27 | Entry e=it.next(); 28 | sortedMap.put(e.getKey(), e.getValue()); 29 | } 30 | return sortedMap; 31 | } 32 | public static Map sm=new HashMap(); 33 | public static int N=0; 34 | @Override 35 | public void setup(Context context){ 36 | N=Integer.parseInt(context.getConfiguration().get("N")); 37 | } 38 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 39 | String[] values=value.toString().split(","); 40 | if(sm.containsKey(values[0])) 41 | sm.put(values[0], sm.get(values[0])+Integer.parseInt(values[1])); 42 | else 43 | sm.put(values[0], Integer.parseInt(values[1])); 44 | } 45 | @Override 46 | public void cleanup(Context context) throws IOException, InterruptedException{ 47 | int count=0; 48 | // Sorting based on values descendingly 49 | Map p=sortByComparator(sm); 50 | Map x=new LinkedHashMap(); 51 | for(Entry e:p.entrySet()){ 52 | if(count<=N){ 53 | x.put(e.getKey(), e.getValue()); 54 | count++; 55 | } 56 | else 57 | break; 58 | } 59 | context.write(new Text("1"), new Text(x.toString())); 60 | sm.clear(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /Top_N_MapReduce/src/Top_N_Reducer.java: -------------------------------------------------------------------------------- 1 | import java.io.IOException; 2 | import java.util.Collections; 3 | import java.util.Comparator; 4 | import java.util.HashMap; 5 | import java.util.Iterator; 6 | import java.util.LinkedHashMap; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import org.apache.commons.lang.StringUtils; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | 16 | public class Top_N_Reducer extends Reducer { 17 | public static Map m=new HashMap(); 18 | public static int N=0; 19 | public static Map sortByComparator(Map m){ 20 | List> list=new LinkedList>(m.entrySet()); 21 | Collections.sort(list, new Comparator>(){ 22 | @Override 23 | public int compare(Entry o1, Entry o2) { 24 | return -(o1.getValue().compareTo(o2.getValue())); 25 | } 26 | }); 27 | Map sortedMap=new LinkedHashMap(); 28 | for(Iterator> it=list.iterator(); it.hasNext();){ 29 | Entry e=it.next(); 30 | sortedMap.put(e.getKey(), e.getValue()); 31 | } 32 | return sortedMap; 33 | } 34 | @Override 35 | public void setup(Context context){ 36 | 
N=Integer.parseInt(context.getConfiguration().get("N")); 37 | } 38 | public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 39 | for(Text value:values){ 40 | String val=StringUtils.substringBetween(value.toString(),"{","}"); 41 | String[] key_val=val.split(","); 42 | for(String pair:key_val){ 43 | String[] entry=pair.split("="); 44 | if(m.containsKey(entry[0].trim())) 45 | m.put(entry[0].trim(), m.get(entry[0].trim())+Integer.parseInt(entry[1].trim())); 46 | else 47 | m.put(entry[0].trim(), Integer.parseInt(entry[1].trim())); 48 | } 49 | } 50 | } 51 | @Override 52 | public void cleanup(Context context) throws IOException, InterruptedException{ 53 | // Sorting based on values descendingly 54 | Map x=sortByComparator(m); 55 | int count=0; 56 | for(Entry e:x.entrySet()){ 57 | if(count 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /lu_decomposition/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | lu_decomposition 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.apache.hdt.mrnature 16 | org.eclipse.jdt.core.javanature 17 | 18 | 19 | -------------------------------------------------------------------------------- /lu_decomposition/README.md: -------------------------------------------------------------------------------- 1 | # LU Decomposition 2 | 3 | This mapreduce algorithm splits massively large matrix into it's `L` and `U` components. It uses the Naive Gaussian Elimination technique to do so. 4 | 5 | # Program Execution Arguments 6 | 7 | This programs only expects two arguments: 8 | 9 | 1. An input path 10 | 2. An output path 11 | 12 | # Input and Output data shape 13 | 14 | Both the input and output matrix shapes are the **SAME**. This program expects and produces the textual input of matrices in the following manner: 15 | 16 | `row_number + "\t" + elem-1 + "," + elem-2 + "," + elem-3 ...` 17 | 18 | The text files should be a tab-separated list of `row_number`s and comma-separated row elements 19 | 20 | # Final Output Location 21 | 22 | This program produces various intermediate outputs. But the actual output (`L` and `U` matrices) are present in the paths ` + "-merged/lower"` and ` + "-merged/upper"`. 23 | 24 | It's shapes will correspond to the shapes defined above. 25 | 26 | **NOTE**: I have provided the input and all the output (intermediate and actual) folders, you can use them to verify your outputs. 27 | 28 | # Limitations 29 | 30 | This program uses Naive Gaussian Elimination method as mentioned eariler which produces a lot of intermediate outputs. This is fine for large datasets but as the daaset grows (with the number of input rows), this program will produce a lot of intermediate outputs which might cause a bottleneck on the I/O. 31 | 32 | **NOTE**: Disk I/O can be significantly improved using Chained mappers and reducers in the MR job. 
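Chaining is not implemented in this repository, but a rough sketch of what the NOTE above could look like with Hadoop's new-API chaining classes follows. It assumes a Hadoop release that ships `org.apache.hadoop.mapreduce.lib.chain` (2.x and later; this repository targets 1.2.1), and the mapper/reducer classes here are pass-through placeholders, not code from this project:

```java
// Sketch only: two map stages and one reduce stage inside a single job, so
// the intermediate records are handed between map stages in memory instead
// of being written to HDFS between separate jobs.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;

public class ChainedJobSketch {

  // Placeholder first map stage: passes each input row through as a key.
  public static class FirstMapper extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value, Context ctx) throws IOException, InterruptedException {
      ctx.write(value, new Text(""));
    }
  }

  // Placeholder second map stage: consumes the first stage's output directly.
  public static class SecondMapper extends Mapper<Text, Text, Text, Text> {
    public void map(Text key, Text value, Context ctx) throws IOException, InterruptedException {
      ctx.write(key, value);
    }
  }

  // Placeholder reducer at the end of the chain.
  public static class PassThroughReducer extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context ctx) throws IOException, InterruptedException {
      ctx.write(key, new Text(""));
    }
  }

  public static Job buildJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "chained-map-stages-sketch");
    job.setJarByClass(ChainedJobSketch.class);
    // Two map stages run back to back in the same map task.
    ChainMapper.addMapper(job, FirstMapper.class, LongWritable.class, Text.class, Text.class, Text.class, new Configuration(false));
    ChainMapper.addMapper(job, SecondMapper.class, Text.class, Text.class, Text.class, Text.class, new Configuration(false));
    // A single reduce stage closes the chain.
    ChainReducer.setReducer(job, PassThroughReducer.class, Text.class, Text.class, Text.class, Text.class, new Configuration(false));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    return job;
  }
}
```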
33 | 34 | # Deprecated 35 | 36 | This version is now depricated and you can find the newer, improved, low disk I/O version of this code at [LUDecomposition](https://github.com/punit-naik/MLHadoop/tree/master/LUDecomposition) 37 | -------------------------------------------------------------------------------- /lu_decomposition/input/test_input_4x4.txt: -------------------------------------------------------------------------------- 1 | 0 1,5,0,0 2 | 1 2,12,5,0 3 | 2 0,4,13,5 4 | 3 0,0,6,11 5 | -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-merged/lower/.part-r-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-merged/lower/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-merged/lower/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,0.0,0.0,0.0 2 | 1 2.0,1.0,0.0,0.0 3 | 2 0.0,2.0,1.0,0.0 4 | 3 0.0,0.0,2.0,1.0 5 | -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/.part-r-00000.crc: -------------------------------------------------------------------------------- 1 | crcRR� -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-merged/upper/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-merged/upper/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,5.0,0.0,0.0 2 | 1 0.0,2.0,5.0,0.0 3 | 2 0.0,0.0,3.0,5.0 4 | 3 0.0,0.0,0.0,1.0 5 | -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/.part-m-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-0/.part-m-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-0/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-run-0/part-m-00000: -------------------------------------------------------------------------------- 1 | 0 1.0,5.0,0.0,0.0 2 | 1,0 2.0 3 | 1 0.0,2.0,5.0,0.0 4 | 2,0 0.0 5 | 2 0.0,4.0,13.0,5.0 6 | 3,0 0.0 7 | 3 0.0,0.0,6.0,11.0 8 | -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/.part-m-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-1/.part-m-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-1/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-run-1/part-m-00000: -------------------------------------------------------------------------------- 1 | 1 0.0,2.0,5.0,0.0 2 | 0 1.0,5.0,0.0,0.0 3 | 2,1 2.0 4 | 2 0.0,0.0,3.0,5.0 5 | 3,1 0.0 6 | 3 0.0,0.0,6.0,11.0 7 | -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/.part-m-00000.crc: -------------------------------------------------------------------------------- 1 | crcЇغ -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output-run-2/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output-run-2/part-m-00000: -------------------------------------------------------------------------------- 1 | 2 0.0,0.0,3.0,5.0 2 | 1 0.0,2.0,5.0,0.0 3 | 0 1.0,5.0,0.0,0.0 4 | 3,2 2.0 5 | 3 0.0,0.0,0.0,1.0 6 | -------------------------------------------------------------------------------- /lu_decomposition/output/nth/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output/nth/.part-m-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/nth/.part-m-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output/nth/_SUCCESS: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/nth/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output/nth/part-m-00000: -------------------------------------------------------------------------------- 1 | 2 0.0,0.0,3.0,5.0 2 | -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/.part-r-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/total_records/.part-r-00000.crc -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/punit-naik/MLHadoop/5dc27f3e1f67635341b5dfab1b2c594bafdfc837/lu_decomposition/output/total_records/_SUCCESS -------------------------------------------------------------------------------- /lu_decomposition/output/total_records/part-r-00000: -------------------------------------------------------------------------------- 1 | 0 4 2 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/FindNthRow/find_nth_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.FindNthRow; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.NullWritable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | import lu_decomposition.naive_gausssian.io.LongAndTextWritable; 16 | 17 | public class find_nth_driver { 18 | 19 | public static String readNthRow (String path, Configuration conf) throws IOException { 20 | FileSystem hdfs=FileSystem.get(conf); 21 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path+"/part-m-00000")))); 22 | String records = br.readLine().split("\\s")[1]; 23 | br.close(); 24 | return records; 25 | } 26 | 27 | public static String run (String[] args, long n, long total_records) throws IOException, InterruptedException, ClassNotFoundException { 28 | return (n <= total_records-1) ? 
runSafely(args, n) :"fail"; 29 | } 30 | 31 | @SuppressWarnings("deprecation") 32 | public static String runSafely (String[] args, long n) throws IOException, InterruptedException, ClassNotFoundException { 33 | Configuration conf= new Configuration(); 34 | FileSystem hdfs=FileSystem.get(conf); 35 | // Deleting previous stored nth row 36 | hdfs.delete(new Path(args[1])); 37 | conf.setLong("n", n); 38 | Job job = new Job(conf); 39 | 40 | job.setJarByClass(find_nth_driver.class); 41 | 42 | job.setJobName("Finds the nth row of the HDFS file"); 43 | 44 | FileInputFormat.setInputPaths(job, new Path(args[0])); 45 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 46 | 47 | job.setMapperClass(find_nth_mapper.class); 48 | job.setNumReduceTasks(0); 49 | job.setOutputKeyClass(NullWritable.class); 50 | job.setOutputValueClass(LongAndTextWritable.class); 51 | 52 | job.waitForCompletion(true); 53 | 54 | return readNthRow(args[1], conf); 55 | }; 56 | 57 | } 58 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/FindNthRow/find_nth_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.FindNthRow; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | import lu_decomposition.naive_gausssian.io.LongAndTextWritable; 11 | 12 | public class find_nth_mapper extends Mapper { 13 | 14 | private LongWritable nthKey; 15 | private Text nthValue = null; 16 | 17 | @Override 18 | public void setup (Context context) throws IOException, InterruptedException { 19 | this.nthKey = new LongWritable(context.getConfiguration().getLong("n", 0)); 20 | } 21 | 22 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] input = value.toString().split("\\t"); 24 | if (!input[0].contains(",")) { 25 | LongWritable rowKey = new LongWritable(Long.valueOf(input[0])); 26 | if (rowKey.compareTo(this.nthKey) == 0) { 27 | this.nthValue = new Text(input[1]); 28 | } 29 | } 30 | } 31 | 32 | @Override 33 | public void cleanup(Context context) throws IOException, InterruptedException{ 34 | 35 | if (this.nthValue != null) 36 | context.write(null, new LongAndTextWritable(this.nthKey, this.nthValue)); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/MergeResults/merge_results_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.MergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | 10 | import lu_decomposition.naive_gausssian.io.NaturalKeyGroupingComparator; 11 | import lu_decomposition.naive_gausssian.io.TextPair; 12 | import lu_decomposition.naive_gausssian.io.TextPairComparator; 13 | import lu_decomposition.naive_gausssian.io.TextPairPartitioner; 14 | 15 | public class merge_results_driver { 16 | 17 | public static boolean runWithJob(Job job, String out_path) throws IOException, InterruptedException, ClassNotFoundException { 18 | 
job.setJarByClass(merge_results_driver.class); 19 | 20 | job.setJobName("Final Step: Merging results and creating separate LU decomposed components of input matrix"); 21 | 22 | FileOutputFormat.setOutputPath(job, new Path(out_path)); 23 | 24 | job.setMapperClass(lu_decomposition.naive_gausssian.MergeResults.merge_results_mapper.class); 25 | job.setReducerClass(lu_decomposition.naive_gausssian.MergeResults.merge_results_reducer.class); 26 | job.setMapOutputKeyClass(TextPair.class); 27 | job.setMapOutputValueClass(Text.class); 28 | job.setOutputKeyClass(TextPair.class); 29 | job.setOutputValueClass(Text.class); 30 | job.setPartitionerClass(TextPairPartitioner.class); 31 | job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); 32 | job.setSortComparatorClass(TextPairComparator.class); 33 | 34 | boolean success = job.waitForCompletion(true); 35 | return success; 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/MergeResults/merge_results_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.MergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | import lu_decomposition.naive_gausssian.io.TextPair; 10 | 11 | public class merge_results_mapper extends Mapper { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | String[] parts = value.toString().split("\\t"); 24 | // Processing Upper Triangular Matrix's rows 25 | if (this.upper && !parts[0].contains(",")) { 26 | context.write(new TextPair(parts[0],""), new Text(parts[1])); 27 | } 28 | // Processing Lower Triangular Matrix's rows 29 | if (!this.upper && parts[0].contains(",")) { 30 | 31 | String[] rowCol = parts[0].split(","); 32 | String row = rowCol[0]; 33 | // Sending first row of Lower Triangular Matrix to the reducer 34 | if (Integer.valueOf(row)-1 == 0) { 35 | for (int i = 0; i < this.total_records; i++) { 36 | context.write(new TextPair("0",String.valueOf(i)), new Text(i+","+((i == 0) ? 
1 : 0))); 37 | } 38 | } 39 | String column = rowCol[1]; 40 | String element = parts[1]; 41 | context.write(new TextPair(row, column), new Text(column+","+element)); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/MergeResults/merge_results_reducer.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.MergeResults; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | import lu_decomposition.naive_gausssian.lud_mapper; 9 | import lu_decomposition.naive_gausssian.io.TextPair; 10 | 11 | public class merge_results_reducer extends Reducer { 12 | 13 | private Boolean upper; 14 | private int total_records; 15 | 16 | @Override 17 | public void setup (Context context) { 18 | this.upper = context.getConfiguration().getBoolean("upper", false); 19 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 20 | } 21 | 22 | public static String arrayToCSV(String[] a) { 23 | String result = ""; 24 | if (a.length > 0) { 25 | StringBuilder sb = new StringBuilder(); 26 | for (String s : a) { 27 | sb.append(s).append(","); 28 | } 29 | result = sb.deleteCharAt(sb.length() - 1).toString(); 30 | } 31 | return result; 32 | } 33 | 34 | public void reduce(TextPair key, Iterable values, Context context) 35 | throws IOException, InterruptedException { 36 | if (this.upper) { 37 | for (Text val:values) { 38 | context.write(new TextPair(key.getFirst(),""), val); 39 | } 40 | } 41 | else { 42 | Double[] rowElements = new Double[this.total_records]; 43 | int row = Integer.valueOf(key.getFirst()); 44 | for (Text val:values) { 45 | String[] parts = val.toString().split(","); 46 | int j = Integer.valueOf(parts[0]); 47 | rowElements[j] = Double.valueOf(parts[1]); 48 | } 49 | // Setting Diagonal Elements as `1` in the lower triangular matrix rows 50 | rowElements[row] = (double) 1; 51 | 52 | for(int j = 0; j< this.total_records; j++) { 53 | if (rowElements[j] == null) { 54 | rowElements[j] = (double) 0; 55 | } 56 | } 57 | context.write(new TextPair(key.getFirst(),""), new Text(lud_mapper.arrayToCSV(rowElements))); 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/TotalRecords/total_records_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.TotalRecords; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 14 | 15 | public class total_records_driver { 16 | 17 | public static long readTotalRecords (String path, Configuration conf) throws IOException { 18 | FileSystem hdfs=FileSystem.get(conf); 19 | BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(new Path(path+"/part-r-00000")))); 20 | Long records = (long) 0; 21 | records = Long.valueOf(br.readLine().split("\\t")[1]); 22 | br.close(); 23 | return 
records; 24 | } 25 | 26 | @SuppressWarnings("deprecation") 27 | public static long run(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 28 | Configuration conf = new Configuration(); 29 | Job job = new Job(conf); 30 | 31 | job.setJarByClass(total_records_driver.class); 32 | 33 | job.setJobName("Just counting total rows of the HDFS input"); 34 | 35 | FileInputFormat.setInputPaths(job, new Path(args[0])); 36 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 37 | 38 | job.setMapperClass(total_records_mapper.class); 39 | 40 | job.setReducerClass(total_records_reducer.class); 41 | job.setCombinerClass(total_records_reducer.class); 42 | 43 | job.setOutputKeyClass(LongWritable.class); 44 | job.setOutputValueClass(LongWritable.class); 45 | 46 | //job.setInputFormatClass(TextInputFormat.class); 47 | //job.setOutputFormatClass(TextOutputFormat.class); 48 | 49 | job.waitForCompletion(true); 50 | 51 | return readTotalRecords(args[1], conf); 52 | }; 53 | } 54 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/TotalRecords/total_records_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.TotalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class total_records_mapper extends Mapper { 10 | 11 | private Long countRows = (long) 0; 12 | 13 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 14 | this.countRows++; 15 | } 16 | 17 | @Override 18 | public void cleanup(Context context) throws IOException, InterruptedException{ 19 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/TotalRecords/total_records_reducer.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.TotalRecords; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | public class total_records_reducer extends Reducer { 9 | 10 | private Long countRows = (long) 0; 11 | 12 | public void reduce(LongWritable key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | for(LongWritable val:values){ 15 | this.countRows += val.get(); 16 | } 17 | } 18 | 19 | @Override 20 | public void cleanup(Context context) throws IOException, InterruptedException{ 21 | 22 | context.write(new LongWritable(0), new LongWritable(this.countRows)); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/LongAndTextWritable.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.io.Writable; 10 | 11 | public class LongAndTextWritable implements Writable { 12 | 13 | private LongWritable rowKey; 14 | 
private Text rowValue; 15 | 16 | public LongAndTextWritable() { 17 | this.rowKey = new LongWritable(0); 18 | this.rowValue = new Text(""); 19 | } 20 | 21 | public LongAndTextWritable(LongWritable k, Text v) { 22 | this.rowKey = k; 23 | this.rowValue = v; 24 | } 25 | 26 | public LongWritable getKey() { 27 | return rowKey; 28 | } 29 | 30 | public Text getValue() { 31 | return rowValue; 32 | } 33 | 34 | @Override 35 | public void readFields(DataInput in) throws IOException { 36 | 37 | rowKey.readFields(in); 38 | rowValue.readFields(in); 39 | 40 | } 41 | 42 | @Override 43 | public void write(DataOutput out) throws IOException { 44 | 45 | rowKey.write(out); 46 | rowValue.write(out); 47 | } 48 | 49 | @Override 50 | public String toString() { 51 | return rowKey.toString() + "\t" + rowValue.toString(); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/NaturalKeyGroupingComparator.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class NaturalKeyGroupingComparator extends WritableComparator { 7 | protected NaturalKeyGroupingComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | return tp1.getFirst().compareTo(tp2.getFirst()); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/TextPair.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | public class TextPair implements WritableComparable { 10 | 11 | private String t1; 12 | private String t2; 13 | 14 | public String getFirst() { 15 | return this.t1; 16 | } 17 | 18 | public String getSecond() { 19 | return this.t2; 20 | } 21 | 22 | @Override 23 | public void readFields(DataInput in) throws IOException { 24 | this.t1 = in.readUTF(); 25 | this.t2 = in.readUTF(); 26 | } 27 | 28 | @Override 29 | public void write(DataOutput out) throws IOException { 30 | out.writeUTF(this.t1); 31 | out.writeUTF(this.t2); 32 | } 33 | 34 | public TextPair() { 35 | this.t1 = new String(); 36 | this.t2 = new String(); 37 | } 38 | 39 | public TextPair(String t1, String t2) { 40 | this.t1 = new String(t1); 41 | this.t2 = new String(t2); 42 | } 43 | 44 | public int compareTo(TextPair tp) { 45 | int sortKey = this.t1.compareTo(tp.getFirst()); 46 | if (sortKey == 0) { 47 | sortKey = this.t2.compareTo(tp.getSecond()); 48 | } 49 | return sortKey; 50 | } 51 | 52 | public String toString () { 53 | String s = ""; 54 | if (this.t2.compareTo("") == 0) { 55 | s += this.t1; 56 | } 57 | else { 58 | s += this.t1 + "," + this.t2; 59 | } 60 | return s; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/TextPairComparator.java: -------------------------------------------------------------------------------- 1 | package 
lu_decomposition.naive_gausssian.io; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class TextPairComparator extends WritableComparator { 7 | protected TextPairComparator() { 8 | super(TextPair.class, true); 9 | } 10 | @SuppressWarnings("rawtypes") 11 | @Override 12 | public int compare(WritableComparable w1, WritableComparable w2) { 13 | TextPair tp1 = (TextPair)w1; 14 | TextPair tp2 = (TextPair)w2; 15 | 16 | int result = tp1.getFirst().compareTo(tp2.getFirst()); 17 | if(0 == result) { 18 | result = tp1.getSecond().compareTo(tp2.getSecond()); 19 | } 20 | return result; 21 | } 22 | } -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/io/TextPairPartitioner.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian.io; 2 | 3 | import org.apache.hadoop.io.Text; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | public class TextPairPartitioner extends Partitioner{ 7 | @Override 8 | public int getPartition(TextPair tp, Text t, int numPartitions) { 9 | return tp.getFirst().hashCode() % numPartitions; 10 | } 11 | } -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/lud_driver.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | import lu_decomposition.naive_gausssian.FindNthRow.find_nth_driver; 13 | import lu_decomposition.naive_gausssian.MergeResults.merge_results_driver; 14 | import lu_decomposition.naive_gausssian.TotalRecords.total_records_driver; 15 | 16 | public class lud_driver { 17 | 18 | public static String arrayToCSV(String[] a) { 19 | String result = ""; 20 | if (a.length > 0) { 21 | StringBuilder sb = new StringBuilder(); 22 | for (String s : a) { 23 | sb.append(s).append(","); 24 | } 25 | result = sb.deleteCharAt(sb.length() - 1).toString(); 26 | } 27 | return result; 28 | } 29 | 30 | @SuppressWarnings("deprecation") 31 | public static boolean runWithConf (String[] args, Configuration conf) throws IOException, InterruptedException, ClassNotFoundException { 32 | 33 | Job job = new Job(conf); 34 | 35 | job.setJarByClass(lud_driver.class); 36 | 37 | job.setJobName("Split a matrix into it's LU decomposed components using the Naive Gaussian Elimination method"); 38 | long n = conf.getLong("n", 0); 39 | FileInputFormat.setInputPaths(job, new Path((n==0)?args[0]:(args[1]+"-run-"+(n-1)))); 40 | FileOutputFormat.setOutputPath(job, new Path(args[1]+"-run-"+n)); 41 | job.setNumReduceTasks(0); 42 | job.setMapperClass(lud_mapper.class); 43 | job.setOutputKeyClass(Text.class); 44 | job.setOutputValueClass(Text.class); 45 | 46 | boolean success = job.waitForCompletion(true); 47 | 48 | return success; 49 | }; 50 | 51 | @SuppressWarnings("deprecation") 52 | public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException { 53 | String input = args[0]; 54 | String output = args[1]; 55 | String 
total_records_output = output + "/total_records"; 56 | String[] total_records_args = {input, total_records_output}; 57 | String find_nth_row_output = output + "/nth"; 58 | // MR Job: Finding Total Records 59 | long total_records = total_records_driver.run(total_records_args); 60 | String[] lud_args = {input, output}; 61 | Configuration conf = new Configuration(); 62 | 63 | for(long n = 0; n < total_records-1; n++) { 64 | String find_nth_row_input = (n==0) ? input : output+"-run-"+(n-1); 65 | String[] find_nth_row_args = {find_nth_row_input, find_nth_row_output}; 66 | // MR Job: Finding Nth Record 67 | String nVal = find_nth_driver.run(find_nth_row_args, n, total_records); 68 | conf.setLong("n", n); 69 | conf.setLong("total_records", total_records); 70 | conf.set("nVal", nVal); 71 | // MR Job: Running LU Decomposition on the input 72 | runWithConf(lud_args, conf); 73 | } 74 | 75 | // MR Job(s): Merging Outputs 76 | conf.setBoolean("upper", false); 77 | Job job = new Job(conf); 78 | String[] path = new String[(int) (total_records-1)]; 79 | for(long n = 0; n < total_records-1; n++) { 80 | path[(int) n] = (output+"-run-"+n); 81 | } 82 | FileInputFormat.setInputPaths(job, arrayToCSV(path)); 83 | merge_results_driver.runWithJob(job, output+"-merged/lower"); 84 | conf.setBoolean("upper", true); 85 | job = new Job(conf); 86 | FileInputFormat.addInputPath(job, new Path(output+"-run-"+(total_records-2))); 87 | merge_results_driver.runWithJob(job, output+"-merged/upper"); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /lu_decomposition/src/lu_decomposition/naive_gausssian/lud_mapper.java: -------------------------------------------------------------------------------- 1 | package lu_decomposition.naive_gausssian; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | public class lud_mapper extends Mapper { 10 | 11 | private int n; 12 | private Double[] nVal; 13 | private int total_records; 14 | 15 | public Double[] stringToDoubleArray(String[] a) { 16 | 17 | Double[] x = new Double[a.length]; 18 | 19 | for(int i = 0; i < this.total_records; i++) { 20 | x[i] = Double.valueOf(a[i]); 21 | } 22 | 23 | return x; 24 | 25 | } 26 | 27 | public static String arrayToCSV(Double[] nVal2) { 28 | String result = ""; 29 | 30 | if (nVal2.length > 0) { 31 | StringBuilder sb = new StringBuilder(); 32 | 33 | for (Double s : nVal2) { 34 | sb.append(s).append(","); 35 | } 36 | 37 | result = sb.deleteCharAt(sb.length() - 1).toString(); 38 | } 39 | return result; 40 | } 41 | 42 | @Override 43 | public void setup (Context context) throws IOException, InterruptedException { 44 | this.n = (int) context.getConfiguration().getLong("n", 0); 45 | this.total_records = (int) context.getConfiguration().getLong("total_records", 0); 46 | this.nVal = stringToDoubleArray(context.getConfiguration().get("nVal").split(",")); 47 | 48 | context.write(new Text(String.valueOf(this.n)), new Text(arrayToCSV(this.nVal))); 49 | } 50 | 51 | public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 52 | String[] parts = value.toString().split("\\t"); 53 | if(!parts[0].contains(",")) { 54 | long row = Long.valueOf(parts[0]); 55 | if (row > this.n) { 56 | Double[] rowElements = stringToDoubleArray(parts[1].split(",")); 57 | Double multiplier = (double) (rowElements[this.n]/this.nVal[this.n]); 58 | context.write(new 
Text(row+","+this.n), new Text(String.valueOf(multiplier))); 59 | Double[] rowElementsModified = new Double[this.total_records]; 60 | for (int i = 0; i< this.total_records; i++) { 61 | rowElementsModified[i] = (Double) (rowElements[i] - this.nVal[i]*multiplier); 62 | } 63 | if (row != 0) 64 | context.write(new Text(String.valueOf(row)), new Text(arrayToCSV(rowElementsModified))); 65 | } 66 | else { 67 | if (Long.valueOf(parts[0]) != this.n) 68 | context.write(new Text(parts[0]), new Text(parts[1])); 69 | } 70 | } 71 | } 72 | 73 | } 74 | --------------------------------------------------------------------------------
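To make the iterative flow above easier to follow (count the rows, pick the nth pivot row, run one elimination pass per pivot with lud_mapper, then merge the L and U components), here is a minimal single-machine sketch of the same Naive Gaussian elimination. It uses the 4x4 matrix from test_input_4x4.txt, and the printed L and U match the merged lower/upper outputs shown earlier; the class itself is illustrative and not part of the repository.

```java
// Sketch only: what the repeated lud_mapper runs compute, done in memory.
// For each pivot row p, every row i below it stores the multiplier
// u[i][p] / u[p][p] into L and subtracts multiplier * (pivot row) from
// itself, which leaves U in place. The diagonal of L is 1.
public class NaiveGaussianLuSketch {
  public static void main(String[] args) {
    double[][] u = {          // input matrix from test_input_4x4.txt
      {1, 5, 0, 0},
      {2, 12, 5, 0},
      {0, 4, 13, 5},
      {0, 0, 6, 11}
    };
    int n = u.length;
    double[][] l = new double[n][n];
    for (int i = 0; i < n; i++) l[i][i] = 1.0;   // unit diagonal of L

    for (int p = 0; p < n - 1; p++) {            // one MR "run" per pivot p
      for (int i = p + 1; i < n; i++) {
        double multiplier = u[i][p] / u[p][p];   // the "row,col <tab> multiplier" records
        l[i][p] = multiplier;
        for (int j = 0; j < n; j++) {
          u[i][j] -= multiplier * u[p][j];       // eliminate column p from row i
        }
      }
    }
    print("L", l);   // expected rows: 1,0,0,0 / 2,1,0,0 / 0,2,1,0 / 0,0,2,1
    print("U", u);   // expected rows: 1,5,0,0 / 0,2,5,0 / 0,0,3,5 / 0,0,0,1
  }

  private static void print(String name, double[][] m) {
    System.out.println(name);
    for (double[] row : m) {
      StringBuilder sb = new StringBuilder();
      for (double x : row) sb.append(x).append(",");
      System.out.println(sb.substring(0, sb.length() - 1));
    }
  }
}
```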