├── .gitignore
├── MRDP
└── src
│   ├── main
│   ├── java
│   │   └── mrdp
│   │   │   ├── MRDPMain.java
│   │   │   ├── appendixA
│   │   │   └── BloomFilterDriver.java
│   │   │   ├── ch1
│   │   │   └── CommentWordCount.java
│   │   │   ├── ch2
│   │   │   ├── AverageDriver.java
│   │   │   ├── CountNumUsersByStateDriver.java
│   │   │   ├── MedianStdDevDriver.java
│   │   │   ├── MinMaxCountDriver.java
│   │   │   ├── SmarterMedianStdDevDriver.java
│   │   │   └── WikipediaIndex.java
│   │   │   ├── ch3
│   │   │   ├── BloomFilteringDriver.java
│   │   │   ├── DistinctUserDriver.java
│   │   │   ├── DistributedGrep.java
│   │   │   ├── QueryBloomFiltering.java
│   │   │   ├── SimpleRandomSampling.java
│   │   │   ├── TopTenDriver.java
│   │   │   └── UniqueUserCount.java
│   │   │   ├── ch4
│   │   │   ├── AnonymizeDriver.java
│   │   │   ├── Binning.java
│   │   │   ├── PartitionedUsers.java
│   │   │   ├── PostCommentBuildingDriver.java
│   │   │   ├── QuestionAnswerBuildingDriver.java
│   │   │   └── TotalOrderSorting.java
│   │   │   ├── ch5
│   │   │   ├── CartesianFormatter.java
│   │   │   ├── CartesianProduct.java
│   │   │   ├── CompositeJoinDriver.java
│   │   │   ├── JoinFormatting.java
│   │   │   ├── ReduceSideJoinDriver.java
│   │   │   ├── ReduceSideJoinWithBloomDriver.java
│   │   │   └── ReplicatedJoinDriver.java
│   │   │   ├── ch6
│   │   │   ├── ChainMapperDriver.java
│   │   │   ├── JobChainingDriver.java
│   │   │   ├── JobControlDriver.java
│   │   │   ├── MergedJobDriver.java
│   │   │   └── ParallelJobs.java
│   │   │   ├── ch7
│   │   │   ├── PartitionPruningInputDriver.java
│   │   │   ├── PartitionPruningOutputDriver.java
│   │   │   ├── RandomDataGenerationDriver.java
│   │   │   ├── RedisInputDriver.java
│   │   │   └── RedisOutputDriver.java
│   │   │   └── utils
│   │   │   └── MRDPUtils.java
│   └── resources
│   │   ├── highrepusers.bf
│   │   ├── hotlist.txt
│   │   └── hotlistwords.bf
│   └── test
│   └── java
│   └── mrdp
│   └── ch5
│   └── CartesianProductTest.java
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Package Files #
4 | *.jar
5 | *.war
6 | *.ear
7 |
--------------------------------------------------------------------------------
/MRDP/src/main/java/mrdp/MRDPMain.java:
--------------------------------------------------------------------------------
1 | package mrdp;
2 |
3 | import java.util.Arrays;
4 |
5 | import mrdp.ch1.*;
6 | import mrdp.ch2.*;
7 | import mrdp.ch3.*;
8 | import mrdp.ch4.*;
9 | import mrdp.ch5.*;
10 | import mrdp.ch6.*;
11 | import mrdp.ch7.*;
12 | import mrdp.utils.MRDPUtils;
13 |
14 | import org.apache.hadoop.conf.Configuration;
15 | import org.apache.hadoop.conf.Configured;
16 | import org.apache.hadoop.util.Tool;
17 | import org.apache.hadoop.util.ToolRunner;
18 |
19 | @SuppressWarnings("unused")
20 | public class MRDPMain extends Configured implements Tool {
21 |
22 | public static void main(String[] args) throws Exception {
23 | System.exit(ToolRunner.run(new Configuration(), new MRDPMain(), args));
24 | }
25 |
26 | @Override
27 | public int run(String[] args) throws Exception {
28 | if (args.length > 0) {
29 | String example = args[0];
30 | String[] otherArgs = Arrays.copyOfRange(args, 1, args.length);
31 |
32 | if (example.equalsIgnoreCase("PartitionPruningOutput")) {
33 | PartitionPruningOutputDriver.main(otherArgs);
34 | } else if (example.equalsIgnoreCase("PartitionPruningInput")) {
35 | PartitionPruningInputDriver.main(otherArgs);
36 | } else if (example.equalsIgnoreCase("RedisInput")) {
37 | RedisInputDriver.main(otherArgs);
38 | } else if (example.equalsIgnoreCase("RedisOutput")) {
39 | RedisOutputDriver.main(otherArgs);
40 | } else {
41 | printHelp();
42 | return 1;
43 | }
44 |
45 | return 0;
46 | } else {
47 | printHelp();
48 | return 1;
49
| } 50 | } 51 | 52 | private void printHelp() { 53 | System.out 54 | .println("Usage: hadoop jar mrdp.jar "); 55 | System.out.println("Examples are:"); 56 | System.out.println("Chapter 7:"); 57 | System.out 58 | .println("\tRedisOutput "); 59 | System.out 60 | .println("\tRedisInput "); 61 | System.out.println("\tPartitionPruningOutput "); 62 | System.out 63 | .println("\tPartitionPruningInput "); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/appendixA/BloomFilterDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.appendixA; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStreamReader; 5 | import java.util.zip.GZIPInputStream; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FSDataOutputStream; 9 | import org.apache.hadoop.fs.FileStatus; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.util.GenericOptionsParser; 13 | import org.apache.hadoop.util.bloom.BloomFilter; 14 | import org.apache.hadoop.util.bloom.Key; 15 | import org.apache.hadoop.util.hash.Hash; 16 | 17 | public class BloomFilterDriver { 18 | 19 | public static void main(String[] args) throws Exception { 20 | Configuration conf = new Configuration(); 21 | String[] otherArgs = new GenericOptionsParser(conf, args) 22 | .getRemainingArgs(); 23 | if (otherArgs.length != 4) { 24 | System.err 25 | .println("Usage: BloomFilterWriter "); 26 | System.exit(1); 27 | } 28 | 29 | FileSystem fs = FileSystem.get(new Configuration()); 30 | 31 | // Parse command line arguments 32 | Path inputFile = new Path(otherArgs[0]); 33 | int numMembers = Integer.parseInt(otherArgs[1]); 34 | float falsePosRate = Float.parseFloat(otherArgs[2]); 35 | Path bfFile = new Path(otherArgs[3]); 36 | 37 | // Calculate our vector size and optimal K value based on approximations 38 | int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate); 39 | int nbHash = getOptimalK(numMembers, vectorSize); 40 | 41 | // create new Bloom filter 42 | BloomFilter filter = new BloomFilter(vectorSize, nbHash, 43 | Hash.MURMUR_HASH); 44 | 45 | // Open file for read 46 | 47 | System.out.println("Training Bloom filter of size " + vectorSize 48 | + " with " + nbHash + " hash functions, " + numMembers 49 | + " approximate number of records, and " + falsePosRate 50 | + " false positive rate"); 51 | 52 | String line = null; 53 | int numRecords = 0; 54 | for (FileStatus status : fs.listStatus(inputFile)) { 55 | BufferedReader rdr; 56 | // if file is gzipped, wrap it in a GZIPInputStream 57 | if (status.getPath().getName().endsWith(".gz")) { 58 | rdr = new BufferedReader(new InputStreamReader( 59 | new GZIPInputStream(fs.open(status.getPath())))); 60 | } else { 61 | rdr = new BufferedReader(new InputStreamReader(fs.open(status 62 | .getPath()))); 63 | } 64 | 65 | System.out.println("Reading " + status.getPath()); 66 | while ((line = rdr.readLine()) != null) { 67 | filter.add(new Key(line.getBytes())); 68 | ++numRecords; 69 | } 70 | 71 | rdr.close(); 72 | } 73 | 74 | System.out.println("Trained Bloom filter with " + numRecords 75 | + " entries."); 76 | 77 | System.out.println("Serializing Bloom filter to HDFS at " + bfFile); 78 | FSDataOutputStream strm = fs.create(bfFile); 79 | filter.write(strm); 80 | 81 | strm.flush(); 82 | strm.close(); 83 | 84 | System.out.println("Done training Bloom filter."); 85 | } 86 | 87 | public static int 
getOptimalBloomFilterSize(int numRecords, 88 | float falsePosRate) { 89 | int size = (int) (-numRecords * (float) Math.log(falsePosRate) / Math 90 | .pow(Math.log(2), 2)); 91 | return size; 92 | } 93 | 94 | public static int getOptimalK(float numMembers, float vectorSize) { 95 | return (int) Math.round(vectorSize / numMembers * Math.log(2)); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch1/CommentWordCount.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch1; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | import java.util.Map; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | import org.apache.commons.lang.StringEscapeUtils; 21 | 22 | public class CommentWordCount { 23 | 24 | public static class SOWordCountMapper extends 25 | Mapper { 26 | 27 | private final static IntWritable one = new IntWritable(1); 28 | private Text word = new Text(); 29 | 30 | public void map(Object key, Text value, Context context) 31 | throws IOException, InterruptedException { 32 | 33 | // Parse the input string into a nice map 34 | Map parsed = MRDPUtils.transformXmlToMap(value 35 | .toString()); 36 | 37 | // Grab the "Text" field, since that is what we are counting over 38 | String txt = parsed.get("Text"); 39 | 40 | // .get will return null if the key is not there 41 | if (txt == null) { 42 | // skip this record 43 | return; 44 | } 45 | 46 | // Unescape the HTML because the SO data is escaped. 
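// (The StackOverflow dump stores comment text as escaped XML, so entities such as
// "&amp;" and "&lt;" appear in the raw attribute value; unescaping first keeps
// fragments like "amp" and "lt" from being counted as words.)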
47 | txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase()); 48 | 49 | // Remove some annoying punctuation 50 | txt = txt.replaceAll("'", ""); // remove single quotes (e.g., can't) 51 | txt = txt.replaceAll("[^a-zA-Z]", " "); // replace the rest with a 52 | // space 53 | 54 | // Tokenize the string, then send the tokens away 55 | StringTokenizer itr = new StringTokenizer(txt); 56 | while (itr.hasMoreTokens()) { 57 | word.set(itr.nextToken()); 58 | context.write(word, one); 59 | } 60 | } 61 | } 62 | 63 | public static class IntSumReducer extends 64 | Reducer { 65 | private IntWritable result = new IntWritable(); 66 | 67 | public void reduce(Text key, Iterable values, 68 | Context context) throws IOException, InterruptedException { 69 | int sum = 0; 70 | for (IntWritable val : values) { 71 | sum += val.get(); 72 | } 73 | 74 | result.set(sum); 75 | context.write(key, result); 76 | 77 | } 78 | } 79 | 80 | public static void main(String[] args) throws Exception { 81 | Configuration conf = new Configuration(); 82 | String[] otherArgs = new GenericOptionsParser(conf, args) 83 | .getRemainingArgs(); 84 | if (otherArgs.length != 2) { 85 | System.err.println("Usage: CommentWordCount "); 86 | System.exit(2); 87 | } 88 | Job job = new Job(conf, "StackOverflow Comment Word Count"); 89 | job.setJarByClass(CommentWordCount.class); 90 | job.setMapperClass(SOWordCountMapper.class); 91 | job.setCombinerClass(IntSumReducer.class); 92 | job.setReducerClass(IntSumReducer.class); 93 | job.setOutputKeyClass(Text.class); 94 | job.setOutputValueClass(IntWritable.class); 95 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 96 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 97 | System.exit(job.waitForCompletion(true) ? 0 : 1); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/AverageDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.Map; 10 | 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.IntWritable; 16 | import org.apache.hadoop.io.Text; 17 | import org.apache.hadoop.io.Writable; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.Reducer; 21 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 23 | import org.apache.hadoop.util.GenericOptionsParser; 24 | 25 | public class AverageDriver { 26 | 27 | public static class SOAverageMapper extends 28 | Mapper { 29 | 30 | private IntWritable outHour = new IntWritable(); 31 | private CountAverageTuple outCountAverage = new CountAverageTuple(); 32 | 33 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 34 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 35 | 36 | @SuppressWarnings("deprecation") 37 | @Override 38 | public void map(Object key, Text value, Context context) 39 | throws IOException, InterruptedException { 40 | 41 | // Parse the input string into a nice map 42 | Map parsed = MRDPUtils.transformXmlToMap(value 43 | .toString()); 44 | 45 | // Grab the "CreationDate" field, 46 | // since it is what we are 
grouping by 47 | String strDate = parsed.get("CreationDate"); 48 | 49 | // Grab the comment to find the length 50 | String text = parsed.get("Text"); 51 | 52 | // .get will return null if the key is not there 53 | if (strDate == null || text == null) { 54 | // skip this record 55 | return; 56 | } 57 | 58 | try { 59 | // get the hour this comment was posted in 60 | Date creationDate = frmt.parse(strDate); 61 | outHour.set(creationDate.getHours()); 62 | 63 | // get the comment length 64 | outCountAverage.setCount(1); 65 | outCountAverage.setAverage(text.length()); 66 | 67 | // write out the user ID with min max dates and count 68 | context.write(outHour, outCountAverage); 69 | 70 | } catch (ParseException e) { 71 | System.err.println(e.getMessage()); 72 | return; 73 | } 74 | } 75 | } 76 | 77 | public static class SOAverageReducer 78 | extends 79 | Reducer { 80 | private CountAverageTuple result = new CountAverageTuple(); 81 | 82 | @Override 83 | public void reduce(IntWritable key, Iterable values, 84 | Context context) throws IOException, InterruptedException { 85 | 86 | float sum = 0; 87 | float count = 0; 88 | 89 | // Iterate through all input values for this key 90 | for (CountAverageTuple val : values) { 91 | sum += val.getCount() * val.getAverage(); 92 | count += val.getCount(); 93 | } 94 | 95 | result.setCount(count); 96 | result.setAverage(sum / count); 97 | 98 | context.write(key, result); 99 | } 100 | } 101 | 102 | public static void main(String[] args) throws Exception { 103 | Configuration conf = new Configuration(); 104 | String[] otherArgs = new GenericOptionsParser(conf, args) 105 | .getRemainingArgs(); 106 | if (otherArgs.length != 2) { 107 | System.err.println("Usage: AverageDriver "); 108 | System.exit(2); 109 | } 110 | Job job = new Job(conf, "StackOverflow Average Comment Length"); 111 | job.setJarByClass(AverageDriver.class); 112 | job.setMapperClass(SOAverageMapper.class); 113 | job.setCombinerClass(SOAverageReducer.class); 114 | job.setReducerClass(SOAverageReducer.class); 115 | job.setOutputKeyClass(IntWritable.class); 116 | job.setOutputValueClass(CountAverageTuple.class); 117 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 118 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 119 | System.exit(job.waitForCompletion(true) ? 
0 : 1);
120 | }
121 |
122 | public static class CountAverageTuple implements Writable {
123 | private float count = 0f;
124 | private float average = 0f;
125 |
126 | public float getCount() {
127 | return count;
128 | }
129 |
130 | public void setCount(float count) {
131 | this.count = count;
132 | }
133 |
134 | public float getAverage() {
135 | return average;
136 | }
137 |
138 | public void setAverage(float average) {
139 | this.average = average;
140 | }
141 |
142 | @Override
143 | public void readFields(DataInput in) throws IOException {
144 | count = in.readFloat();
145 | average = in.readFloat();
146 | }
147 |
148 | @Override
149 | public void write(DataOutput out) throws IOException {
150 | out.writeFloat(count);
151 | out.writeFloat(average);
152 | }
153 |
154 | @Override
155 | public String toString() {
156 | return count + "\t" + average;
157 | }
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/MRDP/src/main/java/mrdp/ch2/CountNumUsersByStateDriver.java:
--------------------------------------------------------------------------------
1 | package mrdp.ch2;
2 |
3 | import java.io.IOException;
4 | import java.util.Arrays;
5 | import java.util.HashSet;
6 | import java.util.Map;
7 |
8 | import mrdp.utils.MRDPUtils;
9 |
10 | import org.apache.hadoop.conf.Configuration;
11 | import org.apache.hadoop.fs.FileSystem;
12 | import org.apache.hadoop.fs.Path;
13 | import org.apache.hadoop.io.NullWritable;
14 | import org.apache.hadoop.io.Text;
15 | import org.apache.hadoop.mapreduce.Job;
16 | import org.apache.hadoop.mapreduce.Mapper;
17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
18 | import org.apache.hadoop.mapreduce.Counter;
19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
20 | import org.apache.hadoop.util.GenericOptionsParser;
21 |
22 | public class CountNumUsersByStateDriver {
23 |
24 | public static class CountNumUsersByStateMapper extends
25 | Mapper<Object, Text, NullWritable, NullWritable> {
26 |
27 | public static final String STATE_COUNTER_GROUP = "State";
28 |
29 | private String[] statesArray = new String[] { "AL", "AK", "AZ", "AR",
30 | "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN",
31 | "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS",
32 | "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND",
33 | "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
34 | "VT", "VA", "WA", "WV", "WI", "WY" };
35 |
36 | private HashSet<String> states = new HashSet<String>(
37 | Arrays.asList(statesArray));
38 |
39 | @Override
40 | public void map(Object key, Text value, Context context)
41 | throws IOException, InterruptedException {
42 |
43 | // Parse the input into a nice map.
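// This job relies on custom counters rather than key/value output: each mapper bumps a
// per-state counter, the driver prints the aggregated totals after the job completes,
// and no reduce phase is needed (setNumReduceTasks(0) below).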
44 | Map parsed = MRDPUtils.transformXmlToMap(value 45 | .toString()); 46 | 47 | // Get the value for the Location attribute 48 | String location = parsed.get("Location"); 49 | 50 | // Look for a state abbreviation code if the location is not null or 51 | // empty 52 | if (location != null && !location.isEmpty()) { 53 | boolean unknown = true; 54 | // Make location uppercase and split on white space 55 | String[] tokens = location.toUpperCase().split("\\s"); 56 | // For each token 57 | for (String state : tokens) { 58 | // Check if it is a state 59 | if (states.contains(state)) { 60 | 61 | // If so, increment the state's counter by 1 and flag it 62 | // as not unknown 63 | context.getCounter(STATE_COUNTER_GROUP, state) 64 | .increment(1); 65 | unknown = false; 66 | break; 67 | } 68 | } 69 | 70 | // If the state is unknown, increment the counter 71 | if (unknown) { 72 | context.getCounter(STATE_COUNTER_GROUP, "Unknown") 73 | .increment(1); 74 | } 75 | } else { 76 | // If it is empty or null, increment the counter by 1 77 | context.getCounter(STATE_COUNTER_GROUP, "NullOrEmpty") 78 | .increment(1); 79 | } 80 | } 81 | } 82 | 83 | public static void main(String[] args) throws Exception { 84 | Configuration conf = new Configuration(); 85 | String[] otherArgs = new GenericOptionsParser(conf, args) 86 | .getRemainingArgs(); 87 | 88 | if (otherArgs.length != 2) { 89 | System.err.println("Usage: CountNumUsersByState "); 90 | System.exit(2); 91 | } 92 | 93 | Path input = new Path(otherArgs[0]); 94 | Path outputDir = new Path(otherArgs[1]); 95 | 96 | Job job = new Job(conf, "Count Num Users By State"); 97 | job.setJarByClass(CountNumUsersByStateDriver.class); 98 | 99 | job.setMapperClass(CountNumUsersByStateMapper.class); 100 | job.setNumReduceTasks(0); 101 | 102 | job.setOutputKeyClass(NullWritable.class); 103 | job.setOutputValueClass(NullWritable.class); 104 | 105 | FileInputFormat.addInputPath(job, input); 106 | FileOutputFormat.setOutputPath(job, outputDir); 107 | 108 | int code = job.waitForCompletion(true) ? 
0 : 1; 109 | 110 | if (code == 0) { 111 | for (Counter counter : job.getCounters().getGroup( 112 | CountNumUsersByStateMapper.STATE_COUNTER_GROUP)) { 113 | System.out.println(counter.getDisplayName() + "\t" 114 | + counter.getValue()); 115 | } 116 | } 117 | 118 | // Clean up empty output directory 119 | FileSystem.get(conf).delete(outputDir, true); 120 | 121 | System.exit(code); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/MedianStdDevDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.ArrayList; 9 | import java.util.Collections; 10 | import java.util.Date; 11 | import java.util.Map; 12 | 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.IntWritable; 18 | import org.apache.hadoop.io.Text; 19 | import org.apache.hadoop.io.Writable; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 25 | import org.apache.hadoop.util.GenericOptionsParser; 26 | 27 | public class MedianStdDevDriver { 28 | 29 | public static class SOMedianStdDevMapper extends 30 | Mapper { 31 | 32 | private IntWritable outHour = new IntWritable(); 33 | private IntWritable outCommentLength = new IntWritable(); 34 | 35 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 36 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 37 | 38 | @SuppressWarnings("deprecation") 39 | @Override 40 | public void map(Object key, Text value, Context context) 41 | throws IOException, InterruptedException { 42 | 43 | // Parse the input string into a nice map 44 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 45 | 46 | // Grab the "CreationDate" field, 47 | // since it is what we are grouping by 48 | String strDate = parsed.get("CreationDate"); 49 | 50 | // Grab the comment to find the length 51 | String text = parsed.get("Text"); 52 | 53 | // .get will return null if the key is not there 54 | if (strDate == null || text == null) { 55 | // skip this record 56 | return; 57 | } 58 | 59 | try { 60 | // get the hour this comment was posted in 61 | Date creationDate = frmt.parse(strDate); 62 | outHour.set(creationDate.getHours()); 63 | 64 | // get the comment length 65 | outCommentLength.set(text.length()); 66 | 67 | // write out the user ID with min max dates and count 68 | context.write(outHour, outCommentLength); 69 | 70 | } catch (ParseException e) { 71 | System.err.println(e.getMessage()); 72 | return; 73 | } 74 | } 75 | } 76 | 77 | public static class SOMedianStdDevReducer extends 78 | Reducer { 79 | private MedianStdDevTuple result = new MedianStdDevTuple(); 80 | private ArrayList commentLengths = new ArrayList(); 81 | 82 | @Override 83 | public void reduce(IntWritable key, Iterable values, 84 | Context context) throws IOException, InterruptedException { 85 | 86 | float sum = 0; 87 | float count = 0; 88 | commentLengths.clear(); 89 | result.setStdDev(0); 90 | 91 | // Iterate through all input values for this key 92 | for (IntWritable val : values) { 93 | 
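// Every comment length for this hour is buffered in memory so the exact median can be
// taken from the sorted list; SmarterMedianStdDevDriver (later in ch2) avoids this by
// keeping a count per length instead of the raw values.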
commentLengths.add((float) val.get()); 94 | sum += val.get(); 95 | ++count; 96 | } 97 | 98 | // sort commentLengths to calculate median 99 | Collections.sort(commentLengths); 100 | 101 | // if commentLengths is an even value, average middle two elements 102 | if (count % 2 == 0) { 103 | result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths 104 | .get((int) count / 2)) / 2.0f); 105 | } else { 106 | // else, set median to middle value 107 | result.setMedian(commentLengths.get((int) count / 2)); 108 | } 109 | 110 | // calculate standard deviation 111 | float mean = sum / count; 112 | 113 | float sumOfSquares = 0.0f; 114 | for (Float f : commentLengths) { 115 | sumOfSquares += (f - mean) * (f - mean); 116 | } 117 | 118 | result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1))); 119 | 120 | context.write(key, result); 121 | } 122 | } 123 | 124 | public static void main(String[] args) throws Exception { 125 | Configuration conf = new Configuration(); 126 | String[] otherArgs = new GenericOptionsParser(conf, args) 127 | .getRemainingArgs(); 128 | if (otherArgs.length != 2) { 129 | System.err.println("Usage: MedianStdDevDriver "); 130 | System.exit(2); 131 | } 132 | Job job = new Job(conf, 133 | "StackOverflow Comment Length Median StdDev By Hour"); 134 | job.setJarByClass(MedianStdDevDriver.class); 135 | job.setMapperClass(SOMedianStdDevMapper.class); 136 | job.setReducerClass(SOMedianStdDevReducer.class); 137 | job.setMapOutputKeyClass(IntWritable.class); 138 | job.setMapOutputValueClass(IntWritable.class); 139 | job.setOutputKeyClass(IntWritable.class); 140 | job.setOutputValueClass(MedianStdDevTuple.class); 141 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 142 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 143 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 144 | } 145 | 146 | public static class MedianStdDevTuple implements Writable { 147 | private float median = 0; 148 | private float stddev = 0f; 149 | 150 | public float getMedian() { 151 | return median; 152 | } 153 | 154 | public void setMedian(float median) { 155 | this.median = median; 156 | } 157 | 158 | public float getStdDev() { 159 | return stddev; 160 | } 161 | 162 | public void setStdDev(float stddev) { 163 | this.stddev = stddev; 164 | } 165 | 166 | @Override 167 | public void readFields(DataInput in) throws IOException { 168 | median = in.readFloat(); 169 | stddev = in.readFloat(); 170 | } 171 | 172 | @Override 173 | public void write(DataOutput out) throws IOException { 174 | out.writeFloat(median); 175 | out.writeFloat(stddev); 176 | } 177 | 178 | @Override 179 | public String toString() { 180 | return median + "\t" + stddev; 181 | } 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/MinMaxCountDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.Map; 10 | 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.Writable; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.Mapper; 19 | import org.apache.hadoop.mapreduce.Reducer; 20 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class MinMaxCountDriver { 25 | 26 | public static class SOMinMaxCountMapper extends 27 | Mapper { 28 | // Our output key and value Writables 29 | private Text outUserId = new Text(); 30 | private MinMaxCountTuple outTuple = new MinMaxCountTuple(); 31 | 32 | // This object will format the creation date string into a Date object 33 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 34 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 35 | 36 | @Override 37 | public void map(Object key, Text value, Context context) 38 | throws IOException, InterruptedException { 39 | 40 | // Parse the input string into a nice map 41 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 42 | 43 | // Grab the "CreationDate" field since it is what we are finding 44 | // the min and max value of 45 | String strDate = parsed.get("CreationDate"); 46 | 47 | // Grab the “UserID” since it is what we are grouping by 48 | String userId = parsed.get("UserId"); 49 | 50 | // .get will return null if the key is not there 51 | if (strDate == null || userId == null) { 52 | // skip this record 53 | return; 54 | } 55 | 56 | try { 57 | // Parse the string into a Date object 58 | Date creationDate = frmt.parse(strDate); 59 | 60 | // Set the minimum and maximum date values to the creationDate 61 | outTuple.setMin(creationDate); 62 | outTuple.setMax(creationDate); 63 | 64 | // Set the comment count to 1 65 | outTuple.setCount(1); 66 | 67 | // Set our user ID as the output key 68 | outUserId.set(userId); 69 | 70 | // Write out the user ID with min max dates and count 71 | context.write(outUserId, outTuple); 72 | } catch (ParseException e) { 
73 | // An error occurred parsing the creation Date string 74 | // skip this record 75 | } 76 | } 77 | } 78 | 79 | public static class SOMinMaxCountReducer extends 80 | Reducer { 81 | private MinMaxCountTuple result = new MinMaxCountTuple(); 82 | 83 | @Override 84 | public void reduce(Text key, Iterable values, 85 | Context context) throws IOException, InterruptedException { 86 | 87 | // Initialize our result 88 | result.setMin(null); 89 | result.setMax(null); 90 | int sum = 0; 91 | 92 | // Iterate through all input values for this key 93 | for (MinMaxCountTuple val : values) { 94 | 95 | // If the value's min is less than the result's min 96 | // Set the result's min to value's 97 | if (result.getMin() == null 98 | || val.getMin().compareTo(result.getMin()) < 0) { 99 | result.setMin(val.getMin()); 100 | } 101 | 102 | // If the value's max is less than the result's max 103 | // Set the result's max to value's 104 | if (result.getMax() == null 105 | || val.getMax().compareTo(result.getMax()) > 0) { 106 | result.setMax(val.getMax()); 107 | } 108 | 109 | // Add to our sum the count for val 110 | sum += val.getCount(); 111 | } 112 | 113 | // Set our count to the number of input values 114 | result.setCount(sum); 115 | 116 | context.write(key, result); 117 | } 118 | } 119 | 120 | public static void main(String[] args) throws Exception { 121 | Configuration conf = new Configuration(); 122 | String[] otherArgs = new GenericOptionsParser(conf, args) 123 | .getRemainingArgs(); 124 | if (otherArgs.length != 2) { 125 | System.err.println("Usage: MinMaxCountDriver "); 126 | System.exit(2); 127 | } 128 | Job job = new Job(conf, "StackOverflow Comment Date Min Max Count"); 129 | job.setJarByClass(MinMaxCountDriver.class); 130 | job.setMapperClass(SOMinMaxCountMapper.class); 131 | job.setCombinerClass(SOMinMaxCountReducer.class); 132 | job.setReducerClass(SOMinMaxCountReducer.class); 133 | job.setOutputKeyClass(Text.class); 134 | job.setOutputValueClass(MinMaxCountTuple.class); 135 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 136 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 137 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 138 | } 139 | 140 | public static class MinMaxCountTuple implements Writable { 141 | private Date min = new Date(); 142 | private Date max = new Date(); 143 | private long count = 0; 144 | 145 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 146 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 147 | 148 | public Date getMin() { 149 | return min; 150 | } 151 | 152 | public void setMin(Date min) { 153 | this.min = min; 154 | } 155 | 156 | public Date getMax() { 157 | return max; 158 | } 159 | 160 | public void setMax(Date max) { 161 | this.max = max; 162 | } 163 | 164 | public long getCount() { 165 | return count; 166 | } 167 | 168 | public void setCount(long count) { 169 | this.count = count; 170 | } 171 | 172 | @Override 173 | public void readFields(DataInput in) throws IOException { 174 | min = new Date(in.readLong()); 175 | max = new Date(in.readLong()); 176 | count = in.readLong(); 177 | } 178 | 179 | @Override 180 | public void write(DataOutput out) throws IOException { 181 | out.writeLong(min.getTime()); 182 | out.writeLong(max.getTime()); 183 | out.writeLong(count); 184 | } 185 | 186 | @Override 187 | public String toString() { 188 | return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count; 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/SmarterMedianStdDevDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.Map; 10 | import java.util.TreeMap; 11 | import java.util.Map.Entry; 12 | 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.IntWritable; 18 | import org.apache.hadoop.io.LongWritable; 19 | import org.apache.hadoop.io.SortedMapWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.io.Writable; 22 | import org.apache.hadoop.io.WritableComparable; 23 | import org.apache.hadoop.mapreduce.Job; 24 | import org.apache.hadoop.mapreduce.Mapper; 25 | import org.apache.hadoop.mapreduce.Reducer; 26 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 27 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 28 | import org.apache.hadoop.util.GenericOptionsParser; 29 | 30 | public class SmarterMedianStdDevDriver { 31 | 32 | public static class SOMedianStdDevMapper extends 33 | Mapper { 34 | 35 | private IntWritable commentLength = new IntWritable(); 36 | private static final LongWritable ONE = new LongWritable(1); 37 | private IntWritable outHour = new IntWritable(); 38 | 39 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 40 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 41 | 42 | @SuppressWarnings("deprecation") 43 | @Override 44 | public void map(Object key, Text value, Context context) 45 | throws IOException, InterruptedException { 46 | 47 | // Parse the input string into a nice map 48 | Map parsed = MRDPUtils.transformXmlToMap(value 49 | .toString()); 50 | 51 | // Grab the "CreationDate" field, 52 | // since it is what we are grouping by 53 | String strDate = parsed.get("CreationDate"); 54 | 55 | // Grab the comment to find the length 56 | String text = parsed.get("Text"); 57 | 58 | // .get will return null if the key is not there 59 | if 
(strDate == null || text == null) { 60 | // skip this record 61 | return; 62 | } 63 | 64 | try { 65 | // get the hour this comment was posted in 66 | Date creationDate = frmt.parse(strDate); 67 | outHour.set(creationDate.getHours()); 68 | 69 | commentLength.set(text.length()); 70 | SortedMapWritable outCommentLength = new SortedMapWritable(); 71 | outCommentLength.put(commentLength, ONE); 72 | 73 | // write out the user ID with min max dates and count 74 | context.write(outHour, outCommentLength); 75 | 76 | } catch (ParseException e) { 77 | System.err.println(e.getMessage()); 78 | return; 79 | } 80 | } 81 | } 82 | 83 | public static class SOMedianStdDevCombiner 84 | extends 85 | Reducer { 86 | 87 | @SuppressWarnings("rawtypes") 88 | protected void reduce(IntWritable key, 89 | Iterable values, Context context) 90 | throws IOException, InterruptedException { 91 | 92 | SortedMapWritable outValue = new SortedMapWritable(); 93 | 94 | for (SortedMapWritable v : values) { 95 | for (Entry entry : v.entrySet()) { 96 | LongWritable count = (LongWritable) outValue.get(entry 97 | .getKey()); 98 | 99 | if (count != null) { 100 | count.set(count.get() 101 | + ((LongWritable) entry.getValue()).get()); 102 | } else { 103 | outValue.put(entry.getKey(), new LongWritable( 104 | ((LongWritable) entry.getValue()).get())); 105 | } 106 | } 107 | } 108 | 109 | context.write(key, outValue); 110 | } 111 | } 112 | 113 | public static class SOMedianStdDevReducer 114 | extends 115 | Reducer { 116 | private MedianStdDevTuple result = new MedianStdDevTuple(); 117 | private TreeMap commentLengthCounts = new TreeMap(); 118 | 119 | @SuppressWarnings("rawtypes") 120 | @Override 121 | public void reduce(IntWritable key, Iterable values, 122 | Context context) throws IOException, InterruptedException { 123 | 124 | float sum = 0; 125 | long totalComments = 0; 126 | commentLengthCounts.clear(); 127 | result.setMedian(0); 128 | result.setStdDev(0); 129 | 130 | for (SortedMapWritable v : values) { 131 | for (Entry entry : v.entrySet()) { 132 | int length = ((IntWritable) entry.getKey()).get(); 133 | long count = ((LongWritable) entry.getValue()).get(); 134 | 135 | totalComments += count; 136 | sum += length * count; 137 | 138 | Long storedCount = commentLengthCounts.get(length); 139 | if (storedCount == null) { 140 | commentLengthCounts.put(length, count); 141 | } else { 142 | commentLengthCounts.put(length, storedCount + count); 143 | } 144 | } 145 | } 146 | 147 | long medianIndex = totalComments / 2L; 148 | long previousComments = 0; 149 | long comments = 0; 150 | int prevKey = 0; 151 | for (Entry entry : commentLengthCounts.entrySet()) { 152 | comments = previousComments + entry.getValue(); 153 | if (previousComments <= medianIndex && medianIndex < comments) { 154 | if (totalComments % 2 == 0) { 155 | if (previousComments == medianIndex) { 156 | result.setMedian((float) (entry.getKey() + prevKey) / 2.0f); 157 | } else { 158 | result.setMedian(entry.getKey()); 159 | } 160 | } else { 161 | result.setMedian(entry.getKey()); 162 | } 163 | break; 164 | } 165 | previousComments = comments; 166 | prevKey = entry.getKey(); 167 | } 168 | 169 | // calculate standard deviation 170 | float mean = sum / totalComments; 171 | 172 | float sumOfSquares = 0.0f; 173 | for (Entry entry : commentLengthCounts.entrySet()) { 174 | sumOfSquares += (entry.getKey() - mean) 175 | * (entry.getKey() - mean) * entry.getValue(); 176 | } 177 | 178 | result.setStdDev((float) Math.sqrt(sumOfSquares 179 | / (totalComments - 1))); 180 | 181 | context.write(key, 
result); 182 | } 183 | } 184 | 185 | public static void main(String[] args) throws Exception { 186 | Configuration conf = new Configuration(); 187 | String[] otherArgs = new GenericOptionsParser(conf, args) 188 | .getRemainingArgs(); 189 | if (otherArgs.length != 2) { 190 | System.err.println("Usage: MedianStdDevDriver "); 191 | System.exit(2); 192 | } 193 | Job job = new Job(conf, 194 | "StackOverflow Comment Length Median StdDev By Hour"); 195 | job.setJarByClass(SmarterMedianStdDevDriver.class); 196 | job.setMapperClass(SOMedianStdDevMapper.class); 197 | job.setCombinerClass(SOMedianStdDevCombiner.class); 198 | job.setReducerClass(SOMedianStdDevReducer.class); 199 | job.setMapOutputKeyClass(IntWritable.class); 200 | job.setMapOutputValueClass(SortedMapWritable.class); 201 | job.setOutputKeyClass(IntWritable.class); 202 | job.setOutputValueClass(MedianStdDevTuple.class); 203 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 204 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 205 | System.exit(job.waitForCompletion(true) ? 0 : 1); 206 | } 207 | 208 | public static class MedianStdDevTuple implements Writable { 209 | private float median = 0; 210 | private float stddev = 0f; 211 | 212 | public float getMedian() { 213 | return median; 214 | } 215 | 216 | public void setMedian(float median) { 217 | this.median = median; 218 | } 219 | 220 | public float getStdDev() { 221 | return stddev; 222 | } 223 | 224 | public void setStdDev(float stddev) { 225 | this.stddev = stddev; 226 | } 227 | 228 | @Override 229 | public void readFields(DataInput in) throws IOException { 230 | median = in.readFloat(); 231 | stddev = in.readFloat(); 232 | } 233 | 234 | @Override 235 | public void write(DataOutput out) throws IOException { 236 | out.writeFloat(median); 237 | out.writeFloat(stddev); 238 | } 239 | 240 | @Override 241 | public String toString() { 242 | return median + "\t" + stddev; 243 | } 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/WikipediaIndex.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.util.GenericOptionsParser; 17 | 18 | import org.apache.commons.lang.StringEscapeUtils; 19 | 20 | public class WikipediaIndex { 21 | 22 | public static String getWikipediaURL(String text) { 23 | 24 | int idx = text.indexOf("\"http://en.wikipedia.org"); 25 | if (idx == -1) { 26 | return null; 27 | } 28 | int idx_end = text.indexOf('"', idx + 1); 29 | 30 | if (idx_end == -1) { 31 | return null; 32 | } 33 | 34 | int idx_hash = text.indexOf('#', idx + 1); 35 | 36 | if (idx_hash != -1 && idx_hash < idx_end) { 37 | return text.substring(idx + 1, idx_hash); 38 | } else { 39 | return text.substring(idx + 1, idx_end); 40 | } 41 | 42 | } 43 | 44 | public static class SOWikipediaExtractor extends 45 | Mapper { 46 | 47 | private Text link = new Text(); 48 | private Text outkey = new Text(); 49 | 50 | public void map(Object key, 
Text value, Context context)
51 | throws IOException, InterruptedException {
52 |
53 | // Parse the input string into a nice map
54 | Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
55 | .toString());
56 |
57 | // Grab the necessary XML attributes
58 | String txt = parsed.get("Body");
59 | String posttype = parsed.get("PostTypeId");
60 | String row_id = parsed.get("Id");
61 |
62 | // if the body is null, or the post is a question (1), skip
63 | if (txt == null || (posttype != null && posttype.equals("1"))) {
64 | return;
65 | }
66 |
67 | // Unescape the HTML because the SO data is escaped.
68 | txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase());
69 |
70 | String wikiUrl = getWikipediaURL(txt); if (wikiUrl == null) { return; } link.set(wikiUrl); // skip posts with no Wikipedia link; Text.set(null) would throw
71 | outkey.set(row_id);
72 | context.write(link, outkey);
73 | }
74 | }
75 |
76 | public static class Concatenator extends Reducer<Text, Text, Text, Text> {
77 | private Text result = new Text();
78 |
79 | public void reduce(Text key, Iterable<Text> values, Context context)
80 | throws IOException, InterruptedException {
81 |
82 | StringBuilder sb = new StringBuilder();
83 | for (Text id : values) {
84 | sb.append(id.toString() + " ");
85 | }
86 |
87 | result.set(sb.substring(0, sb.length() - 1).toString());
88 | context.write(key, result);
89 | }
90 | }
91 |
92 | public static void main(String[] args) throws Exception {
93 | Configuration conf = new Configuration();
94 | String[] otherArgs = new GenericOptionsParser(conf, args)
95 | .getRemainingArgs();
96 | if (otherArgs.length != 2) {
97 | System.err.println("Usage: WikipediaIndex ");
98 | System.exit(2);
99 | }
100 | Job job = new Job(conf, "StackOverflow Wikipedia URL Inverted Index");
101 | job.setJarByClass(WikipediaIndex.class);
102 | job.setMapperClass(SOWikipediaExtractor.class);
103 | job.setCombinerClass(Concatenator.class);
104 | job.setReducerClass(Concatenator.class);
105 | job.setOutputKeyClass(Text.class);
106 | job.setOutputValueClass(Text.class);
107 | FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
108 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
109 | System.exit(job.waitForCompletion(true) ?
0 : 1); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/BloomFilteringDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.net.URI; 7 | import java.util.Map; 8 | import java.util.StringTokenizer; 9 | 10 | import mrdp.utils.MRDPUtils; 11 | 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.filecache.DistributedCache; 14 | import org.apache.hadoop.fs.FileSystem; 15 | import org.apache.hadoop.fs.Path; 16 | import org.apache.hadoop.io.NullWritable; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | import org.apache.hadoop.util.bloom.BloomFilter; 24 | import org.apache.hadoop.util.bloom.Key; 25 | 26 | public class BloomFilteringDriver { 27 | 28 | public static class BloomFilteringMapper extends 29 | Mapper { 30 | 31 | private BloomFilter filter = new BloomFilter(); 32 | 33 | @Override 34 | protected void setup(Context context) throws IOException, 35 | InterruptedException { 36 | URI[] files = DistributedCache.getCacheFiles(context 37 | .getConfiguration()); 38 | 39 | // if the files in the distributed cache are set 40 | if (files != null && files.length == 1) { 41 | System.out.println("Reading Bloom filter from: " 42 | + files[0].getPath()); 43 | 44 | // Open local file for read. 45 | DataInputStream strm = new DataInputStream(new FileInputStream( 46 | files[0].getPath())); 47 | 48 | // Read into our Bloom filter. 49 | filter.readFields(strm); 50 | strm.close(); 51 | } else { 52 | throw new IOException( 53 | "Bloom filter file not set in the DistributedCache."); 54 | } 55 | } 56 | 57 | @Override 58 | public void map(Object key, Text value, Context context) 59 | throws IOException, InterruptedException { 60 | 61 | // Parse the input into a nice map. 
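// The Bloom filter loaded in setup() can return false positives but never false
// negatives, so comments containing a hot-list word are always kept, along with a small
// number of comments that match no hot-list word at all.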
62 | Map parsed = MRDPUtils.transformXmlToMap(value 63 | .toString()); 64 | 65 | // Get the value for the comment 66 | String comment = parsed.get("Text"); 67 | 68 | // If it is null, skip this record 69 | if (comment == null) { 70 | return; 71 | } 72 | 73 | StringTokenizer tokenizer = new StringTokenizer(comment); 74 | // For each word in the comment 75 | while (tokenizer.hasMoreTokens()) { 76 | 77 | // Clean up the words 78 | String cleanWord = tokenizer.nextToken().replaceAll("'", "") 79 | .replaceAll("[^a-zA-Z]", " "); 80 | 81 | // If the word is in the filter, output it and break 82 | if (cleanWord.length() > 0 83 | && filter.membershipTest(new Key(cleanWord.getBytes()))) { 84 | context.write(value, NullWritable.get()); 85 | break; 86 | } 87 | } 88 | } 89 | } 90 | 91 | public static void main(String[] args) throws Exception { 92 | Configuration conf = new Configuration(); 93 | String[] otherArgs = new GenericOptionsParser(conf, args) 94 | .getRemainingArgs(); 95 | if (otherArgs.length != 3) { 96 | System.err.println("Usage: BloomFiltering "); 97 | System.exit(1); 98 | } 99 | 100 | FileSystem.get(conf).delete(new Path(otherArgs[2]), true); 101 | 102 | Job job = new Job(conf, "StackOverflow Bloom Filtering"); 103 | job.setJarByClass(BloomFilteringDriver.class); 104 | job.setMapperClass(BloomFilteringMapper.class); 105 | job.setNumReduceTasks(0); 106 | job.setOutputKeyClass(Text.class); 107 | job.setOutputValueClass(NullWritable.class); 108 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 109 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 110 | 111 | DistributedCache.addCacheFile( 112 | FileSystem.get(conf).makeQualified(new Path(otherArgs[1])) 113 | .toUri(), job.getConfiguration()); 114 | 115 | System.exit(job.waitForCompletion(true) ? 0 : 1); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/DistinctUserDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.NullWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | 19 | public class DistinctUserDriver { 20 | 21 | public static class SODistinctUserMapper extends 22 | Mapper { 23 | 24 | private Text outUserId = new Text(); 25 | 26 | @Override 27 | public void map(Object key, Text value, Context context) 28 | throws IOException, InterruptedException { 29 | 30 | // Parse the input into a nice map. 
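// Distinct pattern: the mapper emits every user ID with a NullWritable value, and the
// combiner/reducer collapse duplicates so each ID is written exactly once.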
31 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 32 | 33 | // Get the value for the UserId attribute 34 | String userId = parsed.get("UserId"); 35 | 36 | // If it is null, skip this record 37 | if (userId == null) { 38 | return; 39 | } 40 | 41 | // Otherwise, set our output key to the user's id 42 | outUserId.set(userId); 43 | 44 | // Write the user's id with a null value 45 | context.write(outUserId, NullWritable.get()); 46 | } 47 | } 48 | 49 | public static class SODistinctUserReducer extends 50 | Reducer { 51 | 52 | @Override 53 | public void reduce(Text key, Iterable values, 54 | Context context) throws IOException, InterruptedException { 55 | 56 | // Write the user's id with a null value 57 | context.write(key, NullWritable.get()); 58 | } 59 | } 60 | 61 | public static void main(String[] args) throws Exception { 62 | Configuration conf = new Configuration(); 63 | String[] otherArgs = new GenericOptionsParser(conf, args) 64 | .getRemainingArgs(); 65 | if (otherArgs.length != 2) { 66 | System.err.println("Usage: UniqueUserCount "); 67 | System.exit(2); 68 | } 69 | 70 | Job job = new Job(conf, "StackOverflow Distinct Users"); 71 | job.setJarByClass(DistinctUserDriver.class); 72 | job.setMapperClass(SODistinctUserMapper.class); 73 | job.setCombinerClass(SODistinctUserReducer.class); 74 | job.setReducerClass(SODistinctUserReducer.class); 75 | job.setOutputKeyClass(Text.class); 76 | job.setOutputValueClass(NullWritable.class); 77 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 78 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 79 | 80 | System.exit(job.waitForCompletion(true) ? 0 : 1); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/DistributedGrep.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.*; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.*; 7 | import org.apache.hadoop.io.*; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.util.GenericOptionsParser; 13 | 14 | public class DistributedGrep { 15 | 16 | public static class GrepMapper extends 17 | Mapper { 18 | 19 | public void map(Object key, Text value, Context context) 20 | throws IOException, InterruptedException { 21 | 22 | String txt = value.toString(); 23 | String mapRegex = context.getConfiguration().get("mapregex"); 24 | 25 | if (txt.matches(mapRegex)) { 26 | context.write(NullWritable.get(), value); 27 | } 28 | } 29 | } 30 | 31 | public static void main(String[] args) throws Exception { 32 | Configuration conf = new Configuration(); 33 | String[] otherArgs = new GenericOptionsParser(conf, args) 34 | .getRemainingArgs(); 35 | if (otherArgs.length != 3) { 36 | System.err.println("Usage: DistributedGrep "); 37 | System.exit(2); 38 | } 39 | conf.set("mapregex", otherArgs[0]); 40 | 41 | Job job = new Job(conf, "Distributed Grep"); 42 | job.setJarByClass(DistributedGrep.class); 43 | job.setMapperClass(GrepMapper.class); 44 | job.setOutputKeyClass(NullWritable.class); 45 | job.setOutputValueClass(Text.class); 46 | job.setNumReduceTasks(0); // Set number of reducers to zero 47 | FileInputFormat.addInputPath(job, new Path(otherArgs[1])); 48 | FileOutputFormat.setOutputPath(job, new 
Path(otherArgs[2])); 49 | System.exit(job.waitForCompletion(true) ? 0 : 1); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/QueryBloomFiltering.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.net.URI; 7 | import java.util.Map; 8 | 9 | import mrdp.utils.MRDPUtils; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.filecache.DistributedCache; 13 | import org.apache.hadoop.fs.FileSystem; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.hbase.HBaseConfiguration; 16 | import org.apache.hadoop.hbase.client.Get; 17 | import org.apache.hadoop.hbase.client.HTable; 18 | import org.apache.hadoop.hbase.client.Result; 19 | import org.apache.hadoop.io.NullWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.mapreduce.Job; 22 | import org.apache.hadoop.mapreduce.Mapper; 23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 25 | import org.apache.hadoop.util.GenericOptionsParser; 26 | import org.apache.hadoop.util.bloom.BloomFilter; 27 | import org.apache.hadoop.util.bloom.Key; 28 | 29 | public class QueryBloomFiltering { 30 | 31 | public static class BloomFilteringMapper extends 32 | Mapper { 33 | 34 | private BloomFilter filter = new BloomFilter(); 35 | private HTable table = null; 36 | 37 | @Override 38 | protected void setup(Context context) throws IOException, 39 | InterruptedException { 40 | URI[] files = DistributedCache.getCacheFiles(context 41 | .getConfiguration()); 42 | 43 | // if the files in the distributed cache are set 44 | if (files != null && files.length == 1) { 45 | System.out.println("Reading Bloom filter from: " 46 | + files[0].getPath()); 47 | 48 | // Open local file for read. 49 | DataInputStream strm = new DataInputStream(new FileInputStream( 50 | files[0].getPath())); 51 | 52 | // Read into our Bloom filter. 53 | filter.readFields(strm); 54 | strm.close(); 55 | } else { 56 | throw new IOException( 57 | "Bloom filter file not set in the DistributedCache."); 58 | } 59 | 60 | // Get HBase table of user info 61 | Configuration hconf = HBaseConfiguration.create(); 62 | table = new HTable(hconf, "user_table"); 63 | } 64 | 65 | @Override 66 | public void map(Object key, Text value, Context context) 67 | throws IOException, InterruptedException { 68 | 69 | // Parse the input into a nice map. 
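// The Bloom filter is only a cheap pre-screen here: a false positive merely triggers an
// extra HBase lookup, and the reputation check below remains the authoritative filter.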
70 | Map parsed = MRDPUtils.transformXmlToMap(value 71 | .toString()); 72 | 73 | // Get the value for the comment 74 | String userid = parsed.get("UserId"); 75 | 76 | // If it is null, skip this record 77 | if (userid == null) { 78 | return; 79 | } 80 | 81 | // If this user ID is in the set 82 | if (filter.membershipTest(new Key(userid.getBytes()))) { 83 | // Get the reputation from the HBase table 84 | Result r = table.get(new Get(userid.getBytes())); 85 | int reputation = Integer.parseInt(new String(r.getValue( 86 | "attr".getBytes(), "Reputation".getBytes()))); 87 | // If the reputation is at least 1,500, 88 | // write the record to the file system 89 | if (reputation >= 1500) { 90 | context.write(value, NullWritable.get()); 91 | } 92 | } 93 | } 94 | } 95 | 96 | public static void main(String[] args) throws Exception { 97 | Configuration conf = new Configuration(); 98 | String[] otherArgs = new GenericOptionsParser(conf, args) 99 | .getRemainingArgs(); 100 | if (otherArgs.length != 3) { 101 | System.err.println("Usage: BloomFiltering "); 102 | System.exit(1); 103 | } 104 | 105 | FileSystem.get(conf).delete(new Path(otherArgs[2]), true); 106 | 107 | Job job = new Job(conf, "StackOverflow Bloom Filtering"); 108 | job.setJarByClass(QueryBloomFiltering.class); 109 | job.setMapperClass(BloomFilteringMapper.class); 110 | job.setNumReduceTasks(0); 111 | job.setOutputKeyClass(Text.class); 112 | job.setOutputValueClass(NullWritable.class); 113 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 114 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 115 | 116 | DistributedCache.addCacheFile( 117 | FileSystem.get(conf).makeQualified(new Path(otherArgs[1])) 118 | .toUri(), job.getConfiguration()); 119 | 120 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/SimpleRandomSampling.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.*; 4 | import java.util.Random; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.*; 8 | import org.apache.hadoop.io.*; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.util.GenericOptionsParser; 14 | 15 | public class SimpleRandomSampling { 16 | 17 | public static class SRSMapper extends 18 | Mapper { 19 | 20 | private Random rands = new Random(); 21 | private Double percentage; 22 | 23 | @Override 24 | protected void setup(Context context) throws IOException, 25 | InterruptedException { 26 | // retrieve the percentage that is passed in via the configuration 27 | // like this: conf.set("filter_percentage", .5); for .5% 28 | String strPercentage = context.getConfiguration().get( 29 | "filter_percentage"); 30 | 31 | percentage = Double.parseDouble(strPercentage) / 100.0; 32 | } 33 | 34 | @Override 35 | public void map(Object key, Text value, Context context) 36 | throws IOException, InterruptedException { 37 | 38 | if (rands.nextDouble() < percentage) { 39 | context.write(NullWritable.get(), value); 40 | } 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | Configuration conf = new Configuration(); 46 | String[] otherArgs = new GenericOptionsParser(conf, args) 47 | .getRemainingArgs(); 48 | if (otherArgs.length != 3) { 49 | System.err.println("Usage: SRS "); 50 | System.exit(2); 51 | } 52 | conf.set("filter_percentage", otherArgs[0]); 53 | 54 | Job job = new Job(conf, "SRS"); 55 | job.setJarByClass(SimpleRandomSampling.class); 56 | job.setMapperClass(SRSMapper.class); 57 | job.setOutputKeyClass(NullWritable.class); 58 | job.setOutputValueClass(Text.class); 59 | job.setNumReduceTasks(0); // Set number of reducers to zero 60 | FileInputFormat.addInputPath(job, new Path(otherArgs[1])); 61 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 62 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/TopTenDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.TreeMap; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | public class TopTenDriver { 21 | 22 | public static class SOTopTenMapper extends 23 | Mapper { 24 | // Our output key and value Writables 25 | private TreeMap repToRecordMap = new TreeMap(); 26 | 27 | @Override 28 | public void map(Object key, Text value, Context context) 29 | throws IOException, InterruptedException { 30 | // Parse the input string into a nice map 31 | Map parsed = MRDPUtils.transformXmlToMap(value 32 | .toString()); 33 | if (parsed == null) { 34 | return; 35 | } 36 | 37 | String userId = parsed.get("Id"); 38 | String reputation = parsed.get("Reputation"); 39 | 40 | // Get will return null if the key is not there 41 | if (userId == null || reputation == null) { 42 | // skip this record 43 | return; 44 | } 45 | 46 | repToRecordMap.put(Integer.parseInt(reputation), new Text(value)); 47 | 48 | if (repToRecordMap.size() > 10) { 49 | repToRecordMap.remove(repToRecordMap.firstKey()); 50 | } 51 | } 52 | 53 | @Override 54 | protected void cleanup(Context context) throws IOException, 55 | InterruptedException { 56 | for (Text t : repToRecordMap.values()) { 57 | context.write(NullWritable.get(), t); 58 | } 59 | } 60 | } 61 | 62 | public static class SOTopTenReducer extends 63 | Reducer { 64 | 65 | private TreeMap repToRecordMap = new TreeMap(); 66 | 67 | @Override 68 | public void reduce(NullWritable key, Iterable values, 69 | Context context) throws IOException, InterruptedException { 70 | for (Text value : values) { 71 | Map parsed = MRDPUtils.transformXmlToMap(value 72 | .toString()); 73 | 74 | repToRecordMap.put(Integer.parseInt(parsed.get("Reputation")), 75 | new Text(value)); 76 | 77 | if (repToRecordMap.size() > 10) { 78 | repToRecordMap.remove(repToRecordMap.firstKey()); 79 | } 80 | } 81 | 82 | for (Text t : repToRecordMap.descendingMap().values()) { 83 | context.write(NullWritable.get(), t); 84 | } 85 | } 86 | } 87 | 88 | public static void main(String[] args) throws Exception { 89 | Configuration conf = new Configuration(); 90 | String[] otherArgs = new GenericOptionsParser(conf, args) 91 | .getRemainingArgs(); 92 | if (otherArgs.length != 2) { 93 | System.err.println("Usage: TopTenDriver "); 94 | System.exit(2); 95 | } 96 | 97 | Job job = new Job(conf, "Top Ten Users by Reputation"); 98 | job.setJarByClass(TopTenDriver.class); 99 | job.setMapperClass(SOTopTenMapper.class); 100 | job.setReducerClass(SOTopTenReducer.class); 101 | job.setNumReduceTasks(1); 102 | job.setOutputKeyClass(NullWritable.class); 103 | job.setOutputValueClass(Text.class); 104 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 105 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 106 
| System.exit(job.waitForCompletion(true) ? 0 : 1); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/UniqueUserCount.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.NullWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 21 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class UniqueUserCount { 25 | 26 | public static class SODistinctUserMapper extends 27 | Mapper { 28 | 29 | private Text outUserId = new Text(); 30 | 31 | @Override 32 | public void map(Object key, Text value, Context context) 33 | throws IOException, InterruptedException { 34 | 35 | Map parsed = MRDPUtils.transformXmlToMap(value 36 | .toString()); 37 | String userId = parsed.get("UserId"); 38 | if (userId == null) { 39 | return; 40 | } 41 | 42 | outUserId.set(userId); 43 | context.write(outUserId, NullWritable.get()); 44 | } 45 | } 46 | 47 | public static class SODistinctUserReducer extends 48 | Reducer { 49 | 50 | @Override 51 | public void reduce(Text key, Iterable values, 52 | Context context) throws IOException, InterruptedException { 53 | context.write(key, NullWritable.get()); 54 | } 55 | } 56 | 57 | public static class SOUserCounterMapper extends 58 | Mapper { 59 | 60 | private static final Text DUMMY = new Text("Total:"); 61 | private static final IntWritable ONE = new IntWritable(1); 62 | 63 | @Override 64 | public void map(Text key, NullWritable value, Context context) 65 | throws IOException, InterruptedException { 66 | 67 | context.write(DUMMY, ONE); 68 | } 69 | } 70 | 71 | public static void main(String[] args) throws Exception { 72 | Configuration conf = new Configuration(); 73 | String[] otherArgs = new GenericOptionsParser(conf, args) 74 | .getRemainingArgs(); 75 | if (otherArgs.length != 2) { 76 | System.err.println("Usage: UniqueUserCount "); 77 | System.exit(2); 78 | } 79 | 80 | Path tmpout = new Path(otherArgs[1] + "_tmp"); 81 | FileSystem.get(new Configuration()).delete(tmpout, true); 82 | Path finalout = new Path(otherArgs[1]); 83 | Job job = new Job(conf, "StackOverflow Unique User Count"); 84 | job.setJarByClass(UniqueUserCount.class); 85 | job.setMapperClass(SODistinctUserMapper.class); 86 | job.setCombinerClass(SODistinctUserReducer.class); 87 | job.setReducerClass(SODistinctUserReducer.class); 88 | job.setOutputKeyClass(Text.class); 89 | job.setOutputValueClass(NullWritable.class); 90 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 91 | job.setNumReduceTasks(1); 92 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 93 | FileOutputFormat.setOutputPath(job, tmpout); 94 | 95 | boolean exitCode = 
job.waitForCompletion(true); 96 | if (exitCode) { 97 | job = new Job(conf, "Stack Overflow Unique User Count"); 98 | job.setJarByClass(UniqueUserCount.class); 99 | job.setMapperClass(SOUserCounterMapper.class); 100 | job.setCombinerClass(IntSumReducer.class); 101 | job.setReducerClass(IntSumReducer.class); 102 | job.setOutputKeyClass(Text.class); 103 | job.setOutputValueClass(IntWritable.class); 104 | job.setInputFormatClass(SequenceFileInputFormat.class); 105 | FileInputFormat.addInputPath(job, tmpout); 106 | FileOutputFormat.setOutputPath(job, finalout); 107 | exitCode = job.waitForCompletion(true); 108 | } 109 | 110 | System.exit(exitCode ? 0 : 1); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/AnonymizeDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Map.Entry; 6 | import java.util.Random; 7 | 8 | import mrdp.utils.MRDPUtils; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.NullWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | import org.apache.hadoop.util.GenericOptionsParser; 21 | 22 | public class AnonymizeDriver { 23 | 24 | public static class AnonymizeMapper extends 25 | Mapper<Object, Text, IntWritable, Text> { 26 | 27 | private IntWritable outkey = new IntWritable(); 28 | private Random rndm = new Random(); 29 | private Text outvalue = new Text(); 30 | 31 | @Override 32 | public void map(Object key, Text value, Context context) 33 | throws IOException, InterruptedException { 34 | 35 | // Parse the input string into a nice map 36 | Map<String, String> parsed = MRDPUtils.transformXmlToMap(value 37 | .toString()); 38 | 39 | if (parsed.size() > 0) { 40 | StringBuilder bldr = new StringBuilder(); 41 | bldr.append("<row "); 42 | for (Entry<String, String> entry : parsed.entrySet()) { 43 | 44 | if (entry.getKey().equals("UserId") 45 | || entry.getKey().equals("Id")) { 46 | // ignore these fields 47 | } else if (entry.getKey().equals("CreationDate")) { 48 | // Strip out the time, anything after the 'T' in the 49 | // value 50 | bldr.append(entry.getKey() 51 | + "=\"" 52 | + entry.getValue().substring(0, 53 | entry.getValue().indexOf('T')) + "\" "); 54 | } else { 55 | // Otherwise, output this.
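// (Hypothetical illustration of the end result: an input row like
// <row Id="1" UserId="2" CreationDate="2010-01-01T12:00:00.000" Location="NY" />
// is re-emitted as <row CreationDate="2010-01-01" Location="NY" > under a
// random integer key, so the shuffle also scrambles record order.)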
56 | bldr.append(entry.getKey() + "=\"" + entry.getValue() 57 | + "\" "); 58 | } 59 | 60 | } 61 | bldr.append(">"); 62 | outkey.set(rndm.nextInt()); 63 | outvalue.set(bldr.toString()); 64 | context.write(outkey, outvalue); 65 | } 66 | } 67 | } 68 | 69 | public static class ValueReducer extends 70 | Reducer { 71 | @Override 72 | protected void reduce(IntWritable key, Iterable values, 73 | Context context) throws IOException, InterruptedException { 74 | 75 | for (Text t : values) { 76 | context.write(t, NullWritable.get()); 77 | } 78 | } 79 | } 80 | 81 | public static void main(String[] args) throws Exception { 82 | Configuration conf = new Configuration(); 83 | String[] otherArgs = new GenericOptionsParser(conf, args) 84 | .getRemainingArgs(); 85 | if (otherArgs.length != 2) { 86 | System.err.println("Usage: Anonymize "); 87 | System.exit(1); 88 | } 89 | 90 | // Configure the join type 91 | Job job = new Job(conf, "Anonymize"); 92 | job.setJarByClass(AnonymizeDriver.class); 93 | 94 | job.setMapperClass(AnonymizeMapper.class); 95 | job.setReducerClass(ValueReducer.class); 96 | job.setNumReduceTasks(10); 97 | 98 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 99 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 100 | 101 | job.setOutputKeyClass(IntWritable.class); 102 | job.setOutputValueClass(Text.class); 103 | 104 | System.exit(job.waitForCompletion(true) ? 0 : 3); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/Binning.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.commons.lang.StringEscapeUtils; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 18 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 19 | import org.apache.hadoop.util.GenericOptionsParser; 20 | 21 | public class Binning { 22 | 23 | public static class BinningMapper extends 24 | Mapper { 25 | 26 | private MultipleOutputs mos = null; 27 | 28 | @SuppressWarnings({ "unchecked", "rawtypes" }) 29 | @Override 30 | protected void setup(Context context) { 31 | // Create a new MultipleOutputs using the context object 32 | mos = new MultipleOutputs(context); 33 | } 34 | 35 | @Override 36 | protected void map(Object key, Text value, Context context) 37 | throws IOException, InterruptedException { 38 | 39 | // Parse the input string into a nice map 40 | Map parsed = MRDPUtils.transformXmlToMap(value 41 | .toString()); 42 | 43 | String rawtags = parsed.get("Tags"); 44 | if (rawtags == null) { 45 | return; 46 | } 47 | 48 | // Tags are delimited by ><. i.e. 
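// (Illustrative values, not from the data set: an escaped Tags attribute such as
// "&lt;hadoop&gt;&lt;pig&gt;" unescapes to "<hadoop><pig>" and splits on "><"
// into {"<hadoop", "pig>"}, which is why leftover brackets are stripped from
// each token below.)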
49 | String[] tagTokens = StringEscapeUtils.unescapeHtml(rawtags).split( 50 | "><"); 51 | 52 | // For each tag 53 | for (String tag : tagTokens) { 54 | // Remove any > or < from the token 55 | String groomed = tag.replaceAll(">|<", "").toLowerCase(); 56 | 57 | // If this tag is one of the following, write to the named bin 58 | if (groomed.equalsIgnoreCase("hadoop")) { 59 | mos.write("bins", value, NullWritable.get(), "hadoop-tag"); 60 | } 61 | 62 | if (groomed.equalsIgnoreCase("pig")) { 63 | mos.write("bins", value, NullWritable.get(), "pig-tag"); 64 | } 65 | 66 | if (groomed.equalsIgnoreCase("hive")) { 67 | mos.write("bins", value, NullWritable.get(), "hive-tag"); 68 | } 69 | 70 | if (groomed.equalsIgnoreCase("hbase")) { 71 | mos.write("bins", value, NullWritable.get(), "hbase-tag"); 72 | } 73 | } 74 | 75 | // Get the body of the post 76 | String post = parsed.get("Body"); 77 | 78 | if (post == null) { 79 | return; 80 | } 81 | 82 | // If the post contains the word "hadoop", write it to its own bin 83 | if (post.toLowerCase().contains("hadoop")) { 84 | mos.write("bins", value, NullWritable.get(), "hadoop-post"); 85 | } 86 | } 87 | 88 | @Override 89 | protected void cleanup(Context context) throws IOException, 90 | InterruptedException { 91 | // Close multiple outputs! 92 | mos.close(); 93 | } 94 | } 95 | 96 | public static void main(String[] args) throws Exception { 97 | Configuration conf = new Configuration(); 98 | String[] otherArgs = new GenericOptionsParser(conf, args) 99 | .getRemainingArgs(); 100 | if (otherArgs.length != 2) { 101 | System.err.println("Usage: Binning "); 102 | System.exit(1); 103 | } 104 | 105 | Job job = new Job(conf, "Binning"); 106 | job.setJarByClass(Binning.class); 107 | job.setMapperClass(BinningMapper.class); 108 | job.setNumReduceTasks(0); 109 | 110 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 111 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 112 | 113 | // Configure the MultipleOutputs by adding an output called "bins" 114 | // With the proper output format and mapper key/value pairs 115 | MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class, 116 | Text.class, NullWritable.class); 117 | 118 | // Enable the counters for the job 119 | // If there is a significant number of different named outputs, this 120 | // should be disabled 121 | MultipleOutputs.setCountersEnabled(job, true); 122 | 123 | System.exit(job.waitForCompletion(true) ? 
0 : 2); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/PartitionedUsers.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Calendar; 7 | import java.util.Map; 8 | 9 | import mrdp.utils.MRDPUtils; 10 | 11 | import org.apache.hadoop.conf.Configurable; 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.IntWritable; 15 | import org.apache.hadoop.io.NullWritable; 16 | import org.apache.hadoop.io.Text; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.Mapper; 19 | import org.apache.hadoop.mapreduce.Partitioner; 20 | import org.apache.hadoop.mapreduce.Reducer; 21 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 24 | import org.apache.hadoop.util.GenericOptionsParser; 25 | 26 | public class PartitionedUsers { 27 | 28 | public static class LastAccessDateMapper extends 29 | Mapper { 30 | 31 | // This object will format the creation date string into a Date object 32 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 33 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 34 | 35 | private IntWritable outkey = new IntWritable(); 36 | 37 | @Override 38 | protected void map(Object key, Text value, Context context) 39 | throws IOException, InterruptedException { 40 | 41 | // Parse the input string into a nice map 42 | Map parsed = MRDPUtils.transformXmlToMap(value 43 | .toString()); 44 | 45 | // Grab the last access date 46 | String strDate = parsed.get("LastAccessDate"); 47 | 48 | // skip this record if date is null 49 | if (strDate != null) { 50 | try { 51 | // Parse the string into a Calendar object 52 | Calendar cal = Calendar.getInstance(); 53 | cal.setTime(frmt.parse(strDate)); 54 | outkey.set(cal.get(Calendar.YEAR)); 55 | // Write out the year with the input value 56 | context.write(outkey, value); 57 | } catch (ParseException e) { 58 | // An error occurred parsing the creation Date string 59 | // skip this record 60 | } 61 | } 62 | } 63 | } 64 | 65 | public static class LastAccessDatePartitioner extends 66 | Partitioner implements Configurable { 67 | 68 | private static final String MIN_LAST_ACCESS_DATE_YEAR = "min.last.access.date.year"; 69 | 70 | private Configuration conf = null; 71 | private int minLastAccessDateYear = 0; 72 | 73 | @Override 74 | public int getPartition(IntWritable key, Text value, int numPartitions) { 75 | return key.get() - minLastAccessDateYear; 76 | } 77 | 78 | @Override 79 | public Configuration getConf() { 80 | return conf; 81 | } 82 | 83 | @Override 84 | public void setConf(Configuration conf) { 85 | this.conf = conf; 86 | minLastAccessDateYear = conf.getInt(MIN_LAST_ACCESS_DATE_YEAR, 0); 87 | } 88 | 89 | /** 90 | * Sets the minimum possible last access date to subtract from each key 91 | * to be partitioned
92 | *
93 | * 94 | * That is, if the last min access date is "2008" and the key to 95 | * partition is "2009", it will go to partition 2009 - 2008 = 1 96 | * 97 | * @param job 98 | * The job to configure 99 | * @param minLastAccessDateYear 100 | * The minimum access date. 101 | */ 102 | public static void setMinLastAccessDate(Job job, 103 | int minLastAccessDateYear) { 104 | job.getConfiguration().setInt(MIN_LAST_ACCESS_DATE_YEAR, 105 | minLastAccessDateYear); 106 | } 107 | } 108 | 109 | public static class ValueReducer extends 110 | Reducer { 111 | 112 | protected void reduce(IntWritable key, Iterable values, 113 | Context context) throws IOException, InterruptedException { 114 | for (Text t : values) { 115 | context.write(t, NullWritable.get()); 116 | } 117 | } 118 | } 119 | 120 | public static void main(String[] args) throws Exception { 121 | Configuration conf = new Configuration(); 122 | String[] otherArgs = new GenericOptionsParser(conf, args) 123 | .getRemainingArgs(); 124 | if (otherArgs.length != 2) { 125 | System.err.println("Usage: PartitionedUsers "); 126 | System.exit(2); 127 | } 128 | 129 | Job job = new Job(conf, "PartitionedUsers"); 130 | 131 | job.setJarByClass(PartitionedUsers.class); 132 | 133 | job.setMapperClass(LastAccessDateMapper.class); 134 | 135 | // Set custom partitioner and min last access date 136 | job.setPartitionerClass(LastAccessDatePartitioner.class); 137 | LastAccessDatePartitioner.setMinLastAccessDate(job, 2008); 138 | 139 | // Last access dates span between 2008-2011, or 4 years 140 | job.setNumReduceTasks(4); 141 | job.setReducerClass(ValueReducer.class); 142 | 143 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 144 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 145 | 146 | job.setOutputKeyClass(IntWritable.class); 147 | job.setOutputValueClass(Text.class); 148 | 149 | job.setOutputFormatClass(TextOutputFormat.class); 150 | job.getConfiguration().set("mapred.textoutputformat.separator", ""); 151 | 152 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/PostCommentBuildingDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.io.StringWriter; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import javax.xml.parsers.DocumentBuilder; 11 | import javax.xml.parsers.DocumentBuilderFactory; 12 | import javax.xml.transform.OutputKeys; 13 | import javax.xml.transform.Transformer; 14 | import javax.xml.transform.TransformerFactory; 15 | import javax.xml.transform.dom.DOMSource; 16 | import javax.xml.transform.stream.StreamResult; 17 | 18 | import mrdp.utils.MRDPUtils; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | import org.apache.hadoop.fs.Path; 22 | import org.apache.hadoop.io.NullWritable; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapreduce.Job; 25 | import org.apache.hadoop.mapreduce.Mapper; 26 | import org.apache.hadoop.mapreduce.Reducer; 27 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 28 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 29 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 30 | import org.apache.hadoop.util.GenericOptionsParser; 31 | import org.w3c.dom.Attr; 32 | import org.w3c.dom.Document; 33 | import org.w3c.dom.Element; 34 | import org.w3c.dom.NamedNodeMap; 35 | import org.xml.sax.InputSource; 36 | 37 | public class PostCommentBuildingDriver { 38 | 39 | public static class PostMapper extends Mapper { 40 | 41 | private Text outkey = new Text(); 42 | private Text outvalue = new Text(); 43 | 44 | @Override 45 | public void map(Object key, Text value, Context context) 46 | throws IOException, InterruptedException { 47 | 48 | // Parse the input string into a nice map 49 | Map parsed = MRDPUtils.transformXmlToMap(value 50 | .toString()); 51 | 52 | String postId = parsed.get("Id"); 53 | 54 | if (postId == null) { 55 | return; 56 | } 57 | 58 | // The foreign join key is the post ID 59 | outkey.set(postId); 60 | 61 | // Flag this record for the reducer and then output 62 | outvalue.set("P" + value.toString()); 63 | context.write(outkey, outvalue); 64 | } 65 | } 66 | 67 | public static class CommentMapper extends Mapper { 68 | private Text outkey = new Text(); 69 | private Text outvalue = new Text(); 70 | 71 | @Override 72 | public void map(Object key, Text value, Context context) 73 | throws IOException, InterruptedException { 74 | 75 | // Parse the input string into a nice map 76 | Map parsed = MRDPUtils.transformXmlToMap(value 77 | .toString()); 78 | 79 | String postId = parsed.get("PostId"); 80 | if (postId == null) { 81 | return; 82 | } 83 | 84 | // The foreign join key is the user ID 85 | outkey.set(postId); 86 | 87 | // Flag this record for the reducer and then output 88 | outvalue.set("C" + value.toString()); 89 | context.write(outkey, outvalue); 90 | } 91 | } 92 | 93 | public static class PostCommentHierarchyReducer extends 94 | Reducer { 95 | 96 | private ArrayList comments = new ArrayList(); 97 | private DocumentBuilderFactory dbf = DocumentBuilderFactory 98 | .newInstance(); 99 | private String post = null; 100 | 101 | @Override 102 | public void reduce(Text key, Iterable values, Context context) 103 | throws IOException, InterruptedException { 104 | // Reset variables 105 | post = null; 106 | 
comments.clear(); 107 | 108 | // For each input value 109 | for (Text t : values) { 110 | // If this is the post record, store it, minus the flag 111 | if (t.charAt(0) == 'P') { 112 | post = t.toString().substring(1, t.toString().length()) 113 | .trim(); 114 | } else { 115 | // Else, it is a comment record. Add it to the list, minus 116 | // the flag 117 | comments.add(t.toString() 118 | .substring(1, t.toString().length()).trim()); 119 | } 120 | } 121 | 122 | // If post is not null 123 | if (post != null) { 124 | // nest the comments underneath the post element 125 | String postWithCommentChildren = nestElements(post, comments); 126 | 127 | // write out the XML 128 | context.write(new Text(postWithCommentChildren), 129 | NullWritable.get()); 130 | } 131 | } 132 | 133 | private String nestElements(String post, List comments) { 134 | try { 135 | // Create the new document to build the XML 136 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 137 | Document doc = bldr.newDocument(); 138 | 139 | // Copy parent node to document 140 | Element postEl = getXmlElementFromString(post); 141 | Element toAddPostEl = doc.createElement("post"); 142 | 143 | // Copy the attributes of the original post element to the new 144 | // one 145 | copyAttributesToElement(postEl.getAttributes(), toAddPostEl); 146 | 147 | // For each comment, copy it to the "post" node 148 | for (String commentXml : comments) { 149 | Element commentEl = getXmlElementFromString(commentXml); 150 | Element toAddCommentEl = doc.createElement("comments"); 151 | 152 | // Copy the attributes of the original comment element to 153 | // the new one 154 | copyAttributesToElement(commentEl.getAttributes(), 155 | toAddCommentEl); 156 | 157 | // Add the copied comment to the post element 158 | toAddPostEl.appendChild(toAddCommentEl); 159 | } 160 | 161 | // Add the post element to the document 162 | doc.appendChild(toAddPostEl); 163 | 164 | // Transform the document into a String of XML and return 165 | return transformDocumentToString(doc); 166 | 167 | } catch (Exception e) { 168 | return null; 169 | } 170 | } 171 | 172 | private Element getXmlElementFromString(String xml) { 173 | try { 174 | // Create a new document builder 175 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 176 | 177 | // Parse the XML string and return the first element 178 | return bldr.parse(new InputSource(new StringReader(xml))) 179 | .getDocumentElement(); 180 | } catch (Exception e) { 181 | return null; 182 | } 183 | } 184 | 185 | private void copyAttributesToElement(NamedNodeMap attributes, 186 | Element element) { 187 | 188 | // For each attribute, copy it to the element 189 | for (int i = 0; i < attributes.getLength(); ++i) { 190 | Attr toCopy = (Attr) attributes.item(i); 191 | element.setAttribute(toCopy.getName(), toCopy.getValue()); 192 | } 193 | } 194 | 195 | private String transformDocumentToString(Document doc) { 196 | try { 197 | TransformerFactory tf = TransformerFactory.newInstance(); 198 | Transformer transformer = tf.newTransformer(); 199 | transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, 200 | "yes"); 201 | StringWriter writer = new StringWriter(); 202 | transformer.transform(new DOMSource(doc), new StreamResult( 203 | writer)); 204 | // Replace all new line characters with an empty string to have 205 | // one record per line. 
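// (Keeping each nested post on a single physical line is what lets a
// follow-on job, such as the question-answer building driver, read this
// output back record-by-record with TextInputFormat.)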
206 | return writer.getBuffer().toString().replaceAll("\n|\r", ""); 207 | } catch (Exception e) { 208 | return null; 209 | } 210 | } 211 | } 212 | 213 | public static void main(String[] args) throws Exception { 214 | Configuration conf = new Configuration(); 215 | String[] otherArgs = new GenericOptionsParser(conf, args) 216 | .getRemainingArgs(); 217 | if (otherArgs.length != 3) { 218 | System.err 219 | .println("Usage: PostCommentHierarchy "); 220 | System.exit(1); 221 | } 222 | 223 | Job job = new Job(conf, "PostCommentHierarchy"); 224 | job.setJarByClass(PostCommentBuildingDriver.class); 225 | 226 | MultipleInputs.addInputPath(job, new Path(otherArgs[0]), 227 | TextInputFormat.class, PostMapper.class); 228 | 229 | MultipleInputs.addInputPath(job, new Path(otherArgs[1]), 230 | TextInputFormat.class, CommentMapper.class); 231 | 232 | job.setReducerClass(PostCommentHierarchyReducer.class); 233 | 234 | job.setOutputFormatClass(TextOutputFormat.class); 235 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 236 | 237 | job.setOutputKeyClass(Text.class); 238 | job.setOutputValueClass(Text.class); 239 | 240 | System.exit(job.waitForCompletion(true) ? 0 : 2); 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/QuestionAnswerBuildingDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.io.StringWriter; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import javax.xml.parsers.DocumentBuilder; 9 | import javax.xml.parsers.DocumentBuilderFactory; 10 | import javax.xml.transform.OutputKeys; 11 | import javax.xml.transform.Transformer; 12 | import javax.xml.transform.TransformerFactory; 13 | import javax.xml.transform.dom.DOMSource; 14 | import javax.xml.transform.stream.StreamResult; 15 | 16 | import org.apache.hadoop.conf.Configuration; 17 | import org.apache.hadoop.fs.Path; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 25 | import org.apache.hadoop.util.GenericOptionsParser; 26 | import org.w3c.dom.Attr; 27 | import org.w3c.dom.Document; 28 | import org.w3c.dom.Element; 29 | import org.w3c.dom.NamedNodeMap; 30 | import org.xml.sax.InputSource; 31 | 32 | public class QuestionAnswerBuildingDriver { 33 | 34 | public static class PostCommentMapper extends 35 | Mapper { 36 | 37 | private DocumentBuilderFactory dbf = DocumentBuilderFactory 38 | .newInstance(); 39 | private Text outkey = new Text(); 40 | private Text outvalue = new Text(); 41 | 42 | @Override 43 | public void map(Object key, Text value, Context context) 44 | throws IOException, InterruptedException { 45 | 46 | // Parse the post/comment XML hierarchy into an Element 47 | Element post = getXmlElementFromString(value.toString()); 48 | 49 | int postType = Integer.parseInt(post.getAttribute("PostTypeId")); 50 | 51 | // If postType is 1, it is a question 52 | if (postType == 1) { 53 | outkey.set(post.getAttribute("Id")); 54 | outvalue.set("Q" + value.toString()); 55 | } else { 56 | // Else, it is an answer 57 | outkey.set(post.getAttribute("ParentId")); 58 | outvalue.set("A" + 
value.toString()); 59 | } 60 | 61 | context.write(outkey, outvalue); 62 | } 63 | 64 | private Element getXmlElementFromString(String xml) { 65 | try { 66 | // Create a new document builder 67 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 68 | 69 | // Parse the XML string and return the first element 70 | return bldr.parse(new InputSource(new StringReader(xml))) 71 | .getDocumentElement(); 72 | } catch (Exception e) { 73 | return null; 74 | } 75 | } 76 | } 77 | 78 | public static class QuestionAnswerReducer extends 79 | Reducer { 80 | 81 | private ArrayList answers = new ArrayList(); 82 | private DocumentBuilderFactory dbf = DocumentBuilderFactory 83 | .newInstance(); 84 | private String question = null; 85 | 86 | @Override 87 | public void reduce(Text key, Iterable values, Context context) 88 | throws IOException, InterruptedException { 89 | // Reset variables 90 | question = null; 91 | answers.clear(); 92 | 93 | // For each input value 94 | for (Text t : values) { 95 | // If this is the post record, store it, minus the flag 96 | if (t.charAt(0) == 'Q') { 97 | question = t.toString().substring(1, t.toString().length()) 98 | .trim(); 99 | } else { 100 | // Else, it is a comment record. Add it to the list, minus 101 | // the flag 102 | answers.add(t.toString() 103 | .substring(1, t.toString().length()).trim()); 104 | } 105 | } 106 | 107 | // If post is not null 108 | if (question != null) { 109 | // nest the comments underneath the post element 110 | String postWithCommentChildren = nestElements(question, answers); 111 | 112 | // write out the XML 113 | context.write(new Text(postWithCommentChildren), 114 | NullWritable.get()); 115 | } 116 | } 117 | 118 | private String nestElements(String post, List comments) { 119 | try { 120 | // Create the new document to build the XML 121 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 122 | Document doc = bldr.newDocument(); 123 | 124 | // Copy parent node to document 125 | Element postEl = getXmlElementFromString(post); 126 | Element toAddPostEl = doc.createElement("question"); 127 | 128 | // Copy the attributes of the original post element to the new 129 | // one 130 | copyAttributesToElement(postEl.getAttributes(), toAddPostEl); 131 | 132 | // For each comment, copy it to the "post" node 133 | for (String commentXml : comments) { 134 | Element commentEl = getXmlElementFromString(commentXml); 135 | Element toAddCommentEl = doc.createElement("answer"); 136 | 137 | // Copy the attributes of the original comment element to 138 | // the new one 139 | copyAttributesToElement(commentEl.getAttributes(), 140 | toAddCommentEl); 141 | 142 | // Add the copied comment to the post element 143 | toAddPostEl.appendChild(toAddCommentEl); 144 | } 145 | 146 | // Add the post element to the document 147 | doc.appendChild(toAddPostEl); 148 | 149 | // Transform the document into a String of XML and return 150 | return transformDocumentToString(doc); 151 | 152 | } catch (Exception e) { 153 | return null; 154 | } 155 | } 156 | 157 | private Element getXmlElementFromString(String xml) { 158 | try { 159 | // Create a new document builder 160 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 161 | 162 | // Parse the XML string and return the first element 163 | return bldr.parse(new InputSource(new StringReader(xml))) 164 | .getDocumentElement(); 165 | } catch (Exception e) { 166 | return null; 167 | } 168 | } 169 | 170 | private void copyAttributesToElement(NamedNodeMap attributes, 171 | Element element) { 172 | 173 | // For each attribute, copy it to the element 
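// (The StackOverflow dump stores every field as an XML attribute, so copying
// the attribute list is enough to carry the whole question or answer record
// into the new element.)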
174 | for (int i = 0; i < attributes.getLength(); ++i) { 175 | Attr toCopy = (Attr) attributes.item(i); 176 | element.setAttribute(toCopy.getName(), toCopy.getValue()); 177 | } 178 | } 179 | 180 | private String transformDocumentToString(Document doc) { 181 | try { 182 | TransformerFactory tf = TransformerFactory.newInstance(); 183 | Transformer transformer = tf.newTransformer(); 184 | transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, 185 | "yes"); 186 | StringWriter writer = new StringWriter(); 187 | transformer.transform(new DOMSource(doc), new StreamResult( 188 | writer)); 189 | // Replace all new line characters with an empty string to have 190 | // one record per line. 191 | return writer.getBuffer().toString().replaceAll("\n|\r", ""); 192 | } catch (Exception e) { 193 | return null; 194 | } 195 | } 196 | } 197 | 198 | public static void main(String[] args) throws Exception { 199 | Configuration conf = new Configuration(); 200 | String[] otherArgs = new GenericOptionsParser(conf, args) 201 | .getRemainingArgs(); 202 | if (otherArgs.length != 2) { 203 | System.err 204 | .println("Usage: QuestionAnswerHierarchy "); 205 | System.exit(1); 206 | } 207 | 208 | Job job = new Job(conf, "QuestionAnswerHierarchy"); 209 | job.setJarByClass(QuestionAnswerBuildingDriver.class); 210 | 211 | job.setMapperClass(PostCommentMapper.class); 212 | 213 | job.setInputFormatClass(TextInputFormat.class); 214 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 215 | 216 | job.setReducerClass(QuestionAnswerReducer.class); 217 | 218 | job.setOutputFormatClass(TextOutputFormat.class); 219 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 220 | 221 | job.setOutputKeyClass(Text.class); 222 | job.setOutputValueClass(Text.class); 223 | 224 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 225 | } 226 | } 227 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/TotalOrderSorting.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.partition.InputSampler; 21 | import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class TotalOrderSorting { 25 | 26 | public static class LastAccessDateMapper extends 27 | Mapper { 28 | 29 | private Text outkey = new Text(); 30 | 31 | @Override 32 | public void map(Object key, Text value, Context context) 33 | throws IOException, InterruptedException { 34 | 35 | // Parse the input string into a nice map 36 | Map parsed = MRDPUtils.transformXmlToMap(value 37 | .toString()); 38 | 39 | String date = parsed.get("LastAccessDate"); 40 | if (date != null) { 41 | outkey.set(date); 42 | context.write(outkey, value); 43 | } 44 | } 45 | } 46 | 47 | public static class ValueReducer extends 48 | Reducer { 49 | 50 | @Override 51 | public void reduce(Text key, Iterable values, Context context) 52 | throws IOException, InterruptedException { 53 | for (Text t : values) { 54 | context.write(t, NullWritable.get()); 55 | } 56 | } 57 | } 58 | 59 | @SuppressWarnings({ "unchecked", "rawtypes" }) 60 | public static void main(String[] args) throws Exception { 61 | Configuration conf = new Configuration(); 62 | String[] otherArgs = new GenericOptionsParser(conf, args) 63 | .getRemainingArgs(); 64 | if (otherArgs.length != 3) { 65 | System.err 66 | .println("Usage: TotalOrderSorting "); 67 | System.exit(1); 68 | } 69 | 70 | Path inputPath = new Path(otherArgs[0]); 71 | Path partitionFile = new Path(otherArgs[1] + "_partitions.lst"); 72 | Path outputStage = new Path(otherArgs[1] + "_staging"); 73 | Path outputOrder = new Path(otherArgs[1]); 74 | double sampleRate = Double.parseDouble(otherArgs[2]); 75 | 76 | FileSystem.get(new Configuration()).delete(outputOrder, true); 77 | FileSystem.get(new Configuration()).delete(outputStage, true); 78 | FileSystem.get(new Configuration()).delete(partitionFile, true); 79 | 80 | // Configure job to prepare for sampling 81 | Job sampleJob = new Job(conf, "TotalOrderSortingStage"); 82 | sampleJob.setJarByClass(TotalOrderSorting.class); 83 | 84 | // Use the mapper implementation with zero reduce tasks 85 | sampleJob.setMapperClass(LastAccessDateMapper.class); 86 | sampleJob.setNumReduceTasks(0); 87 | 88 | sampleJob.setOutputKeyClass(Text.class); 89 | sampleJob.setOutputValueClass(Text.class); 90 | 91 | TextInputFormat.setInputPaths(sampleJob, inputPath); 92 | 93 | // Set the output format to a sequence file 94 | 
sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class); 95 | SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage); 96 | 97 | // Submit the job and get completion code. 98 | int code = sampleJob.waitForCompletion(true) ? 0 : 1; 99 | 100 | if (code == 0) { 101 | Job orderJob = new Job(conf, "TotalOrderSortingStage"); 102 | orderJob.setJarByClass(TotalOrderSorting.class); 103 | 104 | // Here, use the identity mapper to output the key/value pairs in 105 | // the SequenceFile 106 | orderJob.setMapperClass(Mapper.class); 107 | orderJob.setReducerClass(ValueReducer.class); 108 | 109 | // Set the number of reduce tasks to an appropriate number for the 110 | // amount of data being sorted 111 | orderJob.setNumReduceTasks(10); 112 | 113 | // Use Hadoop's TotalOrderPartitioner class 114 | orderJob.setPartitionerClass(TotalOrderPartitioner.class); 115 | 116 | // Set the partition file 117 | TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(), 118 | partitionFile); 119 | 120 | orderJob.setOutputKeyClass(Text.class); 121 | orderJob.setOutputValueClass(Text.class); 122 | 123 | // Set the input to the previous job's output 124 | orderJob.setInputFormatClass(SequenceFileInputFormat.class); 125 | SequenceFileInputFormat.setInputPaths(orderJob, outputStage); 126 | 127 | // Set the output path to the command line parameter 128 | TextOutputFormat.setOutputPath(orderJob, outputOrder); 129 | 130 | // Set the separator to an empty string 131 | orderJob.getConfiguration().set( 132 | "mapred.textoutputformat.separator", ""); 133 | 134 | // Use the InputSampler to go through the output of the previous 135 | // job, sample it, and create the partition file 136 | InputSampler.writePartitionFile(orderJob, 137 | new InputSampler.RandomSampler(sampleRate, 10000)); 138 | 139 | // Submit the job 140 | code = orderJob.waitForCompletion(true) ? 
0 : 2; 141 | } 142 | 143 | // Cleanup the partition file and the staging directory 144 | FileSystem.get(new Configuration()).delete(partitionFile, false); 145 | FileSystem.get(new Configuration()).delete(outputStage, true); 146 | 147 | System.exit(code); 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/CartesianFormatter.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.Arrays; 8 | import java.util.HashSet; 9 | import java.util.Map; 10 | 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | import org.apache.hadoop.util.GenericOptionsParser; 21 | 22 | public class CartesianFormatter { 23 | 24 | public static class CommentMapper extends Mapper { 25 | 26 | private Text outkey = new Text(), outvalue = new Text(); 27 | private HashSet commonWords = new HashSet(); 28 | 29 | protected void setup(Context context) throws IOException, 30 | InterruptedException { 31 | 32 | File f = new File(System.getProperty("user.dir") 33 | + "/commonwords.txt"); 34 | 35 | BufferedReader rdr = new BufferedReader(new FileReader(f)); 36 | 37 | String word = null; 38 | while ((word = rdr.readLine()) != null) { 39 | commonWords.add(word); 40 | } 41 | 42 | rdr.close(); 43 | } 44 | 45 | @Override 46 | public void map(Object key, Text value, Context context) 47 | throws IOException, InterruptedException { 48 | 49 | // Parse the input string into a nice map 50 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 51 | 52 | String id = parsed.get("Id"); 53 | String comment = parsed.get("Text"); 54 | 55 | if (id == null || comment == null) { 56 | return; 57 | } 58 | 59 | String[] tokens = comment.toLowerCase() 60 | .replaceAll("[^a-z0-9\\s]", "").split("\\s"); 61 | 62 | HashSet setTokens = new HashSet( 63 | Arrays.asList(tokens)); 64 | setTokens.removeAll(commonWords); 65 | 66 | StringBuilder bldr = new StringBuilder(); 67 | 68 | for (String word : setTokens) { 69 | if (!word.isEmpty()) { 70 | bldr.append(word + ","); 71 | } 72 | } 73 | 74 | if (bldr.length() > 0) { 75 | outkey.set(id); 76 | outvalue.set(bldr.deleteCharAt(bldr.length() - 1).toString()); 77 | context.write(outkey, outvalue); 78 | } 79 | } 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | Configuration conf = new Configuration(); 84 | String[] otherArgs = new GenericOptionsParser(conf, args) 85 | .getRemainingArgs(); 86 | if (otherArgs.length != 2) { 87 | System.err.println("Usage: CartesianFormatter "); 88 | System.exit(1); 89 | } 90 | 91 | // Configure the join type 92 | Job job = new Job(conf, "CartesianFormatter"); 93 | job.setJarByClass(CartesianFormatter.class); 94 | 95 | job.setMapperClass(CommentMapper.class); 96 | job.setNumReduceTasks(0); 97 | 98 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 99 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 100 | 101 | job.setOutputKeyClass(Text.class); 102 | job.setOutputValueClass(Text.class); 103 | 104 | 
System.exit(job.waitForCompletion(true) ? 0 : 3); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/CompositeJoinDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapred.JobClient; 8 | import org.apache.hadoop.mapred.JobConf; 9 | import org.apache.hadoop.mapred.KeyValueTextInputFormat; 10 | import org.apache.hadoop.mapred.MapReduceBase; 11 | import org.apache.hadoop.mapred.Mapper; 12 | import org.apache.hadoop.mapred.OutputCollector; 13 | import org.apache.hadoop.mapred.Reporter; 14 | import org.apache.hadoop.mapred.RunningJob; 15 | import org.apache.hadoop.mapred.TextOutputFormat; 16 | import org.apache.hadoop.mapred.join.CompositeInputFormat; 17 | import org.apache.hadoop.mapred.join.TupleWritable; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | public class CompositeJoinDriver { 21 | 22 | public static class CompositeMapper extends MapReduceBase implements 23 | Mapper { 24 | 25 | @Override 26 | public void map(Text key, TupleWritable value, 27 | OutputCollector output, Reporter reporter) 28 | throws IOException { 29 | 30 | // Get the first two elements in the tuple and output them 31 | output.collect((Text) value.get(0), (Text) value.get(1)); 32 | } 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | JobConf conf = new JobConf("CompositeJoin"); 37 | conf.setJarByClass(CompositeJoinDriver.class); 38 | String[] otherArgs = new GenericOptionsParser(conf, args) 39 | .getRemainingArgs(); 40 | if (otherArgs.length != 4) { 41 | System.err 42 | .println("Usage: CompositeJoin [inner|outer]"); 43 | System.exit(1); 44 | } 45 | 46 | Path userPath = new Path(otherArgs[0]); 47 | Path commentPath = new Path(otherArgs[1]); 48 | Path outputDir = new Path(otherArgs[2]); 49 | String joinType = otherArgs[3]; 50 | if (!(joinType.equalsIgnoreCase("inner") || joinType 51 | .equalsIgnoreCase("outer"))) { 52 | System.err.println("Join type not set to inner or outer"); 53 | System.exit(2); 54 | } 55 | 56 | conf.setMapperClass(CompositeMapper.class); 57 | conf.setNumReduceTasks(0); 58 | 59 | // Set the input format class to a CompositeInputFormat class. 60 | // The CompositeInputFormat will parse all of our input files and output 61 | // records to our mapper. 62 | conf.setInputFormat(CompositeInputFormat.class); 63 | 64 | // The composite input format join expression will set how the records 65 | // are going to be read in, and in what input format. 66 | conf.set("mapred.join.expr", CompositeInputFormat.compose(joinType, 67 | KeyValueTextInputFormat.class, userPath, commentPath)); 68 | 69 | TextOutputFormat.setOutputPath(conf, outputDir); 70 | 71 | conf.setOutputKeyClass(Text.class); 72 | conf.setOutputValueClass(Text.class); 73 | 74 | RunningJob job = JobClient.runJob(conf); 75 | while (!job.isComplete()) { 76 | Thread.sleep(1000); 77 | } 78 | 79 | System.exit(job.isSuccessful() ? 
0 : 2); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/JoinFormatting.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | import org.apache.hadoop.util.GenericOptionsParser; 16 | 17 | public class JoinFormatting { 18 | 19 | public static class ReplicatedJoinMapper extends 20 | Mapper { 21 | 22 | private Text outkey = new Text(); 23 | 24 | @Override 25 | public void map(Object key, Text value, Context context) 26 | throws IOException, InterruptedException { 27 | 28 | // Parse the input string into a nice map 29 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 30 | 31 | String userId = parsed.get("UserId"); 32 | 33 | if (userId == null) { 34 | return; 35 | } 36 | 37 | outkey.set(userId); 38 | context.write(outkey, value); 39 | } 40 | } 41 | 42 | public static void main(String[] args) throws Exception { 43 | Configuration conf = new Configuration(); 44 | String[] otherArgs = new GenericOptionsParser(conf, args) 45 | .getRemainingArgs(); 46 | if (otherArgs.length != 2) { 47 | System.err.println("Usage: ReplicatedJoin "); 48 | System.exit(1); 49 | } 50 | 51 | // Configure the join type 52 | Job job = new Job(conf, "Replicated Join"); 53 | job.setJarByClass(ReplicatedJoinDriver.class); 54 | 55 | job.setMapperClass(ReplicatedJoinMapper.class); 56 | job.setNumReduceTasks(0); 57 | 58 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 59 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 60 | 61 | job.setOutputKeyClass(Text.class); 62 | job.setOutputValueClass(Text.class); 63 | 64 | System.exit(job.waitForCompletion(true) ? 
0 : 3); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/ReduceSideJoinDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Map; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 16 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | public class ReduceSideJoinDriver { 21 | 22 | public static class UserJoinMapper extends Mapper { 23 | 24 | private Text outkey = new Text(); 25 | private Text outvalue = new Text(); 26 | 27 | @Override 28 | public void map(Object key, Text value, Context context) 29 | throws IOException, InterruptedException { 30 | 31 | // Parse the input string into a nice map 32 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 33 | 34 | String userId = parsed.get("Id"); 35 | 36 | if (userId == null) { 37 | return; 38 | } 39 | 40 | // The foreign join key is the user ID 41 | outkey.set(userId); 42 | 43 | // Flag this record for the reducer and then output 44 | outvalue.set("A" + value.toString()); 45 | context.write(outkey, outvalue); 46 | } 47 | } 48 | 49 | public static class CommentJoinMapper extends 50 | Mapper { 51 | 52 | private Text outkey = new Text(); 53 | private Text outvalue = new Text(); 54 | 55 | @Override 56 | public void map(Object key, Text value, Context context) 57 | throws IOException, InterruptedException { 58 | 59 | // Parse the input string into a nice map 60 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 61 | 62 | String userId = parsed.get("UserId"); 63 | if (userId == null) { 64 | return; 65 | } 66 | 67 | // The foreign join key is the user ID 68 | outkey.set(userId); 69 | 70 | // Flag this record for the reducer and then output 71 | outvalue.set("B" + value.toString()); 72 | context.write(outkey, outvalue); 73 | } 74 | } 75 | 76 | public static class UserJoinReducer extends Reducer { 77 | 78 | private ArrayList listA = new ArrayList(); 79 | private ArrayList listB = new ArrayList(); 80 | private String joinType = null; 81 | 82 | @Override 83 | public void setup(Context context) { 84 | // Get the type of join from our configuration 85 | joinType = context.getConfiguration().get("join.type"); 86 | } 87 | 88 | @Override 89 | public void reduce(Text key, Iterable values, Context context) 90 | throws IOException, InterruptedException { 91 | 92 | // Clear our lists 93 | listA.clear(); 94 | listB.clear(); 95 | 96 | // iterate through all our values, binning each record based on what 97 | // it was tagged with 98 | // make sure to remove the tag! 
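// (Illustrative: the user mapper above emits values like "A<row Id=... />" and
// the comment mapper emits "B<row UserId=... />"; charAt(0) picks the list and
// substring(1) restores the original record before joining.)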
99 | for (Text t : values) { 100 | if (t.charAt(0) == 'A') { 101 | listA.add(new Text(t.toString().substring(1))); 102 | } else if (t.charAt(0) == 'B') { 103 | listB.add(new Text(t.toString().substring(1))); 104 | } 105 | } 106 | 107 | // Execute our join logic now that the lists are filled 108 | executeJoinLogic(context); 109 | } 110 | 111 | private void executeJoinLogic(Context context) throws IOException, 112 | InterruptedException { 113 | if (joinType.equalsIgnoreCase("inner")) { 114 | // If both lists are not empty, join A with B 115 | if (!listA.isEmpty() && !listB.isEmpty()) { 116 | for (Text A : listA) { 117 | for (Text B : listB) { 118 | context.write(A, B); 119 | } 120 | } 121 | } 122 | } else if (joinType.equalsIgnoreCase("leftouter")) { 123 | // For each entry in A, 124 | for (Text A : listA) { 125 | // If list B is not empty, join A and B 126 | if (!listB.isEmpty()) { 127 | for (Text B : listB) { 128 | context.write(A, B); 129 | } 130 | } else { 131 | // Else, output A by itself 132 | context.write(A, new Text("")); 133 | } 134 | } 135 | } else if (joinType.equalsIgnoreCase("rightouter")) { 136 | // FOr each entry in B, 137 | for (Text B : listB) { 138 | // If list A is not empty, join A and B 139 | if (!listA.isEmpty()) { 140 | for (Text A : listA) { 141 | context.write(A, B); 142 | } 143 | } else { 144 | // Else, output B by itself 145 | context.write(new Text(""), B); 146 | } 147 | } 148 | } else if (joinType.equalsIgnoreCase("fullouter")) { 149 | // If list A is not empty 150 | if (!listA.isEmpty()) { 151 | // For each entry in A 152 | for (Text A : listA) { 153 | // If list B is not empty, join A with B 154 | if (!listB.isEmpty()) { 155 | for (Text B : listB) { 156 | context.write(A, B); 157 | } 158 | } else { 159 | // Else, output A by itself 160 | context.write(A, new Text("")); 161 | } 162 | } 163 | } else { 164 | // If list A is empty, just output B 165 | for (Text B : listB) { 166 | context.write(new Text(""), B); 167 | } 168 | } 169 | } else if (joinType.equalsIgnoreCase("anti")) { 170 | // If list A is empty and B is empty or vice versa 171 | if (listA.isEmpty() ^ listB.isEmpty()) { 172 | 173 | // Iterate both A and B with null values 174 | // The previous XOR check will make sure exactly one of 175 | // these lists is empty and therefore won't have output 176 | for (Text A : listA) { 177 | context.write(A, new Text("")); 178 | } 179 | 180 | for (Text B : listB) { 181 | context.write(new Text(""), B); 182 | } 183 | } 184 | } else { 185 | throw new RuntimeException( 186 | "Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 187 | } 188 | } 189 | } 190 | 191 | public static void main(String[] args) throws Exception { 192 | Configuration conf = new Configuration(); 193 | String[] otherArgs = new GenericOptionsParser(conf, args) 194 | .getRemainingArgs(); 195 | if (otherArgs.length != 4) { 196 | System.err 197 | .println("Usage: ReduceSideJoin [inner|leftouter|rightouter|fullouter|anti]"); 198 | System.exit(1); 199 | } 200 | 201 | String joinType = otherArgs[3]; 202 | if (!(joinType.equalsIgnoreCase("inner") 203 | || joinType.equalsIgnoreCase("leftouter") 204 | || joinType.equalsIgnoreCase("rightouter") 205 | || joinType.equalsIgnoreCase("fullouter") || joinType 206 | .equalsIgnoreCase("anti"))) { 207 | System.err 208 | .println("Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 209 | System.exit(2); 210 | } 211 | 212 | Job job = new Job(conf, "Reduce Side Join"); 213 | 214 | // Configure the join type 215 | 
job.getConfiguration().set("join.type", joinType); 216 | job.setJarByClass(ReduceSideJoinDriver.class); 217 | 218 | // Use multiple inputs to set which input uses what mapper 219 | // This will keep parsing of each data set separate from a logical 220 | // standpoint 221 | // However, this version of Hadoop has not upgraded MultipleInputs 222 | // to the mapreduce package, so we have to use the deprecated API. 223 | // Future releases have this in the "mapreduce" package. 224 | MultipleInputs.addInputPath(job, new Path(otherArgs[0]), 225 | TextInputFormat.class, UserJoinMapper.class); 226 | 227 | MultipleInputs.addInputPath(job, new Path(otherArgs[1]), 228 | TextInputFormat.class, CommentJoinMapper.class); 229 | 230 | job.setReducerClass(UserJoinReducer.class); 231 | 232 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 233 | 234 | job.setOutputKeyClass(Text.class); 235 | job.setOutputValueClass(Text.class); 236 | 237 | System.exit(job.waitForCompletion(true) ? 0 : 3); 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/ReduceSideJoinWithBloomDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.Map; 9 | 10 | import mrdp.utils.MRDPUtils; 11 | 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.filecache.DistributedCache; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.Reducer; 19 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 20 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | import org.apache.hadoop.util.bloom.BloomFilter; 24 | import org.apache.hadoop.util.bloom.Key; 25 | 26 | public class ReduceSideJoinWithBloomDriver { 27 | 28 | public static class UserJoinMapperWithBloom extends 29 | Mapper { 30 | 31 | private Text outkey = new Text(); 32 | private Text outvalue = new Text(); 33 | 34 | @Override 35 | public void map(Object key, Text value, Context context) 36 | throws IOException, InterruptedException { 37 | 38 | // Parse the input string into a nice map 39 | Map parsed = MRDPUtils.transformXmlToMap(value 40 | .toString()); 41 | 42 | String userId = parsed.get("Id"); 43 | String reputation = parsed.get("Reputation"); 44 | 45 | if (userId == null || reputation == null) { 46 | return; 47 | } 48 | 49 | // If the reputation is greater than 1,500, output the user ID with 50 | // the value 51 | if (Integer.parseInt(reputation) > 1500) { 52 | outkey.set(parsed.get("Id")); 53 | outvalue.set("A" + value.toString()); 54 | context.write(outkey, outvalue); 55 | } 56 | } 57 | 58 | public static class CommentJoinMapperWithBloom extends 59 | Mapper { 60 | 61 | private BloomFilter bfilter = new BloomFilter(); 62 | private Text outkey = new Text(); 63 | private Text outvalue = new Text(); 64 | 65 | @Override 66 | public void setup(Context context) { 67 | try { 68 | Path[] files = DistributedCache.getLocalCacheFiles(context 69 | .getConfiguration()); 70 | 71 | if (files.length != 0) { 72 | DataInputStream strm = new 
DataInputStream( 73 | new FileInputStream(new File( 74 | files[0].toString()))); 75 | bfilter.readFields(strm); 76 | } else { 77 | throw new RuntimeException( 78 | "Bloom filter not set in DistributedCache"); 79 | } 80 | } catch (IOException e) { 81 | throw new RuntimeException(e); 82 | } 83 | } 84 | 85 | @Override 86 | public void map(Object key, Text value, Context context) 87 | throws IOException, InterruptedException { 88 | 89 | // Parse the input string into a nice map 90 | Map parsed = MRDPUtils.transformXmlToMap(value 91 | .toString()); 92 | 93 | String userId = parsed.get("UserId"); 94 | 95 | if (userId == null) { 96 | return; 97 | } 98 | 99 | if (bfilter.membershipTest(new Key(userId.getBytes()))) { 100 | outkey.set(userId); 101 | outvalue.set("B" + value.toString()); 102 | context.write(outkey, outvalue); 103 | } 104 | } 105 | } 106 | 107 | public static class UserJoinReducer extends 108 | Reducer { 109 | 110 | private ArrayList listA = new ArrayList(); 111 | private ArrayList listB = new ArrayList(); 112 | private String joinType = null; 113 | 114 | @Override 115 | public void setup(Context context) { 116 | // Get the type of join from our configuration 117 | joinType = context.getConfiguration().get("join.type"); 118 | } 119 | 120 | @Override 121 | public void reduce(Text key, Iterable values, Context context) 122 | throws IOException, InterruptedException { 123 | 124 | // Clear our lists 125 | listA.clear(); 126 | listB.clear(); 127 | 128 | // iterate through all our values, binning each record based on 129 | // what 130 | // it was tagged with 131 | // make sure to remove the tag! 132 | for (Text t : values) { 133 | if (t.charAt(0) == 'A') { 134 | listA.add(new Text(t.toString().substring(1))); 135 | } else /* if (tmp.charAt('0') == 'B') */{ 136 | listB.add(new Text(t.toString().substring(1))); 137 | } 138 | } 139 | 140 | // Execute our join logic now that the lists are filled 141 | executeJoinLogic(context); 142 | } 143 | 144 | private void executeJoinLogic(Context context) throws IOException, 145 | InterruptedException { 146 | if (joinType.equalsIgnoreCase("inner")) { 147 | // If both lists are not empty, join A with B 148 | if (!listA.isEmpty() && !listB.isEmpty()) { 149 | for (Text A : listA) { 150 | for (Text B : listB) { 151 | context.write(A, B); 152 | } 153 | } 154 | } 155 | } else if (joinType.equalsIgnoreCase("leftouter")) { 156 | // For each entry in A, 157 | for (Text A : listA) { 158 | // If list B is not empty, join A and B 159 | if (!listB.isEmpty()) { 160 | for (Text B : listB) { 161 | context.write(A, B); 162 | } 163 | } else { 164 | // Else, output A by itself 165 | context.write(A, new Text("")); 166 | } 167 | } 168 | } else if (joinType.equalsIgnoreCase("rightouter")) { 169 | // FOr each entry in B, 170 | for (Text B : listB) { 171 | // If list A is not empty, join A and B 172 | if (!listA.isEmpty()) { 173 | for (Text A : listA) { 174 | context.write(A, B); 175 | } 176 | } else { 177 | // Else, output B by itself 178 | context.write(new Text(""), B); 179 | } 180 | } 181 | } else if (joinType.equalsIgnoreCase("fullouter")) { 182 | // If list A is not empty 183 | if (!listA.isEmpty()) { 184 | // For each entry in A 185 | for (Text A : listA) { 186 | // If list B is not empty, join A with B 187 | if (!listB.isEmpty()) { 188 | for (Text B : listB) { 189 | context.write(A, B); 190 | } 191 | } else { 192 | // Else, output A by itself 193 | context.write(A, new Text("")); 194 | } 195 | } 196 | } else { 197 | // If list A is empty, just output B 198 | for (Text B 
: listB) { 199 | context.write(new Text(""), B); 200 | } 201 | } 202 | } else if (joinType.equalsIgnoreCase("anti")) { 203 | // If list A is empty and B is not empty or vice versa 204 | if (listA.isEmpty() ^ listB.isEmpty()) { 205 | 206 | // Iterate both A and B with null values 207 | // The previous XOR check will make sure exactly one of 208 | // these lists is empty and therefore won't have output 209 | for (Text A : listA) { 210 | context.write(A, new Text("")); 211 | } 212 | 213 | for (Text B : listB) { 214 | context.write(new Text(""), B); 215 | } 216 | } 217 | } else { 218 | throw new RuntimeException( 219 | "Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 220 | } 221 | } 222 | } 223 | 224 | public static void main(String[] args) throws Exception { 225 | Configuration conf = new Configuration(); 226 | String[] otherArgs = new GenericOptionsParser(conf, args) 227 | .getRemainingArgs(); 228 | if (otherArgs.length != 4) { 229 | System.err 230 | .println("Usage: ReduceSideJoin [inner|leftouter|rightouter|fullouter|anti]"); 231 | System.exit(1); 232 | } 233 | 234 | String joinType = otherArgs[3]; 235 | if (!(joinType.equalsIgnoreCase("inner") 236 | || joinType.equalsIgnoreCase("leftouter") 237 | || joinType.equalsIgnoreCase("rightouter") 238 | || joinType.equalsIgnoreCase("fullouter") || joinType 239 | .equalsIgnoreCase("anti"))) { 240 | System.err 241 | .println("Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 242 | System.exit(2); 243 | } 244 | 245 | Job job = new Job(conf, "Reduce Side Join"); 246 | // Configure the join type 247 | job.getConfiguration().set("join.type", joinType); 248 | job.setJarByClass(ReduceSideJoinWithBloomDriver.class); 249 | 250 | // Use multiple inputs to set which input uses what mapper 251 | // This will keep parsing of each data set separate from a logical 252 | // standpoint 253 | MultipleInputs.addInputPath(job, new Path(otherArgs[0]), 254 | TextInputFormat.class, UserJoinMapperWithBloom.class); 255 | MultipleInputs.addInputPath(job, new Path(otherArgs[1]), 256 | TextInputFormat.class, CommentJoinMapperWithBloom.class); 257 | 258 | job.setReducerClass(UserJoinReducer.class); 259 | 260 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 261 | 262 | job.setOutputKeyClass(Text.class); 263 | job.setOutputValueClass(Text.class); 264 | 265 | System.exit(job.waitForCompletion(true) ? 
0 : 3); 266 | } 267 | } 268 | } 269 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/ReplicatedJoinDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Map; 9 | import java.util.HashMap; 10 | import java.util.zip.GZIPInputStream; 11 | 12 | import mrdp.utils.MRDPUtils; 13 | 14 | import org.apache.hadoop.conf.Configuration; 15 | import org.apache.hadoop.filecache.DistributedCache; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class ReplicatedJoinDriver { 25 | 26 | public static class ReplicatedJoinMapper extends 27 | Mapper { 28 | 29 | private HashMap userIdToInfo = new HashMap(); 30 | 31 | private Text outvalue = new Text(); 32 | private String joinType = null; 33 | 34 | @Override 35 | public void setup(Context context) throws IOException, 36 | InterruptedException { 37 | try { 38 | Path[] files = DistributedCache.getLocalCacheFiles(context 39 | .getConfiguration()); 40 | 41 | if (files == null || files.length == 0) { 42 | throw new RuntimeException( 43 | "User information is not set in DistributedCache"); 44 | } 45 | 46 | // Read all files in the DistributedCache 47 | for (Path p : files) { 48 | BufferedReader rdr = new BufferedReader( 49 | new InputStreamReader( 50 | new GZIPInputStream(new FileInputStream( 51 | new File(p.toString()))))); 52 | 53 | String line; 54 | // For each record in the user file 55 | while ((line = rdr.readLine()) != null) { 56 | 57 | // Get the user ID for this record 58 | Map parsed = MRDPUtils 59 | .transformXmlToMap(line); 60 | String userId = parsed.get("Id"); 61 | 62 | if (userId != null) { 63 | // Map the user ID to the record 64 | userIdToInfo.put(userId, line); 65 | } 66 | } 67 | } 68 | 69 | } catch (IOException e) { 70 | throw new RuntimeException(e); 71 | } 72 | 73 | // Get the join type 74 | joinType = context.getConfiguration().get("join.type"); 75 | } 76 | 77 | @Override 78 | public void map(Object key, Text value, Context context) 79 | throws IOException, InterruptedException { 80 | 81 | // Parse the input string into a nice map 82 | Map parsed = MRDPUtils.transformXmlToMap(value 83 | .toString()); 84 | 85 | String userId = parsed.get("UserId"); 86 | 87 | if (userId == null) { 88 | return; 89 | } 90 | 91 | String userInformation = userIdToInfo.get(userId); 92 | 93 | // If the user information is not null, then output 94 | if (userInformation != null) { 95 | outvalue.set(userInformation); 96 | context.write(value, outvalue); 97 | } else if (joinType.equalsIgnoreCase("leftouter")) { 98 | // If we are doing a left outer join, output the record with an 99 | // empty value 100 | context.write(value, new Text("")); 101 | } 102 | } 103 | } 104 | 105 | public static void main(String[] args) throws Exception { 106 | Configuration conf = new Configuration(); 107 | String[] otherArgs = new GenericOptionsParser(conf, args) 108 | .getRemainingArgs(); 109 | if (otherArgs.length != 4) { 110 | System.err 111 | .println("Usage: 
ReplicatedJoin [inner|leftouter]"); 112 | System.exit(1); 113 | } 114 | 115 | String joinType = otherArgs[3]; 116 | if (!(joinType.equalsIgnoreCase("inner") || joinType 117 | .equalsIgnoreCase("leftouter"))) { 118 | System.err.println("Join type not set to inner or leftouter"); 119 | System.exit(2); 120 | } 121 | 122 | // Configure the join type 123 | Job job = new Job(conf, "Replicated Join"); 124 | job.getConfiguration().set("join.type", joinType); 125 | job.setJarByClass(ReplicatedJoinDriver.class); 126 | 127 | job.setMapperClass(ReplicatedJoinMapper.class); 128 | job.setNumReduceTasks(0); 129 | 130 | TextInputFormat.setInputPaths(job, new Path(otherArgs[1])); 131 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 132 | 133 | job.setOutputKeyClass(Text.class); 134 | job.setOutputValueClass(Text.class); 135 | 136 | // Configure the DistributedCache 137 | DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), 138 | job.getConfiguration()); 139 | 140 | DistributedCache.setLocalFiles(job.getConfiguration(), otherArgs[0]); 141 | 142 | System.exit(job.waitForCompletion(true) ? 0 : 3); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/ChainMapperDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashMap; 9 | import java.util.Iterator; 10 | import java.util.Map; 11 | import java.util.zip.GZIPInputStream; 12 | 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.filecache.DistributedCache; 16 | import org.apache.hadoop.fs.FileStatus; 17 | import org.apache.hadoop.fs.FileSystem; 18 | import org.apache.hadoop.fs.Path; 19 | import org.apache.hadoop.io.LongWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.mapred.FileOutputFormat; 22 | import org.apache.hadoop.mapred.JobClient; 23 | import org.apache.hadoop.mapred.JobConf; 24 | import org.apache.hadoop.mapred.MapReduceBase; 25 | import org.apache.hadoop.mapred.Mapper; 26 | import org.apache.hadoop.mapred.OutputCollector; 27 | import org.apache.hadoop.mapred.Reducer; 28 | import org.apache.hadoop.mapred.Reporter; 29 | import org.apache.hadoop.mapred.RunningJob; 30 | import org.apache.hadoop.mapred.TextInputFormat; 31 | import org.apache.hadoop.mapred.TextOutputFormat; 32 | import org.apache.hadoop.mapred.lib.ChainMapper; 33 | import org.apache.hadoop.mapred.lib.ChainReducer; 34 | import org.apache.hadoop.mapred.lib.MultipleOutputs; 35 | import org.apache.hadoop.mapred.lib.NullOutputFormat; 36 | import org.apache.hadoop.util.GenericOptionsParser; 37 | 38 | public class ChainMapperDriver { 39 | 40 | public static final String AVERAGE_CALC_GROUP = "AverageCalculation"; 41 | public static final String MULTIPLE_OUTPUTS_BELOW_5000 = "below5000"; 42 | public static final String MULTIPLE_OUTPUTS_ABOVE_5000 = "above5000"; 43 | 44 | public static class UserIdCountMapper extends MapReduceBase implements 45 | Mapper { 46 | 47 | public static final String RECORDS_COUNTER_NAME = "Records"; 48 | 49 | private static final LongWritable ONE = new LongWritable(1); 50 | private Text outkey = new Text(); 51 | 52 | @Override 53 | public void map(Object key, Text value, 54 | OutputCollector output, Reporter reporter) 55 | throws IOException { 56 | 57 | // Parse the input into a 
nice map. 58 | Map parsed = MRDPUtils.transformXmlToMap(value 59 | .toString()); 60 | 61 | // Get the value for the OwnerUserId attribute 62 | String userId = parsed.get("OwnerUserId"); 63 | 64 | if (userId != null) { 65 | outkey.set(userId); 66 | output.collect(outkey, ONE); 67 | } 68 | } 69 | } 70 | 71 | public static class UserIdReputationEnrichmentMapper extends MapReduceBase 72 | implements Mapper { 73 | 74 | private Text outkey = new Text(); 75 | private HashMap userIdToReputation = new HashMap(); 76 | 77 | @Override 78 | public void configure(JobConf job) { 79 | try { 80 | userIdToReputation.clear(); 81 | Path[] files = DistributedCache.getLocalCacheFiles(job); 82 | 83 | if (files == null || files.length == 0) { 84 | throw new RuntimeException( 85 | "User information is not set in DistributedCache"); 86 | } 87 | 88 | // Read all files in the DistributedCache 89 | for (Path p : files) { 90 | BufferedReader rdr = new BufferedReader( 91 | new InputStreamReader( 92 | new GZIPInputStream(new FileInputStream( 93 | new File(p.toString()))))); 94 | 95 | String line; 96 | // For each record in the user file 97 | while ((line = rdr.readLine()) != null) { 98 | 99 | // Get the user ID and reputation 100 | Map parsed = MRDPUtils 101 | .transformXmlToMap(line); 102 | String userId = parsed.get("Id"); 103 | String reputation = parsed.get("Reputation"); 104 | 105 | if (userId != null && reputation != null) { 106 | // Map the user ID to the reputation 107 | userIdToReputation.put(userId, reputation); 108 | } 109 | } 110 | } 111 | 112 | } catch (IOException e) { 113 | throw new RuntimeException(e); 114 | } 115 | } 116 | 117 | @Override 118 | public void map(Text key, LongWritable value, 119 | OutputCollector output, Reporter reporter) 120 | throws IOException { 121 | 122 | String reputation = userIdToReputation.get(key.toString()); 123 | if (reputation != null) { 124 | outkey.set(value.get() + "\t" + reputation); 125 | output.collect(outkey, value); 126 | } 127 | } 128 | } 129 | 130 | public static class LongSumReducer extends MapReduceBase implements 131 | Reducer { 132 | 133 | private LongWritable outvalue = new LongWritable(); 134 | 135 | @Override 136 | public void reduce(Text key, Iterator values, 137 | OutputCollector output, Reporter reporter) 138 | throws IOException { 139 | 140 | int sum = 0; 141 | while (values.hasNext()) { 142 | sum += values.next().get(); 143 | } 144 | outvalue.set(sum); 145 | output.collect(key, outvalue); 146 | } 147 | } 148 | 149 | public static class UserIdBinningMapper extends MapReduceBase implements 150 | Mapper { 151 | 152 | private MultipleOutputs mos = null; 153 | 154 | @Override 155 | public void configure(JobConf conf) { 156 | mos = new MultipleOutputs(conf); 157 | } 158 | 159 | @SuppressWarnings("unchecked") 160 | @Override 161 | public void map(Text key, LongWritable value, 162 | OutputCollector output, Reporter reporter) 163 | throws IOException { 164 | 165 | if (Integer.parseInt(key.toString().split("\t")[1]) < 5000) { 166 | mos.getCollector(MULTIPLE_OUTPUTS_BELOW_5000, reporter) 167 | .collect(key, value); 168 | } else { 169 | mos.getCollector(MULTIPLE_OUTPUTS_ABOVE_5000, reporter) 170 | .collect(key, value); 171 | } 172 | } 173 | 174 | @Override 175 | public void close() { 176 | try { 177 | mos.close(); 178 | } catch (IOException e) { 179 | e.printStackTrace(); 180 | } 181 | } 182 | } 183 | 184 | public static void main(String[] args) throws Exception { 185 | JobConf conf = new JobConf("ChainMapperReducer"); 186 | String[] otherArgs = new 
GenericOptionsParser(conf, args) 187 | .getRemainingArgs(); 188 | 189 | if (otherArgs.length != 3) { 190 | System.err 191 | .println("Usage: ChainMapperReducer "); 192 | System.exit(2); 193 | } 194 | 195 | Path postInput = new Path(otherArgs[0]); 196 | Path userInput = new Path(otherArgs[1]); 197 | Path outputDir = new Path(otherArgs[2]); 198 | 199 | // Setup first job to counter user posts 200 | conf.setJarByClass(ChainMapperDriver.class); 201 | 202 | ChainMapper.addMapper(conf, UserIdCountMapper.class, 203 | LongWritable.class, Text.class, Text.class, LongWritable.class, 204 | false, new JobConf(false)); 205 | 206 | ChainMapper.addMapper(conf, UserIdReputationEnrichmentMapper.class, 207 | Text.class, LongWritable.class, Text.class, LongWritable.class, 208 | false, new JobConf(false)); 209 | 210 | ChainReducer.setReducer(conf, LongSumReducer.class, Text.class, 211 | LongWritable.class, Text.class, LongWritable.class, false, 212 | new JobConf(false)); 213 | 214 | ChainReducer.addMapper(conf, UserIdBinningMapper.class, Text.class, 215 | LongWritable.class, Text.class, LongWritable.class, false, 216 | new JobConf(false)); 217 | 218 | conf.setCombinerClass(LongSumReducer.class); 219 | 220 | conf.setInputFormat(TextInputFormat.class); 221 | TextInputFormat.setInputPaths(conf, postInput); 222 | 223 | // Configure multiple outputs 224 | conf.setOutputFormat(NullOutputFormat.class); 225 | FileOutputFormat.setOutputPath(conf, outputDir); 226 | MultipleOutputs.addNamedOutput(conf, MULTIPLE_OUTPUTS_ABOVE_5000, 227 | TextOutputFormat.class, Text.class, LongWritable.class); 228 | MultipleOutputs.addNamedOutput(conf, MULTIPLE_OUTPUTS_BELOW_5000, 229 | TextOutputFormat.class, Text.class, LongWritable.class); 230 | 231 | conf.setOutputKeyClass(Text.class); 232 | conf.setOutputValueClass(LongWritable.class); 233 | 234 | // Add the user files to the DistributedCache 235 | FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput); 236 | for (FileStatus status : userFiles) { 237 | DistributedCache.addCacheFile(status.getPath().toUri(), conf); 238 | } 239 | 240 | RunningJob job = JobClient.runJob(conf); 241 | 242 | while (!job.isComplete()) { 243 | Thread.sleep(5000); 244 | } 245 | 246 | System.exit(job.isSuccessful() ? 
0 : 1); 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/JobChainingDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | import java.util.zip.GZIPInputStream; 11 | 12 | import mrdp.utils.MRDPUtils; 13 | 14 | import org.apache.hadoop.conf.Configuration; 15 | import org.apache.hadoop.filecache.DistributedCache; 16 | import org.apache.hadoop.fs.FileStatus; 17 | import org.apache.hadoop.fs.FileSystem; 18 | import org.apache.hadoop.fs.Path; 19 | import org.apache.hadoop.io.LongWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.mapreduce.Job; 22 | import org.apache.hadoop.mapreduce.Mapper; 23 | import org.apache.hadoop.mapreduce.Reducer; 24 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 25 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 26 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 27 | import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; 28 | import org.apache.hadoop.util.GenericOptionsParser; 29 | 30 | public class JobChainingDriver { 31 | 32 | public static final String AVERAGE_CALC_GROUP = "AverageCalculation"; 33 | public static final String MULTIPLE_OUTPUTS_ABOVE_NAME = "aboveavg"; 34 | public static final String MULTIPLE_OUTPUTS_BELOW_NAME = "belowavg"; 35 | 36 | public static class UserIdCountMapper extends 37 | Mapper { 38 | 39 | public static final String RECORDS_COUNTER_NAME = "Records"; 40 | 41 | private static final LongWritable ONE = new LongWritable(1); 42 | private Text outkey = new Text(); 43 | 44 | @Override 45 | public void map(Object key, Text value, Context context) 46 | throws IOException, InterruptedException { 47 | 48 | // Parse the input into a nice map. 
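// --------------------------------------------------------------------------------
// Editor's aside: an illustrative note, not part of the original source file.
// MRDPUtils.transformXmlToMap parses one <row .../> line of the Stack Overflow data
// dump into a map of attribute name -> attribute value. For a (shortened, hypothetical)
// post record such as
//   <row Id="35" OwnerUserId="7" Score="2" />
// the returned map contains Id=35, OwnerUserId=7 and Score=2, which is why the mappers
// in this repository simply look attributes up by name, as done with "OwnerUserId" below:
// --------------------------------------------------------------------------------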
49 | Map parsed = MRDPUtils.transformXmlToMap(value 50 | .toString()); 51 | 52 | // Get the value for the OwnerUserId attribute 53 | String userId = parsed.get("OwnerUserId"); 54 | 55 | if (userId != null) { 56 | outkey.set(userId); 57 | context.write(outkey, ONE); 58 | context.getCounter(AVERAGE_CALC_GROUP, RECORDS_COUNTER_NAME) 59 | .increment(1); 60 | } 61 | } 62 | } 63 | 64 | public static class UserIdSumReducer extends 65 | Reducer { 66 | 67 | public static final String USERS_COUNTER_NAME = "Users"; 68 | private LongWritable outvalue = new LongWritable(); 69 | 70 | @Override 71 | public void reduce(Text key, Iterable values, 72 | Context context) throws IOException, InterruptedException { 73 | 74 | // Increment user counter, as each reduce group represents one user 75 | context.getCounter(AVERAGE_CALC_GROUP, USERS_COUNTER_NAME) 76 | .increment(1); 77 | 78 | int sum = 0; 79 | 80 | for (LongWritable value : values) { 81 | sum += value.get(); 82 | } 83 | 84 | outvalue.set(sum); 85 | context.write(key, outvalue); 86 | } 87 | } 88 | 89 | public static class UserIdBinningMapper extends 90 | Mapper { 91 | 92 | public static final String AVERAGE_POSTS_PER_USER = "avg.posts.per.user"; 93 | 94 | public static void setAveragePostsPerUser(Job job, double avg) { 95 | job.getConfiguration().set(AVERAGE_POSTS_PER_USER, 96 | Double.toString(avg)); 97 | } 98 | 99 | public static double getAveragePostsPerUser(Configuration conf) { 100 | return Double.parseDouble(conf.get(AVERAGE_POSTS_PER_USER)); 101 | } 102 | 103 | private double average = 0.0; 104 | private MultipleOutputs mos = null; 105 | private Text outkey = new Text(), outvalue = new Text(); 106 | private HashMap userIdToReputation = new HashMap(); 107 | 108 | protected void setup(Context context) throws IOException, 109 | InterruptedException { 110 | average = getAveragePostsPerUser(context.getConfiguration()); 111 | mos = new MultipleOutputs(context); 112 | 113 | try { 114 | Path[] files = DistributedCache.getLocalCacheFiles(context 115 | .getConfiguration()); 116 | 117 | if (files == null || files.length == 0) { 118 | throw new RuntimeException( 119 | "User information is not set in DistributedCache"); 120 | } 121 | 122 | // Read all files in the DistributedCache 123 | for (Path p : files) { 124 | BufferedReader rdr = new BufferedReader( 125 | new InputStreamReader( 126 | new GZIPInputStream(new FileInputStream( 127 | new File(p.toString()))))); 128 | 129 | String line; 130 | // For each record in the user file 131 | while ((line = rdr.readLine()) != null) { 132 | 133 | // Get the user ID and reputation 134 | Map parsed = MRDPUtils 135 | .transformXmlToMap(line); 136 | String userId = parsed.get("Id"); 137 | String reputation = parsed.get("Reputation"); 138 | 139 | if (userId != null && reputation != null) { 140 | // Map the user ID to the reputation 141 | userIdToReputation.put(userId, reputation); 142 | } 143 | } 144 | } 145 | 146 | } catch (IOException e) { 147 | throw new RuntimeException(e); 148 | } 149 | } 150 | 151 | @Override 152 | public void map(Object key, Text value, Context context) 153 | throws IOException, InterruptedException { 154 | 155 | String[] tokens = value.toString().split("\t"); 156 | 157 | String userId = tokens[0]; 158 | int posts = Integer.parseInt(tokens[1]); 159 | 160 | outkey.set(userId); 161 | outvalue.set((long) posts + "\t" + userIdToReputation.get(userId)); 162 | 163 | if ((double) posts < average) { 164 | mos.write(MULTIPLE_OUTPUTS_BELOW_NAME, outkey, outvalue, 165 | MULTIPLE_OUTPUTS_BELOW_NAME + "/part"); 
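// --------------------------------------------------------------------------------
// Editor's aside: an illustrative note, not part of the original source file. Because
// each named output above is written with a base path of "<name>/part", this map-only
// binning job splits its results into two sub-directories of the final output
// directory, roughly (task numbering is hypothetical):
//   <output>/belowavg/part-m-00000   users with fewer posts than the average
//   <output>/aboveavg/part-m-00000   users with at least the average number of posts
// Each line pairs the user ID with "<post count>\t<reputation>" as built above.
// --------------------------------------------------------------------------------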
166 | } else { 167 | mos.write(MULTIPLE_OUTPUTS_ABOVE_NAME, outkey, outvalue, 168 | MULTIPLE_OUTPUTS_ABOVE_NAME + "/part"); 169 | } 170 | 171 | } 172 | 173 | protected void cleanup(Context context) throws IOException, 174 | InterruptedException { 175 | mos.close(); 176 | } 177 | } 178 | 179 | public static void main(String[] args) throws Exception { 180 | Configuration conf = new Configuration(); 181 | String[] otherArgs = new GenericOptionsParser(conf, args) 182 | .getRemainingArgs(); 183 | 184 | if (otherArgs.length != 3) { 185 | System.err 186 | .println("Usage: JobChainingDriver "); 187 | System.exit(2); 188 | } 189 | 190 | Path postInput = new Path(otherArgs[0]); 191 | Path userInput = new Path(otherArgs[1]); 192 | Path outputDirIntermediate = new Path(otherArgs[2] + "_int"); 193 | Path outputDir = new Path(otherArgs[2]); 194 | 195 | // Setup first job to counter user posts 196 | Job countingJob = new Job(conf, "JobChaining-Counting"); 197 | countingJob.setJarByClass(JobChainingDriver.class); 198 | 199 | // Set our mapper and reducer, we can use the API's long sum reducer for 200 | // a combiner! 201 | countingJob.setMapperClass(UserIdCountMapper.class); 202 | countingJob.setCombinerClass(LongSumReducer.class); 203 | countingJob.setReducerClass(UserIdSumReducer.class); 204 | 205 | countingJob.setOutputKeyClass(Text.class); 206 | countingJob.setOutputValueClass(LongWritable.class); 207 | 208 | countingJob.setInputFormatClass(TextInputFormat.class); 209 | 210 | TextInputFormat.addInputPath(countingJob, postInput); 211 | 212 | countingJob.setOutputFormatClass(TextOutputFormat.class); 213 | TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate); 214 | 215 | // Execute job and grab exit code 216 | int code = countingJob.waitForCompletion(true) ? 
0 : 1; 217 | 218 | if (code == 0) { 219 | // Calculate the average posts per user by getting counter values 220 | double numRecords = (double) countingJob 221 | .getCounters() 222 | .findCounter(AVERAGE_CALC_GROUP, 223 | UserIdCountMapper.RECORDS_COUNTER_NAME).getValue(); 224 | double numUsers = (double) countingJob 225 | .getCounters() 226 | .findCounter(AVERAGE_CALC_GROUP, 227 | UserIdSumReducer.USERS_COUNTER_NAME).getValue(); 228 | 229 | double averagePostsPerUser = numRecords / numUsers; 230 | 231 | // Setup binning job 232 | Job binningJob = new Job(new Configuration(), "JobChaining-Binning"); 233 | binningJob.setJarByClass(JobChainingDriver.class); 234 | 235 | // Set mapper and the average posts per user 236 | binningJob.setMapperClass(UserIdBinningMapper.class); 237 | UserIdBinningMapper.setAveragePostsPerUser(binningJob, 238 | averagePostsPerUser); 239 | 240 | binningJob.setNumReduceTasks(0); 241 | 242 | binningJob.setInputFormatClass(TextInputFormat.class); 243 | TextInputFormat.addInputPath(binningJob, outputDirIntermediate); 244 | 245 | // Add two named outputs for below/above average 246 | MultipleOutputs.addNamedOutput(binningJob, 247 | MULTIPLE_OUTPUTS_BELOW_NAME, TextOutputFormat.class, 248 | Text.class, Text.class); 249 | 250 | MultipleOutputs.addNamedOutput(binningJob, 251 | MULTIPLE_OUTPUTS_ABOVE_NAME, TextOutputFormat.class, 252 | Text.class, Text.class); 253 | MultipleOutputs.setCountersEnabled(binningJob, true); 254 | 255 | TextOutputFormat.setOutputPath(binningJob, outputDir); 256 | 257 | // Add the user files to the DistributedCache 258 | FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput); 259 | for (FileStatus status : userFiles) { 260 | DistributedCache.addCacheFile(status.getPath().toUri(), 261 | binningJob.getConfiguration()); 262 | } 263 | 264 | // Execute job and grab exit code 265 | code = binningJob.waitForCompletion(true) ? 
0 : 1; 266 | } 267 | 268 | // Clean up the intermediate output 269 | FileSystem.get(conf).delete(outputDirIntermediate, true); 270 | 271 | System.exit(code); 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/JobControlDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.IOException; 4 | import mrdp.ch6.JobChainingDriver.UserIdBinningMapper; 5 | import mrdp.ch6.JobChainingDriver.UserIdCountMapper; 6 | import mrdp.ch6.JobChainingDriver.UserIdSumReducer; 7 | import mrdp.ch6.ParallelJobs.AverageReputationMapper; 8 | import mrdp.ch6.ParallelJobs.AverageReputationReducer; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.filecache.DistributedCache; 12 | import org.apache.hadoop.fs.FileStatus; 13 | import org.apache.hadoop.fs.FileSystem; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.DoubleWritable; 16 | import org.apache.hadoop.io.LongWritable; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob; 21 | import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl; 22 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 23 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 24 | import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; 25 | 26 | public class JobControlDriver { 27 | public static void main(String[] args) throws Exception { 28 | 29 | if (args.length != 4) { 30 | System.err 31 | .println("Usage: JobChainingDriver "); 32 | System.exit(2); 33 | } 34 | 35 | Path postInput = new Path(args[0]); 36 | Path userInput = new Path(args[1]); 37 | Path countingOutput = new Path(args[3] + "_count"); 38 | Path binningOutputRoot = new Path(args[3] + "_bins"); 39 | Path binningOutputBelow = new Path(binningOutputRoot + "/" 40 | + JobChainingDriver.MULTIPLE_OUTPUTS_BELOW_NAME); 41 | Path binningOutputAbove = new Path(binningOutputRoot + "/" 42 | + JobChainingDriver.MULTIPLE_OUTPUTS_ABOVE_NAME); 43 | 44 | Path belowAverageRepOutput = new Path(args[2]); 45 | Path aboveAverageRepOutput = new Path(args[3]); 46 | 47 | Job countingJob = getCountingJob(postInput, countingOutput); 48 | 49 | int code = 1; 50 | if (countingJob.waitForCompletion(true)) { 51 | ControlledJob binningControlledJob = new ControlledJob( 52 | getBinningJobConf(countingJob, countingOutput, userInput, 53 | binningOutputRoot)); 54 | 55 | ControlledJob belowAvgControlledJob = new ControlledJob( 56 | getAverageJobConf(binningOutputBelow, belowAverageRepOutput)); 57 | belowAvgControlledJob.addDependingJob(binningControlledJob); 58 | 59 | ControlledJob aboveAvgControlledJob = new ControlledJob( 60 | getAverageJobConf(binningOutputAbove, aboveAverageRepOutput)); 61 | aboveAvgControlledJob.addDependingJob(binningControlledJob); 62 | 63 | JobControl jc = new JobControl("AverageReputation"); 64 | jc.addJob(binningControlledJob); 65 | jc.addJob(belowAvgControlledJob); 66 | jc.addJob(aboveAvgControlledJob); 67 | 68 | jc.run(); 69 | code = jc.getFailedJobList().size() == 0 ? 
0 : 1; 70 | } 71 | 72 | FileSystem fs = FileSystem.get(new Configuration()); 73 | fs.delete(countingOutput, true); 74 | fs.delete(binningOutputRoot, true); 75 | 76 | System.out.println("All Done"); 77 | System.exit(code); 78 | } 79 | 80 | public static Job getCountingJob(Path postInput, Path outputDirIntermediate) 81 | throws IOException { 82 | // Setup first job to counter user posts 83 | Job countingJob = new Job(new Configuration(), "JobChaining-Counting"); 84 | countingJob.setJarByClass(JobChainingDriver.class); 85 | 86 | // Set our mapper and reducer, we can use the API's long sum reducer for 87 | // a combiner! 88 | countingJob.setMapperClass(UserIdCountMapper.class); 89 | countingJob.setCombinerClass(LongSumReducer.class); 90 | countingJob.setReducerClass(UserIdSumReducer.class); 91 | 92 | countingJob.setOutputKeyClass(Text.class); 93 | countingJob.setOutputValueClass(LongWritable.class); 94 | 95 | countingJob.setInputFormatClass(TextInputFormat.class); 96 | 97 | TextInputFormat.addInputPath(countingJob, postInput); 98 | 99 | countingJob.setOutputFormatClass(TextOutputFormat.class); 100 | TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate); 101 | 102 | return countingJob; 103 | } 104 | 105 | public static Configuration getBinningJobConf(Job countingJob, 106 | Path jobchainOutdir, Path userInput, Path binningOutput) 107 | throws IOException { 108 | // Calculate the average posts per user by getting counter values 109 | double numRecords = (double) countingJob 110 | .getCounters() 111 | .findCounter(JobChainingDriver.AVERAGE_CALC_GROUP, 112 | UserIdCountMapper.RECORDS_COUNTER_NAME).getValue(); 113 | double numUsers = (double) countingJob 114 | .getCounters() 115 | .findCounter(JobChainingDriver.AVERAGE_CALC_GROUP, 116 | UserIdSumReducer.USERS_COUNTER_NAME).getValue(); 117 | 118 | double averagePostsPerUser = numRecords / numUsers; 119 | 120 | // Setup binning job 121 | Job binningJob = new Job(new Configuration(), "JobChaining-Binning"); 122 | binningJob.setJarByClass(JobChainingDriver.class); 123 | 124 | // Set mapper and the average posts per user 125 | binningJob.setMapperClass(UserIdBinningMapper.class); 126 | UserIdBinningMapper.setAveragePostsPerUser(binningJob, 127 | averagePostsPerUser); 128 | 129 | binningJob.setNumReduceTasks(0); 130 | 131 | binningJob.setInputFormatClass(TextInputFormat.class); 132 | TextInputFormat.addInputPath(binningJob, jobchainOutdir); 133 | 134 | // Add two named outputs for below/above average 135 | MultipleOutputs.addNamedOutput(binningJob, 136 | JobChainingDriver.MULTIPLE_OUTPUTS_BELOW_NAME, 137 | TextOutputFormat.class, Text.class, Text.class); 138 | 139 | MultipleOutputs.addNamedOutput(binningJob, 140 | JobChainingDriver.MULTIPLE_OUTPUTS_ABOVE_NAME, 141 | TextOutputFormat.class, Text.class, Text.class); 142 | MultipleOutputs.setCountersEnabled(binningJob, true); 143 | 144 | TextOutputFormat.setOutputPath(binningJob, binningOutput); 145 | 146 | // Add the user files to the DistributedCache 147 | FileStatus[] userFiles = FileSystem.get(new Configuration()) 148 | .listStatus(userInput); 149 | for (FileStatus status : userFiles) { 150 | DistributedCache.addCacheFile(status.getPath().toUri(), 151 | binningJob.getConfiguration()); 152 | } 153 | 154 | // Execute job and grab exit code 155 | return binningJob.getConfiguration(); 156 | } 157 | 158 | public static Configuration getAverageJobConf(Path averageOutputDir, 159 | Path outputDir) throws IOException { 160 | 161 | Job averageJob = new Job(new Configuration(), "ParallelJobs"); 162 | 
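// --------------------------------------------------------------------------------
// Editor's aside: an illustrative sketch, not part of the original source file. The
// helper methods in this driver only build Configuration objects; main() above wraps
// them in ControlledJob instances so that JobControl releases each job once the jobs it
// depends on have finished. The general shape of that pattern (names are hypothetical):
//   ControlledJob first  = new ControlledJob(firstConf);
//   ControlledJob second = new ControlledJob(secondConf);
//   second.addDependingJob(first);                  // "second" waits for "first"
//   JobControl control = new JobControl("example");
//   control.addJob(first);
//   control.addJob(second);
//   control.run();                                  // driven to completion, as main() does with jc.run()
//   boolean ok = control.getFailedJobList().isEmpty();
// --------------------------------------------------------------------------------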
averageJob.setJarByClass(ParallelJobs.class); 163 | 164 | averageJob.setMapperClass(AverageReputationMapper.class); 165 | averageJob.setReducerClass(AverageReputationReducer.class); 166 | 167 | averageJob.setOutputKeyClass(Text.class); 168 | averageJob.setOutputValueClass(DoubleWritable.class); 169 | 170 | averageJob.setInputFormatClass(TextInputFormat.class); 171 | 172 | TextInputFormat.addInputPath(averageJob, averageOutputDir); 173 | 174 | averageJob.setOutputFormatClass(TextOutputFormat.class); 175 | TextOutputFormat.setOutputPath(averageJob, outputDir); 176 | 177 | // Execute job and grab exit code 178 | return averageJob.getConfiguration(); 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/MergedJobDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.Map.Entry; 8 | import java.util.Random; 9 | 10 | import mrdp.utils.MRDPUtils; 11 | 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.NullWritable; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.WritableComparable; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.Mapper; 19 | import org.apache.hadoop.mapreduce.Reducer; 20 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 22 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 23 | import org.apache.hadoop.util.GenericOptionsParser; 24 | 25 | public class MergedJobDriver { 26 | 27 | public static final String MULTIPLE_OUTPUTS_ANONYMIZE = "anonymize"; 28 | public static final String MULTIPLE_OUTPUTS_DISTINCT = "distinct"; 29 | 30 | public static class AnonymizeDistinctMergedMapper extends 31 | Mapper { 32 | 33 | private static final Text DISTINCT_OUT_VALUE = new Text(); 34 | 35 | private Random rndm = new Random(); 36 | private TaggedText anonymizeOutkey = new TaggedText(), 37 | distinctOutkey = new TaggedText(); 38 | private Text anonymizeOutvalue = new Text(); 39 | 40 | @Override 41 | public void map(Object key, Text value, Context context) 42 | throws IOException, InterruptedException { 43 | anonymizeMap(key, value, context); 44 | distinctMap(key, value, context); 45 | } 46 | 47 | private void anonymizeMap(Object key, Text value, Context context) 48 | throws IOException, InterruptedException { 49 | // Parse the input string into a nice map 50 | Map parsed = MRDPUtils.transformXmlToMap(value 51 | .toString()); 52 | 53 | if (parsed.size() > 0) { 54 | StringBuilder bldr = new StringBuilder(); 55 | bldr.append(" entry : parsed.entrySet()) { 57 | 58 | if (entry.getKey().equals("UserId") 59 | || entry.getKey().equals("Id")) { 60 | // ignore these fields 61 | } else if (entry.getKey().equals("CreationDate")) { 62 | // Strip out the time, anything after the 'T' in the 63 | // value 64 | bldr.append(entry.getKey() 65 | + "=\"" 66 | + entry.getValue().substring(0, 67 | entry.getValue().indexOf('T')) + "\" "); 68 | } else { 69 | // Otherwise, output this. 
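// --------------------------------------------------------------------------------
// Editor's aside: a worked example, not part of the original source file. For a
// (shortened, hypothetical) comment record such as
//   <row Id="9" UserId="7" CreationDate="2011-08-01T12:30:00.000" Score="4" />
// the anonymize branch drops Id and UserId, keeps only the date portion of
// CreationDate, and emits roughly (attribute order may vary)
//   <row CreationDate="2011-08-01" Score="4" >
// under a random key tagged "A", while distinctMap below emits the bare user ID "7"
// under a key tagged "D". The reducer later routes tag "A" to the "anonymize" named
// output and tag "D" to the "distinct" named output. All other attributes are copied
// through unchanged:
// --------------------------------------------------------------------------------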
70 | bldr.append(entry.getKey() + "=\"" + entry.getValue() 71 | + "\" "); 72 | } 73 | 74 | } 75 | bldr.append(">"); 76 | anonymizeOutkey.setTag("A"); 77 | anonymizeOutkey.setText(Integer.toString(rndm.nextInt())); 78 | anonymizeOutvalue.set(bldr.toString()); 79 | context.write(anonymizeOutkey, anonymizeOutvalue); 80 | } 81 | } 82 | 83 | private void distinctMap(Object key, Text value, Context context) 84 | throws IOException, InterruptedException { 85 | // Parse the input into a nice map. 86 | Map parsed = MRDPUtils.transformXmlToMap(value 87 | .toString()); 88 | 89 | // Get the value for the UserId attribute 90 | String userId = parsed.get("UserId"); 91 | 92 | // If it is null, skip this record 93 | if (userId == null) { 94 | return; 95 | } 96 | 97 | // Otherwise, set our output key to the user's id, tagged with a "D" 98 | distinctOutkey.setTag("D"); 99 | distinctOutkey.setText(userId); 100 | 101 | // Write the user's id with a null value 102 | context.write(distinctOutkey, DISTINCT_OUT_VALUE); 103 | } 104 | } 105 | 106 | public static class AnonymizeDistinctMergedReducer extends 107 | Reducer { 108 | 109 | private MultipleOutputs mos = null; 110 | 111 | @Override 112 | protected void setup(Context context) throws IOException, 113 | InterruptedException { 114 | mos = new MultipleOutputs(context); 115 | } 116 | 117 | @Override 118 | protected void reduce(TaggedText key, Iterable values, 119 | Context context) throws IOException, InterruptedException { 120 | 121 | if (key.getTag().equals("A")) { 122 | anonymizeReduce(key.getText(), values, context); 123 | } else { 124 | distinctReduce(key.getText(), values, context); 125 | } 126 | } 127 | 128 | private void anonymizeReduce(Text key, Iterable values, 129 | Context context) throws IOException, InterruptedException { 130 | 131 | for (Text value : values) { 132 | mos.write(MULTIPLE_OUTPUTS_ANONYMIZE, value, 133 | NullWritable.get(), MULTIPLE_OUTPUTS_ANONYMIZE 134 | + "/part"); 135 | } 136 | } 137 | 138 | private void distinctReduce(Text key, Iterable values, 139 | Context context) throws IOException, InterruptedException { 140 | mos.write(MULTIPLE_OUTPUTS_DISTINCT, key, NullWritable.get(), 141 | MULTIPLE_OUTPUTS_DISTINCT + "/part"); 142 | } 143 | 144 | @Override 145 | protected void cleanup(Context context) throws IOException, 146 | InterruptedException { 147 | mos.close(); 148 | } 149 | } 150 | 151 | public static void main(String[] args) throws Exception { 152 | Configuration conf = new Configuration(); 153 | String[] otherArgs = new GenericOptionsParser(conf, args) 154 | .getRemainingArgs(); 155 | if (otherArgs.length != 2) { 156 | System.err.println("Usage: MergedJob "); 157 | System.exit(1); 158 | } 159 | 160 | // Configure the merged job 161 | Job job = new Job(conf, "MergedJob"); 162 | job.setJarByClass(MergedJobDriver.class); 163 | 164 | job.setMapperClass(AnonymizeDistinctMergedMapper.class); 165 | job.setReducerClass(AnonymizeDistinctMergedReducer.class); 166 | job.setNumReduceTasks(10); 167 | 168 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 169 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 170 | 171 | MultipleOutputs.addNamedOutput(job, MULTIPLE_OUTPUTS_ANONYMIZE, 172 | TextOutputFormat.class, Text.class, NullWritable.class); 173 | MultipleOutputs.addNamedOutput(job, MULTIPLE_OUTPUTS_DISTINCT, 174 | TextOutputFormat.class, Text.class, NullWritable.class); 175 | 176 | job.setOutputKeyClass(TaggedText.class); 177 | job.setOutputValueClass(Text.class); 178 | 179 | System.exit(job.waitForCompletion(true) 
? 0 : 2); 180 | } 181 | 182 | public static class TaggedText implements WritableComparable { 183 | 184 | private String tag = ""; 185 | private Text text = new Text(); 186 | 187 | public TaggedText() { 188 | 189 | } 190 | 191 | public TaggedText(TaggedText text) { 192 | setTag(text.getTag()); 193 | setText(text.getText()); 194 | } 195 | 196 | public void setTag(String tag) { 197 | this.tag = tag; 198 | } 199 | 200 | public String getTag() { 201 | return tag; 202 | } 203 | 204 | public void setText(Text text) { 205 | this.text.set(text); 206 | } 207 | 208 | public void setText(String text) { 209 | this.text.set(text); 210 | } 211 | 212 | public Text getText() { 213 | return text; 214 | } 215 | 216 | @Override 217 | public void readFields(DataInput in) throws IOException { 218 | tag = in.readUTF(); 219 | text.readFields(in); 220 | } 221 | 222 | @Override 223 | public void write(DataOutput out) throws IOException { 224 | out.writeUTF(tag); 225 | text.write(out); 226 | } 227 | 228 | @Override 229 | public int compareTo(TaggedText obj) { 230 | int compare = tag.compareTo(obj.getTag()); 231 | if (compare == 0) { 232 | return text.compareTo(obj.getText()); 233 | } else { 234 | return compare; 235 | } 236 | } 237 | 238 | @Override 239 | public String toString() { 240 | return tag.toString() + ":" + text.toString(); 241 | } 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/ParallelJobs.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.DoubleWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 14 | import org.apache.hadoop.util.GenericOptionsParser; 15 | 16 | public class ParallelJobs { 17 | 18 | public static class AverageReputationMapper extends 19 | Mapper { 20 | 21 | private static final Text GROUP_ALL_KEY = new Text( 22 | "Average Reputation:"); 23 | private DoubleWritable outvalue = new DoubleWritable(); 24 | 25 | @Override 26 | protected void map(LongWritable key, Text value, Context context) 27 | throws IOException, InterruptedException { 28 | try { 29 | // Split the line into tokens 30 | String[] tokens = value.toString().split("\t"); 31 | 32 | // Get the reputation from the third column 33 | double reputation = Double.parseDouble(tokens[2]); 34 | 35 | // Set the output value and write to context 36 | outvalue.set(reputation); 37 | context.write(GROUP_ALL_KEY, outvalue); 38 | } catch (NumberFormatException e) { 39 | // Skip record 40 | } 41 | } 42 | } 43 | 44 | public static class AverageReputationReducer extends 45 | Reducer { 46 | 47 | private DoubleWritable outvalue = new DoubleWritable(); 48 | 49 | @Override 50 | protected void reduce(Text key, Iterable values, 51 | Context context) throws IOException, InterruptedException { 52 | 53 | double sum = 0.0; 54 | double count = 0; 55 | for (DoubleWritable dw : values) { 56 | sum += dw.get(); 57 | ++count; 58 | } 59 | 60 | outvalue.set(sum / count); 61 | context.write(key, outvalue); 62 | } 63 | } 64 | 65 | public static void main(String[] 
args) throws Exception { 66 | 67 | Configuration conf = new Configuration(); 68 | String[] otherArgs = new GenericOptionsParser(conf, args) 69 | .getRemainingArgs(); 70 | 71 | if (otherArgs.length != 4) { 72 | System.err 73 | .println("Usage: ParallelJobs "); 74 | System.exit(2); 75 | } 76 | 77 | Path belowAvgInputDir = new Path(otherArgs[0]); 78 | Path aboveAvgInputDir = new Path(otherArgs[1]); 79 | 80 | Path belowAvgOutputDir = new Path(otherArgs[2]); 81 | Path aboveAvgOutputDir = new Path(otherArgs[3]); 82 | 83 | Job belowAvgJob = submitJob(conf, belowAvgInputDir, belowAvgOutputDir); 84 | Job aboveAvgJob = submitJob(conf, aboveAvgInputDir, aboveAvgOutputDir); 85 | 86 | // While both jobs are not finished, sleep 87 | while (!belowAvgJob.isComplete() || !aboveAvgJob.isComplete()) { 88 | Thread.sleep(5000); 89 | } 90 | 91 | if (belowAvgJob.isSuccessful()) { 92 | System.out.println("Below average job completed successfully!"); 93 | } else { 94 | System.out.println("Below average job failed!"); 95 | } 96 | 97 | if (aboveAvgJob.isSuccessful()) { 98 | System.out.println("Above average job completed successfully!"); 99 | } else { 100 | System.out.println("Above average job failed!"); 101 | } 102 | 103 | System.exit(belowAvgJob.isSuccessful() && aboveAvgJob.isSuccessful() ? 0 104 | : 1); 105 | } 106 | 107 | private static Job submitJob(Configuration conf, Path inputDir, 108 | Path outputDir) throws IOException, InterruptedException, 109 | ClassNotFoundException { 110 | 111 | Job job = new Job(conf, "ParallelJobs"); 112 | job.setJarByClass(ParallelJobs.class); 113 | 114 | job.setMapperClass(AverageReputationMapper.class); 115 | job.setReducerClass(AverageReputationReducer.class); 116 | 117 | job.setOutputKeyClass(Text.class); 118 | job.setOutputValueClass(DoubleWritable.class); 119 | 120 | job.setInputFormatClass(TextInputFormat.class); 121 | TextInputFormat.addInputPath(job, inputDir); 122 | 123 | job.setOutputFormatClass(TextOutputFormat.class); 124 | TextOutputFormat.setOutputPath(job, outputDir); 125 | 126 | job.submit(); 127 | return job; 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/PartitionPruningInputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.HashMap; 8 | import java.util.Iterator; 9 | import java.util.List; 10 | import java.util.Map.Entry; 11 | 12 | import mrdp.ch7.PartitionPruningOutputDriver.RedisKey; 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.io.Writable; 19 | import org.apache.hadoop.mapreduce.InputFormat; 20 | import org.apache.hadoop.mapreduce.InputSplit; 21 | import org.apache.hadoop.mapreduce.Job; 22 | import org.apache.hadoop.mapreduce.JobContext; 23 | import org.apache.hadoop.mapreduce.RecordReader; 24 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 25 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 26 | import org.apache.hadoop.util.GenericOptionsParser; 27 | import org.apache.log4j.Logger; 28 | 29 | import redis.clients.jedis.Jedis; 30 | 31 | public class PartitionPruningInputDriver { 32 | 33 | public static class RedisLastAccessInputFormat extends 34 | InputFormat { 35 | 36 | public static 
final String REDIS_SELECTED_MONTHS_CONF = "mapred.redilastaccessinputformat.months"; 37 | private static final HashMap MONTH_FROM_STRING = new HashMap(); 38 | private static final HashMap MONTH_TO_INST_MAP = new HashMap(); 39 | private static final Logger LOG = Logger 40 | .getLogger(RedisLastAccessInputFormat.class); 41 | 42 | static { 43 | MONTH_TO_INST_MAP.put("JAN", MRDPUtils.REDIS_INSTANCES[0]); 44 | MONTH_TO_INST_MAP.put("FEB", MRDPUtils.REDIS_INSTANCES[0]); 45 | MONTH_TO_INST_MAP.put("MAR", MRDPUtils.REDIS_INSTANCES[1]); 46 | MONTH_TO_INST_MAP.put("APR", MRDPUtils.REDIS_INSTANCES[1]); 47 | MONTH_TO_INST_MAP.put("MAY", MRDPUtils.REDIS_INSTANCES[2]); 48 | MONTH_TO_INST_MAP.put("JUN", MRDPUtils.REDIS_INSTANCES[2]); 49 | MONTH_TO_INST_MAP.put("JUL", MRDPUtils.REDIS_INSTANCES[3]); 50 | MONTH_TO_INST_MAP.put("AUG", MRDPUtils.REDIS_INSTANCES[3]); 51 | MONTH_TO_INST_MAP.put("SEP", MRDPUtils.REDIS_INSTANCES[4]); 52 | MONTH_TO_INST_MAP.put("OCT", MRDPUtils.REDIS_INSTANCES[4]); 53 | MONTH_TO_INST_MAP.put("NOV", MRDPUtils.REDIS_INSTANCES[5]); 54 | MONTH_TO_INST_MAP.put("DEC", MRDPUtils.REDIS_INSTANCES[5]); 55 | 56 | MONTH_FROM_STRING.put("JAN", 0); 57 | MONTH_FROM_STRING.put("FEB", 1); 58 | MONTH_FROM_STRING.put("MAR", 2); 59 | MONTH_FROM_STRING.put("APR", 3); 60 | MONTH_FROM_STRING.put("MAY", 4); 61 | MONTH_FROM_STRING.put("JUN", 5); 62 | MONTH_FROM_STRING.put("JUL", 6); 63 | MONTH_FROM_STRING.put("AUG", 7); 64 | MONTH_FROM_STRING.put("SEP", 8); 65 | MONTH_FROM_STRING.put("OCT", 9); 66 | MONTH_FROM_STRING.put("NOV", 10); 67 | MONTH_FROM_STRING.put("DEC", 11); 68 | } 69 | 70 | /** 71 | * Sets the CSV string for months you want to pull 72 | * 73 | * @param job 74 | * The job conf 75 | * @param String 76 | * months The CSV list of months 77 | */ 78 | public static void setRedisLastAccessMonths(Job job, String months) { 79 | job.getConfiguration().set(REDIS_SELECTED_MONTHS_CONF, months); 80 | } 81 | 82 | @Override 83 | public List getSplits(JobContext job) throws IOException { 84 | 85 | String months = job.getConfiguration().get( 86 | REDIS_SELECTED_MONTHS_CONF); 87 | 88 | if (months == null || months.isEmpty()) { 89 | throw new IOException(REDIS_SELECTED_MONTHS_CONF 90 | + " is null or empty."); 91 | } 92 | 93 | // Create input splits from the input months 94 | HashMap instanceToSplitMap = new HashMap(); 95 | for (String month : months.split(",")) { 96 | String host = MONTH_TO_INST_MAP.get(month); 97 | RedisLastAccessInputSplit split = instanceToSplitMap.get(host); 98 | if (split == null) { 99 | split = new RedisLastAccessInputSplit(host); 100 | split.addHashKey(month); 101 | instanceToSplitMap.put(host, split); 102 | } else { 103 | split.addHashKey(month); 104 | } 105 | } 106 | 107 | LOG.info("Input splits to process: " 108 | + instanceToSplitMap.values().size()); 109 | return new ArrayList(instanceToSplitMap.values()); 110 | } 111 | 112 | @Override 113 | public RecordReader createRecordReader( 114 | InputSplit split, TaskAttemptContext context) 115 | throws IOException, InterruptedException { 116 | return new RedisLastAccessRecordReader(); 117 | } 118 | 119 | public static class RedisLastAccessRecordReader extends 120 | RecordReader { 121 | 122 | private static final Logger LOG = Logger 123 | .getLogger(RedisLastAccessRecordReader.class); 124 | private Entry currentEntry = null; 125 | private float processedKVs = 0, totalKVs = 0; 126 | private int currentHashMonth = 0; 127 | private Iterator> hashIterator = null; 128 | private Iterator hashKeys = null; 129 | private RedisKey key = new 
RedisKey(); 130 | private String host = null; 131 | private Text value = new Text(); 132 | 133 | @Override 134 | public void initialize(InputSplit split, TaskAttemptContext context) 135 | throws IOException, InterruptedException { 136 | 137 | // Get the host location from the InputSplit 138 | host = split.getLocations()[0]; 139 | 140 | // Get an iterator of all the hash keys we want to read 141 | hashKeys = ((RedisLastAccessInputSplit) split).getHashKeys() 142 | .iterator(); 143 | 144 | LOG.info("Connecting to " + host); 145 | } 146 | 147 | @Override 148 | public boolean nextKeyValue() throws IOException, 149 | InterruptedException { 150 | 151 | boolean nextHashKey = false; 152 | do { 153 | // if this is the first call or the iterator does not have a 154 | // next 155 | if (hashIterator == null || !hashIterator.hasNext()) { 156 | // if we have reached the end of our hash keys, return 157 | // false 158 | if (!hashKeys.hasNext()) { 159 | // ultimate end condition, return false 160 | return false; 161 | } else { 162 | // Otherwise, connect to Redis and get all 163 | // the name/value pairs for this hash key 164 | Jedis jedis = new Jedis(host); 165 | jedis.connect(); 166 | String strKey = hashKeys.next(); 167 | currentHashMonth = MONTH_FROM_STRING.get(strKey); 168 | hashIterator = jedis.hgetAll(strKey).entrySet() 169 | .iterator(); 170 | jedis.disconnect(); 171 | } 172 | } 173 | 174 | // If the key/value map still has values 175 | if (hashIterator.hasNext()) { 176 | // Get the current entry and set the Text objects to 177 | // the 178 | // entry 179 | currentEntry = hashIterator.next(); 180 | key.setLastAccessMonth(currentHashMonth); 181 | key.setField(currentEntry.getKey()); 182 | value.set(currentEntry.getValue()); 183 | } else { 184 | nextHashKey = true; 185 | } 186 | } while (nextHashKey); 187 | 188 | return true; 189 | } 190 | 191 | @Override 192 | public RedisKey getCurrentKey() throws IOException, 193 | InterruptedException { 194 | return key; 195 | } 196 | 197 | @Override 198 | public Text getCurrentValue() throws IOException, 199 | InterruptedException { 200 | return value; 201 | } 202 | 203 | @Override 204 | public float getProgress() throws IOException, InterruptedException { 205 | return processedKVs / totalKVs; 206 | } 207 | 208 | @Override 209 | public void close() throws IOException { 210 | // nothing to do here 211 | } 212 | } 213 | } 214 | 215 | public static class RedisLastAccessInputSplit extends InputSplit implements 216 | Writable { 217 | 218 | /** 219 | * The Redis instance location 220 | */ 221 | private String location = null; 222 | private List hashKeys = new ArrayList(); 223 | 224 | public RedisLastAccessInputSplit() { 225 | // Default constructor for reflection 226 | } 227 | 228 | public RedisLastAccessInputSplit(String redisHost) { 229 | this.location = redisHost; 230 | } 231 | 232 | public void addHashKey(String key) { 233 | hashKeys.add(key); 234 | } 235 | 236 | public void removeHashKey(String key) { 237 | hashKeys.remove(key); 238 | } 239 | 240 | public List getHashKeys() { 241 | return hashKeys; 242 | } 243 | 244 | @Override 245 | public void readFields(DataInput in) throws IOException { 246 | location = in.readUTF(); 247 | int numKeys = in.readInt(); 248 | hashKeys.clear(); 249 | for (int i = 0; i < numKeys; ++i) { 250 | hashKeys.add(in.readUTF()); 251 | } 252 | } 253 | 254 | @Override 255 | public void write(DataOutput out) throws IOException { 256 | out.writeUTF(location); 257 | out.writeInt(hashKeys.size()); 258 | for (String key : hashKeys) { 259 | 
out.writeUTF(key); 260 | } 261 | } 262 | 263 | @Override 264 | public long getLength() throws IOException, InterruptedException { 265 | return 0; 266 | } 267 | 268 | @Override 269 | public String[] getLocations() throws IOException, InterruptedException { 270 | return new String[] { location }; 271 | } 272 | } 273 | 274 | public static void main(String[] args) throws Exception { 275 | Configuration conf = new Configuration(); 276 | String[] otherArgs = new GenericOptionsParser(conf, args) 277 | .getRemainingArgs(); 278 | 279 | if (otherArgs.length != 2) { 280 | System.err 281 | .println("Usage: PartitionPruning "); 282 | System.exit(1); 283 | } 284 | 285 | String lastAccessMonths = otherArgs[0]; 286 | Path outputDir = new Path(otherArgs[1]); 287 | 288 | Job job = new Job(conf, "Redis Input"); 289 | job.setJarByClass(PartitionPruningInputDriver.class); 290 | 291 | // Use the identity mapper 292 | job.setNumReduceTasks(0); 293 | 294 | job.setInputFormatClass(RedisLastAccessInputFormat.class); 295 | RedisLastAccessInputFormat.setRedisLastAccessMonths(job, 296 | lastAccessMonths); 297 | 298 | job.setOutputFormatClass(TextOutputFormat.class); 299 | TextOutputFormat.setOutputPath(job, outputDir); 300 | 301 | job.setOutputKeyClass(RedisKey.class); 302 | job.setOutputValueClass(Text.class); 303 | 304 | System.exit(job.waitForCompletion(true) ? 0 : 2); 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/PartitionPruningOutputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Calendar; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.WritableComparable; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.JobContext; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.OutputCommitter; 21 | import org.apache.hadoop.mapreduce.OutputFormat; 22 | import org.apache.hadoop.mapreduce.RecordWriter; 23 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 24 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 25 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 26 | import org.apache.hadoop.util.GenericOptionsParser; 27 | 28 | import redis.clients.jedis.Jedis; 29 | 30 | public class PartitionPruningOutputDriver { 31 | 32 | private static final HashMap MONTH_FROM_INT = new HashMap(); 33 | 34 | static { 35 | MONTH_FROM_INT.put(0, "JAN"); 36 | MONTH_FROM_INT.put(1, "FEB"); 37 | MONTH_FROM_INT.put(2, "MAR"); 38 | MONTH_FROM_INT.put(3, "APR"); 39 | MONTH_FROM_INT.put(4, "MAY"); 40 | MONTH_FROM_INT.put(5, "JUN"); 41 | MONTH_FROM_INT.put(6, "JUL"); 42 | MONTH_FROM_INT.put(7, "AUG"); 43 | MONTH_FROM_INT.put(8, "SEP"); 44 | MONTH_FROM_INT.put(9, "OCT"); 45 | MONTH_FROM_INT.put(10, "NOV"); 46 | MONTH_FROM_INT.put(11, "DEC"); 47 | } 48 | 49 | public static class RedisLastAccessOutputMapper extends 50 | Mapper { 51 | 52 | // This object will format the creation date string into a Date object 53 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 54 | 
"yyyy-MM-dd'T'HH:mm:ss.SSS"); 55 | 56 | private RedisKey outkey = new RedisKey(); 57 | private Text outvalue = new Text(); 58 | 59 | @Override 60 | public void map(Object key, Text value, Context context) 61 | throws IOException, InterruptedException { 62 | 63 | Map parsed = MRDPUtils.transformXmlToMap(value 64 | .toString()); 65 | 66 | String userId = parsed.get("Id"); 67 | String reputation = parsed.get("Reputation"); 68 | 69 | // Grab the last access date 70 | String strDate = parsed.get("LastAccessDate"); 71 | 72 | if (userId == null || reputation == null || strDate == null) { 73 | return; 74 | } 75 | 76 | try { 77 | // Parse the string into a Calendar object 78 | Calendar cal = Calendar.getInstance(); 79 | cal.setTime(frmt.parse(strDate)); 80 | 81 | // Set our output key and values 82 | outkey.setLastAccessMonth(cal.get(Calendar.MONTH)); 83 | outkey.setField(userId); 84 | outvalue.set(reputation); 85 | 86 | context.write(outkey, outvalue); 87 | } catch (ParseException e) { 88 | e.printStackTrace(); 89 | } 90 | } 91 | } 92 | 93 | public static class RedisKey implements WritableComparable { 94 | 95 | private int lastAccessMonth = 0; 96 | private Text field = new Text(); 97 | 98 | public int getLastAccessMonth() { 99 | return this.lastAccessMonth; 100 | } 101 | 102 | public void setLastAccessMonth(int lastAccessMonth) { 103 | this.lastAccessMonth = lastAccessMonth; 104 | } 105 | 106 | public Text getField() { 107 | return this.field; 108 | } 109 | 110 | public void setField(String field) { 111 | this.field.set(field); 112 | } 113 | 114 | @Override 115 | public void readFields(DataInput in) throws IOException { 116 | lastAccessMonth = in.readInt(); 117 | this.field.readFields(in); 118 | } 119 | 120 | @Override 121 | public void write(DataOutput out) throws IOException { 122 | out.writeInt(lastAccessMonth); 123 | this.field.write(out); 124 | } 125 | 126 | @Override 127 | public int compareTo(RedisKey rhs) { 128 | if (this.lastAccessMonth == rhs.getLastAccessMonth()) { 129 | return this.field.compareTo(rhs.getField()); 130 | } else { 131 | return this.lastAccessMonth < rhs.getLastAccessMonth() ? 
-1 : 1; 132 | } 133 | } 134 | 135 | @Override 136 | public String toString() { 137 | return this.lastAccessMonth + "\t" + this.field.toString(); 138 | } 139 | 140 | @Override 141 | public int hashCode() { 142 | return toString().hashCode(); 143 | } 144 | } 145 | 146 | public static class RedisLastAccessOutputFormat extends 147 | OutputFormat<RedisKey, Text> { 148 | 149 | @Override 150 | public RecordWriter<RedisKey, Text> getRecordWriter( 151 | TaskAttemptContext job) throws IOException, 152 | InterruptedException { 153 | return new RedisLastAccessRecordWriter(); 154 | } 155 | 156 | @Override 157 | public void checkOutputSpecs(JobContext context) throws IOException, 158 | InterruptedException { 159 | } 160 | 161 | @Override 162 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) 163 | throws IOException, InterruptedException { 164 | return (new NullOutputFormat<Text, Text>()) 165 | .getOutputCommitter(context); 166 | } 167 | 168 | public static class RedisLastAccessRecordWriter extends 169 | RecordWriter<RedisKey, Text> { 170 | 171 | private HashMap<Integer, Jedis> jedisMap = new HashMap<Integer, Jedis>(); 172 | 173 | public RedisLastAccessRecordWriter() { 174 | // Create a connection to Redis for each host 175 | int i = 0; 176 | for (String host : MRDPUtils.REDIS_INSTANCES) { 177 | Jedis jedis = new Jedis(host); 178 | jedis.connect(); 179 | jedisMap.put(i, jedis); 180 | jedisMap.put(i + 1, jedis); 181 | i += 2; 182 | } 183 | } 184 | 185 | @Override 186 | public void write(RedisKey key, Text value) throws IOException, 187 | InterruptedException { 188 | // Get the Jedis instance that this key/value pair will be 189 | // written to -- (0,1)->0, (2-3)->1, ... , (10-11)->5 190 | Jedis j = jedisMap.get(key.getLastAccessMonth()); 191 | 192 | // Write the key/value pair 193 | j.hset(MONTH_FROM_INT.get(key.getLastAccessMonth()), key 194 | .getField().toString(), value.toString()); 195 | } 196 | 197 | @Override 198 | public void close(TaskAttemptContext context) throws IOException, 199 | InterruptedException { 200 | // For each jedis instance, disconnect it 201 | for (Jedis jedis : jedisMap.values()) { 202 | jedis.disconnect(); 203 | } 204 | } 205 | } 206 | } 207 | 208 | public static void main(String[] args) throws Exception { 209 | Configuration conf = new Configuration(); 210 | String[] otherArgs = new GenericOptionsParser(conf, args) 211 | .getRemainingArgs(); 212 | 213 | if (otherArgs.length != 1) { 214 | System.err.println("Usage: PartitionPruningOutput <user data>"); 215 | System.exit(1); 216 | } 217 | 218 | Path inputPath = new Path(otherArgs[0]); 219 | 220 | Job job = new Job(conf, "Redis Last Access Output"); 221 | job.setJarByClass(PartitionPruningOutputDriver.class); 222 | 223 | job.setMapperClass(RedisLastAccessOutputMapper.class); 224 | job.setNumReduceTasks(0); 225 | 226 | job.setInputFormatClass(TextInputFormat.class); 227 | TextInputFormat.setInputPaths(job, inputPath); 228 | 229 | job.setOutputFormatClass(RedisLastAccessOutputFormat.class); 230 | 231 | job.setOutputKeyClass(RedisKey.class); 232 | job.setOutputValueClass(Text.class); 233 | 234 | int code = job.waitForCompletion(true) ?
0 : 2; 235 | 236 | System.exit(code); 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/RandomDataGenerationDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.DataInput; 5 | import java.io.DataOutput; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.net.URI; 9 | import java.security.InvalidParameterException; 10 | import java.text.SimpleDateFormat; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.Random; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.filecache.DistributedCache; 17 | import org.apache.hadoop.fs.Path; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.io.Writable; 21 | import org.apache.hadoop.mapreduce.InputFormat; 22 | import org.apache.hadoop.mapreduce.InputSplit; 23 | import org.apache.hadoop.mapreduce.Job; 24 | import org.apache.hadoop.mapreduce.JobContext; 25 | import org.apache.hadoop.mapreduce.RecordReader; 26 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 27 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 28 | import org.apache.hadoop.util.GenericOptionsParser; 29 | 30 | public class RandomDataGenerationDriver { 31 | 32 | public static class RandomStackOverflowInputFormat extends 33 | InputFormat { 34 | 35 | public static final String NUM_MAP_TASKS = "random.generator.map.tasks"; 36 | public static final String NUM_RECORDS_PER_TASK = "random.generator.num.records.per.map.task"; 37 | public static final String RANDOM_WORD_LIST = "random.generator.random.word.file"; 38 | 39 | @Override 40 | public List getSplits(JobContext job) throws IOException { 41 | 42 | // Get the number of map tasks configured for 43 | int numSplits = job.getConfiguration().getInt(NUM_MAP_TASKS, -1); 44 | if (numSplits <= 0) { 45 | throw new IOException(NUM_MAP_TASKS + " is not set."); 46 | } 47 | 48 | // Create a number of input splits equivalent to the number of tasks 49 | ArrayList splits = new ArrayList(); 50 | for (int i = 0; i < numSplits; ++i) { 51 | splits.add(new FakeInputSplit()); 52 | } 53 | 54 | return splits; 55 | } 56 | 57 | @Override 58 | public RecordReader createRecordReader( 59 | InputSplit split, TaskAttemptContext context) 60 | throws IOException, InterruptedException { 61 | // Create a new RandomStackoverflowRecordReader and initialize it 62 | RandomStackoverflowRecordReader rr = new RandomStackoverflowRecordReader(); 63 | rr.initialize(split, context); 64 | return rr; 65 | } 66 | 67 | public static void setNumMapTasks(Job job, int i) { 68 | job.getConfiguration().setInt(NUM_MAP_TASKS, i); 69 | } 70 | 71 | public static void setNumRecordPerTask(Job job, int i) { 72 | job.getConfiguration().setInt(NUM_RECORDS_PER_TASK, i); 73 | } 74 | 75 | public static void setRandomWordList(Job job, Path file) { 76 | DistributedCache.addCacheFile(file.toUri(), job.getConfiguration()); 77 | } 78 | 79 | public static class RandomStackoverflowRecordReader extends 80 | RecordReader { 81 | 82 | private int numRecordsToCreate = 0; 83 | private int createdRecords = 0; 84 | private Text key = new Text(); 85 | private NullWritable value = NullWritable.get(); 86 | private Random rndm = new Random(); 87 | private ArrayList randomWords = new ArrayList(); 88 | 89 | // This object will format the creation date 
string into a Date 90 | // object 91 | private SimpleDateFormat frmt = new SimpleDateFormat( 92 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 93 | 94 | @Override 95 | public void initialize(InputSplit split, TaskAttemptContext context) 96 | throws IOException, InterruptedException { 97 | 98 | // Get the number of records to create from the configuration 99 | this.numRecordsToCreate = context.getConfiguration().getInt( 100 | NUM_RECORDS_PER_TASK, -1); 101 | 102 | if (numRecordsToCreate < 0) { 103 | throw new InvalidParameterException(NUM_RECORDS_PER_TASK 104 | + " is not set."); 105 | } 106 | 107 | // Get the list of random words from the DistributedCache 108 | URI[] files = DistributedCache.getCacheFiles(context 109 | .getConfiguration()); 110 | 111 | if (files.length == 0) { 112 | throw new InvalidParameterException( 113 | "Random word list not set in cache."); 114 | } else { 115 | // Read the list of random words into a list 116 | BufferedReader rdr = new BufferedReader(new FileReader( 117 | files[0].toString())); 118 | 119 | String line; 120 | while ((line = rdr.readLine()) != null) { 121 | randomWords.add(line); 122 | } 123 | rdr.close(); 124 | 125 | if (randomWords.size() == 0) { 126 | throw new IOException("Random word list is empty"); 127 | } 128 | } 129 | } 130 | 131 | @Override 132 | public boolean nextKeyValue() throws IOException, 133 | InterruptedException { 134 | // If we still have records to create 135 | if (createdRecords < numRecordsToCreate) { 136 | // Generate random data 137 | int score = Math.abs(rndm.nextInt()) % 15000; 138 | int rowId = Math.abs(rndm.nextInt()) % 1000000000; 139 | int postId = Math.abs(rndm.nextInt()) % 100000000; 140 | int userId = Math.abs(rndm.nextInt()) % 1000000; 141 | String creationDate = frmt 142 | .format(Math.abs(rndm.nextLong())); 143 | 144 | // Create a string of text from the random words 145 | String text = getRandomText(); 146 | 147 | // Assemble a pseudo StackOverflow row record from the random fields 148 | String randomRecord = "<row Id=\"" + rowId + "\" PostId=\"" + postId 149 | + "\" Score=\"" + score + "\" Text=\"" + text 150 | + "\" CreationDate=\"" + creationDate + "\" UserId=\"" + userId + "\" />"; 151 | 152 | key.set(randomRecord); 153 | ++createdRecords; 154 | return true; 155 | } else { 156 | // Else, return false 157 | return false; 158 | } 159 | } 160 | 161 | /** 162 | * Creates a random string of words from the list. 1-30 words per 163 | * string. 164 | * 165 | * @return A random string of words 166 | */ 167 | private String getRandomText() { 168 | StringBuilder bldr = new StringBuilder(); 169 | int numWords = Math.abs(rndm.nextInt()) % 30 + 1; 170 | 171 | for (int i = 0; i < numWords; ++i) { 172 | bldr.append(randomWords.get(Math.abs(rndm.nextInt()) 173 | % randomWords.size()) 174 | + " "); 175 | } 176 | return bldr.toString(); 177 | } 178 | 179 | @Override 180 | public Text getCurrentKey() throws IOException, 181 | InterruptedException { 182 | return key; 183 | } 184 | 185 | @Override 186 | public NullWritable getCurrentValue() throws IOException, 187 | InterruptedException { 188 | return value; 189 | } 190 | 191 | @Override 192 | public float getProgress() throws IOException, InterruptedException { 193 | return (float) createdRecords / (float) numRecordsToCreate; 194 | } 195 | 196 | @Override 197 | public void close() throws IOException { 198 | // nothing to do here... 199 | } 200 | } 201 | 202 | /** 203 | * This class is very empty.
204 | */ 205 | public static class FakeInputSplit extends InputSplit implements 206 | Writable { 207 | 208 | @Override 209 | public void readFields(DataInput arg0) throws IOException { 210 | } 211 | 212 | @Override 213 | public void write(DataOutput arg0) throws IOException { 214 | } 215 | 216 | @Override 217 | public long getLength() throws IOException, InterruptedException { 218 | return 0; 219 | } 220 | 221 | @Override 222 | public String[] getLocations() throws IOException, 223 | InterruptedException { 224 | return new String[0]; 225 | } 226 | } 227 | } 228 | 229 | public static void main(String[] args) throws Exception { 230 | Configuration conf = new Configuration(); 231 | String[] otherArgs = new GenericOptionsParser(conf, args) 232 | .getRemainingArgs(); 233 | if (otherArgs.length != 4) { 234 | System.err 235 | .println("Usage: RandomDataGenerationDriver <num map tasks> <num records per task> <word list> <output directory>"); 236 | System.exit(1); 237 | } 238 | 239 | int numMapTasks = Integer.parseInt(otherArgs[0]); 240 | int numRecordsPerTask = Integer.parseInt(otherArgs[1]); 241 | Path wordList = new Path(otherArgs[2]); 242 | Path outputDir = new Path(otherArgs[3]); 243 | 244 | Job job = new Job(conf, "RandomDataGenerationDriver"); 245 | job.setJarByClass(RandomDataGenerationDriver.class); 246 | 247 | job.setNumReduceTasks(0); 248 | 249 | job.setInputFormatClass(RandomStackOverflowInputFormat.class); 250 | 251 | RandomStackOverflowInputFormat.setNumMapTasks(job, numMapTasks); 252 | RandomStackOverflowInputFormat.setNumRecordPerTask(job, 253 | numRecordsPerTask); 254 | RandomStackOverflowInputFormat.setRandomWordList(job, wordList); 255 | 256 | TextOutputFormat.setOutputPath(job, outputDir); 257 | 258 | job.setOutputKeyClass(Text.class); 259 | job.setOutputValueClass(NullWritable.class); 260 | 261 | System.exit(job.waitForCompletion(true) ? 0 : 2); 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/RedisInputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map.Entry; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.io.Writable; 15 | import org.apache.hadoop.mapreduce.InputFormat; 16 | import org.apache.hadoop.mapreduce.InputSplit; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.JobContext; 19 | import org.apache.hadoop.mapreduce.RecordReader; 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | import org.apache.log4j.Logger; 24 | 25 | import redis.clients.jedis.Jedis; 26 | 27 | public class RedisInputDriver { 28 | 29 | public static class RedisHashInputFormat extends 30 | InputFormat<Text, Text> { 31 | 32 | public static final String REDIS_HOSTS_CONF = "mapred.redishashinputformat.hosts"; 33 | public static final String REDIS_HASH_KEY_CONF = "mapred.redishashinputformat.key"; 34 | private static final Logger LOG = Logger 35 | .getLogger(RedisHashInputFormat.class); 36 | 37 | /** 38 | * Sets the CSV string of Redis hosts.
39 | * 40 | * @param job 41 | * The job conf 42 | * @param hosts 43 | * The CSV string of Redis hosts 44 | */ 45 | public static void setRedisHosts(Job job, String hosts) { 46 | job.getConfiguration().set(REDIS_HOSTS_CONF, hosts); 47 | } 48 | 49 | /** 50 | * Sets the key of the hash to write to. 51 | * 52 | * @param job 53 | * The job conf 54 | * @param hashKey 55 | * The name of the hash key 56 | */ 57 | public static void setRedisHashKey(Job job, String hashKey) { 58 | job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey); 59 | } 60 | 61 | @Override 62 | public List getSplits(JobContext job) throws IOException { 63 | String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF); 64 | 65 | if (hosts == null || hosts.isEmpty()) { 66 | throw new IOException(REDIS_HOSTS_CONF 67 | + " is not set in configuration."); 68 | } 69 | 70 | String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF); 71 | if (hashKey == null || hashKey.isEmpty()) { 72 | throw new IOException(REDIS_HASH_KEY_CONF 73 | + " is not set in configuration."); 74 | } 75 | 76 | // Create an input split for each host 77 | List splits = new ArrayList(); 78 | for (String host : hosts.split(",")) { 79 | splits.add(new RedisHashInputSplit(host, hashKey)); 80 | } 81 | 82 | LOG.info("Input splits to process: " + splits.size()); 83 | return splits; 84 | } 85 | 86 | @Override 87 | public RecordReader createRecordReader(InputSplit split, 88 | TaskAttemptContext context) throws IOException, 89 | InterruptedException { 90 | return new RedisHashRecordReader(); 91 | } 92 | 93 | public static class RedisHashRecordReader extends 94 | RecordReader { 95 | 96 | private static final Logger LOG = Logger 97 | .getLogger(RedisHashRecordReader.class); 98 | private Iterator> keyValueMapIter = null; 99 | private Text key = new Text(), value = new Text(); 100 | private float processedKVs = 0, totalKVs = 0; 101 | private Entry currentEntry = null; 102 | 103 | @Override 104 | public void initialize(InputSplit split, TaskAttemptContext context) 105 | throws IOException, InterruptedException { 106 | 107 | // Get the host location from the InputSplit 108 | String host = split.getLocations()[0]; 109 | String hashKey = ((RedisHashInputSplit) split).getHashKey(); 110 | 111 | LOG.info("Connecting to " + host + " and reading from " 112 | + hashKey); 113 | 114 | Jedis jedis = new Jedis(host); 115 | jedis.connect(); 116 | jedis.getClient().setTimeoutInfinite(); 117 | 118 | // Get all the key value pairs from the Redis instance and store 119 | // them in memory 120 | totalKVs = jedis.hlen(hashKey); 121 | keyValueMapIter = jedis.hgetAll(hashKey).entrySet().iterator(); 122 | LOG.info("Got " + totalKVs + " from " + hashKey); 123 | jedis.disconnect(); 124 | } 125 | 126 | @Override 127 | public boolean nextKeyValue() throws IOException, 128 | InterruptedException { 129 | 130 | // If the key/value map still has values 131 | if (keyValueMapIter.hasNext()) { 132 | 133 | // Get the current entry and set the Text objects to the 134 | // entry 135 | currentEntry = keyValueMapIter.next(); 136 | key.set(currentEntry.getKey()); 137 | value.set(currentEntry.getValue()); 138 | return true; 139 | } else { 140 | // No more values? return false. 
141 | return false; 142 | } 143 | } 144 | 145 | @Override 146 | public Text getCurrentKey() throws IOException, 147 | InterruptedException { 148 | return key; 149 | } 150 | 151 | @Override 152 | public Text getCurrentValue() throws IOException, 153 | InterruptedException { 154 | return value; 155 | } 156 | 157 | @Override 158 | public float getProgress() throws IOException, InterruptedException { 159 | return processedKVs / totalKVs; 160 | } 161 | 162 | @Override 163 | public void close() throws IOException { 164 | // nothing to do here 165 | } 166 | } 167 | } 168 | 169 | public static class RedisHashInputSplit extends InputSplit implements Writable { 170 | 171 | /** 172 | * The Redis instance location 173 | */ 174 | private String location = null; 175 | 176 | /** 177 | * The Redis hash to read from 178 | */ 179 | private String hashKey = null; 180 | 181 | public RedisHashInputSplit() { 182 | // Default constructor for reflection 183 | } 184 | 185 | public RedisHashInputSplit(String redisHost, String hash) { 186 | this.location = redisHost; 187 | this.hashKey = hash; 188 | } 189 | 190 | public String getHashKey() { 191 | return this.hashKey; 192 | } 193 | 194 | @Override 195 | public void readFields(DataInput in) throws IOException { 196 | this.location = in.readUTF(); 197 | this.hashKey = in.readUTF(); 198 | } 199 | 200 | @Override 201 | public void write(DataOutput out) throws IOException { 202 | out.writeUTF(location); 203 | out.writeUTF(hashKey); 204 | } 205 | 206 | @Override 207 | public long getLength() throws IOException, InterruptedException { 208 | return 0; 209 | } 210 | 211 | @Override 212 | public String[] getLocations() throws IOException, InterruptedException { 213 | return new String[] { location }; 214 | } 215 | } 216 | 217 | public static void main(String[] args) throws Exception { 218 | Configuration conf = new Configuration(); 219 | String[] otherArgs = new GenericOptionsParser(conf, args) 220 | .getRemainingArgs(); 221 | 222 | if (otherArgs.length != 3) { 223 | System.err 224 | .println("Usage: RedisInput "); 225 | System.exit(1); 226 | } 227 | 228 | String hosts = otherArgs[0]; 229 | String hashKey = otherArgs[1]; 230 | Path outputDir = new Path(otherArgs[2]); 231 | 232 | Job job = new Job(conf, "Redis Input"); 233 | job.setJarByClass(RedisInputDriver.class); 234 | 235 | // Use the identity mapper 236 | job.setNumReduceTasks(0); 237 | 238 | job.setInputFormatClass(RedisHashInputFormat.class); 239 | RedisHashInputFormat.setRedisHosts(job, hosts); 240 | RedisHashInputFormat.setRedisHashKey(job, hashKey); 241 | 242 | job.setOutputFormatClass(TextOutputFormat.class); 243 | TextOutputFormat.setOutputPath(job, outputDir); 244 | 245 | job.setOutputKeyClass(Text.class); 246 | job.setOutputValueClass(Text.class); 247 | 248 | System.exit(job.waitForCompletion(true) ? 
0 : 3); 249 | } 250 | } 251 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/RedisOutputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.JobContext; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.OutputCommitter; 16 | import org.apache.hadoop.mapreduce.OutputFormat; 17 | import org.apache.hadoop.mapreduce.RecordWriter; 18 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 21 | import org.apache.hadoop.util.GenericOptionsParser; 22 | import org.apache.log4j.Logger; 23 | 24 | import redis.clients.jedis.Jedis; 25 | 26 | public class RedisOutputDriver { 27 | 28 | public static class RedisOutputMapper extends 29 | Mapper { 30 | 31 | private Text outkey = new Text(); 32 | private Text outvalue = new Text(); 33 | 34 | @Override 35 | public void map(Object key, Text value, Context context) 36 | throws IOException, InterruptedException { 37 | 38 | Map parsed = MRDPUtils.transformXmlToMap(value 39 | .toString()); 40 | 41 | String userId = parsed.get("Id"); 42 | String reputation = parsed.get("Reputation"); 43 | 44 | if (userId == null || reputation == null) { 45 | return; 46 | } 47 | 48 | // Set our output key and values 49 | outkey.set(userId); 50 | outvalue.set(reputation); 51 | 52 | context.write(outkey, outvalue); 53 | } 54 | } 55 | 56 | public static class RedisHashOutputFormat extends OutputFormat { 57 | 58 | public static final String REDIS_HOSTS_CONF = "mapred.redishashoutputformat.hosts"; 59 | public static final String REDIS_HASH_KEY_CONF = "mapred.redishashinputformat.key"; 60 | 61 | /** 62 | * Sets the CSV string of Redis hosts. 63 | * 64 | * @param job 65 | * The job conf 66 | * @param hosts 67 | * The CSV string of Redis hosts 68 | */ 69 | public static void setRedisHosts(Job job, String hosts) { 70 | job.getConfiguration().set(REDIS_HOSTS_CONF, hosts); 71 | } 72 | 73 | /** 74 | * Sets the key of the hash to write to. 
75 | * 76 | * @param job 77 | * The job conf 78 | * @param hashKey 79 | * The name of the hash key 80 | */ 81 | public static void setRedisHashKey(Job job, String hashKey) { 82 | job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey); 83 | } 84 | 85 | @Override 86 | public RecordWriter getRecordWriter(TaskAttemptContext job) 87 | throws IOException, InterruptedException { 88 | return new RedisHashRecordWriter(job.getConfiguration().get( 89 | REDIS_HASH_KEY_CONF), job.getConfiguration().get( 90 | REDIS_HOSTS_CONF)); 91 | } 92 | 93 | @Override 94 | public void checkOutputSpecs(JobContext job) 95 | throws IOException { 96 | String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF); 97 | 98 | if (hosts == null || hosts.isEmpty()) { 99 | throw new IOException(REDIS_HOSTS_CONF 100 | + " is not set in configuration."); 101 | } 102 | 103 | String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF); 104 | 105 | if (hashKey == null || hashKey.isEmpty()) { 106 | throw new IOException(REDIS_HASH_KEY_CONF 107 | + " is not set in configuration."); 108 | } 109 | } 110 | 111 | @Override 112 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) 113 | throws IOException, InterruptedException { 114 | return (new NullOutputFormat()) 115 | .getOutputCommitter(context); 116 | } 117 | 118 | public static class RedisHashRecordWriter extends 119 | RecordWriter { 120 | 121 | private static final Logger LOG = Logger 122 | .getLogger(RedisHashRecordWriter.class); 123 | private HashMap jedisMap = new HashMap(); 124 | private String hashKey = null; 125 | 126 | public RedisHashRecordWriter(String hashKey, String hosts) { 127 | LOG.info("Connecting to " + hosts + " and writing to " 128 | + hashKey); 129 | this.hashKey = hashKey; 130 | // Create a connection to Redis for each host 131 | // Map an integer 0-(numRedisInstances - 1) to the instance 132 | int i = 0; 133 | for (String host : hosts.split(",")) { 134 | Jedis jedis = new Jedis(host); 135 | jedis.connect(); 136 | jedisMap.put(i, jedis); 137 | ++i; 138 | } 139 | } 140 | 141 | @Override 142 | public void write(Text key, Text value) throws IOException, 143 | InterruptedException { 144 | // Get the Jedis instance that this key/value pair will be 145 | // written to 146 | Jedis j = jedisMap.get(Math.abs(key.hashCode()) 147 | % jedisMap.size()); 148 | 149 | // Write the key/value pair 150 | j.hset(hashKey, key.toString(), value.toString()); 151 | } 152 | 153 | @Override 154 | public void close(TaskAttemptContext context) throws IOException, 155 | InterruptedException { 156 | // For each jedis instance, disconnect it 157 | for (Jedis jedis : jedisMap.values()) { 158 | jedis.disconnect(); 159 | } 160 | } 161 | } 162 | } 163 | 164 | public static void main(String[] args) throws Exception { 165 | Configuration conf = new Configuration(); 166 | String[] otherArgs = new GenericOptionsParser(conf, args) 167 | .getRemainingArgs(); 168 | 169 | if (otherArgs.length != 3) { 170 | System.err 171 | .println("Usage: RedisOutput "); 172 | System.exit(1); 173 | } 174 | 175 | Path inputPath = new Path(otherArgs[0]); 176 | String hosts = otherArgs[1]; 177 | String hashName = otherArgs[2]; 178 | 179 | Job job = new Job(conf, "Redis Output"); 180 | job.setJarByClass(RedisOutputDriver.class); 181 | 182 | job.setMapperClass(RedisOutputMapper.class); 183 | job.setNumReduceTasks(0); 184 | 185 | job.setInputFormatClass(TextInputFormat.class); 186 | TextInputFormat.setInputPaths(job, inputPath); 187 | 188 | job.setOutputFormatClass(RedisHashOutputFormat.class); 189 | 
RedisHashOutputFormat.setRedisHosts(job, hosts); 190 | RedisHashOutputFormat.setRedisHashKey(job, hashName); 191 | 192 | job.setOutputKeyClass(Text.class); 193 | job.setOutputValueClass(Text.class); 194 | 195 | int code = job.waitForCompletion(true) ? 0 : 2; 196 | 197 | System.exit(code); 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/utils/MRDPUtils.java: -------------------------------------------------------------------------------- 1 | package mrdp.utils; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class MRDPUtils { 7 | 8 | public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3", 9 | "p4", "p6" }; 10 | 11 | // This helper function parses the stackoverflow into a Map for us. 12 | public static Map transformXmlToMap(String xml) { 13 | Map map = new HashMap(); 14 | try { 15 | String[] tokens = xml.trim().substring(5, xml.trim().length() - 3) 16 | .split("\""); 17 | 18 | for (int i = 0; i < tokens.length - 1; i += 2) { 19 | String key = tokens[i].trim(); 20 | String val = tokens[i + 1]; 21 | 22 | map.put(key.substring(0, key.length() - 1), val); 23 | } 24 | } catch (StringIndexOutOfBoundsException e) { 25 | System.err.println(xml); 26 | } 27 | 28 | return map; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /MRDP/src/main/resources/highrepusers.bf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adamjshook/mapreducepatterns/315edb587d602774972c8229c65d740bba9f9f83/MRDP/src/main/resources/highrepusers.bf -------------------------------------------------------------------------------- /MRDP/src/main/resources/hotlist.txt: -------------------------------------------------------------------------------- 1 | edited 2 | fix 3 | longer 4 | specified 5 | retrieve 6 | months 7 | representation 8 | jsbin 9 | parameterized 10 | publicly 11 | pleasure 12 | blindly 13 | textual 14 | ordinal 15 | createinstance 16 | trend 17 | bomb 18 | dispatched 19 | retract 20 | promised 21 | loves 22 | approached 23 | urlrequest 24 | webmatrix 25 | borealid 26 | hibernates 27 | timezoneinfo 28 | getcomputedstyle 29 | donnie 30 | looser 31 | mingos 32 | bryant 33 | prejudice 34 | dow 35 | explination 36 | authentic 37 | instinctively 38 | shrugs 39 | tdammers 40 | judiciously 41 | vanishingly 42 | cobble 43 | myassembly 44 | fvu 45 | projectile 46 | sessioninfo 47 | afer 48 | fluctuate 49 | appletviewer 50 | prateek 51 | chnaged 52 | jackpot 53 | jsw 54 | reponses 55 | onlamp 56 | epilogue 57 | weeding 58 | intellectually 59 | honorable 60 | raze 61 | baeltazor 62 | loadxmldoc 63 | fromid 64 | documentfilter 65 | rlh 66 | tolowerinvariant 67 | httpstatus 68 | closeevent 69 | maki 70 | pcampbell 71 | getmethodname 72 | coulmn 73 | sshexec 74 | rhinomock 75 | epaga 76 | vienna 77 | redmon 78 | nsalert 79 | dugres 80 | drorhan 81 | wxperl 82 | preexecute 83 | bashism 84 | txtdescription 85 | salmon 86 | alk 87 | properities 88 | kress 89 | submarine 90 | mcisendstring 91 | rthe 92 | justinfrench 93 | ssiphone 94 | sophos 95 | setsession 96 | objectcontainer 97 | myvalidator 98 | locksupport 99 | jnkrois 100 | canoe -------------------------------------------------------------------------------- /MRDP/src/main/resources/hotlistwords.bf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/adamjshook/mapreducepatterns/315edb587d602774972c8229c65d740bba9f9f83/MRDP/src/main/resources/hotlistwords.bf -------------------------------------------------------------------------------- /MRDP/src/test/java/mrdp/ch5/CartesianProductTest.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.PrintWriter; 6 | 7 | import mrdp.ch5.CartesianProduct.CartesianInputFormat; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reporter; 17 | import org.apache.hadoop.mapred.RunningJob; 18 | import org.apache.hadoop.mapred.TextInputFormat; 19 | import org.apache.hadoop.mapred.TextOutputFormat; 20 | 21 | public class CartesianProductTest { 22 | 23 | public static void main(String[] args) throws IOException, 24 | InterruptedException { 25 | 26 | File aDir = new File(System.getProperty("user.dir") + "/A"); 27 | aDir.mkdirs(); 28 | File bDir = new File(System.getProperty("user.dir") + "/B"); 29 | bDir.mkdirs(); 30 | 31 | File a1 = new File(System.getProperty("user.dir") + "/A/A1.txt"); 32 | a1.createNewFile(); 33 | 34 | PrintWriter wrtr = new PrintWriter(a1); 35 | 36 | wrtr.println("A11"); 37 | wrtr.println("A12"); 38 | wrtr.println("A13"); 39 | wrtr.println("A14"); 40 | 41 | wrtr.flush(); 42 | wrtr.close(); 43 | 44 | File a2 = new File(System.getProperty("user.dir") + "/A/A2.txt"); 45 | a2.createNewFile(); 46 | 47 | wrtr = new PrintWriter(a2); 48 | 49 | wrtr.println("A21"); 50 | wrtr.println("A22"); 51 | wrtr.println("A23"); 52 | wrtr.println("A24"); 53 | 54 | wrtr.flush(); 55 | wrtr.close(); 56 | 57 | File b1 = new File(System.getProperty("user.dir") + "/B/B1.txt"); 58 | b1.createNewFile(); 59 | 60 | wrtr = new PrintWriter(b1); 61 | 62 | wrtr.println("B11"); 63 | wrtr.println("B12"); 64 | wrtr.println("B13"); 65 | wrtr.println("B14"); 66 | 67 | wrtr.flush(); 68 | wrtr.close(); 69 | 70 | File b2 = new File(System.getProperty("user.dir") + "/B/B2.txt"); 71 | b2.createNewFile(); 72 | 73 | wrtr = new PrintWriter(b2); 74 | 75 | wrtr.println("B21"); 76 | wrtr.println("B22"); 77 | wrtr.println("B23"); 78 | wrtr.println("B24"); 79 | 80 | wrtr.flush(); 81 | wrtr.close(); 82 | 83 | long start = System.currentTimeMillis(); 84 | 85 | // Configure the join type 86 | JobConf job = new JobConf("Cartesian Product"); 87 | job.setJarByClass(CartesianProduct.class); 88 | 89 | job.setMapperClass(CartesianMapper.class); 90 | 91 | job.setNumReduceTasks(0); 92 | 93 | job.setInputFormat(CartesianInputFormat.class); 94 | CartesianInputFormat.setLeftInputInfo(job, TextInputFormat.class, 95 | System.getProperty("user.dir") + "/A"); 96 | CartesianInputFormat.setRightInputInfo(job, TextInputFormat.class, 97 | System.getProperty("user.dir") + "/B"); 98 | 99 | TextOutputFormat.setOutputPath(job, new Path("cartoutputttest")); 100 | 101 | job.setOutputKeyClass(Text.class); 102 | job.setOutputValueClass(Text.class); 103 | 104 | RunningJob jerb = JobClient.runJob(job); 105 | while (!jerb.isComplete()) { 106 | Thread.sleep(1000); 107 | } 108 | 109 | long finish = System.currentTimeMillis(); 110 | 111 | System.out.println("Time in ms: " + (finish - start)); 112 | 113 
| System.exit(jerb.isSuccessful() ? 0 : 2); 114 | } 115 | 116 | public static class CartesianMapper extends MapReduceBase implements 117 | Mapper<Text, Text, Text, Text> { 118 | 119 | @Override 120 | public void map(Text arg0, Text arg1, OutputCollector<Text, Text> arg2, 121 | Reporter arg3) throws IOException { 122 | arg2.collect(arg0, arg1); 123 | System.out.println(arg0 + "\t" + arg1); 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mapreducepatterns 2 | ================= 3 | 4 | Repository for MapReduce Design Patterns (O'Reilly 2012) example source code
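5 | 
6 | Running an example
7 | ------------------
8 | 
9 | Each example is a standalone Hadoop driver with its own `main`. As a rough usage sketch -- the jar name and the word list/output paths below are assumptions, not artifacts shipped with this repository -- the chapter 7 random data generator takes the number of map tasks, the number of records per task, a random word list, and an output directory, and can be launched with `hadoop jar`:
10 | 
11 |     # Hypothetical invocation; adjust the jar name and paths to your build and cluster
12 |     hadoop jar MRDP.jar mrdp.ch7.RandomDataGenerationDriver \
13 |         10 1000000 random-words.txt random-data-out
--------------------------------------------------------------------------------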