├── .gitignore
├── MRDP
└── src
│   ├── main
│   ├── java
│   │   └── mrdp
│   │   │   ├── MRDPMain.java
│   │   │   ├── appendixA
│   │   │   └── BloomFilterDriver.java
│   │   │   ├── ch1
│   │   │   └── CommentWordCount.java
│   │   │   ├── ch2
│   │   │   ├── AverageDriver.java
│   │   │   ├── CountNumUsersByStateDriver.java
│   │   │   ├── MedianStdDevDriver.java
│   │   │   ├── MinMaxCountDriver.java
│   │   │   ├── SmarterMedianStdDevDriver.java
│   │   │   └── WikipediaIndex.java
│   │   │   ├── ch3
│   │   │   ├── BloomFilteringDriver.java
│   │   │   ├── DistinctUserDriver.java
│   │   │   ├── DistributedGrep.java
│   │   │   ├── QueryBloomFiltering.java
│   │   │   ├── SimpleRandomSampling.java
│   │   │   ├── TopTenDriver.java
│   │   │   └── UniqueUserCount.java
│   │   │   ├── ch4
│   │   │   ├── AnonymizeDriver.java
│   │   │   ├── Binning.java
│   │   │   ├── PartitionedUsers.java
│   │   │   ├── PostCommentBuildingDriver.java
│   │   │   ├── QuestionAnswerBuildingDriver.java
│   │   │   └── TotalOrderSorting.java
│   │   │   ├── ch5
│   │   │   ├── CartesianFormatter.java
│   │   │   ├── CartesianProduct.java
│   │   │   ├── CompositeJoinDriver.java
│   │   │   ├── JoinFormatting.java
│   │   │   ├── ReduceSideJoinDriver.java
│   │   │   ├── ReduceSideJoinWithBloomDriver.java
│   │   │   └── ReplicatedJoinDriver.java
│   │   │   ├── ch6
│   │   │   ├── ChainMapperDriver.java
│   │   │   ├── JobChainingDriver.java
│   │   │   ├── JobControlDriver.java
│   │   │   ├── MergedJobDriver.java
│   │   │   └── ParallelJobs.java
│   │   │   ├── ch7
│   │   │   ├── PartitionPruningInputDriver.java
│   │   │   ├── PartitionPruningOutputDriver.java
│   │   │   ├── RandomDataGenerationDriver.java
│   │   │   ├── RedisInputDriver.java
│   │   │   └── RedisOutputDriver.java
│   │   │   └── utils
│   │   │   └── MRDPUtils.java
│   └── resources
│   │   ├── highrepusers.bf
│   │   ├── hotlist.txt
│   │   └── hotlistwords.bf
│   └── test
│   └── java
│   └── mrdp
│   └── ch5
│   └── CartesianProductTest.java
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Package Files #
4 | *.jar
5 | *.war
6 | *.ear
7 |
--------------------------------------------------------------------------------
/MRDP/src/main/java/mrdp/MRDPMain.java:
--------------------------------------------------------------------------------
1 | package mrdp;
2 |
3 | import java.util.Arrays;
4 |
5 | import mrdp.ch1.*;
6 | import mrdp.ch2.*;
7 | import mrdp.ch3.*;
8 | import mrdp.ch4.*;
9 | import mrdp.ch5.*;
10 | import mrdp.ch6.*;
11 | import mrdp.ch7.*;
12 | import mrdp.utils.MRDPUtils;
13 |
14 | import org.apache.hadoop.conf.Configuration;
15 | import org.apache.hadoop.conf.Configured;
16 | import org.apache.hadoop.util.Tool;
17 | import org.apache.hadoop.util.ToolRunner;
18 |
19 | @SuppressWarnings("unused")
20 | public class MRDPMain extends Configured implements Tool {
21 |
22 | public static void main(String[] args) throws Exception {
23 | System.exit(ToolRunner.run(new Configuration(), new MRDPMain(), args));
24 | }
25 |
26 | @Override
27 | public int run(String[] args) throws Exception {
28 | if (args.length > 0) {
29 | String example = args[0];
30 | String[] otherArgs = Arrays.copyOfRange(args, 1, args.length);
31 |
32 | if (example.equalsIgnoreCase("PartitionPruningOutput")) {
33 | PartitionPruningOutputDriver.main(otherArgs);
34 | } else if (example.equalsIgnoreCase("PartitionPruningInput")) {
35 | PartitionPruningInputDriver.main(otherArgs);
36 | } else if (example.equalsIgnoreCase("RedisInput")) {
37 | RedisInputDriver.main(otherArgs);
38 | } else if (example.equalsIgnoreCase("RedisOutput")) {
39 | RedisOutputDriver.main(otherArgs);
40 | } else {
41 | printHelp();
42 | return 1;
43 | }
44 |
45 | return 0;
46 | } else {
47 | printHelp();
48 | return 1;
49
| } 50 | } 51 | 52 | private void printHelp() { 53 | System.out 54 | .println("Usage: hadoop jar mrdp.jar "); 55 | System.out.println("Examples are:"); 56 | System.out.println("Chapter 7:"); 57 | System.out 58 | .println("\tRedisOutput "); 59 | System.out 60 | .println("\tRedisInput "); 61 | System.out.println("\tPartitionPruningOutput "); 62 | System.out 63 | .println("\tPartitionPruningInput "); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/appendixA/BloomFilterDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.appendixA; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStreamReader; 5 | import java.util.zip.GZIPInputStream; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FSDataOutputStream; 9 | import org.apache.hadoop.fs.FileStatus; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.util.GenericOptionsParser; 13 | import org.apache.hadoop.util.bloom.BloomFilter; 14 | import org.apache.hadoop.util.bloom.Key; 15 | import org.apache.hadoop.util.hash.Hash; 16 | 17 | public class BloomFilterDriver { 18 | 19 | public static void main(String[] args) throws Exception { 20 | Configuration conf = new Configuration(); 21 | String[] otherArgs = new GenericOptionsParser(conf, args) 22 | .getRemainingArgs(); 23 | if (otherArgs.length != 4) { 24 | System.err 25 | .println("Usage: BloomFilterWriter "); 26 | System.exit(1); 27 | } 28 | 29 | FileSystem fs = FileSystem.get(new Configuration()); 30 | 31 | // Parse command line arguments 32 | Path inputFile = new Path(otherArgs[0]); 33 | int numMembers = Integer.parseInt(otherArgs[1]); 34 | float falsePosRate = Float.parseFloat(otherArgs[2]); 35 | Path bfFile = new Path(otherArgs[3]); 36 | 37 | // Calculate our vector size and optimal K value based on approximations 38 | int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate); 39 | int nbHash = getOptimalK(numMembers, vectorSize); 40 | 41 | // create new Bloom filter 42 | BloomFilter filter = new BloomFilter(vectorSize, nbHash, 43 | Hash.MURMUR_HASH); 44 | 45 | // Open file for read 46 | 47 | System.out.println("Training Bloom filter of size " + vectorSize 48 | + " with " + nbHash + " hash functions, " + numMembers 49 | + " approximate number of records, and " + falsePosRate 50 | + " false positive rate"); 51 | 52 | String line = null; 53 | int numRecords = 0; 54 | for (FileStatus status : fs.listStatus(inputFile)) { 55 | BufferedReader rdr; 56 | // if file is gzipped, wrap it in a GZIPInputStream 57 | if (status.getPath().getName().endsWith(".gz")) { 58 | rdr = new BufferedReader(new InputStreamReader( 59 | new GZIPInputStream(fs.open(status.getPath())))); 60 | } else { 61 | rdr = new BufferedReader(new InputStreamReader(fs.open(status 62 | .getPath()))); 63 | } 64 | 65 | System.out.println("Reading " + status.getPath()); 66 | while ((line = rdr.readLine()) != null) { 67 | filter.add(new Key(line.getBytes())); 68 | ++numRecords; 69 | } 70 | 71 | rdr.close(); 72 | } 73 | 74 | System.out.println("Trained Bloom filter with " + numRecords 75 | + " entries."); 76 | 77 | System.out.println("Serializing Bloom filter to HDFS at " + bfFile); 78 | FSDataOutputStream strm = fs.create(bfFile); 79 | filter.write(strm); 80 | 81 | strm.flush(); 82 | strm.close(); 83 | 84 | System.out.println("Done training Bloom filter."); 85 | } 86 | 87 | public static int 
getOptimalBloomFilterSize(int numRecords, 88 | float falsePosRate) { 89 | int size = (int) (-numRecords * (float) Math.log(falsePosRate) / Math 90 | .pow(Math.log(2), 2)); 91 | return size; 92 | } 93 | 94 | public static int getOptimalK(float numMembers, float vectorSize) { 95 | return (int) Math.round(vectorSize / numMembers * Math.log(2)); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch1/CommentWordCount.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch1; 2 | 3 | import java.io.IOException; 4 | import java.util.StringTokenizer; 5 | import java.util.Map; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | import org.apache.commons.lang.StringEscapeUtils; 21 | 22 | public class CommentWordCount { 23 | 24 | public static class SOWordCountMapper extends 25 | Mapper { 26 | 27 | private final static IntWritable one = new IntWritable(1); 28 | private Text word = new Text(); 29 | 30 | public void map(Object key, Text value, Context context) 31 | throws IOException, InterruptedException { 32 | 33 | // Parse the input string into a nice map 34 | Map parsed = MRDPUtils.transformXmlToMap(value 35 | .toString()); 36 | 37 | // Grab the "Text" field, since that is what we are counting over 38 | String txt = parsed.get("Text"); 39 | 40 | // .get will return null if the key is not there 41 | if (txt == null) { 42 | // skip this record 43 | return; 44 | } 45 | 46 | // Unescape the HTML because the SO data is escaped. 
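// (The StackOverflow dump stores comment text as escaped XML, so entities such as
// "&amp;" and "&lt;" appear in the raw attribute value; unescaping first keeps
// fragments like "amp" and "lt" from being counted as words.)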
47 | txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase()); 48 | 49 | // Remove some annoying punctuation 50 | txt = txt.replaceAll("'", ""); // remove single quotes (e.g., can't) 51 | txt = txt.replaceAll("[^a-zA-Z]", " "); // replace the rest with a 52 | // space 53 | 54 | // Tokenize the string, then send the tokens away 55 | StringTokenizer itr = new StringTokenizer(txt); 56 | while (itr.hasMoreTokens()) { 57 | word.set(itr.nextToken()); 58 | context.write(word, one); 59 | } 60 | } 61 | } 62 | 63 | public static class IntSumReducer extends 64 | Reducer { 65 | private IntWritable result = new IntWritable(); 66 | 67 | public void reduce(Text key, Iterable values, 68 | Context context) throws IOException, InterruptedException { 69 | int sum = 0; 70 | for (IntWritable val : values) { 71 | sum += val.get(); 72 | } 73 | 74 | result.set(sum); 75 | context.write(key, result); 76 | 77 | } 78 | } 79 | 80 | public static void main(String[] args) throws Exception { 81 | Configuration conf = new Configuration(); 82 | String[] otherArgs = new GenericOptionsParser(conf, args) 83 | .getRemainingArgs(); 84 | if (otherArgs.length != 2) { 85 | System.err.println("Usage: CommentWordCount "); 86 | System.exit(2); 87 | } 88 | Job job = new Job(conf, "StackOverflow Comment Word Count"); 89 | job.setJarByClass(CommentWordCount.class); 90 | job.setMapperClass(SOWordCountMapper.class); 91 | job.setCombinerClass(IntSumReducer.class); 92 | job.setReducerClass(IntSumReducer.class); 93 | job.setOutputKeyClass(Text.class); 94 | job.setOutputValueClass(IntWritable.class); 95 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 96 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 97 | System.exit(job.waitForCompletion(true) ? 0 : 1); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/AverageDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.Map; 10 | 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.IntWritable; 16 | import org.apache.hadoop.io.Text; 17 | import org.apache.hadoop.io.Writable; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.Reducer; 21 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 23 | import org.apache.hadoop.util.GenericOptionsParser; 24 | 25 | public class AverageDriver { 26 | 27 | public static class SOAverageMapper extends 28 | Mapper { 29 | 30 | private IntWritable outHour = new IntWritable(); 31 | private CountAverageTuple outCountAverage = new CountAverageTuple(); 32 | 33 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 34 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 35 | 36 | @SuppressWarnings("deprecation") 37 | @Override 38 | public void map(Object key, Text value, Context context) 39 | throws IOException, InterruptedException { 40 | 41 | // Parse the input string into a nice map 42 | Map parsed = MRDPUtils.transformXmlToMap(value 43 | .toString()); 44 | 45 | // Grab the "CreationDate" field, 46 | // since it is what we are 
grouping by 47 | String strDate = parsed.get("CreationDate"); 48 | 49 | // Grab the comment to find the length 50 | String text = parsed.get("Text"); 51 | 52 | // .get will return null if the key is not there 53 | if (strDate == null || text == null) { 54 | // skip this record 55 | return; 56 | } 57 | 58 | try { 59 | // get the hour this comment was posted in 60 | Date creationDate = frmt.parse(strDate); 61 | outHour.set(creationDate.getHours()); 62 | 63 | // get the comment length 64 | outCountAverage.setCount(1); 65 | outCountAverage.setAverage(text.length()); 66 | 67 | // write out the user ID with min max dates and count 68 | context.write(outHour, outCountAverage); 69 | 70 | } catch (ParseException e) { 71 | System.err.println(e.getMessage()); 72 | return; 73 | } 74 | } 75 | } 76 | 77 | public static class SOAverageReducer 78 | extends 79 | Reducer { 80 | private CountAverageTuple result = new CountAverageTuple(); 81 | 82 | @Override 83 | public void reduce(IntWritable key, Iterable values, 84 | Context context) throws IOException, InterruptedException { 85 | 86 | float sum = 0; 87 | float count = 0; 88 | 89 | // Iterate through all input values for this key 90 | for (CountAverageTuple val : values) { 91 | sum += val.getCount() * val.getAverage(); 92 | count += val.getCount(); 93 | } 94 | 95 | result.setCount(count); 96 | result.setAverage(sum / count); 97 | 98 | context.write(key, result); 99 | } 100 | } 101 | 102 | public static void main(String[] args) throws Exception { 103 | Configuration conf = new Configuration(); 104 | String[] otherArgs = new GenericOptionsParser(conf, args) 105 | .getRemainingArgs(); 106 | if (otherArgs.length != 2) { 107 | System.err.println("Usage: AverageDriver "); 108 | System.exit(2); 109 | } 110 | Job job = new Job(conf, "StackOverflow Average Comment Length"); 111 | job.setJarByClass(AverageDriver.class); 112 | job.setMapperClass(SOAverageMapper.class); 113 | job.setCombinerClass(SOAverageReducer.class); 114 | job.setReducerClass(SOAverageReducer.class); 115 | job.setOutputKeyClass(IntWritable.class); 116 | job.setOutputValueClass(CountAverageTuple.class); 117 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 118 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 119 | System.exit(job.waitForCompletion(true) ? 
0 : 1);
120 | }
121 |
122 | public static class CountAverageTuple implements Writable {
123 | private float count = 0f;
124 | private float average = 0f;
125 |
126 | public float getCount() {
127 | return count;
128 | }
129 |
130 | public void setCount(float count) {
131 | this.count = count;
132 | }
133 |
134 | public float getAverage() {
135 | return average;
136 | }
137 |
138 | public void setAverage(float average) {
139 | this.average = average;
140 | }
141 |
142 | @Override
143 | public void readFields(DataInput in) throws IOException {
144 | count = in.readFloat();
145 | average = in.readFloat();
146 | }
147 |
148 | @Override
149 | public void write(DataOutput out) throws IOException {
150 | out.writeFloat(count);
151 | out.writeFloat(average);
152 | }
153 |
154 | @Override
155 | public String toString() {
156 | return count + "\t" + average;
157 | }
158 | }
159 | }
160 |
--------------------------------------------------------------------------------
/MRDP/src/main/java/mrdp/ch2/CountNumUsersByStateDriver.java:
--------------------------------------------------------------------------------
1 | package mrdp.ch2;
2 |
3 | import java.io.IOException;
4 | import java.util.Arrays;
5 | import java.util.HashSet;
6 | import java.util.Map;
7 |
8 | import mrdp.utils.MRDPUtils;
9 |
10 | import org.apache.hadoop.conf.Configuration;
11 | import org.apache.hadoop.fs.FileSystem;
12 | import org.apache.hadoop.fs.Path;
13 | import org.apache.hadoop.io.NullWritable;
14 | import org.apache.hadoop.io.Text;
15 | import org.apache.hadoop.mapreduce.Job;
16 | import org.apache.hadoop.mapreduce.Mapper;
17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
18 | import org.apache.hadoop.mapreduce.Counter;
19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
20 | import org.apache.hadoop.util.GenericOptionsParser;
21 |
22 | public class CountNumUsersByStateDriver {
23 |
24 | public static class CountNumUsersByStateMapper extends
25 | Mapper<Object, Text, NullWritable, NullWritable> {
26 |
27 | public static final String STATE_COUNTER_GROUP = "State";
28 |
29 | private String[] statesArray = new String[] { "AL", "AK", "AZ", "AR",
30 | "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN",
31 | "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS",
32 | "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND",
33 | "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
34 | "VT", "VA", "WA", "WV", "WI", "WY" };
35 |
36 | private HashSet<String> states = new HashSet<String>(
37 | Arrays.asList(statesArray));
38 |
39 | @Override
40 | public void map(Object key, Text value, Context context)
41 | throws IOException, InterruptedException {
42 |
43 | // Parse the input into a nice map.
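// This job relies on custom counters rather than key/value output: each mapper bumps a
// per-state counter, the driver prints the aggregated totals after the job completes,
// and no reduce phase is needed (setNumReduceTasks(0) below).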
44 | Map parsed = MRDPUtils.transformXmlToMap(value 45 | .toString()); 46 | 47 | // Get the value for the Location attribute 48 | String location = parsed.get("Location"); 49 | 50 | // Look for a state abbreviation code if the location is not null or 51 | // empty 52 | if (location != null && !location.isEmpty()) { 53 | boolean unknown = true; 54 | // Make location uppercase and split on white space 55 | String[] tokens = location.toUpperCase().split("\\s"); 56 | // For each token 57 | for (String state : tokens) { 58 | // Check if it is a state 59 | if (states.contains(state)) { 60 | 61 | // If so, increment the state's counter by 1 and flag it 62 | // as not unknown 63 | context.getCounter(STATE_COUNTER_GROUP, state) 64 | .increment(1); 65 | unknown = false; 66 | break; 67 | } 68 | } 69 | 70 | // If the state is unknown, increment the counter 71 | if (unknown) { 72 | context.getCounter(STATE_COUNTER_GROUP, "Unknown") 73 | .increment(1); 74 | } 75 | } else { 76 | // If it is empty or null, increment the counter by 1 77 | context.getCounter(STATE_COUNTER_GROUP, "NullOrEmpty") 78 | .increment(1); 79 | } 80 | } 81 | } 82 | 83 | public static void main(String[] args) throws Exception { 84 | Configuration conf = new Configuration(); 85 | String[] otherArgs = new GenericOptionsParser(conf, args) 86 | .getRemainingArgs(); 87 | 88 | if (otherArgs.length != 2) { 89 | System.err.println("Usage: CountNumUsersByState "); 90 | System.exit(2); 91 | } 92 | 93 | Path input = new Path(otherArgs[0]); 94 | Path outputDir = new Path(otherArgs[1]); 95 | 96 | Job job = new Job(conf, "Count Num Users By State"); 97 | job.setJarByClass(CountNumUsersByStateDriver.class); 98 | 99 | job.setMapperClass(CountNumUsersByStateMapper.class); 100 | job.setNumReduceTasks(0); 101 | 102 | job.setOutputKeyClass(NullWritable.class); 103 | job.setOutputValueClass(NullWritable.class); 104 | 105 | FileInputFormat.addInputPath(job, input); 106 | FileOutputFormat.setOutputPath(job, outputDir); 107 | 108 | int code = job.waitForCompletion(true) ? 
0 : 1; 109 | 110 | if (code == 0) { 111 | for (Counter counter : job.getCounters().getGroup( 112 | CountNumUsersByStateMapper.STATE_COUNTER_GROUP)) { 113 | System.out.println(counter.getDisplayName() + "\t" 114 | + counter.getValue()); 115 | } 116 | } 117 | 118 | // Clean up empty output directory 119 | FileSystem.get(conf).delete(outputDir, true); 120 | 121 | System.exit(code); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/MedianStdDevDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.ArrayList; 9 | import java.util.Collections; 10 | import java.util.Date; 11 | import java.util.Map; 12 | 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.IntWritable; 18 | import org.apache.hadoop.io.Text; 19 | import org.apache.hadoop.io.Writable; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 25 | import org.apache.hadoop.util.GenericOptionsParser; 26 | 27 | public class MedianStdDevDriver { 28 | 29 | public static class SOMedianStdDevMapper extends 30 | Mapper { 31 | 32 | private IntWritable outHour = new IntWritable(); 33 | private IntWritable outCommentLength = new IntWritable(); 34 | 35 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 36 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 37 | 38 | @SuppressWarnings("deprecation") 39 | @Override 40 | public void map(Object key, Text value, Context context) 41 | throws IOException, InterruptedException { 42 | 43 | // Parse the input string into a nice map 44 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 45 | 46 | // Grab the "CreationDate" field, 47 | // since it is what we are grouping by 48 | String strDate = parsed.get("CreationDate"); 49 | 50 | // Grab the comment to find the length 51 | String text = parsed.get("Text"); 52 | 53 | // .get will return null if the key is not there 54 | if (strDate == null || text == null) { 55 | // skip this record 56 | return; 57 | } 58 | 59 | try { 60 | // get the hour this comment was posted in 61 | Date creationDate = frmt.parse(strDate); 62 | outHour.set(creationDate.getHours()); 63 | 64 | // get the comment length 65 | outCommentLength.set(text.length()); 66 | 67 | // write out the user ID with min max dates and count 68 | context.write(outHour, outCommentLength); 69 | 70 | } catch (ParseException e) { 71 | System.err.println(e.getMessage()); 72 | return; 73 | } 74 | } 75 | } 76 | 77 | public static class SOMedianStdDevReducer extends 78 | Reducer { 79 | private MedianStdDevTuple result = new MedianStdDevTuple(); 80 | private ArrayList commentLengths = new ArrayList(); 81 | 82 | @Override 83 | public void reduce(IntWritable key, Iterable values, 84 | Context context) throws IOException, InterruptedException { 85 | 86 | float sum = 0; 87 | float count = 0; 88 | commentLengths.clear(); 89 | result.setStdDev(0); 90 | 91 | // Iterate through all input values for this key 92 | for (IntWritable val : values) { 93 | 
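// Every comment length for this hour is buffered in memory so the exact median can be
// taken from the sorted list; SmarterMedianStdDevDriver (later in ch2) avoids this by
// keeping a count per length instead of the raw values.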
commentLengths.add((float) val.get()); 94 | sum += val.get(); 95 | ++count; 96 | } 97 | 98 | // sort commentLengths to calculate median 99 | Collections.sort(commentLengths); 100 | 101 | // if commentLengths is an even value, average middle two elements 102 | if (count % 2 == 0) { 103 | result.setMedian((commentLengths.get((int) count / 2 - 1) + commentLengths 104 | .get((int) count / 2)) / 2.0f); 105 | } else { 106 | // else, set median to middle value 107 | result.setMedian(commentLengths.get((int) count / 2)); 108 | } 109 | 110 | // calculate standard deviation 111 | float mean = sum / count; 112 | 113 | float sumOfSquares = 0.0f; 114 | for (Float f : commentLengths) { 115 | sumOfSquares += (f - mean) * (f - mean); 116 | } 117 | 118 | result.setStdDev((float) Math.sqrt(sumOfSquares / (count - 1))); 119 | 120 | context.write(key, result); 121 | } 122 | } 123 | 124 | public static void main(String[] args) throws Exception { 125 | Configuration conf = new Configuration(); 126 | String[] otherArgs = new GenericOptionsParser(conf, args) 127 | .getRemainingArgs(); 128 | if (otherArgs.length != 2) { 129 | System.err.println("Usage: MedianStdDevDriver "); 130 | System.exit(2); 131 | } 132 | Job job = new Job(conf, 133 | "StackOverflow Comment Length Median StdDev By Hour"); 134 | job.setJarByClass(MedianStdDevDriver.class); 135 | job.setMapperClass(SOMedianStdDevMapper.class); 136 | job.setReducerClass(SOMedianStdDevReducer.class); 137 | job.setMapOutputKeyClass(IntWritable.class); 138 | job.setMapOutputValueClass(IntWritable.class); 139 | job.setOutputKeyClass(IntWritable.class); 140 | job.setOutputValueClass(MedianStdDevTuple.class); 141 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 142 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 143 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 144 | } 145 | 146 | public static class MedianStdDevTuple implements Writable { 147 | private float median = 0; 148 | private float stddev = 0f; 149 | 150 | public float getMedian() { 151 | return median; 152 | } 153 | 154 | public void setMedian(float median) { 155 | this.median = median; 156 | } 157 | 158 | public float getStdDev() { 159 | return stddev; 160 | } 161 | 162 | public void setStdDev(float stddev) { 163 | this.stddev = stddev; 164 | } 165 | 166 | @Override 167 | public void readFields(DataInput in) throws IOException { 168 | median = in.readFloat(); 169 | stddev = in.readFloat(); 170 | } 171 | 172 | @Override 173 | public void write(DataOutput out) throws IOException { 174 | out.writeFloat(median); 175 | out.writeFloat(stddev); 176 | } 177 | 178 | @Override 179 | public String toString() { 180 | return median + "\t" + stddev; 181 | } 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/MinMaxCountDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.Map; 10 | 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.Writable; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.Mapper; 19 | import org.apache.hadoop.mapreduce.Reducer; 20 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class MinMaxCountDriver { 25 | 26 | public static class SOMinMaxCountMapper extends 27 | Mapper { 28 | // Our output key and value Writables 29 | private Text outUserId = new Text(); 30 | private MinMaxCountTuple outTuple = new MinMaxCountTuple(); 31 | 32 | // This object will format the creation date string into a Date object 33 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 34 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 35 | 36 | @Override 37 | public void map(Object key, Text value, Context context) 38 | throws IOException, InterruptedException { 39 | 40 | // Parse the input string into a nice map 41 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 42 | 43 | // Grab the "CreationDate" field since it is what we are finding 44 | // the min and max value of 45 | String strDate = parsed.get("CreationDate"); 46 | 47 | // Grab the “UserID” since it is what we are grouping by 48 | String userId = parsed.get("UserId"); 49 | 50 | // .get will return null if the key is not there 51 | if (strDate == null || userId == null) { 52 | // skip this record 53 | return; 54 | } 55 | 56 | try { 57 | // Parse the string into a Date object 58 | Date creationDate = frmt.parse(strDate); 59 | 60 | // Set the minimum and maximum date values to the creationDate 61 | outTuple.setMin(creationDate); 62 | outTuple.setMax(creationDate); 63 | 64 | // Set the comment count to 1 65 | outTuple.setCount(1); 66 | 67 | // Set our user ID as the output key 68 | outUserId.set(userId); 69 | 70 | // Write out the user ID with min max dates and count 71 | context.write(outUserId, outTuple); 72 | } catch (ParseException e) { 
73 | // An error occurred parsing the creation Date string 74 | // skip this record 75 | } 76 | } 77 | } 78 | 79 | public static class SOMinMaxCountReducer extends 80 | Reducer { 81 | private MinMaxCountTuple result = new MinMaxCountTuple(); 82 | 83 | @Override 84 | public void reduce(Text key, Iterable values, 85 | Context context) throws IOException, InterruptedException { 86 | 87 | // Initialize our result 88 | result.setMin(null); 89 | result.setMax(null); 90 | int sum = 0; 91 | 92 | // Iterate through all input values for this key 93 | for (MinMaxCountTuple val : values) { 94 | 95 | // If the value's min is less than the result's min 96 | // Set the result's min to value's 97 | if (result.getMin() == null 98 | || val.getMin().compareTo(result.getMin()) < 0) { 99 | result.setMin(val.getMin()); 100 | } 101 | 102 | // If the value's max is less than the result's max 103 | // Set the result's max to value's 104 | if (result.getMax() == null 105 | || val.getMax().compareTo(result.getMax()) > 0) { 106 | result.setMax(val.getMax()); 107 | } 108 | 109 | // Add to our sum the count for val 110 | sum += val.getCount(); 111 | } 112 | 113 | // Set our count to the number of input values 114 | result.setCount(sum); 115 | 116 | context.write(key, result); 117 | } 118 | } 119 | 120 | public static void main(String[] args) throws Exception { 121 | Configuration conf = new Configuration(); 122 | String[] otherArgs = new GenericOptionsParser(conf, args) 123 | .getRemainingArgs(); 124 | if (otherArgs.length != 2) { 125 | System.err.println("Usage: MinMaxCountDriver "); 126 | System.exit(2); 127 | } 128 | Job job = new Job(conf, "StackOverflow Comment Date Min Max Count"); 129 | job.setJarByClass(MinMaxCountDriver.class); 130 | job.setMapperClass(SOMinMaxCountMapper.class); 131 | job.setCombinerClass(SOMinMaxCountReducer.class); 132 | job.setReducerClass(SOMinMaxCountReducer.class); 133 | job.setOutputKeyClass(Text.class); 134 | job.setOutputValueClass(MinMaxCountTuple.class); 135 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 136 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 137 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 138 | } 139 | 140 | public static class MinMaxCountTuple implements Writable { 141 | private Date min = new Date(); 142 | private Date max = new Date(); 143 | private long count = 0; 144 | 145 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 146 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 147 | 148 | public Date getMin() { 149 | return min; 150 | } 151 | 152 | public void setMin(Date min) { 153 | this.min = min; 154 | } 155 | 156 | public Date getMax() { 157 | return max; 158 | } 159 | 160 | public void setMax(Date max) { 161 | this.max = max; 162 | } 163 | 164 | public long getCount() { 165 | return count; 166 | } 167 | 168 | public void setCount(long count) { 169 | this.count = count; 170 | } 171 | 172 | @Override 173 | public void readFields(DataInput in) throws IOException { 174 | min = new Date(in.readLong()); 175 | max = new Date(in.readLong()); 176 | count = in.readLong(); 177 | } 178 | 179 | @Override 180 | public void write(DataOutput out) throws IOException { 181 | out.writeLong(min.getTime()); 182 | out.writeLong(max.getTime()); 183 | out.writeLong(count); 184 | } 185 | 186 | @Override 187 | public String toString() { 188 | return frmt.format(min) + "\t" + frmt.format(max) + "\t" + count; 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/SmarterMedianStdDevDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Date; 9 | import java.util.Map; 10 | import java.util.TreeMap; 11 | import java.util.Map.Entry; 12 | 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.IntWritable; 18 | import org.apache.hadoop.io.LongWritable; 19 | import org.apache.hadoop.io.SortedMapWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.io.Writable; 22 | import org.apache.hadoop.io.WritableComparable; 23 | import org.apache.hadoop.mapreduce.Job; 24 | import org.apache.hadoop.mapreduce.Mapper; 25 | import org.apache.hadoop.mapreduce.Reducer; 26 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 27 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 28 | import org.apache.hadoop.util.GenericOptionsParser; 29 | 30 | public class SmarterMedianStdDevDriver { 31 | 32 | public static class SOMedianStdDevMapper extends 33 | Mapper { 34 | 35 | private IntWritable commentLength = new IntWritable(); 36 | private static final LongWritable ONE = new LongWritable(1); 37 | private IntWritable outHour = new IntWritable(); 38 | 39 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 40 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 41 | 42 | @SuppressWarnings("deprecation") 43 | @Override 44 | public void map(Object key, Text value, Context context) 45 | throws IOException, InterruptedException { 46 | 47 | // Parse the input string into a nice map 48 | Map parsed = MRDPUtils.transformXmlToMap(value 49 | .toString()); 50 | 51 | // Grab the "CreationDate" field, 52 | // since it is what we are grouping by 53 | String strDate = parsed.get("CreationDate"); 54 | 55 | // Grab the comment to find the length 56 | String text = parsed.get("Text"); 57 | 58 | // .get will return null if the key is not there 59 | if 
(strDate == null || text == null) { 60 | // skip this record 61 | return; 62 | } 63 | 64 | try { 65 | // get the hour this comment was posted in 66 | Date creationDate = frmt.parse(strDate); 67 | outHour.set(creationDate.getHours()); 68 | 69 | commentLength.set(text.length()); 70 | SortedMapWritable outCommentLength = new SortedMapWritable(); 71 | outCommentLength.put(commentLength, ONE); 72 | 73 | // write out the user ID with min max dates and count 74 | context.write(outHour, outCommentLength); 75 | 76 | } catch (ParseException e) { 77 | System.err.println(e.getMessage()); 78 | return; 79 | } 80 | } 81 | } 82 | 83 | public static class SOMedianStdDevCombiner 84 | extends 85 | Reducer { 86 | 87 | @SuppressWarnings("rawtypes") 88 | protected void reduce(IntWritable key, 89 | Iterable values, Context context) 90 | throws IOException, InterruptedException { 91 | 92 | SortedMapWritable outValue = new SortedMapWritable(); 93 | 94 | for (SortedMapWritable v : values) { 95 | for (Entry entry : v.entrySet()) { 96 | LongWritable count = (LongWritable) outValue.get(entry 97 | .getKey()); 98 | 99 | if (count != null) { 100 | count.set(count.get() 101 | + ((LongWritable) entry.getValue()).get()); 102 | } else { 103 | outValue.put(entry.getKey(), new LongWritable( 104 | ((LongWritable) entry.getValue()).get())); 105 | } 106 | } 107 | } 108 | 109 | context.write(key, outValue); 110 | } 111 | } 112 | 113 | public static class SOMedianStdDevReducer 114 | extends 115 | Reducer { 116 | private MedianStdDevTuple result = new MedianStdDevTuple(); 117 | private TreeMap commentLengthCounts = new TreeMap(); 118 | 119 | @SuppressWarnings("rawtypes") 120 | @Override 121 | public void reduce(IntWritable key, Iterable values, 122 | Context context) throws IOException, InterruptedException { 123 | 124 | float sum = 0; 125 | long totalComments = 0; 126 | commentLengthCounts.clear(); 127 | result.setMedian(0); 128 | result.setStdDev(0); 129 | 130 | for (SortedMapWritable v : values) { 131 | for (Entry entry : v.entrySet()) { 132 | int length = ((IntWritable) entry.getKey()).get(); 133 | long count = ((LongWritable) entry.getValue()).get(); 134 | 135 | totalComments += count; 136 | sum += length * count; 137 | 138 | Long storedCount = commentLengthCounts.get(length); 139 | if (storedCount == null) { 140 | commentLengthCounts.put(length, count); 141 | } else { 142 | commentLengthCounts.put(length, storedCount + count); 143 | } 144 | } 145 | } 146 | 147 | long medianIndex = totalComments / 2L; 148 | long previousComments = 0; 149 | long comments = 0; 150 | int prevKey = 0; 151 | for (Entry entry : commentLengthCounts.entrySet()) { 152 | comments = previousComments + entry.getValue(); 153 | if (previousComments <= medianIndex && medianIndex < comments) { 154 | if (totalComments % 2 == 0) { 155 | if (previousComments == medianIndex) { 156 | result.setMedian((float) (entry.getKey() + prevKey) / 2.0f); 157 | } else { 158 | result.setMedian(entry.getKey()); 159 | } 160 | } else { 161 | result.setMedian(entry.getKey()); 162 | } 163 | break; 164 | } 165 | previousComments = comments; 166 | prevKey = entry.getKey(); 167 | } 168 | 169 | // calculate standard deviation 170 | float mean = sum / totalComments; 171 | 172 | float sumOfSquares = 0.0f; 173 | for (Entry entry : commentLengthCounts.entrySet()) { 174 | sumOfSquares += (entry.getKey() - mean) 175 | * (entry.getKey() - mean) * entry.getValue(); 176 | } 177 | 178 | result.setStdDev((float) Math.sqrt(sumOfSquares 179 | / (totalComments - 1))); 180 | 181 | context.write(key, 
result); 182 | } 183 | } 184 | 185 | public static void main(String[] args) throws Exception { 186 | Configuration conf = new Configuration(); 187 | String[] otherArgs = new GenericOptionsParser(conf, args) 188 | .getRemainingArgs(); 189 | if (otherArgs.length != 2) { 190 | System.err.println("Usage: MedianStdDevDriver "); 191 | System.exit(2); 192 | } 193 | Job job = new Job(conf, 194 | "StackOverflow Comment Length Median StdDev By Hour"); 195 | job.setJarByClass(SmarterMedianStdDevDriver.class); 196 | job.setMapperClass(SOMedianStdDevMapper.class); 197 | job.setCombinerClass(SOMedianStdDevCombiner.class); 198 | job.setReducerClass(SOMedianStdDevReducer.class); 199 | job.setMapOutputKeyClass(IntWritable.class); 200 | job.setMapOutputValueClass(SortedMapWritable.class); 201 | job.setOutputKeyClass(IntWritable.class); 202 | job.setOutputValueClass(MedianStdDevTuple.class); 203 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 204 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 205 | System.exit(job.waitForCompletion(true) ? 0 : 1); 206 | } 207 | 208 | public static class MedianStdDevTuple implements Writable { 209 | private float median = 0; 210 | private float stddev = 0f; 211 | 212 | public float getMedian() { 213 | return median; 214 | } 215 | 216 | public void setMedian(float median) { 217 | this.median = median; 218 | } 219 | 220 | public float getStdDev() { 221 | return stddev; 222 | } 223 | 224 | public void setStdDev(float stddev) { 225 | this.stddev = stddev; 226 | } 227 | 228 | @Override 229 | public void readFields(DataInput in) throws IOException { 230 | median = in.readFloat(); 231 | stddev = in.readFloat(); 232 | } 233 | 234 | @Override 235 | public void write(DataOutput out) throws IOException { 236 | out.writeFloat(median); 237 | out.writeFloat(stddev); 238 | } 239 | 240 | @Override 241 | public String toString() { 242 | return median + "\t" + stddev; 243 | } 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch2/WikipediaIndex.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch2; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.util.GenericOptionsParser; 17 | 18 | import org.apache.commons.lang.StringEscapeUtils; 19 | 20 | public class WikipediaIndex { 21 | 22 | public static String getWikipediaURL(String text) { 23 | 24 | int idx = text.indexOf("\"http://en.wikipedia.org"); 25 | if (idx == -1) { 26 | return null; 27 | } 28 | int idx_end = text.indexOf('"', idx + 1); 29 | 30 | if (idx_end == -1) { 31 | return null; 32 | } 33 | 34 | int idx_hash = text.indexOf('#', idx + 1); 35 | 36 | if (idx_hash != -1 && idx_hash < idx_end) { 37 | return text.substring(idx + 1, idx_hash); 38 | } else { 39 | return text.substring(idx + 1, idx_end); 40 | } 41 | 42 | } 43 | 44 | public static class SOWikipediaExtractor extends 45 | Mapper { 46 | 47 | private Text link = new Text(); 48 | private Text outkey = new Text(); 49 | 50 | public void map(Object key, 
Text value, Context context)
51 | throws IOException, InterruptedException {
52 |
53 | // Parse the input string into a nice map
54 | Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
55 | .toString());
56 |
57 | // Grab the necessary XML attributes
58 | String txt = parsed.get("Body");
59 | String posttype = parsed.get("PostTypeId");
60 | String row_id = parsed.get("Id");
61 |
62 | // if the body is null, or the post is a question (1), skip
63 | if (txt == null || (posttype != null && posttype.equals("1"))) {
64 | return;
65 | }
66 |
67 | // Unescape the HTML because the SO data is escaped.
68 | txt = StringEscapeUtils.unescapeHtml(txt.toLowerCase());
69 |
70 | String wikiUrl = getWikipediaURL(txt); if (wikiUrl == null) { return; } link.set(wikiUrl); // skip posts with no Wikipedia link; Text.set(null) would throw
71 | outkey.set(row_id);
72 | context.write(link, outkey);
73 | }
74 | }
75 |
76 | public static class Concatenator extends Reducer<Text, Text, Text, Text> {
77 | private Text result = new Text();
78 |
79 | public void reduce(Text key, Iterable<Text> values, Context context)
80 | throws IOException, InterruptedException {
81 |
82 | StringBuilder sb = new StringBuilder();
83 | for (Text id : values) {
84 | sb.append(id.toString() + " ");
85 | }
86 |
87 | result.set(sb.substring(0, sb.length() - 1).toString());
88 | context.write(key, result);
89 | }
90 | }
91 |
92 | public static void main(String[] args) throws Exception {
93 | Configuration conf = new Configuration();
94 | String[] otherArgs = new GenericOptionsParser(conf, args)
95 | .getRemainingArgs();
96 | if (otherArgs.length != 2) {
97 | System.err.println("Usage: WikipediaIndex ");
98 | System.exit(2);
99 | }
100 | Job job = new Job(conf, "StackOverflow Wikipedia URL Inverted Index");
101 | job.setJarByClass(WikipediaIndex.class);
102 | job.setMapperClass(SOWikipediaExtractor.class);
103 | job.setCombinerClass(Concatenator.class);
104 | job.setReducerClass(Concatenator.class);
105 | job.setOutputKeyClass(Text.class);
106 | job.setOutputValueClass(Text.class);
107 | FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
108 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
109 | System.exit(job.waitForCompletion(true) ?
0 : 1); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/BloomFilteringDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.net.URI; 7 | import java.util.Map; 8 | import java.util.StringTokenizer; 9 | 10 | import mrdp.utils.MRDPUtils; 11 | 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.filecache.DistributedCache; 14 | import org.apache.hadoop.fs.FileSystem; 15 | import org.apache.hadoop.fs.Path; 16 | import org.apache.hadoop.io.NullWritable; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | import org.apache.hadoop.util.bloom.BloomFilter; 24 | import org.apache.hadoop.util.bloom.Key; 25 | 26 | public class BloomFilteringDriver { 27 | 28 | public static class BloomFilteringMapper extends 29 | Mapper { 30 | 31 | private BloomFilter filter = new BloomFilter(); 32 | 33 | @Override 34 | protected void setup(Context context) throws IOException, 35 | InterruptedException { 36 | URI[] files = DistributedCache.getCacheFiles(context 37 | .getConfiguration()); 38 | 39 | // if the files in the distributed cache are set 40 | if (files != null && files.length == 1) { 41 | System.out.println("Reading Bloom filter from: " 42 | + files[0].getPath()); 43 | 44 | // Open local file for read. 45 | DataInputStream strm = new DataInputStream(new FileInputStream( 46 | files[0].getPath())); 47 | 48 | // Read into our Bloom filter. 49 | filter.readFields(strm); 50 | strm.close(); 51 | } else { 52 | throw new IOException( 53 | "Bloom filter file not set in the DistributedCache."); 54 | } 55 | } 56 | 57 | @Override 58 | public void map(Object key, Text value, Context context) 59 | throws IOException, InterruptedException { 60 | 61 | // Parse the input into a nice map. 
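// The Bloom filter loaded in setup() can return false positives but never false
// negatives, so comments containing a hot-list word are always kept, along with a small
// number of comments that match no hot-list word at all.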
62 | Map parsed = MRDPUtils.transformXmlToMap(value 63 | .toString()); 64 | 65 | // Get the value for the comment 66 | String comment = parsed.get("Text"); 67 | 68 | // If it is null, skip this record 69 | if (comment == null) { 70 | return; 71 | } 72 | 73 | StringTokenizer tokenizer = new StringTokenizer(comment); 74 | // For each word in the comment 75 | while (tokenizer.hasMoreTokens()) { 76 | 77 | // Clean up the words 78 | String cleanWord = tokenizer.nextToken().replaceAll("'", "") 79 | .replaceAll("[^a-zA-Z]", " "); 80 | 81 | // If the word is in the filter, output it and break 82 | if (cleanWord.length() > 0 83 | && filter.membershipTest(new Key(cleanWord.getBytes()))) { 84 | context.write(value, NullWritable.get()); 85 | break; 86 | } 87 | } 88 | } 89 | } 90 | 91 | public static void main(String[] args) throws Exception { 92 | Configuration conf = new Configuration(); 93 | String[] otherArgs = new GenericOptionsParser(conf, args) 94 | .getRemainingArgs(); 95 | if (otherArgs.length != 3) { 96 | System.err.println("Usage: BloomFiltering "); 97 | System.exit(1); 98 | } 99 | 100 | FileSystem.get(conf).delete(new Path(otherArgs[2]), true); 101 | 102 | Job job = new Job(conf, "StackOverflow Bloom Filtering"); 103 | job.setJarByClass(BloomFilteringDriver.class); 104 | job.setMapperClass(BloomFilteringMapper.class); 105 | job.setNumReduceTasks(0); 106 | job.setOutputKeyClass(Text.class); 107 | job.setOutputValueClass(NullWritable.class); 108 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 109 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 110 | 111 | DistributedCache.addCacheFile( 112 | FileSystem.get(conf).makeQualified(new Path(otherArgs[1])) 113 | .toUri(), job.getConfiguration()); 114 | 115 | System.exit(job.waitForCompletion(true) ? 0 : 1); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/DistinctUserDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.NullWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.util.GenericOptionsParser; 18 | 19 | public class DistinctUserDriver { 20 | 21 | public static class SODistinctUserMapper extends 22 | Mapper { 23 | 24 | private Text outUserId = new Text(); 25 | 26 | @Override 27 | public void map(Object key, Text value, Context context) 28 | throws IOException, InterruptedException { 29 | 30 | // Parse the input into a nice map. 
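// Distinct pattern: the mapper emits every user ID with a NullWritable value, and the
// combiner/reducer collapse duplicates so each ID is written exactly once.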
31 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 32 | 33 | // Get the value for the UserId attribute 34 | String userId = parsed.get("UserId"); 35 | 36 | // If it is null, skip this record 37 | if (userId == null) { 38 | return; 39 | } 40 | 41 | // Otherwise, set our output key to the user's id 42 | outUserId.set(userId); 43 | 44 | // Write the user's id with a null value 45 | context.write(outUserId, NullWritable.get()); 46 | } 47 | } 48 | 49 | public static class SODistinctUserReducer extends 50 | Reducer { 51 | 52 | @Override 53 | public void reduce(Text key, Iterable values, 54 | Context context) throws IOException, InterruptedException { 55 | 56 | // Write the user's id with a null value 57 | context.write(key, NullWritable.get()); 58 | } 59 | } 60 | 61 | public static void main(String[] args) throws Exception { 62 | Configuration conf = new Configuration(); 63 | String[] otherArgs = new GenericOptionsParser(conf, args) 64 | .getRemainingArgs(); 65 | if (otherArgs.length != 2) { 66 | System.err.println("Usage: UniqueUserCount "); 67 | System.exit(2); 68 | } 69 | 70 | Job job = new Job(conf, "StackOverflow Distinct Users"); 71 | job.setJarByClass(DistinctUserDriver.class); 72 | job.setMapperClass(SODistinctUserMapper.class); 73 | job.setCombinerClass(SODistinctUserReducer.class); 74 | job.setReducerClass(SODistinctUserReducer.class); 75 | job.setOutputKeyClass(Text.class); 76 | job.setOutputValueClass(NullWritable.class); 77 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 78 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 79 | 80 | System.exit(job.waitForCompletion(true) ? 0 : 1); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/DistributedGrep.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.*; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.*; 7 | import org.apache.hadoop.io.*; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.util.GenericOptionsParser; 13 | 14 | public class DistributedGrep { 15 | 16 | public static class GrepMapper extends 17 | Mapper { 18 | 19 | public void map(Object key, Text value, Context context) 20 | throws IOException, InterruptedException { 21 | 22 | String txt = value.toString(); 23 | String mapRegex = context.getConfiguration().get("mapregex"); 24 | 25 | if (txt.matches(mapRegex)) { 26 | context.write(NullWritable.get(), value); 27 | } 28 | } 29 | } 30 | 31 | public static void main(String[] args) throws Exception { 32 | Configuration conf = new Configuration(); 33 | String[] otherArgs = new GenericOptionsParser(conf, args) 34 | .getRemainingArgs(); 35 | if (otherArgs.length != 3) { 36 | System.err.println("Usage: DistributedGrep "); 37 | System.exit(2); 38 | } 39 | conf.set("mapregex", otherArgs[0]); 40 | 41 | Job job = new Job(conf, "Distributed Grep"); 42 | job.setJarByClass(DistributedGrep.class); 43 | job.setMapperClass(GrepMapper.class); 44 | job.setOutputKeyClass(NullWritable.class); 45 | job.setOutputValueClass(Text.class); 46 | job.setNumReduceTasks(0); // Set number of reducers to zero 47 | FileInputFormat.addInputPath(job, new Path(otherArgs[1])); 48 | FileOutputFormat.setOutputPath(job, new 
Path(otherArgs[2])); 49 | System.exit(job.waitForCompletion(true) ? 0 : 1); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/QueryBloomFiltering.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.net.URI; 7 | import java.util.Map; 8 | 9 | import mrdp.utils.MRDPUtils; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.filecache.DistributedCache; 13 | import org.apache.hadoop.fs.FileSystem; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.hbase.HBaseConfiguration; 16 | import org.apache.hadoop.hbase.client.Get; 17 | import org.apache.hadoop.hbase.client.HTable; 18 | import org.apache.hadoop.hbase.client.Result; 19 | import org.apache.hadoop.io.NullWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.mapreduce.Job; 22 | import org.apache.hadoop.mapreduce.Mapper; 23 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 25 | import org.apache.hadoop.util.GenericOptionsParser; 26 | import org.apache.hadoop.util.bloom.BloomFilter; 27 | import org.apache.hadoop.util.bloom.Key; 28 | 29 | public class QueryBloomFiltering { 30 | 31 | public static class BloomFilteringMapper extends 32 | Mapper { 33 | 34 | private BloomFilter filter = new BloomFilter(); 35 | private HTable table = null; 36 | 37 | @Override 38 | protected void setup(Context context) throws IOException, 39 | InterruptedException { 40 | URI[] files = DistributedCache.getCacheFiles(context 41 | .getConfiguration()); 42 | 43 | // if the files in the distributed cache are set 44 | if (files != null && files.length == 1) { 45 | System.out.println("Reading Bloom filter from: " 46 | + files[0].getPath()); 47 | 48 | // Open local file for read. 49 | DataInputStream strm = new DataInputStream(new FileInputStream( 50 | files[0].getPath())); 51 | 52 | // Read into our Bloom filter. 53 | filter.readFields(strm); 54 | strm.close(); 55 | } else { 56 | throw new IOException( 57 | "Bloom filter file not set in the DistributedCache."); 58 | } 59 | 60 | // Get HBase table of user info 61 | Configuration hconf = HBaseConfiguration.create(); 62 | table = new HTable(hconf, "user_table"); 63 | } 64 | 65 | @Override 66 | public void map(Object key, Text value, Context context) 67 | throws IOException, InterruptedException { 68 | 69 | // Parse the input into a nice map. 
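// The Bloom filter is only a cheap pre-screen here: a false positive merely triggers an
// extra HBase lookup, and the reputation check below remains the authoritative filter.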
70 | Map parsed = MRDPUtils.transformXmlToMap(value 71 | .toString()); 72 | 73 | // Get the value for the comment 74 | String userid = parsed.get("UserId"); 75 | 76 | // If it is null, skip this record 77 | if (userid == null) { 78 | return; 79 | } 80 | 81 | // If this user ID is in the set 82 | if (filter.membershipTest(new Key(userid.getBytes()))) { 83 | // Get the reputation from the HBase table 84 | Result r = table.get(new Get(userid.getBytes())); 85 | int reputation = Integer.parseInt(new String(r.getValue( 86 | "attr".getBytes(), "Reputation".getBytes()))); 87 | // If the reputation is at least 1,500, 88 | // write the record to the file system 89 | if (reputation >= 1500) { 90 | context.write(value, NullWritable.get()); 91 | } 92 | } 93 | } 94 | } 95 | 96 | public static void main(String[] args) throws Exception { 97 | Configuration conf = new Configuration(); 98 | String[] otherArgs = new GenericOptionsParser(conf, args) 99 | .getRemainingArgs(); 100 | if (otherArgs.length != 3) { 101 | System.err.println("Usage: BloomFiltering "); 102 | System.exit(1); 103 | } 104 | 105 | FileSystem.get(conf).delete(new Path(otherArgs[2]), true); 106 | 107 | Job job = new Job(conf, "StackOverflow Bloom Filtering"); 108 | job.setJarByClass(QueryBloomFiltering.class); 109 | job.setMapperClass(BloomFilteringMapper.class); 110 | job.setNumReduceTasks(0); 111 | job.setOutputKeyClass(Text.class); 112 | job.setOutputValueClass(NullWritable.class); 113 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 114 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 115 | 116 | DistributedCache.addCacheFile( 117 | FileSystem.get(conf).makeQualified(new Path(otherArgs[1])) 118 | .toUri(), job.getConfiguration()); 119 | 120 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/SimpleRandomSampling.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.*; 4 | import java.util.Random; 5 | 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.*; 8 | import org.apache.hadoop.io.*; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.util.GenericOptionsParser; 14 | 15 | public class SimpleRandomSampling { 16 | 17 | public static class SRSMapper extends 18 | Mapper { 19 | 20 | private Random rands = new Random(); 21 | private Double percentage; 22 | 23 | @Override 24 | protected void setup(Context context) throws IOException, 25 | InterruptedException { 26 | // retrieve the percentage that is passed in via the configuration 27 | // like this: conf.set("filter_percentage", .5); for .5% 28 | String strPercentage = context.getConfiguration().get( 29 | "filter_percentage"); 30 | 31 | percentage = Double.parseDouble(strPercentage) / 100.0; 32 | } 33 | 34 | @Override 35 | public void map(Object key, Text value, Context context) 36 | throws IOException, InterruptedException { 37 | 38 | if (rands.nextDouble() < percentage) { 39 | context.write(NullWritable.get(), value); 40 | } 41 | } 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | Configuration conf = new Configuration(); 46 | String[] otherArgs = new GenericOptionsParser(conf, args) 47 | .getRemainingArgs(); 48 | if (otherArgs.length != 3) { 49 | System.err.println("Usage: SRS "); 50 | System.exit(2); 51 | } 52 | conf.set("filter_percentage", otherArgs[0]); 53 | 54 | Job job = new Job(conf, "SRS"); 55 | job.setJarByClass(SimpleRandomSampling.class); 56 | job.setMapperClass(SRSMapper.class); 57 | job.setOutputKeyClass(NullWritable.class); 58 | job.setOutputValueClass(Text.class); 59 | job.setNumReduceTasks(0); // Set number of reducers to zero 60 | FileInputFormat.addInputPath(job, new Path(otherArgs[1])); 61 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 62 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/TopTenDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.TreeMap; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | public class TopTenDriver { 21 | 22 | public static class SOTopTenMapper extends 23 | Mapper { 24 | // Our output key and value Writables 25 | private TreeMap repToRecordMap = new TreeMap(); 26 | 27 | @Override 28 | public void map(Object key, Text value, Context context) 29 | throws IOException, InterruptedException { 30 | // Parse the input string into a nice map 31 | Map parsed = MRDPUtils.transformXmlToMap(value 32 | .toString()); 33 | if (parsed == null) { 34 | return; 35 | } 36 | 37 | String userId = parsed.get("Id"); 38 | String reputation = parsed.get("Reputation"); 39 | 40 | // Get will return null if the key is not there 41 | if (userId == null || reputation == null) { 42 | // skip this record 43 | return; 44 | } 45 | 46 | repToRecordMap.put(Integer.parseInt(reputation), new Text(value)); 47 | 48 | if (repToRecordMap.size() > 10) { 49 | repToRecordMap.remove(repToRecordMap.firstKey()); 50 | } 51 | } 52 | 53 | @Override 54 | protected void cleanup(Context context) throws IOException, 55 | InterruptedException { 56 | for (Text t : repToRecordMap.values()) { 57 | context.write(NullWritable.get(), t); 58 | } 59 | } 60 | } 61 | 62 | public static class SOTopTenReducer extends 63 | Reducer { 64 | 65 | private TreeMap repToRecordMap = new TreeMap(); 66 | 67 | @Override 68 | public void reduce(NullWritable key, Iterable values, 69 | Context context) throws IOException, InterruptedException { 70 | for (Text value : values) { 71 | Map parsed = MRDPUtils.transformXmlToMap(value 72 | .toString()); 73 | 74 | repToRecordMap.put(Integer.parseInt(parsed.get("Reputation")), 75 | new Text(value)); 76 | 77 | if (repToRecordMap.size() > 10) { 78 | repToRecordMap.remove(repToRecordMap.firstKey()); 79 | } 80 | } 81 | 82 | for (Text t : repToRecordMap.descendingMap().values()) { 83 | context.write(NullWritable.get(), t); 84 | } 85 | } 86 | } 87 | 88 | public static void main(String[] args) throws Exception { 89 | Configuration conf = new Configuration(); 90 | String[] otherArgs = new GenericOptionsParser(conf, args) 91 | .getRemainingArgs(); 92 | if (otherArgs.length != 2) { 93 | System.err.println("Usage: TopTenDriver "); 94 | System.exit(2); 95 | } 96 | 97 | Job job = new Job(conf, "Top Ten Users by Reputation"); 98 | job.setJarByClass(TopTenDriver.class); 99 | job.setMapperClass(SOTopTenMapper.class); 100 | job.setReducerClass(SOTopTenReducer.class); 101 | job.setNumReduceTasks(1); 102 | job.setOutputKeyClass(NullWritable.class); 103 | job.setOutputValueClass(Text.class); 104 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 105 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 106 
| System.exit(job.waitForCompletion(true) ? 0 : 1); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch3/UniqueUserCount.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch3; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.IntWritable; 12 | import org.apache.hadoop.io.NullWritable; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.mapreduce.Job; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | import org.apache.hadoop.mapreduce.Reducer; 17 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 21 | import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class UniqueUserCount { 25 | 26 | public static class SODistinctUserMapper extends 27 | Mapper { 28 | 29 | private Text outUserId = new Text(); 30 | 31 | @Override 32 | public void map(Object key, Text value, Context context) 33 | throws IOException, InterruptedException { 34 | 35 | Map parsed = MRDPUtils.transformXmlToMap(value 36 | .toString()); 37 | String userId = parsed.get("UserId"); 38 | if (userId == null) { 39 | return; 40 | } 41 | 42 | outUserId.set(userId); 43 | context.write(outUserId, NullWritable.get()); 44 | } 45 | } 46 | 47 | public static class SODistinctUserReducer extends 48 | Reducer { 49 | 50 | @Override 51 | public void reduce(Text key, Iterable values, 52 | Context context) throws IOException, InterruptedException { 53 | context.write(key, NullWritable.get()); 54 | } 55 | } 56 | 57 | public static class SOUserCounterMapper extends 58 | Mapper { 59 | 60 | private static final Text DUMMY = new Text("Total:"); 61 | private static final IntWritable ONE = new IntWritable(1); 62 | 63 | @Override 64 | public void map(Text key, NullWritable value, Context context) 65 | throws IOException, InterruptedException { 66 | 67 | context.write(DUMMY, ONE); 68 | } 69 | } 70 | 71 | public static void main(String[] args) throws Exception { 72 | Configuration conf = new Configuration(); 73 | String[] otherArgs = new GenericOptionsParser(conf, args) 74 | .getRemainingArgs(); 75 | if (otherArgs.length != 2) { 76 | System.err.println("Usage: UniqueUserCount "); 77 | System.exit(2); 78 | } 79 | 80 | Path tmpout = new Path(otherArgs[1] + "_tmp"); 81 | FileSystem.get(new Configuration()).delete(tmpout, true); 82 | Path finalout = new Path(otherArgs[1]); 83 | Job job = new Job(conf, "StackOverflow Unique User Count"); 84 | job.setJarByClass(UniqueUserCount.class); 85 | job.setMapperClass(SODistinctUserMapper.class); 86 | job.setCombinerClass(SODistinctUserReducer.class); 87 | job.setReducerClass(SODistinctUserReducer.class); 88 | job.setOutputKeyClass(Text.class); 89 | job.setOutputValueClass(NullWritable.class); 90 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 91 | job.setNumReduceTasks(1); 92 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 93 | FileOutputFormat.setOutputPath(job, tmpout); 94 | 95 | boolean exitCode = 
job.waitForCompletion(true); 96 | if (exitCode) { 97 | job = new Job(conf, "Stack Overflow Unique User Count"); 98 | job.setJarByClass(UniqueUserCount.class); 99 | job.setMapperClass(SOUserCounterMapper.class); 100 | job.setCombinerClass(IntSumReducer.class); 101 | job.setReducerClass(IntSumReducer.class); 102 | job.setOutputKeyClass(Text.class); 103 | job.setOutputValueClass(IntWritable.class); 104 | job.setInputFormatClass(SequenceFileInputFormat.class); 105 | FileInputFormat.addInputPath(job, tmpout); 106 | FileOutputFormat.setOutputPath(job, finalout); 107 | exitCode = job.waitForCompletion(true); 108 | } 109 | 110 | System.exit(exitCode ? 0 : 1); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/AnonymizeDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | import java.util.Map.Entry; 6 | import java.util.Random; 7 | 8 | import mrdp.utils.MRDPUtils; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.NullWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.Mapper; 17 | import org.apache.hadoop.mapreduce.Reducer; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | import org.apache.hadoop.util.GenericOptionsParser; 21 | 22 | public class AnonymizeDriver { 23 | 24 | public static class AnonymizeMapper extends 25 | Mapper<Object, Text, IntWritable, Text> { 26 | 27 | private IntWritable outkey = new IntWritable(); 28 | private Random rndm = new Random(); 29 | private Text outvalue = new Text(); 30 | 31 | @Override 32 | public void map(Object key, Text value, Context context) 33 | throws IOException, InterruptedException { 34 | 35 | // Parse the input string into a nice map 36 | Map<String, String> parsed = MRDPUtils.transformXmlToMap(value 37 | .toString()); 38 | 39 | if (parsed.size() > 0) { 40 | StringBuilder bldr = new StringBuilder(); 41 | bldr.append("<row "); 42 | for (Entry<String, String> entry : parsed.entrySet()) { 43 | 44 | if (entry.getKey().equals("UserId") 45 | || entry.getKey().equals("Id")) { 46 | // ignore these fields 47 | } else if (entry.getKey().equals("CreationDate")) { 48 | // Strip out the time, anything after the 'T' in the 49 | // value 50 | bldr.append(entry.getKey() 51 | + "=\"" 52 | + entry.getValue().substring(0, 53 | entry.getValue().indexOf('T')) + "\" "); 54 | } else { 55 | // Otherwise, output this.
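// (Hypothetical illustration of the end result: an input row like
// <row Id="1" UserId="2" CreationDate="2010-01-01T12:00:00.000" Location="NY" />
// is re-emitted as <row CreationDate="2010-01-01" Location="NY" > under a
// random integer key, so the shuffle also scrambles record order.)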
56 | bldr.append(entry.getKey() + "=\"" + entry.getValue() 57 | + "\" "); 58 | } 59 | 60 | } 61 | bldr.append(">"); 62 | outkey.set(rndm.nextInt()); 63 | outvalue.set(bldr.toString()); 64 | context.write(outkey, outvalue); 65 | } 66 | } 67 | } 68 | 69 | public static class ValueReducer extends 70 | Reducer { 71 | @Override 72 | protected void reduce(IntWritable key, Iterable values, 73 | Context context) throws IOException, InterruptedException { 74 | 75 | for (Text t : values) { 76 | context.write(t, NullWritable.get()); 77 | } 78 | } 79 | } 80 | 81 | public static void main(String[] args) throws Exception { 82 | Configuration conf = new Configuration(); 83 | String[] otherArgs = new GenericOptionsParser(conf, args) 84 | .getRemainingArgs(); 85 | if (otherArgs.length != 2) { 86 | System.err.println("Usage: Anonymize "); 87 | System.exit(1); 88 | } 89 | 90 | // Configure the join type 91 | Job job = new Job(conf, "Anonymize"); 92 | job.setJarByClass(AnonymizeDriver.class); 93 | 94 | job.setMapperClass(AnonymizeMapper.class); 95 | job.setReducerClass(ValueReducer.class); 96 | job.setNumReduceTasks(10); 97 | 98 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 99 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 100 | 101 | job.setOutputKeyClass(IntWritable.class); 102 | job.setOutputValueClass(Text.class); 103 | 104 | System.exit(job.waitForCompletion(true) ? 0 : 3); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/Binning.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.commons.lang.StringEscapeUtils; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 18 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 19 | import org.apache.hadoop.util.GenericOptionsParser; 20 | 21 | public class Binning { 22 | 23 | public static class BinningMapper extends 24 | Mapper { 25 | 26 | private MultipleOutputs mos = null; 27 | 28 | @SuppressWarnings({ "unchecked", "rawtypes" }) 29 | @Override 30 | protected void setup(Context context) { 31 | // Create a new MultipleOutputs using the context object 32 | mos = new MultipleOutputs(context); 33 | } 34 | 35 | @Override 36 | protected void map(Object key, Text value, Context context) 37 | throws IOException, InterruptedException { 38 | 39 | // Parse the input string into a nice map 40 | Map parsed = MRDPUtils.transformXmlToMap(value 41 | .toString()); 42 | 43 | String rawtags = parsed.get("Tags"); 44 | if (rawtags == null) { 45 | return; 46 | } 47 | 48 | // Tags are delimited by ><. i.e. 
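// (Illustrative values, not from the data set: an escaped Tags attribute such as
// "&lt;hadoop&gt;&lt;pig&gt;" unescapes to "<hadoop><pig>" and splits on "><"
// into {"<hadoop", "pig>"}, which is why leftover brackets are stripped from
// each token below.)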
49 | String[] tagTokens = StringEscapeUtils.unescapeHtml(rawtags).split( 50 | "><"); 51 | 52 | // For each tag 53 | for (String tag : tagTokens) { 54 | // Remove any > or < from the token 55 | String groomed = tag.replaceAll(">|<", "").toLowerCase(); 56 | 57 | // If this tag is one of the following, write to the named bin 58 | if (groomed.equalsIgnoreCase("hadoop")) { 59 | mos.write("bins", value, NullWritable.get(), "hadoop-tag"); 60 | } 61 | 62 | if (groomed.equalsIgnoreCase("pig")) { 63 | mos.write("bins", value, NullWritable.get(), "pig-tag"); 64 | } 65 | 66 | if (groomed.equalsIgnoreCase("hive")) { 67 | mos.write("bins", value, NullWritable.get(), "hive-tag"); 68 | } 69 | 70 | if (groomed.equalsIgnoreCase("hbase")) { 71 | mos.write("bins", value, NullWritable.get(), "hbase-tag"); 72 | } 73 | } 74 | 75 | // Get the body of the post 76 | String post = parsed.get("Body"); 77 | 78 | if (post == null) { 79 | return; 80 | } 81 | 82 | // If the post contains the word "hadoop", write it to its own bin 83 | if (post.toLowerCase().contains("hadoop")) { 84 | mos.write("bins", value, NullWritable.get(), "hadoop-post"); 85 | } 86 | } 87 | 88 | @Override 89 | protected void cleanup(Context context) throws IOException, 90 | InterruptedException { 91 | // Close multiple outputs! 92 | mos.close(); 93 | } 94 | } 95 | 96 | public static void main(String[] args) throws Exception { 97 | Configuration conf = new Configuration(); 98 | String[] otherArgs = new GenericOptionsParser(conf, args) 99 | .getRemainingArgs(); 100 | if (otherArgs.length != 2) { 101 | System.err.println("Usage: Binning "); 102 | System.exit(1); 103 | } 104 | 105 | Job job = new Job(conf, "Binning"); 106 | job.setJarByClass(Binning.class); 107 | job.setMapperClass(BinningMapper.class); 108 | job.setNumReduceTasks(0); 109 | 110 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 111 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 112 | 113 | // Configure the MultipleOutputs by adding an output called "bins" 114 | // With the proper output format and mapper key/value pairs 115 | MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class, 116 | Text.class, NullWritable.class); 117 | 118 | // Enable the counters for the job 119 | // If there is a significant number of different named outputs, this 120 | // should be disabled 121 | MultipleOutputs.setCountersEnabled(job, true); 122 | 123 | System.exit(job.waitForCompletion(true) ? 
0 : 2); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/PartitionedUsers.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Calendar; 7 | import java.util.Map; 8 | 9 | import mrdp.utils.MRDPUtils; 10 | 11 | import org.apache.hadoop.conf.Configurable; 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.IntWritable; 15 | import org.apache.hadoop.io.NullWritable; 16 | import org.apache.hadoop.io.Text; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.Mapper; 19 | import org.apache.hadoop.mapreduce.Partitioner; 20 | import org.apache.hadoop.mapreduce.Reducer; 21 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 22 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 23 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 24 | import org.apache.hadoop.util.GenericOptionsParser; 25 | 26 | public class PartitionedUsers { 27 | 28 | public static class LastAccessDateMapper extends 29 | Mapper { 30 | 31 | // This object will format the creation date string into a Date object 32 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 33 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 34 | 35 | private IntWritable outkey = new IntWritable(); 36 | 37 | @Override 38 | protected void map(Object key, Text value, Context context) 39 | throws IOException, InterruptedException { 40 | 41 | // Parse the input string into a nice map 42 | Map parsed = MRDPUtils.transformXmlToMap(value 43 | .toString()); 44 | 45 | // Grab the last access date 46 | String strDate = parsed.get("LastAccessDate"); 47 | 48 | // skip this record if date is null 49 | if (strDate != null) { 50 | try { 51 | // Parse the string into a Calendar object 52 | Calendar cal = Calendar.getInstance(); 53 | cal.setTime(frmt.parse(strDate)); 54 | outkey.set(cal.get(Calendar.YEAR)); 55 | // Write out the year with the input value 56 | context.write(outkey, value); 57 | } catch (ParseException e) { 58 | // An error occurred parsing the creation Date string 59 | // skip this record 60 | } 61 | } 62 | } 63 | } 64 | 65 | public static class LastAccessDatePartitioner extends 66 | Partitioner implements Configurable { 67 | 68 | private static final String MIN_LAST_ACCESS_DATE_YEAR = "min.last.access.date.year"; 69 | 70 | private Configuration conf = null; 71 | private int minLastAccessDateYear = 0; 72 | 73 | @Override 74 | public int getPartition(IntWritable key, Text value, int numPartitions) { 75 | return key.get() - minLastAccessDateYear; 76 | } 77 | 78 | @Override 79 | public Configuration getConf() { 80 | return conf; 81 | } 82 | 83 | @Override 84 | public void setConf(Configuration conf) { 85 | this.conf = conf; 86 | minLastAccessDateYear = conf.getInt(MIN_LAST_ACCESS_DATE_YEAR, 0); 87 | } 88 | 89 | /** 90 | * Sets the minimum possible last access date to subtract from each key 91 | * to be partitioned
92 | *
93 | * 94 | * That is, if the last min access date is "2008" and the key to 95 | * partition is "2009", it will go to partition 2009 - 2008 = 1 96 | * 97 | * @param job 98 | * The job to configure 99 | * @param minLastAccessDateYear 100 | * The minimum access date. 101 | */ 102 | public static void setMinLastAccessDate(Job job, 103 | int minLastAccessDateYear) { 104 | job.getConfiguration().setInt(MIN_LAST_ACCESS_DATE_YEAR, 105 | minLastAccessDateYear); 106 | } 107 | } 108 | 109 | public static class ValueReducer extends 110 | Reducer { 111 | 112 | protected void reduce(IntWritable key, Iterable values, 113 | Context context) throws IOException, InterruptedException { 114 | for (Text t : values) { 115 | context.write(t, NullWritable.get()); 116 | } 117 | } 118 | } 119 | 120 | public static void main(String[] args) throws Exception { 121 | Configuration conf = new Configuration(); 122 | String[] otherArgs = new GenericOptionsParser(conf, args) 123 | .getRemainingArgs(); 124 | if (otherArgs.length != 2) { 125 | System.err.println("Usage: PartitionedUsers "); 126 | System.exit(2); 127 | } 128 | 129 | Job job = new Job(conf, "PartitionedUsers"); 130 | 131 | job.setJarByClass(PartitionedUsers.class); 132 | 133 | job.setMapperClass(LastAccessDateMapper.class); 134 | 135 | // Set custom partitioner and min last access date 136 | job.setPartitionerClass(LastAccessDatePartitioner.class); 137 | LastAccessDatePartitioner.setMinLastAccessDate(job, 2008); 138 | 139 | // Last access dates span between 2008-2011, or 4 years 140 | job.setNumReduceTasks(4); 141 | job.setReducerClass(ValueReducer.class); 142 | 143 | FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 144 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 145 | 146 | job.setOutputKeyClass(IntWritable.class); 147 | job.setOutputValueClass(Text.class); 148 | 149 | job.setOutputFormatClass(TextOutputFormat.class); 150 | job.getConfiguration().set("mapred.textoutputformat.separator", ""); 151 | 152 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/PostCommentBuildingDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.io.StringWriter; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import javax.xml.parsers.DocumentBuilder; 11 | import javax.xml.parsers.DocumentBuilderFactory; 12 | import javax.xml.transform.OutputKeys; 13 | import javax.xml.transform.Transformer; 14 | import javax.xml.transform.TransformerFactory; 15 | import javax.xml.transform.dom.DOMSource; 16 | import javax.xml.transform.stream.StreamResult; 17 | 18 | import mrdp.utils.MRDPUtils; 19 | 20 | import org.apache.hadoop.conf.Configuration; 21 | import org.apache.hadoop.fs.Path; 22 | import org.apache.hadoop.io.NullWritable; 23 | import org.apache.hadoop.io.Text; 24 | import org.apache.hadoop.mapreduce.Job; 25 | import org.apache.hadoop.mapreduce.Mapper; 26 | import org.apache.hadoop.mapreduce.Reducer; 27 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 28 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 29 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 30 | import org.apache.hadoop.util.GenericOptionsParser; 31 | import org.w3c.dom.Attr; 32 | import org.w3c.dom.Document; 33 | import org.w3c.dom.Element; 34 | import org.w3c.dom.NamedNodeMap; 35 | import org.xml.sax.InputSource; 36 | 37 | public class PostCommentBuildingDriver { 38 | 39 | public static class PostMapper extends Mapper { 40 | 41 | private Text outkey = new Text(); 42 | private Text outvalue = new Text(); 43 | 44 | @Override 45 | public void map(Object key, Text value, Context context) 46 | throws IOException, InterruptedException { 47 | 48 | // Parse the input string into a nice map 49 | Map parsed = MRDPUtils.transformXmlToMap(value 50 | .toString()); 51 | 52 | String postId = parsed.get("Id"); 53 | 54 | if (postId == null) { 55 | return; 56 | } 57 | 58 | // The foreign join key is the post ID 59 | outkey.set(postId); 60 | 61 | // Flag this record for the reducer and then output 62 | outvalue.set("P" + value.toString()); 63 | context.write(outkey, outvalue); 64 | } 65 | } 66 | 67 | public static class CommentMapper extends Mapper { 68 | private Text outkey = new Text(); 69 | private Text outvalue = new Text(); 70 | 71 | @Override 72 | public void map(Object key, Text value, Context context) 73 | throws IOException, InterruptedException { 74 | 75 | // Parse the input string into a nice map 76 | Map parsed = MRDPUtils.transformXmlToMap(value 77 | .toString()); 78 | 79 | String postId = parsed.get("PostId"); 80 | if (postId == null) { 81 | return; 82 | } 83 | 84 | // The foreign join key is the user ID 85 | outkey.set(postId); 86 | 87 | // Flag this record for the reducer and then output 88 | outvalue.set("C" + value.toString()); 89 | context.write(outkey, outvalue); 90 | } 91 | } 92 | 93 | public static class PostCommentHierarchyReducer extends 94 | Reducer { 95 | 96 | private ArrayList comments = new ArrayList(); 97 | private DocumentBuilderFactory dbf = DocumentBuilderFactory 98 | .newInstance(); 99 | private String post = null; 100 | 101 | @Override 102 | public void reduce(Text key, Iterable values, Context context) 103 | throws IOException, InterruptedException { 104 | // Reset variables 105 | post = null; 106 | 
comments.clear(); 107 | 108 | // For each input value 109 | for (Text t : values) { 110 | // If this is the post record, store it, minus the flag 111 | if (t.charAt(0) == 'P') { 112 | post = t.toString().substring(1, t.toString().length()) 113 | .trim(); 114 | } else { 115 | // Else, it is a comment record. Add it to the list, minus 116 | // the flag 117 | comments.add(t.toString() 118 | .substring(1, t.toString().length()).trim()); 119 | } 120 | } 121 | 122 | // If post is not null 123 | if (post != null) { 124 | // nest the comments underneath the post element 125 | String postWithCommentChildren = nestElements(post, comments); 126 | 127 | // write out the XML 128 | context.write(new Text(postWithCommentChildren), 129 | NullWritable.get()); 130 | } 131 | } 132 | 133 | private String nestElements(String post, List comments) { 134 | try { 135 | // Create the new document to build the XML 136 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 137 | Document doc = bldr.newDocument(); 138 | 139 | // Copy parent node to document 140 | Element postEl = getXmlElementFromString(post); 141 | Element toAddPostEl = doc.createElement("post"); 142 | 143 | // Copy the attributes of the original post element to the new 144 | // one 145 | copyAttributesToElement(postEl.getAttributes(), toAddPostEl); 146 | 147 | // For each comment, copy it to the "post" node 148 | for (String commentXml : comments) { 149 | Element commentEl = getXmlElementFromString(commentXml); 150 | Element toAddCommentEl = doc.createElement("comments"); 151 | 152 | // Copy the attributes of the original comment element to 153 | // the new one 154 | copyAttributesToElement(commentEl.getAttributes(), 155 | toAddCommentEl); 156 | 157 | // Add the copied comment to the post element 158 | toAddPostEl.appendChild(toAddCommentEl); 159 | } 160 | 161 | // Add the post element to the document 162 | doc.appendChild(toAddPostEl); 163 | 164 | // Transform the document into a String of XML and return 165 | return transformDocumentToString(doc); 166 | 167 | } catch (Exception e) { 168 | return null; 169 | } 170 | } 171 | 172 | private Element getXmlElementFromString(String xml) { 173 | try { 174 | // Create a new document builder 175 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 176 | 177 | // Parse the XML string and return the first element 178 | return bldr.parse(new InputSource(new StringReader(xml))) 179 | .getDocumentElement(); 180 | } catch (Exception e) { 181 | return null; 182 | } 183 | } 184 | 185 | private void copyAttributesToElement(NamedNodeMap attributes, 186 | Element element) { 187 | 188 | // For each attribute, copy it to the element 189 | for (int i = 0; i < attributes.getLength(); ++i) { 190 | Attr toCopy = (Attr) attributes.item(i); 191 | element.setAttribute(toCopy.getName(), toCopy.getValue()); 192 | } 193 | } 194 | 195 | private String transformDocumentToString(Document doc) { 196 | try { 197 | TransformerFactory tf = TransformerFactory.newInstance(); 198 | Transformer transformer = tf.newTransformer(); 199 | transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, 200 | "yes"); 201 | StringWriter writer = new StringWriter(); 202 | transformer.transform(new DOMSource(doc), new StreamResult( 203 | writer)); 204 | // Replace all new line characters with an empty string to have 205 | // one record per line. 
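// (Keeping each nested post on a single physical line is what lets a
// follow-on job, such as the question-answer building driver, read this
// output back record-by-record with TextInputFormat.)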
206 | return writer.getBuffer().toString().replaceAll("\n|\r", ""); 207 | } catch (Exception e) { 208 | return null; 209 | } 210 | } 211 | } 212 | 213 | public static void main(String[] args) throws Exception { 214 | Configuration conf = new Configuration(); 215 | String[] otherArgs = new GenericOptionsParser(conf, args) 216 | .getRemainingArgs(); 217 | if (otherArgs.length != 3) { 218 | System.err 219 | .println("Usage: PostCommentHierarchy "); 220 | System.exit(1); 221 | } 222 | 223 | Job job = new Job(conf, "PostCommentHierarchy"); 224 | job.setJarByClass(PostCommentBuildingDriver.class); 225 | 226 | MultipleInputs.addInputPath(job, new Path(otherArgs[0]), 227 | TextInputFormat.class, PostMapper.class); 228 | 229 | MultipleInputs.addInputPath(job, new Path(otherArgs[1]), 230 | TextInputFormat.class, CommentMapper.class); 231 | 232 | job.setReducerClass(PostCommentHierarchyReducer.class); 233 | 234 | job.setOutputFormatClass(TextOutputFormat.class); 235 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 236 | 237 | job.setOutputKeyClass(Text.class); 238 | job.setOutputValueClass(Text.class); 239 | 240 | System.exit(job.waitForCompletion(true) ? 0 : 2); 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/QuestionAnswerBuildingDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.io.StringWriter; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import javax.xml.parsers.DocumentBuilder; 9 | import javax.xml.parsers.DocumentBuilderFactory; 10 | import javax.xml.transform.OutputKeys; 11 | import javax.xml.transform.Transformer; 12 | import javax.xml.transform.TransformerFactory; 13 | import javax.xml.transform.dom.DOMSource; 14 | import javax.xml.transform.stream.StreamResult; 15 | 16 | import org.apache.hadoop.conf.Configuration; 17 | import org.apache.hadoop.fs.Path; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.mapreduce.Job; 21 | import org.apache.hadoop.mapreduce.Mapper; 22 | import org.apache.hadoop.mapreduce.Reducer; 23 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 24 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 25 | import org.apache.hadoop.util.GenericOptionsParser; 26 | import org.w3c.dom.Attr; 27 | import org.w3c.dom.Document; 28 | import org.w3c.dom.Element; 29 | import org.w3c.dom.NamedNodeMap; 30 | import org.xml.sax.InputSource; 31 | 32 | public class QuestionAnswerBuildingDriver { 33 | 34 | public static class PostCommentMapper extends 35 | Mapper { 36 | 37 | private DocumentBuilderFactory dbf = DocumentBuilderFactory 38 | .newInstance(); 39 | private Text outkey = new Text(); 40 | private Text outvalue = new Text(); 41 | 42 | @Override 43 | public void map(Object key, Text value, Context context) 44 | throws IOException, InterruptedException { 45 | 46 | // Parse the post/comment XML hierarchy into an Element 47 | Element post = getXmlElementFromString(value.toString()); 48 | 49 | int postType = Integer.parseInt(post.getAttribute("PostTypeId")); 50 | 51 | // If postType is 1, it is a question 52 | if (postType == 1) { 53 | outkey.set(post.getAttribute("Id")); 54 | outvalue.set("Q" + value.toString()); 55 | } else { 56 | // Else, it is an answer 57 | outkey.set(post.getAttribute("ParentId")); 58 | outvalue.set("A" + 
value.toString()); 59 | } 60 | 61 | context.write(outkey, outvalue); 62 | } 63 | 64 | private Element getXmlElementFromString(String xml) { 65 | try { 66 | // Create a new document builder 67 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 68 | 69 | // Parse the XML string and return the first element 70 | return bldr.parse(new InputSource(new StringReader(xml))) 71 | .getDocumentElement(); 72 | } catch (Exception e) { 73 | return null; 74 | } 75 | } 76 | } 77 | 78 | public static class QuestionAnswerReducer extends 79 | Reducer { 80 | 81 | private ArrayList answers = new ArrayList(); 82 | private DocumentBuilderFactory dbf = DocumentBuilderFactory 83 | .newInstance(); 84 | private String question = null; 85 | 86 | @Override 87 | public void reduce(Text key, Iterable values, Context context) 88 | throws IOException, InterruptedException { 89 | // Reset variables 90 | question = null; 91 | answers.clear(); 92 | 93 | // For each input value 94 | for (Text t : values) { 95 | // If this is the post record, store it, minus the flag 96 | if (t.charAt(0) == 'Q') { 97 | question = t.toString().substring(1, t.toString().length()) 98 | .trim(); 99 | } else { 100 | // Else, it is a comment record. Add it to the list, minus 101 | // the flag 102 | answers.add(t.toString() 103 | .substring(1, t.toString().length()).trim()); 104 | } 105 | } 106 | 107 | // If post is not null 108 | if (question != null) { 109 | // nest the comments underneath the post element 110 | String postWithCommentChildren = nestElements(question, answers); 111 | 112 | // write out the XML 113 | context.write(new Text(postWithCommentChildren), 114 | NullWritable.get()); 115 | } 116 | } 117 | 118 | private String nestElements(String post, List comments) { 119 | try { 120 | // Create the new document to build the XML 121 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 122 | Document doc = bldr.newDocument(); 123 | 124 | // Copy parent node to document 125 | Element postEl = getXmlElementFromString(post); 126 | Element toAddPostEl = doc.createElement("question"); 127 | 128 | // Copy the attributes of the original post element to the new 129 | // one 130 | copyAttributesToElement(postEl.getAttributes(), toAddPostEl); 131 | 132 | // For each comment, copy it to the "post" node 133 | for (String commentXml : comments) { 134 | Element commentEl = getXmlElementFromString(commentXml); 135 | Element toAddCommentEl = doc.createElement("answer"); 136 | 137 | // Copy the attributes of the original comment element to 138 | // the new one 139 | copyAttributesToElement(commentEl.getAttributes(), 140 | toAddCommentEl); 141 | 142 | // Add the copied comment to the post element 143 | toAddPostEl.appendChild(toAddCommentEl); 144 | } 145 | 146 | // Add the post element to the document 147 | doc.appendChild(toAddPostEl); 148 | 149 | // Transform the document into a String of XML and return 150 | return transformDocumentToString(doc); 151 | 152 | } catch (Exception e) { 153 | return null; 154 | } 155 | } 156 | 157 | private Element getXmlElementFromString(String xml) { 158 | try { 159 | // Create a new document builder 160 | DocumentBuilder bldr = dbf.newDocumentBuilder(); 161 | 162 | // Parse the XML string and return the first element 163 | return bldr.parse(new InputSource(new StringReader(xml))) 164 | .getDocumentElement(); 165 | } catch (Exception e) { 166 | return null; 167 | } 168 | } 169 | 170 | private void copyAttributesToElement(NamedNodeMap attributes, 171 | Element element) { 172 | 173 | // For each attribute, copy it to the element 
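// (The StackOverflow dump stores every field as an XML attribute, so copying
// the attribute list is enough to carry the whole question or answer record
// into the new element.)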
174 | for (int i = 0; i < attributes.getLength(); ++i) { 175 | Attr toCopy = (Attr) attributes.item(i); 176 | element.setAttribute(toCopy.getName(), toCopy.getValue()); 177 | } 178 | } 179 | 180 | private String transformDocumentToString(Document doc) { 181 | try { 182 | TransformerFactory tf = TransformerFactory.newInstance(); 183 | Transformer transformer = tf.newTransformer(); 184 | transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, 185 | "yes"); 186 | StringWriter writer = new StringWriter(); 187 | transformer.transform(new DOMSource(doc), new StreamResult( 188 | writer)); 189 | // Replace all new line characters with an empty string to have 190 | // one record per line. 191 | return writer.getBuffer().toString().replaceAll("\n|\r", ""); 192 | } catch (Exception e) { 193 | return null; 194 | } 195 | } 196 | } 197 | 198 | public static void main(String[] args) throws Exception { 199 | Configuration conf = new Configuration(); 200 | String[] otherArgs = new GenericOptionsParser(conf, args) 201 | .getRemainingArgs(); 202 | if (otherArgs.length != 2) { 203 | System.err 204 | .println("Usage: QuestionAnswerHierarchy "); 205 | System.exit(1); 206 | } 207 | 208 | Job job = new Job(conf, "QuestionAnswerHierarchy"); 209 | job.setJarByClass(QuestionAnswerBuildingDriver.class); 210 | 211 | job.setMapperClass(PostCommentMapper.class); 212 | 213 | job.setInputFormatClass(TextInputFormat.class); 214 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 215 | 216 | job.setReducerClass(QuestionAnswerReducer.class); 217 | 218 | job.setOutputFormatClass(TextOutputFormat.class); 219 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 220 | 221 | job.setOutputKeyClass(Text.class); 222 | job.setOutputValueClass(Text.class); 223 | 224 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 225 | } 226 | } 227 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch4/TotalOrderSorting.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch4; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.Reducer; 16 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 18 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | import org.apache.hadoop.mapreduce.lib.partition.InputSampler; 21 | import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class TotalOrderSorting { 25 | 26 | public static class LastAccessDateMapper extends 27 | Mapper { 28 | 29 | private Text outkey = new Text(); 30 | 31 | @Override 32 | public void map(Object key, Text value, Context context) 33 | throws IOException, InterruptedException { 34 | 35 | // Parse the input string into a nice map 36 | Map parsed = MRDPUtils.transformXmlToMap(value 37 | .toString()); 38 | 39 | String date = parsed.get("LastAccessDate"); 40 | if (date != null) { 41 | outkey.set(date); 42 | context.write(outkey, value); 43 | } 44 | } 45 | } 46 | 47 | public static class ValueReducer extends 48 | Reducer { 49 | 50 | @Override 51 | public void reduce(Text key, Iterable values, Context context) 52 | throws IOException, InterruptedException { 53 | for (Text t : values) { 54 | context.write(t, NullWritable.get()); 55 | } 56 | } 57 | } 58 | 59 | @SuppressWarnings({ "unchecked", "rawtypes" }) 60 | public static void main(String[] args) throws Exception { 61 | Configuration conf = new Configuration(); 62 | String[] otherArgs = new GenericOptionsParser(conf, args) 63 | .getRemainingArgs(); 64 | if (otherArgs.length != 3) { 65 | System.err 66 | .println("Usage: TotalOrderSorting "); 67 | System.exit(1); 68 | } 69 | 70 | Path inputPath = new Path(otherArgs[0]); 71 | Path partitionFile = new Path(otherArgs[1] + "_partitions.lst"); 72 | Path outputStage = new Path(otherArgs[1] + "_staging"); 73 | Path outputOrder = new Path(otherArgs[1]); 74 | double sampleRate = Double.parseDouble(otherArgs[2]); 75 | 76 | FileSystem.get(new Configuration()).delete(outputOrder, true); 77 | FileSystem.get(new Configuration()).delete(outputStage, true); 78 | FileSystem.get(new Configuration()).delete(partitionFile, true); 79 | 80 | // Configure job to prepare for sampling 81 | Job sampleJob = new Job(conf, "TotalOrderSortingStage"); 82 | sampleJob.setJarByClass(TotalOrderSorting.class); 83 | 84 | // Use the mapper implementation with zero reduce tasks 85 | sampleJob.setMapperClass(LastAccessDateMapper.class); 86 | sampleJob.setNumReduceTasks(0); 87 | 88 | sampleJob.setOutputKeyClass(Text.class); 89 | sampleJob.setOutputValueClass(Text.class); 90 | 91 | TextInputFormat.setInputPaths(sampleJob, inputPath); 92 | 93 | // Set the output format to a sequence file 94 | 
sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class); 95 | SequenceFileOutputFormat.setOutputPath(sampleJob, outputStage); 96 | 97 | // Submit the job and get completion code. 98 | int code = sampleJob.waitForCompletion(true) ? 0 : 1; 99 | 100 | if (code == 0) { 101 | Job orderJob = new Job(conf, "TotalOrderSortingStage"); 102 | orderJob.setJarByClass(TotalOrderSorting.class); 103 | 104 | // Here, use the identity mapper to output the key/value pairs in 105 | // the SequenceFile 106 | orderJob.setMapperClass(Mapper.class); 107 | orderJob.setReducerClass(ValueReducer.class); 108 | 109 | // Set the number of reduce tasks to an appropriate number for the 110 | // amount of data being sorted 111 | orderJob.setNumReduceTasks(10); 112 | 113 | // Use Hadoop's TotalOrderPartitioner class 114 | orderJob.setPartitionerClass(TotalOrderPartitioner.class); 115 | 116 | // Set the partition file 117 | TotalOrderPartitioner.setPartitionFile(orderJob.getConfiguration(), 118 | partitionFile); 119 | 120 | orderJob.setOutputKeyClass(Text.class); 121 | orderJob.setOutputValueClass(Text.class); 122 | 123 | // Set the input to the previous job's output 124 | orderJob.setInputFormatClass(SequenceFileInputFormat.class); 125 | SequenceFileInputFormat.setInputPaths(orderJob, outputStage); 126 | 127 | // Set the output path to the command line parameter 128 | TextOutputFormat.setOutputPath(orderJob, outputOrder); 129 | 130 | // Set the separator to an empty string 131 | orderJob.getConfiguration().set( 132 | "mapred.textoutputformat.separator", ""); 133 | 134 | // Use the InputSampler to go through the output of the previous 135 | // job, sample it, and create the partition file 136 | InputSampler.writePartitionFile(orderJob, 137 | new InputSampler.RandomSampler(sampleRate, 10000)); 138 | 139 | // Submit the job 140 | code = orderJob.waitForCompletion(true) ? 
0 : 2; 141 | } 142 | 143 | // Cleanup the partition file and the staging directory 144 | FileSystem.get(new Configuration()).delete(partitionFile, false); 145 | FileSystem.get(new Configuration()).delete(outputStage, true); 146 | 147 | System.exit(code); 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/CartesianFormatter.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.Arrays; 8 | import java.util.HashSet; 9 | import java.util.Map; 10 | 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 19 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 20 | import org.apache.hadoop.util.GenericOptionsParser; 21 | 22 | public class CartesianFormatter { 23 | 24 | public static class CommentMapper extends Mapper { 25 | 26 | private Text outkey = new Text(), outvalue = new Text(); 27 | private HashSet commonWords = new HashSet(); 28 | 29 | protected void setup(Context context) throws IOException, 30 | InterruptedException { 31 | 32 | File f = new File(System.getProperty("user.dir") 33 | + "/commonwords.txt"); 34 | 35 | BufferedReader rdr = new BufferedReader(new FileReader(f)); 36 | 37 | String word = null; 38 | while ((word = rdr.readLine()) != null) { 39 | commonWords.add(word); 40 | } 41 | 42 | rdr.close(); 43 | } 44 | 45 | @Override 46 | public void map(Object key, Text value, Context context) 47 | throws IOException, InterruptedException { 48 | 49 | // Parse the input string into a nice map 50 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 51 | 52 | String id = parsed.get("Id"); 53 | String comment = parsed.get("Text"); 54 | 55 | if (id == null || comment == null) { 56 | return; 57 | } 58 | 59 | String[] tokens = comment.toLowerCase() 60 | .replaceAll("[^a-z0-9\\s]", "").split("\\s"); 61 | 62 | HashSet setTokens = new HashSet( 63 | Arrays.asList(tokens)); 64 | setTokens.removeAll(commonWords); 65 | 66 | StringBuilder bldr = new StringBuilder(); 67 | 68 | for (String word : setTokens) { 69 | if (!word.isEmpty()) { 70 | bldr.append(word + ","); 71 | } 72 | } 73 | 74 | if (bldr.length() > 0) { 75 | outkey.set(id); 76 | outvalue.set(bldr.deleteCharAt(bldr.length() - 1).toString()); 77 | context.write(outkey, outvalue); 78 | } 79 | } 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | Configuration conf = new Configuration(); 84 | String[] otherArgs = new GenericOptionsParser(conf, args) 85 | .getRemainingArgs(); 86 | if (otherArgs.length != 2) { 87 | System.err.println("Usage: CartesianFormatter "); 88 | System.exit(1); 89 | } 90 | 91 | // Configure the join type 92 | Job job = new Job(conf, "CartesianFormatter"); 93 | job.setJarByClass(CartesianFormatter.class); 94 | 95 | job.setMapperClass(CommentMapper.class); 96 | job.setNumReduceTasks(0); 97 | 98 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 99 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 100 | 101 | job.setOutputKeyClass(Text.class); 102 | job.setOutputValueClass(Text.class); 103 | 104 | 
System.exit(job.waitForCompletion(true) ? 0 : 3); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/CompositeJoinDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapred.JobClient; 8 | import org.apache.hadoop.mapred.JobConf; 9 | import org.apache.hadoop.mapred.KeyValueTextInputFormat; 10 | import org.apache.hadoop.mapred.MapReduceBase; 11 | import org.apache.hadoop.mapred.Mapper; 12 | import org.apache.hadoop.mapred.OutputCollector; 13 | import org.apache.hadoop.mapred.Reporter; 14 | import org.apache.hadoop.mapred.RunningJob; 15 | import org.apache.hadoop.mapred.TextOutputFormat; 16 | import org.apache.hadoop.mapred.join.CompositeInputFormat; 17 | import org.apache.hadoop.mapred.join.TupleWritable; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | public class CompositeJoinDriver { 21 | 22 | public static class CompositeMapper extends MapReduceBase implements 23 | Mapper { 24 | 25 | @Override 26 | public void map(Text key, TupleWritable value, 27 | OutputCollector output, Reporter reporter) 28 | throws IOException { 29 | 30 | // Get the first two elements in the tuple and output them 31 | output.collect((Text) value.get(0), (Text) value.get(1)); 32 | } 33 | } 34 | 35 | public static void main(String[] args) throws Exception { 36 | JobConf conf = new JobConf("CompositeJoin"); 37 | conf.setJarByClass(CompositeJoinDriver.class); 38 | String[] otherArgs = new GenericOptionsParser(conf, args) 39 | .getRemainingArgs(); 40 | if (otherArgs.length != 4) { 41 | System.err 42 | .println("Usage: CompositeJoin [inner|outer]"); 43 | System.exit(1); 44 | } 45 | 46 | Path userPath = new Path(otherArgs[0]); 47 | Path commentPath = new Path(otherArgs[1]); 48 | Path outputDir = new Path(otherArgs[2]); 49 | String joinType = otherArgs[3]; 50 | if (!(joinType.equalsIgnoreCase("inner") || joinType 51 | .equalsIgnoreCase("outer"))) { 52 | System.err.println("Join type not set to inner or outer"); 53 | System.exit(2); 54 | } 55 | 56 | conf.setMapperClass(CompositeMapper.class); 57 | conf.setNumReduceTasks(0); 58 | 59 | // Set the input format class to a CompositeInputFormat class. 60 | // The CompositeInputFormat will parse all of our input files and output 61 | // records to our mapper. 62 | conf.setInputFormat(CompositeInputFormat.class); 63 | 64 | // The composite input format join expression will set how the records 65 | // are going to be read in, and in what input format. 66 | conf.set("mapred.join.expr", CompositeInputFormat.compose(joinType, 67 | KeyValueTextInputFormat.class, userPath, commentPath)); 68 | 69 | TextOutputFormat.setOutputPath(conf, outputDir); 70 | 71 | conf.setOutputKeyClass(Text.class); 72 | conf.setOutputValueClass(Text.class); 73 | 74 | RunningJob job = JobClient.runJob(conf); 75 | while (!job.isComplete()) { 76 | Thread.sleep(1000); 77 | } 78 | 79 | System.exit(job.isSuccessful() ? 
0 : 2); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/JoinFormatting.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import mrdp.utils.MRDPUtils; 7 | 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 15 | import org.apache.hadoop.util.GenericOptionsParser; 16 | 17 | public class JoinFormatting { 18 | 19 | public static class ReplicatedJoinMapper extends 20 | Mapper { 21 | 22 | private Text outkey = new Text(); 23 | 24 | @Override 25 | public void map(Object key, Text value, Context context) 26 | throws IOException, InterruptedException { 27 | 28 | // Parse the input string into a nice map 29 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 30 | 31 | String userId = parsed.get("UserId"); 32 | 33 | if (userId == null) { 34 | return; 35 | } 36 | 37 | outkey.set(userId); 38 | context.write(outkey, value); 39 | } 40 | } 41 | 42 | public static void main(String[] args) throws Exception { 43 | Configuration conf = new Configuration(); 44 | String[] otherArgs = new GenericOptionsParser(conf, args) 45 | .getRemainingArgs(); 46 | if (otherArgs.length != 2) { 47 | System.err.println("Usage: ReplicatedJoin "); 48 | System.exit(1); 49 | } 50 | 51 | // Configure the join type 52 | Job job = new Job(conf, "Replicated Join"); 53 | job.setJarByClass(ReplicatedJoinDriver.class); 54 | 55 | job.setMapperClass(ReplicatedJoinMapper.class); 56 | job.setNumReduceTasks(0); 57 | 58 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 59 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 60 | 61 | job.setOutputKeyClass(Text.class); 62 | job.setOutputValueClass(Text.class); 63 | 64 | System.exit(job.waitForCompletion(true) ? 
0 : 3); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/ReduceSideJoinDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Map; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.Mapper; 14 | import org.apache.hadoop.mapreduce.Reducer; 15 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 16 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.util.GenericOptionsParser; 19 | 20 | public class ReduceSideJoinDriver { 21 | 22 | public static class UserJoinMapper extends Mapper { 23 | 24 | private Text outkey = new Text(); 25 | private Text outvalue = new Text(); 26 | 27 | @Override 28 | public void map(Object key, Text value, Context context) 29 | throws IOException, InterruptedException { 30 | 31 | // Parse the input string into a nice map 32 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 33 | 34 | String userId = parsed.get("Id"); 35 | 36 | if (userId == null) { 37 | return; 38 | } 39 | 40 | // The foreign join key is the user ID 41 | outkey.set(userId); 42 | 43 | // Flag this record for the reducer and then output 44 | outvalue.set("A" + value.toString()); 45 | context.write(outkey, outvalue); 46 | } 47 | } 48 | 49 | public static class CommentJoinMapper extends 50 | Mapper { 51 | 52 | private Text outkey = new Text(); 53 | private Text outvalue = new Text(); 54 | 55 | @Override 56 | public void map(Object key, Text value, Context context) 57 | throws IOException, InterruptedException { 58 | 59 | // Parse the input string into a nice map 60 | Map parsed = MRDPUtils.transformXmlToMap(value.toString()); 61 | 62 | String userId = parsed.get("UserId"); 63 | if (userId == null) { 64 | return; 65 | } 66 | 67 | // The foreign join key is the user ID 68 | outkey.set(userId); 69 | 70 | // Flag this record for the reducer and then output 71 | outvalue.set("B" + value.toString()); 72 | context.write(outkey, outvalue); 73 | } 74 | } 75 | 76 | public static class UserJoinReducer extends Reducer { 77 | 78 | private ArrayList listA = new ArrayList(); 79 | private ArrayList listB = new ArrayList(); 80 | private String joinType = null; 81 | 82 | @Override 83 | public void setup(Context context) { 84 | // Get the type of join from our configuration 85 | joinType = context.getConfiguration().get("join.type"); 86 | } 87 | 88 | @Override 89 | public void reduce(Text key, Iterable values, Context context) 90 | throws IOException, InterruptedException { 91 | 92 | // Clear our lists 93 | listA.clear(); 94 | listB.clear(); 95 | 96 | // iterate through all our values, binning each record based on what 97 | // it was tagged with 98 | // make sure to remove the tag! 
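// (Illustrative: the user mapper above emits values like "A<row Id=... />" and
// the comment mapper emits "B<row UserId=... />"; charAt(0) picks the list and
// substring(1) restores the original record before joining.)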
99 | for (Text t : values) { 100 | if (t.charAt(0) == 'A') { 101 | listA.add(new Text(t.toString().substring(1))); 102 | } else if (t.charAt(0) == 'B') { 103 | listB.add(new Text(t.toString().substring(1))); 104 | } 105 | } 106 | 107 | // Execute our join logic now that the lists are filled 108 | executeJoinLogic(context); 109 | } 110 | 111 | private void executeJoinLogic(Context context) throws IOException, 112 | InterruptedException { 113 | if (joinType.equalsIgnoreCase("inner")) { 114 | // If both lists are not empty, join A with B 115 | if (!listA.isEmpty() && !listB.isEmpty()) { 116 | for (Text A : listA) { 117 | for (Text B : listB) { 118 | context.write(A, B); 119 | } 120 | } 121 | } 122 | } else if (joinType.equalsIgnoreCase("leftouter")) { 123 | // For each entry in A, 124 | for (Text A : listA) { 125 | // If list B is not empty, join A and B 126 | if (!listB.isEmpty()) { 127 | for (Text B : listB) { 128 | context.write(A, B); 129 | } 130 | } else { 131 | // Else, output A by itself 132 | context.write(A, new Text("")); 133 | } 134 | } 135 | } else if (joinType.equalsIgnoreCase("rightouter")) { 136 | // FOr each entry in B, 137 | for (Text B : listB) { 138 | // If list A is not empty, join A and B 139 | if (!listA.isEmpty()) { 140 | for (Text A : listA) { 141 | context.write(A, B); 142 | } 143 | } else { 144 | // Else, output B by itself 145 | context.write(new Text(""), B); 146 | } 147 | } 148 | } else if (joinType.equalsIgnoreCase("fullouter")) { 149 | // If list A is not empty 150 | if (!listA.isEmpty()) { 151 | // For each entry in A 152 | for (Text A : listA) { 153 | // If list B is not empty, join A with B 154 | if (!listB.isEmpty()) { 155 | for (Text B : listB) { 156 | context.write(A, B); 157 | } 158 | } else { 159 | // Else, output A by itself 160 | context.write(A, new Text("")); 161 | } 162 | } 163 | } else { 164 | // If list A is empty, just output B 165 | for (Text B : listB) { 166 | context.write(new Text(""), B); 167 | } 168 | } 169 | } else if (joinType.equalsIgnoreCase("anti")) { 170 | // If list A is empty and B is empty or vice versa 171 | if (listA.isEmpty() ^ listB.isEmpty()) { 172 | 173 | // Iterate both A and B with null values 174 | // The previous XOR check will make sure exactly one of 175 | // these lists is empty and therefore won't have output 176 | for (Text A : listA) { 177 | context.write(A, new Text("")); 178 | } 179 | 180 | for (Text B : listB) { 181 | context.write(new Text(""), B); 182 | } 183 | } 184 | } else { 185 | throw new RuntimeException( 186 | "Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 187 | } 188 | } 189 | } 190 | 191 | public static void main(String[] args) throws Exception { 192 | Configuration conf = new Configuration(); 193 | String[] otherArgs = new GenericOptionsParser(conf, args) 194 | .getRemainingArgs(); 195 | if (otherArgs.length != 4) { 196 | System.err 197 | .println("Usage: ReduceSideJoin [inner|leftouter|rightouter|fullouter|anti]"); 198 | System.exit(1); 199 | } 200 | 201 | String joinType = otherArgs[3]; 202 | if (!(joinType.equalsIgnoreCase("inner") 203 | || joinType.equalsIgnoreCase("leftouter") 204 | || joinType.equalsIgnoreCase("rightouter") 205 | || joinType.equalsIgnoreCase("fullouter") || joinType 206 | .equalsIgnoreCase("anti"))) { 207 | System.err 208 | .println("Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 209 | System.exit(2); 210 | } 211 | 212 | Job job = new Job(conf, "Reduce Side Join"); 213 | 214 | // Configure the join type 215 | 
job.getConfiguration().set("join.type", joinType); 216 | job.setJarByClass(ReduceSideJoinDriver.class); 217 | 218 | // Use multiple inputs to set which input uses what mapper 219 | // This will keep parsing of each data set separate from a logical 220 | // standpoint 221 | // However, this version of Hadoop has not upgraded MultipleInputs 222 | // to the mapreduce package, so we have to use the deprecated API. 223 | // Future releases have this in the "mapreduce" package. 224 | MultipleInputs.addInputPath(job, new Path(otherArgs[0]), 225 | TextInputFormat.class, UserJoinMapper.class); 226 | 227 | MultipleInputs.addInputPath(job, new Path(otherArgs[1]), 228 | TextInputFormat.class, CommentJoinMapper.class); 229 | 230 | job.setReducerClass(UserJoinReducer.class); 231 | 232 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 233 | 234 | job.setOutputKeyClass(Text.class); 235 | job.setOutputValueClass(Text.class); 236 | 237 | System.exit(job.waitForCompletion(true) ? 0 : 3); 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/ReduceSideJoinWithBloomDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.Map; 9 | 10 | import mrdp.utils.MRDPUtils; 11 | 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.filecache.DistributedCache; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.Mapper; 18 | import org.apache.hadoop.mapreduce.Reducer; 19 | import org.apache.hadoop.mapreduce.lib.input.MultipleInputs; 20 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | import org.apache.hadoop.util.bloom.BloomFilter; 24 | import org.apache.hadoop.util.bloom.Key; 25 | 26 | public class ReduceSideJoinWithBloomDriver { 27 | 28 | public static class UserJoinMapperWithBloom extends 29 | Mapper { 30 | 31 | private Text outkey = new Text(); 32 | private Text outvalue = new Text(); 33 | 34 | @Override 35 | public void map(Object key, Text value, Context context) 36 | throws IOException, InterruptedException { 37 | 38 | // Parse the input string into a nice map 39 | Map parsed = MRDPUtils.transformXmlToMap(value 40 | .toString()); 41 | 42 | String userId = parsed.get("Id"); 43 | String reputation = parsed.get("Reputation"); 44 | 45 | if (userId == null || reputation == null) { 46 | return; 47 | } 48 | 49 | // If the reputation is greater than 1,500, output the user ID with 50 | // the value 51 | if (Integer.parseInt(reputation) > 1500) { 52 | outkey.set(parsed.get("Id")); 53 | outvalue.set("A" + value.toString()); 54 | context.write(outkey, outvalue); 55 | } 56 | } 57 | 58 | public static class CommentJoinMapperWithBloom extends 59 | Mapper { 60 | 61 | private BloomFilter bfilter = new BloomFilter(); 62 | private Text outkey = new Text(); 63 | private Text outvalue = new Text(); 64 | 65 | @Override 66 | public void setup(Context context) { 67 | try { 68 | Path[] files = DistributedCache.getLocalCacheFiles(context 69 | .getConfiguration()); 70 | 71 | if (files.length != 0) { 72 | DataInputStream strm = new 
DataInputStream( 73 | new FileInputStream(new File( 74 | files[0].toString()))); 75 | bfilter.readFields(strm); 76 | } else { 77 | throw new RuntimeException( 78 | "Bloom filter not set in DistributedCache"); 79 | } 80 | } catch (IOException e) { 81 | throw new RuntimeException(e); 82 | } 83 | } 84 | 85 | @Override 86 | public void map(Object key, Text value, Context context) 87 | throws IOException, InterruptedException { 88 | 89 | // Parse the input string into a nice map 90 | Map parsed = MRDPUtils.transformXmlToMap(value 91 | .toString()); 92 | 93 | String userId = parsed.get("UserId"); 94 | 95 | if (userId == null) { 96 | return; 97 | } 98 | 99 | if (bfilter.membershipTest(new Key(userId.getBytes()))) { 100 | outkey.set(userId); 101 | outvalue.set("B" + value.toString()); 102 | context.write(outkey, outvalue); 103 | } 104 | } 105 | } 106 | 107 | public static class UserJoinReducer extends 108 | Reducer { 109 | 110 | private ArrayList listA = new ArrayList(); 111 | private ArrayList listB = new ArrayList(); 112 | private String joinType = null; 113 | 114 | @Override 115 | public void setup(Context context) { 116 | // Get the type of join from our configuration 117 | joinType = context.getConfiguration().get("join.type"); 118 | } 119 | 120 | @Override 121 | public void reduce(Text key, Iterable values, Context context) 122 | throws IOException, InterruptedException { 123 | 124 | // Clear our lists 125 | listA.clear(); 126 | listB.clear(); 127 | 128 | // iterate through all our values, binning each record based on 129 | // what 130 | // it was tagged with 131 | // make sure to remove the tag! 132 | for (Text t : values) { 133 | if (t.charAt(0) == 'A') { 134 | listA.add(new Text(t.toString().substring(1))); 135 | } else /* if (tmp.charAt('0') == 'B') */{ 136 | listB.add(new Text(t.toString().substring(1))); 137 | } 138 | } 139 | 140 | // Execute our join logic now that the lists are filled 141 | executeJoinLogic(context); 142 | } 143 | 144 | private void executeJoinLogic(Context context) throws IOException, 145 | InterruptedException { 146 | if (joinType.equalsIgnoreCase("inner")) { 147 | // If both lists are not empty, join A with B 148 | if (!listA.isEmpty() && !listB.isEmpty()) { 149 | for (Text A : listA) { 150 | for (Text B : listB) { 151 | context.write(A, B); 152 | } 153 | } 154 | } 155 | } else if (joinType.equalsIgnoreCase("leftouter")) { 156 | // For each entry in A, 157 | for (Text A : listA) { 158 | // If list B is not empty, join A and B 159 | if (!listB.isEmpty()) { 160 | for (Text B : listB) { 161 | context.write(A, B); 162 | } 163 | } else { 164 | // Else, output A by itself 165 | context.write(A, new Text("")); 166 | } 167 | } 168 | } else if (joinType.equalsIgnoreCase("rightouter")) { 169 | // FOr each entry in B, 170 | for (Text B : listB) { 171 | // If list A is not empty, join A and B 172 | if (!listA.isEmpty()) { 173 | for (Text A : listA) { 174 | context.write(A, B); 175 | } 176 | } else { 177 | // Else, output B by itself 178 | context.write(new Text(""), B); 179 | } 180 | } 181 | } else if (joinType.equalsIgnoreCase("fullouter")) { 182 | // If list A is not empty 183 | if (!listA.isEmpty()) { 184 | // For each entry in A 185 | for (Text A : listA) { 186 | // If list B is not empty, join A with B 187 | if (!listB.isEmpty()) { 188 | for (Text B : listB) { 189 | context.write(A, B); 190 | } 191 | } else { 192 | // Else, output A by itself 193 | context.write(A, new Text("")); 194 | } 195 | } 196 | } else { 197 | // If list A is empty, just output B 198 | for (Text B 
: listB) { 199 | context.write(new Text(""), B); 200 | } 201 | } 202 | } else if (joinType.equalsIgnoreCase("anti")) { 203 | // If list A is empty and B is not empty or vice versa 204 | if (listA.isEmpty() ^ listB.isEmpty()) { 205 | 206 | // Iterate both A and B with null values 207 | // The previous XOR check will make sure exactly one of 208 | // these lists is empty and therefore won't have output 209 | for (Text A : listA) { 210 | context.write(A, new Text("")); 211 | } 212 | 213 | for (Text B : listB) { 214 | context.write(new Text(""), B); 215 | } 216 | } 217 | } else { 218 | throw new RuntimeException( 219 | "Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 220 | } 221 | } 222 | } 223 | 224 | public static void main(String[] args) throws Exception { 225 | Configuration conf = new Configuration(); 226 | String[] otherArgs = new GenericOptionsParser(conf, args) 227 | .getRemainingArgs(); 228 | if (otherArgs.length != 4) { 229 | System.err 230 | .println("Usage: ReduceSideJoin [inner|leftouter|rightouter|fullouter|anti]"); 231 | System.exit(1); 232 | } 233 | 234 | String joinType = otherArgs[3]; 235 | if (!(joinType.equalsIgnoreCase("inner") 236 | || joinType.equalsIgnoreCase("leftouter") 237 | || joinType.equalsIgnoreCase("rightouter") 238 | || joinType.equalsIgnoreCase("fullouter") || joinType 239 | .equalsIgnoreCase("anti"))) { 240 | System.err 241 | .println("Join type not set to inner, leftouter, rightouter, fullouter, or anti"); 242 | System.exit(2); 243 | } 244 | 245 | Job job = new Job(conf, "Reduce Side Join"); 246 | // Configure the join type 247 | job.getConfiguration().set("join.type", joinType); 248 | job.setJarByClass(ReduceSideJoinWithBloomDriver.class); 249 | 250 | // Use multiple inputs to set which input uses what mapper 251 | // This will keep parsing of each data set separate from a logical 252 | // standpoint 253 | MultipleInputs.addInputPath(job, new Path(otherArgs[0]), 254 | TextInputFormat.class, UserJoinMapperWithBloom.class); 255 | MultipleInputs.addInputPath(job, new Path(otherArgs[1]), 256 | TextInputFormat.class, CommentJoinMapperWithBloom.class); 257 | 258 | job.setReducerClass(UserJoinReducer.class); 259 | 260 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 261 | 262 | job.setOutputKeyClass(Text.class); 263 | job.setOutputValueClass(Text.class); 264 | 265 | System.exit(job.waitForCompletion(true) ? 
0 : 3); 266 | } 267 | } 268 | } 269 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch5/ReplicatedJoinDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.Map; 9 | import java.util.HashMap; 10 | import java.util.zip.GZIPInputStream; 11 | 12 | import mrdp.utils.MRDPUtils; 13 | 14 | import org.apache.hadoop.conf.Configuration; 15 | import org.apache.hadoop.filecache.DistributedCache; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | 24 | public class ReplicatedJoinDriver { 25 | 26 | public static class ReplicatedJoinMapper extends 27 | Mapper { 28 | 29 | private HashMap userIdToInfo = new HashMap(); 30 | 31 | private Text outvalue = new Text(); 32 | private String joinType = null; 33 | 34 | @Override 35 | public void setup(Context context) throws IOException, 36 | InterruptedException { 37 | try { 38 | Path[] files = DistributedCache.getLocalCacheFiles(context 39 | .getConfiguration()); 40 | 41 | if (files == null || files.length == 0) { 42 | throw new RuntimeException( 43 | "User information is not set in DistributedCache"); 44 | } 45 | 46 | // Read all files in the DistributedCache 47 | for (Path p : files) { 48 | BufferedReader rdr = new BufferedReader( 49 | new InputStreamReader( 50 | new GZIPInputStream(new FileInputStream( 51 | new File(p.toString()))))); 52 | 53 | String line; 54 | // For each record in the user file 55 | while ((line = rdr.readLine()) != null) { 56 | 57 | // Get the user ID for this record 58 | Map parsed = MRDPUtils 59 | .transformXmlToMap(line); 60 | String userId = parsed.get("Id"); 61 | 62 | if (userId != null) { 63 | // Map the user ID to the record 64 | userIdToInfo.put(userId, line); 65 | } 66 | } 67 | } 68 | 69 | } catch (IOException e) { 70 | throw new RuntimeException(e); 71 | } 72 | 73 | // Get the join type 74 | joinType = context.getConfiguration().get("join.type"); 75 | } 76 | 77 | @Override 78 | public void map(Object key, Text value, Context context) 79 | throws IOException, InterruptedException { 80 | 81 | // Parse the input string into a nice map 82 | Map parsed = MRDPUtils.transformXmlToMap(value 83 | .toString()); 84 | 85 | String userId = parsed.get("UserId"); 86 | 87 | if (userId == null) { 88 | return; 89 | } 90 | 91 | String userInformation = userIdToInfo.get(userId); 92 | 93 | // If the user information is not null, then output 94 | if (userInformation != null) { 95 | outvalue.set(userInformation); 96 | context.write(value, outvalue); 97 | } else if (joinType.equalsIgnoreCase("leftouter")) { 98 | // If we are doing a left outer join, output the record with an 99 | // empty value 100 | context.write(value, new Text("")); 101 | } 102 | } 103 | } 104 | 105 | public static void main(String[] args) throws Exception { 106 | Configuration conf = new Configuration(); 107 | String[] otherArgs = new GenericOptionsParser(conf, args) 108 | .getRemainingArgs(); 109 | if (otherArgs.length != 4) { 110 | System.err 111 | .println("Usage: 
ReplicatedJoin [inner|leftouter]"); 112 | System.exit(1); 113 | } 114 | 115 | String joinType = otherArgs[3]; 116 | if (!(joinType.equalsIgnoreCase("inner") || joinType 117 | .equalsIgnoreCase("leftouter"))) { 118 | System.err.println("Join type not set to inner or leftouter"); 119 | System.exit(2); 120 | } 121 | 122 | // Configure the join type 123 | Job job = new Job(conf, "Replicated Join"); 124 | job.getConfiguration().set("join.type", joinType); 125 | job.setJarByClass(ReplicatedJoinDriver.class); 126 | 127 | job.setMapperClass(ReplicatedJoinMapper.class); 128 | job.setNumReduceTasks(0); 129 | 130 | TextInputFormat.setInputPaths(job, new Path(otherArgs[1])); 131 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[2])); 132 | 133 | job.setOutputKeyClass(Text.class); 134 | job.setOutputValueClass(Text.class); 135 | 136 | // Configure the DistributedCache 137 | DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), 138 | job.getConfiguration()); 139 | 140 | DistributedCache.setLocalFiles(job.getConfiguration(), otherArgs[0]); 141 | 142 | System.exit(job.waitForCompletion(true) ? 0 : 3); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/ChainMapperDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashMap; 9 | import java.util.Iterator; 10 | import java.util.Map; 11 | import java.util.zip.GZIPInputStream; 12 | 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.filecache.DistributedCache; 16 | import org.apache.hadoop.fs.FileStatus; 17 | import org.apache.hadoop.fs.FileSystem; 18 | import org.apache.hadoop.fs.Path; 19 | import org.apache.hadoop.io.LongWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.mapred.FileOutputFormat; 22 | import org.apache.hadoop.mapred.JobClient; 23 | import org.apache.hadoop.mapred.JobConf; 24 | import org.apache.hadoop.mapred.MapReduceBase; 25 | import org.apache.hadoop.mapred.Mapper; 26 | import org.apache.hadoop.mapred.OutputCollector; 27 | import org.apache.hadoop.mapred.Reducer; 28 | import org.apache.hadoop.mapred.Reporter; 29 | import org.apache.hadoop.mapred.RunningJob; 30 | import org.apache.hadoop.mapred.TextInputFormat; 31 | import org.apache.hadoop.mapred.TextOutputFormat; 32 | import org.apache.hadoop.mapred.lib.ChainMapper; 33 | import org.apache.hadoop.mapred.lib.ChainReducer; 34 | import org.apache.hadoop.mapred.lib.MultipleOutputs; 35 | import org.apache.hadoop.mapred.lib.NullOutputFormat; 36 | import org.apache.hadoop.util.GenericOptionsParser; 37 | 38 | public class ChainMapperDriver { 39 | 40 | public static final String AVERAGE_CALC_GROUP = "AverageCalculation"; 41 | public static final String MULTIPLE_OUTPUTS_BELOW_5000 = "below5000"; 42 | public static final String MULTIPLE_OUTPUTS_ABOVE_5000 = "above5000"; 43 | 44 | public static class UserIdCountMapper extends MapReduceBase implements 45 | Mapper { 46 | 47 | public static final String RECORDS_COUNTER_NAME = "Records"; 48 | 49 | private static final LongWritable ONE = new LongWritable(1); 50 | private Text outkey = new Text(); 51 | 52 | @Override 53 | public void map(Object key, Text value, 54 | OutputCollector output, Reporter reporter) 55 | throws IOException { 56 | 57 | // Parse the input into a 
nice map. 58 | Map parsed = MRDPUtils.transformXmlToMap(value 59 | .toString()); 60 | 61 | // Get the value for the OwnerUserId attribute 62 | String userId = parsed.get("OwnerUserId"); 63 | 64 | if (userId != null) { 65 | outkey.set(userId); 66 | output.collect(outkey, ONE); 67 | } 68 | } 69 | } 70 | 71 | public static class UserIdReputationEnrichmentMapper extends MapReduceBase 72 | implements Mapper { 73 | 74 | private Text outkey = new Text(); 75 | private HashMap userIdToReputation = new HashMap(); 76 | 77 | @Override 78 | public void configure(JobConf job) { 79 | try { 80 | userIdToReputation.clear(); 81 | Path[] files = DistributedCache.getLocalCacheFiles(job); 82 | 83 | if (files == null || files.length == 0) { 84 | throw new RuntimeException( 85 | "User information is not set in DistributedCache"); 86 | } 87 | 88 | // Read all files in the DistributedCache 89 | for (Path p : files) { 90 | BufferedReader rdr = new BufferedReader( 91 | new InputStreamReader( 92 | new GZIPInputStream(new FileInputStream( 93 | new File(p.toString()))))); 94 | 95 | String line; 96 | // For each record in the user file 97 | while ((line = rdr.readLine()) != null) { 98 | 99 | // Get the user ID and reputation 100 | Map parsed = MRDPUtils 101 | .transformXmlToMap(line); 102 | String userId = parsed.get("Id"); 103 | String reputation = parsed.get("Reputation"); 104 | 105 | if (userId != null && reputation != null) { 106 | // Map the user ID to the reputation 107 | userIdToReputation.put(userId, reputation); 108 | } 109 | } 110 | } 111 | 112 | } catch (IOException e) { 113 | throw new RuntimeException(e); 114 | } 115 | } 116 | 117 | @Override 118 | public void map(Text key, LongWritable value, 119 | OutputCollector output, Reporter reporter) 120 | throws IOException { 121 | 122 | String reputation = userIdToReputation.get(key.toString()); 123 | if (reputation != null) { 124 | outkey.set(value.get() + "\t" + reputation); 125 | output.collect(outkey, value); 126 | } 127 | } 128 | } 129 | 130 | public static class LongSumReducer extends MapReduceBase implements 131 | Reducer { 132 | 133 | private LongWritable outvalue = new LongWritable(); 134 | 135 | @Override 136 | public void reduce(Text key, Iterator values, 137 | OutputCollector output, Reporter reporter) 138 | throws IOException { 139 | 140 | int sum = 0; 141 | while (values.hasNext()) { 142 | sum += values.next().get(); 143 | } 144 | outvalue.set(sum); 145 | output.collect(key, outvalue); 146 | } 147 | } 148 | 149 | public static class UserIdBinningMapper extends MapReduceBase implements 150 | Mapper { 151 | 152 | private MultipleOutputs mos = null; 153 | 154 | @Override 155 | public void configure(JobConf conf) { 156 | mos = new MultipleOutputs(conf); 157 | } 158 | 159 | @SuppressWarnings("unchecked") 160 | @Override 161 | public void map(Text key, LongWritable value, 162 | OutputCollector output, Reporter reporter) 163 | throws IOException { 164 | 165 | if (Integer.parseInt(key.toString().split("\t")[1]) < 5000) { 166 | mos.getCollector(MULTIPLE_OUTPUTS_BELOW_5000, reporter) 167 | .collect(key, value); 168 | } else { 169 | mos.getCollector(MULTIPLE_OUTPUTS_ABOVE_5000, reporter) 170 | .collect(key, value); 171 | } 172 | } 173 | 174 | @Override 175 | public void close() { 176 | try { 177 | mos.close(); 178 | } catch (IOException e) { 179 | e.printStackTrace(); 180 | } 181 | } 182 | } 183 | 184 | public static void main(String[] args) throws Exception { 185 | JobConf conf = new JobConf("ChainMapperReducer"); 186 | String[] otherArgs = new 
GenericOptionsParser(conf, args) 187 | .getRemainingArgs(); 188 | 189 | if (otherArgs.length != 3) { 190 | System.err 191 | .println("Usage: ChainMapperReducer "); 192 | System.exit(2); 193 | } 194 | 195 | Path postInput = new Path(otherArgs[0]); 196 | Path userInput = new Path(otherArgs[1]); 197 | Path outputDir = new Path(otherArgs[2]); 198 | 199 | // Setup first job to counter user posts 200 | conf.setJarByClass(ChainMapperDriver.class); 201 | 202 | ChainMapper.addMapper(conf, UserIdCountMapper.class, 203 | LongWritable.class, Text.class, Text.class, LongWritable.class, 204 | false, new JobConf(false)); 205 | 206 | ChainMapper.addMapper(conf, UserIdReputationEnrichmentMapper.class, 207 | Text.class, LongWritable.class, Text.class, LongWritable.class, 208 | false, new JobConf(false)); 209 | 210 | ChainReducer.setReducer(conf, LongSumReducer.class, Text.class, 211 | LongWritable.class, Text.class, LongWritable.class, false, 212 | new JobConf(false)); 213 | 214 | ChainReducer.addMapper(conf, UserIdBinningMapper.class, Text.class, 215 | LongWritable.class, Text.class, LongWritable.class, false, 216 | new JobConf(false)); 217 | 218 | conf.setCombinerClass(LongSumReducer.class); 219 | 220 | conf.setInputFormat(TextInputFormat.class); 221 | TextInputFormat.setInputPaths(conf, postInput); 222 | 223 | // Configure multiple outputs 224 | conf.setOutputFormat(NullOutputFormat.class); 225 | FileOutputFormat.setOutputPath(conf, outputDir); 226 | MultipleOutputs.addNamedOutput(conf, MULTIPLE_OUTPUTS_ABOVE_5000, 227 | TextOutputFormat.class, Text.class, LongWritable.class); 228 | MultipleOutputs.addNamedOutput(conf, MULTIPLE_OUTPUTS_BELOW_5000, 229 | TextOutputFormat.class, Text.class, LongWritable.class); 230 | 231 | conf.setOutputKeyClass(Text.class); 232 | conf.setOutputValueClass(LongWritable.class); 233 | 234 | // Add the user files to the DistributedCache 235 | FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput); 236 | for (FileStatus status : userFiles) { 237 | DistributedCache.addCacheFile(status.getPath().toUri(), conf); 238 | } 239 | 240 | RunningJob job = JobClient.runJob(conf); 241 | 242 | while (!job.isComplete()) { 243 | Thread.sleep(5000); 244 | } 245 | 246 | System.exit(job.isSuccessful() ? 
0 : 1); 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/JobChainingDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | import java.util.zip.GZIPInputStream; 11 | 12 | import mrdp.utils.MRDPUtils; 13 | 14 | import org.apache.hadoop.conf.Configuration; 15 | import org.apache.hadoop.filecache.DistributedCache; 16 | import org.apache.hadoop.fs.FileStatus; 17 | import org.apache.hadoop.fs.FileSystem; 18 | import org.apache.hadoop.fs.Path; 19 | import org.apache.hadoop.io.LongWritable; 20 | import org.apache.hadoop.io.Text; 21 | import org.apache.hadoop.mapreduce.Job; 22 | import org.apache.hadoop.mapreduce.Mapper; 23 | import org.apache.hadoop.mapreduce.Reducer; 24 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 25 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 26 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 27 | import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; 28 | import org.apache.hadoop.util.GenericOptionsParser; 29 | 30 | public class JobChainingDriver { 31 | 32 | public static final String AVERAGE_CALC_GROUP = "AverageCalculation"; 33 | public static final String MULTIPLE_OUTPUTS_ABOVE_NAME = "aboveavg"; 34 | public static final String MULTIPLE_OUTPUTS_BELOW_NAME = "belowavg"; 35 | 36 | public static class UserIdCountMapper extends 37 | Mapper { 38 | 39 | public static final String RECORDS_COUNTER_NAME = "Records"; 40 | 41 | private static final LongWritable ONE = new LongWritable(1); 42 | private Text outkey = new Text(); 43 | 44 | @Override 45 | public void map(Object key, Text value, Context context) 46 | throws IOException, InterruptedException { 47 | 48 | // Parse the input into a nice map. 
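// --------------------------------------------------------------------------------
// Editor's aside: an illustrative note, not part of the original source file.
// MRDPUtils.transformXmlToMap parses one <row .../> line of the Stack Overflow data
// dump into a map of attribute name -> attribute value. For a (shortened, hypothetical)
// post record such as
//   <row Id="35" OwnerUserId="7" Score="2" />
// the returned map contains Id=35, OwnerUserId=7 and Score=2, which is why the mappers
// in this repository simply look attributes up by name, as done with "OwnerUserId" below:
// --------------------------------------------------------------------------------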
49 | Map parsed = MRDPUtils.transformXmlToMap(value 50 | .toString()); 51 | 52 | // Get the value for the OwnerUserId attribute 53 | String userId = parsed.get("OwnerUserId"); 54 | 55 | if (userId != null) { 56 | outkey.set(userId); 57 | context.write(outkey, ONE); 58 | context.getCounter(AVERAGE_CALC_GROUP, RECORDS_COUNTER_NAME) 59 | .increment(1); 60 | } 61 | } 62 | } 63 | 64 | public static class UserIdSumReducer extends 65 | Reducer { 66 | 67 | public static final String USERS_COUNTER_NAME = "Users"; 68 | private LongWritable outvalue = new LongWritable(); 69 | 70 | @Override 71 | public void reduce(Text key, Iterable values, 72 | Context context) throws IOException, InterruptedException { 73 | 74 | // Increment user counter, as each reduce group represents one user 75 | context.getCounter(AVERAGE_CALC_GROUP, USERS_COUNTER_NAME) 76 | .increment(1); 77 | 78 | int sum = 0; 79 | 80 | for (LongWritable value : values) { 81 | sum += value.get(); 82 | } 83 | 84 | outvalue.set(sum); 85 | context.write(key, outvalue); 86 | } 87 | } 88 | 89 | public static class UserIdBinningMapper extends 90 | Mapper { 91 | 92 | public static final String AVERAGE_POSTS_PER_USER = "avg.posts.per.user"; 93 | 94 | public static void setAveragePostsPerUser(Job job, double avg) { 95 | job.getConfiguration().set(AVERAGE_POSTS_PER_USER, 96 | Double.toString(avg)); 97 | } 98 | 99 | public static double getAveragePostsPerUser(Configuration conf) { 100 | return Double.parseDouble(conf.get(AVERAGE_POSTS_PER_USER)); 101 | } 102 | 103 | private double average = 0.0; 104 | private MultipleOutputs mos = null; 105 | private Text outkey = new Text(), outvalue = new Text(); 106 | private HashMap userIdToReputation = new HashMap(); 107 | 108 | protected void setup(Context context) throws IOException, 109 | InterruptedException { 110 | average = getAveragePostsPerUser(context.getConfiguration()); 111 | mos = new MultipleOutputs(context); 112 | 113 | try { 114 | Path[] files = DistributedCache.getLocalCacheFiles(context 115 | .getConfiguration()); 116 | 117 | if (files == null || files.length == 0) { 118 | throw new RuntimeException( 119 | "User information is not set in DistributedCache"); 120 | } 121 | 122 | // Read all files in the DistributedCache 123 | for (Path p : files) { 124 | BufferedReader rdr = new BufferedReader( 125 | new InputStreamReader( 126 | new GZIPInputStream(new FileInputStream( 127 | new File(p.toString()))))); 128 | 129 | String line; 130 | // For each record in the user file 131 | while ((line = rdr.readLine()) != null) { 132 | 133 | // Get the user ID and reputation 134 | Map parsed = MRDPUtils 135 | .transformXmlToMap(line); 136 | String userId = parsed.get("Id"); 137 | String reputation = parsed.get("Reputation"); 138 | 139 | if (userId != null && reputation != null) { 140 | // Map the user ID to the reputation 141 | userIdToReputation.put(userId, reputation); 142 | } 143 | } 144 | } 145 | 146 | } catch (IOException e) { 147 | throw new RuntimeException(e); 148 | } 149 | } 150 | 151 | @Override 152 | public void map(Object key, Text value, Context context) 153 | throws IOException, InterruptedException { 154 | 155 | String[] tokens = value.toString().split("\t"); 156 | 157 | String userId = tokens[0]; 158 | int posts = Integer.parseInt(tokens[1]); 159 | 160 | outkey.set(userId); 161 | outvalue.set((long) posts + "\t" + userIdToReputation.get(userId)); 162 | 163 | if ((double) posts < average) { 164 | mos.write(MULTIPLE_OUTPUTS_BELOW_NAME, outkey, outvalue, 165 | MULTIPLE_OUTPUTS_BELOW_NAME + "/part"); 
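// --------------------------------------------------------------------------------
// Editor's aside: an illustrative note, not part of the original source file. Because
// each named output above is written with a base path of "<name>/part", this map-only
// binning job splits its results into two sub-directories of the final output
// directory, roughly (task numbering is hypothetical):
//   <output>/belowavg/part-m-00000   users with fewer posts than the average
//   <output>/aboveavg/part-m-00000   users with at least the average number of posts
// Each line pairs the user ID with "<post count>\t<reputation>" as built above.
// --------------------------------------------------------------------------------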
166 | } else { 167 | mos.write(MULTIPLE_OUTPUTS_ABOVE_NAME, outkey, outvalue, 168 | MULTIPLE_OUTPUTS_ABOVE_NAME + "/part"); 169 | } 170 | 171 | } 172 | 173 | protected void cleanup(Context context) throws IOException, 174 | InterruptedException { 175 | mos.close(); 176 | } 177 | } 178 | 179 | public static void main(String[] args) throws Exception { 180 | Configuration conf = new Configuration(); 181 | String[] otherArgs = new GenericOptionsParser(conf, args) 182 | .getRemainingArgs(); 183 | 184 | if (otherArgs.length != 3) { 185 | System.err 186 | .println("Usage: JobChainingDriver "); 187 | System.exit(2); 188 | } 189 | 190 | Path postInput = new Path(otherArgs[0]); 191 | Path userInput = new Path(otherArgs[1]); 192 | Path outputDirIntermediate = new Path(otherArgs[2] + "_int"); 193 | Path outputDir = new Path(otherArgs[2]); 194 | 195 | // Setup first job to counter user posts 196 | Job countingJob = new Job(conf, "JobChaining-Counting"); 197 | countingJob.setJarByClass(JobChainingDriver.class); 198 | 199 | // Set our mapper and reducer, we can use the API's long sum reducer for 200 | // a combiner! 201 | countingJob.setMapperClass(UserIdCountMapper.class); 202 | countingJob.setCombinerClass(LongSumReducer.class); 203 | countingJob.setReducerClass(UserIdSumReducer.class); 204 | 205 | countingJob.setOutputKeyClass(Text.class); 206 | countingJob.setOutputValueClass(LongWritable.class); 207 | 208 | countingJob.setInputFormatClass(TextInputFormat.class); 209 | 210 | TextInputFormat.addInputPath(countingJob, postInput); 211 | 212 | countingJob.setOutputFormatClass(TextOutputFormat.class); 213 | TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate); 214 | 215 | // Execute job and grab exit code 216 | int code = countingJob.waitForCompletion(true) ? 
0 : 1; 217 | 218 | if (code == 0) { 219 | // Calculate the average posts per user by getting counter values 220 | double numRecords = (double) countingJob 221 | .getCounters() 222 | .findCounter(AVERAGE_CALC_GROUP, 223 | UserIdCountMapper.RECORDS_COUNTER_NAME).getValue(); 224 | double numUsers = (double) countingJob 225 | .getCounters() 226 | .findCounter(AVERAGE_CALC_GROUP, 227 | UserIdSumReducer.USERS_COUNTER_NAME).getValue(); 228 | 229 | double averagePostsPerUser = numRecords / numUsers; 230 | 231 | // Setup binning job 232 | Job binningJob = new Job(new Configuration(), "JobChaining-Binning"); 233 | binningJob.setJarByClass(JobChainingDriver.class); 234 | 235 | // Set mapper and the average posts per user 236 | binningJob.setMapperClass(UserIdBinningMapper.class); 237 | UserIdBinningMapper.setAveragePostsPerUser(binningJob, 238 | averagePostsPerUser); 239 | 240 | binningJob.setNumReduceTasks(0); 241 | 242 | binningJob.setInputFormatClass(TextInputFormat.class); 243 | TextInputFormat.addInputPath(binningJob, outputDirIntermediate); 244 | 245 | // Add two named outputs for below/above average 246 | MultipleOutputs.addNamedOutput(binningJob, 247 | MULTIPLE_OUTPUTS_BELOW_NAME, TextOutputFormat.class, 248 | Text.class, Text.class); 249 | 250 | MultipleOutputs.addNamedOutput(binningJob, 251 | MULTIPLE_OUTPUTS_ABOVE_NAME, TextOutputFormat.class, 252 | Text.class, Text.class); 253 | MultipleOutputs.setCountersEnabled(binningJob, true); 254 | 255 | TextOutputFormat.setOutputPath(binningJob, outputDir); 256 | 257 | // Add the user files to the DistributedCache 258 | FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput); 259 | for (FileStatus status : userFiles) { 260 | DistributedCache.addCacheFile(status.getPath().toUri(), 261 | binningJob.getConfiguration()); 262 | } 263 | 264 | // Execute job and grab exit code 265 | code = binningJob.waitForCompletion(true) ? 
0 : 1; 266 | } 267 | 268 | // Clean up the intermediate output 269 | FileSystem.get(conf).delete(outputDirIntermediate, true); 270 | 271 | System.exit(code); 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/JobControlDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.IOException; 4 | import mrdp.ch6.JobChainingDriver.UserIdBinningMapper; 5 | import mrdp.ch6.JobChainingDriver.UserIdCountMapper; 6 | import mrdp.ch6.JobChainingDriver.UserIdSumReducer; 7 | import mrdp.ch6.ParallelJobs.AverageReputationMapper; 8 | import mrdp.ch6.ParallelJobs.AverageReputationReducer; 9 | 10 | import org.apache.hadoop.conf.Configuration; 11 | import org.apache.hadoop.filecache.DistributedCache; 12 | import org.apache.hadoop.fs.FileStatus; 13 | import org.apache.hadoop.fs.FileSystem; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.DoubleWritable; 16 | import org.apache.hadoop.io.LongWritable; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.mapreduce.Job; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob; 21 | import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl; 22 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 23 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 24 | import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer; 25 | 26 | public class JobControlDriver { 27 | public static void main(String[] args) throws Exception { 28 | 29 | if (args.length != 4) { 30 | System.err 31 | .println("Usage: JobChainingDriver "); 32 | System.exit(2); 33 | } 34 | 35 | Path postInput = new Path(args[0]); 36 | Path userInput = new Path(args[1]); 37 | Path countingOutput = new Path(args[3] + "_count"); 38 | Path binningOutputRoot = new Path(args[3] + "_bins"); 39 | Path binningOutputBelow = new Path(binningOutputRoot + "/" 40 | + JobChainingDriver.MULTIPLE_OUTPUTS_BELOW_NAME); 41 | Path binningOutputAbove = new Path(binningOutputRoot + "/" 42 | + JobChainingDriver.MULTIPLE_OUTPUTS_ABOVE_NAME); 43 | 44 | Path belowAverageRepOutput = new Path(args[2]); 45 | Path aboveAverageRepOutput = new Path(args[3]); 46 | 47 | Job countingJob = getCountingJob(postInput, countingOutput); 48 | 49 | int code = 1; 50 | if (countingJob.waitForCompletion(true)) { 51 | ControlledJob binningControlledJob = new ControlledJob( 52 | getBinningJobConf(countingJob, countingOutput, userInput, 53 | binningOutputRoot)); 54 | 55 | ControlledJob belowAvgControlledJob = new ControlledJob( 56 | getAverageJobConf(binningOutputBelow, belowAverageRepOutput)); 57 | belowAvgControlledJob.addDependingJob(binningControlledJob); 58 | 59 | ControlledJob aboveAvgControlledJob = new ControlledJob( 60 | getAverageJobConf(binningOutputAbove, aboveAverageRepOutput)); 61 | aboveAvgControlledJob.addDependingJob(binningControlledJob); 62 | 63 | JobControl jc = new JobControl("AverageReputation"); 64 | jc.addJob(binningControlledJob); 65 | jc.addJob(belowAvgControlledJob); 66 | jc.addJob(aboveAvgControlledJob); 67 | 68 | jc.run(); 69 | code = jc.getFailedJobList().size() == 0 ? 
0 : 1; 70 | } 71 | 72 | FileSystem fs = FileSystem.get(new Configuration()); 73 | fs.delete(countingOutput, true); 74 | fs.delete(binningOutputRoot, true); 75 | 76 | System.out.println("All Done"); 77 | System.exit(code); 78 | } 79 | 80 | public static Job getCountingJob(Path postInput, Path outputDirIntermediate) 81 | throws IOException { 82 | // Setup first job to counter user posts 83 | Job countingJob = new Job(new Configuration(), "JobChaining-Counting"); 84 | countingJob.setJarByClass(JobChainingDriver.class); 85 | 86 | // Set our mapper and reducer, we can use the API's long sum reducer for 87 | // a combiner! 88 | countingJob.setMapperClass(UserIdCountMapper.class); 89 | countingJob.setCombinerClass(LongSumReducer.class); 90 | countingJob.setReducerClass(UserIdSumReducer.class); 91 | 92 | countingJob.setOutputKeyClass(Text.class); 93 | countingJob.setOutputValueClass(LongWritable.class); 94 | 95 | countingJob.setInputFormatClass(TextInputFormat.class); 96 | 97 | TextInputFormat.addInputPath(countingJob, postInput); 98 | 99 | countingJob.setOutputFormatClass(TextOutputFormat.class); 100 | TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate); 101 | 102 | return countingJob; 103 | } 104 | 105 | public static Configuration getBinningJobConf(Job countingJob, 106 | Path jobchainOutdir, Path userInput, Path binningOutput) 107 | throws IOException { 108 | // Calculate the average posts per user by getting counter values 109 | double numRecords = (double) countingJob 110 | .getCounters() 111 | .findCounter(JobChainingDriver.AVERAGE_CALC_GROUP, 112 | UserIdCountMapper.RECORDS_COUNTER_NAME).getValue(); 113 | double numUsers = (double) countingJob 114 | .getCounters() 115 | .findCounter(JobChainingDriver.AVERAGE_CALC_GROUP, 116 | UserIdSumReducer.USERS_COUNTER_NAME).getValue(); 117 | 118 | double averagePostsPerUser = numRecords / numUsers; 119 | 120 | // Setup binning job 121 | Job binningJob = new Job(new Configuration(), "JobChaining-Binning"); 122 | binningJob.setJarByClass(JobChainingDriver.class); 123 | 124 | // Set mapper and the average posts per user 125 | binningJob.setMapperClass(UserIdBinningMapper.class); 126 | UserIdBinningMapper.setAveragePostsPerUser(binningJob, 127 | averagePostsPerUser); 128 | 129 | binningJob.setNumReduceTasks(0); 130 | 131 | binningJob.setInputFormatClass(TextInputFormat.class); 132 | TextInputFormat.addInputPath(binningJob, jobchainOutdir); 133 | 134 | // Add two named outputs for below/above average 135 | MultipleOutputs.addNamedOutput(binningJob, 136 | JobChainingDriver.MULTIPLE_OUTPUTS_BELOW_NAME, 137 | TextOutputFormat.class, Text.class, Text.class); 138 | 139 | MultipleOutputs.addNamedOutput(binningJob, 140 | JobChainingDriver.MULTIPLE_OUTPUTS_ABOVE_NAME, 141 | TextOutputFormat.class, Text.class, Text.class); 142 | MultipleOutputs.setCountersEnabled(binningJob, true); 143 | 144 | TextOutputFormat.setOutputPath(binningJob, binningOutput); 145 | 146 | // Add the user files to the DistributedCache 147 | FileStatus[] userFiles = FileSystem.get(new Configuration()) 148 | .listStatus(userInput); 149 | for (FileStatus status : userFiles) { 150 | DistributedCache.addCacheFile(status.getPath().toUri(), 151 | binningJob.getConfiguration()); 152 | } 153 | 154 | // Execute job and grab exit code 155 | return binningJob.getConfiguration(); 156 | } 157 | 158 | public static Configuration getAverageJobConf(Path averageOutputDir, 159 | Path outputDir) throws IOException { 160 | 161 | Job averageJob = new Job(new Configuration(), "ParallelJobs"); 162 | 
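// --------------------------------------------------------------------------------
// Editor's aside: an illustrative sketch, not part of the original source file. The
// helper methods in this driver only build Configuration objects; main() above wraps
// them in ControlledJob instances so that JobControl releases each job once the jobs it
// depends on have finished. The general shape of that pattern (names are hypothetical):
//   ControlledJob first  = new ControlledJob(firstConf);
//   ControlledJob second = new ControlledJob(secondConf);
//   second.addDependingJob(first);                  // "second" waits for "first"
//   JobControl control = new JobControl("example");
//   control.addJob(first);
//   control.addJob(second);
//   control.run();                                  // driven to completion, as main() does with jc.run()
//   boolean ok = control.getFailedJobList().isEmpty();
// --------------------------------------------------------------------------------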
averageJob.setJarByClass(ParallelJobs.class); 163 | 164 | averageJob.setMapperClass(AverageReputationMapper.class); 165 | averageJob.setReducerClass(AverageReputationReducer.class); 166 | 167 | averageJob.setOutputKeyClass(Text.class); 168 | averageJob.setOutputValueClass(DoubleWritable.class); 169 | 170 | averageJob.setInputFormatClass(TextInputFormat.class); 171 | 172 | TextInputFormat.addInputPath(averageJob, averageOutputDir); 173 | 174 | averageJob.setOutputFormatClass(TextOutputFormat.class); 175 | TextOutputFormat.setOutputPath(averageJob, outputDir); 176 | 177 | // Execute job and grab exit code 178 | return averageJob.getConfiguration(); 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/MergedJobDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.Map; 7 | import java.util.Map.Entry; 8 | import java.util.Random; 9 | 10 | import mrdp.utils.MRDPUtils; 11 | 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.NullWritable; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.WritableComparable; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.Mapper; 19 | import org.apache.hadoop.mapreduce.Reducer; 20 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 21 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 22 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 23 | import org.apache.hadoop.util.GenericOptionsParser; 24 | 25 | public class MergedJobDriver { 26 | 27 | public static final String MULTIPLE_OUTPUTS_ANONYMIZE = "anonymize"; 28 | public static final String MULTIPLE_OUTPUTS_DISTINCT = "distinct"; 29 | 30 | public static class AnonymizeDistinctMergedMapper extends 31 | Mapper { 32 | 33 | private static final Text DISTINCT_OUT_VALUE = new Text(); 34 | 35 | private Random rndm = new Random(); 36 | private TaggedText anonymizeOutkey = new TaggedText(), 37 | distinctOutkey = new TaggedText(); 38 | private Text anonymizeOutvalue = new Text(); 39 | 40 | @Override 41 | public void map(Object key, Text value, Context context) 42 | throws IOException, InterruptedException { 43 | anonymizeMap(key, value, context); 44 | distinctMap(key, value, context); 45 | } 46 | 47 | private void anonymizeMap(Object key, Text value, Context context) 48 | throws IOException, InterruptedException { 49 | // Parse the input string into a nice map 50 | Map parsed = MRDPUtils.transformXmlToMap(value 51 | .toString()); 52 | 53 | if (parsed.size() > 0) { 54 | StringBuilder bldr = new StringBuilder(); 55 | bldr.append(" entry : parsed.entrySet()) { 57 | 58 | if (entry.getKey().equals("UserId") 59 | || entry.getKey().equals("Id")) { 60 | // ignore these fields 61 | } else if (entry.getKey().equals("CreationDate")) { 62 | // Strip out the time, anything after the 'T' in the 63 | // value 64 | bldr.append(entry.getKey() 65 | + "=\"" 66 | + entry.getValue().substring(0, 67 | entry.getValue().indexOf('T')) + "\" "); 68 | } else { 69 | // Otherwise, output this. 
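// --------------------------------------------------------------------------------
// Editor's aside: a worked example, not part of the original source file. For a
// (shortened, hypothetical) comment record such as
//   <row Id="9" UserId="7" CreationDate="2011-08-01T12:30:00.000" Score="4" />
// the anonymize branch drops Id and UserId, keeps only the date portion of
// CreationDate, and emits roughly (attribute order may vary)
//   <row CreationDate="2011-08-01" Score="4" >
// under a random key tagged "A", while distinctMap below emits the bare user ID "7"
// under a key tagged "D". The reducer later routes tag "A" to the "anonymize" named
// output and tag "D" to the "distinct" named output. All other attributes are copied
// through unchanged:
// --------------------------------------------------------------------------------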
70 | bldr.append(entry.getKey() + "=\"" + entry.getValue() 71 | + "\" "); 72 | } 73 | 74 | } 75 | bldr.append(">"); 76 | anonymizeOutkey.setTag("A"); 77 | anonymizeOutkey.setText(Integer.toString(rndm.nextInt())); 78 | anonymizeOutvalue.set(bldr.toString()); 79 | context.write(anonymizeOutkey, anonymizeOutvalue); 80 | } 81 | } 82 | 83 | private void distinctMap(Object key, Text value, Context context) 84 | throws IOException, InterruptedException { 85 | // Parse the input into a nice map. 86 | Map parsed = MRDPUtils.transformXmlToMap(value 87 | .toString()); 88 | 89 | // Get the value for the UserId attribute 90 | String userId = parsed.get("UserId"); 91 | 92 | // If it is null, skip this record 93 | if (userId == null) { 94 | return; 95 | } 96 | 97 | // Otherwise, set our output key to the user's id, tagged with a "D" 98 | distinctOutkey.setTag("D"); 99 | distinctOutkey.setText(userId); 100 | 101 | // Write the user's id with a null value 102 | context.write(distinctOutkey, DISTINCT_OUT_VALUE); 103 | } 104 | } 105 | 106 | public static class AnonymizeDistinctMergedReducer extends 107 | Reducer { 108 | 109 | private MultipleOutputs mos = null; 110 | 111 | @Override 112 | protected void setup(Context context) throws IOException, 113 | InterruptedException { 114 | mos = new MultipleOutputs(context); 115 | } 116 | 117 | @Override 118 | protected void reduce(TaggedText key, Iterable values, 119 | Context context) throws IOException, InterruptedException { 120 | 121 | if (key.getTag().equals("A")) { 122 | anonymizeReduce(key.getText(), values, context); 123 | } else { 124 | distinctReduce(key.getText(), values, context); 125 | } 126 | } 127 | 128 | private void anonymizeReduce(Text key, Iterable values, 129 | Context context) throws IOException, InterruptedException { 130 | 131 | for (Text value : values) { 132 | mos.write(MULTIPLE_OUTPUTS_ANONYMIZE, value, 133 | NullWritable.get(), MULTIPLE_OUTPUTS_ANONYMIZE 134 | + "/part"); 135 | } 136 | } 137 | 138 | private void distinctReduce(Text key, Iterable values, 139 | Context context) throws IOException, InterruptedException { 140 | mos.write(MULTIPLE_OUTPUTS_DISTINCT, key, NullWritable.get(), 141 | MULTIPLE_OUTPUTS_DISTINCT + "/part"); 142 | } 143 | 144 | @Override 145 | protected void cleanup(Context context) throws IOException, 146 | InterruptedException { 147 | mos.close(); 148 | } 149 | } 150 | 151 | public static void main(String[] args) throws Exception { 152 | Configuration conf = new Configuration(); 153 | String[] otherArgs = new GenericOptionsParser(conf, args) 154 | .getRemainingArgs(); 155 | if (otherArgs.length != 2) { 156 | System.err.println("Usage: MergedJob "); 157 | System.exit(1); 158 | } 159 | 160 | // Configure the merged job 161 | Job job = new Job(conf, "MergedJob"); 162 | job.setJarByClass(MergedJobDriver.class); 163 | 164 | job.setMapperClass(AnonymizeDistinctMergedMapper.class); 165 | job.setReducerClass(AnonymizeDistinctMergedReducer.class); 166 | job.setNumReduceTasks(10); 167 | 168 | TextInputFormat.setInputPaths(job, new Path(otherArgs[0])); 169 | TextOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 170 | 171 | MultipleOutputs.addNamedOutput(job, MULTIPLE_OUTPUTS_ANONYMIZE, 172 | TextOutputFormat.class, Text.class, NullWritable.class); 173 | MultipleOutputs.addNamedOutput(job, MULTIPLE_OUTPUTS_DISTINCT, 174 | TextOutputFormat.class, Text.class, NullWritable.class); 175 | 176 | job.setOutputKeyClass(TaggedText.class); 177 | job.setOutputValueClass(Text.class); 178 | 179 | System.exit(job.waitForCompletion(true) 
? 0 : 2); 180 | } 181 | 182 | public static class TaggedText implements WritableComparable { 183 | 184 | private String tag = ""; 185 | private Text text = new Text(); 186 | 187 | public TaggedText() { 188 | 189 | } 190 | 191 | public TaggedText(TaggedText text) { 192 | setTag(text.getTag()); 193 | setText(text.getText()); 194 | } 195 | 196 | public void setTag(String tag) { 197 | this.tag = tag; 198 | } 199 | 200 | public String getTag() { 201 | return tag; 202 | } 203 | 204 | public void setText(Text text) { 205 | this.text.set(text); 206 | } 207 | 208 | public void setText(String text) { 209 | this.text.set(text); 210 | } 211 | 212 | public Text getText() { 213 | return text; 214 | } 215 | 216 | @Override 217 | public void readFields(DataInput in) throws IOException { 218 | tag = in.readUTF(); 219 | text.readFields(in); 220 | } 221 | 222 | @Override 223 | public void write(DataOutput out) throws IOException { 224 | out.writeUTF(tag); 225 | text.write(out); 226 | } 227 | 228 | @Override 229 | public int compareTo(TaggedText obj) { 230 | int compare = tag.compareTo(obj.getTag()); 231 | if (compare == 0) { 232 | return text.compareTo(obj.getText()); 233 | } else { 234 | return compare; 235 | } 236 | } 237 | 238 | @Override 239 | public String toString() { 240 | return tag.toString() + ":" + text.toString(); 241 | } 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch6/ParallelJobs.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch6; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.DoubleWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 13 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 14 | import org.apache.hadoop.util.GenericOptionsParser; 15 | 16 | public class ParallelJobs { 17 | 18 | public static class AverageReputationMapper extends 19 | Mapper { 20 | 21 | private static final Text GROUP_ALL_KEY = new Text( 22 | "Average Reputation:"); 23 | private DoubleWritable outvalue = new DoubleWritable(); 24 | 25 | @Override 26 | protected void map(LongWritable key, Text value, Context context) 27 | throws IOException, InterruptedException { 28 | try { 29 | // Split the line into tokens 30 | String[] tokens = value.toString().split("\t"); 31 | 32 | // Get the reputation from the third column 33 | double reputation = Double.parseDouble(tokens[2]); 34 | 35 | // Set the output value and write to context 36 | outvalue.set(reputation); 37 | context.write(GROUP_ALL_KEY, outvalue); 38 | } catch (NumberFormatException e) { 39 | // Skip record 40 | } 41 | } 42 | } 43 | 44 | public static class AverageReputationReducer extends 45 | Reducer { 46 | 47 | private DoubleWritable outvalue = new DoubleWritable(); 48 | 49 | @Override 50 | protected void reduce(Text key, Iterable values, 51 | Context context) throws IOException, InterruptedException { 52 | 53 | double sum = 0.0; 54 | double count = 0; 55 | for (DoubleWritable dw : values) { 56 | sum += dw.get(); 57 | ++count; 58 | } 59 | 60 | outvalue.set(sum / count); 61 | context.write(key, outvalue); 62 | } 63 | } 64 | 65 | public static void main(String[] 
args) throws Exception { 66 | 67 | Configuration conf = new Configuration(); 68 | String[] otherArgs = new GenericOptionsParser(conf, args) 69 | .getRemainingArgs(); 70 | 71 | if (otherArgs.length != 4) { 72 | System.err 73 | .println("Usage: ParallelJobs "); 74 | System.exit(2); 75 | } 76 | 77 | Path belowAvgInputDir = new Path(otherArgs[0]); 78 | Path aboveAvgInputDir = new Path(otherArgs[1]); 79 | 80 | Path belowAvgOutputDir = new Path(otherArgs[2]); 81 | Path aboveAvgOutputDir = new Path(otherArgs[3]); 82 | 83 | Job belowAvgJob = submitJob(conf, belowAvgInputDir, belowAvgOutputDir); 84 | Job aboveAvgJob = submitJob(conf, aboveAvgInputDir, aboveAvgOutputDir); 85 | 86 | // While both jobs are not finished, sleep 87 | while (!belowAvgJob.isComplete() || !aboveAvgJob.isComplete()) { 88 | Thread.sleep(5000); 89 | } 90 | 91 | if (belowAvgJob.isSuccessful()) { 92 | System.out.println("Below average job completed successfully!"); 93 | } else { 94 | System.out.println("Below average job failed!"); 95 | } 96 | 97 | if (aboveAvgJob.isSuccessful()) { 98 | System.out.println("Above average job completed successfully!"); 99 | } else { 100 | System.out.println("Above average job failed!"); 101 | } 102 | 103 | System.exit(belowAvgJob.isSuccessful() && aboveAvgJob.isSuccessful() ? 0 104 | : 1); 105 | } 106 | 107 | private static Job submitJob(Configuration conf, Path inputDir, 108 | Path outputDir) throws IOException, InterruptedException, 109 | ClassNotFoundException { 110 | 111 | Job job = new Job(conf, "ParallelJobs"); 112 | job.setJarByClass(ParallelJobs.class); 113 | 114 | job.setMapperClass(AverageReputationMapper.class); 115 | job.setReducerClass(AverageReputationReducer.class); 116 | 117 | job.setOutputKeyClass(Text.class); 118 | job.setOutputValueClass(DoubleWritable.class); 119 | 120 | job.setInputFormatClass(TextInputFormat.class); 121 | TextInputFormat.addInputPath(job, inputDir); 122 | 123 | job.setOutputFormatClass(TextOutputFormat.class); 124 | TextOutputFormat.setOutputPath(job, outputDir); 125 | 126 | job.submit(); 127 | return job; 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/PartitionPruningInputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.HashMap; 8 | import java.util.Iterator; 9 | import java.util.List; 10 | import java.util.Map.Entry; 11 | 12 | import mrdp.ch7.PartitionPruningOutputDriver.RedisKey; 13 | import mrdp.utils.MRDPUtils; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.fs.Path; 17 | import org.apache.hadoop.io.Text; 18 | import org.apache.hadoop.io.Writable; 19 | import org.apache.hadoop.mapreduce.InputFormat; 20 | import org.apache.hadoop.mapreduce.InputSplit; 21 | import org.apache.hadoop.mapreduce.Job; 22 | import org.apache.hadoop.mapreduce.JobContext; 23 | import org.apache.hadoop.mapreduce.RecordReader; 24 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 25 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 26 | import org.apache.hadoop.util.GenericOptionsParser; 27 | import org.apache.log4j.Logger; 28 | 29 | import redis.clients.jedis.Jedis; 30 | 31 | public class PartitionPruningInputDriver { 32 | 33 | public static class RedisLastAccessInputFormat extends 34 | InputFormat { 35 | 36 | public static 
final String REDIS_SELECTED_MONTHS_CONF = "mapred.redilastaccessinputformat.months"; 37 | private static final HashMap MONTH_FROM_STRING = new HashMap(); 38 | private static final HashMap MONTH_TO_INST_MAP = new HashMap(); 39 | private static final Logger LOG = Logger 40 | .getLogger(RedisLastAccessInputFormat.class); 41 | 42 | static { 43 | MONTH_TO_INST_MAP.put("JAN", MRDPUtils.REDIS_INSTANCES[0]); 44 | MONTH_TO_INST_MAP.put("FEB", MRDPUtils.REDIS_INSTANCES[0]); 45 | MONTH_TO_INST_MAP.put("MAR", MRDPUtils.REDIS_INSTANCES[1]); 46 | MONTH_TO_INST_MAP.put("APR", MRDPUtils.REDIS_INSTANCES[1]); 47 | MONTH_TO_INST_MAP.put("MAY", MRDPUtils.REDIS_INSTANCES[2]); 48 | MONTH_TO_INST_MAP.put("JUN", MRDPUtils.REDIS_INSTANCES[2]); 49 | MONTH_TO_INST_MAP.put("JUL", MRDPUtils.REDIS_INSTANCES[3]); 50 | MONTH_TO_INST_MAP.put("AUG", MRDPUtils.REDIS_INSTANCES[3]); 51 | MONTH_TO_INST_MAP.put("SEP", MRDPUtils.REDIS_INSTANCES[4]); 52 | MONTH_TO_INST_MAP.put("OCT", MRDPUtils.REDIS_INSTANCES[4]); 53 | MONTH_TO_INST_MAP.put("NOV", MRDPUtils.REDIS_INSTANCES[5]); 54 | MONTH_TO_INST_MAP.put("DEC", MRDPUtils.REDIS_INSTANCES[5]); 55 | 56 | MONTH_FROM_STRING.put("JAN", 0); 57 | MONTH_FROM_STRING.put("FEB", 1); 58 | MONTH_FROM_STRING.put("MAR", 2); 59 | MONTH_FROM_STRING.put("APR", 3); 60 | MONTH_FROM_STRING.put("MAY", 4); 61 | MONTH_FROM_STRING.put("JUN", 5); 62 | MONTH_FROM_STRING.put("JUL", 6); 63 | MONTH_FROM_STRING.put("AUG", 7); 64 | MONTH_FROM_STRING.put("SEP", 8); 65 | MONTH_FROM_STRING.put("OCT", 9); 66 | MONTH_FROM_STRING.put("NOV", 10); 67 | MONTH_FROM_STRING.put("DEC", 11); 68 | } 69 | 70 | /** 71 | * Sets the CSV string for months you want to pull 72 | * 73 | * @param job 74 | * The job conf 75 | * @param String 76 | * months The CSV list of months 77 | */ 78 | public static void setRedisLastAccessMonths(Job job, String months) { 79 | job.getConfiguration().set(REDIS_SELECTED_MONTHS_CONF, months); 80 | } 81 | 82 | @Override 83 | public List getSplits(JobContext job) throws IOException { 84 | 85 | String months = job.getConfiguration().get( 86 | REDIS_SELECTED_MONTHS_CONF); 87 | 88 | if (months == null || months.isEmpty()) { 89 | throw new IOException(REDIS_SELECTED_MONTHS_CONF 90 | + " is null or empty."); 91 | } 92 | 93 | // Create input splits from the input months 94 | HashMap instanceToSplitMap = new HashMap(); 95 | for (String month : months.split(",")) { 96 | String host = MONTH_TO_INST_MAP.get(month); 97 | RedisLastAccessInputSplit split = instanceToSplitMap.get(host); 98 | if (split == null) { 99 | split = new RedisLastAccessInputSplit(host); 100 | split.addHashKey(month); 101 | instanceToSplitMap.put(host, split); 102 | } else { 103 | split.addHashKey(month); 104 | } 105 | } 106 | 107 | LOG.info("Input splits to process: " 108 | + instanceToSplitMap.values().size()); 109 | return new ArrayList(instanceToSplitMap.values()); 110 | } 111 | 112 | @Override 113 | public RecordReader createRecordReader( 114 | InputSplit split, TaskAttemptContext context) 115 | throws IOException, InterruptedException { 116 | return new RedisLastAccessRecordReader(); 117 | } 118 | 119 | public static class RedisLastAccessRecordReader extends 120 | RecordReader { 121 | 122 | private static final Logger LOG = Logger 123 | .getLogger(RedisLastAccessRecordReader.class); 124 | private Entry currentEntry = null; 125 | private float processedKVs = 0, totalKVs = 0; 126 | private int currentHashMonth = 0; 127 | private Iterator> hashIterator = null; 128 | private Iterator hashKeys = null; 129 | private RedisKey key = new 
RedisKey(); 130 | private String host = null; 131 | private Text value = new Text(); 132 | 133 | @Override 134 | public void initialize(InputSplit split, TaskAttemptContext context) 135 | throws IOException, InterruptedException { 136 | 137 | // Get the host location from the InputSplit 138 | host = split.getLocations()[0]; 139 | 140 | // Get an iterator of all the hash keys we want to read 141 | hashKeys = ((RedisLastAccessInputSplit) split).getHashKeys() 142 | .iterator(); 143 | 144 | LOG.info("Connecting to " + host); 145 | } 146 | 147 | @Override 148 | public boolean nextKeyValue() throws IOException, 149 | InterruptedException { 150 | 151 | boolean nextHashKey = false; 152 | do { 153 | // if this is the first call or the iterator does not have a 154 | // next 155 | if (hashIterator == null || !hashIterator.hasNext()) { 156 | // if we have reached the end of our hash keys, return 157 | // false 158 | if (!hashKeys.hasNext()) { 159 | // ultimate end condition, return false 160 | return false; 161 | } else { 162 | // Otherwise, connect to Redis and get all 163 | // the name/value pairs for this hash key 164 | Jedis jedis = new Jedis(host); 165 | jedis.connect(); 166 | String strKey = hashKeys.next(); 167 | currentHashMonth = MONTH_FROM_STRING.get(strKey); 168 | hashIterator = jedis.hgetAll(strKey).entrySet() 169 | .iterator(); 170 | jedis.disconnect(); 171 | } 172 | } 173 | 174 | // If the key/value map still has values 175 | if (hashIterator.hasNext()) { 176 | // Get the current entry and set the Text objects to 177 | // the 178 | // entry 179 | currentEntry = hashIterator.next(); 180 | key.setLastAccessMonth(currentHashMonth); 181 | key.setField(currentEntry.getKey()); 182 | value.set(currentEntry.getValue()); 183 | } else { 184 | nextHashKey = true; 185 | } 186 | } while (nextHashKey); 187 | 188 | return true; 189 | } 190 | 191 | @Override 192 | public RedisKey getCurrentKey() throws IOException, 193 | InterruptedException { 194 | return key; 195 | } 196 | 197 | @Override 198 | public Text getCurrentValue() throws IOException, 199 | InterruptedException { 200 | return value; 201 | } 202 | 203 | @Override 204 | public float getProgress() throws IOException, InterruptedException { 205 | return processedKVs / totalKVs; 206 | } 207 | 208 | @Override 209 | public void close() throws IOException { 210 | // nothing to do here 211 | } 212 | } 213 | } 214 | 215 | public static class RedisLastAccessInputSplit extends InputSplit implements 216 | Writable { 217 | 218 | /** 219 | * The Redis instance location 220 | */ 221 | private String location = null; 222 | private List hashKeys = new ArrayList(); 223 | 224 | public RedisLastAccessInputSplit() { 225 | // Default constructor for reflection 226 | } 227 | 228 | public RedisLastAccessInputSplit(String redisHost) { 229 | this.location = redisHost; 230 | } 231 | 232 | public void addHashKey(String key) { 233 | hashKeys.add(key); 234 | } 235 | 236 | public void removeHashKey(String key) { 237 | hashKeys.remove(key); 238 | } 239 | 240 | public List getHashKeys() { 241 | return hashKeys; 242 | } 243 | 244 | @Override 245 | public void readFields(DataInput in) throws IOException { 246 | location = in.readUTF(); 247 | int numKeys = in.readInt(); 248 | hashKeys.clear(); 249 | for (int i = 0; i < numKeys; ++i) { 250 | hashKeys.add(in.readUTF()); 251 | } 252 | } 253 | 254 | @Override 255 | public void write(DataOutput out) throws IOException { 256 | out.writeUTF(location); 257 | out.writeInt(hashKeys.size()); 258 | for (String key : hashKeys) { 259 | 
out.writeUTF(key); 260 | } 261 | } 262 | 263 | @Override 264 | public long getLength() throws IOException, InterruptedException { 265 | return 0; 266 | } 267 | 268 | @Override 269 | public String[] getLocations() throws IOException, InterruptedException { 270 | return new String[] { location }; 271 | } 272 | } 273 | 274 | public static void main(String[] args) throws Exception { 275 | Configuration conf = new Configuration(); 276 | String[] otherArgs = new GenericOptionsParser(conf, args) 277 | .getRemainingArgs(); 278 | 279 | if (otherArgs.length != 2) { 280 | System.err 281 | .println("Usage: PartitionPruning "); 282 | System.exit(1); 283 | } 284 | 285 | String lastAccessMonths = otherArgs[0]; 286 | Path outputDir = new Path(otherArgs[1]); 287 | 288 | Job job = new Job(conf, "Redis Input"); 289 | job.setJarByClass(PartitionPruningInputDriver.class); 290 | 291 | // Use the identity mapper 292 | job.setNumReduceTasks(0); 293 | 294 | job.setInputFormatClass(RedisLastAccessInputFormat.class); 295 | RedisLastAccessInputFormat.setRedisLastAccessMonths(job, 296 | lastAccessMonths); 297 | 298 | job.setOutputFormatClass(TextOutputFormat.class); 299 | TextOutputFormat.setOutputPath(job, outputDir); 300 | 301 | job.setOutputKeyClass(RedisKey.class); 302 | job.setOutputValueClass(Text.class); 303 | 304 | System.exit(job.waitForCompletion(true) ? 0 : 2); 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/PartitionPruningOutputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.util.Calendar; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import mrdp.utils.MRDPUtils; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.fs.Path; 15 | import org.apache.hadoop.io.Text; 16 | import org.apache.hadoop.io.WritableComparable; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.JobContext; 19 | import org.apache.hadoop.mapreduce.Mapper; 20 | import org.apache.hadoop.mapreduce.OutputCommitter; 21 | import org.apache.hadoop.mapreduce.OutputFormat; 22 | import org.apache.hadoop.mapreduce.RecordWriter; 23 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 24 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 25 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 26 | import org.apache.hadoop.util.GenericOptionsParser; 27 | 28 | import redis.clients.jedis.Jedis; 29 | 30 | public class PartitionPruningOutputDriver { 31 | 32 | private static final HashMap MONTH_FROM_INT = new HashMap(); 33 | 34 | static { 35 | MONTH_FROM_INT.put(0, "JAN"); 36 | MONTH_FROM_INT.put(1, "FEB"); 37 | MONTH_FROM_INT.put(2, "MAR"); 38 | MONTH_FROM_INT.put(3, "APR"); 39 | MONTH_FROM_INT.put(4, "MAY"); 40 | MONTH_FROM_INT.put(5, "JUN"); 41 | MONTH_FROM_INT.put(6, "JUL"); 42 | MONTH_FROM_INT.put(7, "AUG"); 43 | MONTH_FROM_INT.put(8, "SEP"); 44 | MONTH_FROM_INT.put(9, "OCT"); 45 | MONTH_FROM_INT.put(10, "NOV"); 46 | MONTH_FROM_INT.put(11, "DEC"); 47 | } 48 | 49 | public static class RedisLastAccessOutputMapper extends 50 | Mapper { 51 | 52 | // This object will format the creation date string into a Date object 53 | private final static SimpleDateFormat frmt = new SimpleDateFormat( 54 | 
"yyyy-MM-dd'T'HH:mm:ss.SSS"); 55 | 56 | private RedisKey outkey = new RedisKey(); 57 | private Text outvalue = new Text(); 58 | 59 | @Override 60 | public void map(Object key, Text value, Context context) 61 | throws IOException, InterruptedException { 62 | 63 | Map parsed = MRDPUtils.transformXmlToMap(value 64 | .toString()); 65 | 66 | String userId = parsed.get("Id"); 67 | String reputation = parsed.get("Reputation"); 68 | 69 | // Grab the last access date 70 | String strDate = parsed.get("LastAccessDate"); 71 | 72 | if (userId == null || reputation == null || strDate == null) { 73 | return; 74 | } 75 | 76 | try { 77 | // Parse the string into a Calendar object 78 | Calendar cal = Calendar.getInstance(); 79 | cal.setTime(frmt.parse(strDate)); 80 | 81 | // Set our output key and values 82 | outkey.setLastAccessMonth(cal.get(Calendar.MONTH)); 83 | outkey.setField(userId); 84 | outvalue.set(reputation); 85 | 86 | context.write(outkey, outvalue); 87 | } catch (ParseException e) { 88 | e.printStackTrace(); 89 | } 90 | } 91 | } 92 | 93 | public static class RedisKey implements WritableComparable { 94 | 95 | private int lastAccessMonth = 0; 96 | private Text field = new Text(); 97 | 98 | public int getLastAccessMonth() { 99 | return this.lastAccessMonth; 100 | } 101 | 102 | public void setLastAccessMonth(int lastAccessMonth) { 103 | this.lastAccessMonth = lastAccessMonth; 104 | } 105 | 106 | public Text getField() { 107 | return this.field; 108 | } 109 | 110 | public void setField(String field) { 111 | this.field.set(field); 112 | } 113 | 114 | @Override 115 | public void readFields(DataInput in) throws IOException { 116 | lastAccessMonth = in.readInt(); 117 | this.field.readFields(in); 118 | } 119 | 120 | @Override 121 | public void write(DataOutput out) throws IOException { 122 | out.writeInt(lastAccessMonth); 123 | this.field.write(out); 124 | } 125 | 126 | @Override 127 | public int compareTo(RedisKey rhs) { 128 | if (this.lastAccessMonth == rhs.getLastAccessMonth()) { 129 | return this.field.compareTo(rhs.getField()); 130 | } else { 131 | return this.lastAccessMonth < rhs.getLastAccessMonth() ? 
-1 : 1; 132 | } 133 | } 134 | 135 | @Override 136 | public String toString() { 137 | return this.lastAccessMonth + "\t" + this.field.toString(); 138 | } 139 | 140 | @Override 141 | public int hashCode() { 142 | return toString().hashCode(); 143 | } 144 | } 145 | 146 | public static class RedisLastAccessOutputFormat extends 147 | OutputFormat<RedisKey, Text> { 148 | 149 | @Override 150 | public RecordWriter<RedisKey, Text> getRecordWriter( 151 | TaskAttemptContext job) throws IOException, 152 | InterruptedException { 153 | return new RedisLastAccessRecordWriter(); 154 | } 155 | 156 | @Override 157 | public void checkOutputSpecs(JobContext context) throws IOException, 158 | InterruptedException { 159 | } 160 | 161 | @Override 162 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) 163 | throws IOException, InterruptedException { 164 | return (new NullOutputFormat<Text, Text>()) 165 | .getOutputCommitter(context); 166 | } 167 | 168 | public static class RedisLastAccessRecordWriter extends 169 | RecordWriter<RedisKey, Text> { 170 | 171 | private HashMap<Integer, Jedis> jedisMap = new HashMap<Integer, Jedis>(); 172 | 173 | public RedisLastAccessRecordWriter() { 174 | // Create a connection to Redis for each host 175 | int i = 0; 176 | for (String host : MRDPUtils.REDIS_INSTANCES) { 177 | Jedis jedis = new Jedis(host); 178 | jedis.connect(); 179 | jedisMap.put(i, jedis); 180 | jedisMap.put(i + 1, jedis); 181 | i += 2; 182 | } 183 | } 184 | 185 | @Override 186 | public void write(RedisKey key, Text value) throws IOException, 187 | InterruptedException { 188 | // Get the Jedis instance that this key/value pair will be 189 | // written to -- (0,1)->0, (2-3)->1, ... , (10-11)->5 190 | Jedis j = jedisMap.get(key.getLastAccessMonth()); 191 | 192 | // Write the key/value pair 193 | j.hset(MONTH_FROM_INT.get(key.getLastAccessMonth()), key 194 | .getField().toString(), value.toString()); 195 | } 196 | 197 | @Override 198 | public void close(TaskAttemptContext context) throws IOException, 199 | InterruptedException { 200 | // For each jedis instance, disconnect it 201 | for (Jedis jedis : jedisMap.values()) { 202 | jedis.disconnect(); 203 | } 204 | } 205 | } 206 | } 207 | 208 | public static void main(String[] args) throws Exception { 209 | Configuration conf = new Configuration(); 210 | String[] otherArgs = new GenericOptionsParser(conf, args) 211 | .getRemainingArgs(); 212 | 213 | if (otherArgs.length != 1) { 214 | System.err.println("Usage: PartitionPruningOutput <user data>"); 215 | System.exit(1); 216 | } 217 | 218 | Path inputPath = new Path(otherArgs[0]); 219 | 220 | Job job = new Job(conf, "Redis Last Access Output"); 221 | job.setJarByClass(PartitionPruningOutputDriver.class); 222 | 223 | job.setMapperClass(RedisLastAccessOutputMapper.class); 224 | job.setNumReduceTasks(0); 225 | 226 | job.setInputFormatClass(TextInputFormat.class); 227 | TextInputFormat.setInputPaths(job, inputPath); 228 | 229 | job.setOutputFormatClass(RedisLastAccessOutputFormat.class); 230 | 231 | job.setOutputKeyClass(RedisKey.class); 232 | job.setOutputValueClass(Text.class); 233 | 234 | int code = job.waitForCompletion(true) ?
0 : 2; 235 | 236 | System.exit(code); 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/RandomDataGenerationDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.DataInput; 5 | import java.io.DataOutput; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.net.URI; 9 | import java.security.InvalidParameterException; 10 | import java.text.SimpleDateFormat; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.Random; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.filecache.DistributedCache; 17 | import org.apache.hadoop.fs.Path; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.io.Text; 20 | import org.apache.hadoop.io.Writable; 21 | import org.apache.hadoop.mapreduce.InputFormat; 22 | import org.apache.hadoop.mapreduce.InputSplit; 23 | import org.apache.hadoop.mapreduce.Job; 24 | import org.apache.hadoop.mapreduce.JobContext; 25 | import org.apache.hadoop.mapreduce.RecordReader; 26 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 27 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 28 | import org.apache.hadoop.util.GenericOptionsParser; 29 | 30 | public class RandomDataGenerationDriver { 31 | 32 | public static class RandomStackOverflowInputFormat extends 33 | InputFormat { 34 | 35 | public static final String NUM_MAP_TASKS = "random.generator.map.tasks"; 36 | public static final String NUM_RECORDS_PER_TASK = "random.generator.num.records.per.map.task"; 37 | public static final String RANDOM_WORD_LIST = "random.generator.random.word.file"; 38 | 39 | @Override 40 | public List getSplits(JobContext job) throws IOException { 41 | 42 | // Get the number of map tasks configured for 43 | int numSplits = job.getConfiguration().getInt(NUM_MAP_TASKS, -1); 44 | if (numSplits <= 0) { 45 | throw new IOException(NUM_MAP_TASKS + " is not set."); 46 | } 47 | 48 | // Create a number of input splits equivalent to the number of tasks 49 | ArrayList splits = new ArrayList(); 50 | for (int i = 0; i < numSplits; ++i) { 51 | splits.add(new FakeInputSplit()); 52 | } 53 | 54 | return splits; 55 | } 56 | 57 | @Override 58 | public RecordReader createRecordReader( 59 | InputSplit split, TaskAttemptContext context) 60 | throws IOException, InterruptedException { 61 | // Create a new RandomStackoverflowRecordReader and initialize it 62 | RandomStackoverflowRecordReader rr = new RandomStackoverflowRecordReader(); 63 | rr.initialize(split, context); 64 | return rr; 65 | } 66 | 67 | public static void setNumMapTasks(Job job, int i) { 68 | job.getConfiguration().setInt(NUM_MAP_TASKS, i); 69 | } 70 | 71 | public static void setNumRecordPerTask(Job job, int i) { 72 | job.getConfiguration().setInt(NUM_RECORDS_PER_TASK, i); 73 | } 74 | 75 | public static void setRandomWordList(Job job, Path file) { 76 | DistributedCache.addCacheFile(file.toUri(), job.getConfiguration()); 77 | } 78 | 79 | public static class RandomStackoverflowRecordReader extends 80 | RecordReader { 81 | 82 | private int numRecordsToCreate = 0; 83 | private int createdRecords = 0; 84 | private Text key = new Text(); 85 | private NullWritable value = NullWritable.get(); 86 | private Random rndm = new Random(); 87 | private ArrayList randomWords = new ArrayList(); 88 | 89 | // This object will format the creation date 
string into a Date 90 | // object 91 | private SimpleDateFormat frmt = new SimpleDateFormat( 92 | "yyyy-MM-dd'T'HH:mm:ss.SSS"); 93 | 94 | @Override 95 | public void initialize(InputSplit split, TaskAttemptContext context) 96 | throws IOException, InterruptedException { 97 | 98 | // Get the number of records to create from the configuration 99 | this.numRecordsToCreate = context.getConfiguration().getInt( 100 | NUM_RECORDS_PER_TASK, -1); 101 | 102 | if (numRecordsToCreate < 0) { 103 | throw new InvalidParameterException(NUM_RECORDS_PER_TASK 104 | + " is not set."); 105 | } 106 | 107 | // Get the list of random words from the DistributedCache 108 | URI[] files = DistributedCache.getCacheFiles(context 109 | .getConfiguration()); 110 | 111 | if (files.length == 0) { 112 | throw new InvalidParameterException( 113 | "Random word list not set in cache."); 114 | } else { 115 | // Read the list of random words into a list 116 | BufferedReader rdr = new BufferedReader(new FileReader( 117 | files[0].toString())); 118 | 119 | String line; 120 | while ((line = rdr.readLine()) != null) { 121 | randomWords.add(line); 122 | } 123 | rdr.close(); 124 | 125 | if (randomWords.size() == 0) { 126 | throw new IOException("Random word list is empty"); 127 | } 128 | } 129 | } 130 | 131 | @Override 132 | public boolean nextKeyValue() throws IOException, 133 | InterruptedException { 134 | // If we still have records to create 135 | if (createdRecords < numRecordsToCreate) { 136 | // Generate random data 137 | int score = Math.abs(rndm.nextInt()) % 15000; 138 | int rowId = Math.abs(rndm.nextInt()) % 1000000000; 139 | int postId = Math.abs(rndm.nextInt()) % 100000000; 140 | int userId = Math.abs(rndm.nextInt()) % 1000000; 141 | String creationDate = frmt 142 | .format(Math.abs(rndm.nextLong())); 143 | 144 | // Create a string of text from the random words 145 | String text = getRandomText(); 146 | 147 | // Assemble a pseudo StackOverflow row record from the random fields 148 | String randomRecord = "<row Id=\"" + rowId + "\" PostId=\"" + postId 149 | + "\" Score=\"" + score + "\" Text=\"" + text 150 | + "\" CreationDate=\"" + creationDate + "\" UserId=\"" + userId + "\" />"; 151 | 152 | key.set(randomRecord); 153 | ++createdRecords; 154 | return true; 155 | } else { 156 | // Else, return false 157 | return false; 158 | } 159 | } 160 | 161 | /** 162 | * Creates a random string of words from the list. 1-30 words per 163 | * string. 164 | * 165 | * @return A random string of words 166 | */ 167 | private String getRandomText() { 168 | StringBuilder bldr = new StringBuilder(); 169 | int numWords = Math.abs(rndm.nextInt()) % 30 + 1; 170 | 171 | for (int i = 0; i < numWords; ++i) { 172 | bldr.append(randomWords.get(Math.abs(rndm.nextInt()) 173 | % randomWords.size()) 174 | + " "); 175 | } 176 | return bldr.toString(); 177 | } 178 | 179 | @Override 180 | public Text getCurrentKey() throws IOException, 181 | InterruptedException { 182 | return key; 183 | } 184 | 185 | @Override 186 | public NullWritable getCurrentValue() throws IOException, 187 | InterruptedException { 188 | return value; 189 | } 190 | 191 | @Override 192 | public float getProgress() throws IOException, InterruptedException { 193 | return (float) createdRecords / (float) numRecordsToCreate; 194 | } 195 | 196 | @Override 197 | public void close() throws IOException { 198 | // nothing to do here... 199 | } 200 | } 201 | 202 | /** 203 | * This class is very empty.
204 | */ 205 | public static class FakeInputSplit extends InputSplit implements 206 | Writable { 207 | 208 | @Override 209 | public void readFields(DataInput arg0) throws IOException { 210 | } 211 | 212 | @Override 213 | public void write(DataOutput arg0) throws IOException { 214 | } 215 | 216 | @Override 217 | public long getLength() throws IOException, InterruptedException { 218 | return 0; 219 | } 220 | 221 | @Override 222 | public String[] getLocations() throws IOException, 223 | InterruptedException { 224 | return new String[0]; 225 | } 226 | } 227 | } 228 | 229 | public static void main(String[] args) throws Exception { 230 | Configuration conf = new Configuration(); 231 | String[] otherArgs = new GenericOptionsParser(conf, args) 232 | .getRemainingArgs(); 233 | if (otherArgs.length != 4) { 234 | System.err 235 | .println("Usage: RandomDataGenerationDriver <num map tasks> <num records per task> <word list> <output directory>"); 236 | System.exit(1); 237 | } 238 | 239 | int numMapTasks = Integer.parseInt(otherArgs[0]); 240 | int numRecordsPerTask = Integer.parseInt(otherArgs[1]); 241 | Path wordList = new Path(otherArgs[2]); 242 | Path outputDir = new Path(otherArgs[3]); 243 | 244 | Job job = new Job(conf, "RandomDataGenerationDriver"); 245 | job.setJarByClass(RandomDataGenerationDriver.class); 246 | 247 | job.setNumReduceTasks(0); 248 | 249 | job.setInputFormatClass(RandomStackOverflowInputFormat.class); 250 | 251 | RandomStackOverflowInputFormat.setNumMapTasks(job, numMapTasks); 252 | RandomStackOverflowInputFormat.setNumRecordPerTask(job, 253 | numRecordsPerTask); 254 | RandomStackOverflowInputFormat.setRandomWordList(job, wordList); 255 | 256 | TextOutputFormat.setOutputPath(job, outputDir); 257 | 258 | job.setOutputKeyClass(Text.class); 259 | job.setOutputValueClass(NullWritable.class); 260 | 261 | System.exit(job.waitForCompletion(true) ? 0 : 2); 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/RedisInputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map.Entry; 10 | 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.fs.Path; 13 | import org.apache.hadoop.io.Text; 14 | import org.apache.hadoop.io.Writable; 15 | import org.apache.hadoop.mapreduce.InputFormat; 16 | import org.apache.hadoop.mapreduce.InputSplit; 17 | import org.apache.hadoop.mapreduce.Job; 18 | import org.apache.hadoop.mapreduce.JobContext; 19 | import org.apache.hadoop.mapreduce.RecordReader; 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 21 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 22 | import org.apache.hadoop.util.GenericOptionsParser; 23 | import org.apache.log4j.Logger; 24 | 25 | import redis.clients.jedis.Jedis; 26 | 27 | public class RedisInputDriver { 28 | 29 | public static class RedisHashInputFormat extends 30 | InputFormat<Text, Text> { 31 | 32 | public static final String REDIS_HOSTS_CONF = "mapred.redishashinputformat.hosts"; 33 | public static final String REDIS_HASH_KEY_CONF = "mapred.redishashinputformat.key"; 34 | private static final Logger LOG = Logger 35 | .getLogger(RedisHashInputFormat.class); 36 | 37 | /** 38 | * Sets the CSV string of Redis hosts.
39 | * 40 | * @param job 41 | * The job conf 42 | * @param hosts 43 | * The CSV string of Redis hosts 44 | */ 45 | public static void setRedisHosts(Job job, String hosts) { 46 | job.getConfiguration().set(REDIS_HOSTS_CONF, hosts); 47 | } 48 | 49 | /** 50 | * Sets the key of the hash to write to. 51 | * 52 | * @param job 53 | * The job conf 54 | * @param hashKey 55 | * The name of the hash key 56 | */ 57 | public static void setRedisHashKey(Job job, String hashKey) { 58 | job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey); 59 | } 60 | 61 | @Override 62 | public List getSplits(JobContext job) throws IOException { 63 | String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF); 64 | 65 | if (hosts == null || hosts.isEmpty()) { 66 | throw new IOException(REDIS_HOSTS_CONF 67 | + " is not set in configuration."); 68 | } 69 | 70 | String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF); 71 | if (hashKey == null || hashKey.isEmpty()) { 72 | throw new IOException(REDIS_HASH_KEY_CONF 73 | + " is not set in configuration."); 74 | } 75 | 76 | // Create an input split for each host 77 | List splits = new ArrayList(); 78 | for (String host : hosts.split(",")) { 79 | splits.add(new RedisHashInputSplit(host, hashKey)); 80 | } 81 | 82 | LOG.info("Input splits to process: " + splits.size()); 83 | return splits; 84 | } 85 | 86 | @Override 87 | public RecordReader createRecordReader(InputSplit split, 88 | TaskAttemptContext context) throws IOException, 89 | InterruptedException { 90 | return new RedisHashRecordReader(); 91 | } 92 | 93 | public static class RedisHashRecordReader extends 94 | RecordReader { 95 | 96 | private static final Logger LOG = Logger 97 | .getLogger(RedisHashRecordReader.class); 98 | private Iterator> keyValueMapIter = null; 99 | private Text key = new Text(), value = new Text(); 100 | private float processedKVs = 0, totalKVs = 0; 101 | private Entry currentEntry = null; 102 | 103 | @Override 104 | public void initialize(InputSplit split, TaskAttemptContext context) 105 | throws IOException, InterruptedException { 106 | 107 | // Get the host location from the InputSplit 108 | String host = split.getLocations()[0]; 109 | String hashKey = ((RedisHashInputSplit) split).getHashKey(); 110 | 111 | LOG.info("Connecting to " + host + " and reading from " 112 | + hashKey); 113 | 114 | Jedis jedis = new Jedis(host); 115 | jedis.connect(); 116 | jedis.getClient().setTimeoutInfinite(); 117 | 118 | // Get all the key value pairs from the Redis instance and store 119 | // them in memory 120 | totalKVs = jedis.hlen(hashKey); 121 | keyValueMapIter = jedis.hgetAll(hashKey).entrySet().iterator(); 122 | LOG.info("Got " + totalKVs + " from " + hashKey); 123 | jedis.disconnect(); 124 | } 125 | 126 | @Override 127 | public boolean nextKeyValue() throws IOException, 128 | InterruptedException { 129 | 130 | // If the key/value map still has values 131 | if (keyValueMapIter.hasNext()) { 132 | 133 | // Get the current entry and set the Text objects to the 134 | // entry 135 | currentEntry = keyValueMapIter.next(); 136 | key.set(currentEntry.getKey()); 137 | value.set(currentEntry.getValue()); 138 | return true; 139 | } else { 140 | // No more values? return false. 
141 | return false; 142 | } 143 | } 144 | 145 | @Override 146 | public Text getCurrentKey() throws IOException, 147 | InterruptedException { 148 | return key; 149 | } 150 | 151 | @Override 152 | public Text getCurrentValue() throws IOException, 153 | InterruptedException { 154 | return value; 155 | } 156 | 157 | @Override 158 | public float getProgress() throws IOException, InterruptedException { 159 | return processedKVs / totalKVs; 160 | } 161 | 162 | @Override 163 | public void close() throws IOException { 164 | // nothing to do here 165 | } 166 | } 167 | } 168 | 169 | public static class RedisHashInputSplit extends InputSplit implements Writable { 170 | 171 | /** 172 | * The Redis instance location 173 | */ 174 | private String location = null; 175 | 176 | /** 177 | * The Redis hash to read from 178 | */ 179 | private String hashKey = null; 180 | 181 | public RedisHashInputSplit() { 182 | // Default constructor for reflection 183 | } 184 | 185 | public RedisHashInputSplit(String redisHost, String hash) { 186 | this.location = redisHost; 187 | this.hashKey = hash; 188 | } 189 | 190 | public String getHashKey() { 191 | return this.hashKey; 192 | } 193 | 194 | @Override 195 | public void readFields(DataInput in) throws IOException { 196 | this.location = in.readUTF(); 197 | this.hashKey = in.readUTF(); 198 | } 199 | 200 | @Override 201 | public void write(DataOutput out) throws IOException { 202 | out.writeUTF(location); 203 | out.writeUTF(hashKey); 204 | } 205 | 206 | @Override 207 | public long getLength() throws IOException, InterruptedException { 208 | return 0; 209 | } 210 | 211 | @Override 212 | public String[] getLocations() throws IOException, InterruptedException { 213 | return new String[] { location }; 214 | } 215 | } 216 | 217 | public static void main(String[] args) throws Exception { 218 | Configuration conf = new Configuration(); 219 | String[] otherArgs = new GenericOptionsParser(conf, args) 220 | .getRemainingArgs(); 221 | 222 | if (otherArgs.length != 3) { 223 | System.err 224 | .println("Usage: RedisInput "); 225 | System.exit(1); 226 | } 227 | 228 | String hosts = otherArgs[0]; 229 | String hashKey = otherArgs[1]; 230 | Path outputDir = new Path(otherArgs[2]); 231 | 232 | Job job = new Job(conf, "Redis Input"); 233 | job.setJarByClass(RedisInputDriver.class); 234 | 235 | // Use the identity mapper 236 | job.setNumReduceTasks(0); 237 | 238 | job.setInputFormatClass(RedisHashInputFormat.class); 239 | RedisHashInputFormat.setRedisHosts(job, hosts); 240 | RedisHashInputFormat.setRedisHashKey(job, hashKey); 241 | 242 | job.setOutputFormatClass(TextOutputFormat.class); 243 | TextOutputFormat.setOutputPath(job, outputDir); 244 | 245 | job.setOutputKeyClass(Text.class); 246 | job.setOutputValueClass(Text.class); 247 | 248 | System.exit(job.waitForCompletion(true) ? 
0 : 3); 249 | } 250 | } 251 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/ch7/RedisOutputDriver.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch7; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import mrdp.utils.MRDPUtils; 8 | 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapreduce.Job; 13 | import org.apache.hadoop.mapreduce.JobContext; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.OutputCommitter; 16 | import org.apache.hadoop.mapreduce.OutputFormat; 17 | import org.apache.hadoop.mapreduce.RecordWriter; 18 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 19 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 20 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 21 | import org.apache.hadoop.util.GenericOptionsParser; 22 | import org.apache.log4j.Logger; 23 | 24 | import redis.clients.jedis.Jedis; 25 | 26 | public class RedisOutputDriver { 27 | 28 | public static class RedisOutputMapper extends 29 | Mapper { 30 | 31 | private Text outkey = new Text(); 32 | private Text outvalue = new Text(); 33 | 34 | @Override 35 | public void map(Object key, Text value, Context context) 36 | throws IOException, InterruptedException { 37 | 38 | Map parsed = MRDPUtils.transformXmlToMap(value 39 | .toString()); 40 | 41 | String userId = parsed.get("Id"); 42 | String reputation = parsed.get("Reputation"); 43 | 44 | if (userId == null || reputation == null) { 45 | return; 46 | } 47 | 48 | // Set our output key and values 49 | outkey.set(userId); 50 | outvalue.set(reputation); 51 | 52 | context.write(outkey, outvalue); 53 | } 54 | } 55 | 56 | public static class RedisHashOutputFormat extends OutputFormat { 57 | 58 | public static final String REDIS_HOSTS_CONF = "mapred.redishashoutputformat.hosts"; 59 | public static final String REDIS_HASH_KEY_CONF = "mapred.redishashinputformat.key"; 60 | 61 | /** 62 | * Sets the CSV string of Redis hosts. 63 | * 64 | * @param job 65 | * The job conf 66 | * @param hosts 67 | * The CSV string of Redis hosts 68 | */ 69 | public static void setRedisHosts(Job job, String hosts) { 70 | job.getConfiguration().set(REDIS_HOSTS_CONF, hosts); 71 | } 72 | 73 | /** 74 | * Sets the key of the hash to write to. 
75 | * 76 | * @param job 77 | * The job conf 78 | * @param hashKey 79 | * The name of the hash key 80 | */ 81 | public static void setRedisHashKey(Job job, String hashKey) { 82 | job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey); 83 | } 84 | 85 | @Override 86 | public RecordWriter getRecordWriter(TaskAttemptContext job) 87 | throws IOException, InterruptedException { 88 | return new RedisHashRecordWriter(job.getConfiguration().get( 89 | REDIS_HASH_KEY_CONF), job.getConfiguration().get( 90 | REDIS_HOSTS_CONF)); 91 | } 92 | 93 | @Override 94 | public void checkOutputSpecs(JobContext job) 95 | throws IOException { 96 | String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF); 97 | 98 | if (hosts == null || hosts.isEmpty()) { 99 | throw new IOException(REDIS_HOSTS_CONF 100 | + " is not set in configuration."); 101 | } 102 | 103 | String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF); 104 | 105 | if (hashKey == null || hashKey.isEmpty()) { 106 | throw new IOException(REDIS_HASH_KEY_CONF 107 | + " is not set in configuration."); 108 | } 109 | } 110 | 111 | @Override 112 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) 113 | throws IOException, InterruptedException { 114 | return (new NullOutputFormat()) 115 | .getOutputCommitter(context); 116 | } 117 | 118 | public static class RedisHashRecordWriter extends 119 | RecordWriter { 120 | 121 | private static final Logger LOG = Logger 122 | .getLogger(RedisHashRecordWriter.class); 123 | private HashMap jedisMap = new HashMap(); 124 | private String hashKey = null; 125 | 126 | public RedisHashRecordWriter(String hashKey, String hosts) { 127 | LOG.info("Connecting to " + hosts + " and writing to " 128 | + hashKey); 129 | this.hashKey = hashKey; 130 | // Create a connection to Redis for each host 131 | // Map an integer 0-(numRedisInstances - 1) to the instance 132 | int i = 0; 133 | for (String host : hosts.split(",")) { 134 | Jedis jedis = new Jedis(host); 135 | jedis.connect(); 136 | jedisMap.put(i, jedis); 137 | ++i; 138 | } 139 | } 140 | 141 | @Override 142 | public void write(Text key, Text value) throws IOException, 143 | InterruptedException { 144 | // Get the Jedis instance that this key/value pair will be 145 | // written to 146 | Jedis j = jedisMap.get(Math.abs(key.hashCode()) 147 | % jedisMap.size()); 148 | 149 | // Write the key/value pair 150 | j.hset(hashKey, key.toString(), value.toString()); 151 | } 152 | 153 | @Override 154 | public void close(TaskAttemptContext context) throws IOException, 155 | InterruptedException { 156 | // For each jedis instance, disconnect it 157 | for (Jedis jedis : jedisMap.values()) { 158 | jedis.disconnect(); 159 | } 160 | } 161 | } 162 | } 163 | 164 | public static void main(String[] args) throws Exception { 165 | Configuration conf = new Configuration(); 166 | String[] otherArgs = new GenericOptionsParser(conf, args) 167 | .getRemainingArgs(); 168 | 169 | if (otherArgs.length != 3) { 170 | System.err 171 | .println("Usage: RedisOutput "); 172 | System.exit(1); 173 | } 174 | 175 | Path inputPath = new Path(otherArgs[0]); 176 | String hosts = otherArgs[1]; 177 | String hashName = otherArgs[2]; 178 | 179 | Job job = new Job(conf, "Redis Output"); 180 | job.setJarByClass(RedisOutputDriver.class); 181 | 182 | job.setMapperClass(RedisOutputMapper.class); 183 | job.setNumReduceTasks(0); 184 | 185 | job.setInputFormatClass(TextInputFormat.class); 186 | TextInputFormat.setInputPaths(job, inputPath); 187 | 188 | job.setOutputFormatClass(RedisHashOutputFormat.class); 189 | 
RedisHashOutputFormat.setRedisHosts(job, hosts); 190 | RedisHashOutputFormat.setRedisHashKey(job, hashName); 191 | 192 | job.setOutputKeyClass(Text.class); 193 | job.setOutputValueClass(Text.class); 194 | 195 | int code = job.waitForCompletion(true) ? 0 : 2; 196 | 197 | System.exit(code); 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /MRDP/src/main/java/mrdp/utils/MRDPUtils.java: -------------------------------------------------------------------------------- 1 | package mrdp.utils; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class MRDPUtils { 7 | 8 | public static final String[] REDIS_INSTANCES = { "p0", "p1", "p2", "p3", 9 | "p4", "p6" }; 10 | 11 | // This helper function parses the stackoverflow into a Map for us. 12 | public static Map transformXmlToMap(String xml) { 13 | Map map = new HashMap(); 14 | try { 15 | String[] tokens = xml.trim().substring(5, xml.trim().length() - 3) 16 | .split("\""); 17 | 18 | for (int i = 0; i < tokens.length - 1; i += 2) { 19 | String key = tokens[i].trim(); 20 | String val = tokens[i + 1]; 21 | 22 | map.put(key.substring(0, key.length() - 1), val); 23 | } 24 | } catch (StringIndexOutOfBoundsException e) { 25 | System.err.println(xml); 26 | } 27 | 28 | return map; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /MRDP/src/main/resources/highrepusers.bf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adamjshook/mapreducepatterns/315edb587d602774972c8229c65d740bba9f9f83/MRDP/src/main/resources/highrepusers.bf -------------------------------------------------------------------------------- /MRDP/src/main/resources/hotlist.txt: -------------------------------------------------------------------------------- 1 | edited 2 | fix 3 | longer 4 | specified 5 | retrieve 6 | months 7 | representation 8 | jsbin 9 | parameterized 10 | publicly 11 | pleasure 12 | blindly 13 | textual 14 | ordinal 15 | createinstance 16 | trend 17 | bomb 18 | dispatched 19 | retract 20 | promised 21 | loves 22 | approached 23 | urlrequest 24 | webmatrix 25 | borealid 26 | hibernates 27 | timezoneinfo 28 | getcomputedstyle 29 | donnie 30 | looser 31 | mingos 32 | bryant 33 | prejudice 34 | dow 35 | explination 36 | authentic 37 | instinctively 38 | shrugs 39 | tdammers 40 | judiciously 41 | vanishingly 42 | cobble 43 | myassembly 44 | fvu 45 | projectile 46 | sessioninfo 47 | afer 48 | fluctuate 49 | appletviewer 50 | prateek 51 | chnaged 52 | jackpot 53 | jsw 54 | reponses 55 | onlamp 56 | epilogue 57 | weeding 58 | intellectually 59 | honorable 60 | raze 61 | baeltazor 62 | loadxmldoc 63 | fromid 64 | documentfilter 65 | rlh 66 | tolowerinvariant 67 | httpstatus 68 | closeevent 69 | maki 70 | pcampbell 71 | getmethodname 72 | coulmn 73 | sshexec 74 | rhinomock 75 | epaga 76 | vienna 77 | redmon 78 | nsalert 79 | dugres 80 | drorhan 81 | wxperl 82 | preexecute 83 | bashism 84 | txtdescription 85 | salmon 86 | alk 87 | properities 88 | kress 89 | submarine 90 | mcisendstring 91 | rthe 92 | justinfrench 93 | ssiphone 94 | sophos 95 | setsession 96 | objectcontainer 97 | myvalidator 98 | locksupport 99 | jnkrois 100 | canoe -------------------------------------------------------------------------------- /MRDP/src/main/resources/hotlistwords.bf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/adamjshook/mapreducepatterns/315edb587d602774972c8229c65d740bba9f9f83/MRDP/src/main/resources/hotlistwords.bf -------------------------------------------------------------------------------- /MRDP/src/test/java/mrdp/ch5/CartesianProductTest.java: -------------------------------------------------------------------------------- 1 | package mrdp.ch5; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.PrintWriter; 6 | 7 | import mrdp.ch5.CartesianProduct.CartesianInputFormat; 8 | 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapred.JobClient; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.MapReduceBase; 14 | import org.apache.hadoop.mapred.Mapper; 15 | import org.apache.hadoop.mapred.OutputCollector; 16 | import org.apache.hadoop.mapred.Reporter; 17 | import org.apache.hadoop.mapred.RunningJob; 18 | import org.apache.hadoop.mapred.TextInputFormat; 19 | import org.apache.hadoop.mapred.TextOutputFormat; 20 | 21 | public class CartesianProductTest { 22 | 23 | public static void main(String[] args) throws IOException, 24 | InterruptedException { 25 | 26 | File aDir = new File(System.getProperty("user.dir") + "/A"); 27 | aDir.mkdirs(); 28 | File bDir = new File(System.getProperty("user.dir") + "/B"); 29 | bDir.mkdirs(); 30 | 31 | File a1 = new File(System.getProperty("user.dir") + "/A/A1.txt"); 32 | a1.createNewFile(); 33 | 34 | PrintWriter wrtr = new PrintWriter(a1); 35 | 36 | wrtr.println("A11"); 37 | wrtr.println("A12"); 38 | wrtr.println("A13"); 39 | wrtr.println("A14"); 40 | 41 | wrtr.flush(); 42 | wrtr.close(); 43 | 44 | File a2 = new File(System.getProperty("user.dir") + "/A/A2.txt"); 45 | a2.createNewFile(); 46 | 47 | wrtr = new PrintWriter(a2); 48 | 49 | wrtr.println("A21"); 50 | wrtr.println("A22"); 51 | wrtr.println("A23"); 52 | wrtr.println("A24"); 53 | 54 | wrtr.flush(); 55 | wrtr.close(); 56 | 57 | File b1 = new File(System.getProperty("user.dir") + "/B/B1.txt"); 58 | b1.createNewFile(); 59 | 60 | wrtr = new PrintWriter(b1); 61 | 62 | wrtr.println("B11"); 63 | wrtr.println("B12"); 64 | wrtr.println("B13"); 65 | wrtr.println("B14"); 66 | 67 | wrtr.flush(); 68 | wrtr.close(); 69 | 70 | File b2 = new File(System.getProperty("user.dir") + "/B/B2.txt"); 71 | b2.createNewFile(); 72 | 73 | wrtr = new PrintWriter(b2); 74 | 75 | wrtr.println("B21"); 76 | wrtr.println("B22"); 77 | wrtr.println("B23"); 78 | wrtr.println("B24"); 79 | 80 | wrtr.flush(); 81 | wrtr.close(); 82 | 83 | long start = System.currentTimeMillis(); 84 | 85 | // Configure the join type 86 | JobConf job = new JobConf("Cartesian Product"); 87 | job.setJarByClass(CartesianProduct.class); 88 | 89 | job.setMapperClass(CartesianMapper.class); 90 | 91 | job.setNumReduceTasks(0); 92 | 93 | job.setInputFormat(CartesianInputFormat.class); 94 | CartesianInputFormat.setLeftInputInfo(job, TextInputFormat.class, 95 | System.getProperty("user.dir") + "/A"); 96 | CartesianInputFormat.setRightInputInfo(job, TextInputFormat.class, 97 | System.getProperty("user.dir") + "/B"); 98 | 99 | TextOutputFormat.setOutputPath(job, new Path("cartoutputttest")); 100 | 101 | job.setOutputKeyClass(Text.class); 102 | job.setOutputValueClass(Text.class); 103 | 104 | RunningJob jerb = JobClient.runJob(job); 105 | while (!jerb.isComplete()) { 106 | Thread.sleep(1000); 107 | } 108 | 109 | long finish = System.currentTimeMillis(); 110 | 111 | System.out.println("Time in ms: " + (finish - start)); 112 | 113 
| System.exit(jerb.isSuccessful() ? 0 : 2); 114 | } 115 | 116 | public static class CartesianMapper extends MapReduceBase implements 117 | Mapper<Text, Text, Text, Text> { 118 | 119 | @Override 120 | public void map(Text arg0, Text arg1, OutputCollector<Text, Text> arg2, 121 | Reporter arg3) throws IOException { 122 | arg2.collect(arg0, arg1); 123 | System.out.println(arg0 + "\t" + arg1); 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mapreducepatterns 2 | ================= 3 | 4 | Repository for MapReduce Design Patterns (O'Reilly 2012) example source code
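5 | 
6 | Running an example
7 | ------------------
8 | 
9 | Each example is a standalone Hadoop driver with its own `main`. As a rough usage sketch -- the jar name and the word list/output paths below are assumptions, not artifacts shipped with this repository -- the chapter 7 random data generator takes the number of map tasks, the number of records per task, a random word list, and an output directory, and can be launched with `hadoop jar`:
10 | 
11 |     # Hypothetical invocation; adjust the jar name and paths to your build and cluster
12 |     hadoop jar MRDP.jar mrdp.ch7.RandomDataGenerationDriver \
13 |         10 1000000 random-words.txt random-data-out
--------------------------------------------------------------------------------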