├── .DS_Store ├── README.md └── hadoop ├── .DS_Store ├── target ├── maven-status │ └── maven-compiler-plugin │ │ ├── testCompile │ │ └── default-testCompile │ │ │ ├── createdFiles.lst │ │ │ └── inputFiles.lst │ │ └── compile │ │ └── default-compile │ │ ├── createdFiles.lst │ │ └── inputFiles.lst ├── HW3-1.0-SNAPSHOT.jar ├── test-classes │ └── HW3 │ │ └── AppTest.class ├── maven-archiver │ └── pom.properties └── surefire-reports │ ├── HW3.AppTest.txt │ └── TEST-HW3.AppTest.xml ├── src └── main │ └── java │ ├── parser │ ├── Parser.java │ └── ParserImpl.java │ ├── enums │ └── PageRankEnums.java │ ├── topk │ ├── TopK.java │ ├── TopKReducer.java │ └── TopKMapper.java │ ├── driver │ └── DriverProgram.java │ ├── pagerank │ ├── PageRankImpl.java │ ├── PageRankReducer.java │ └── PageRankMapper.java │ ├── model │ └── Node.java │ └── parserjob │ └── ParserJob.java ├── Readme.txt ├── pom.xml ├── Makefile └── HW3.iml /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/hadoop-page-rank/master/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PageRankMapReduce 2 | PageRank Implementation for Map Reduce in Hadoop and Apache spark 3 | -------------------------------------------------------------------------------- /hadoop/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/hadoop-page-rank/master/hadoop/.DS_Store -------------------------------------------------------------------------------- /hadoop/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst: -------------------------------------------------------------------------------- 1 | HW3/AppTest.class 2 | -------------------------------------------------------------------------------- /hadoop/target/HW3-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/hadoop-page-rank/master/hadoop/target/HW3-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /hadoop/target/test-classes/HW3/AppTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manthanthakker/hadoop-page-rank/master/hadoop/target/test-classes/HW3/AppTest.class -------------------------------------------------------------------------------- /hadoop/target/maven-archiver/pom.properties: -------------------------------------------------------------------------------- 1 | #Generated by Maven 2 | #Sun Feb 25 13:59:30 EST 2018 3 | version=1.0-SNAPSHOT 4 | groupId=HW3 5 | artifactId=HW3 6 | -------------------------------------------------------------------------------- /hadoop/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst: -------------------------------------------------------------------------------- 1 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/test/java/HW3/AppTest.java 2 | -------------------------------------------------------------------------------- /hadoop/src/main/java/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package parser; 2 | 3 | /** 4 | * @author Manthan Thakker 5 | * @project HW3 6 | * @date 2/19/18 7 | * @email 
thakker.m@husky.neu.edu 8 | */ 9 | public interface Parser { 10 | } 11 | -------------------------------------------------------------------------------- /hadoop/target/surefire-reports/HW3.AppTest.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Test set: HW3.AppTest 3 | ------------------------------------------------------------------------------- 4 | Tests run: 1, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 0.011 sec 5 | -------------------------------------------------------------------------------- /hadoop/Readme.txt: -------------------------------------------------------------------------------- 1 | 3 simple steps to run the Job (Used Joes file): 2 | 3 | 1. Open the Makefile configure 4 | 5 | local.input= ### MENTION INPUT PATH LOCATION 6 | local.output=### MENTION OUTPUT PATH LOCATION 7 | 8 | 2. Type make alone on the terminal 9 | 10 | 3. You can see the output folder topkresults in the folder. 11 | 12 | -------------------------------------------------------------------------------- /hadoop/src/main/java/enums/PageRankEnums.java: -------------------------------------------------------------------------------- 1 | package enums; 2 | 3 | /** 4 | * @author Manthan Thakker 5 | * @project HW3 6 | * @date 2/23/18 7 | * @email thakker.m@husky.neu.edu 8 | */ 9 | 10 | /** 11 | * Global Counters across mao reduce program 12 | */ 13 | public enum PageRankEnums { 14 | UNIQUEPAGES, 15 | DANGLINGNODESNEW, 16 | K 17 | } 18 | -------------------------------------------------------------------------------- /hadoop/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst: -------------------------------------------------------------------------------- 1 | parserjob/ParserJob$ParserMapper.class 2 | model/Value.class 3 | HW3/App.class 4 | parser/ParserImpl$WikiParser.class 5 | parser/Parser.class 6 | topk/TopKMapper$1.class 7 | pagerank/PageRankImpl.class 8 | parserjob/ParserJob$ParserMapper$WikiParser.class 9 | parser/ParserImpl.class 10 | topk/TopKReducer$1.class 11 | pagerank/PageRankMapper.class 12 | Enums/PageRankEnums.class 13 | driver/DriverProgram.class 14 | pagerank/PageRankReducer.class 15 | pagerank/PageRank.class 16 | topk/TopKReducer.class 17 | parserjob/ParserJob.class 18 | topk/TopKMapper.class 19 | topk/TopK.class 20 | -------------------------------------------------------------------------------- /hadoop/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst: -------------------------------------------------------------------------------- 1 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/pagerank/PageRank.java 2 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/pagerank/PageRankMapper.java 3 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/topk/TopKMapper.java 4 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/pagerank/PageRankReducer.java 5 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/parserjob/ParserJob.java 6 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/driver/DriverProgram.java 7 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/parser/Parser.java 8 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/topk/TopKReducer.java 9 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/topk/TopK.java 10 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/model/Value.java 11 | 
/Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/parser/ParserImpl.java 12 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/Enums/PageRankEnums.java 13 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/HW3/App.java 14 | /Users/trailbrazer/Desktop/MR/git/MR/MR/HW3/src/main/java/pagerank/PageRankImpl.java 15 | -------------------------------------------------------------------------------- /hadoop/src/main/java/topk/TopK.java: -------------------------------------------------------------------------------- 1 | package topk; 2 | 3 | import model.Node; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | 13 | /** 14 | * @author Manthan Thakker 15 | * @project HW3 16 | * @date 2/23/18 17 | * @email thakker.m@husky.neu.edu 18 | */ 19 | public class TopK { 20 | 21 | 22 | public static void main(String[] args) throws Exception { 23 | 24 | Configuration conf = new Configuration(); 25 | Job job = Job.getInstance(conf, "Top K"); 26 | job.getConfiguration().set("UNIQUEPAGES", args[2]); 27 | 28 | job.getConfiguration().set("K",args[3]); 29 | // Setup 30 | job.setJarByClass(TopK.class); 31 | job.setMapperClass(TopKMapper.class); 32 | 33 | //Mapper 34 | job.setMapOutputKeyClass(NullWritable.class); 35 | job.setMapOutputValueClass(Node.class); 36 | 37 | job.setReducerClass(TopKReducer.class); 38 | //Reducer 39 | job.setOutputKeyClass(Text.class); 40 | job.setOutputValueClass(Text.class); 41 | 42 | FileInputFormat.addInputPath(job, new Path(args[0])); 43 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 44 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /hadoop/src/main/java/driver/DriverProgram.java: -------------------------------------------------------------------------------- 1 | package driver; 2 | 3 | import pagerank.PageRankImpl; 4 | import parserjob.ParserJob; 5 | import topk.TopK; 6 | 7 | import java.io.FileInputStream; 8 | import java.io.InputStream; 9 | import java.util.Properties; 10 | 11 | /** 12 | * @author Manthan Thakker 13 | * @project HW3 14 | * @date 2/22/18 15 | * @email thakker.m@husky.neu.edu 16 | */ 17 | public class DriverProgram { 18 | 19 | /** 20 | * Initiates the execution 21 | * @param args: The input and the ouput paths 22 | * @throws Exception 23 | */ 24 | public static void main(String args[]) throws Exception { 25 | 26 | // Phase 1 27 | final String dataSetInput; 28 | final String dataSetOutput; 29 | 30 | // Phase 2 31 | final String pageRankInput; 32 | final String pageRankOutput; 33 | 34 | // Phase 3 35 | long UNIQUEPAGES; 36 | final String topKInput; 37 | final String topKoutput; 38 | 39 | long K=10; 40 | 41 | 42 | topKInput = args[1] + "/10"; 43 | topKoutput = args[1] + "/output"; 44 | 45 | String commandLine[] = new String[4]; 46 | commandLine[0] = args[0]; 47 | commandLine[1] = args[1]; 48 | UNIQUEPAGES = ParserJob.main(commandLine); 49 | 50 | commandLine[0] = args[1]; 51 | commandLine[2] = UNIQUEPAGES + ""; 52 | PageRankImpl.main(commandLine); 53 | 54 | commandLine[0] = topKInput; 55 | commandLine[1] = topKoutput; 56 | commandLine[2] = UNIQUEPAGES + ""; 57 | commandLine[3]=K+""; 58 | TopK.main(commandLine); 59 | 60 | } 61 | 62 | 63 | } 64 | -------------------------------------------------------------------------------- /hadoop/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | HW3 7 | HW3 8 | 1.0-SNAPSHOT 9 | jar 10 | 11 | HW3 12 | http://maven.apache.org 13 | 14 | 15 | UTF-8 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.hadoop 23 | hadoop-common 24 | 2.7.3 25 | provided 26 | 27 | 28 | 29 | 30 | org.apache.hadoop 31 | hadoop-mapreduce-client-core 32 | 3.0.0 33 | 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-client 38 | 2.2.0 39 | 40 | 41 | 42 | 43 | org.apache.hadoop 44 | hadoop-core 45 | 1.2.1 46 | 47 | 48 | 49 | 50 | junit 51 | junit 52 | 3.8.1 53 | test 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /hadoop/src/main/java/pagerank/PageRankImpl.java: -------------------------------------------------------------------------------- 1 | package pagerank; 2 | 3 | import enums.PageRankEnums; 4 | import model.Node; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | 13 | /** 14 | * @author Manthan Thakker 15 | * @project HW3 16 | * @date 2/20/18 17 | * @email thakker.m@husky.neu.edu 18 | */ 19 | public class PageRankImpl { 20 | 21 | 22 | public static void main(String[] args) throws Exception { 23 | 24 | long deltaNew = 0l; 25 | 26 | 27 | for (int i = 1; i < 11; i++) { 28 | 29 | Configuration conf = new Configuration(); 30 | 31 | Job job = Job.getInstance(conf, "Page Rank"); 32 | 33 | 34 | job.setJarByClass(PageRankImpl.class); 35 | // SETTING CONTEXT VARIABLES 36 | job.getConfiguration().set("deltaOld", deltaNew + 
""); 37 | job.getConfiguration().set("UNIQUEPAGES", args[2]); 38 | 39 | 40 | // Mapper 41 | job.setMapperClass(PageRankMapper.class); 42 | job.setMapOutputKeyClass(Text.class); 43 | job.setMapOutputValueClass(Node.class); 44 | job.setReducerClass(PageRankReducer.class); 45 | 46 | //Reducer 47 | job.setOutputKeyClass(Text.class); 48 | job.setOutputValueClass(Text.class); 49 | FileInputFormat.addInputPath(job, new Path(args[0] + "/" + (i - 1) )); 50 | FileOutputFormat.setOutputPath(job, new Path(args[0] + "/" + (i) )); 51 | job.waitForCompletion(true); 52 | 53 | //Getting the number of nodes 54 | job.getConfiguration().setLong("numberOfNodes", 18000); 55 | job.getConfiguration().setBoolean("iterate", true); 56 | 57 | // Assigning the dangling node value to old value to use in the next iteration 58 | deltaNew = job.getCounters().findCounter(PageRankEnums.DANGLINGNODESNEW).getValue(); 59 | // initializiong the new delata dangling node to 0 60 | job.getCounters().findCounter(PageRankEnums.DANGLINGNODESNEW).setValue(0l); 61 | 62 | } 63 | } 64 | 65 | } 66 | 67 | -------------------------------------------------------------------------------- /hadoop/src/main/java/model/Node.java: -------------------------------------------------------------------------------- 1 | package model; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | 5 | import java.io.DataInput; 6 | import java.io.DataOutput; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | /** 12 | * @author Manthan Thakker 13 | * @project HW3 14 | * @date 2/20/18 15 | * @email thakker.m@husky.neu.edu 16 | */ 17 | public class Node implements WritableComparable { 18 | 19 | public String id="DEFAULT"; 20 | public Double pageRank = -1.0; 21 | public List neighbors; 22 | public boolean isNode=false; 23 | public static final long SCALEUP = 1000000000l; 24 | 25 | public Node() { 26 | this.id=id; 27 | neighbors = new ArrayList(); 28 | isNode = true; 29 | } 30 | 31 | public Node(String id) { 32 | this.id = id.trim(); 33 | this.pageRank = pageRank; 34 | neighbors = new ArrayList(); 35 | isNode = true; 36 | } 37 | 38 | public Node(String id, Double pageRank) { 39 | this.id = id.trim(); 40 | this.pageRank = pageRank; 41 | neighbors = new ArrayList(); 42 | isNode = false; 43 | } 44 | 45 | // SERILIZATION AND DESERILIZATION METHODS 46 | 47 | public void write(DataOutput dataOutput) throws IOException { 48 | 49 | 50 | dataOutput.writeUTF(id.trim()); 51 | dataOutput.writeBoolean(isNode); 52 | dataOutput.writeDouble(pageRank); 53 | String accumulate = ""; 54 | for (String neighbor : neighbors) 55 | accumulate += neighbor.trim() + ","; 56 | if (accumulate.length() > 0) 57 | dataOutput.writeUTF(accumulate.substring(0, accumulate.length())); 58 | else { 59 | dataOutput.writeUTF(accumulate); 60 | } 61 | 62 | } 63 | 64 | 65 | 66 | public void readFields(DataInput dataInput) throws IOException { 67 | 68 | id = dataInput.readUTF().trim(); 69 | isNode = dataInput.readBoolean(); 70 | pageRank = dataInput.readDouble(); 71 | neighbors = new ArrayList(); 72 | String nei = dataInput.readUTF(); 73 | 74 | String neighborsName[] = nei.split(","); 75 | for (String neighbor : neighborsName) { 76 | neighbors.add(neighbor.trim()); 77 | } 78 | 79 | 80 | } 81 | 82 | @Override 83 | public String toString() { 84 | return "#" + pageRank + "#" + neighbors + "#" + isNode; 85 | } 86 | 87 | 88 | public int compareTo(Object o) { 89 | return pageRank.compareTo(((Node)o).pageRank); 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /hadoop/src/main/java/pagerank/PageRankReducer.java: -------------------------------------------------------------------------------- 1 | package pagerank; 2 | 3 | import enums.PageRankEnums; 4 | import model.Node; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | import java.io.IOException; 10 | import java.util.Iterator; 11 | 12 | /** 13 | * @author Manthan Thakker 14 | * @project HW3 15 | * @date 2/23/18 16 | * @email thakker.m@husky.neu.edu 17 | */ 18 | public class PageRankReducer extends Reducer<Text, Node, Text, Text> { 19 | 20 | Long numberOfNodes; 21 | Configuration configuration; 22 | private final long SCALE_FACTOR = 1000000000000l; 23 | 24 | /** 25 | * Initializes the state variables 26 | * @param context 27 | */ 28 | public void setup(Context context) { 29 | configuration = context.getConfiguration(); 30 | numberOfNodes = Long.parseLong(context.getConfiguration().get("UNIQUEPAGES")); 31 | } 32 | 33 | /** 34 | * 35 | * @param key: The node id 36 | * @param values: List of Nodes / PageRank contributions (isNode will be false) 37 | * @param context: Context 38 | * @throws IOException 39 | * @throws InterruptedException 40 | * All partial PageRank contributions for the same nodeId will be routed to the same reduce call. 41 | * A copy of the Node itself will be routed to the same reduce call. 42 | * We add up all the partial contributions and then emit the new node with the updated pageRank. 43 | */ 44 | public void reduce(Text key, Iterable<Node> values, Context context) throws IOException, InterruptedException { 45 | 46 | // Initializing variables: 47 | Double pageRankTotal = 0.0; 48 | Iterator<Node> iterable = values.iterator(); 49 | Node M = null; 50 | 51 | // Preparing the string to be output 52 | String MString = ""; 53 | 54 | while (iterable.hasNext()) { 55 | Node node = iterable.next(); 56 | // Is this a PageRank contribution or the actual node record? 
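// isNode == true marks the record that carries this page's adjacency list, forwarded by PageRankMapper;
// isNode == false marks a bare partial PageRank contribution from one of its in-links. If no adjacency-list
// record arrives for a key (the page is only ever linked to, never parsed as a page itself), M stays null
// and the else branch below emits it with an empty neighbor list and adds its mass to the dangling counter.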
57 | if (node.isNode) { 58 | M = node; 59 | MString = "#" + M.neighbors + "#" + M.isNode; 60 | } else { 61 | pageRankTotal += node.pageRank; 62 | } 63 | } 64 | 65 | // The pagerank formula 66 | pageRankTotal = (0.15 / (numberOfNodes)) + (0.85 * pageRankTotal); 67 | 68 | if (M != null) { 69 | context.write(new Text(M.id.trim()), new Text("#" + pageRankTotal + MString)); 70 | } else { 71 | long pageRank = Double.valueOf(pageRankTotal * SCALE_FACTOR).longValue(); 72 | context.getCounter(PageRankEnums.DANGLINGNODESNEW).increment(pageRank); 73 | context.write(key, new Text("#" + pageRankTotal + "#[]#true")); 74 | } 75 | } 76 | 77 | 78 | } 79 | 80 | -------------------------------------------------------------------------------- /hadoop/src/main/java/topk/TopKReducer.java: -------------------------------------------------------------------------------- 1 | package topk; 2 | 3 | import enums.PageRankEnums; 4 | import model.Node; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | import java.io.IOException; 10 | import java.util.*; 11 | 12 | /** 13 | * @author Manthan Thakker 14 | * @project HW3 15 | * @date 2/23/18 16 | * @email thakker.m@husky.neu.edu 17 | */ 18 | public class TopKReducer extends Reducer { 19 | private Map pages; 20 | private long topK; 21 | 22 | /** 23 | * Initialzes all the variables 24 | * 25 | * @param context 26 | */ 27 | public void setup(Context context) { 28 | pages = new HashMap(); 29 | this.topK = Long.parseLong(context.getConfiguration().get("K")); 30 | } 31 | 32 | 33 | /** 34 | * @param key: The Index of the Line 35 | * @param nodeIterator: List of Nodes 36 | * @param context As we know all records will route to the same reduce call. 37 | * Inserts each node and then just sorts and emits the top k results. 38 | */ 39 | public void reduce(NullWritable key, Iterable nodeIterator, Context context) throws IOException, InterruptedException { 40 | Iterator iterator = nodeIterator.iterator(); 41 | while (iterator.hasNext()) { 42 | Node node = iterator.next(); 43 | 44 | pages.put(node.pageRank + "#" + node.id, node); 45 | } 46 | pages = sortByComparator(pages, false); 47 | int i = 0; 48 | for (String page : pages.keySet()) { 49 | context.write(new Text(""), new Text(page)); 50 | i++; 51 | if (i > topK) 52 | break; 53 | } 54 | } 55 | 56 | /** 57 | * Sorts the given Unsorted Map by PageRank Values 58 | * 59 | * @param unsortMap : The map to be sorted 60 | * @param order: False to be ascending 61 | * @return Sorted Map Order 62 | * Picked up sorting from the IR Project done in last semester. 
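 * Note: the descending branch below rescales the parsed PageRank values and casts their difference to int,
 * which can overflow or truncate for large gaps; Double.compare(o2.getValue().pageRank, o1.getValue().pageRank)
 * would be the simpler, overflow-safe way to get the same descending order.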
63 | */ 64 | private static Map sortByComparator(Map unsortMap, final boolean order) { 65 | 66 | List> list = new LinkedList>(unsortMap.entrySet()); 67 | 68 | // Sorting the list based on values 69 | Collections.sort(list, new Comparator>() { 70 | public int compare(Map.Entry o1, 71 | Map.Entry o2) { 72 | if (order) { 73 | return o1.getValue().pageRank.compareTo(o2.getValue().pageRank); 74 | } else { 75 | return (int) (((Double.parseDouble(o2.getKey().split("#")[0]) * 10000000000.0) - (Double.parseDouble(o1.getKey().split("#")[0])) * 10000000000.0) * 10000.0); 76 | 77 | } 78 | } 79 | }); 80 | 81 | // Maintaining insertion order with the help of LinkedList 82 | Map sortedMap = new LinkedHashMap(); 83 | for (Map.Entry entry : list) { 84 | sortedMap.put(entry.getKey(), entry.getValue()); 85 | } 86 | 87 | return sortedMap; 88 | } 89 | } -------------------------------------------------------------------------------- /hadoop/src/main/java/pagerank/PageRankMapper.java: -------------------------------------------------------------------------------- 1 | package pagerank; 2 | 3 | import enums.PageRankEnums; 4 | import model.Node; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | 9 | import java.io.IOException; 10 | import java.util.Arrays; 11 | 12 | /** 13 | * @author Manthan Thakker 14 | * @project HW3 15 | * @date 2/23/18 16 | * @email thakker.m@husky.neu.edu 17 | */ 18 | 19 | public class PageRankMapper extends Mapper { 20 | 21 | double deltaOld = 0.0; 22 | Long numberOfNodes; 23 | Configuration configuration; 24 | private long SCALE_FACTOR = 10000000000000000l; 25 | 26 | 27 | /** 28 | * Initializes the variables from context 29 | * @param context 30 | */ 31 | public void setup(Context context) { 32 | 33 | configuration = context.getConfiguration(); 34 | deltaOld = Long.parseLong(context.getConfiguration().get("deltaOld"))*1.0 / (SCALE_FACTOR); 35 | numberOfNodes = Long.parseLong(context.getConfiguration().get("UNIQUEPAGES")); 36 | } 37 | 38 | 39 | /** 40 | * 41 | * @param nodeId: The Id of the Node 42 | * @param record: the record which contains the string representation of the node 43 | * @param context 44 | * @throws IOException 45 | * @throws InterruptedException 46 | * 47 | * This methods takes in Value as the Node String representation, converts it into object 48 | * It emits a copy of the same to the map reduce phase and then emits the page rank contribution for each 49 | * of the neighbor nodes. 
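 * Dangling mass is carried between iterations through the DANGLINGNODESNEW counter: Hadoop counters are
 * longs, so the fractional contribution is multiplied by SCALE_FACTOR before incrementing, and setup()
 * scales the "deltaOld" value handed in by PageRankImpl back down to a double, which map() then
 * redistributes to every node as 0.85 * deltaOld / numberOfNodes before computing contributions.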
50 | */ 51 | public void map(Object nodeId, Text record, Context context) throws IOException, InterruptedException { 52 | 53 | /// Node in String Format Converted to the Node Object 54 | Node node = parseRecord(record.toString(), numberOfNodes); 55 | 56 | // Add contribution from dangling nodes to PageRank 57 | node.pageRank += 0.85 * (deltaOld / numberOfNodes); 58 | 59 | // Pass along the graph 60 | context.write(new Text(node.id.trim()), node); 61 | 62 | // Emit the pageRank contribution to the neighboring nodes 63 | Double p = 0.0; 64 | if (node.neighbors.size() > 1) { 65 | p = node.pageRank / (numberOfNodes); 66 | 67 | // Contribute Partial PageRank for each of its neighbor 68 | for (String n : node.neighbors) { 69 | 70 | // Node with just pageRank, isNode Field is set to be false 71 | Node pageRankNode = new Node(n, p); 72 | pageRankNode.id = n.trim(); 73 | context.write(new Text(n.trim()), pageRankNode); 74 | 75 | } 76 | } else { 77 | 78 | // If a node has no neighbours than add to the dangling nodes new 79 | double pageRank = ((node.pageRank / (numberOfNodes))); 80 | context.getCounter(PageRankEnums.DANGLINGNODESNEW).increment((long) (pageRank * SCALE_FACTOR)); 81 | } 82 | 83 | } 84 | 85 | /** 86 | * 87 | * @param record: String representation of the node 88 | * @param numberOfNodes: The unique page Count 89 | * @return Node object of the given String representation 90 | */ 91 | public static Node parseRecord(String record, long numberOfNodes) { 92 | Node node = new Node(); 93 | String fields[] = record.toString().split("#"); 94 | 95 | node.id = fields[0].toString().trim(); 96 | if (node.pageRank != -1.0) 97 | node.pageRank = Double.parseDouble(fields[1]); 98 | else 99 | node.pageRank = 1.0 / numberOfNodes; 100 | String neighborsArr[] = fields[2].substring(1, fields[2].length() - 1).split(","); 101 | node.neighbors = Arrays.asList(neighborsArr); 102 | node.isNode = true; 103 | return node; 104 | } 105 | 106 | } 107 | 108 | -------------------------------------------------------------------------------- /hadoop/src/main/java/topk/TopKMapper.java: -------------------------------------------------------------------------------- 1 | package topk; 2 | 3 | import enums.PageRankEnums; 4 | import model.Node; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | import java.io.IOException; 11 | import java.util.*; 12 | 13 | /** 14 | * @author Manthan Thakker 15 | * @project HW3 16 | * @date 2/23/18 17 | * @email thakker.m@husky.neu.edu 18 | */ 19 | public class TopKMapper extends Mapper { 20 | 21 | private Map pages; 22 | private long topK; 23 | 24 | 25 | /** 26 | * Iniitializes all tha variables 27 | * @param context 28 | */ 29 | public void setup(Context context) { 30 | pages = new HashMap(); 31 | this.topK = Long.parseLong(context.getConfiguration().get("K")); 32 | } 33 | 34 | 35 | /** 36 | * 37 | * @param key: The Index of the Line 38 | * @param value: The Node string representation. 39 | * @param context 40 | * Emits each node. 
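 * The map calls only buffer nodes into the pages map; the local top-K selection happens in cleanup(),
 * which sorts the buffered pages by PageRank and emits only the head of the sorted list, so the reducer
 * sees just a small candidate set from each mapper (an in-mapper top-K pattern). Note that the
 * i > topK test in cleanup() (and the same test in TopKReducer) actually lets K + 1 records through;
 * i >= topK would cap the output at exactly K.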
41 | */ 42 | public void map(LongWritable key, Text value, Context context) { 43 | Node node = parseRecord(value.toString()); 44 | pages.put(node.id, node); 45 | } 46 | 47 | 48 | /** 49 | * Sorts locally all the collected papges and emits only top k results 50 | * @param context 51 | * @throws IOException 52 | * @throws InterruptedException 53 | */ 54 | public void cleanup(Context context) throws IOException, InterruptedException { 55 | pages = sortByComparator(pages, false); 56 | int i = 0; 57 | for (String page : pages.keySet()) { 58 | 59 | context.write( NullWritable.get(), pages.get(page)); 60 | i++; 61 | if (i > topK) 62 | break; 63 | } 64 | } 65 | 66 | 67 | /** 68 | * 69 | * @param record: String representation of the node 70 | * @return Node object of the given String representation 71 | */ 72 | public static Node parseRecord(String record) { 73 | Node node = new Node(); 74 | String fields[] = record.toString().split("#"); 75 | 76 | 77 | node.id = fields[0].toString().trim(); 78 | node.pageRank = Double.parseDouble(fields[1]); 79 | String neighborsArr[] = fields[2].substring(1, fields[2].length() - 1).split(","); 80 | 81 | node.neighbors = Arrays.asList(neighborsArr); 82 | node.isNode = true; 83 | 84 | return node; 85 | } 86 | 87 | /** 88 | * Sorts the given Unsorted Map by PageRank Values 89 | * @param unsortMap : The map to be sorted 90 | * @param order: False to be ascending 91 | * @return Sorted Map Order 92 | * Picked up sorting from the IR Project done in last semester. 93 | */ 94 | private static Map sortByComparator(Map unsortMap, final boolean order) { 95 | 96 | List> list = new LinkedList>(unsortMap.entrySet()); 97 | 98 | // Sorting the list based on values 99 | Collections.sort(list, new Comparator>() { 100 | public int compare(Map.Entry o1, 101 | Map.Entry o2) { 102 | if (order) { 103 | return o1.getValue().pageRank.compareTo(o2.getValue().pageRank); 104 | } else { 105 | return o2.getValue().pageRank.compareTo(o1.getValue().pageRank); 106 | 107 | } 108 | } 109 | }); 110 | 111 | // Maintaining insertion order with the help of LinkedList 112 | Map sortedMap = new LinkedHashMap(); 113 | for (Map.Entry entry : list) { 114 | sortedMap.put(entry.getKey(), entry.getValue()); 115 | } 116 | 117 | return sortedMap; 118 | } 119 | } -------------------------------------------------------------------------------- /hadoop/target/surefire-reports/TEST-HW3.AppTest.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /hadoop/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for MapReduce Page Rank project. 2 | 3 | # Customize these paths for your environment. 
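# Example (hypothetical values -- adjust for your machine):
#   hadoop.root=/usr/local/hadoop
#   local.input=input    # directory holding the bz2 wiki dump to parse
#   local.output=output  # the job writes its results under this path
# With those set, 'make alone' builds the jar and runs driver.DriverProgram with
# ${local.input} and ${local.output} in standalone mode.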
4 | # ----------------------------------------------------------- 5 | hadoop.root=/usr/local/Cellar/hadoop/2.8.2 6 | jar.name=HW3-1.0-SNAPSHOT.jar 7 | jar.path=target/${jar.name} 8 | 9 | 10 | #######################################################################TA CONFIGURE HERE############ 11 | job.name=driver.DriverProgram 12 | local.input= ### MENTION INPUT PATH 13 | local.output=### MENTION OUTPUT PATH 14 | #######################################################################TA CONFIGURE HERE############ 15 | 16 | # Pseudo-Cluster Execution 17 | hdfs.user.name=manthan 18 | hdfs.input=input1 19 | hdfs.output=topKResults 20 | 21 | 22 | # AWS EMR Execution 23 | aws.emr.release=emr-5.2.1 24 | aws.region=us-east-1 25 | aws.bucket.name=inputdatasetmapreduce 26 | aws.subnet.id=subnet-612a0f05 27 | aws.input=input1 28 | aws.output=output 29 | aws.log.dir=log 30 | aws.num.nodes=11 31 | aws.instance.type=m4.large 32 | # ----------------------------------------------------------- 33 | 34 | # Compiles code and builds jar (with dependencies). 35 | jar: 36 | mvn clean package 37 | 38 | #Removes local output directory. 39 | clean-local-output: 40 | cp -vR ${local.output} ${local.input} ; rm -rf ${local.input}.* ; rm -rf ${local.output} 41 | 42 | #clean-local-output: 43 | # rm -rf ${local.output} 44 | 45 | 46 | 47 | 48 | 49 | # Runs standalone 50 | # Make sure Hadoop is set up (in /etc/hadoop files) for standalone operation (not pseudo-cluster). 51 | # https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Standalone_Operation 52 | alone: jar clean-local-output 53 | ${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${local.input} ${local.output} 54 | 55 | #alone: jar 56 | 57 | #${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${local.input} ${local.output} 58 | 59 | # Start HDFS 60 | start-hdfs: 61 | ${hadoop.root}/sbin/start-dfs.sh 62 | 63 | # Stop HDFS 64 | stop-hdfs: 65 | ${hadoop.root}/sbin/stop-dfs.sh 66 | 67 | # Start YARN 68 | start-yarn: stop-yarn 69 | ${hadoop.root}/sbin/start-yarn.sh 70 | 71 | # Stop YARN 72 | stop-yarn: 73 | ${hadoop.root}/sbin/stop-yarn.sh 74 | 75 | # Reformats & initializes HDFS. 76 | format-hdfs: stop-hdfs 77 | rm -rf /tmp/hadoop* 78 | ${hadoop.root}/bin/hdfs namenode -format 79 | 80 | # Initializes user & input directories of HDFS. 81 | init-hdfs: start-hdfs 82 | ${hadoop.root}/bin/hdfs dfs -rm -r -f /user 83 | ${hadoop.root}/bin/hdfs dfs -mkdir /user 84 | ${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name} 85 | ${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name}/${hdfs.input} 86 | 87 | # Load data to HDFS 88 | upload-input-hdfs: start-hdfs 89 | ${hadoop.root}/bin/hdfs dfs -put ${local.input}/* /user/${hdfs.user.name}/${hdfs.input} 90 | 91 | # Removes hdfs output directory. 92 | clean-hdfs-output: 93 | ${hadoop.root}/bin/hdfs dfs -rm -r -f ${hdfs.output}* 94 | 95 | # Download output from HDFS to local. 96 | download-output: 97 | mkdir ${local.output} 98 | ${hadoop.root}/bin/hdfs dfs -get ${hdfs.output}/* ${local.output} 99 | 100 | # Runs pseudo-clustered (ALL). ONLY RUN THIS ONCE, THEN USE: make pseudoq 101 | # Make sure Hadoop is set up (in /etc/hadoop files) for pseudo-clustered operation (not standalone). 
102 | # https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation 103 | pseudo: jar stop-yarn format-hdfs init-hdfs upload-input-hdfs start-yarn clean-local-output 104 | ${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${hdfs.input} ${hdfs.output} 105 | make download-output 106 | 107 | # Runs pseudo-clustered (quickie). 108 | pseudoq: jar clean-local-output clean-hdfs-output 109 | ${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${hdfs.input} ${hdfs.output} 110 | make download-output 111 | 112 | # Create S3 bucket. 113 | make-bucket: 114 | aws s3 mb s3://${aws.bucket.name} 115 | 116 | # Upload data to S3 input dir. 117 | upload-input-aws: make-bucket 118 | aws s3 sync ${local.input} s3://${aws.bucket.name}/${aws.input} 119 | 120 | # Delete S3 output dir. 121 | delete-output-aws: 122 | aws s3 rm s3://${aws.bucket.name}/ --recursive --exclude "*" --include "${aws.output}*" 123 | 124 | # Upload application to S3 bucket. 125 | upload-app-aws: 126 | aws s3 cp ${jar.path} s3://${aws.bucket.name} 127 | 128 | # Main EMR launch. 129 | cloud: jar upload-app-aws delete-output-aws 130 | aws emr create-cluster \ 131 | --name "6 Large machines small data set" \ 132 | --release-label ${aws.emr.release} \ 133 | --instance-groups '[{"InstanceCount":${aws.num.nodes},"InstanceGroupType":"CORE","InstanceType":"${aws.instance.type}"},{"InstanceCount":1,"InstanceGroupType":"MASTER","InstanceType":"${aws.instance.type}"}]' \ 134 | --applications Name=Hadoop \ 135 | --steps '[{"Args":["${job.name}","s3://${aws.bucket.name}/${aws.input}","s3://${aws.bucket.name}/${aws.output}"],"Type":"CUSTOM_JAR","Jar":"s3://${aws.bucket.name}/${jar.name}","ActionOnFailure":"TERMINATE_CLUSTER","Name":"Custom JAR"}]' \ 136 | --log-uri s3://${aws.bucket.name}/${aws.log.dir} \ 137 | --service-role EMR_DefaultRole \ 138 | --ec2-attributes InstanceProfile=EMR_EC2_DefaultRole,SubnetId=${aws.subnet.id} \ 139 | --region ${aws.region} \ 140 | --enable-debugging \ 141 | --auto-terminate 142 | 143 | # Download output from S3. 144 | download-output-aws: clean-local-output 145 | mkdir ${local.output} 146 | aws s3 sync s3://${aws.bucket.name}/${aws.output} ${local.output} 147 | 148 | # Change to standalone mode. 149 | switch-standalone: 150 | cp config/standalone/*.xml ${hadoop.root}/etc/hadoop 151 | 152 | # Change to pseudo-cluster mode. 153 | switch-pseudo: 154 | cp config/pseudo/*.xml ${hadoop.root}/etc/hadoop 155 | 156 | # Package for release. 
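# Note: the distro target below still packages under the WordCount name from the starter Makefile and
# copies README.txt, while this project's readme is checked in as Readme.txt; adjust one or the other
# before running it.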
157 | distro: 158 | rm -rf build 159 | mkdir build 160 | mkdir build/deliv 161 | mkdir build/deliv/WordCount 162 | cp pom.xml build/deliv/WordCount 163 | cp -r src build/deliv/WordCount 164 | cp Makefile build/deliv/WordCount 165 | cp README.txt build/deliv/WordCount 166 | tar -czf WordCount.tar.gz -C build/deliv WordCount 167 | cd build/deliv && zip -rq ../../WordCount.zip WordCount 168 | -------------------------------------------------------------------------------- /hadoop/src/main/java/parser/ParserImpl.java: -------------------------------------------------------------------------------- 1 | package parser; 2 | 3 | import java.io.*; 4 | import java.net.URLDecoder; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | import javax.xml.parsers.SAXParser; 11 | import javax.xml.parsers.SAXParserFactory; 12 | 13 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; 14 | import org.xml.sax.Attributes; 15 | import org.xml.sax.InputSource; 16 | import org.xml.sax.SAXException; 17 | import org.xml.sax.XMLReader; 18 | import org.xml.sax.helpers.DefaultHandler; 19 | 20 | /** 21 | * @author Manthan Thakker 22 | * @project HW3 23 | * @date 2/19/18 24 | * @email thakker.m@husky.neu.edu 25 | */ 26 | public class ParserImpl implements Parser { 27 | private static Pattern namePattern; 28 | private static Pattern linkPattern; 29 | 30 | static { 31 | // Keep only html pages not containing tilde (~). 32 | namePattern = Pattern.compile("^([^~]+)$"); 33 | // Keep only html filenames ending relative paths and not containing tilde (~). 34 | linkPattern = Pattern.compile("^\\..*/([^~]+)\\.html$"); 35 | } 36 | 37 | public static void main(String[] args) { 38 | 39 | // Path of the file 40 | String path = "/Users/trailbrazer/Desktop/MR/HW3/input/wikipedia-simple-html.bz2"; 41 | 42 | 43 | long count=0; 44 | 45 | BufferedReader reader = null; 46 | try { 47 | File inputFile = new File(path); 48 | if (!inputFile.exists() || inputFile.isDirectory() || !inputFile.getName().endsWith(".bz2")) { 49 | System.out.println("Input File does not exist or not bz2 file: " + path); 50 | System.exit(1); 51 | } 52 | 53 | // Configure parser. 54 | SAXParserFactory spf = SAXParserFactory.newInstance(); 55 | spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); 56 | SAXParser saxParser = spf.newSAXParser(); 57 | XMLReader xmlReader = saxParser.getXMLReader(); 58 | // Parser fills this list with linked page names. 59 | List linkPageNames = new LinkedList(); 60 | xmlReader.setContentHandler(new WikiParser(linkPageNames)); 61 | 62 | BZip2CompressorInputStream inputStream = new BZip2CompressorInputStream(new FileInputStream(inputFile)); 63 | reader = new BufferedReader(new InputStreamReader(inputStream)); 64 | String line; 65 | 66 | while ((line = reader.readLine()) != null) { 67 | count++; 68 | // Each line formatted as (Wiki-page-name:Wiki-page-html). 69 | int delimLoc = line.indexOf(':'); 70 | String page = line; 71 | // replace & with & 72 | line = line.replaceAll("&", "&").trim(); 73 | String pageName = line.substring(0, delimLoc); 74 | String html = line.substring(delimLoc + 1); 75 | Matcher matcher = namePattern.matcher(pageName); 76 | if (!matcher.find()) { 77 | // Skip this html file, name contains (~). 78 | continue; 79 | } 80 | 81 | // Parse page and fill list of linked pages. 
82 | linkPageNames.clear(); 83 | try { 84 | xmlReader.parse(new InputSource(new StringReader(html))); 85 | } catch (Exception e) { 86 | // Discard ill-formatted pages. 87 | continue; 88 | } 89 | 90 | } 91 | 92 | } 93 | catch (EOFException e) { 94 | 95 | }catch (Exception e) { 96 | e.printStackTrace(); 97 | } 98 | 99 | finally 100 | { 101 | try { 102 | reader.close(); 103 | } catch (IOException e) { 104 | e.printStackTrace(); 105 | } 106 | } 107 | 108 | } 109 | 110 | /** 111 | * Parses a Wikipage, finding links inside bodyContent div element. 112 | */ 113 | private static class WikiParser extends DefaultHandler { 114 | /** 115 | * List of linked pages; filled by parser. 116 | */ 117 | private List linkPageNames; 118 | /** 119 | * Nesting depth inside bodyContent div element. 120 | */ 121 | private int count = 0; 122 | 123 | public WikiParser(List linkPageNames) { 124 | super(); 125 | this.linkPageNames = linkPageNames; 126 | } 127 | 128 | @Override 129 | public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { 130 | super.startElement(uri, localName, qName, attributes); 131 | if ("div".equalsIgnoreCase(qName) && "bodyContent".equalsIgnoreCase(attributes.getValue("id")) && count == 0) { 132 | // Beginning of bodyContent div element. 133 | count = 1; 134 | } else if (count > 0 && "a".equalsIgnoreCase(qName)) { 135 | // Anchor tag inside bodyContent div element. 136 | count++; 137 | String link = attributes.getValue("href"); 138 | if (link == null) { 139 | return; 140 | } 141 | 142 | try { 143 | // Decode escaped characters in URL. 144 | link = URLDecoder.decode(link, "UTF-8"); 145 | } catch (Exception e) { 146 | // Wiki-weirdness; use link as is. 147 | } 148 | // Keep only html filenames ending relative paths and not containing tilde (~). 149 | Matcher matcher = linkPattern.matcher(link); 150 | if (matcher.find()) { 151 | linkPageNames.add(matcher.group(1)); 152 | } 153 | } else if (count > 0) { 154 | // Other element inside bodyContent div. 155 | count++; 156 | } 157 | } 158 | 159 | @Override 160 | public void endElement(String uri, String localName, String qName) throws SAXException { 161 | super.endElement(uri, localName, qName); 162 | if (count > 0) { 163 | // End of element inside bodyContent div. 
164 | count--; 165 | } 166 | } 167 | } 168 | 169 | } 170 | -------------------------------------------------------------------------------- /hadoop/src/main/java/parserjob/ParserJob.java: -------------------------------------------------------------------------------- 1 | package parserjob; 2 | 3 | import enums.PageRankEnums; 4 | import model.Node; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.xml.sax.Attributes; 14 | import org.xml.sax.InputSource; 15 | import org.xml.sax.SAXException; 16 | import org.xml.sax.XMLReader; 17 | import org.xml.sax.helpers.DefaultHandler; 18 | 19 | import javax.xml.parsers.SAXParser; 20 | import javax.xml.parsers.SAXParserFactory; 21 | import java.io.IOException; 22 | import java.io.StringReader; 23 | import java.net.URLDecoder; 24 | import java.util.*; 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | 29 | /** 30 | * @author Manthan Thakker 31 | * @project HW3 32 | * @date 2/20/18 33 | * @email thakker.m@husky.neu.edu 34 | */ 35 | public class ParserJob { 36 | 37 | private static Pattern namePattern; 38 | private static Pattern linkPattern; 39 | 40 | static { 41 | // Keep only html pages not containing tilde (~). 42 | namePattern = Pattern.compile("^([^~]+)$"); 43 | // Keep only html filenames ending relative paths and not containing tilde (~). 44 | linkPattern = Pattern.compile("^\\..*/([^~]+)\\.html$"); 45 | } 46 | 47 | 48 | public static class ParserMapper extends Mapper { 49 | 50 | private Set uniquePages; 51 | 52 | public void setup(Context context) { 53 | uniquePages = new HashSet(); 54 | 55 | } 56 | 57 | public void map(LongWritable key, Text node, Context context) throws IOException, InterruptedException { 58 | 59 | 60 | try { 61 | // Configure parser. 62 | SAXParserFactory spf = SAXParserFactory.newInstance(); 63 | spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); 64 | SAXParser saxParser = spf.newSAXParser(); 65 | XMLReader xmlReader = saxParser.getXMLReader(); 66 | // Parser fills this list with linked page names. 67 | List linkPageNames = new LinkedList(); 68 | xmlReader.setContentHandler(new WikiParser(linkPageNames)); 69 | 70 | 71 | String line = node.toString(); 72 | 73 | Text keyToEmit = new Text(); 74 | 75 | 76 | // Each line formatted as (Wiki-page-name:Wiki-page-html). 77 | int delimLoc = line.indexOf(':'); 78 | String page = line; 79 | // replace & with & 80 | line = line.replaceAll("&", "&").trim(); 81 | String pageName = line.substring(0, delimLoc); 82 | String html = line.substring(delimLoc + 1); 83 | Matcher matcher = namePattern.matcher(pageName); 84 | if (!matcher.find()) { 85 | // Skip this html file, name contains (~). 86 | return; 87 | } 88 | 89 | 90 | // Parse page and fill list of linked pages. 91 | linkPageNames.clear(); 92 | try { 93 | xmlReader.parse(new InputSource(new StringReader(html))); 94 | } catch (Exception e) { 95 | // Discard ill-formatted pages. 
96 | return; 97 | } 98 | 99 | Node newNode = new Node(pageName); 100 | 101 | Set pagesSet=new HashSet(); 102 | pagesSet.addAll(linkPageNames); 103 | 104 | newNode.neighbors = new LinkedList(pagesSet); 105 | uniquePages.add(pageName); 106 | newNode.pageRank=-1.0; 107 | keyToEmit.set(pageName); 108 | context.write(keyToEmit, newNode); 109 | 110 | 111 | } catch (Exception e) { 112 | e.printStackTrace(); 113 | } 114 | } 115 | 116 | public void cleanup(Context context) { 117 | context.getCounter(PageRankEnums.UNIQUEPAGES).increment(uniquePages.size()); 118 | } 119 | 120 | /** 121 | * Parses a Wikipage, finding links inside bodyContent div element. 122 | */ 123 | private static class WikiParser extends DefaultHandler { 124 | /** 125 | * List of linked pages; filled by parser. 126 | */ 127 | private List linkPageNames; 128 | /** 129 | * Nesting depth inside bodyContent div element. 130 | */ 131 | private int count = 0; 132 | 133 | public WikiParser(List linkPageNames) { 134 | super(); 135 | this.linkPageNames = linkPageNames; 136 | } 137 | 138 | @Override 139 | public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { 140 | super.startElement(uri, localName, qName, attributes); 141 | if ("div".equalsIgnoreCase(qName) && "bodyContent".equalsIgnoreCase(attributes.getValue("id")) && count == 0) { 142 | // Beginning of bodyContent div element. 143 | count = 1; 144 | } else if (count > 0 && "a".equalsIgnoreCase(qName)) { 145 | // Anchor tag inside bodyContent div element. 146 | count++; 147 | String link = attributes.getValue("href"); 148 | if (link == null) { 149 | return; 150 | } 151 | 152 | try { 153 | // Decode escaped characters in URL. 154 | link = URLDecoder.decode(link, "UTF-8"); 155 | } catch (Exception e) { 156 | // Wiki-weirdness; use link as is. 157 | } 158 | // Keep only html filenames ending relative paths and not containing tilde (~). 159 | Matcher matcher = linkPattern.matcher(link); 160 | if (matcher.find()) { 161 | linkPageNames.add(matcher.group(1)); 162 | } 163 | } else if (count > 0) { 164 | // Other element inside bodyContent div. 165 | count++; 166 | } 167 | } 168 | 169 | @Override 170 | public void endElement(String uri, String localName, String qName) throws SAXException { 171 | super.endElement(uri, localName, qName); 172 | if (count > 0) { 173 | // End of element inside bodyContent div. 
174 | count--; 175 | } 176 | } 177 | } 178 | } 179 | 180 | 181 | public static long main(String[] args) throws Exception { 182 | 183 | Configuration conf = new Configuration(); 184 | Job job = Job.getInstance(conf, "Parser"); 185 | 186 | // Setup 187 | job.setJarByClass(ParserJob.class); 188 | job.setMapperClass(ParserMapper.class); 189 | 190 | 191 | //Mapper 192 | job.setMapOutputKeyClass(Text.class); 193 | job.setMapOutputValueClass(Node.class); 194 | 195 | 196 | FileInputFormat.addInputPath(job, new Path(args[0])); 197 | FileOutputFormat.setOutputPath(job, new Path(args[1]+"/0")); 198 | job.waitForCompletion(true); 199 | return job.getCounters().findCounter(PageRankEnums.UNIQUEPAGES).getValue(); 200 | 201 | 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /hadoop/HW3.iml: -------------------------------------------------------------------------------- --------------------------------------------------------------------------------
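Below is a small single-process sketch (not part of the repository) of the arithmetic the PageRankMapper/PageRankReducer pair performs in one iteration, on a hypothetical three-page graph. It mirrors the formulas in the code: each node first absorbs 0.85 * deltaOld / N of the previous iteration's dangling mass, then sends pageRank / N to each neighbor (the classic PageRank formulation divides by the node's out-degree rather than N), and the reducer combines what a page receives as 0.15 / N + 0.85 * sum; pages with no outgoing links add their share to the dangling mass carried into the next iteration. The class name and graph are made up for illustration.

import java.util.*;

// Single-process sketch of one iteration as computed by PageRankMapper/PageRankReducer.
// Ranks live in a plain map instead of the HDFS iteration directories used by the jobs.
public class PageRankIterationSketch {
    public static void main(String[] args) {
        Map<String, List<String>> graph = new LinkedHashMap<>();
        graph.put("A", Arrays.asList("B", "C"));
        graph.put("B", Arrays.asList("C"));
        graph.put("C", Collections.emptyList());            // dangling page

        long n = graph.size();
        Map<String, Double> rank = new LinkedHashMap<>();
        graph.keySet().forEach(p -> rank.put(p, 1.0 / n));  // initial rank, as in parseRecord()

        double danglingOld = 0.0;                           // "deltaOld" from the previous iteration
        double danglingNew = 0.0;                           // "DANGLINGNODESNEW" (unscaled)
        Map<String, Double> received = new LinkedHashMap<>();
        graph.keySet().forEach(p -> received.put(p, 0.0));

        // Map phase: redistribute old dangling mass, then emit contributions.
        for (Map.Entry<String, List<String>> e : graph.entrySet()) {
            double pr = rank.get(e.getKey()) + 0.85 * (danglingOld / n);
            if (!e.getValue().isEmpty()) {
                double share = pr / n;                      // the mapper divides by N, not by out-degree
                for (String nb : e.getValue()) received.merge(nb, share, Double::sum);
            } else {
                danglingNew += pr / n;
            }
        }

        // Reduce phase: apply the damping formula from PageRankReducer.
        for (String p : graph.keySet()) {
            rank.put(p, 0.15 / n + 0.85 * received.get(p));
        }
        System.out.println(rank + "  dangling mass carried forward = " + danglingNew);
    }
}

And a minimal round-trip sketch for the Node writable defined in model/Node.java, assuming model.Node and the Hadoop client jars are on the classpath. Worth noting: write() leaves a trailing comma after the last neighbor (the substring call keeps the full string), but String.split(",") drops the trailing empty token, so readFields() still recovers the same list.

import java.io.*;

import model.Node;

public class NodeRoundTrip {
    public static void main(String[] args) throws IOException {
        Node original = new Node("PageA", 0.25);   // two-arg constructor: a contribution record, isNode == false
        original.neighbors.add("PageB");
        original.neighbors.add("PageC");

        // Serialize the way Hadoop would between the map and reduce phases.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and print it.
        Node copy = new Node();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.id + " " + copy);  // expected: PageA #0.25#[PageB, PageC]#false
    }
}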