└── src └── com └── alibaba └── taobao ├── main ├── LinearURLMiningMain.java └── MapReduceURLMiningMain.java └── worker ├── ConfigurableWorker.java ├── LifeCycle.java ├── SimpleTaskIDGenerator.java ├── SimpleURLComparator.java ├── TaskProcessor.java ├── WorkerEvent.java ├── WorkerListener.java ├── WorkerTask.java ├── linear ├── PageURLMiningProcessor.java └── PageURLMiningTask.java └── mapreduce ├── Map2ReduceConnector.java ├── MapReducePageURLMiningTask.java ├── MapReduceURLMiningMain.java ├── PageContentFetchProcessor.java └── URLMatchingProcessor.java /src/com/alibaba/taobao/main/LinearURLMiningMain.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.main; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | import java.util.concurrent.ConcurrentHashMap; 6 | import java.util.concurrent.ConcurrentSkipListSet; 7 | import java.util.concurrent.TimeUnit; 8 | 9 | import com.alibaba.taobao.worker.ConfigurableWorker; 10 | import com.alibaba.taobao.worker.SimpleURLComparator; 11 | import com.alibaba.taobao.worker.WorkerEvent; 12 | import com.alibaba.taobao.worker.WorkerListener; 13 | import com.alibaba.taobao.worker.WorkerTask; 14 | import com.alibaba.taobao.worker.linear.PageURLMiningProcessor; 15 | import com.alibaba.taobao.worker.linear.PageURLMiningTask; 16 | 17 | /** 18 | * Linear version of page URL mining. It's slow but simple. 
19 | * Average time cost for 1000 URLs is: 3800ms 20 | * 21 | * @author xuanyin.zy E-mail:xuanyin.zy@taobao.com 22 | * @since Sep 16, 2012 5:35:40 PM 23 | */ 24 | public class LinearURLMiningMain implements WorkerListener { 25 | private static final String EMPTY_STRING = ""; 26 | 27 | private static final int URL_SIZE_TO_MINE = 10000; 28 | 29 | private static ConcurrentHashMap> taskID2TaskMap = new ConcurrentHashMap>(); 30 | 31 | private static ConcurrentSkipListSet foundURLs = new ConcurrentSkipListSet(new SimpleURLComparator()); 32 | 33 | public static void main(String[] args) throws InterruptedException { 34 | long startTime = System.currentTimeMillis(); 35 | 36 | ConfigurableWorker worker = new ConfigurableWorker("W001"); 37 | worker.setTaskProcessor(new PageURLMiningProcessor()); 38 | 39 | addTask2Worker(worker, new PageURLMiningTask("http://www.taobao.com")); 40 | addTask2Worker(worker, new PageURLMiningTask("http://www.xinhuanet.com")); 41 | addTask2Worker(worker, new PageURLMiningTask("http://www.zol.com.cn")); 42 | addTask2Worker(worker, new PageURLMiningTask("http://www.163.com")); 43 | 44 | LinearURLMiningMain mainListener = new LinearURLMiningMain(); 45 | worker.addListener(mainListener); 46 | 47 | worker.start(); 48 | 49 | String targetURL = EMPTY_STRING; 50 | while (foundURLs.size() < URL_SIZE_TO_MINE) { 51 | targetURL = foundURLs.pollFirst(); 52 | 53 | if (targetURL == null) { 54 | TimeUnit.MILLISECONDS.sleep(50); 55 | continue; 56 | } 57 | 58 | PageURLMiningTask task = new PageURLMiningTask(targetURL); 59 | taskID2TaskMap.putIfAbsent(worker.addTask(task), task); 60 | 61 | TimeUnit.MILLISECONDS.sleep(100); 62 | } 63 | 64 | worker.stop(); 65 | 66 | for (String string : foundURLs) { 67 | System.out.println(string); 68 | } 69 | 70 | System.out.println("Time Cost: " + (System.currentTimeMillis() - startTime) + "ms"); 71 | } 72 | 73 | private static void addTask2Worker(ConfigurableWorker mapWorker_1, PageURLMiningTask task) { 74 | String taskID = 
mapWorker_1.addTask(task); 75 | taskID2TaskMap.put(taskID, task); 76 | } 77 | 78 | @Override 79 | public List intrests() { 80 | return Arrays.asList(WorkerEvent.TASK_COMPLETE, WorkerEvent.TASK_FAILED); 81 | } 82 | 83 | @Override 84 | public void onEvent(WorkerEvent event, Object... args) { 85 | if (WorkerEvent.TASK_FAILED == event) { 86 | System.err.println("Error while extracting URLs"); 87 | return; 88 | } 89 | 90 | if (WorkerEvent.TASK_COMPLETE != event) 91 | return; 92 | 93 | PageURLMiningTask task = (PageURLMiningTask) args[0]; 94 | if (!taskID2TaskMap.containsKey(task.getTaskID())) 95 | return; 96 | 97 | foundURLs.addAll(task.getMinedURLs()); 98 | 99 | System.out.println("Found URL size: " + foundURLs.size()); 100 | 101 | taskID2TaskMap.remove(task.getTaskID()); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/com/alibaba/taobao/main/MapReduceURLMiningMain.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.main; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | import java.util.concurrent.ConcurrentHashMap; 7 | import java.util.concurrent.ConcurrentSkipListSet; 8 | import java.util.concurrent.TimeUnit; 9 | 10 | import com.alibaba.taobao.worker.ConfigurableWorker; 11 | import com.alibaba.taobao.worker.SimpleURLComparator; 12 | import com.alibaba.taobao.worker.WorkerEvent; 13 | import com.alibaba.taobao.worker.WorkerListener; 14 | import com.alibaba.taobao.worker.WorkerTask; 15 | import com.alibaba.taobao.worker.mapreduce.Map2ReduceConnector; 16 | import com.alibaba.taobao.worker.mapreduce.MapReducePageURLMiningTask; 17 | import com.alibaba.taobao.worker.mapreduce.PageContentFetchProcessor; 18 | import com.alibaba.taobao.worker.mapreduce.URLMatchingProcessor; 19 | 20 | /** 21 | * MapReduce version of page URL mining. It's very powerful. 
22 | * 23 | * @author xuanyin.zy E-mail:xuanyin.zy@taobao.com 24 | * @since Sep 16, 2012 5:35:40 PM 25 | */ 26 | public class MapReduceURLMiningMain implements WorkerListener { 27 | private static final String EMPTY_STRING = ""; 28 | 29 | private static final int URL_SIZE_TO_MINE = 10000; 30 | 31 | private static ConcurrentHashMap> taskID2TaskMap = new ConcurrentHashMap>(); 32 | 33 | private static ConcurrentSkipListSet foundURLs = new ConcurrentSkipListSet(new SimpleURLComparator()); 34 | 35 | public static void main(String[] args) throws InterruptedException { 36 | long startTime = System.currentTimeMillis(); 37 | 38 | // four mapers 39 | List mappers = new ArrayList(4); 40 | 41 | ConfigurableWorker mapWorker_1 = new ConfigurableWorker("W_M1"); 42 | ConfigurableWorker mapWorker_2 = new ConfigurableWorker("W_M2"); 43 | ConfigurableWorker mapWorker_3 = new ConfigurableWorker("W_M3"); 44 | ConfigurableWorker mapWorker_4 = new ConfigurableWorker("W_M4"); 45 | mapWorker_1.setTaskProcessor(new PageContentFetchProcessor()); 46 | mapWorker_2.setTaskProcessor(new PageContentFetchProcessor()); 47 | mapWorker_3.setTaskProcessor(new PageContentFetchProcessor()); 48 | mapWorker_4.setTaskProcessor(new PageContentFetchProcessor()); 49 | 50 | mappers.add(mapWorker_1); 51 | mappers.add(mapWorker_2); 52 | mappers.add(mapWorker_3); 53 | mappers.add(mapWorker_4); 54 | 55 | // one reducers 56 | ConfigurableWorker reduceWorker_1 = new ConfigurableWorker("W_R1"); 57 | reduceWorker_1.setTaskProcessor(new URLMatchingProcessor()); 58 | 59 | // bind reducer to final result class 60 | MapReduceURLMiningMain main = new MapReduceURLMiningMain(); 61 | reduceWorker_1.addListener(main); 62 | 63 | // initiate tasks 64 | addTask2Worker(mapWorker_1, new MapReducePageURLMiningTask("http://www.taobao.com")); 65 | addTask2Worker(mapWorker_2, new MapReducePageURLMiningTask("http://www.xinhuanet.com")); 66 | addTask2Worker(mapWorker_3, new MapReducePageURLMiningTask("http://www.zol.com.cn")); 67 | 
addTask2Worker(mapWorker_4, new MapReducePageURLMiningTask("http://www.sina.com.cn/")); 68 | 69 | // bind mapper to reduer 70 | Map2ReduceConnector connector = new Map2ReduceConnector(Arrays.asList(reduceWorker_1)); 71 | mapWorker_1.addListener(connector); 72 | mapWorker_2.addListener(connector); 73 | mapWorker_3.addListener(connector); 74 | mapWorker_4.addListener(connector); 75 | 76 | // start all 77 | mapWorker_1.start(); 78 | mapWorker_2.start(); 79 | mapWorker_3.start(); 80 | mapWorker_4.start(); 81 | reduceWorker_1.start(); 82 | 83 | String targetURL = EMPTY_STRING; 84 | int lastIndex = 0; 85 | while (foundURLs.size() < URL_SIZE_TO_MINE) { 86 | targetURL = foundURLs.pollFirst(); 87 | 88 | if (targetURL == null) { 89 | TimeUnit.MILLISECONDS.sleep(50); 90 | continue; 91 | } 92 | 93 | lastIndex = ++lastIndex % mappers.size(); 94 | MapReducePageURLMiningTask task = new MapReducePageURLMiningTask(targetURL); 95 | taskID2TaskMap.putIfAbsent(mappers.get(lastIndex).addTask(task), task); 96 | 97 | TimeUnit.MILLISECONDS.sleep(100); 98 | } 99 | 100 | // stop all 101 | mapWorker_1.stop(); 102 | mapWorker_2.stop(); 103 | mapWorker_3.stop(); 104 | mapWorker_4.stop(); 105 | reduceWorker_1.stop(); 106 | 107 | for (String string : foundURLs) { 108 | System.out.println(string); 109 | } 110 | 111 | System.out.println("Time Cost: " + (System.currentTimeMillis() - startTime) + "ms"); 112 | } 113 | 114 | private static void addTask2Worker(ConfigurableWorker mapWorker_1, MapReducePageURLMiningTask task) { 115 | String taskID = mapWorker_1.addTask(task); 116 | taskID2TaskMap.put(taskID, task); 117 | } 118 | 119 | @Override 120 | public List intrests() { 121 | return Arrays.asList(WorkerEvent.TASK_COMPLETE, WorkerEvent.TASK_FAILED); 122 | } 123 | 124 | @Override 125 | public void onEvent(WorkerEvent event, Object... 
package com.alibaba.taobao.worker;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;

/**
 * A single-threaded worker: tasks queued via addTask() are processed one at a time
 * by the configured TaskProcessor on a dedicated daemon thread. Registered listeners
 * are notified of TASK_COMPLETE / TASK_FAILED on that worker thread.
 */
public class ConfigurableWorker implements Runnable, LifeCycle {
    /** Bounded queue: addTask() blocks once 5 tasks are pending (back-pressure). */
    private BlockingQueue<WorkerTask<?>> taskQueue = new ArrayBlockingQueue<WorkerTask<?>>(5);

    private Thread thread;

    // ConcurrentHashMap (was HashMap): addListener() may be called after start(), so
    // the worker thread reading this map in fireEvent() needs a thread-safe view.
    private Map<WorkerEvent, CopyOnWriteArrayList<WorkerListener>> listenerMap;

    private TaskProcessor taskProcessor;

    private volatile boolean initiated = false;

    private String workerID;

    public ConfigurableWorker(String workerID) {
        this.workerID = workerID;
    }

    @Override
    public void start() {
        if (!initiated) {
            init();
        }

        thread.start();
    }

    @Override
    public void init() {
        if (initiated)
            return;

        if (taskProcessor == null)
            throw new IllegalStateException("Task Processor must be set first");

        thread = new Thread(this);
        thread.setDaemon(true);

        listenerMap = new ConcurrentHashMap<WorkerEvent, CopyOnWriteArrayList<WorkerListener>>();

        initiated = true;
    }

    @Override
    public void stop() {
        // BUG FIX: guard against stop() before start()/init(), which NPE'd here
        if (thread != null) {
            thread.interrupt();
        }
    }

    /** Notifies every listener registered for this event; runs on the worker thread. */
    public void fireEvent(WorkerEvent event, Object... args) {
        CopyOnWriteArrayList<WorkerListener> listeners = listenerMap.get(event);

        if (listeners == null)
            return;

        for (WorkerListener listener : listeners) {
            listener.onEvent(event, args);
        }
    }

    public synchronized void addListener(WorkerListener listener) {
        if (!initiated) {
            init();
        }

        List<WorkerEvent> intrestEvents = listener.intrests();
        for (WorkerEvent event : intrestEvents) {
            CopyOnWriteArrayList<WorkerListener> listeners = listenerMap.get(event);
            if (listeners == null) {
                listeners = new CopyOnWriteArrayList<WorkerListener>();
            }

            listeners.add(listener);
            listenerMap.put(event, listeners);
        }
    }

    /**
     * Queues a task, blocking while the queue is full.
     *
     * @return the queued task's ID
     */
    public String addTask(WorkerTask<?> task) {
        if (!initiated) {
            init();
        }

        try {
            taskQueue.put(task);
        } catch (InterruptedException e) {
            // BUG FIX: restore the CALLER's interrupt status. The original code called
            // thread.interrupt(), interrupting the worker thread and silently shutting
            // the worker down whenever a producer was interrupted mid-put.
            Thread.currentThread().interrupt();
        }

        return task.getTaskID();
    }

    @Override
    public void run() {
        try {
            for (;;) {
                WorkerTask<?> task = taskQueue.take();

                taskProcessor.process(task);

                // a processor marks the task done on success; anything else is a failure
                if (task.isDone()) {
                    fireEvent(WorkerEvent.TASK_COMPLETE, task);
                    continue;
                }

                fireEvent(WorkerEvent.TASK_FAILED, task);
            }
        } catch (InterruptedException e) {
            System.out.println("Worker mission canceled, remaining task size: " + taskQueue.size());
            return;
        }
    }

    public TaskProcessor getTaskProcessor() {
        return taskProcessor;
    }

    public void setTaskProcessor(TaskProcessor taskProcessor) {
        this.taskProcessor = taskProcessor;
    }

    public String getWorkerID() {
        return workerID;
    }
}
package com.alibaba.taobao.worker;

import java.net.URL;
import java.util.Comparator;

/**
 * Comparator which makes the most significantly different URLs stay on top of the TreeSet.
 *
 * Ordering: URLs on different hosts are ordered by host name; same-host URLs whose
 * paths share more than 40% of their leading characters are considered "similar".
 */
public class SimpleURLComparator implements Comparator<String> {
    /** Percent of shared leading path characters above which two URLs count as similar. */
    private static final int SIMILARITY_PERCENT = 40;

    @Override
    public int compare(String o1, String o2) {
        if (o1.equals(o2))
            return 0;

        try {
            URL url_1 = new URL(o1);
            URL url_2 = new URL(o2);

            // BUG FIX: the original returned 1 whenever hosts differed, regardless of
            // argument order. That violates the Comparator symmetry contract and can
            // corrupt sorted collections (lost or unreachable entries). Order by host.
            int hostOrder = url_1.getHost().compareTo(url_2.getHost());
            if (hostOrder != 0)
                return hostOrder;

            String urlPath_1 = url_1.getPath();
            String urlPath_2 = url_2.getPath();

            int shortestURLLength = Math.min(urlPath_1.length(), urlPath_2.length());

            // BUG FIX: an empty path caused division by zero below; the resulting
            // ArithmeticException fell into the catch block and reported two DISTINCT
            // URLs as equal, silently dropping one of them from the set.
            if (shortestURLLength == 0)
                return o1.compareTo(o2);

            int similarStrSize = 0;
            while (similarStrSize < shortestURLLength
                    && urlPath_1.charAt(similarStrSize) == urlPath_2.charAt(similarStrSize)) {
                similarStrSize++;
            }

            // NOTE(review): this branch is still not antisymmetric for same-host URLs
            // (compare(a,b) and compare(b,a) can both return -1); kept as-is to preserve
            // the original "similar URLs sink" heuristic — confirm the intended ordering.
            return similarStrSize * 100 / shortestURLLength > SIMILARITY_PERCENT ? -1 : 1;
        } catch (Exception e) {
            // not an URL, no ordering information available
            return 0;
        }
    }
}
args); 9 | } 10 | -------------------------------------------------------------------------------- /src/com/alibaba/taobao/worker/WorkerTask.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.worker; 2 | 3 | import java.util.concurrent.Future; 4 | 5 | public abstract class WorkerTask implements Future { 6 | protected String taskID; 7 | 8 | protected boolean done = false; 9 | 10 | protected int priority; 11 | 12 | public WorkerTask(int priority) { 13 | taskID = SimpleTaskIDGenerator.genTaskID(); 14 | } 15 | 16 | public String getTaskID() { 17 | return taskID; 18 | } 19 | 20 | public void setTaskID(String taskID) { 21 | this.taskID = taskID; 22 | } 23 | 24 | public int getPriority() { 25 | return priority; 26 | } 27 | 28 | public void setPriority(int priority) { 29 | this.priority = priority; 30 | } 31 | 32 | @Override 33 | public boolean isDone() { 34 | return done; 35 | } 36 | 37 | public void setDone(boolean done) { 38 | this.done = done; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/com/alibaba/taobao/worker/linear/PageURLMiningProcessor.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.worker.linear; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.net.URL; 7 | import java.net.URLConnection; 8 | import java.util.concurrent.TimeUnit; 9 | import java.util.regex.Matcher; 10 | import java.util.regex.Pattern; 11 | 12 | import com.alibaba.taobao.worker.TaskProcessor; 13 | import com.alibaba.taobao.worker.WorkerTask; 14 | 15 | /** 16 | * Given a specified URL, the processor will try to mine all of the URLs out from the page. The URLs 17 | * are guaranteed to be unique. 
18 | * 19 | * @author xuanyin.zy E-mail:xuanyin.zy@taobao.com 20 | * @since Sep 15, 2012 4:19:15 PM 21 | */ 22 | public class PageURLMiningProcessor implements TaskProcessor { 23 | private static final String URL_PATTERN = "http(s)?://[\\w\\.\\/]*(\\.htm|\\.do|\\.html|\\.xhtm|\\.xhtml)"; 24 | 25 | private static final int MAX_PAGE_SIZE = 1024 * 1024 * 10; 26 | 27 | private static final int BUFFER_SIZE = 128 * 1024; 28 | 29 | @Override 30 | public void process(WorkerTask task) { 31 | if (!(task instanceof PageURLMiningTask)) 32 | throw new IllegalArgumentException("Excepted PageURLMiningTask but was: " + task.getClass().getSimpleName()); 33 | 34 | PageURLMiningTask urlMiningTask = (PageURLMiningTask) task; 35 | 36 | try { 37 | URL url = new URL(urlMiningTask.getTargetURL()); 38 | 39 | URLConnection urlConnection = url.openConnection(); 40 | urlConnection.setConnectTimeout((int) TimeUnit.SECONDS.toMillis(2)); 41 | urlConnection.setReadTimeout((int) TimeUnit.SECONDS.toMillis(2)); 42 | 43 | InputStream inputStream = urlConnection.getInputStream(); 44 | BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream), BUFFER_SIZE); 45 | 46 | StringBuilder pageContent = new StringBuilder(); 47 | 48 | String line = null; 49 | while ((line = reader.readLine()) != null) { 50 | pageContent.append(line); 51 | 52 | if (line.length() > MAX_PAGE_SIZE || pageContent.length() > MAX_PAGE_SIZE) { 53 | break; 54 | } 55 | } 56 | 57 | Matcher matcher = Pattern.compile(URL_PATTERN).matcher(pageContent); 58 | while (matcher.find()) { 59 | urlMiningTask.addMinedURL(matcher.group()); 60 | } 61 | 62 | urlMiningTask.setDone(true); 63 | } catch (Exception e) { 64 | System.err.println("Error while fetching specified URL: " + urlMiningTask.getTargetURL() + "\nException" 65 | + e.toString()); 66 | } finally { 67 | synchronized (urlMiningTask) { 68 | urlMiningTask.notifyAll(); 69 | } 70 | } 71 | } 72 | } 73 | 
-------------------------------------------------------------------------------- /src/com/alibaba/taobao/worker/linear/PageURLMiningTask.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.worker.linear; 2 | 3 | import java.util.HashSet; 4 | import java.util.concurrent.ExecutionException; 5 | import java.util.concurrent.TimeUnit; 6 | import java.util.concurrent.TimeoutException; 7 | 8 | import com.alibaba.taobao.worker.WorkerTask; 9 | 10 | public class PageURLMiningTask extends WorkerTask> { 11 | private static final int NO_PRIORITY = 0; 12 | 13 | private HashSet minedURLs = new HashSet(); 14 | 15 | private String targetURL; 16 | 17 | public PageURLMiningTask(String targetURL) { 18 | super(NO_PRIORITY); 19 | 20 | this.targetURL = targetURL; 21 | } 22 | 23 | @Override 24 | public boolean cancel(boolean mayInterruptIfRunning) { 25 | throw new UnsupportedOperationException("Not implemented yet"); 26 | } 27 | 28 | @Override 29 | public boolean isCancelled() { 30 | throw new UnsupportedOperationException("Not implemented yet"); 31 | } 32 | 33 | @Override 34 | public synchronized HashSet get() throws InterruptedException, ExecutionException { 35 | if (!isDone()) { 36 | wait(); 37 | } 38 | 39 | return minedURLs; 40 | } 41 | 42 | @Override 43 | public synchronized HashSet get(long timeout, TimeUnit unit) throws InterruptedException, 44 | ExecutionException, TimeoutException { 45 | if (!isDone()) { 46 | wait(unit.toMillis(timeout)); 47 | } 48 | 49 | return minedURLs; 50 | } 51 | 52 | public HashSet getMinedURLs() { 53 | return minedURLs; 54 | } 55 | 56 | public void addMinedURL(String url) { 57 | minedURLs.add(url); 58 | } 59 | 60 | public String getTargetURL() { 61 | return targetURL; 62 | } 63 | 64 | public void setTargetURL(String targetURL) { 65 | this.targetURL = targetURL; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- 
/src/com/alibaba/taobao/worker/mapreduce/Map2ReduceConnector.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.worker.mapreduce; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import com.alibaba.taobao.worker.ConfigurableWorker; 8 | import com.alibaba.taobao.worker.WorkerEvent; 9 | import com.alibaba.taobao.worker.WorkerListener; 10 | 11 | public class Map2ReduceConnector implements WorkerListener { 12 | private List reduces = new ArrayList(); 13 | 14 | private int lastIndex = 0; 15 | 16 | public Map2ReduceConnector(List reduces) { 17 | this.reduces.addAll(reduces); 18 | } 19 | 20 | @Override 21 | public List intrests() { 22 | return Arrays.asList(WorkerEvent.TASK_COMPLETE); 23 | } 24 | 25 | @Override 26 | public synchronized void onEvent(WorkerEvent event, Object... args) { 27 | MapReducePageURLMiningTask task = (MapReducePageURLMiningTask) args[0]; 28 | 29 | lastIndex = ++lastIndex % reduces.size(); 30 | reduces.get(lastIndex).addTask(task); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/com/alibaba/taobao/worker/mapreduce/MapReducePageURLMiningTask.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.worker.mapreduce; 2 | 3 | import java.util.HashSet; 4 | import java.util.concurrent.ExecutionException; 5 | import java.util.concurrent.TimeUnit; 6 | import java.util.concurrent.TimeoutException; 7 | 8 | import com.alibaba.taobao.worker.WorkerTask; 9 | 10 | public class MapReducePageURLMiningTask extends WorkerTask> { 11 | private static final int NO_PRIORITY = 0; 12 | 13 | private HashSet minedURLs = new HashSet(); 14 | 15 | private String pageContent; 16 | 17 | private String targetURL; 18 | 19 | public MapReducePageURLMiningTask(String targetURL) { 20 | super(NO_PRIORITY); 21 | 22 | this.targetURL = targetURL; 23 | } 24 | 
25 | @Override 26 | public boolean cancel(boolean mayInterruptIfRunning) { 27 | throw new UnsupportedOperationException("Not implemented yet"); 28 | } 29 | 30 | @Override 31 | public boolean isCancelled() { 32 | throw new UnsupportedOperationException("Not implemented yet"); 33 | } 34 | 35 | @Override 36 | public synchronized HashSet get() throws InterruptedException, ExecutionException { 37 | if (!isDone()) { 38 | wait(); 39 | } 40 | 41 | return minedURLs; 42 | } 43 | 44 | @Override 45 | public synchronized HashSet get(long timeout, TimeUnit unit) throws InterruptedException, 46 | ExecutionException, TimeoutException { 47 | if (!isDone()) { 48 | wait(unit.toMillis(timeout)); 49 | } 50 | 51 | return minedURLs; 52 | } 53 | 54 | public HashSet getMinedURLs() { 55 | return minedURLs; 56 | } 57 | 58 | public void addMinedURL(String url) { 59 | minedURLs.add(url); 60 | } 61 | 62 | public String getTargetURL() { 63 | return targetURL; 64 | } 65 | 66 | public void setTargetURL(String targetURL) { 67 | this.targetURL = targetURL; 68 | } 69 | 70 | public String getPageContent() { 71 | return pageContent; 72 | } 73 | 74 | public void setPageContent(String pageContent) { 75 | this.pageContent = pageContent; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/com/alibaba/taobao/worker/mapreduce/MapReduceURLMiningMain.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.worker.mapreduce; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | import java.util.TreeSet; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | import java.util.concurrent.TimeUnit; 9 | 10 | import com.alibaba.taobao.worker.ConfigurableWorker; 11 | import com.alibaba.taobao.worker.SimpleURLComparator; 12 | import com.alibaba.taobao.worker.WorkerEvent; 13 | import com.alibaba.taobao.worker.WorkerListener; 14 | import 
package com.alibaba.taobao.worker.mapreduce;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

import com.alibaba.taobao.worker.ConfigurableWorker;
import com.alibaba.taobao.worker.SimpleURLComparator;
import com.alibaba.taobao.worker.WorkerEvent;
import com.alibaba.taobao.worker.WorkerListener;
import com.alibaba.taobao.worker.WorkerTask;

/**
 * MapReduce version of page URL mining. It's very powerful.
 * Wait/notify variant: the main loop blocks on foundURLs until the reducer
 * publishes freshly mined URLs.
 *
 * @author xuanyin.zy E-mail:xuanyin.zy@taobao.com
 * @since Sep 16, 2012 5:35:40 PM
 */
public class MapReduceURLMiningMain implements WorkerListener {
    private static final int URL_SIZE_TO_MINE = 5000;

    /** taskID -> task; lets onEvent() ignore completion events for tasks we did not submit. */
    private static ConcurrentHashMap<String, WorkerTask<?>> taskID2TaskMap = new ConcurrentHashMap<String, WorkerTask<?>>();

    // guarded by synchronized (foundURLs); the set doubles as the wait/notify monitor
    private static TreeSet<String> foundURLs = new TreeSet<String>(new SimpleURLComparator());

    public static void main(String[] args) throws InterruptedException {
        long startTime = System.currentTimeMillis();

        // four mappers fetch page content concurrently
        List<ConfigurableWorker> mappers = new ArrayList<ConfigurableWorker>(4);

        ConfigurableWorker mapWorker_1 = new ConfigurableWorker("W_M1");
        ConfigurableWorker mapWorker_2 = new ConfigurableWorker("W_M2");
        ConfigurableWorker mapWorker_3 = new ConfigurableWorker("W_M3");
        ConfigurableWorker mapWorker_4 = new ConfigurableWorker("W_M4");
        mapWorker_1.setTaskProcessor(new PageContentFetchProcessor());
        mapWorker_2.setTaskProcessor(new PageContentFetchProcessor());
        mapWorker_3.setTaskProcessor(new PageContentFetchProcessor());
        mapWorker_4.setTaskProcessor(new PageContentFetchProcessor());

        mappers.add(mapWorker_1);
        mappers.add(mapWorker_2);
        mappers.add(mapWorker_3);
        mappers.add(mapWorker_4);

        // one reducer extracts URLs from the fetched content
        ConfigurableWorker reduceWorker_1 = new ConfigurableWorker("W_R1");
        reduceWorker_1.setTaskProcessor(new URLMatchingProcessor());

        // bind reducer to final result class
        MapReduceURLMiningMain main = new MapReduceURLMiningMain();
        reduceWorker_1.addListener(main);

        // initiate tasks
        // NOTE(review): all four seeds go to mapWorker_1 here (the other mappers only
        // receive work from the dispatch loop below) — confirm this is intended.
        addTask2Worker(mapWorker_1, new MapReducePageURLMiningTask("http://www.taobao.com"));
        addTask2Worker(mapWorker_1, new MapReducePageURLMiningTask("http://www.xinhuanet.com"));
        addTask2Worker(mapWorker_1, new MapReducePageURLMiningTask("http://www.zol.com.cn"));
        addTask2Worker(mapWorker_1, new MapReducePageURLMiningTask("http://www.163.com"));

        // bind mappers to the reducer
        Map2ReduceConnector connector = new Map2ReduceConnector(Arrays.asList(reduceWorker_1));
        mapWorker_1.addListener(connector);
        mapWorker_2.addListener(connector);
        mapWorker_3.addListener(connector);
        mapWorker_4.addListener(connector);

        // start all
        mapWorker_1.start();
        mapWorker_2.start();
        mapWorker_3.start();
        mapWorker_4.start();
        reduceWorker_1.start();

        String targetURL = "";
        int lastIndex = 0;
        for (;;) {
            synchronized (foundURLs) {
                // BUG FIX: the size check now happens under the monitor; the original
                // read the unsynchronized TreeSet from the loop condition (data race)
                if (foundURLs.size() >= URL_SIZE_TO_MINE) {
                    break;
                }

                targetURL = foundURLs.pollFirst();

                if (targetURL == null) {
                    // nothing to dispatch yet; onEvent() notifies when URLs arrive
                    foundURLs.wait();
                    continue;
                }
            }

            // round-robin dispatch over the mappers
            lastIndex = (lastIndex + 1) % mappers.size();
            MapReducePageURLMiningTask task = new MapReducePageURLMiningTask(targetURL);
            taskID2TaskMap.putIfAbsent(mappers.get(lastIndex).addTask(task), task);

            // re-add the dispatched URL so it stays in the final result set and keeps
            // counting toward URL_SIZE_TO_MINE.
            // NOTE(review): this also makes the URL eligible to be polled and mined
            // again — confirm repeated mining is intended.
            synchronized (foundURLs) {
                foundURLs.add(targetURL);
            }

            TimeUnit.MILLISECONDS.sleep(100);
        }

        // stop all
        mapWorker_1.stop();
        mapWorker_2.stop();
        mapWorker_3.stop();
        mapWorker_4.stop();
        reduceWorker_1.stop();

        synchronized (foundURLs) {
            for (String string : foundURLs) {
                System.out.println(string);
            }
        }

        System.out.println("Time Cost: " + (System.currentTimeMillis() - startTime) + "ms");
    }

    /** Submits a task to the given worker and records its ID for event filtering. */
    private static void addTask2Worker(ConfigurableWorker worker, MapReducePageURLMiningTask task) {
        String taskID = worker.addTask(task);
        taskID2TaskMap.put(taskID, task);
    }

    @Override
    public List<WorkerEvent> intrests() {
        return Arrays.asList(WorkerEvent.TASK_COMPLETE, WorkerEvent.TASK_FAILED);
    }

    @Override
    public void onEvent(WorkerEvent event, Object... args) {
        if (WorkerEvent.TASK_FAILED == event) {
            System.err.println("Error while extracting URLs");
            return;
        }

        if (WorkerEvent.TASK_COMPLETE != event)
            return;

        MapReducePageURLMiningTask task = (MapReducePageURLMiningTask) args[0];
        if (!taskID2TaskMap.containsKey(task.getTaskID()))
            return;

        int foundSize;
        synchronized (foundURLs) {
            foundURLs.addAll(task.getMinedURLs());
            foundSize = foundURLs.size();
            // wake the main loop blocked in foundURLs.wait()
            foundURLs.notifyAll();
        }

        System.out.println("Found URL size: " + foundSize);

        taskID2TaskMap.remove(task.getTaskID());
    }
}
url.openConnection(); 29 | urlConnection.setConnectTimeout((int) TimeUnit.SECONDS.toMillis(2)); 30 | urlConnection.setReadTimeout((int) TimeUnit.SECONDS.toMillis(2)); 31 | 32 | InputStream inputStream = urlConnection.getInputStream(); 33 | BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream), BUFFER_SIZE); 34 | 35 | StringBuilder pageContent = new StringBuilder(); 36 | 37 | String line = null; 38 | while ((line = reader.readLine()) != null) { 39 | pageContent.append(line); 40 | 41 | if (line.length() > MAX_PAGE_SIZE || pageContent.length() > MAX_PAGE_SIZE) { 42 | break; 43 | } 44 | } 45 | 46 | mapReduceURLMiningTask.setPageContent(pageContent.toString()); 47 | mapReduceURLMiningTask.setDone(true); 48 | } catch (Exception e) { 49 | System.err.println("Error while fetching specified URL: " + mapReduceURLMiningTask.getTargetURL() 50 | + "\nException" + e.toString()); 51 | } finally { 52 | synchronized (mapReduceURLMiningTask) { 53 | mapReduceURLMiningTask.notifyAll(); 54 | } 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/com/alibaba/taobao/worker/mapreduce/URLMatchingProcessor.java: -------------------------------------------------------------------------------- 1 | package com.alibaba.taobao.worker.mapreduce; 2 | 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | import com.alibaba.taobao.worker.TaskProcessor; 7 | import com.alibaba.taobao.worker.WorkerTask; 8 | 9 | public class URLMatchingProcessor implements TaskProcessor { 10 | private static final String URL_PATTERN = "http(s)?://[\\w\\.\\/]*(\\.htm|\\.do|\\.html|\\.xhtm|\\.xhtml)"; 11 | 12 | @Override 13 | public void process(WorkerTask task) { 14 | if (!(task instanceof MapReducePageURLMiningTask)) 15 | throw new IllegalArgumentException("Excepted PageURLMiningTask but was: " + task.getClass().getSimpleName()); 16 | 17 | MapReducePageURLMiningTask mapReduceURLMiningTask = 
(MapReducePageURLMiningTask) task; 18 | 19 | try { 20 | Matcher matcher = Pattern.compile(URL_PATTERN).matcher(mapReduceURLMiningTask.getPageContent()); 21 | while (matcher.find()) { 22 | mapReduceURLMiningTask.addMinedURL(matcher.group()); 23 | } 24 | 25 | mapReduceURLMiningTask.setDone(true); 26 | } catch (Exception e) { 27 | System.err.println("Error while fetching specified URL: " + mapReduceURLMiningTask.getTargetURL() 28 | + "\nException" + e.toString()); 29 | } finally { 30 | synchronized (mapReduceURLMiningTask) { 31 | mapReduceURLMiningTask.notifyAll(); 32 | } 33 | } 34 | } 35 | } 36 | --------------------------------------------------------------------------------