├── src
│   └── main
│       └── java
│           └── com
│               └── pramati
│                   └── scraper
│                       ├── google_grp_scraper
│                       │   ├── ScraperStartup.java
│                       │   ├── DownloadWorker.java
│                       │   └── CollectLink.java
│                       └── util
│                           ├── RecoveryUtil.java
│                           └── FileUtil.java
├── README.md
└── pom.xml

/src/main/java/com/pramati/scraper/google_grp_scraper/ScraperStartup.java:
--------------------------------------------------------------------------------
package com.pramati.scraper.google_grp_scraper;

import java.net.URL;

public class ScraperStartup {
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: java -jar google-grp-scraper-0.0.1-SNAPSHOT.jar <group-url> [no-of-downloader-threads]");
            return;
        }

        CollectLink collectLink = new CollectLink();
        URL url = new URL(args[0]);

        // Default to 10 download workers when the second argument is
        // missing or is not a number.
        int noOfWorker = 10;
        if (args.length > 1) {
            try {
                noOfWorker = Integer.parseInt(args[1]);
            } catch (NumberFormatException e) {
                noOfWorker = 10;
            }
        }

        collectLink.init(url, noOfWorker);
        collectLink.scrap();
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
google-grp-scraper is a crawler that takes a Google Groups URL and the number of downloader threads as input.

Usage: java -jar google-grp-scraper-0.0.1-SNAPSHOT.jar <group-url> <no-of-downloader-threads>

Example: java -jar google-grp-scraper-0.0.1-SNAPSHOT.jar 'https://groups.google.com/forum/#!forum/ibm.software.websphere.application-server' 20

OUTPUT: After a run you will find:
1) Downloaded topics under Download/<group-name>/Topics
2) A recovery file under Download/<group-name>/Recovery
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.pramati.scraper</groupId>
    <artifactId>google-grp-scraper</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>google-grp-scraper</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>2.45.0</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-htmlunit-driver</artifactId>
            <version>2.45.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>2.4</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.pramati.scraper.google_grp_scraper.ScraperStartup</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/java/com/pramati/scraper/util/RecoveryUtil.java:
--------------------------------------------------------------------------------
package com.pramati.scraper.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;

public class RecoveryUtil {
    private FileUtil fileUtil = new FileUtil();
    private String recoveryFileName = "RecoveryList.txt";
    private String directorySeparator = "/";
    private String failureRecoveryDirectory = "Recovery";

    /**
     * Reads the recovery file and returns the links that were already
     * downloaded in an earlier run, so they can be skipped.
     */
    public Set<String> getDownloadedLinks(String parentDirOfRecoveryDirectory) {
        Scanner reader = null;
        Set<String> fileStrList = new HashSet<String>();
        File recoveryFile = new File(parentDirOfRecoveryDirectory
                + directorySeparator + failureRecoveryDirectory
                + directorySeparator + recoveryFileName);
        try {
            reader = new Scanner(recoveryFile);
            while (reader.hasNextLine()) {
                String urlString = reader.nextLine().trim();
                if (!urlString.equals("")) {
                    fileStrList.add(urlString);
                }
            }
        } catch (FileNotFoundException e) {
            System.err.println("Recovery file [" + recoveryFile.getPath()
                    + "] does not exist");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // reader is null when the recovery file has not been created yet.
            if (reader != null) {
                reader.close();
            }
        }
        return fileStrList;
    }

    /**
     * Appends a successfully downloaded link to the recovery file.
     */
    public void maintainRecoveryList(String parentDirOfRecoveryDirectory,
            String content) {
        fileUtil.createFileAndAppendContent(recoveryFileName,
                parentDirOfRecoveryDirectory + directorySeparator
                        + failureRecoveryDirectory, content);
    }
}
--------------------------------------------------------------------------------
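The recovery round trip above is easiest to see in isolation. Below is a minimal, hypothetical sketch (not part of the repository sources): RecoveryUtilDemo, the Download/my-group directory, and the sample topic URL are invented for illustration.

import java.util.Set;

import com.pramati.scraper.util.RecoveryUtil;

public class RecoveryUtilDemo {
    public static void main(String[] args) {
        RecoveryUtil recoveryUtil = new RecoveryUtil();
        String parentDir = "Download/my-group"; // hypothetical path

        // Appends the link to Download/my-group/Recovery/RecoveryList.txt,
        // creating the directory and file on first use.
        recoveryUtil.maintainRecoveryList(parentDir,
                "https://groups.google.com/forum/#!topic/my-group/abc123");

        // Reads every non-blank line back as an already-downloaded link.
        Set<String> downloaded = recoveryUtil.getDownloadedLinks(parentDir);
        System.out.println("Recovered " + downloaded.size() + " link(s)");
    }
}

--------------------------------------------------------------------------------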
/src/main/java/com/pramati/scraper/util/FileUtil.java:
--------------------------------------------------------------------------------
package com.pramati.scraper.util;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class FileUtil {

    public void createDir(String dirPath) {
        File file = new File(dirPath);
        if (!file.exists() && !file.mkdirs()) {
            System.err.println("Could not create directory [" + dirPath + "]");
        }
    }

    /**
     * Creates (or overwrites) fileName under parentDirPath and writes the
     * given text to it.
     */
    public void createFileAndWriteTxt(String fileName, String parentDirPath,
            String textToSave) {
        createDir(parentDirPath);
        // "/" is the path separator, so it cannot appear in a file name.
        fileName = fileName.replaceAll("/", "-or-");
        BufferedWriter bw = null;
        File file = new File(parentDirPath, fileName);
        try {
            bw = new BufferedWriter(new FileWriter(file));
            bw.write(textToSave);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (bw != null) {
                try {
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Appends the given text to fileName under parentDirPath, creating the
     * file if it does not exist yet.
     */
    public void createFileAndAppendContent(String fileName,
            String parentDirPath, String textToAppend) {
        createDir(parentDirPath);
        fileName = fileName.replaceAll("/", "-or-");
        BufferedWriter bw = null;
        File file = new File(parentDirPath, fileName);
        try {
            bw = new BufferedWriter(new FileWriter(file, true));
            bw.newLine();
            bw.write(textToAppend);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (bw != null) {
                try {
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
--------------------------------------------------------------------------------
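A short, hypothetical usage sketch (FileUtilDemo and the Download/demo paths are invented) showing why the "/" replacement matters: topic titles are used directly as file names, and a title containing "/" would otherwise be treated as a path.

import com.pramati.scraper.util.FileUtil;

public class FileUtilDemo {
    public static void main(String[] args) {
        FileUtil fileUtil = new FileUtil();

        // "WAS 8.5/Liberty question" is not a valid file name as-is;
        // FileUtil writes it as "WAS 8.5-or-Liberty question".
        fileUtil.createFileAndWriteTxt("WAS 8.5/Liberty question",
                "Download/demo/Topics", "message body...");

        // Appending keeps one growing file, as used for the recovery list.
        fileUtil.createFileAndAppendContent("RecoveryList.txt",
                "Download/demo/Recovery", "https://example.org/topic/1");
    }
}

--------------------------------------------------------------------------------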
/src/main/java/com/pramati/scraper/google_grp_scraper/DownloadWorker.java:
--------------------------------------------------------------------------------
package com.pramati.scraper.google_grp_scraper;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.concurrent.BlockingQueue;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSpan;
import com.pramati.scraper.util.FileUtil;
import com.pramati.scraper.util.RecoveryUtil;

public class DownloadWorker implements Runnable {

    private BlockingQueue<String> linksSharedQueueForDownload;
    private WebClient client;
    private String downloadDirectory = "Download";
    private String topicDirectory = "Topics";
    private String directorySeparator = "/";
    private FileUtil fileUtil = new FileUtil();
    private String groupName;
    private RecoveryUtil recoveryUtil = new RecoveryUtil();

    public DownloadWorker(
            BlockingQueue<String> topicLinksSharedQueueForDownload,
            String groupName) {
        this.linksSharedQueueForDownload = topicLinksSharedQueueForDownload;
        this.groupName = groupName;
    }

    public void run() {
        createClient();
        while (true) {
            String linkForDownload;
            try {
                linkForDownload = linksSharedQueueForDownload.take();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
            // "POISON" is the shutdown marker; CollectLink enqueues one
            // per worker once crawling is finished.
            if (linkForDownload.equalsIgnoreCase("POISON")) {
                break;
            }
            download(linkForDownload);
        }
        client.closeAllWindows();
    }

    @SuppressWarnings("unchecked")
    private void download(String linkForDownload) {
        String titleOfTopic = "";
        String contentOfTopic = "";
        List<HtmlSpan> spanElements = null;
        List<HtmlDivision> divElements = null;
        try {
            WebRequest webReq = new WebRequest(new URL(linkForDownload));
            HtmlPage page = client.getPage(webReq);

            // "t-t" is the topic-title span; the div class below holds the
            // individual message bodies.
            spanElements = (List<HtmlSpan>) page
                    .getByXPath("//span[@id=\"t-t\"]");
            divElements = (List<HtmlDivision>) page
                    .getByXPath("//div[@class=\"G3J0AAD-nb-P\"]");

            if (!spanElements.isEmpty()) {
                titleOfTopic = spanElements.get(0).asText();
                contentOfTopic = "SUBJECT IS :" + titleOfTopic + "\n\n";
            }

            for (HtmlDivision div : divElements) {
                if (div != null && div.asText() != null
                        && !div.asText().equals("")) {
                    contentOfTopic = contentOfTopic + div.asText() + "\n";
                    contentOfTopic = contentOfTopic
                            + "----------------------------------------------------------------------------\n";
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        try {
            if (!titleOfTopic.equals("")) {
                // Truncate long titles so they remain usable as file names.
                if (titleOfTopic.length() > 25) {
                    titleOfTopic = titleOfTopic.substring(0, 25);
                }
                fileUtil.createFileAndWriteTxt(titleOfTopic, downloadDirectory
                        + directorySeparator + groupName + directorySeparator
                        + topicDirectory, contentOfTopic);

                // Record the link so a re-run skips it.
                recoveryUtil.maintainRecoveryList(downloadDirectory
                        + directorySeparator + groupName, linkForDownload);
            } else {
                System.out.println("Nothing to download from "
                        + linkForDownload);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private void createClient() {
        client = new WebClient(BrowserVersion.FIREFOX_24);
        client.getOptions().setJavaScriptEnabled(true);
        client.getOptions().setRedirectEnabled(false);
        client.getOptions().setThrowExceptionOnScriptError(false);
        client.getOptions().setCssEnabled(false);
        client.getOptions().setUseInsecureSSL(false);
        client.getOptions().setThrowExceptionOnFailingStatusCode(false);
        // Wait for the page's AJAX calls to finish before reading the DOM.
        client.setAjaxController(new NicelyResynchronizingAjaxController());
        client.setJavaScriptTimeout(36000);
    }
}
--------------------------------------------------------------------------------
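DownloadWorker and CollectLink coordinate shutdown through the poison-pill idiom: the crawler enqueues one "POISON" marker per worker, and each worker exits its loop when it takes one, so no thread stops while real links remain queued. A stripped-down, self-contained sketch of the idiom (PoisonPillDemo and its sample items are invented for illustration):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PoisonPillDemo {
    private static final String POISON = "POISON";

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = new ArrayBlockingQueue<String>(10);
        int workers = 2;

        for (int i = 0; i < workers; i++) {
            new Thread(new Runnable() {
                public void run() {
                    try {
                        String item;
                        // Each worker consumes until it takes its own pill.
                        while (!(item = queue.take()).equals(POISON)) {
                            System.out.println("processed " + item);
                        }
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            }).start();
        }

        queue.put("link-1");
        queue.put("link-2");
        // One pill per worker guarantees every thread sees exactly one
        // and terminates after the real work is drained.
        for (int i = 0; i < workers; i++) {
            queue.put(POISON);
        }
    }
}

--------------------------------------------------------------------------------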
/src/main/java/com/pramati/scraper/google_grp_scraper/CollectLink.java:
--------------------------------------------------------------------------------
package com.pramati.scraper.google_grp_scraper;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.interactions.Actions;

import com.pramati.scraper.util.RecoveryUtil;

public class CollectLink {
    private Set<String> collectedLinkSet = new HashSet<String>();
    private BlockingQueue<String> linksSharedQueueForDownload = new ArrayBlockingQueue<String>(
            100000);
    private URL urlOfGrp;
    private String groupName;
    private int noOfWorkers;
    private String downloadDirectory = "Download";
    private String directorySeparator = "/";
    private RecoveryUtil recoveryUtil = new RecoveryUtil();
    private Set<String> recoveredLinks;

    public void init(URL url, int noOfWorker) throws Exception {
        this.urlOfGrp = url;
        this.noOfWorkers = noOfWorker;
        setGroupName(urlOfGrp);
    }

    public void scrap() throws InterruptedException {
        WebDriver groupBrowser = new FirefoxDriver();
        this.startDownloader();
        this.startCrawl(groupBrowser);
    }

    private void startCrawl(WebDriver groupBrowser)
            throws InterruptedException {
        groupBrowser.navigate().to(urlOfGrp);
        this.performFailureRecovery();

        Thread.sleep(5000);
        Actions clickAction = new Actions(groupBrowser);
        // The pane's class name is GWT-generated and changes between
        // Google Groups builds (an earlier build used "G3J0AAD-b-F").
        WebElement scrollablePane = groupBrowser.findElement(By
                .className("IVILX2C-b-D"));
        clickAction.moveToElement(scrollablePane).click().build().perform();

        Set<String> links;
        boolean shouldContinueScroll;
        do {
            // Ctrl+End scrolls the pane so the next batch of topics loads.
            Actions scrollAction = new Actions(groupBrowser);
            scrollAction.keyDown(Keys.CONTROL).sendKeys(Keys.END)
                    .keyUp(Keys.CONTROL).perform();
            Thread.sleep(5000);
            links = getNewLinksFromPage(groupBrowser.getPageSource());
            // Stop once scrolling yields no new topic links.
            shouldContinueScroll = !links.isEmpty();
            // Skip links that were already downloaded in an earlier run.
            links.removeAll(recoveredLinks);
            linksSharedQueueForDownload.addAll(links);
        } while (shouldContinueScroll);

        // One poison pill per worker shuts the downloaders down cleanly.
        for (int i = 0; i < noOfWorkers; i++) {
            linksSharedQueueForDownload.add("POISON");
        }
        Thread.sleep(5000);
        groupBrowser.close();
    }

    private void performFailureRecovery() {
        recoveredLinks = recoveryUtil.getDownloadedLinks(downloadDirectory
                + directorySeparator + groupName);
    }

    private Set<String> getNewLinksFromPage(String pageContent) {
        // Matches href="#%21topic/..." anchors in the rendered page source.
        String hyperlinkRegex = "\\s*(?i)href\\s*=\\s*\"(#%21topic(.*?))\"";
        Set<String> links = new HashSet<String>();
        Pattern pattern = Pattern.compile(hyperlinkRegex);
        Matcher matcher = pattern.matcher(pageContent);
        while (matcher.find()) {
            String str = matcher.group(1);
            if (str != null && str.length() > 0) {
                String completeUrlString = getCompleteUrlFromHyperlink(str);
                if (completeUrlString != null
                        && !collectedLinkSet.contains(completeUrlString)) {
                    links.add(completeUrlString);
                    collectedLinkSet.add(completeUrlString);
                }
            }
        }
        return links;
    }

    private void setGroupName(URL urlOfGrp) throws Exception {
        String grpNameRegex = "https://groups.google.com/forum/#(.*?)forum/(.*?)";
        Pattern pattern = Pattern.compile(grpNameRegex);
        Matcher matcher = pattern.matcher(urlOfGrp.toString());
        if (matcher.matches()) {
            groupName = matcher.group(2);
        } else {
            throw new Exception("INVALID GROUP URL");
        }
    }

    private String getCompleteUrlFromHyperlink(String relativeLink) {
        URL completeUrl = null;
        try {
            // Resolve the "#%21topic/..." fragment against the group URL.
            completeUrl = new URL(urlOfGrp, relativeLink);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        return (completeUrl != null) ? completeUrl.toString() : null;
    }

    private void startDownloader() {
        for (int i = 0; i < noOfWorkers; i++) {
            Thread downloaderThread = new Thread(new DownloadWorker(
                    linksSharedQueueForDownload, groupName));
            downloaderThread.start();
        }
    }
}
--------------------------------------------------------------------------------
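getCompleteUrlFromHyperlink leans on java.net.URL's two-argument constructor: a spec that starts with "#" is treated as a reference to the context URL with only the fragment replaced. A quick, hypothetical sketch (UrlResolveDemo and the sample topic id are invented):

import java.net.URL;

public class UrlResolveDemo {
    public static void main(String[] args) throws Exception {
        URL group = new URL(
                "https://groups.google.com/forum/#!forum/ibm.software.websphere.application-server");
        // A fragment-only spec keeps the context's scheme, host and path
        // and swaps in the new fragment.
        URL topic = new URL(group,
                "#%21topic/ibm.software.websphere.application-server/abc123");
        // Prints the group URL with its fragment replaced by the topic one.
        System.out.println(topic);
    }
}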
--------------------------------------------------------------------------------