├── .gitignore ├── file.png ├── invoice.pdf ├── ocr_exemple.jpg ├── screenshot.png ├── README.md ├── src ├── chapter4 │ ├── BasicAuthentication.java │ ├── SearchForm.java │ ├── Authentication.java │ ├── AuthenticationPostRequest.java │ └── FileUploader.java ├── chapter5 │ ├── ChromeHeadless.java │ ├── DirectApiCalls.java │ ├── ExecuteJavascriptFunction.java │ └── InfiniteScrollHeadlessChrome.java ├── chapter3 │ ├── HackerNewsItem.java │ └── HNScraper.java ├── chapter6 │ ├── OpticalCharacterRecognition.java │ ├── PDFScraping.java │ └── ReCaptchaV2.java └── chapter7 │ └── Chapter7.java ├── pom.xml └── user-agents.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.classpath 2 | *.project 3 | *.settings 4 | bin/ 5 | target/ -------------------------------------------------------------------------------- /file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/file.png -------------------------------------------------------------------------------- /invoice.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/invoice.pdf -------------------------------------------------------------------------------- /ocr_exemple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/ocr_exemple.jpg -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/screenshot.png -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # javawebscrapinghandbook_code 2 | 3 | This is the code examples for my ebook : https://www.javawebscrapinghandbook.com 4 | -------------------------------------------------------------------------------- /src/chapter4/BasicAuthentication.java: -------------------------------------------------------------------------------- 1 | package chapter4; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.util.logging.Level; 6 | 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 8 | import com.gargoylesoftware.htmlunit.WebClient; 9 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 10 | 11 | public class BasicAuthentication { 12 | 13 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ; 14 | static final String username = "basic" ; 15 | static final String password = "auth" ; 16 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { 17 | WebClient client = new WebClient(); 18 | client.getOptions().setJavaScriptEnabled(true); 19 | client.getOptions().setCssEnabled(false); 20 | client.getOptions().setUseInsecureSSL(true); 21 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 22 | 23 | HtmlPage page = client.getPage(String.format("https://%s:%s@www.javawebscrapingsandbox.com/basic_auth", username, password)); 24 | System.out.println(page.asText()); 25 | 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/chapter5/ChromeHeadless.java: -------------------------------------------------------------------------------- 1 | package chapter5; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 6 | import org.apache.commons.io.FileUtils; 7 | import org.openqa.selenium.OutputType; 8 | import org.openqa.selenium.TakesScreenshot; 9 | import 
org.openqa.selenium.WebDriver; 10 | import org.openqa.selenium.chrome.ChromeDriver; 11 | import org.openqa.selenium.chrome.ChromeOptions; 12 | 13 | public class ChromeHeadless { 14 | public static void main(String[] args) throws IOException, InterruptedException{ 15 | String chromeDriverPath = "/Users/kevin/Downloads/chromedriver" ; 16 | System.setProperty("webdriver.chrome.driver", chromeDriverPath); 17 | ChromeOptions options = new ChromeOptions(); 18 | options.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors", "--silent"); 19 | WebDriver driver = new ChromeDriver(options); 20 | 21 | // Get the login page 22 | driver.get("https://pro.coinbase.com/trade/BTC-USD"); 23 | Thread.sleep(10000); 24 | 25 | 26 | // Take a screenshot of the current page 27 | File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE); 28 | FileUtils.copyFile(screenshot, new File("screenshot.png")); 29 | driver.close(); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/chapter3/HackerNewsItem.java: -------------------------------------------------------------------------------- 1 | package chapter3; 2 | 3 | public class HackerNewsItem { 4 | private String title; 5 | 6 | private String url ; 7 | private String author; 8 | private int score; 9 | private int position ; 10 | private int id ; 11 | 12 | public HackerNewsItem(String title, String url, String author, int score, int position, int id) { 13 | super(); 14 | this.title = title; 15 | this.url = url; 16 | this.author = author; 17 | this.score = score; 18 | this.position = position; 19 | this.id = id; 20 | } 21 | 22 | public int getId() { 23 | return id; 24 | } 25 | public void setId(int id) { 26 | this.id = id; 27 | } 28 | public int getPosition() { 29 | return position; 30 | } 31 | public void setPosition(int position) { 32 | this.position = position; 33 | } 34 | public String getTitle() { 35 | return title; 36 
| } 37 | public void setTitle(String title) { 38 | this.title = title; 39 | } 40 | public String getUrl() { 41 | return url; 42 | } 43 | public void setUrl(String url) { 44 | this.url = url; 45 | } 46 | public String getAuthor() { 47 | return author; 48 | } 49 | public void setAuthor(String author) { 50 | this.author = author; 51 | } 52 | public int getScore() { 53 | return score; 54 | } 55 | public void setScore(int score) { 56 | this.score = score; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/chapter6/OpticalCharacterRecognition.java: -------------------------------------------------------------------------------- 1 | package chapter6; 2 | 3 | import org.bytedeco.javacpp.*; 4 | import org.bytedeco.javacpp.BytePointer; 5 | import org.bytedeco.javacpp.lept.*; 6 | 7 | import org.bytedeco.javacpp.tesseract.TessBaseAPI; 8 | 9 | public class OpticalCharacterRecognition { 10 | 11 | final static String TESS_DATA_PATH = "/usr/local/Cellar/tesseract/3.05.02/share/tessdata" ; 12 | 13 | public static void main(String[] args) { 14 | BytePointer outText; 15 | TessBaseAPI api = new TessBaseAPI(); 16 | 17 | if (api.Init(TESS_DATA_PATH, "ENG") != 0) { 18 | System.err.println("Could not initialize tesseract."); 19 | System.exit(1); 20 | } 21 | //api.SetVariable("tessedit_char_whitelist", "0123456789,"); 22 | PIX image = lept.pixRead("ocr_exemple.jpg"); 23 | api.SetImage(image); 24 | 25 | // Get OCR result 26 | outText = api.GetUTF8Text(); 27 | String string = outText.getString(); 28 | String invoiceNumber = "" ; 29 | for(String lines : string.split("\\n")){ 30 | if(lines.contains("Invoice")){ 31 | invoiceNumber = lines.split("Invoice Number: ")[1]; 32 | System.out.println(String.format("Invoice number found : %s", invoiceNumber)); 33 | } 34 | } 35 | 36 | // Destroy used object and release memory 37 | api.End(); 38 | outText.deallocate(); 39 | lept.pixDestroy(image); 40 | 41 | } 42 | 43 | } 44 | 
-------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | ebook_webscraping 4 | ebook_webscraping 5 | 0.0.1-SNAPSHOT 6 | 7 | 8 | src 9 | 10 | 11 | maven-compiler-plugin 12 | 3.3 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | net.sourceforge.htmlunit 25 | htmlunit 26 | 2.28 27 | 28 | 29 | com.fasterxml.jackson.core 30 | jackson-databind 31 | 2.7.0 32 | 33 | 34 | 35 | org.seleniumhq.selenium 36 | selenium-java 37 | 3.8.1 38 | 39 | 40 | 41 | org.apache.pdfbox 42 | pdfbox 43 | 2.0.4 44 | 45 | 46 | 47 | org.bytedeco.javacpp-presets 48 | tesseract-platform 49 | 3.05.01-1.4.1 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/chapter5/DirectApiCalls.java: -------------------------------------------------------------------------------- 1 | package chapter5; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.util.Iterator; 6 | import java.util.logging.Level; 7 | 8 | import com.fasterxml.jackson.core.JsonProcessingException; 9 | import com.fasterxml.jackson.databind.JsonNode; 10 | import com.fasterxml.jackson.databind.ObjectMapper; 11 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 12 | import com.gargoylesoftware.htmlunit.Page; 13 | import com.gargoylesoftware.htmlunit.WebClient; 14 | 15 | public class DirectApiCalls { 16 | 17 | public static void parseJson(String jsonString) throws JsonProcessingException, IOException{ 18 | ObjectMapper mapper = new ObjectMapper(); 19 | JsonNode rootNode = mapper.readTree(jsonString); 20 | Iterator elements = rootNode.elements(); 21 | while(elements.hasNext()){ 22 | JsonNode node = elements.next(); 23 | Long id = node.get("id").asLong(); 24 | String name = node.get("name").asText(); 25 | String price = node.get("price").asText(); 26 | 
System.out.println(String.format("Id: %s - Name: %s - Price: %s", id, name, price)); 27 | } 28 | } 29 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { 30 | WebClient client = new WebClient(); 31 | client.getOptions().setJavaScriptEnabled(false); 32 | client.getOptions().setCssEnabled(false); 33 | client.getOptions().setUseInsecureSSL(true); 34 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 35 | 36 | for(int i = 1; i < 5; i++){ 37 | Page json = client.getPage("https://www.javawebscrapingsandbox.com/product/api/" + i ); 38 | parseJson(json.getWebResponse().getContentAsString()); 39 | } 40 | 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/chapter5/ExecuteJavascriptFunction.java: -------------------------------------------------------------------------------- 1 | package chapter5; 2 | 3 | import java.util.List; 4 | 5 | import org.openqa.selenium.By; 6 | import org.openqa.selenium.JavascriptExecutor; 7 | import org.openqa.selenium.WebDriver; 8 | import org.openqa.selenium.WebElement; 9 | import org.openqa.selenium.chrome.ChromeDriver; 10 | import org.openqa.selenium.chrome.ChromeOptions; 11 | 12 | public class ExecuteJavascriptFunction { 13 | 14 | public static void processLines(List lines){ 15 | int size = lines.size(); 16 | System.out.println(String.format("There are %s product rows in the table", size)); 17 | } 18 | public static void main(String[] args) throws InterruptedException { 19 | String chromeDriverPath = "/Users/kevin/.nvm/versions/node/v10.4.0/lib/node_modules/chromedriver/lib/chromedriver/chromedriver" ; 20 | System.setProperty("webdriver.chrome.driver", chromeDriverPath); 21 | ChromeOptions options = new ChromeOptions(); 22 | options.addArguments("--headless" ,"--disable-gpu", "--ignore-certificate-errors", "--silent"); 23 | options.addArguments("window-size=600,400"); 24 | 25 | WebDriver 
driver = new ChromeDriver(options); 26 | JavascriptExecutor js = (JavascriptExecutor) driver; 27 | int pageNumber = 5 ; 28 | 29 | driver.get("https://www.javawebscrapingsandbox.com/product/infinite_scroll"); 30 | for(int i = 3; i < pageNumber + 3; i++){ 31 | js.executeScript("drawNextLines('/product/api/" + i +"');"); 32 | while((Boolean)js.executeScript("return win.data('ajaxready');") == false){ 33 | Thread.sleep(100); 34 | } 35 | } 36 | List rows = driver.findElements(By.xpath("//tr")); 37 | 38 | // do something with the row list 39 | processLines(rows); 40 | 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/chapter5/InfiniteScrollHeadlessChrome.java: -------------------------------------------------------------------------------- 1 | package chapter5; 2 | 3 | import java.util.List; 4 | 5 | import org.openqa.selenium.By; 6 | import org.openqa.selenium.JavascriptExecutor; 7 | import org.openqa.selenium.WebDriver; 8 | import org.openqa.selenium.WebElement; 9 | import org.openqa.selenium.chrome.ChromeDriver; 10 | import org.openqa.selenium.chrome.ChromeOptions; 11 | 12 | public class InfiniteScrollHeadlessChrome { 13 | 14 | 15 | static final String URL = ""; 16 | public static void processLines(List lines){ 17 | int size = lines.size(); 18 | System.out.println(String.format("There are %s product rows in the table", size)); 19 | } 20 | public static void main(String[] args) throws InterruptedException { 21 | String chromeDriverPath = "/Users/kevin/.nvm/versions/node/v10.4.0/lib/node_modules/chromedriver/lib/chromedriver/chromedriver" ; 22 | System.setProperty("webdriver.chrome.driver", chromeDriverPath); 23 | ChromeOptions options = new ChromeOptions(); 24 | options.addArguments("--headless" ,"--disable-gpu", "--ignore-certificate-errors", "--silent"); 25 | options.addArguments("window-size=600,400"); 26 | 27 | 28 | WebDriver driver = new ChromeDriver(options); 29 | JavascriptExecutor js = 
(JavascriptExecutor) driver; 30 | int pageNumber = 5 ; 31 | 32 | driver.get("https://www.javawebscrapingsandbox.com/product/infinite_scroll"); 33 | for(int i = 0; i < pageNumber; i++){ 34 | js.executeScript("window.scrollTo(0, document.body.scrollHeight);"); 35 | Thread.sleep(1200); 36 | } 37 | List rows = driver.findElements(By.xpath("//tr")); 38 | 39 | // do something with the row list 40 | processLines(rows); 41 | 42 | driver.quit(); 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /user-agents.txt: -------------------------------------------------------------------------------- 1 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1 2 | Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1 3 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31 4 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36 5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36 6 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1 7 | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.7 Safari/532.2 8 | Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13 9 | Mozilla/5.0 (Windows NT 5.1; rv:5.0.1) Gecko/20100101 Firefox/5.0.1 10 | Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.02 11 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0 12 | Mozilla/5.0 (Windows NT 6.1; rv:2.0b7pre) Gecko/20100921 Firefox/4.0b7pre 13 | Mozilla/5.0 (X11; U; Linux x86; fr-fr) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3 AppleWebKit/532.4 Safari/532.4 14 | Mozilla/5.0 
(Windows; U; Windows NT 5.1; fr; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 15 | Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 GTB5 16 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1 17 | Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0) 18 | Opera/9.80 (Windows NT 6.2; Win64; x64) Presto/2.12.388 Version/12.15 19 | Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16 20 | Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14 -------------------------------------------------------------------------------- /src/chapter4/SearchForm.java: -------------------------------------------------------------------------------- 1 | package chapter4; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.util.logging.Level; 6 | 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 8 | import com.gargoylesoftware.htmlunit.WebClient; 9 | import com.gargoylesoftware.htmlunit.html.HtmlForm; 10 | import com.gargoylesoftware.htmlunit.html.HtmlInput; 11 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 12 | import com.gargoylesoftware.htmlunit.html.HtmlTable; 13 | import com.gargoylesoftware.htmlunit.html.HtmlTableRow; 14 | import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLTableElement; 15 | 16 | public class SearchForm { 17 | 18 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ; 19 | static final String MINPRICE = "300"; 20 | static final String MAXPRICE = "650" ; 21 | 22 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { 23 | WebClient client = new WebClient(); 24 | client.getOptions().setJavaScriptEnabled(true); 25 | client.getOptions().setCssEnabled(false); 26 | client.getOptions().setUseInsecureSSL(true); 27 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
28 | 29 | HtmlPage page = client.getPage(baseUrl + "product/search"); 30 | 31 | HtmlInput minPrice = page.getHtmlElementById("min_price"); 32 | HtmlInput maxPrice = page.getHtmlElementById("max_price"); 33 | 34 | // set the min/max values 35 | minPrice.setValueAttribute(MINPRICE); 36 | maxPrice.setValueAttribute(MAXPRICE); 37 | HtmlForm form = minPrice.getEnclosingForm(); 38 | 39 | page = client.getPage(form.getWebRequest(null)); 40 | 41 | HtmlTable table = page.getFirstByXPath("//table"); 42 | for(HtmlTableRow elem : table.getBodies().get(0).getRows()){ 43 | System.out.println(String.format("Name : %s Price: %s", elem.getCell(0).asText(), elem.getCell(2).asText())); 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/chapter7/Chapter7.java: -------------------------------------------------------------------------------- 1 | package chapter7; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.io.IOException; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Random; 9 | 10 | import com.gargoylesoftware.htmlunit.ProxyConfig; 11 | import com.gargoylesoftware.htmlunit.WebClient; 12 | 13 | public class Chapter7 { 14 | 15 | 16 | private static final String FILENAME = "user-agents.txt"; 17 | 18 | public static WebClient initWebClientWithHeaders(){ 19 | WebClient client = new WebClient(); 20 | client.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); 21 | client.addRequestHeader("Accept-Encoding", "gzip, deflate, br"); 22 | client.addRequestHeader("Accept-Language", "en-US,en;q=0.9,fr-FR;q=0.8,fr;q=0.7,la;q=0.6"); 23 | client.addRequestHeader("Connection", "keep-alive"); 24 | client.addRequestHeader("Host", "ksah.in"); 25 | client.addRequestHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 
Safari/537.36"); 26 | client.addRequestHeader("Pragma", "no-cache"); 27 | 28 | return client ; 29 | } 30 | 31 | private static String getRandomUseragent(){ 32 | List userAgents = new ArrayList(); 33 | Random rand = new Random(); 34 | try (BufferedReader br = new BufferedReader(new FileReader(FILENAME))) { 35 | String sCurrentLine; 36 | while ((sCurrentLine = br.readLine()) != null) { 37 | userAgents.add(sCurrentLine); 38 | } 39 | 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | } 43 | 44 | return userAgents.get(rand.nextInt(userAgents.size())); 45 | } 46 | public static void main(String[] args) { 47 | WebClient client = new WebClient() ; 48 | ProxyConfig proxyConfig = new ProxyConfig("host", 12345); 49 | client.getOptions().setProxyConfig(proxyConfig); 50 | System.out.println(getRandomUseragent()); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/chapter3/HNScraper.java: -------------------------------------------------------------------------------- 1 | package chapter3; 2 | 3 | import java.util.List; 4 | 5 | import com.fasterxml.jackson.databind.ObjectMapper; 6 | import com.gargoylesoftware.htmlunit.WebClient; 7 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor; 8 | import com.gargoylesoftware.htmlunit.html.HtmlElement; 9 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 10 | 11 | public class HNScraper { 12 | 13 | public static void main(String[] args) { 14 | String baseUrl = "https://news.ycombinator.com/" ; 15 | WebClient client = new WebClient(); 16 | client.getOptions().setCssEnabled(false); 17 | client.getOptions().setJavaScriptEnabled(false); 18 | try{ 19 | HtmlPage page = client.getPage(baseUrl); 20 | List itemList = page.getByXPath("//tr[@class='athing']"); 21 | if(itemList.isEmpty()){ 22 | System.out.println("No item found"); 23 | }else{ 24 | for(HtmlElement htmlItem : itemList){ 25 | int position = Integer.parseInt(((HtmlElement) 
htmlItem.getFirstByXPath("./td/span")).asText().replace(".", "")); 26 | int id = Integer.parseInt(htmlItem.getAttribute("id")); 27 | String title = ((HtmlElement) htmlItem.getFirstByXPath("./td[not(@valign='top')][@class='title']")).asText(); 28 | String url = ((HtmlAnchor) htmlItem.getFirstByXPath("./td[not(@valign='top')][@class='title']/a")).getHrefAttribute(); 29 | String author = ((HtmlElement) htmlItem.getFirstByXPath("./following-sibling::tr/td[@class='subtext']/a[@class='hnuser']")).asText(); 30 | int score = Integer.parseInt(((HtmlElement) htmlItem.getFirstByXPath("./following-sibling::tr/td[@class='subtext']/span[@class='score']")).asText().replace(" points", "")); 31 | 32 | HackerNewsItem hnItem = new HackerNewsItem(title, url, author, score, position, id); 33 | 34 | ObjectMapper mapper = new ObjectMapper(); 35 | String jsonString = mapper.writeValueAsString(hnItem) ; 36 | 37 | System.out.println(jsonString); 38 | } 39 | } 40 | }catch(Exception e){ 41 | e.printStackTrace(); 42 | }finally{ 43 | client.close(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/chapter4/Authentication.java: -------------------------------------------------------------------------------- 1 | package chapter4; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.util.logging.Level; 6 | 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 8 | import com.gargoylesoftware.htmlunit.WebClient; 9 | import com.gargoylesoftware.htmlunit.html.HtmlForm; 10 | import com.gargoylesoftware.htmlunit.html.HtmlInput; 11 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 12 | 13 | public class Authentication { 14 | 15 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ; 16 | static final String loginUrl = "account/login" ; 17 | static final String email = "test@test.com" ; 18 | static final String password = "test" ; 19 | 20 | 21 | public static 
void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException { 22 | WebClient client = new WebClient(); 23 | client.getOptions().setJavaScriptEnabled(true); 24 | client.getOptions().setCssEnabled(false); 25 | client.getOptions().setUseInsecureSSL(true); 26 | // Turn off the logger 27 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 28 | 29 | // Get the login page 30 | HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl)) ; 31 | 32 | // Select the email input 33 | HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']"); 34 | 35 | // Select the password input 36 | HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']"); 37 | 38 | // Set the value for both inputs 39 | inputEmail.setValueAttribute(email); 40 | inputPassword.setValueAttribute(password); 41 | 42 | // Select the form 43 | HtmlForm loginForm = inputPassword.getEnclosingForm() ; 44 | 45 | // Generate the POST request with the form 46 | page = client.getPage(loginForm.getWebRequest(null)); 47 | 48 | if(!page.asText().contains("You are now logged in")){ 49 | System.err.println("Error: Authentication failed"); 50 | }else{ 51 | System.out.println("Success ! 
Logged in"); 52 | } 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/chapter6/PDFScraping.java: -------------------------------------------------------------------------------- 1 | package chapter6; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | import java.util.logging.Level; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | import org.apache.commons.io.IOUtils; 11 | import org.apache.pdfbox.pdmodel.PDDocument; 12 | import org.apache.pdfbox.text.PDFTextStripper; 13 | import org.apache.pdfbox.text.PDFTextStripperByArea; 14 | 15 | import com.gargoylesoftware.htmlunit.Page; 16 | import com.gargoylesoftware.htmlunit.WebClient; 17 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor; 18 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 19 | 20 | public class PDFScraping { 21 | 22 | public static void main(String[] args) throws IOException { 23 | 24 | WebClient client = new WebClient(); 25 | client.getOptions().setJavaScriptEnabled(true); 26 | client.getOptions().setCssEnabled(false); 27 | client.getOptions().setUseInsecureSSL(true); 28 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 29 | 30 | HtmlPage html = client.getPage("https://www.javawebscrapingsandbox.com/pdf"); 31 | 32 | // selects the first anchor which contains "pdf" 33 | HtmlAnchor anchor = html.getFirstByXPath("//a[contains(@href, 'pdf')]"); 34 | String pdfUrl = anchor.getHrefAttribute(); 35 | 36 | Page pdf = client.getPage(pdfUrl); 37 | 38 | if(pdf.getWebResponse().getContentType().equals("application/pdf")){ 39 | System.out.println("Pdf downloaded"); 40 | IOUtils.copy(pdf.getWebResponse().getContentAsStream(), 41 | new FileOutputStream("invoice.pdf")); 42 | System.out.println("Pdf file created"); 43 | PDDocument document = null; 44 | try{ 45 | document = PDDocument.load(new File("invoice.pdf")) ; 46 | 47 | 
PDFTextStripperByArea stripper = new PDFTextStripperByArea(); 48 | stripper.setSortByPosition(true); 49 | 50 | PDFTextStripper tStripper = new PDFTextStripper(); 51 | 52 | String stringPdf = tStripper.getText(document); 53 | String lines[] = stringPdf.split("\\n"); 54 | String pattern = "Total\\s+€\\s+(.+)"; 55 | Pattern p = Pattern.compile(pattern); 56 | String price = ""; 57 | for (String line : lines) { 58 | Matcher m = p.matcher(line); 59 | if(m.find()){ 60 | price = m.group(1); 61 | } 62 | } 63 | 64 | if(!price.isEmpty()){ 65 | System.out.println("Price found: " + price); 66 | }else{ 67 | System.out.println("Price not found"); 68 | } 69 | }finally{ 70 | if(document != null){ 71 | document.close(); 72 | } 73 | } 74 | 75 | } 76 | 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/chapter4/AuthenticationPostRequest.java: -------------------------------------------------------------------------------- 1 | package chapter4; 2 | 3 | import java.io.IOException; 4 | import java.net.MalformedURLException; 5 | import java.net.URL; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.logging.Level; 9 | 10 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; 11 | import com.gargoylesoftware.htmlunit.FormEncodingType; 12 | import com.gargoylesoftware.htmlunit.HttpMethod; 13 | import com.gargoylesoftware.htmlunit.WebClient; 14 | import com.gargoylesoftware.htmlunit.WebRequest; 15 | import com.gargoylesoftware.htmlunit.html.HtmlInput; 16 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 17 | import com.gargoylesoftware.htmlunit.util.NameValuePair; 18 | 19 | public class AuthenticationPostRequest { 20 | 21 | 22 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ; 23 | //static final String baseUrl = "http://localhost:8000/" ; 24 | static final String loginUrl = "account/login" ; 25 | static final String email = "test@test.com" ; 26 | static final 
String password = "test" ;

    /**
     * Logs in by building the login POST request by hand instead of submitting the HTML form.
     *
     * <p>The login page is fetched first so that its {@code csrf_token} hidden input can be
     * read; the token is then sent back together with the e-mail and password.
     *
     * @param args unused
     * @throws FailingHttpStatusCodeException propagated from {@code WebClient.getPage}
     * @throws MalformedURLException if the login URL cannot be parsed
     * @throws IOException propagated from {@code WebClient.getPage}
     * @throws IllegalStateException if the login page has no {@code csrf_token} input
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        // Turn off the logger
        java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);

        // try-with-resources so the client is closed on every exit path.
        try (WebClient client = new WebClient()) {
            client.getOptions().setJavaScriptEnabled(true);
            client.getOptions().setCssEnabled(false);
            client.getOptions().setUseInsecureSSL(true);

            // Get the login page; it carries the CSRF token that must accompany the POST.
            final String loginPageUrl = String.format("%s%s", baseUrl, loginUrl);
            HtmlPage page = client.getPage(loginPageUrl);

            HtmlInput csrfToken = page.getFirstByXPath("//form//input[@name='csrf_token']");
            if (csrfToken == null) {
                throw new IllegalStateException("No csrf_token input found on " + loginPageUrl);
            }

            // Post to the same URL the form was loaded from rather than to a second,
            // hard-coded copy of it, so both requests always use the same scheme and host.
            WebRequest request = new WebRequest(new URL(loginPageUrl), HttpMethod.POST);

            List<NameValuePair> params = new ArrayList<>();
            params.add(new NameValuePair("csrf_token", csrfToken.getValueAttribute()));
            params.add(new NameValuePair("email", email));
            params.add(new NameValuePair("password", password));

            request.setRequestParameters(params);
            request.setAdditionalHeader("Content-Type", "application/x-www-form-urlencoded");
            request.setAdditionalHeader("Accept-Encoding", "gzip, deflate");

            page = client.getPage(request);

            if (page.asText().contains("You are now logged in")) {
                System.out.println("Success ! Logged in");
            } else {
                System.err.println("Error: Authentication failed");
            }
        }
    }

}

--------------------------------------------------------------------------------
/src/chapter4/FileUploader.java:
--------------------------------------------------------------------------------
package chapter4;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.logging.Level;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlFileInput;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Logs in through the HTML login form and then uploads {@link #fileName} through the
 * form found on the {@code upload_file} page.
 */
public class FileUploader {

    static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
    static final String loginUrl = "account/login" ;
    static final String email = "test@test.com" ;
    static final String password = "test" ;
    static final String fileName = "file.png" ;

    /**
     * Runs the login followed by the upload and prints the outcome of each step.
     *
     * <p>If the login is rejected the method reports it and returns without attempting
     * the upload.
     *
     * @param args unused
     * @throws FailingHttpStatusCodeException propagated from {@code WebClient.getPage}
     * @throws MalformedURLException propagated from {@code WebClient.getPage}
     * @throws IOException propagated from {@code WebClient.getPage} / {@code click()}
     * @throws InterruptedException kept for signature compatibility; not thrown by this body
     * @throws IllegalStateException if an expected form, input or button is not on the page
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException {
        // Turn off the logger
        java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);

        // try-with-resources so the client is closed on every exit path.
        try (WebClient client = new WebClient()) {
            client.getOptions().setJavaScriptEnabled(true);
            client.getOptions().setCssEnabled(false);
            client.getOptions().setUseInsecureSSL(true);

            // Get the login page
            HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl));

            // Select the email and password inputs; getFirstByXPath yields null when nothing matches.
            HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']");
            HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']");
            if (inputEmail == null || inputPassword == null) {
                throw new IllegalStateException("Login form inputs 'email'/'password' not found");
            }

            // Set the value for both inputs
            inputEmail.setValueAttribute(email);
            inputPassword.setValueAttribute(password);

            // Select the form and generate the POST request from it
            HtmlForm loginForm = inputPassword.getEnclosingForm();
            page = client.getPage(loginForm.getWebRequest(null));

            if (!page.asText().contains("You are now logged in")) {
                System.err.println("Error: Authentication failed");
                // Without a session there is nothing to upload to; stop here.
                return;
            }
            System.out.println("Success ! Logged in");

            page = client.getPage(baseUrl + "upload_file");
            HtmlForm uploadFileForm = page.getFirstByXPath("//form[@action='/upload_file']");
            if (uploadFileForm == null) {
                throw new IllegalStateException("Upload form not found on " + baseUrl + "upload_file");
            }
            HtmlFileInput fileInput = uploadFileForm.getInputByName("user_file");

            fileInput.setValueAttribute(fileName);
            fileInput.setContentType("image/png");

            HtmlElement button = page.getFirstByXPath("//button");
            if (button == null) {
                throw new IllegalStateException("Submit button not found on " + baseUrl + "upload_file");
            }
            page = button.click();

            if (page.asText().contains("Your file was successful uploaded")) {
                System.out.println("File successfully uploaded");
            } else {
                System.out.println("Error uploading the file");
            }
        }
    }

}

--------------------------------------------------------------------------------
/src/chapter6/ReCaptchaV2.java:
--------------------------------------------------------------------------------
package chapter6;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.logging.Level;

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;

/**
 * Submits a reCAPTCHA-protected form: the page is driven with headless Chrome while the
 * captcha itself is handed to the 2captcha.com HTTP API, which is called through HtmlUnit.
 */
public class ReCaptchaV2 {

    public static final String API_KEY = "" ;

    // NOTE(review): the API key travels in the query string over plain http — confirm
    // whether the https endpoint should be used instead.
    private static final String API_BASE_URL = "http://2captcha.com/" ;
    private static final String BASE_URL = "https://www.javawebscrapingsandbox.com/captcha";

    /** Pause between two polls of the solving job. */
    private static final long POLL_INTERVAL_MS = 3000;

    /** Upper bound on polls so an abandoned job cannot block forever (60 x 3 s = 3 min). */
    private static final int MAX_POLL_ATTEMPTS = 60;

    /**
     * Loads the captcha page, asks 2captcha for a token for its site key, injects the
     * token into the {@code g-recaptcha-response} textarea and submits the form.
     *
     * @param args unused
     * @throws Exception on any browser, HTTP or API failure
     */
    public static void main(String[] args) throws Exception {
        java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);

        final String chromeDriverPath = "/usr/local/bin/chromedriver" ;
        System.setProperty("webdriver.chrome.driver", chromeDriverPath);
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors", "--silent");
        options.addArguments("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36");

        WebDriver driver = new ChromeDriver(options);
        try (WebClient client = new WebClient()) {
            client.getOptions().setJavaScriptEnabled(false);
            client.getOptions().setCssEnabled(false);
            client.getOptions().setUseInsecureSSL(true);

            driver.get(BASE_URL);

            // findElement throws if the div is absent; a present div may still lack the attribute.
            WebElement elem = driver.findElement(By.xpath("//div[@class='g-recaptcha']"));
            String siteId = elem.getAttribute("data-sitekey");
            if (siteId == null || siteId.isEmpty()) {
                // Fail fast: submitting a job with an empty site key can only be rejected.
                throw new IllegalStateException("Captcha's div is missing attribute data-sitekey");
            }

            // pageurl is itself a URL, so it is percent-encoded before being embedded.
            String query = String.format("%sin.php?key=%s&method=userrecaptcha&googlekey=%s&pageurl=%s&here=now",
                    API_BASE_URL, API_KEY, siteId, URLEncoder.encode(BASE_URL, "UTF-8"));
            Page response = client.getPage(query);
            String jobId = extractPayload(response.getWebResponse().getContentAsString());

            String captchaToken = waitForToken(client, jobId);

            // The response textarea is hidden; show it just long enough to type the token in.
            JavascriptExecutor js = (JavascriptExecutor) driver ;
            js.executeScript("document.getElementById('g-recaptcha-response').style.display = 'block';");
            WebElement textarea = driver.findElement(By.xpath("//textarea[@id='g-recaptcha-response']"));
            textarea.sendKeys(captchaToken);
            js.executeScript("document.getElementById('g-recaptcha-response').style.display = 'none';");

            driver.findElement(By.id("name")).sendKeys("Kevin");
            driver.findElement(By.id("submit")).click();

            if (driver.getPageSource().contains("your captcha was successfully submitted")) {
                System.out.println("Captcha successfuly submitted !");
            } else {
                System.out.println("Error while submitting captcha");
            }

            System.out.println();
        } finally {
            // Always end the browser session, otherwise the headless Chrome process is left running.
            driver.quit();
        }
    }

    /**
     * Polls the 2captcha result endpoint until the job is no longer reported as pending.
     *
     * @param client HtmlUnit client used for the HTTP calls
     * @param jobId  id returned when the job was submitted
     * @return the solved captcha token
     * @throws IOException propagated from {@code WebClient.getPage}
     * @throws InterruptedException if the wait between polls is interrupted
     * @throws IllegalStateException if the API reports an error or the job is still pending
     *         after {@link #MAX_POLL_ATTEMPTS} polls
     */
    private static String waitForToken(WebClient client, String jobId) throws IOException, InterruptedException {
        String pollUrl = String.format("%sres.php?key=%s&action=get&id=%s", API_BASE_URL, API_KEY, jobId);
        for (int attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) {
            Page response = client.getPage(pollUrl);
            String body = response.getWebResponse().getContentAsString();
            // "CAPCHA_NOT_READY" (sic) is the literal this code has always matched; do not "fix" it.
            if (!body.contains("CAPCHA_NOT_READY")) {
                // Anything else is either "OK|<token>" or an error; extractPayload tells them apart.
                String token = extractPayload(body);
                System.out.println("Captcha solved !");
                return token;
            }
            System.out.println("Waiting for 2Captcha.com ...");
            Thread.sleep(POLL_INTERVAL_MS);
        }
        throw new IllegalStateException("2captcha.com did not solve job " + jobId + " after "
                + MAX_POLL_ATTEMPTS + " attempts");
    }

    /**
     * Extracts the payload from an {@code OK|<payload>} API reply.
     *
     * @param apiResponse raw response body
     * @return the part after the first {@code |}
     * @throws IllegalStateException if the reply is not of the form {@code OK|<payload>}
     */
    private static String extractPayload(String apiResponse) {
        String[] parts = apiResponse.trim().split("\\|", 2);
        if (parts.length != 2 || !"OK".equals(parts[0])) {
            throw new IllegalStateException("Error with 2captcha.com API, received : " + apiResponse);
        }
        return parts[1];
    }

}

--------------------------------------------------------------------------------