├── .gitignore
├── file.png
├── invoice.pdf
├── ocr_exemple.jpg
├── screenshot.png
├── README.md
├── src
├── chapter4
│ ├── BasicAuthentication.java
│ ├── SearchForm.java
│ ├── Authentication.java
│ ├── AuthenticationPostRequest.java
│ └── FileUploader.java
├── chapter5
│ ├── ChromeHeadless.java
│ ├── DirectApiCalls.java
│ ├── ExecuteJavascriptFunction.java
│ └── InfiniteScrollHeadlessChrome.java
├── chapter3
│ ├── HackerNewsItem.java
│ └── HNScraper.java
├── chapter6
│ ├── OpticalCharacterRecognition.java
│ ├── PDFScraping.java
│ └── ReCaptchaV2.java
└── chapter7
│ └── Chapter7.java
├── pom.xml
└── user-agents.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.classpath
2 | *.project
3 | *.settings
4 | bin/
5 | target/
--------------------------------------------------------------------------------
/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/file.png
--------------------------------------------------------------------------------
/invoice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/invoice.pdf
--------------------------------------------------------------------------------
/ocr_exemple.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/ocr_exemple.jpg
--------------------------------------------------------------------------------
/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/screenshot.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # javawebscrapinghandbook_code
2 |
3 | These are the code examples for my ebook: https://www.javawebscrapinghandbook.com
4 |
--------------------------------------------------------------------------------
/src/chapter4/BasicAuthentication.java:
--------------------------------------------------------------------------------
1 | package chapter4;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.logging.Level;
6 |
7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
8 | import com.gargoylesoftware.htmlunit.WebClient;
9 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
10 |
11 | public class BasicAuthentication {
12 |
13 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
14 | static final String username = "basic" ;
15 | static final String password = "auth" ;
16 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
17 | WebClient client = new WebClient();
18 | client.getOptions().setJavaScriptEnabled(true);
19 | client.getOptions().setCssEnabled(false);
20 | client.getOptions().setUseInsecureSSL(true);
21 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
22 |
23 | HtmlPage page = client.getPage(String.format("https://%s:%s@www.javawebscrapingsandbox.com/basic_auth", username, password));
24 | System.out.println(page.asText());
25 |
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/src/chapter5/ChromeHeadless.java:
--------------------------------------------------------------------------------
1 | package chapter5;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 |
6 | import org.apache.commons.io.FileUtils;
7 | import org.openqa.selenium.OutputType;
8 | import org.openqa.selenium.TakesScreenshot;
9 | import org.openqa.selenium.WebDriver;
10 | import org.openqa.selenium.chrome.ChromeDriver;
11 | import org.openqa.selenium.chrome.ChromeOptions;
12 |
13 | public class ChromeHeadless {
14 | public static void main(String[] args) throws IOException, InterruptedException{
15 | String chromeDriverPath = "/Users/kevin/Downloads/chromedriver" ;
16 | System.setProperty("webdriver.chrome.driver", chromeDriverPath);
17 | ChromeOptions options = new ChromeOptions();
18 | options.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors", "--silent");
19 | WebDriver driver = new ChromeDriver(options);
20 |
21 | // Get the login page
22 | driver.get("https://pro.coinbase.com/trade/BTC-USD");
23 | Thread.sleep(10000);
24 |
25 |
26 | // Take a screenshot of the current page
27 | File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
28 | FileUtils.copyFile(screenshot, new File("screenshot.png"));
29 | driver.close();
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/chapter3/HackerNewsItem.java:
--------------------------------------------------------------------------------
1 | package chapter3;
2 |
/**
 * Mutable value holder for one scraped Hacker News entry: its title, link,
 * author, score, position in the listing and numeric id.
 */
public class HackerNewsItem {

    private String title;
    private String url;
    private String author;
    private int score;
    private int position;
    private int id;

    /**
     * Creates an item with every property set.
     *
     * @param title    title of the entry
     * @param url      link of the entry
     * @param author   author of the entry
     * @param score    score of the entry
     * @param position position of the entry
     * @param id       id of the entry
     */
    public HackerNewsItem(String title, String url, String author, int score, int position, int id) {
        this.title = title;
        this.url = url;
        this.author = author;
        this.score = score;
        this.position = position;
        this.id = id;
    }

    /** @return the id of the entry */
    public int getId() {
        return this.id;
    }

    /** @param id the new id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the position of the entry */
    public int getPosition() {
        return this.position;
    }

    /** @param position the new position */
    public void setPosition(int position) {
        this.position = position;
    }

    /** @return the title of the entry */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link of the entry */
    public String getUrl() {
        return this.url;
    }

    /** @param url the new link */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the author of the entry */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the new author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the score of the entry */
    public int getScore() {
        return this.score;
    }

    /** @param score the new score */
    public void setScore(int score) {
        this.score = score;
    }

}
60 |
--------------------------------------------------------------------------------
/src/chapter6/OpticalCharacterRecognition.java:
--------------------------------------------------------------------------------
1 | package chapter6;
2 |
3 | import org.bytedeco.javacpp.*;
4 | import org.bytedeco.javacpp.BytePointer;
5 | import org.bytedeco.javacpp.lept.*;
6 |
7 | import org.bytedeco.javacpp.tesseract.TessBaseAPI;
8 |
9 | public class OpticalCharacterRecognition {
10 |
11 | final static String TESS_DATA_PATH = "/usr/local/Cellar/tesseract/3.05.02/share/tessdata" ;
12 |
13 | public static void main(String[] args) {
14 | BytePointer outText;
15 | TessBaseAPI api = new TessBaseAPI();
16 |
17 | if (api.Init(TESS_DATA_PATH, "ENG") != 0) {
18 | System.err.println("Could not initialize tesseract.");
19 | System.exit(1);
20 | }
21 | //api.SetVariable("tessedit_char_whitelist", "0123456789,");
22 | PIX image = lept.pixRead("ocr_exemple.jpg");
23 | api.SetImage(image);
24 |
25 | // Get OCR result
26 | outText = api.GetUTF8Text();
27 | String string = outText.getString();
28 | String invoiceNumber = "" ;
29 | for(String lines : string.split("\\n")){
30 | if(lines.contains("Invoice")){
31 | invoiceNumber = lines.split("Invoice Number: ")[1];
32 | System.out.println(String.format("Invoice number found : %s", invoiceNumber));
33 | }
34 | }
35 |
36 | // Destroy used object and release memory
37 | api.End();
38 | outText.deallocate();
39 | lept.pixDestroy(image);
40 |
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | ebook_webscraping
4 | ebook_webscraping
5 | 0.0.1-SNAPSHOT
6 |
7 |
8 | src
9 |
10 |
11 | maven-compiler-plugin
12 | 3.3
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 | net.sourceforge.htmlunit
25 | htmlunit
26 | 2.28
27 |
28 |
29 | com.fasterxml.jackson.core
30 | jackson-databind
31 | 2.7.0
32 |
33 |
34 |
35 | org.seleniumhq.selenium
36 | selenium-java
37 | 3.8.1
38 |
39 |
40 |
41 | org.apache.pdfbox
42 | pdfbox
43 | 2.0.4
44 |
45 |
46 |
47 | org.bytedeco.javacpp-presets
48 | tesseract-platform
49 | 3.05.01-1.4.1
50 |
51 |
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/src/chapter5/DirectApiCalls.java:
--------------------------------------------------------------------------------
1 | package chapter5;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.Iterator;
6 | import java.util.logging.Level;
7 |
8 | import com.fasterxml.jackson.core.JsonProcessingException;
9 | import com.fasterxml.jackson.databind.JsonNode;
10 | import com.fasterxml.jackson.databind.ObjectMapper;
11 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
12 | import com.gargoylesoftware.htmlunit.Page;
13 | import com.gargoylesoftware.htmlunit.WebClient;
14 |
15 | public class DirectApiCalls {
16 |
17 | public static void parseJson(String jsonString) throws JsonProcessingException, IOException{
18 | ObjectMapper mapper = new ObjectMapper();
19 | JsonNode rootNode = mapper.readTree(jsonString);
20 | Iterator elements = rootNode.elements();
21 | while(elements.hasNext()){
22 | JsonNode node = elements.next();
23 | Long id = node.get("id").asLong();
24 | String name = node.get("name").asText();
25 | String price = node.get("price").asText();
26 | System.out.println(String.format("Id: %s - Name: %s - Price: %s", id, name, price));
27 | }
28 | }
29 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
30 | WebClient client = new WebClient();
31 | client.getOptions().setJavaScriptEnabled(false);
32 | client.getOptions().setCssEnabled(false);
33 | client.getOptions().setUseInsecureSSL(true);
34 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
35 |
36 | for(int i = 1; i < 5; i++){
37 | Page json = client.getPage("https://www.javawebscrapingsandbox.com/product/api/" + i );
38 | parseJson(json.getWebResponse().getContentAsString());
39 | }
40 |
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/chapter5/ExecuteJavascriptFunction.java:
--------------------------------------------------------------------------------
1 | package chapter5;
2 |
3 | import java.util.List;
4 |
5 | import org.openqa.selenium.By;
6 | import org.openqa.selenium.JavascriptExecutor;
7 | import org.openqa.selenium.WebDriver;
8 | import org.openqa.selenium.WebElement;
9 | import org.openqa.selenium.chrome.ChromeDriver;
10 | import org.openqa.selenium.chrome.ChromeOptions;
11 |
12 | public class ExecuteJavascriptFunction {
13 |
14 | public static void processLines(List lines){
15 | int size = lines.size();
16 | System.out.println(String.format("There are %s product rows in the table", size));
17 | }
18 | public static void main(String[] args) throws InterruptedException {
19 | String chromeDriverPath = "/Users/kevin/.nvm/versions/node/v10.4.0/lib/node_modules/chromedriver/lib/chromedriver/chromedriver" ;
20 | System.setProperty("webdriver.chrome.driver", chromeDriverPath);
21 | ChromeOptions options = new ChromeOptions();
22 | options.addArguments("--headless" ,"--disable-gpu", "--ignore-certificate-errors", "--silent");
23 | options.addArguments("window-size=600,400");
24 |
25 | WebDriver driver = new ChromeDriver(options);
26 | JavascriptExecutor js = (JavascriptExecutor) driver;
27 | int pageNumber = 5 ;
28 |
29 | driver.get("https://www.javawebscrapingsandbox.com/product/infinite_scroll");
30 | for(int i = 3; i < pageNumber + 3; i++){
31 | js.executeScript("drawNextLines('/product/api/" + i +"');");
32 | while((Boolean)js.executeScript("return win.data('ajaxready');") == false){
33 | Thread.sleep(100);
34 | }
35 | }
36 | List rows = driver.findElements(By.xpath("//tr"));
37 |
38 | // do something with the row list
39 | processLines(rows);
40 |
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/chapter5/InfiniteScrollHeadlessChrome.java:
--------------------------------------------------------------------------------
1 | package chapter5;
2 |
3 | import java.util.List;
4 |
5 | import org.openqa.selenium.By;
6 | import org.openqa.selenium.JavascriptExecutor;
7 | import org.openqa.selenium.WebDriver;
8 | import org.openqa.selenium.WebElement;
9 | import org.openqa.selenium.chrome.ChromeDriver;
10 | import org.openqa.selenium.chrome.ChromeOptions;
11 |
12 | public class InfiniteScrollHeadlessChrome {
13 |
14 |
15 | static final String URL = "";
16 | public static void processLines(List lines){
17 | int size = lines.size();
18 | System.out.println(String.format("There are %s product rows in the table", size));
19 | }
20 | public static void main(String[] args) throws InterruptedException {
21 | String chromeDriverPath = "/Users/kevin/.nvm/versions/node/v10.4.0/lib/node_modules/chromedriver/lib/chromedriver/chromedriver" ;
22 | System.setProperty("webdriver.chrome.driver", chromeDriverPath);
23 | ChromeOptions options = new ChromeOptions();
24 | options.addArguments("--headless" ,"--disable-gpu", "--ignore-certificate-errors", "--silent");
25 | options.addArguments("window-size=600,400");
26 |
27 |
28 | WebDriver driver = new ChromeDriver(options);
29 | JavascriptExecutor js = (JavascriptExecutor) driver;
30 | int pageNumber = 5 ;
31 |
32 | driver.get("https://www.javawebscrapingsandbox.com/product/infinite_scroll");
33 | for(int i = 0; i < pageNumber; i++){
34 | js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
35 | Thread.sleep(1200);
36 | }
37 | List rows = driver.findElements(By.xpath("//tr"));
38 |
39 | // do something with the row list
40 | processLines(rows);
41 |
42 | driver.quit();
43 |
44 | }
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/user-agents.txt:
--------------------------------------------------------------------------------
1 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1
2 | Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1
3 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31
4 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36
5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36
6 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
7 | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.7 Safari/532.2
8 | Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13
9 | Mozilla/5.0 (Windows NT 5.1; rv:5.0.1) Gecko/20100101 Firefox/5.0.1
10 | Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.02
11 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0
12 | Mozilla/5.0 (Windows NT 6.1; rv:2.0b7pre) Gecko/20100921 Firefox/4.0b7pre
13 | Mozilla/5.0 (X11; U; Linux x86; fr-fr) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3 AppleWebKit/532.4 Safari/532.4
14 | Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11
15 | Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 GTB5
16 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1
17 | Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)
18 | Opera/9.80 (Windows NT 6.2; Win64; x64) Presto/2.12.388 Version/12.15
19 | Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16
20 | Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14
--------------------------------------------------------------------------------
/src/chapter4/SearchForm.java:
--------------------------------------------------------------------------------
1 | package chapter4;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.logging.Level;
6 |
7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
8 | import com.gargoylesoftware.htmlunit.WebClient;
9 | import com.gargoylesoftware.htmlunit.html.HtmlForm;
10 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
11 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
12 | import com.gargoylesoftware.htmlunit.html.HtmlTable;
13 | import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
14 | import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLTableElement;
15 |
16 | public class SearchForm {
17 |
18 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
19 | static final String MINPRICE = "300";
20 | static final String MAXPRICE = "650" ;
21 |
22 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
23 | WebClient client = new WebClient();
24 | client.getOptions().setJavaScriptEnabled(true);
25 | client.getOptions().setCssEnabled(false);
26 | client.getOptions().setUseInsecureSSL(true);
27 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
28 |
29 | HtmlPage page = client.getPage(baseUrl + "product/search");
30 |
31 | HtmlInput minPrice = page.getHtmlElementById("min_price");
32 | HtmlInput maxPrice = page.getHtmlElementById("max_price");
33 |
34 | // set the min/max values
35 | minPrice.setValueAttribute(MINPRICE);
36 | maxPrice.setValueAttribute(MAXPRICE);
37 | HtmlForm form = minPrice.getEnclosingForm();
38 |
39 | page = client.getPage(form.getWebRequest(null));
40 |
41 | HtmlTable table = page.getFirstByXPath("//table");
42 | for(HtmlTableRow elem : table.getBodies().get(0).getRows()){
43 | System.out.println(String.format("Name : %s Price: %s", elem.getCell(0).asText(), elem.getCell(2).asText()));
44 | }
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/src/chapter7/Chapter7.java:
--------------------------------------------------------------------------------
1 | package chapter7;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.io.IOException;
6 | import java.util.ArrayList;
7 | import java.util.List;
8 | import java.util.Random;
9 |
10 | import com.gargoylesoftware.htmlunit.ProxyConfig;
11 | import com.gargoylesoftware.htmlunit.WebClient;
12 |
13 | public class Chapter7 {
14 |
15 |
16 | private static final String FILENAME = "user-agents.txt";
17 |
18 | public static WebClient initWebClientWithHeaders(){
19 | WebClient client = new WebClient();
20 | client.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
21 | client.addRequestHeader("Accept-Encoding", "gzip, deflate, br");
22 | client.addRequestHeader("Accept-Language", "en-US,en;q=0.9,fr-FR;q=0.8,fr;q=0.7,la;q=0.6");
23 | client.addRequestHeader("Connection", "keep-alive");
24 | client.addRequestHeader("Host", "ksah.in");
25 | client.addRequestHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
26 | client.addRequestHeader("Pragma", "no-cache");
27 |
28 | return client ;
29 | }
30 |
31 | private static String getRandomUseragent(){
32 | List userAgents = new ArrayList();
33 | Random rand = new Random();
34 | try (BufferedReader br = new BufferedReader(new FileReader(FILENAME))) {
35 | String sCurrentLine;
36 | while ((sCurrentLine = br.readLine()) != null) {
37 | userAgents.add(sCurrentLine);
38 | }
39 |
40 | } catch (IOException e) {
41 | e.printStackTrace();
42 | }
43 |
44 | return userAgents.get(rand.nextInt(userAgents.size()));
45 | }
46 | public static void main(String[] args) {
47 | WebClient client = new WebClient() ;
48 | ProxyConfig proxyConfig = new ProxyConfig("host", 12345);
49 | client.getOptions().setProxyConfig(proxyConfig);
50 | System.out.println(getRandomUseragent());
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/src/chapter3/HNScraper.java:
--------------------------------------------------------------------------------
1 | package chapter3;
2 |
3 | import java.util.List;
4 |
5 | import com.fasterxml.jackson.databind.ObjectMapper;
6 | import com.gargoylesoftware.htmlunit.WebClient;
7 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
8 | import com.gargoylesoftware.htmlunit.html.HtmlElement;
9 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
10 |
11 | public class HNScraper {
12 |
13 | public static void main(String[] args) {
14 | String baseUrl = "https://news.ycombinator.com/" ;
15 | WebClient client = new WebClient();
16 | client.getOptions().setCssEnabled(false);
17 | client.getOptions().setJavaScriptEnabled(false);
18 | try{
19 | HtmlPage page = client.getPage(baseUrl);
20 | List itemList = page.getByXPath("//tr[@class='athing']");
21 | if(itemList.isEmpty()){
22 | System.out.println("No item found");
23 | }else{
24 | for(HtmlElement htmlItem : itemList){
25 | int position = Integer.parseInt(((HtmlElement) htmlItem.getFirstByXPath("./td/span")).asText().replace(".", ""));
26 | int id = Integer.parseInt(htmlItem.getAttribute("id"));
27 | String title = ((HtmlElement) htmlItem.getFirstByXPath("./td[not(@valign='top')][@class='title']")).asText();
28 | String url = ((HtmlAnchor) htmlItem.getFirstByXPath("./td[not(@valign='top')][@class='title']/a")).getHrefAttribute();
29 | String author = ((HtmlElement) htmlItem.getFirstByXPath("./following-sibling::tr/td[@class='subtext']/a[@class='hnuser']")).asText();
30 | int score = Integer.parseInt(((HtmlElement) htmlItem.getFirstByXPath("./following-sibling::tr/td[@class='subtext']/span[@class='score']")).asText().replace(" points", ""));
31 |
32 | HackerNewsItem hnItem = new HackerNewsItem(title, url, author, score, position, id);
33 |
34 | ObjectMapper mapper = new ObjectMapper();
35 | String jsonString = mapper.writeValueAsString(hnItem) ;
36 |
37 | System.out.println(jsonString);
38 | }
39 | }
40 | }catch(Exception e){
41 | e.printStackTrace();
42 | }finally{
43 | client.close();
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/chapter4/Authentication.java:
--------------------------------------------------------------------------------
1 | package chapter4;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.logging.Level;
6 |
7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
8 | import com.gargoylesoftware.htmlunit.WebClient;
9 | import com.gargoylesoftware.htmlunit.html.HtmlForm;
10 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
11 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
12 |
13 | public class Authentication {
14 |
15 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
16 | static final String loginUrl = "account/login" ;
17 | static final String email = "test@test.com" ;
18 | static final String password = "test" ;
19 |
20 |
21 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException {
22 | WebClient client = new WebClient();
23 | client.getOptions().setJavaScriptEnabled(true);
24 | client.getOptions().setCssEnabled(false);
25 | client.getOptions().setUseInsecureSSL(true);
26 | // Turn off the logger
27 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
28 |
29 | // Get the login page
30 | HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl)) ;
31 |
32 | // Select the email input
33 | HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']");
34 |
35 | // Select the password input
36 | HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']");
37 |
38 | // Set the value for both inputs
39 | inputEmail.setValueAttribute(email);
40 | inputPassword.setValueAttribute(password);
41 |
42 | // Select the form
43 | HtmlForm loginForm = inputPassword.getEnclosingForm() ;
44 |
45 | // Generate the POST request with the form
46 | page = client.getPage(loginForm.getWebRequest(null));
47 |
48 | if(!page.asText().contains("You are now logged in")){
49 | System.err.println("Error: Authentication failed");
50 | }else{
51 | System.out.println("Success ! Logged in");
52 | }
53 |
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/chapter6/PDFScraping.java:
--------------------------------------------------------------------------------
1 | package chapter6;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.io.IOException;
6 | import java.util.logging.Level;
7 | import java.util.regex.Matcher;
8 | import java.util.regex.Pattern;
9 |
10 | import org.apache.commons.io.IOUtils;
11 | import org.apache.pdfbox.pdmodel.PDDocument;
12 | import org.apache.pdfbox.text.PDFTextStripper;
13 | import org.apache.pdfbox.text.PDFTextStripperByArea;
14 |
15 | import com.gargoylesoftware.htmlunit.Page;
16 | import com.gargoylesoftware.htmlunit.WebClient;
17 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
18 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
19 |
20 | public class PDFScraping {
21 |
22 | public static void main(String[] args) throws IOException {
23 |
24 | WebClient client = new WebClient();
25 | client.getOptions().setJavaScriptEnabled(true);
26 | client.getOptions().setCssEnabled(false);
27 | client.getOptions().setUseInsecureSSL(true);
28 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
29 |
30 | HtmlPage html = client.getPage("https://www.javawebscrapingsandbox.com/pdf");
31 |
32 | // selects the first anchor which contains "pdf"
33 | HtmlAnchor anchor = html.getFirstByXPath("//a[contains(@href, 'pdf')]");
34 | String pdfUrl = anchor.getHrefAttribute();
35 |
36 | Page pdf = client.getPage(pdfUrl);
37 |
38 | if(pdf.getWebResponse().getContentType().equals("application/pdf")){
39 | System.out.println("Pdf downloaded");
40 | IOUtils.copy(pdf.getWebResponse().getContentAsStream(),
41 | new FileOutputStream("invoice.pdf"));
42 | System.out.println("Pdf file created");
43 | PDDocument document = null;
44 | try{
45 | document = PDDocument.load(new File("invoice.pdf")) ;
46 |
47 | PDFTextStripperByArea stripper = new PDFTextStripperByArea();
48 | stripper.setSortByPosition(true);
49 |
50 | PDFTextStripper tStripper = new PDFTextStripper();
51 |
52 | String stringPdf = tStripper.getText(document);
53 | String lines[] = stringPdf.split("\\n");
54 | String pattern = "Total\\s+€\\s+(.+)";
55 | Pattern p = Pattern.compile(pattern);
56 | String price = "";
57 | for (String line : lines) {
58 | Matcher m = p.matcher(line);
59 | if(m.find()){
60 | price = m.group(1);
61 | }
62 | }
63 |
64 | if(!price.isEmpty()){
65 | System.out.println("Price found: " + price);
66 | }else{
67 | System.out.println("Price not found");
68 | }
69 | }finally{
70 | if(document != null){
71 | document.close();
72 | }
73 | }
74 |
75 | }
76 |
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/chapter4/AuthenticationPostRequest.java:
--------------------------------------------------------------------------------
1 | package chapter4;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.net.URL;
6 | import java.util.ArrayList;
7 | import java.util.List;
8 | import java.util.logging.Level;
9 |
10 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
11 | import com.gargoylesoftware.htmlunit.FormEncodingType;
12 | import com.gargoylesoftware.htmlunit.HttpMethod;
13 | import com.gargoylesoftware.htmlunit.WebClient;
14 | import com.gargoylesoftware.htmlunit.WebRequest;
15 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
16 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
17 | import com.gargoylesoftware.htmlunit.util.NameValuePair;
18 |
19 | public class AuthenticationPostRequest {
20 |
21 |
22 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
23 | //static final String baseUrl = "http://localhost:8000/" ;
24 | static final String loginUrl = "account/login" ;
25 | static final String email = "test@test.com" ;
26 | static final String password = "test" ;
27 |
28 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
29 | WebClient client = new WebClient();
30 | client.getOptions().setJavaScriptEnabled(true);
31 | client.getOptions().setCssEnabled(false);
32 | client.getOptions().setUseInsecureSSL(true);
33 | // Turn off the logger
34 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
35 |
36 | // Get the login page
37 | HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl)) ;
38 |
39 | // Select the email input
40 | HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']");
41 |
42 | // Select the password input
43 | HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']");
44 |
45 | HtmlInput csrfToken = page.getFirstByXPath("//form//input[@name='csrf_token']") ;
46 | WebRequest request = new WebRequest(
47 | new URL("http://www.javawebscrapingsandbox.com/account/login"), HttpMethod.POST);
48 | List params = new ArrayList();
49 | params.add(new NameValuePair("csrf_token", csrfToken.getValueAttribute()));
50 | params.add(new NameValuePair("email", email));
51 | params.add(new NameValuePair("password", password));
52 |
53 | request.setRequestParameters(params);
54 | request.setAdditionalHeader("Content-Type", "application/x-www-form-urlencoded");
55 | request.setAdditionalHeader("Accept-Encoding", "gzip, deflate");
56 |
57 | page = client.getPage(request);
58 |
59 | if(!page.asText().contains("You are now logged in")){
60 | System.err.println("Error: Authentication failed");
61 | }else{
62 | System.out.println("Success ! Logged in");
63 | }
64 |
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/chapter4/FileUploader.java:
--------------------------------------------------------------------------------
1 | package chapter4;
2 |
3 | import java.io.IOException;
4 | import java.net.MalformedURLException;
5 | import java.util.logging.Level;
6 |
7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
8 | import com.gargoylesoftware.htmlunit.WebClient;
9 | import com.gargoylesoftware.htmlunit.html.HtmlElement;
10 | import com.gargoylesoftware.htmlunit.html.HtmlFileInput;
11 | import com.gargoylesoftware.htmlunit.html.HtmlForm;
12 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
13 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
14 |
15 | public class FileUploader {
16 |
17 | static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
18 | static final String loginUrl = "account/login" ;
19 | static final String email = "test@test.com" ;
20 | static final String password = "test" ;
21 | static final String fileName = "file.png" ;
22 |
23 | public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException {
24 | WebClient client = new WebClient();
25 | client.getOptions().setJavaScriptEnabled(true);
26 | client.getOptions().setCssEnabled(false);
27 | client.getOptions().setUseInsecureSSL(true);
28 | // Turn off the logger
29 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
30 |
31 | // Get the login page
32 | HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl)) ;
33 |
34 | // Select the email input
35 | HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']");
36 |
37 | // Select the password input
38 | HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']");
39 |
40 | // Set the value for both inputs
41 | inputEmail.setValueAttribute(email);
42 | inputPassword.setValueAttribute(password);
43 |
44 | // Select the form
45 | HtmlForm loginForm = inputPassword.getEnclosingForm() ;
46 |
47 | // Generate the POST request with the form
48 | page = client.getPage(loginForm.getWebRequest(null));
49 |
50 | if(!page.asText().contains("You are now logged in")){
51 | System.err.println("Error: Authentication failed");
52 | }else{
53 | System.out.println("Success ! Logged in");
54 |
55 | }
56 |
57 | page = client.getPage(baseUrl + "upload_file") ;
58 | HtmlForm uploadFileForm = page.getFirstByXPath("//form[@action='/upload_file']");
59 | HtmlFileInput fileInput = uploadFileForm.getInputByName("user_file");
60 |
61 | fileInput.setValueAttribute(fileName);
62 | fileInput.setContentType("image/png");
63 |
64 | HtmlElement button = page.getFirstByXPath("//button");
65 | page = button.click();
66 |
67 |
68 | if(page.asText().contains("Your file was successful uploaded")){
69 | System.out.println("File successfully uploaded");
70 | }else{
71 | System.out.println("Error uploading the file");
72 | }
73 |
74 | }
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/src/chapter6/ReCaptchaV2.java:
--------------------------------------------------------------------------------
1 | package chapter6;
2 |
3 | import java.util.logging.Level;
4 |
5 | import org.openqa.selenium.By;
6 | import org.openqa.selenium.JavascriptExecutor;
7 | import org.openqa.selenium.WebDriver;
8 | import org.openqa.selenium.WebElement;
9 | import org.openqa.selenium.chrome.ChromeDriver;
10 | import org.openqa.selenium.chrome.ChromeOptions;
11 |
12 | import com.gargoylesoftware.htmlunit.Page;
13 | import com.gargoylesoftware.htmlunit.WebClient;
14 |
15 | public class ReCaptchaV2 {
16 |
17 |
18 | public static final String API_KEY = "" ;
19 |
20 |
21 |
22 | public static void main(String[] args) throws Exception {
23 | final String API_BASE_URL = "http://2captcha.com/" ;
24 | final String BASE_URL = "https://www.javawebscrapingsandbox.com/captcha";
25 | WebClient client = new WebClient();
26 | client.getOptions().setJavaScriptEnabled(false);
27 | client.getOptions().setCssEnabled(false);
28 | client.getOptions().setUseInsecureSSL(true);
29 | java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
30 |
31 |
32 | final String chromeDriverPath = "/usr/local/bin/chromedriver" ;
33 | System.setProperty("webdriver.chrome.driver", chromeDriverPath);
34 | ChromeOptions options = new ChromeOptions();
35 | options.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors", "--silent");
36 | options.addArguments("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36");
37 | WebDriver driver = new ChromeDriver(options);
38 |
39 | driver.get(BASE_URL);
40 |
41 |
42 |
43 | String siteId = "" ;
44 | WebElement elem = driver.findElement(By.xpath("//div[@class='g-recaptcha']"));
45 |
46 | try {
47 | siteId = elem.getAttribute("data-sitekey");
48 | } catch (Exception e) {
49 | System.err.println("Catpcha's div cannot be found or missing attribute data-sitekey");
50 | e.printStackTrace();
51 | }
52 | String QUERY = String.format("%sin.php?key=%s&method=userrecaptcha&googlekey=%s&pageurl=%s&here=now",
53 | API_BASE_URL, API_KEY, siteId, BASE_URL);
54 | Page response = client.getPage(QUERY);
55 | String stringResponse = response.getWebResponse().getContentAsString();
56 | String jobId = "";
57 | if(!stringResponse.contains("OK")){
58 | throw new Exception("Error with 2captcha.com API, received : " + stringResponse);
59 | }else{
60 | jobId = stringResponse.split("\\|")[1];
61 | }
62 |
63 | boolean captchaSolved = false ;
64 | while(!captchaSolved){
65 | response = client.getPage(String.format("%sres.php?key=%s&action=get&id=%s", API_BASE_URL, API_KEY, jobId));
66 | if (response.getWebResponse().getContentAsString().contains("CAPCHA_NOT_READY")){
67 | Thread.sleep(3000);
68 | System.out.println("Waiting for 2Captcha.com ...");
69 | }else{
70 | captchaSolved = true ;
71 | System.out.println("Captcha solved !");
72 | }
73 | }
74 | String captchaToken = response.getWebResponse().getContentAsString().split("\\|")[1];
75 | JavascriptExecutor js = (JavascriptExecutor) driver ;
76 | js.executeScript("document.getElementById('g-recaptcha-response').style.display = 'block';");
77 | WebElement textarea = driver.findElement(By.xpath("//textarea[@id='g-recaptcha-response']"));
78 |
79 | textarea.sendKeys(captchaToken);
80 | js.executeScript("document.getElementById('g-recaptcha-response').style.display = 'none';");
81 | driver.findElement(By.id("name")).sendKeys("Kevin");
82 | driver.getPageSource();
83 | driver.findElement(By.id("submit")).click();
84 |
85 | if(driver.getPageSource().contains("your captcha was successfully submitted")){
86 | System.out.println("Captcha successfuly submitted !");
87 | }else{
88 | System.out.println("Error while submitting captcha");
89 | }
90 |
91 |
92 | System.out.println();
93 |
94 | }
95 |
96 | }
97 |
--------------------------------------------------------------------------------