├── .gitignore
├── file.png
├── invoice.pdf
├── ocr_exemple.jpg
├── screenshot.png
├── README.md
├── src
    ├── chapter4
    │   ├── BasicAuthentication.java
    │   ├── SearchForm.java
    │   ├── Authentication.java
    │   ├── AuthenticationPostRequest.java
    │   └── FileUploader.java
    ├── chapter5
    │   ├── ChromeHeadless.java
    │   ├── DirectApiCalls.java
    │   ├── ExecuteJavascriptFunction.java
    │   └── InfiniteScrollHeadlessChrome.java
    ├── chapter3
    │   ├── HackerNewsItem.java
    │   └── HNScraper.java
    ├── chapter6
    │   ├── OpticalCharacterRecognition.java
    │   ├── PDFScraping.java
    │   └── ReCaptchaV2.java
    └── chapter7
    │   └── Chapter7.java
├── pom.xml
└── user-agents.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | *.classpath
2 | *.project
3 | *.settings
4 | bin/
5 | target/


--------------------------------------------------------------------------------
/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/file.png


--------------------------------------------------------------------------------
/invoice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/invoice.pdf


--------------------------------------------------------------------------------
/ocr_exemple.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/ocr_exemple.jpg


--------------------------------------------------------------------------------
/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksahin/javawebscrapinghandbook_code/HEAD/screenshot.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # javawebscrapinghandbook_code
2 | 
3 | These are the code examples for my ebook: https://www.javawebscrapinghandbook.com
4 | 


--------------------------------------------------------------------------------
/src/chapter4/BasicAuthentication.java:
--------------------------------------------------------------------------------
 1 | package chapter4;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.MalformedURLException;
 5 | import java.util.logging.Level;
 6 | 
 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
 8 | import com.gargoylesoftware.htmlunit.WebClient;
 9 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
10 | 
11 | public class BasicAuthentication {
12 | 	
13 | 	static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
14 | 	static final String username = "basic" ;
15 | 	static final String password = "auth" ;
16 | 	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
17 | 		WebClient client = new WebClient();
18 | 		client.getOptions().setJavaScriptEnabled(true);
19 | 		client.getOptions().setCssEnabled(false);
20 | 		client.getOptions().setUseInsecureSSL(true);
21 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
22 | 		
23 | 		HtmlPage page = client.getPage(String.format("https://%s:%s@www.javawebscrapingsandbox.com/basic_auth", username, password));
24 | 		System.out.println(page.asText());
25 | 		
26 | 	}
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/src/chapter5/ChromeHeadless.java:
--------------------------------------------------------------------------------
 1 | package chapter5;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | 
 6 | import org.apache.commons.io.FileUtils;
 7 | import org.openqa.selenium.OutputType;
 8 | import org.openqa.selenium.TakesScreenshot;
 9 | import org.openqa.selenium.WebDriver;
10 | import org.openqa.selenium.chrome.ChromeDriver;
11 | import org.openqa.selenium.chrome.ChromeOptions;
12 | 
13 | public class ChromeHeadless {
14 | 	public static void main(String[] args) throws IOException, InterruptedException{
15 | 		String chromeDriverPath = "/Users/kevin/Downloads/chromedriver" ;
16 | 		System.setProperty("webdriver.chrome.driver", chromeDriverPath);
17 | 		ChromeOptions options = new ChromeOptions();
18 |         options.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors", "--silent");
19 |         WebDriver driver = new ChromeDriver(options);
20 | 
21 |         // Get the login page
22 |         driver.get("https://pro.coinbase.com/trade/BTC-USD");
23 |         Thread.sleep(10000);
24 |         
25 |       
26 |         // Take a screenshot of the current page
27 |         File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
28 |         FileUtils.copyFile(screenshot, new File("screenshot.png"));
29 |         driver.close();
30 | 	}
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/src/chapter3/HackerNewsItem.java:
--------------------------------------------------------------------------------
 1 | package chapter3;
 2 | 
 3 | public class HackerNewsItem {
 4 | 	private String title;
 5 | 
 6 | 	private String url ;
 7 | 	private String author;
 8 | 	private int score;
 9 | 	private int position ;
10 | 	private int id ;
11 | 	
12 | 	public HackerNewsItem(String title, String url, String author, int score, int position, int id) {
13 | 		super();
14 | 		this.title = title;
15 | 		this.url = url;
16 | 		this.author = author;
17 | 		this.score = score;
18 | 		this.position = position;
19 | 		this.id = id;
20 | 	}
21 | 	
22 | 	public int getId() {
23 | 		return id;
24 | 	}
25 | 	public void setId(int id) {
26 | 		this.id = id;
27 | 	}
28 | 	public int getPosition() {
29 | 		return position;
30 | 	}
31 | 	public void setPosition(int position) {
32 | 		this.position = position;
33 | 	}
34 | 	public String getTitle() {
35 | 		return title;
36 | 	}
37 | 	public void setTitle(String title) {
38 | 		this.title = title;
39 | 	}
40 | 	public String getUrl() {
41 | 		return url;
42 | 	}
43 | 	public void setUrl(String url) {
44 | 		this.url = url;
45 | 	}
46 | 	public String getAuthor() {
47 | 		return author;
48 | 	}
49 | 	public void setAuthor(String author) {
50 | 		this.author = author;
51 | 	}
52 | 	public int getScore() {
53 | 		return score;
54 | 	}
55 | 	public void setScore(int score) {
56 | 		this.score = score;
57 | 	}
58 | 	
59 | }
60 | 


--------------------------------------------------------------------------------
/src/chapter6/OpticalCharacterRecognition.java:
--------------------------------------------------------------------------------
 1 | package chapter6;
 2 | 
 3 | import org.bytedeco.javacpp.*;
 4 | import org.bytedeco.javacpp.BytePointer;
 5 | import org.bytedeco.javacpp.lept.*;
 6 | 
 7 | import org.bytedeco.javacpp.tesseract.TessBaseAPI;
 8 | 
 9 | public class OpticalCharacterRecognition {
10 | 
11 | 	final static String TESS_DATA_PATH = "/usr/local/Cellar/tesseract/3.05.02/share/tessdata" ;
12 | 	
13 | 	public static void main(String[] args) {
14 | 		BytePointer outText;
15 | 		TessBaseAPI api = new TessBaseAPI();
16 | 		
17 | 		if (api.Init(TESS_DATA_PATH, "ENG") != 0) {
18 |             System.err.println("Could not initialize tesseract.");
19 |             System.exit(1);
20 |         }
21 | 		//api.SetVariable("tessedit_char_whitelist", "0123456789,");
22 | 		PIX image = lept.pixRead("ocr_exemple.jpg");
23 |         api.SetImage(image);
24 |         
25 |         // Get OCR result
26 |         outText = api.GetUTF8Text();
27 |         String string = outText.getString();
28 |         String invoiceNumber = "" ;
29 |         for(String lines : string.split("\\n")){
30 |         	if(lines.contains("Invoice")){
31 |         		invoiceNumber = lines.split("Invoice Number: ")[1];
32 |         		System.out.println(String.format("Invoice number found : %s", invoiceNumber));
33 |         	}
34 |         }
35 |         
36 |         // Destroy used object and release memory
37 |         api.End();
38 |         outText.deallocate();
39 |         lept.pixDestroy(image);
40 | 
41 | 	}
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 2 |   <modelVersion>4.0.0</modelVersion>
 3 |   <groupId>ebook_webscraping</groupId>
 4 |   <artifactId>ebook_webscraping</artifactId>
 5 |   <version>0.0.1-SNAPSHOT</version>
 6 |   <build>
 7 |   
 8 |     <sourceDirectory>src</sourceDirectory>
 9 |     <plugins>
10 |       <plugin>
11 |         <artifactId>maven-compiler-plugin</artifactId>
12 |         <version>3.3</version>
13 |         <configuration>
14 |           <source/>
15 |           <target/>
16 |         </configuration>
17 |       </plugin>
18 |     </plugins>
19 |   </build>
20 |   
21 |   
22 |    <dependencies>
23 | 		<dependency>
24 | 		    <groupId>net.sourceforge.htmlunit</groupId>
25 | 		    <artifactId>htmlunit</artifactId>
26 | 		    <version>2.28</version>
27 | 		</dependency>
28 |   	<dependency>
29 |   		<groupId>com.fasterxml.jackson.core</groupId>
30 |   		<artifactId>jackson-databind</artifactId>
31 |   		<version>2.7.0</version>
32 |   	</dependency>
33 | 
34 | 	<dependency>
35 |     <groupId>org.seleniumhq.selenium</groupId>
36 |     <artifactId>selenium-java</artifactId>
37 |     <version>3.8.1</version>
38 | </dependency>
39 | 
40 | <dependency>
41 |   <groupId>org.apache.pdfbox</groupId>
42 |   <artifactId>pdfbox</artifactId>
43 |   <version>2.0.4</version>
44 | </dependency>
45 | 
46 | <dependency>
47 |     <groupId>org.bytedeco.javacpp-presets</groupId>
48 |     <artifactId>tesseract-platform</artifactId>
49 |     <version>3.05.01-1.4.1</version>
50 | </dependency>
51 | 
52 |   </dependencies>
53 |   
54 |   
55 |   
56 | </project>


--------------------------------------------------------------------------------
/src/chapter5/DirectApiCalls.java:
--------------------------------------------------------------------------------
 1 | package chapter5;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.MalformedURLException;
 5 | import java.util.Iterator;
 6 | import java.util.logging.Level;
 7 | 
 8 | import com.fasterxml.jackson.core.JsonProcessingException;
 9 | import com.fasterxml.jackson.databind.JsonNode;
10 | import com.fasterxml.jackson.databind.ObjectMapper;
11 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
12 | import com.gargoylesoftware.htmlunit.Page;
13 | import com.gargoylesoftware.htmlunit.WebClient;
14 | 
15 | public class DirectApiCalls {
16 | 	
17 | 	public static void parseJson(String jsonString) throws JsonProcessingException, IOException{
18 | 		ObjectMapper mapper = new ObjectMapper();
19 | 		JsonNode rootNode = mapper.readTree(jsonString);
20 | 		Iterator<JsonNode> elements = rootNode.elements();
21 | 		while(elements.hasNext()){
22 | 			JsonNode node = elements.next();
23 | 			Long id = node.get("id").asLong();
24 | 			String name = node.get("name").asText();
25 | 			String price = node.get("price").asText();
26 | 			System.out.println(String.format("Id: %s - Name: %s - Price: %s", id, name, price));
27 | 		}
28 | 	}
29 | 	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
30 | 		WebClient client = new WebClient();
31 | 		client.getOptions().setJavaScriptEnabled(false);
32 | 		client.getOptions().setCssEnabled(false);
33 | 		client.getOptions().setUseInsecureSSL(true);
34 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
35 | 		
36 | 		for(int i = 1; i < 5; i++){
37 | 			Page json = client.getPage("https://www.javawebscrapingsandbox.com/product/api/" + i );
38 | 			parseJson(json.getWebResponse().getContentAsString());
39 | 		}
40 | 		
41 | 	}
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/src/chapter5/ExecuteJavascriptFunction.java:
--------------------------------------------------------------------------------
 1 | package chapter5;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import org.openqa.selenium.By;
 6 | import org.openqa.selenium.JavascriptExecutor;
 7 | import org.openqa.selenium.WebDriver;
 8 | import org.openqa.selenium.WebElement;
 9 | import org.openqa.selenium.chrome.ChromeDriver;
10 | import org.openqa.selenium.chrome.ChromeOptions;
11 | 
12 | public class ExecuteJavascriptFunction {
13 | 	
14 | 	public static void processLines(List<WebElement> lines){
15 | 		int size = lines.size();
16 | 		System.out.println(String.format("There are %s product rows in the table", size));
17 | 	}
18 | 	public static void main(String[] args) throws InterruptedException {
19 | 		String chromeDriverPath = "/Users/kevin/.nvm/versions/node/v10.4.0/lib/node_modules/chromedriver/lib/chromedriver/chromedriver" ;
20 | 		System.setProperty("webdriver.chrome.driver", chromeDriverPath);
21 | 		ChromeOptions options = new ChromeOptions();
22 |         options.addArguments("--headless" ,"--disable-gpu", "--ignore-certificate-errors", "--silent");
23 |         options.addArguments("window-size=600,400");
24 |         
25 |         WebDriver driver = new ChromeDriver(options);
26 |         JavascriptExecutor js = (JavascriptExecutor) driver;
27 |         int pageNumber = 5 ;
28 |         
29 |         driver.get("https://www.javawebscrapingsandbox.com/product/infinite_scroll");
30 |         for(int i = 3; i < pageNumber + 3; i++){
31 |         	js.executeScript("drawNextLines('/product/api/" + i +"');");
32 |         	while((Boolean)js.executeScript("return win.data('ajaxready');") == false){
33 |         		Thread.sleep(100);
34 |         	}
35 |         }
36 |         List<WebElement> rows = driver.findElements(By.xpath("//tr"));
37 |         
38 |         // do something with the row list
39 |         processLines(rows);
40 | 
41 | 	}
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/src/chapter5/InfiniteScrollHeadlessChrome.java:
--------------------------------------------------------------------------------
 1 | package chapter5;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import org.openqa.selenium.By;
 6 | import org.openqa.selenium.JavascriptExecutor;
 7 | import org.openqa.selenium.WebDriver;
 8 | import org.openqa.selenium.WebElement;
 9 | import org.openqa.selenium.chrome.ChromeDriver;
10 | import org.openqa.selenium.chrome.ChromeOptions;
11 | 
12 | public class InfiniteScrollHeadlessChrome {
13 | 	
14 | 	
15 | 	static final String URL = "";
16 | 	public static void processLines(List<WebElement> lines){
17 | 		int size = lines.size();
18 | 		System.out.println(String.format("There are %s product rows in the table", size));
19 | 	}
20 | 	public static void main(String[] args) throws InterruptedException {
21 | 		String chromeDriverPath = "/Users/kevin/.nvm/versions/node/v10.4.0/lib/node_modules/chromedriver/lib/chromedriver/chromedriver" ;
22 | 		System.setProperty("webdriver.chrome.driver", chromeDriverPath);
23 | 		ChromeOptions options = new ChromeOptions();
24 |         options.addArguments("--headless" ,"--disable-gpu", "--ignore-certificate-errors", "--silent");
25 |         options.addArguments("window-size=600,400");
26 |         
27 |         
28 |         WebDriver driver = new ChromeDriver(options);
29 |         JavascriptExecutor js = (JavascriptExecutor) driver;
30 |         int pageNumber = 5 ;
31 |         
32 |         driver.get("https://www.javawebscrapingsandbox.com/product/infinite_scroll");
33 |         for(int i = 0; i < pageNumber; i++){
34 |         	js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
35 |         	Thread.sleep(1200);
36 |         }
37 |         List<WebElement> rows = driver.findElements(By.xpath("//tr"));
38 |         
39 |         // do something with the row list
40 |         processLines(rows);
41 |         
42 |         driver.quit();
43 | 
44 | 	}
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/user-agents.txt:
--------------------------------------------------------------------------------
 1 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1
 2 | Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1
 3 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31
 4 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36
 5 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36
 6 | Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
 7 | Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.7 Safari/532.2
 8 | Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13
 9 | Mozilla/5.0 (Windows NT 5.1; rv:5.0.1) Gecko/20100101 Firefox/5.0.1
10 | Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.02
11 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0
12 | Mozilla/5.0 (Windows NT 6.1; rv:2.0b7pre) Gecko/20100921 Firefox/4.0b7pre
13 | Mozilla/5.0 (X11; U; Linux x86; fr-fr) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3 AppleWebKit/532.4 Safari/532.4
14 | Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11
15 | Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.3) Gecko/20090824 Firefox/3.5.3 GTB5
16 | Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1
17 | Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)
18 | Opera/9.80 (Windows NT 6.2; Win64; x64) Presto/2.12.388 Version/12.15
19 | Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16
20 | Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14


--------------------------------------------------------------------------------
/src/chapter4/SearchForm.java:
--------------------------------------------------------------------------------
 1 | package chapter4;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.MalformedURLException;
 5 | import java.util.logging.Level;
 6 | 
 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
 8 | import com.gargoylesoftware.htmlunit.WebClient;
 9 | import com.gargoylesoftware.htmlunit.html.HtmlForm;
10 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
11 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
12 | import com.gargoylesoftware.htmlunit.html.HtmlTable;
13 | import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
14 | import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLTableElement;
15 | 
16 | public class SearchForm {
17 | 	
18 | 	static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
19 | 	static final String MINPRICE = "300";
20 | 	static final String MAXPRICE = "650" ;
21 | 	
22 | 	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
23 | 		WebClient client = new WebClient();
24 | 		client.getOptions().setJavaScriptEnabled(true);
25 | 		client.getOptions().setCssEnabled(false);
26 | 		client.getOptions().setUseInsecureSSL(true);
27 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
28 | 		
29 | 		HtmlPage page = client.getPage(baseUrl + "product/search");
30 | 		
31 | 		HtmlInput minPrice = page.getHtmlElementById("min_price");
32 | 		HtmlInput maxPrice = page.getHtmlElementById("max_price");
33 | 		
34 | 		// set the min/max values
35 | 		minPrice.setValueAttribute(MINPRICE);
36 | 		maxPrice.setValueAttribute(MAXPRICE);
37 | 		HtmlForm form = minPrice.getEnclosingForm();
38 | 		
39 | 		page = client.getPage(form.getWebRequest(null));
40 | 		
41 | 		HtmlTable table = page.getFirstByXPath("//table");
42 | 		for(HtmlTableRow elem : table.getBodies().get(0).getRows()){
43 | 			System.out.println(String.format("Name : %s Price: %s", elem.getCell(0).asText(), elem.getCell(2).asText()));
44 | 		}
45 | 	}
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/src/chapter7/Chapter7.java:
--------------------------------------------------------------------------------
 1 | package chapter7;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.FileReader;
 5 | import java.io.IOException;
 6 | import java.util.ArrayList;
 7 | import java.util.List;
 8 | import java.util.Random;
 9 | 
10 | import com.gargoylesoftware.htmlunit.ProxyConfig;
11 | import com.gargoylesoftware.htmlunit.WebClient;
12 | 
13 | public class Chapter7 {
14 | 
15 | 	
16 | 	private static final String FILENAME = "user-agents.txt";
17 | 	
18 | 	public static WebClient initWebClientWithHeaders(){
19 | 		WebClient client = new WebClient();		
20 | 		client.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
21 | 		client.addRequestHeader("Accept-Encoding", "gzip, deflate, br");
22 | 		client.addRequestHeader("Accept-Language", "en-US,en;q=0.9,fr-FR;q=0.8,fr;q=0.7,la;q=0.6");
23 | 		client.addRequestHeader("Connection", "keep-alive");
24 | 		client.addRequestHeader("Host", "ksah.in");
25 | 		client.addRequestHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
26 | 		client.addRequestHeader("Pragma", "no-cache");
27 | 		
28 | 		return client ; 
29 | 	}
30 | 	
31 | 	private static String getRandomUseragent(){
32 | 		List<String> userAgents = new ArrayList<String>();
33 | 		Random rand = new Random();
34 | 		try (BufferedReader br = new BufferedReader(new FileReader(FILENAME))) {
35 | 			String sCurrentLine;
36 | 			while ((sCurrentLine = br.readLine()) != null) {
37 | 				userAgents.add(sCurrentLine);
38 | 			}
39 | 
40 | 		} catch (IOException e) {
41 | 			e.printStackTrace();
42 | 		}
43 | 		
44 | 		return userAgents.get(rand.nextInt(userAgents.size()));
45 | 	}
46 | 	public static void main(String[] args) {
47 | 		WebClient client = new WebClient() ;
48 | 		ProxyConfig proxyConfig = new ProxyConfig("host", 12345);
49 | 		client.getOptions().setProxyConfig(proxyConfig);
50 | 		System.out.println(getRandomUseragent());
51 | 	}
52 | }
53 | 


--------------------------------------------------------------------------------
/src/chapter3/HNScraper.java:
--------------------------------------------------------------------------------
 1 | package chapter3;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import com.fasterxml.jackson.databind.ObjectMapper;
 6 | import com.gargoylesoftware.htmlunit.WebClient;
 7 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
 8 | import com.gargoylesoftware.htmlunit.html.HtmlElement;
 9 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
10 | 
11 | public class HNScraper {
12 | 
13 | 	public static void main(String[] args) {
14 | 		String baseUrl = "https://news.ycombinator.com/" ;
15 | 		WebClient client = new WebClient();
16 | 		client.getOptions().setCssEnabled(false);
17 | 		client.getOptions().setJavaScriptEnabled(false);
18 | 		try{
19 | 			HtmlPage page = client.getPage(baseUrl);
20 | 			List<HtmlElement> itemList =  page.getByXPath("//tr[@class='athing']");
21 | 			if(itemList.isEmpty()){
22 | 				System.out.println("No item found");
23 | 			}else{
24 | 				for(HtmlElement htmlItem : itemList){
25 | 					int position = Integer.parseInt(((HtmlElement) htmlItem.getFirstByXPath("./td/span")).asText().replace(".", ""));
26 | 					int id = Integer.parseInt(htmlItem.getAttribute("id"));
27 | 					String title =  ((HtmlElement) htmlItem.getFirstByXPath("./td[not(@valign='top')][@class='title']")).asText();
28 | 					String url = ((HtmlAnchor) htmlItem.getFirstByXPath("./td[not(@valign='top')][@class='title']/a")).getHrefAttribute();
29 | 					String author =  ((HtmlElement) htmlItem.getFirstByXPath("./following-sibling::tr/td[@class='subtext']/a[@class='hnuser']")).asText();
30 | 					int score = Integer.parseInt(((HtmlElement) htmlItem.getFirstByXPath("./following-sibling::tr/td[@class='subtext']/span[@class='score']")).asText().replace(" points", ""));
31 | 					
32 | 					HackerNewsItem hnItem = new HackerNewsItem(title, url, author, score, position, id);
33 | 					
34 | 					ObjectMapper mapper = new ObjectMapper();
35 | 					String jsonString = mapper.writeValueAsString(hnItem) ;
36 | 					
37 | 					System.out.println(jsonString);
38 | 				}
39 | 			}
40 | 		}catch(Exception e){
41 | 			e.printStackTrace();
42 | 		}finally{
43 | 			client.close();
44 | 		}
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/src/chapter4/Authentication.java:
--------------------------------------------------------------------------------
 1 | package chapter4;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.MalformedURLException;
 5 | import java.util.logging.Level;
 6 | 
 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
 8 | import com.gargoylesoftware.htmlunit.WebClient;
 9 | import com.gargoylesoftware.htmlunit.html.HtmlForm;
10 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
11 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
12 | 
13 | public class Authentication {
14 | 	
15 | 	static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
16 | 	static final String loginUrl = "account/login" ;
17 | 	static final String email = "test@test.com" ;
18 | 	static final String password = "test" ;
19 | 	
20 | 	
21 | 	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException {
22 | 		WebClient client = new WebClient();
23 | 		client.getOptions().setJavaScriptEnabled(true);
24 | 		client.getOptions().setCssEnabled(false);
25 | 		client.getOptions().setUseInsecureSSL(true);
26 | 		// Turn off the logger
27 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
28 | 
29 | 		// Get the login page
30 | 		HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl)) ;
31 | 		
32 | 		// Select the email input
33 | 		HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']");
34 | 		
35 | 		// Select the password input
36 | 		HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']");
37 | 		
38 | 		// Set the value for both inputs
39 | 		inputEmail.setValueAttribute(email);
40 | 		inputPassword.setValueAttribute(password);
41 | 		
42 | 		// Select the form
43 | 		HtmlForm loginForm = inputPassword.getEnclosingForm() ;
44 | 		
45 | 		// Generate the POST request with the form
46 | 		page = client.getPage(loginForm.getWebRequest(null));
47 | 		
48 | 		if(!page.asText().contains("You are now logged in")){
49 | 			System.err.println("Error: Authentication failed");
50 | 		}else{
51 | 			System.out.println("Success ! Logged in");
52 | 		}
53 | 		
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/src/chapter6/PDFScraping.java:
--------------------------------------------------------------------------------
 1 | package chapter6;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileOutputStream;
 5 | import java.io.IOException;
 6 | import java.util.logging.Level;
 7 | import java.util.regex.Matcher;
 8 | import java.util.regex.Pattern;
 9 | 
10 | import org.apache.commons.io.IOUtils;
11 | import org.apache.pdfbox.pdmodel.PDDocument;
12 | import org.apache.pdfbox.text.PDFTextStripper;
13 | import org.apache.pdfbox.text.PDFTextStripperByArea;
14 | 
15 | import com.gargoylesoftware.htmlunit.Page;
16 | import com.gargoylesoftware.htmlunit.WebClient;
17 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
18 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
19 | 
20 | public class PDFScraping {
21 | 
22 | 	public static void main(String[] args) throws IOException {
23 | 		
24 | 		WebClient client = new WebClient();
25 | 		client.getOptions().setJavaScriptEnabled(true);
26 | 		client.getOptions().setCssEnabled(false);
27 | 		client.getOptions().setUseInsecureSSL(true);
28 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
29 | 		
30 | 		HtmlPage html = client.getPage("https://www.javawebscrapingsandbox.com/pdf");
31 | 		
32 | 		// selects the first anchor which contains "pdf" 
33 | 		HtmlAnchor anchor = html.getFirstByXPath("//a[contains(@href, 'pdf')]");
34 | 		String pdfUrl = anchor.getHrefAttribute();
35 | 		
36 | 		Page pdf = client.getPage(pdfUrl);
37 | 		
38 | 		if(pdf.getWebResponse().getContentType().equals("application/pdf")){
39 | 			System.out.println("Pdf downloaded");
40 | 			IOUtils.copy(pdf.getWebResponse().getContentAsStream(), 
41 | 					new FileOutputStream("invoice.pdf"));
42 | 			System.out.println("Pdf file created");
43 | 			PDDocument document = null;
44 | 			try{
45 | 			 	document = PDDocument.load(new File("invoice.pdf")) ;
46 | 
47 | 		        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
48 | 		        stripper.setSortByPosition(true);
49 | 
50 | 		        PDFTextStripper tStripper = new PDFTextStripper();
51 | 
52 | 		        String stringPdf = tStripper.getText(document);
53 | 		        String lines[] = stringPdf.split("\\n");
54 | 		        String pattern = "Total\\s+€\\s+(.+)";
55 | 		        Pattern p = Pattern.compile(pattern);
56 | 		        String price = "";
57 | 		        for (String line : lines) {
58 | 		        	Matcher m = p.matcher(line);
59 | 		        	if(m.find()){
60 | 		        		price = m.group(1);
61 | 		        	}
62 | 		        }
63 | 		        
64 | 		        if(!price.isEmpty()){
65 | 		        	System.out.println("Price found: " + price);
66 | 		        }else{
67 | 		        	System.out.println("Price not found");
68 | 		        }
69 | 			}finally{
70 | 				if(document != null){
71 | 					document.close();
72 | 				}
73 | 			}
74 | 		  
75 | 		}
76 |  
77 | 	}
78 | }
79 | 


--------------------------------------------------------------------------------
/src/chapter4/AuthenticationPostRequest.java:
--------------------------------------------------------------------------------
 1 | package chapter4;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.MalformedURLException;
 5 | import java.net.URL;
 6 | import java.util.ArrayList;
 7 | import java.util.List;
 8 | import java.util.logging.Level;
 9 | 
10 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
11 | import com.gargoylesoftware.htmlunit.FormEncodingType;
12 | import com.gargoylesoftware.htmlunit.HttpMethod;
13 | import com.gargoylesoftware.htmlunit.WebClient;
14 | import com.gargoylesoftware.htmlunit.WebRequest;
15 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
16 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
17 | import com.gargoylesoftware.htmlunit.util.NameValuePair;
18 | 
19 | public class AuthenticationPostRequest {
20 | 	
21 | 	
22 | 	static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
23 | 	//static final String baseUrl = "http://localhost:8000/" ;
24 | 	static final String loginUrl = "account/login" ;
25 | 	static final String email = "test@test.com" ;
26 | 	static final String password = "test" ;
27 | 	
28 | 	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
29 | 		WebClient client = new WebClient();
30 | 		client.getOptions().setJavaScriptEnabled(true);
31 | 		client.getOptions().setCssEnabled(false);
32 | 		client.getOptions().setUseInsecureSSL(true);
33 | 		// Turn off the logger
34 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
35 | 
36 | 		// Get the login page
37 | 		HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl)) ;
38 | 		
39 | 		// Select the email input
40 | 		HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']");
41 | 		
42 | 		// Select the password input
43 | 		HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']");
44 | 		
45 | 		HtmlInput csrfToken = page.getFirstByXPath("//form//input[@name='csrf_token']") ;
46 | 		WebRequest request = new WebRequest(
47 | 				new URL("http://www.javawebscrapingsandbox.com/account/login"), HttpMethod.POST);
48 | 		List<NameValuePair> params = new ArrayList<NameValuePair>();
49 | 		params.add(new NameValuePair("csrf_token", csrfToken.getValueAttribute()));
50 | 		params.add(new NameValuePair("email", email));
51 | 		params.add(new NameValuePair("password", password));
52 | 
53 | 		request.setRequestParameters(params);
54 | 		request.setAdditionalHeader("Content-Type", "application/x-www-form-urlencoded");
55 | 		request.setAdditionalHeader("Accept-Encoding", "gzip, deflate");
56 | 		
57 | 		page = client.getPage(request);
58 | 		
59 | 		if(!page.asText().contains("You are now logged in")){
60 | 			System.err.println("Error: Authentication failed");
61 | 		}else{
62 | 			System.out.println("Success ! Logged in");
63 | 		}
64 | 
65 | 	}
66 | 
67 | }
68 | 


--------------------------------------------------------------------------------
/src/chapter4/FileUploader.java:
--------------------------------------------------------------------------------
 1 | package chapter4;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.MalformedURLException;
 5 | import java.util.logging.Level;
 6 | 
 7 | import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
 8 | import com.gargoylesoftware.htmlunit.WebClient;
 9 | import com.gargoylesoftware.htmlunit.html.HtmlElement;
10 | import com.gargoylesoftware.htmlunit.html.HtmlFileInput;
11 | import com.gargoylesoftware.htmlunit.html.HtmlForm;
12 | import com.gargoylesoftware.htmlunit.html.HtmlInput;
13 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
14 | 
15 | public class FileUploader {
16 | 
17 | 	static final String baseUrl = "https://www.javawebscrapingsandbox.com/" ;
18 | 	static final String loginUrl = "account/login" ;
19 | 	static final String email = "test@test.com" ;
20 | 	static final String password = "test" ;
21 | 	static final String fileName = "file.png" ;
22 | 	
23 | 	public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException {
24 | 		WebClient client = new WebClient();
25 | 		client.getOptions().setJavaScriptEnabled(true);
26 | 		client.getOptions().setCssEnabled(false);
27 | 		client.getOptions().setUseInsecureSSL(true);
28 | 		// Turn off the logger
29 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); 
30 | 
31 | 		// Get the login page
32 | 		HtmlPage page = client.getPage(String.format("%s%s", baseUrl, loginUrl)) ;
33 | 		
34 | 		// Select the email input
35 | 		HtmlInput inputEmail = page.getFirstByXPath("//form//input[@name='email']");
36 | 		
37 | 		// Select the password input
38 | 		HtmlInput inputPassword = page.getFirstByXPath("//form//input[@name='password']");
39 | 		
40 | 		// Set the value for both inputs
41 | 		inputEmail.setValueAttribute(email);
42 | 		inputPassword.setValueAttribute(password);
43 | 		
44 | 		// Select the form
45 | 		HtmlForm loginForm = inputPassword.getEnclosingForm() ;
46 | 		
47 | 		// Generate the POST request with the form
48 | 		page = client.getPage(loginForm.getWebRequest(null));
49 | 		
50 | 		if(!page.asText().contains("You are now logged in")){
51 | 			System.err.println("Error: Authentication failed");
52 | 		}else{
53 | 			System.out.println("Success ! Logged in");
54 | 		
55 | 		}
56 | 		
57 | 		page = client.getPage(baseUrl + "upload_file") ;
58 | 		HtmlForm uploadFileForm = page.getFirstByXPath("//form[@action='/upload_file']");
59 | 		HtmlFileInput fileInput = uploadFileForm.getInputByName("user_file");
60 | 		
61 | 		fileInput.setValueAttribute(fileName);
62 | 		fileInput.setContentType("image/png");
63 | 		
64 | 		HtmlElement button = page.getFirstByXPath("//button");
65 | 		page = button.click();
66 | 		
67 | 		
68 | 		if(page.asText().contains("Your file was successful uploaded")){
69 | 			System.out.println("File successfully uploaded");
70 | 		}else{
71 | 			System.out.println("Error uploading the file");
72 | 		}
73 | 		
74 | 	}
75 | 
76 | }
77 | 


--------------------------------------------------------------------------------
/src/chapter6/ReCaptchaV2.java:
--------------------------------------------------------------------------------
 1 | package chapter6;
 2 | 
 3 | import java.util.logging.Level;
 4 | 
 5 | import org.openqa.selenium.By;
 6 | import org.openqa.selenium.JavascriptExecutor;
 7 | import org.openqa.selenium.WebDriver;
 8 | import org.openqa.selenium.WebElement;
 9 | import org.openqa.selenium.chrome.ChromeDriver;
10 | import org.openqa.selenium.chrome.ChromeOptions;
11 | 
12 | import com.gargoylesoftware.htmlunit.Page;
13 | import com.gargoylesoftware.htmlunit.WebClient;
14 | 
15 | public class ReCaptchaV2 {
16 | 	
17 | 	
18 | 	public static final String API_KEY = "" ;
19 | 	
20 | 	
21 | 	
22 | 	public static void main(String[] args) throws Exception {
23 | 		final String API_BASE_URL = "http://2captcha.com/" ;
24 | 		final String BASE_URL = "https://www.javawebscrapingsandbox.com/captcha";
25 | 		WebClient client = new WebClient();
26 | 		client.getOptions().setJavaScriptEnabled(false);
27 | 		client.getOptions().setCssEnabled(false);
28 | 		client.getOptions().setUseInsecureSSL(true);
29 | 		java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
30 | 		
31 | 		
32 | 		final String chromeDriverPath = "/usr/local/bin/chromedriver" ;
33 | 		System.setProperty("webdriver.chrome.driver", chromeDriverPath);
34 | 		ChromeOptions options = new ChromeOptions();
35 |         options.addArguments("--headless", "--disable-gpu", "--window-size=1920,1200","--ignore-certificate-errors", "--silent");
36 |         options.addArguments("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36");
37 |         WebDriver driver = new ChromeDriver(options);
38 |         
39 |         driver.get(BASE_URL);
40 | 	
41 | 	        
42 | 		
43 | 		String siteId = "" ;
44 | 		WebElement elem = driver.findElement(By.xpath("//div[@class='g-recaptcha']"));
45 | 		
46 | 		try {
47 | 			siteId = elem.getAttribute("data-sitekey");
48 | 		} catch (Exception e) {
49 | 			System.err.println("Catpcha's div cannot be found or missing attribute data-sitekey");
50 | 			e.printStackTrace();
51 | 		}
52 | 		String QUERY = String.format("%sin.php?key=%s&method=userrecaptcha&googlekey=%s&pageurl=%s&here=now", 
53 | 				API_BASE_URL, API_KEY, siteId, BASE_URL);
54 | 		Page response = client.getPage(QUERY);
55 | 		String stringResponse = response.getWebResponse().getContentAsString();
56 | 		String jobId = "";
57 | 		if(!stringResponse.contains("OK")){
58 | 			throw new Exception("Error with 2captcha.com API, received : " + stringResponse);
59 | 		}else{
60 | 			jobId = stringResponse.split("\\|")[1];
61 | 		}
62 | 		
63 | 		boolean captchaSolved = false ;
64 | 		while(!captchaSolved){
65 | 			response = client.getPage(String.format("%sres.php?key=%s&action=get&id=%s", API_BASE_URL, API_KEY, jobId));
66 | 			if (response.getWebResponse().getContentAsString().contains("CAPCHA_NOT_READY")){
67 | 				Thread.sleep(3000);
68 | 				System.out.println("Waiting for 2Captcha.com ...");
69 | 			}else{
70 | 				captchaSolved = true ;
71 | 				System.out.println("Captcha solved !");
72 | 			}
73 | 		}
74 | 		String captchaToken = response.getWebResponse().getContentAsString().split("\\|")[1];
75 | 		JavascriptExecutor js = (JavascriptExecutor) driver ;
76 | 		js.executeScript("document.getElementById('g-recaptcha-response').style.display = 'block';");
77 | 		WebElement textarea = driver.findElement(By.xpath("//textarea[@id='g-recaptcha-response']"));
78 | 		
79 | 		textarea.sendKeys(captchaToken);
80 | 		js.executeScript("document.getElementById('g-recaptcha-response').style.display = 'none';");
81 | 		driver.findElement(By.id("name")).sendKeys("Kevin");
82 | 		driver.getPageSource();
83 | 		driver.findElement(By.id("submit")).click();
84 | 		
85 | 		if(driver.getPageSource().contains("your captcha was successfully submitted")){
86 | 			System.out.println("Captcha successfuly submitted !");
87 | 		}else{
88 | 			System.out.println("Error while submitting captcha");
89 | 		}
90 | 		
91 | 		
92 | 		System.out.println();
93 | 
94 | 	}
95 | 
96 | }
97 | 


--------------------------------------------------------------------------------