├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
    ├── main
    │   └── java
    │       └── scraper
    │           ├── Scraper.java
    │           ├── result
    │           │   ├── DocumentResult.java
    │           │   ├── JsoupElementResult.java
    │           │   └── StringResult.java
    │           └── util
    │               └── CustomHtmlToPlainText.java
    └── test
        ├── java
        │   └── scraper
        │       └── ScraperTest.java
        └── resources
            └── test.html
/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/**/workspace.xml 2 | .idea/**/tasks.xml 3 | .idea/**/usage.statistics.xml 4 | .idea/**/dictionaries 5 | .idea/**/shelf 6 | .idea/**/contentModel.xml 7 | .idea/**/dataSources/ 8 | .idea/**/dataSources.ids 9 | .idea/**/dataSources.local.xml 10 | .idea/**/sqlDataSources.xml 11 | .idea/**/dynamic.xml 12 | .idea/**/uiDesigner.xml 13 | .idea/**/dbnavigator.xml 14 | .idea/**/gradle.xml 15 | .idea/**/libraries 16 | .idea_modules/ 17 | .idea/ 18 | *.iml 19 | modules.xml 20 | .idea/misc.xml 21 | *.ipr 22 | target/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Janos Szendi-Varga 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Neo4jscraperproc 2 | ========================== 3 | ## Preface 4 | This is a pretotype implementation of an idea for the [Global GraphHack 2019](https://globalgraphhack.devpost.com) competition. 5 | It is not "ready", it is not nice, but it works. 6 | 7 | ## Description 8 | If we use Neo4j, we sometimes need textual information from a web page, or the links of a website to build a graph from it. It is a quite common use case to follow links and extract information from the linked pages. Nowadays, as NLP is becoming more mainstream, collecting textual information from the web is also a frequent requirement. 9 | We would like to do this in the very same step in which we create or modify our graphs with Cypher commands. This is why I created this tiny tool: to be able to do web scraping with Cypher commands via stored procedures in Neo4j. The procedures use the [jsoup](http://jsoup.org) Java library for the actual scraping. 10 | 11 | ## Install 12 | 13 | To use this plugin, drop the .jar file ([you can download it here](https://github.com/szenyo/neo4jscraperproc/releases)) into the `plugins` directory of your Neo4j installation. 
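On a typical tarball installation the install step is just a file copy plus a restart. A minimal sketch (the Neo4j home path and the jar file name below are assumptions; substitute your actual installation directory and the release jar you downloaded):

```shell
# Both paths are assumptions -- adjust to your setup.
NEO4J_HOME="$HOME/neo4j-community-3.5.9"

# Drop the downloaded plugin jar into Neo4j's plugins directory.
cp neo4jscraperproc-0.1-SNAPSHOT.jar "$NEO4J_HOME/plugins/"

# Restart Neo4j so the scraper.* procedures are registered on startup.
"$NEO4J_HOME/bin/neo4j" restart
```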
14 | 15 | ## Configuration 16 | Insert this line into `neo4j.conf` to be able to use the scraper procedures: 17 | 18 | ```dbms.security.procedures.unrestricted=scraper.*``` 19 | 20 | ## Examples 21 | 22 | ### Node from random Wikipedia page 23 | ```call scraper.select("https://en.wikipedia.org/wiki/Special:Random","body") yield element create (:Wikinode {url:element.url,text:element.text})``` 24 | 25 | ### Reference URL list from a Wikipedia page 26 | ```call scraper.select('https://en.wikipedia.org/wiki/Budapest','div.reflist cite a.external') yield element with element.attributes.`abs:href` as url 27 | return url``` 28 | 29 | ### Get content from reference URL list 30 | This way you can get the content of the URLs from the reference section of a Wikipedia page. 31 | Note: this can take a long time, depending on the number of URLs and on your internet connection, CPU, memory, etc. 32 | 33 | ```call scraper.select('https://en.wikipedia.org/wiki/Budapest','div.reflist cite a.external') yield element with element.attributes.`abs:href` as url 34 | call scraper.getPlainText(url) yield value 35 | create (w:Page {url: url, text: value})``` 36 | 37 | ### Trick to get eBay prices of something 38 | Sometimes you want to get specific elements from an HTML page. You can use the selector syntax to get them. 39 | 40 | ```call scraper.select('https://www.ebay.com/sch/i.html?_nkw=seiko+turtle&rt=nc&LH_BIN=1','.s-item__price') yield element return element.text``` 41 | 42 | ### More advanced usage: scraping an eBay listing 43 | You can easily scrape title, link and price information from the result page. 
44 | 45 | ``` 46 | call scraper.select('https://www.ebay.com/sch/i.html?_nkw=seiko+turtle&rt=nc&LH_BIN=1','.s-item__wrapper') yield element with element as row 47 | call scraper.selectInHtml(row.html,'.s-item__link') yield element with element.attributes.href as url,row 48 | call scraper.selectInHtml(row.html,'.s-item__title') yield element with element.text as title,url,row 49 | call scraper.selectInHtml(row.html,'.s-item__price') yield element with element.text as price, title,url 50 | return title, url, price 51 | ``` 52 | 53 | ### All the procedures 54 | 55 | ``` 56 | scraper.getDocument(url) YIELD value - Return the content of a URL 57 | scraper.select(url,selector) YIELD element - Find elements that match the Selector CSS query, with this element as the starting context. 58 | scraper.selectInHtml(html,selector) YIELD element - Find elements that match the Selector CSS query, with this element as the starting context. 59 | scraper.getLinks(url) YIELD element - Get link elements from a URL. 60 | scraper.getLinksInHtml(html) YIELD element - Get link elements from HTML. 61 | scraper.getMediaLinks(url) YIELD element - Get media link elements. 62 | scraper.getMediaLinksInHtml(html) YIELD element - Get media link elements. 63 | scraper.getPlainText(url,selector) YIELD value - Get plain text version of a given page. 64 | scraper.getPlainTextInHtml(html,selector) YIELD value - Get plain text version of a given page. 65 | scraper.getElementById(url,id) YIELD element - Find an element by ID, including or under this element. 66 | scraper.getElementByIdInHtml(html,id) YIELD element - Find an element by ID, including or under this element. 67 | scraper.getElementsByTag(url,tag) YIELD element - Finds elements, including and recursively under this element, with the specified tag name. 68 | scraper.getElementsByTagInHtml(html,tag) YIELD element - Finds elements, including and recursively under this element, with the specified tag name. 
69 | scraper.getElementsByClass(url,className) YIELD element - Find elements that have this class, including or under this element. 70 | scraper.getElementsByClassInHtml(html,className) YIELD element - Find elements that have this class, including or under this element. 71 | scraper.getElementsByAttribute(url,key) YIELD element - Find elements that have a named attribute set. 72 | scraper.getElementsByAttributeInHtml(html,attribute) YIELD element - Find elements that have a named attribute set. 73 | scraper.getElementsByAttributeStarting(url,keyPrefix) YIELD element - Find elements that have an attribute name starting with the supplied prefix. Use data- to find elements that have HTML5 datasets. 74 | scraper.getElementsByAttributeStartingInHtml(html,keyPrefix) YIELD element - Find elements that have an attribute name starting with the supplied prefix. Use data- to find elements that have HTML5 datasets. 75 | scraper.getElementsByAttributeValue(url,key,value) YIELD element - Find elements that have an attribute with the specific value. 76 | scraper.getElementsByAttributeValueInHtml(html,key,value) YIELD element - Find elements that have an attribute with the specific value. 77 | scraper.getElementsByAttributeValueContaining(url,key,match) YIELD element - Find elements that have attributes whose value contains the match string. 78 | scraper.getElementsByAttributeValueContainingInHtml(html,key,match) YIELD element - Find elements that have attributes whose value contains the match string. 79 | scraper.getElementsByAttributeValueEnding(url,key,valueSuffix) YIELD element - Find elements that have attributes that end with the value suffix. 80 | scraper.getElementsByAttributeValueEndingInHtml(html,key,valueSuffix) YIELD element - Find elements that have attributes that end with the value suffix. 81 | scraper.getElementsByAttributeValueMatching(url,key,regex) YIELD element - Find elements that have attributes whose values match the supplied regular expression. 
82 | scraper.getElementsByAttributeValueMatchingInHtml(html,key,regex) YIELD element - Find elements that have attributes whose values match the supplied regular expression. 83 | scraper.getElementsByAttributeValueNot(url,key,value) YIELD element - Find elements that either do not have this attribute, or have it with a different value. 84 | scraper.getElementsByAttributeValueNotInHtml(html,key,value) YIELD element - Find elements that either do not have this attribute, or have it with a different value. 85 | scraper.getElementsByAttributeValueStarting(url,key,valuePrefix) YIELD element - Find elements that have attributes that start with the value prefix. 86 | scraper.getElementsByAttributeValueStartingInHtml(html,key,valuePrefix) YIELD element - Find elements that have attributes that start with the value prefix. 87 | scraper.getElementsByIndexEquals(url,index) YIELD element - Find elements whose sibling index is equal to the supplied index. 88 | scraper.getElementsByIndexEqualsInHtml(html,index) YIELD element - Find elements whose sibling index is equal to the supplied index. 89 | scraper.getElementsByIndexGreaterThan(url,index) YIELD element - Find elements whose sibling index is greater than the supplied index. 90 | scraper.getElementsByIndexGreaterThanInHtml(html,index) YIELD element - Find elements whose sibling index is greater than the supplied index. 91 | scraper.getElementsByIndexLessThan(url,index) YIELD element - Find elements whose sibling index is less than the supplied index. 92 | scraper.getElementsByIndexLessThanInHtml(html,index) YIELD element - Find elements whose sibling index is less than the supplied index. 93 | scraper.getElementsContainingOwnText(url,searchText) YIELD element - Find elements that directly contain the specified string. 94 | scraper.getElementsContainingOwnTextInHtml(html,searchText) YIELD element - Find elements that directly contain the specified string. 
95 | scraper.getElementsContainingText(url,searchText) YIELD element - Find elements that contain the specified string. 96 | scraper.getElementsContainingTextInHtml(html,searchText) YIELD element - Find elements that contain the specified string. 97 | scraper.getElementsMatchingOwnText(url,regex) YIELD element - Find elements whose text matches the supplied regular expression. 98 | scraper.getElementsMatchingOwnTextInHtml(html,pattern) YIELD element - Find elements whose text matches the supplied regular expression. 99 | scraper.getElementsMatchingText(url,pattern) YIELD element - Find elements whose text matches the supplied regular expression. 100 | scraper.getElementsMatchingTextInHtml(html,pattern) YIELD element - Find elements whose text matches the supplied regular expression. 101 | scraper.getAllElements(url) YIELD element - Find all elements under this element (including self, and children of children). 102 | scraper.getAllElementsInHtml(html) YIELD element - Find all elements under this element (including self, and children of children). 
103 | ``` 104 | ### Useful links 105 | [Jsoup selector syntax](https://jsoup.org/cookbook/extracting-data/selector-syntax) 106 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.graphcoding.neo4j.scraperproc 8 | neo4jscraperproc 9 | 0.1-SNAPSHOT 10 | jar 11 | Neo4j Scraper Procedures 12 | Neo4j Web Scraper Stored Procedures to load the web into 13 | the graph 14 | 15 | 16 | 3.5.9 17 | 18 | 19 | 20 | 21 | 22 | org.neo4j 23 | neo4j 24 | ${neo4j.version} 25 | provided 26 | 27 | 28 | 29 | org.neo4j.test 30 | neo4j-harness 31 | ${neo4j.version} 32 | test 33 | 34 | 35 | 36 | org.neo4j.community 37 | it-test-support 38 | ${neo4j.version} 39 | test 40 | 41 | 42 | 43 | org.neo4j.driver 44 | neo4j-java-driver 45 | 1.4.0 46 | test 47 | 48 | 49 | 50 | junit 51 | junit 52 | 4.12 53 | test 54 | 55 | 56 | 57 | org.mockito 58 | mockito-core 59 | 2.9.0 60 | 61 | 62 | 63 | org.powermock 64 | powermock-core 65 | 1.7.1 66 | 67 | 68 | 69 | 70 | org.powermock 71 | powermock-api-mockito2 72 | 1.7.1 73 | test 74 | 75 | 76 | 77 | 78 | org.powermock 79 | powermock-module-junit4 80 | 1.7.1 81 | test 82 | 83 | 84 | 85 | 86 | org.jsoup 87 | jsoup 88 | 1.10.3 89 | 90 | 91 | 92 | 93 | 94 | 95 | maven-compiler-plugin 96 | 3.1 97 | 98 | 1.8 99 | 1.8 100 | 101 | 102 | 103 | maven-shade-plugin 104 | 105 | 106 | package 107 | 108 | shade 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /src/main/java/scraper/Scraper.java: -------------------------------------------------------------------------------- 1 | package scraper; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.net.URL; 7 | import java.net.URLConnection; 8 | import java.util.stream.Collectors; 9 | import org.jsoup.Jsoup; 10 | import 
org.jsoup.nodes.Document; 11 | import org.jsoup.nodes.Element; 12 | import org.jsoup.select.Elements; 13 | import org.neo4j.graphdb.GraphDatabaseService; 14 | import org.neo4j.kernel.internal.GraphDatabaseAPI; 15 | import org.neo4j.logging.Log; 16 | import org.neo4j.procedure.Context; 17 | import org.neo4j.procedure.Description; 18 | import org.neo4j.procedure.Name; 19 | import org.neo4j.procedure.Procedure; 20 | import scraper.result.JsoupElementResult; 21 | import scraper.result.StringResult; 22 | import scraper.util.CustomHtmlToPlainText; 23 | 24 | import java.io.IOException; 25 | import java.util.ArrayList; 26 | import java.util.List; 27 | import java.util.stream.Stream; 28 | 29 | /** 30 | * Created by Janos Szendi-Varga on 2019. 09. 02. 31 | */ 32 | public class Scraper { 33 | 34 | final static int TIMEOUT = 1000; 35 | public static final String USERAGENT = "Mozilla"; 36 | public static final boolean IGNORE_ERRORS = false; 37 | 38 | @Context 39 | public GraphDatabaseService db; 40 | 41 | @Context 42 | public Log log; 43 | 44 | @Context 45 | public GraphDatabaseAPI dbAPI; 46 | 47 | 48 | @Procedure 49 | @Description("scraper.getDocument(url) YIELD value - " + 50 | "Return the content of an url") 51 | public Stream getDocument(@Name("url") String url) throws IOException { 52 | 53 | URL urlObject = new URL(url); 54 | URLConnection conn = urlObject.openConnection(); 55 | InputStream is = conn.getInputStream(); 56 | 57 | String result = new BufferedReader(new InputStreamReader(is)) 58 | .lines().collect(Collectors.joining("\n")); 59 | 60 | if (result.length() == 0) { 61 | return Stream.of(StringResult.EMPTY); 62 | } else { 63 | return Stream.of(new StringResult(result)); 64 | } 65 | } 66 | 67 | @Procedure 68 | @Description("scraper.select(url,selector) YIELD element - " + 69 | "Find elements that match the Selector CSS query, with this element as the starting context.") 70 | public Stream select(@Name("url") String url, 71 | @Name("selector") String selector) throws 
IOException { 72 | Document doc = getDoc(url); 73 | 74 | return getResult(doc, selector).stream(); 75 | } 76 | 77 | @Procedure 78 | @Description("scraper.selectInHtml(html,selector) YIELD element - " + 79 | "Find elements that match the Selector CSS query, with this element as the starting context.") 80 | public Stream selectInHtml(@Name("html") String html, 81 | @Name("selector") String selector) { 82 | Document doc = Jsoup.parseBodyFragment(html); 83 | 84 | return getResult(doc, selector).stream(); 85 | } 86 | 87 | @Procedure 88 | @Description("scraper.getLinks(url) YIELD element - " + 89 | "Get link elements from an url.") 90 | public Stream getLinks(@Name("url") String url) throws IOException { 91 | Document doc = getDoc(url); 92 | 93 | return getResult(doc, "a[href]").stream(); 94 | } 95 | 96 | @Procedure 97 | @Description("scraper.getLinksInHtml(html) YIELD element - " + 98 | "Get link elements from a html.") 99 | public Stream getLinksInHtml(@Name("html") String html) { 100 | Document doc = Jsoup.parseBodyFragment(html); 101 | 102 | return getResult(doc, "a[href]").stream(); 103 | } 104 | 105 | @Procedure 106 | @Description("scraper.getMediaLinks(url) YIELD element - " + 107 | "Get media link elements.") 108 | public Stream getMediaLinks(@Name("url") String url) throws IOException { 109 | Document doc = getDoc(url); 110 | 111 | return getResult(doc, "[src]").stream(); 112 | } 113 | 114 | @Procedure 115 | @Description("scraper.getMediaLinksInHtml(html) YIELD element - " + 116 | "Get media link elements.") 117 | public Stream getMediaLinksInHtml(@Name("html") String html) { 118 | Document doc = Jsoup.parseBodyFragment(html); 119 | 120 | return getResult(doc, "[src]").stream(); 121 | } 122 | 123 | @Procedure 124 | @Description("scraper.getPlainText(url,selector) YIELD value - " + 125 | "Get plain text version of a given page.") 126 | public Stream getPlainText(@Name("url") String url, @Name(value = "selector", defaultValue = "") String selector) { 127 | 
StringBuilder plainText = new StringBuilder(); 128 | try { 129 | Document doc = getDoc(url); 130 | CustomHtmlToPlainText formatter = new CustomHtmlToPlainText(); 131 | if (!selector.equals("")) { 132 | Elements elements = doc.select(selector); 133 | for (Element element : elements) { 134 | plainText.append(formatter.getPlainText(element)); 135 | } 136 | } else { 137 | plainText.append(formatter.getPlainText(doc)); 138 | } 139 | } catch (Exception e) { 140 | return Stream.of(StringResult.EMPTY); 141 | } 142 | 143 | if (plainText.length() == 0) { 144 | return Stream.of(StringResult.EMPTY); 145 | } else { 146 | return Stream.of(new StringResult(plainText.toString())); 147 | } 148 | 149 | } 150 | 151 | @Procedure 152 | @Description("scraper.getPlainTextInHtml(html,selector) YIELD value - " + 153 | "Get plain text version of a given page.") 154 | public Stream getPlainTextInHtml(@Name("html") String html, @Name(value = "selector", defaultValue = "") String selector) { 155 | StringBuilder plainText = new StringBuilder(); 156 | Document doc = Jsoup.parseBodyFragment(html); 157 | CustomHtmlToPlainText formatter = new CustomHtmlToPlainText(); 158 | if (!selector.equals("")) { 159 | Elements elements = doc.select(selector); 160 | for (Element element : elements) { 161 | plainText.append(formatter.getPlainText(element)); 162 | } 163 | } else { 164 | plainText.append(formatter.getPlainText(doc)); 165 | } 166 | if (plainText.length() == 0) { 167 | return Stream.of(StringResult.EMPTY); 168 | } else { 169 | return Stream.of(new StringResult(plainText.toString())); 170 | } 171 | } 172 | 173 | @Procedure 174 | @Description("scraper.getElementById(url,id) YIELD element - " 175 | + "Find an element by ID, including or under this element.") 176 | public Stream getElementById(@Name("url") String url, @Name("id") String id) 177 | throws IOException { 178 | Document doc = getDoc(url); 179 | Element element = doc.getElementById(id); 180 | return Stream.of(new JsoupElementResult(url, element)); 
181 | } 182 | 183 | @Procedure 184 | @Description("scraper.getElementByIdInHtml(html,id) YIELD element - " + 185 | "Find an element by ID, including or under this element.") 186 | public Stream getElementByIdInHtml(@Name("html") String html, @Name("id") String id) { 187 | Document doc = Jsoup.parseBodyFragment(html); 188 | Element element = doc.getElementById(id); 189 | return Stream.of(new JsoupElementResult(null, element)); 190 | } 191 | 192 | @Procedure 193 | @Description("scraper.getElementsByTag(url,tag) YIELD element - " 194 | + "Finds elements, including and recursively under this element, with the specified tag name.") 195 | public Stream getElementsByTag(@Name("url") String url, @Name("tag") String tag) throws IOException { 196 | Document doc = getDoc(url); 197 | Elements elements = doc.getElementsByTag(tag); 198 | 199 | return getResult(doc, elements).stream(); 200 | } 201 | 202 | @Procedure 203 | @Description("scraper.getElementsByTagInHtml(html,tag) YIELD element - " 204 | + "Finds elements, including and recursively under this element, with the specified tag name.") 205 | public Stream getElementsByTagInHtml(@Name("html") String html, @Name("tag") String tag) { 206 | Document doc = Jsoup.parseBodyFragment(html); 207 | Elements elements = doc.getElementsByTag(tag); 208 | 209 | return getResult(doc, elements).stream(); 210 | } 211 | 212 | @Procedure 213 | @Description("scraper.getElementsByClass(url,className) YIELD element - " 214 | + "Find elements that have this class, including or under this element.") 215 | public Stream getElementsByClass(@Name("url") String url, 216 | @Name("className") String className) throws IOException { 217 | Document doc = getDoc(url); 218 | Elements elements = doc.getElementsByClass(className); 219 | 220 | return getResult(doc, elements).stream(); 221 | } 222 | 223 | @Procedure 224 | @Description("scraper.getElementsByClassInHtml(html,className) YIELD element - " + 225 | "Find elements that have this class, including or 
under this element.") 226 | public Stream getElementsByClassInHtml(@Name("html") String html, @Name("className") String className) { 227 | Document doc = Jsoup.parseBodyFragment(html); 228 | Elements elements = doc.getElementsByClass(className); 229 | 230 | return getResult(doc, elements).stream(); 231 | } 232 | 233 | @Procedure 234 | @Description("scraper.getElementsByAttribute(url,key) YIELD element - " 235 | + "Find elements that have a named attribute set.") 236 | public Stream getElementsByAttribute(@Name("url") String url, @Name("key") String key) throws IOException { 237 | Document doc = getDoc(url); 238 | Elements elements = doc.getElementsByAttribute(key); 239 | 240 | return getResult(doc, elements).stream(); 241 | } 242 | 243 | @Procedure 244 | @Description("scraper.getElementsByAttributeInHtml(html,attribute) YIELD element - " + 245 | "Find elements that have a named attribute set.") 246 | public Stream getElementsByAttributeInHtml(@Name("html") String html, @Name("key") String key) { 247 | Document doc = Jsoup.parseBodyFragment(html); 248 | Elements elements = doc.getElementsByAttribute(key); 249 | 250 | return getResult(doc, elements).stream(); 251 | 252 | } 253 | 254 | @Procedure 255 | @Description("scraper.getElementsByAttributeStarting(url,keyPrefix) YIELD element - " 256 | + "Find elements that have an attribute name starting with the supplied prefix. Use data- to find elements that have HTML5 datasets.") 257 | public Stream getElementsByAttributeStarting(@Name("url") String url, @Name("keyPrefix") String keyPrefix) throws IOException { 258 | Document doc = getDoc(url); 259 | Elements elements = doc.getElementsByAttributeStarting(keyPrefix); 260 | 261 | return getResult(doc, elements).stream(); 262 | } 263 | 264 | @Procedure 265 | @Description("scraper.getElementsByAttributeStartingInHtml(html,keyPrefix) YIELD element - " + 266 | "Find elements that have an attribute name starting with the supplied prefix. 
Use data- to find elements that have HTML5 datasets.") 267 | public Stream getElementsByAttributeStartingInHtml(@Name("html") String html, @Name("keyPrefix") String keyPrefix) { 268 | Document doc = Jsoup.parseBodyFragment(html); 269 | Elements elements = doc.getElementsByAttributeStarting(keyPrefix); 270 | 271 | return getResult(doc, elements).stream(); 272 | } 273 | 274 | @Procedure 275 | @Description("scraper.getElementsByAttributeValue(url,key,value) YIELD element - " 276 | + "Find elements that have an attribute with the specific value.") 277 | public Stream getElementsByAttributeValue(@Name("url") String url, @Name("key") String key, @Name("value") String value) throws IOException { 278 | Document doc = getDoc(url); 279 | Elements elements = doc.getElementsByAttributeValue(key, value); 280 | 281 | return getResult(doc, elements).stream(); 282 | } 283 | 284 | @Procedure 285 | @Description("scraper.getElementsByAttributeValueInHtml(html,key,value) YIELD element - " 286 | + "Find elements that have an attribute with the specific value.") 287 | public Stream getElementsByAttributeValueInHtml(@Name("html") String html, @Name("key") String key, @Name("value") String value) { 288 | Document doc = Jsoup.parseBodyFragment(html); 289 | Elements elements = doc.getElementsByAttributeValue(key, value); 290 | 291 | return getResult(doc, elements).stream(); 292 | } 293 | 294 | @Procedure 295 | @Description("scraper.getElementsByAttributeValueContaining(url,key,match) YIELD element - " 296 | + "Find elements that have attributes whose value contains the match string.") 297 | public Stream getElementsByAttributeValueContaining(@Name("url") String url, @Name("key") String key, @Name("match") String match) throws IOException { 298 | Document doc = getDoc(url); 299 | Elements elements = doc.getElementsByAttributeValueContaining(key, match); 300 | 301 | return getResult(doc, elements).stream(); 302 | } 303 | 304 | @Procedure 305 | @Description( 306 | 
"scraper.getElementsByAttributeValueContainingInHtml(html,key,match) YIELD element - " 307 | + "Find elements that have attributes whose value contains the match string.") 308 | public Stream getElementsByAttributeValueContainingInHtml(@Name("html") String html, @Name("key") String key, @Name("match") String match) { 309 | Document doc = Jsoup.parseBodyFragment(html); 310 | Elements elements = doc.getElementsByAttributeValueContaining(key, match); 311 | 312 | return getResult(doc, elements).stream(); 313 | } 314 | 315 | @Procedure 316 | @Description( 317 | "scraper.getElementsByAttributeValueEnding(url,key,valueSuffix) YIELD element - " 318 | + "Find elements that have attributes that end with the value suffix.") 319 | public Stream getElementsByAttributeValueEnding(@Name("url") String url, @Name("key") String key, @Name("valueSuffix") String valueSuffix) throws IOException { 320 | Document doc = getDoc(url); 321 | Elements elements = doc.getElementsByAttributeValueEnding(key, valueSuffix); 322 | 323 | return getResult(doc, elements).stream(); 324 | } 325 | 326 | @Procedure 327 | @Description( 328 | "scraper.getElementsByAttributeValueEndingInHtml(html,key,valueSuffix) YIELD element - " 329 | + "Find elements that have attributes that end with the value suffix.") 330 | public Stream getElementsByAttributeValueEndingInHtml(@Name("html") String html, @Name("key") String key, @Name("valueSuffix") String valueSuffix) { 331 | Document doc = Jsoup.parseBodyFragment(html); 332 | Elements elements = doc.getElementsByAttributeValueEnding(key, valueSuffix); 333 | 334 | return getResult(doc, elements).stream(); 335 | } 336 | 337 | @Procedure 338 | @Description("scraper.getElementsByAttributeValueMatching(url,key,regex) YIELD element - " 339 | + "Find elements that have attributes whose values match the supplied regular expression.") 340 | public Stream getElementsByAttributeValueMatching(@Name("url") String url, @Name("key") String key, @Name("regex") String regex) throws 
IOException { 341 | Document doc = getDoc(url); 342 | Elements elements = doc.getElementsByAttributeValueMatching(key, regex); 343 | 344 | return getResult(doc, elements).stream(); 345 | } 346 | 347 | @Procedure 348 | @Description( 349 | "scraper.getElementsByAttributeValueMatchingInHtml(html,key,regex) YIELD element - " 350 | + "Find elements that have attributes whose values match the supplied regular expression.") 351 | public Stream getElementsByAttributeValueMatchingInHtml(@Name("html") String html, @Name("key") String key, @Name("regex") String regex) { 352 | Document doc = Jsoup.parseBodyFragment(html); 353 | Elements elements = doc.getElementsByAttributeValueMatching(key, regex); 354 | 355 | return getResult(doc, elements).stream(); 356 | } 357 | 358 | @Procedure 359 | @Description("scraper.getElementsByAttributeValueNot(url,key,value) YIELD element - " 360 | + "Find elements that either do not have this attribute, or have it with a different value.") 361 | public Stream getElementsByAttributeValueNot(@Name("url") String url, @Name("key") String key, @Name("value") String value) throws IOException { 362 | Document doc = getDoc(url); 363 | 364 | Elements elements = doc.getElementsByAttributeValueNot(key, value); 365 | 366 | return getResult(doc, elements).stream(); 367 | } 368 | 369 | @Procedure 370 | @Description("scraper.getElementsByAttributeValueNotInHtml(html,key,value) YIELD element - " 371 | + "Find elements that either do not have this attribute, or have it with a different value.") 372 | public Stream getElementsByAttributeValueNotInHtml(@Name("html") String html, @Name("key") String key, @Name("value") String value) { 373 | Document doc = Jsoup.parseBodyFragment(html); 374 | Elements elements = doc.getElementsByAttributeValueNot(key, value); 375 | 376 | return getResult(doc, elements).stream(); 377 | } 378 | 379 | @Procedure 380 | @Description( 381 | "scraper.getElementsByAttributeValueStarting(url,key,valuePrefix) YIELD element - " 382 | + "Find 
elements that have attributes that start with the value prefix.") 383 | public Stream getElementsByAttributeValueStarting(@Name("url") String url, @Name("key") String key, @Name("valuePrefix") String valuePrefix) throws IOException { 384 | Document doc = getDoc(url); 385 | Elements elements = doc.getElementsByAttributeValueStarting(key, valuePrefix); 386 | 387 | return getResult(doc, elements).stream(); 388 | } 389 | 390 | @Procedure 391 | @Description( 392 | "scraper.getElementsByAttributeValueStartingInHtml(html,key,valuePrefix) YIELD element - " 393 | + "Find elements that have attributes that start with the value prefix.") 394 | public Stream getElementsByAttributeValueStartingInHtml(@Name("html") String html, @Name("key") String key, @Name("valuePrefix") String valuePrefix) { 395 | Document doc = Jsoup.parseBodyFragment(html); 396 | Elements elements = doc.getElementsByAttributeValueStarting(key, valuePrefix); 397 | 398 | return getResult(doc, elements).stream(); 399 | } 400 | 401 | @Procedure 402 | @Description("scraper.getElementsByIndexEquals(url,index) YIELD element - " 403 | + "Find elements whose sibling index is equal to the supplied index.") 404 | public Stream getElementsByIndexEquals(@Name("url") String url, @Name("index") String index) throws IOException { 405 | Document doc = getDoc(url); 406 | Elements elements = doc.getElementsByIndexEquals(Integer.parseInt(index)); 407 | 408 | return getResult(doc, elements).stream(); 409 | } 410 | 411 | @Procedure 412 | @Description("scraper.getElementsByIndexEqualsInHtml(html,index) YIELD element - " 413 | + "Find elements whose sibling index is equal to the supplied index.") 414 | public Stream getElementsByIndexEqualsInHtml(@Name("html") String html, @Name("index") String index) { 415 | Document doc = Jsoup.parseBodyFragment(html); 416 | Elements elements = doc.getElementsByIndexEquals(Integer.parseInt(index)); 417 | 418 | return getResult(doc, elements).stream(); 419 | } 420 | 421 | @Procedure 422 | 
@Description("scraper.getElementsByIndexGreaterThan(url,index) YIELD element - " 423 | + "Find elements whose sibling index is greater than the supplied index.") 424 | public Stream<JsoupElementResult> getElementsByIndexGreaterThan(@Name("url") String url, @Name("index") String index) throws IOException { 425 | Document doc = getDoc(url); 426 | Elements elements = doc.getElementsByIndexGreaterThan(Integer.parseInt(index)); 427 | 428 | return getResult(doc, elements).stream(); 429 | } 430 | 431 | @Procedure 432 | @Description("scraper.getElementsByIndexGreaterThanInHtml(html,index) YIELD element - " 433 | + "Find elements whose sibling index is greater than the supplied index.") 434 | public Stream<JsoupElementResult> getElementsByIndexGreaterThanInHtml(@Name("html") String html, @Name("index") String index) { 435 | Document doc = Jsoup.parseBodyFragment(html); 436 | Elements elements = doc.getElementsByIndexGreaterThan(Integer.parseInt(index)); 437 | 438 | return getResult(doc, elements).stream(); 439 | } 440 | 441 | @Procedure 442 | @Description("scraper.getElementsByIndexLessThan(url,index) YIELD element - " 443 | + "Find elements whose sibling index is less than the supplied index.") 444 | public Stream<JsoupElementResult> getElementsByIndexLessThan(@Name("url") String url, @Name("index") String index) throws IOException { 445 | Document doc = getDoc(url); 446 | Elements elements = doc.getElementsByIndexLessThan(Integer.parseInt(index)); 447 | 448 | return getResult(doc, elements).stream(); 449 | } 450 | 451 | @Procedure 452 | @Description("scraper.getElementsByIndexLessThanInHtml(html,index) YIELD element - " 453 | + "Find elements whose sibling index is less than the supplied index.") 454 | public Stream<JsoupElementResult> getElementsByIndexLessThanInHtml(@Name("html") String html, @Name("index") String index) { 455 | Document doc = Jsoup.parseBodyFragment(html); 456 | Elements elements = doc.getElementsByIndexLessThan(Integer.parseInt(index)); 457 | 458 | return getResult(doc, elements).stream(); 459 | } 460 | 461 | @Procedure 462 |
@Description("scraper.getElementsContainingOwnText(url,searchText) YIELD element - " 463 | + "Find elements that directly contain the specified string.") 464 | public Stream<JsoupElementResult> getElementsContainingOwnText(@Name("url") String url, @Name("searchText") String searchText) throws IOException { 465 | Document doc = getDoc(url); 466 | Elements elements = doc.getElementsContainingOwnText(searchText); 467 | 468 | return getResult(doc, elements).stream(); 469 | } 470 | 471 | @Procedure 472 | @Description("scraper.getElementsContainingOwnTextInHtml(html,searchText) YIELD element - " 473 | + "Find elements that directly contain the specified string.") 474 | public Stream<JsoupElementResult> getElementsContainingOwnTextInHtml(@Name("html") String html, @Name("searchText") String searchText) { 475 | Document doc = Jsoup.parseBodyFragment(html); 476 | Elements elements = doc.getElementsContainingOwnText(searchText); 477 | 478 | return getResult(doc, elements).stream(); 479 | } 480 | 481 | @Procedure 482 | @Description("scraper.getElementsContainingText(url,searchText) YIELD element - " 483 | + "Find elements that contain the specified string.") 484 | public Stream<JsoupElementResult> getElementsContainingText(@Name("url") String url, @Name("searchText") String searchText) throws IOException { 485 | Document doc = getDoc(url); 486 | Elements elements = doc.getElementsContainingText(searchText); 487 | 488 | return getResult(doc, elements).stream(); 489 | } 490 | 491 | @Procedure 492 | @Description("scraper.getElementsContainingTextInHtml(html,searchText) YIELD element - " 493 | + "Find elements that contain the specified string.") 494 | public Stream<JsoupElementResult> getElementsContainingTextInHtml(@Name("html") String html, @Name("searchText") String searchText) { 495 | Document doc = Jsoup.parseBodyFragment(html); 496 | Elements elements = doc.getElementsContainingText(searchText); 497 | 498 | return getResult(doc, elements).stream(); 499 | } 500 | 501 | @Procedure 502 | @Description("scraper.getElementsMatchingOwnText(url,regex) YIELD
element - " 503 | + "Find elements whose text matches the supplied regular expression.") 504 | public Stream<JsoupElementResult> getElementsMatchingOwnText(@Name("url") String url, @Name("regex") String regex) throws IOException { 505 | Document doc = getDoc(url); 506 | Elements elements = doc.getElementsMatchingOwnText(regex); 507 | 508 | return getResult(doc, elements).stream(); 509 | } 510 | 511 | @Procedure 512 | @Description("scraper.getElementsMatchingOwnTextInHtml(html,regex) YIELD element - " 513 | + "Find elements whose text matches the supplied regular expression.") 514 | public Stream<JsoupElementResult> getElementsMatchingOwnTextInHtml(@Name("html") String html, @Name("regex") String regex) { 515 | Document doc = Jsoup.parseBodyFragment(html); 516 | Elements elements = doc.getElementsMatchingOwnText(regex); 517 | 518 | return getResult(doc, elements).stream(); 519 | } 520 | 521 | @Procedure 522 | @Description("scraper.getElementsMatchingText(url,regex) YIELD element - " 523 | + "Find elements whose text matches the supplied regular expression.") 524 | public Stream<JsoupElementResult> getElementsMatchingText(@Name("url") String url, @Name("regex") String regex) throws IOException { 525 | Document doc = getDoc(url); 526 | Elements elements = doc.getElementsMatchingText(regex); 527 | 528 | return getResult(doc, elements).stream(); 529 | } 530 | 531 | @Procedure 532 | @Description("scraper.getElementsMatchingTextInHtml(html,regex) YIELD element - " 533 | + "Find elements whose text matches the supplied regular expression.") 534 | public Stream<JsoupElementResult> getElementsMatchingTextInHtml(@Name("html") String html, @Name("regex") String regex) { 535 | Document doc = Jsoup.parseBodyFragment(html); 536 | Elements elements = doc.getElementsMatchingText(regex); 537 | 538 | return getResult(doc, elements).stream(); 539 | } 540 | 541 | @Procedure 542 | @Description("scraper.getAllElements(url) YIELD element - " 543 | + "Find all elements under this element (including self, and children of children).") 544 | public Stream<JsoupElementResult>
getAllElements(@Name("url") String url) throws IOException { 545 | Document doc = getDoc(url); 546 | Elements elements = doc.getAllElements(); 547 | 548 | return getResult(doc, elements).stream(); 549 | } 550 | 551 | @Procedure 552 | @Description("scraper.getAllElementsInHtml(html) YIELD element - " 553 | + "Find all elements under this element (including self, and children of children).") 554 | public Stream<JsoupElementResult> getAllElementsInHtml(@Name("html") String html) { 555 | Document doc = Jsoup.parseBodyFragment(html); 556 | Elements elements = doc.getAllElements(); 557 | 558 | return getResult(doc, elements).stream(); 559 | } 560 | 561 | private List<JsoupElementResult> getResult(Document doc, String selector) { 562 | List<JsoupElementResult> list = new ArrayList<>(); 563 | Elements elements = doc.select(selector); 564 | for (Element element : elements) { 565 | list.add(new JsoupElementResult(doc.baseUri(), element)); 566 | } 567 | return list; 568 | } 569 | 570 | private List<JsoupElementResult> getResult(Document doc, Elements elements) { 571 | List<JsoupElementResult> list = new ArrayList<>(); 572 | for (Element element : elements) { 573 | list.add(new JsoupElementResult(doc.baseUri(), element)); 574 | } 575 | return list; 576 | } 577 | 578 | private Document getDoc(@Name("url") String url) throws IOException { 579 | return Jsoup.connect(url).userAgent(USERAGENT).ignoreHttpErrors(IGNORE_ERRORS) 580 | .timeout(TIMEOUT) 581 | .get(); 582 | } 583 | } 584 | -------------------------------------------------------------------------------- /src/main/java/scraper/result/DocumentResult.java: -------------------------------------------------------------------------------- 1 | package scraper.result; 2 | 3 | public class DocumentResult { 4 | public final static DocumentResult EMPTY = new DocumentResult(null); 5 | 6 | public final String value; 7 | 8 | public DocumentResult(String value) { 9 | this.value = value; 10 | } 11 | } -------------------------------------------------------------------------------- /src/main/java/scraper/result/JsoupElementResult.java:
-------------------------------------------------------------------------------- 1 | package scraper.result; 2 | 3 | import org.jsoup.nodes.Attribute; 4 | import org.jsoup.nodes.Element; 5 | 6 | import java.util.HashMap; 7 | import java.util.Iterator; 8 | import java.util.Map; 9 | 10 | /** 11 | * Created by Janos Szendi-Varga on 2019. 09. 02. 12 | */ 13 | public class JsoupElementResult { 14 | 15 | public Map<String, Object> element = new HashMap<>(); 16 | 17 | public JsoupElementResult(String url, Element jsoupElement) { 18 | element.put("url", url); 19 | element.put("text", jsoupElement.text()); 20 | element.put("html", jsoupElement.html()); 21 | element.put("outerHtml", jsoupElement.outerHtml()); 22 | element.put("data", jsoupElement.data()); 23 | element.put("tagName", jsoupElement.tagName()); 24 | element.put("id", jsoupElement.id()); 25 | element.put("className", jsoupElement.className()); 26 | element.put("classNames", jsoupElement.classNames()); 27 | 28 | Map<String, String> attributes = new HashMap<>(); 29 | Iterator<Attribute> it = jsoupElement.attributes().iterator(); 30 | while (it.hasNext()) { 31 | Attribute attr = it.next(); 32 | attributes.put(attr.getKey(), attr.getValue()); 33 | if (attr.getKey().equals("href")) { 34 | attributes.put("abs:" + attr.getKey(), jsoupElement.attr 35 | ("abs:href")); 36 | } 37 | } 38 | element.put("attributes", attributes); 39 | 40 | } 41 | } -------------------------------------------------------------------------------- /src/main/java/scraper/result/StringResult.java: -------------------------------------------------------------------------------- 1 | package scraper.result; 2 | 3 | /** 4 | * Created by Janos Szendi-Varga on 2019. 09. 02.
5 | */ 6 | public class StringResult { 7 | public final static StringResult EMPTY = new StringResult(null); 8 | 9 | public final String value; 10 | 11 | public StringResult(String value) { 12 | this.value = value; 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/java/scraper/util/CustomHtmlToPlainText.java: -------------------------------------------------------------------------------- 1 | package scraper.util; 2 | 3 | import org.jsoup.helper.StringUtil; 4 | import org.jsoup.nodes.Element; 5 | import org.jsoup.nodes.Node; 6 | import org.jsoup.nodes.TextNode; 7 | import org.jsoup.select.NodeTraversor; 8 | import org.jsoup.select.NodeVisitor; 9 | 10 | /** 11 | * Customized by Janos Szendi-Varga on 2019. 09. 04. 12 | * Removing retrieval of Href strings from org.jsoup.examples.HtmlToPlainText 13 | */ 14 | public class CustomHtmlToPlainText { 15 | 16 | /** 17 | * Format an Element to plain-text 18 | * 19 | * @param element the root element to format 20 | * @return formatted text 21 | */ 22 | public String getPlainText(Element element) { 23 | FormattingVisitor formatter = new FormattingVisitor(); 24 | NodeTraversor traversor = new NodeTraversor(formatter); 25 | traversor.traverse(element); // walk the DOM, and call .head() and 26 | // .tail() for each node 27 | 28 | return formatter.toString(); 29 | } 30 | 31 | // the formatting rules, implemented in a breadth-first DOM traverse 32 | private class FormattingVisitor implements NodeVisitor { 33 | private static final int maxWidth = 80; 34 | private int width = 0; 35 | private StringBuilder accum = new StringBuilder(); // holds the 36 | // accumulated text 37 | 38 | // hit when the node is first seen 39 | public void head(Node node, int depth) { 40 | String name = node.nodeName(); 41 | if (node instanceof TextNode) 42 | append(((TextNode) node).text()); // TextNodes carry all 43 | // user-readable text in the DOM. 
44 | else if (name.equals("li")) 45 | append("\n * "); 46 | else if (name.equals("dt")) 47 | append(" "); 48 | else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", 49 | "tr")) 50 | append("\n"); 51 | } 52 | 53 | // hit when all of the node's children (if any) have been visited 54 | public void tail(Node node, int depth) { 55 | String name = node.nodeName(); 56 | if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", 57 | "h4", "h5")) 58 | append("\n"); 59 | else if (name.equals("a")) 60 | return; 61 | } 62 | 63 | // appends text to the string builder with a simple word wrap method 64 | private void append(String text) { 65 | if (text.startsWith("\n")) 66 | width = 0; // reset counter if starts with a newline. only 67 | // from formats above, not in natural text 68 | if (text.equals(" ") && 69 | (accum.length() == 0 || StringUtil.in(accum.substring 70 | (accum.length() - 1), " ", "\n"))) 71 | return; // don't accumulate long runs of empty spaces 72 | 73 | if (text.length() + width > maxWidth) { // won't fit, needs to wrap 74 | String words[] = text.split("\\s+"); 75 | for (int i = 0; i < words.length; i++) { 76 | String word = words[i]; 77 | boolean last = i == words.length - 1; 78 | if (!last) // insert a space if not the last word 79 | word = word + " "; 80 | if (word.length() + width > maxWidth) { // wrap and reset 81 | // counter 82 | accum.append("\n").append(word); 83 | width = word.length(); 84 | } else { 85 | accum.append(word); 86 | width += word.length(); 87 | } 88 | } 89 | } else { // fits as is, without need to wrap text 90 | accum.append(text); 91 | width += text.length(); 92 | } 93 | } 94 | 95 | @Override 96 | public String toString() { 97 | return accum.toString(); 98 | } 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/test/java/scraper/ScraperTest.java: -------------------------------------------------------------------------------- 1 | package scraper; 2 | 3 | 
import static junit.framework.TestCase.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.Collections; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | import org.jsoup.Jsoup; 16 | import org.junit.AfterClass; 17 | import org.junit.BeforeClass; 18 | import org.junit.Test; 19 | import org.junit.runner.RunWith; 20 | import org.neo4j.graphdb.DependencyResolver.SelectionStrategy; 21 | import org.neo4j.graphdb.GraphDatabaseService; 22 | import org.neo4j.graphdb.Result; 23 | import org.neo4j.kernel.impl.proc.Procedures; 24 | import org.neo4j.kernel.internal.GraphDatabaseAPI; 25 | import org.neo4j.test.TestGraphDatabaseFactory; 26 | import org.powermock.core.classloader.annotations.PowerMockIgnore; 27 | import org.powermock.core.classloader.annotations.PrepareForTest; 28 | import org.powermock.modules.junit4.PowerMockRunner; 29 | 30 | /** 31 | * Created by Janos Szendi-Varga on 2019. 09. 02. 
32 | */ 33 | 34 | @RunWith(PowerMockRunner.class) 35 | @PowerMockIgnore("javax.management.*") 36 | @PrepareForTest(Jsoup.class) 37 | public class ScraperTest { 38 | 39 | private static GraphDatabaseService db; 40 | 41 | private static String testUrl = "http://www.mocky.io/v2/5d814df73000004e006995f9"; 42 | 43 | @BeforeClass 44 | public static void setUp() throws Exception { 45 | 46 | db = new TestGraphDatabaseFactory().newImpermanentDatabase(); 47 | 48 | Procedures proceduresService = ((GraphDatabaseAPI) db) 49 | .getDependencyResolver().resolveDependency(Procedures.class, SelectionStrategy.FIRST); 50 | proceduresService.registerProcedure(Scraper.class); 51 | } 52 | 53 | @AfterClass 54 | public static void tearDown() { 55 | db.shutdown(); 56 | } 57 | 58 | @Test 59 | public void shouldGetDocument() { 60 | Map<String, Object> map = new HashMap<>(); 61 | map.put("url", testUrl); 62 | 63 | Result res = db.execute("CALL scraper.getDocument({url}) YIELD value RETURN value", 64 | map); 65 | 66 | assertTrue(res.hasNext()); 67 | assertEquals(Collections.singletonList("value"), res.columns()); 68 | assertEquals(getTestHtml(), res.next().get("value")); 69 | } 70 | 71 | @Test 72 | public void shouldReturnElementsBySelect() { 73 | Map<String, Object> map = new HashMap<>(); 74 | map.put("url", testUrl); 75 | map.put("selector", "a[href]"); 76 | 77 | Result res = db.execute("CALL scraper.select({url}," + 78 | "{selector}) YIELD element RETURN element.attributes" + 79 | ".`abs:href` AS col", 80 | map); 81 | 82 | assertTrue(res.hasNext()); 83 | assertEquals(Collections.singletonList("col"), res.columns()); 84 | 85 | List<String> urls = new ArrayList<>(); 86 | while (res.hasNext()) { 87 | String url = res.next().get("col").toString(); 88 | urls.add(url); 89 | } 90 | assertEquals(Arrays.asList("http://www.index.hu", "http://www.index2.hu"), urls); 91 | } 92 | 93 | @Test 94 | public void shouldReturnElementsBySelectInHtml() { 95 | Map<String, Object> map = new HashMap<>(); 96 | map.put("html", getTestHtml()); 97 | map.put("selector",
"a[href]"); 98 | 99 | Result res = db.execute("CALL scraper.selectInHtml({html}," + 100 | "{selector}) YIELD element RETURN element.attributes" + 101 | ".`abs:href` AS col", 102 | map); 103 | 104 | assertTrue(res.hasNext()); 105 | assertEquals(Collections.singletonList("col"), res.columns()); 106 | 107 | List<String> urls = new ArrayList<>(); 108 | while (res.hasNext()) { 109 | String url = res.next().get("col").toString(); 110 | urls.add(url); 111 | } 112 | assertEquals(Arrays.asList("http://www.index.hu", "http://www.index2.hu"), urls); 113 | } 114 | 115 | @Test 116 | public void shouldReturnTextOfHtmlUrl() { 117 | Map<String, Object> map = new HashMap<>(); 118 | map.put("url", testUrl); 119 | 120 | Result res = db.execute("CALL scraper.getPlainText({url}) YIELD " + 121 | "value RETURN value", 122 | map); 123 | 124 | assertTrue(res.next().get("value").toString().startsWith(" HTML Test" + 125 | " Page \n" + 126 | "Testing display of HTML elements")); 127 | 128 | assertTrue(!res.hasNext()); 129 | assertEquals(Collections.singletonList("value"), res.columns()); 130 | } 131 | 132 | @Test 133 | public void shouldReturnLinkTextOfUrl() { 134 | Map<String, Object> map = new HashMap<>(); 135 | map.put("url", testUrl); 136 | map.put("selector", "a[href]"); 137 | 138 | Result res = db.execute("CALL scraper.getPlainText({url}," + 139 | "{selector}) " + 140 | "YIELD " + 141 | "value RETURN value", 142 | map); 143 | 144 | assertEquals("Index1Index2", res.next().get("value").toString()); 145 | assertTrue(!res.hasNext()); 146 | assertEquals(Collections.singletonList("value"), res.columns()); 147 | } 148 | 149 | @Test 150 | public void shouldReturnLinksUrl() { 151 | Map<String, Object> map = new HashMap<>(); 152 | map.put("url", testUrl); 153 | 154 | Result res = db.execute("CALL scraper.getLinks({url}) YIELD " + 155 | "element RETURN element.attributes.`abs:href` AS col", 156 | map); 157 | assertTrue(res.hasNext()); 158 | assertEquals(Collections.singletonList("col"), res.columns()); 159 | 160 | List<String> urls = new ArrayList<>(); 161 |
while (res.hasNext()) { 162 | String url = res.next().get("col").toString(); 163 | urls.add(url); 164 | } 165 | assertEquals(Arrays.asList("http://www.index.hu", "http://www.index2.hu"), urls); 166 | } 167 | 168 | @Test 169 | public void shouldReturnMediaLinksUrl() { 170 | Map<String, Object> map = new HashMap<>(); 171 | map.put("url", testUrl); 172 | 173 | Result res = db.execute("CALL scraper.getMediaLinks({url}) YIELD" + 174 | " " + 175 | "element RETURN element.attributes.src AS col", 176 | map); 177 | assertTrue(res.hasNext()); 178 | assertEquals(Collections.singletonList("col"), res.columns()); 179 | 180 | List<String> urls = new ArrayList<>(); 181 | while (res.hasNext()) { 182 | String url = res.next().get("col").toString(); 183 | urls.add(url); 184 | } 185 | assertEquals(Arrays.asList("https://www.google" + 186 | ".hu/images/branding/googlelogo/2x/googlelogo_color_120x44dp.png"), urls); 187 | } 188 | 189 | private String getTestHtml() { 190 | StringBuilder contentBuilder = new StringBuilder(); 191 | try { 192 | BufferedReader in = new BufferedReader(new FileReader( 193 | "src/test/resources/test.html")); 194 | String str; 195 | while ((str = in.readLine()) != null) { 196 | contentBuilder.append(str); 197 | } 198 | in.close(); 199 | } catch (IOException e) { 200 | e.printStackTrace(); 201 | } 202 | return contentBuilder.toString(); 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /src/test/resources/test.html: --------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 | <head>
4 | <meta charset="utf-8">
5 | <title> HTML Test Page </title>
6 | </head>
7 | <body>
8 | <h1>Testing display of HTML elements</h1>
9 | <h2>This is 2nd level heading</h2>
10 | <p>This is a test paragraph.</p>
11 | <h3>This is 3rd level heading</h3>
12 | <p>This is a test paragraph.</p>
13 | <h4>This is 4th level heading</h4>
14 | <p>This is a test paragraph.</p>
15 | <h5>This is 5th level heading</h5>
16 | <p>This is a test paragraph.</p>
17 | <h6>This is 6th level heading</h6>
18 | <p>This is a normal paragraph</p>
19 | <h2>Links</h2>
20 | <ul>
21 | <li><a href="http://www.index.hu">Index1</a></li>
22 | <li><a href="http://www.index2.hu">Index2</a></li>
23 | </ul>
24 | <img src="https://www.google.hu/images/branding/googlelogo/2x/googlelogo_color_120x44dp.png">
25 | </body>
26 | </html>
--------------------------------------------------------------------------------
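Once the plugin jar is deployed, the procedures above are called directly from Cypher. A minimal sketch of typical usage — the URL, the HTML fragment, and the `Page` label are illustrative placeholders, not part of this repository; note that the tests above use the Neo4j 3.x `{param}` parameter syntax, while these examples inline literal values:

```cypher
// Collect every link of a page and store each target as a node.
// scraper.getLinks(url) and element.attributes.`abs:href` match the
// usage exercised in ScraperTest.shouldReturnLinksUrl.
CALL scraper.getLinks('http://www.index.hu') YIELD element
MERGE (p:Page {url: element.attributes.`abs:href`})
RETURN p.url;

// Run a CSS selector against an in-memory HTML fragment instead of a URL,
// as in ScraperTest.shouldReturnElementsBySelectInHtml.
CALL scraper.selectInHtml('<a href="http://www.index.hu">Index1</a>', 'a[href]')
YIELD element
RETURN element.text AS linkText;
```

Each procedure yields `element`, a map exposing the keys built in `JsoupElementResult` (`text`, `html`, `outerHtml`, `tagName`, `id`, `className`, `attributes`, and so on), so results can be filtered and written into the graph within the same Cypher statement.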