├── .gitignore
├── src
├── main
│ ├── docker
│ │ └── Dockerfile
│ ├── resources
│ │ ├── application.properties
│ │ └── index.tpl
│ └── java
│ │ └── edu
│ │ └── nwnu
│ │ └── ququzone
│ │ └── extractor
│ │ ├── service
│ │ ├── Extractor.java
│ │ ├── ParseException.java
│ │ ├── AbstractExtractor.java
│ │ ├── RowBlockExtractor.java
│ │ └── ReadabilityExtractor.java
│ │ ├── utlis
│ │ └── StringUtils.java
│ │ ├── result
│ │ ├── Result.java
│ │ ├── FailureResult.java
│ │ └── SuccessResult.java
│ │ └── ExtractorConfiguration.java
└── test
│ ├── java
│ └── edu
│ │ └── nwnu
│ │ └── ququzone
│ │ └── extractor
│ │ └── service
│ │ └── ReadabilityExtractorTest.java
│ └── resources
│ └── test.html
├── README.md
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | *.log
4 | target
5 |
--------------------------------------------------------------------------------
/src/main/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM java:8
2 | VOLUME /tmp
3 | ADD smart-extractor.jar app.jar
4 | RUN bash -c 'touch /app.jar'
5 | ENTRYPOINT ["java","-Djava.security.egd=file:/dev/./urandom","-jar","/app.jar"]
--------------------------------------------------------------------------------
/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | spring.application.name=smart-extractor
2 | spring.profiles.active=production
3 |
4 | logging.file=smart-extractor.log
5 |
6 | spring.mvc.view.prefix=/WEB-INF/jsp/
7 | spring.mvc.view.suffix=.jsp
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/service/Extractor.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.service;
2 |
3 | import edu.nwnu.ququzone.extractor.result.Result;
4 |
5 | /**
6 | * extract page main content.
7 | *
8 | * @author Yang XuePing
9 | */
10 | public interface Extractor {
11 | Result extract(String url);
12 | }
13 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/service/ParseException.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.service;
2 |
/**
 * Unchecked exception signalling that a document could not be parsed.
 *
 * @author Yang XuePing
 */
public class ParseException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception carrying only a description of the failure.
     *
     * @param message description of what went wrong
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * Creates an exception that keeps the underlying failure, so its stack trace is not lost
     * when a lower-level error is translated into a parse error.
     *
     * @param message description of what went wrong
     * @param cause   the failure that triggered this exception; may be {@code null}
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}
13 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/utlis/StringUtils.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.utlis;
2 |
/**
 * String utilities.
 *
 * @author Yang XuePing
 */
public final class StringUtils {
    private StringUtils() {
        // static helpers only; never instantiated
    }

    /**
     * Tells whether a string carries no visible content.
     *
     * @param str the string to inspect; may be {@code null}
     * @return {@code true} when {@code str} is {@code null} or is empty after
     *         {@link String#trim()}, {@code false} otherwise
     */
    public static boolean isEmpty(String str) {
        return str == null || str.trim().isEmpty();
    }
}
13 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/result/Result.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.result;
2 |
/**
 * Base type for the outcome of an extraction; it carries only a success flag.
 *
 * @author Yang XuePing
 */
public class Result {
    private final boolean success;

    /**
     * @param success whether the extraction succeeded
     */
    public Result(final boolean success) {
        this.success = success;
    }

    /**
     * @return the flag passed to the constructor
     */
    public boolean isSuccess() {
        return this.success;
    }
}
19 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/result/FailureResult.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.result;
2 |
3 | /**
4 | * failure result
5 | *
6 | * @author Yang XuePing
7 | */
8 | public class FailureResult extends Result {
9 | private final String msg;
10 |
11 | public FailureResult(String msg) {
12 | super(false);
13 | this.msg = msg;
14 | }
15 |
16 | public String getMsg() {
17 | return msg;
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/result/SuccessResult.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.result;
2 |
3 | /**
4 | * successful result.
5 | *
6 | * @author Yang XuePing
7 | */
8 | public class SuccessResult extends Result {
9 | private final String title;
10 | private final String html;
11 |
12 | public SuccessResult(String title, String html) {
13 | super(true);
14 | this.title = title;
15 | this.html = html;
16 | }
17 |
18 | public String getTitle() {
19 | return title;
20 | }
21 |
22 | public String getHtml() {
23 | return html;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | html smart extractor
2 | ====================
3 |
4 | ### Introduction
5 |
6 | A micro-service for extracting the main content from a URL.
7 |
8 | ### Usage
9 |
10 | 1. Package
11 |
12 | ```
13 | $ mvn clean package
14 | ```
15 |
16 | 2. Run
17 |
18 | ```
19 | $ java -jar target/smart-extractor.jar
20 | $ open http://localhost:8080
21 | ```
22 |
23 | ### API
24 |
25 | 1. Extract `http://localhost:8080/extract?url={url}`
26 |
27 | ```
28 | $ curl -i -X GET http://localhost:8080/extract\?url\=https://medium.com/@benjaminhardy/8-things-every-person-should-do-before-8-a-m-cc0233e15c8d
29 | ```
30 |
31 | ### Build Docker Image
32 |
33 | $ mvn clean package
34 | $ mvn package docker:build
35 |
--------------------------------------------------------------------------------
/src/test/java/edu/nwnu/ququzone/extractor/service/ReadabilityExtractorTest.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.service;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.testng.annotations.Test;
6 |
7 | import java.net.URL;
8 |
9 | /**
10 | * test readability extractor.
11 | *
12 | * @author Yang XuePing
13 | */
14 | public class ReadabilityExtractorTest {
15 | @Test
16 | public void testCrawl() throws Exception {
17 | Document doc = Jsoup.parse(ReadabilityExtractorTest.class.getClassLoader().getResourceAsStream("test.html"), "UTF-8", "/");
18 | // Document doc = Jsoup.parse(new URL("http://192.168.1.102:3000"), 2000);
19 |
20 | ReadabilityExtractor extractor = new ReadabilityExtractor();
21 | extractor.parse(doc);
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/ExtractorConfiguration.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor;
2 |
3 | import com.google.common.io.ByteStreams;
4 | import edu.nwnu.ququzone.extractor.result.Result;
5 | import edu.nwnu.ququzone.extractor.service.Extractor;
6 | import org.springframework.beans.factory.annotation.Autowired;
7 | import org.springframework.beans.factory.annotation.Qualifier;
8 | import org.springframework.boot.SpringApplication;
9 | import org.springframework.boot.autoconfigure.SpringBootApplication;
10 | import org.springframework.web.bind.annotation.RequestMapping;
11 | import org.springframework.web.bind.annotation.RequestMethod;
12 | import org.springframework.web.bind.annotation.RequestParam;
13 | import org.springframework.web.bind.annotation.RestController;
14 |
15 | import java.io.IOException;
16 |
17 | /**
18 | * extractor spring configuration.
19 | *
20 | * @author Yang XuePing
21 | */
22 | @SpringBootApplication
23 | @RestController
24 | public class ExtractorConfiguration {
25 | private static String tpl;
26 |
27 | static {
28 | try {
29 | tpl = new String(ByteStreams.toByteArray(ExtractorConfiguration.class.getClassLoader().getResourceAsStream("index.tpl")));
30 | } catch (IOException e) {
31 | }
32 | }
33 |
34 | @Autowired
35 | @Qualifier("rowBlockExtractor")
36 | private Extractor extractor;
37 |
38 | public static void main(String[] args) {
39 | SpringApplication.run(ExtractorConfiguration.class, args);
40 | }
41 |
42 | @RequestMapping(value = "/", method = RequestMethod.GET)
43 | public String index() {
44 | return tpl;
45 | }
46 |
47 |
48 | @RequestMapping(value = "/extract", method = RequestMethod.GET)
49 | public Result extract(@RequestParam String url) {
50 | return extractor.extract(url);
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/src/test/resources/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Bootstrap 101 Template
8 |
9 |
10 |
11 |
12 |
13 |
14 |
18 |
23 |
24 |
25 |
45 |
46 |
47 |
48 |
Bootstrap starter template
49 |
Use this document as a way to quickly start any new project.
All you get is this text and a mostly barebones HTML document.
50 |
51 |
52 |
53 |
56 |
57 |
58 |
59 |
60 |
61 |
66 |
67 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/service/AbstractExtractor.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.service;
2 |
3 | import com.squareup.okhttp.OkHttpClient;
4 | import com.squareup.okhttp.Request;
5 | import com.squareup.okhttp.Response;
6 | import edu.nwnu.ququzone.extractor.result.FailureResult;
7 | import edu.nwnu.ququzone.extractor.result.Result;
8 | import org.jsoup.Jsoup;
9 | import org.jsoup.nodes.Document;
10 | import org.mozilla.universalchardet.UniversalDetector;
11 | import org.slf4j.Logger;
12 | import org.slf4j.LoggerFactory;
13 |
14 | import java.io.IOException;
15 | import java.util.concurrent.TimeUnit;
16 |
17 | /**
18 | * abstract extractor.
19 | *
20 | * @author Yang XuePing
21 | */
22 | public abstract class AbstractExtractor implements Extractor {
23 | private static final Logger LOG = LoggerFactory.getLogger(AbstractExtractor.class);
24 |
25 | protected OkHttpClient client;
26 |
27 | protected AbstractExtractor() {
28 | this.client = new OkHttpClient();
29 | this.client.setConnectTimeout(20, TimeUnit.SECONDS);
30 | this.client.setReadTimeout(20, TimeUnit.SECONDS);
31 | }
32 |
33 | @Override
34 | public Result extract(String url) {
35 | try {
36 | Document doc = getDocument(url);
37 | if (doc == null) {
38 | return new FailureResult(String.format("fetch %s document error.", url));
39 | }
40 | return parse(doc);
41 | } catch (ParseException e) {
42 | LOG.error(String.format("parse %s document exception.", url), e);
43 | return new FailureResult(e.getMessage());
44 | } catch (Exception e) {
45 | LOG.error(String.format("fetch %s document exception.", url), e);
46 | return new FailureResult(String.format("fetch %s document exception.", url));
47 | }
48 | }
49 |
50 | protected abstract Result parse(Document doc);
51 |
52 | protected Document getDocument(String url) {
53 | Request request = new Request.Builder()
54 | .url(url)
55 | .build();
56 | try {
57 | Response response = client.newCall(request).execute();
58 | if (response.isSuccessful()) {
59 | byte[] data = response.body().bytes();
60 | String encoding = detectEncoding(data);
61 | return Jsoup.parse(new String(data, encoding), url);
62 | } else {
63 | throw new RuntimeException(String.format("get %s document error", url));
64 | }
65 | } catch (IOException e) {
66 | LOG.error("fetch document error:" + url, e);
67 | }
68 | return null;
69 | }
70 |
71 | protected String detectEncoding(byte[] data) {
72 | UniversalDetector detector = new UniversalDetector(null);
73 | detector.handleData(data, 0, data.length);
74 | detector.dataEnd();
75 | String encoding = detector.getDetectedCharset();
76 | detector.reset();
77 | if (encoding == null) {
78 | encoding = "UTF-8";
79 | }
80 | return encoding;
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | edu.nwnu.ququzone
8 | smart-extractor
9 | 0.0.1-SNAPSHOT
10 | jar
11 |
12 |
13 | org.springframework.boot
14 | spring-boot-starter-parent
15 | 1.2.7.RELEASE
16 |
17 |
18 |
19 | 1.8
20 | ququzone
21 |
22 |
23 |
24 |
25 | org.springframework.boot
26 | spring-boot-starter-web
27 |
28 |
29 |
30 | com.squareup.okhttp
31 | okhttp
32 | 2.5.0
33 |
34 |
35 | org.jsoup
36 | jsoup
37 | 1.8.3
38 |
39 |
40 | com.google.guava
41 | guava
42 | 18.0
43 |
44 |
45 |
46 | org.testng
47 | testng
48 | 6.9.9
49 | test
50 |
51 |
52 |
53 | com.googlecode.juniversalchardet
54 | juniversalchardet
55 | 1.0.3
56 |
57 |
58 |
59 |
60 | smart-extractor
61 |
62 |
63 | org.springframework.boot
64 | spring-boot-maven-plugin
65 |
66 |
67 | com.spotify
68 | docker-maven-plugin
69 | 0.2.3
70 |
71 | ${docker.image.prefix}/${project.artifactId}
72 | src/main/docker
73 |
74 |
75 | /
76 | ${project.build.directory}
77 | ${project.build.finalName}.jar
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/src/main/resources/index.tpl:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Smart Extractor
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
Extract main content from web
13 |
15 |
16 |
90 |
91 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/service/RowBlockExtractor.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.service;
2 |
3 | import edu.nwnu.ququzone.extractor.result.Result;
4 | import edu.nwnu.ququzone.extractor.result.SuccessResult;
5 | import edu.nwnu.ququzone.extractor.utlis.StringUtils;
6 | import org.jsoup.nodes.Document;
7 | import org.jsoup.nodes.Element;
8 | import org.jsoup.nodes.Node;
9 | import org.jsoup.nodes.TextNode;
10 | import org.springframework.stereotype.Component;
11 |
12 | import java.util.HashMap;
13 | import java.util.LinkedList;
14 | import java.util.List;
15 | import java.util.Map;
16 |
17 | /**
18 | * row block extractor implement.
19 | *
20 | * @author Yang XuePing
21 | */
22 | @Component("rowBlockExtractor")
23 | public class RowBlockExtractor extends AbstractExtractor {
24 | @Override
25 | protected Result parse(Document doc) {
26 | return new SuccessResult(getArticleTitle(doc), getContentElement(doc).html());
27 | }
28 |
29 | public Element getContentElement(Document doc) {
30 | Map infoMap = new HashMap<>();
31 | clean(doc);
32 | computeInfo(doc.body(), infoMap);
33 | double maxScore = 0;
34 | Element content = null;
35 | for (Map.Entry entry : infoMap.entrySet()) {
36 | Element tag = entry.getKey();
37 | if (tag.tagName().equals("a") || tag == doc.body()) {
38 | continue;
39 | }
40 | double score = computeScore(tag, infoMap);
41 | if (score > maxScore) {
42 | maxScore = score;
43 | content = tag;
44 | }
45 | }
46 | if (content == null) {
47 | throw new RuntimeException("extraction failed");
48 | }
49 | return content;
50 | }
51 |
52 | private String getArticleTitle(Document doc) {
53 | String title = doc.title();
54 | if (!StringUtils.isEmpty(title)) {
55 | return title;
56 | }
57 | title = doc.select("head title").text().trim();
58 | if (!title.isEmpty()) {
59 | return title;
60 | }
61 | title = doc.select("head meta[name=title]").attr("content").trim();
62 | if (!title.isEmpty()) {
63 | return title;
64 | }
65 | title = doc.select("head meta[property=og:title]").attr("content").trim();
66 | if (!title.isEmpty()) {
67 | return title;
68 | }
69 | return doc.getElementsByTag("h1").text().trim();
70 | }
71 |
72 | protected void clean(Document doc) {
73 | doc.select("script,noscript,style,iframe,br").remove();
74 | }
75 |
76 | protected CountInfo computeInfo(Node node, Map infoMap) {
77 | if (node instanceof Element) {
78 | Element tag = (Element) node;
79 |
80 | CountInfo countInfo = new CountInfo();
81 | for (Node childNode : tag.childNodes()) {
82 | CountInfo childCountInfo = computeInfo(childNode, infoMap);
83 | countInfo.textCount += childCountInfo.textCount;
84 | countInfo.linkTextCount += childCountInfo.linkTextCount;
85 | countInfo.tagCount += childCountInfo.tagCount;
86 | countInfo.linkTagCount += childCountInfo.linkTagCount;
87 | countInfo.leafList.addAll(childCountInfo.leafList);
88 | countInfo.densitySum += childCountInfo.density;
89 | countInfo.pCount += childCountInfo.pCount;
90 | }
91 | countInfo.tagCount++;
92 | String tagName = tag.tagName();
93 | if ("a".equals(tagName)) {
94 | countInfo.linkTextCount = countInfo.textCount;
95 | countInfo.linkTagCount++;
96 | }
97 | if ("p".equals(tagName)) {
98 | countInfo.pCount++;
99 | }
100 |
101 | int pureLen = countInfo.textCount - countInfo.linkTextCount;
102 | int len = countInfo.tagCount - countInfo.linkTagCount;
103 | if (pureLen == 0 || len == 0) {
104 | countInfo.density = 0;
105 | } else {
106 | countInfo.density = (pureLen + 0.0) / len;
107 | }
108 | infoMap.put(tag, countInfo);
109 | return countInfo;
110 | }
111 | if (node instanceof TextNode) {
112 | TextNode tn = (TextNode) node;
113 | CountInfo countInfo = new CountInfo();
114 | int len = tn.text().length();
115 | countInfo.textCount = len;
116 | countInfo.leafList.add(len);
117 | return countInfo;
118 | }
119 | return new CountInfo();
120 | }
121 |
122 | protected double computeScore(Element tag, Map infoMap) {
123 | CountInfo countInfo = infoMap.get(tag);
124 | double var = Math.sqrt(computeVar(countInfo.leafList) + 1);
125 | double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
126 | return score;
127 | }
128 |
129 | private double computeVar(List data) {
130 | if (data.size() == 0) return 0;
131 |
132 | if (data.size() == 1) {
133 | return data.get(0) / 2;
134 | }
135 |
136 | double sum = 0;
137 | for (Integer i : data) {
138 | sum += i;
139 | }
140 | double ave = sum / data.size();
141 | sum = 0;
142 | for (Integer i : data) {
143 | sum += (i - ave) * (i - ave);
144 | }
145 | sum = sum / data.size();
146 | return sum;
147 | }
148 |
149 | private static class CountInfo {
150 | int textCount = 0;
151 | int linkTextCount = 0;
152 | int tagCount = 0;
153 | int linkTagCount = 0;
154 | double density = 0;
155 | double densitySum = 0;
156 | double score = 0;
157 | int pCount = 0;
158 | List leafList = new LinkedList<>();
159 | }
160 | }
161 |
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/service/ReadabilityExtractor.java:
--------------------------------------------------------------------------------
1 | package edu.nwnu.ququzone.extractor.service;
2 |
3 | import edu.nwnu.ququzone.extractor.result.FailureResult;
4 | import edu.nwnu.ququzone.extractor.result.Result;
5 | import edu.nwnu.ququzone.extractor.result.SuccessResult;
6 | import org.jsoup.nodes.Document;
7 | import org.jsoup.nodes.Element;
8 | import org.jsoup.nodes.TextNode;
9 | import org.springframework.stereotype.Component;
10 |
11 | import java.util.*;
12 | import java.util.regex.Pattern;
13 |
14 | /**
15 | * readability extractor.
16 | *
17 | * @author Yang XuePing
18 | */
19 | @Component("readabilityExtractor")
20 | public class ReadabilityExtractor extends AbstractExtractor {
21 | private static final Map REGEXPS = new HashMap<>();
22 |
23 | private static final Set DEFAULT_TAGS_TO_SCORE = new HashSet<>();
24 |
25 | private static final Set DIV_TO_P_ELEMS = new HashSet<>();
26 |
27 | static {
28 | REGEXPS.put("unlikelyCandidates", Pattern.compile("banner|combx|comment|community|disqus|extra|foot|header|" +
29 | "menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup|nav|navbar"));
30 | REGEXPS.put("okMaybeItsACandidate", Pattern.compile("and|article|body|column|main|shadow"));
31 | REGEXPS.put("positive", Pattern.compile("article|body|content|entry|hentry|main|page|pagination|post|text|blog|story"));
32 | REGEXPS.put("negative", Pattern.compile("hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|" +
33 | "meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget"));
34 | REGEXPS.put("extraneous", Pattern.compile("print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility"));
35 | REGEXPS.put("byline", Pattern.compile("byline|author|dateline|writtenby"));
36 | REGEXPS.put("replaceFonts", Pattern.compile("<(/?)font[^>]*>"));
37 | REGEXPS.put("normalize", Pattern.compile("\\s{2,}"));
38 | REGEXPS.put("videos", Pattern.compile("//(www\\.)?(dailymotion|youtube|youtube-nocookie|player\\.vimeo)\\.com"));
39 | REGEXPS.put("nextLink", Pattern.compile("(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))"));
40 | REGEXPS.put("prevLink", Pattern.compile("(prev|earl|old|new|<|«)"));
41 | REGEXPS.put("whitespace", Pattern.compile("^\\s*$"));
42 | REGEXPS.put("hasContent", Pattern.compile("\\S$"));
43 |
44 | DEFAULT_TAGS_TO_SCORE.add("section");
45 | DEFAULT_TAGS_TO_SCORE.add("h2");
46 | DEFAULT_TAGS_TO_SCORE.add("h3");
47 | DEFAULT_TAGS_TO_SCORE.add("h4");
48 | DEFAULT_TAGS_TO_SCORE.add("h5");
49 | DEFAULT_TAGS_TO_SCORE.add("h6");
50 | DEFAULT_TAGS_TO_SCORE.add("p");
51 | DEFAULT_TAGS_TO_SCORE.add("td");
52 | DEFAULT_TAGS_TO_SCORE.add("pre");
53 |
54 | DIV_TO_P_ELEMS.add("a");
55 | DIV_TO_P_ELEMS.add("blockquote");
56 | DIV_TO_P_ELEMS.add("dl");
57 | DIV_TO_P_ELEMS.add("div");
58 | DIV_TO_P_ELEMS.add("img");
59 | DIV_TO_P_ELEMS.add("ol");
60 | DIV_TO_P_ELEMS.add("p");
61 | DIV_TO_P_ELEMS.add("pre");
62 | DIV_TO_P_ELEMS.add("table");
63 | DIV_TO_P_ELEMS.add("ul");
64 | DIV_TO_P_ELEMS.add("select");
65 | }
66 |
67 | private static void removeElementByTag(Element element, String tag) {
68 | element.getElementsByTag(tag).forEach(node -> node.remove());
69 | }
70 |
71 | private static Element setNodeTag(Element element, String tag) {
72 | Element replacement = element.ownerDocument().createElement(tag);
73 | replacement.html(element.html());
74 | element.attributes().forEach(attr -> replacement.attr(attr.getKey(), attr.getValue()));
75 | element.replaceWith(replacement);
76 | return replacement;
77 | }
78 |
79 | private static boolean isEmpty(String str) {
80 | return str == null || "".equals(str.trim());
81 | }
82 |
83 | private static String getArticleTitle(Document doc) {
84 | String title = doc.title();
85 | if (isEmpty(title)) {
86 | title = doc.select("head title").text().trim();
87 | if (title.isEmpty()) {
88 | title = doc.select("head meta[name=title]").attr("content").trim();
89 | if (title.isEmpty()) {
90 | title = doc.select("head meta[property=og:title]").attr("content").trim();
91 | if (title.isEmpty()) {
92 | title = doc.getElementsByTag("h1").text();
93 | }
94 | }
95 | }
96 | }
97 | return title;
98 | }
99 |
100 | private static String grabArticle(Element body) {
101 | List elementsToScore = new LinkedList<>();
102 | Element node = body;
103 | while (node != null) {
104 | String matchString = node.className() + " " + node.id();
105 | if (REGEXPS.get("unlikelyCandidates").matcher(matchString).find() &&
106 | !REGEXPS.get("okMaybeItsACandidate").matcher(matchString).find() &&
107 | !"body".equalsIgnoreCase(node.tagName()) &&
108 | !"a".equalsIgnoreCase(node.tagName())) {
109 | node = removeAndGetNext(node);
110 | continue;
111 | }
112 |
113 | if (DEFAULT_TAGS_TO_SCORE.contains(node.tagName().toLowerCase())) {
114 | elementsToScore.add(node);
115 | }
116 |
117 | if ("div".equalsIgnoreCase(node.tagName())) {
118 | if (node.children().size() == 1 && "p".equalsIgnoreCase(node.child(0).tagName())) {
119 | node.replaceWith(node.child(0));
120 | } else if (!hasChildBlockElement(node)) {
121 | node = setNodeTag(node, "p");
122 | elementsToScore.add(node);
123 | } else {
124 | node.childNodes().forEach(childNode -> {
125 | if (childNode instanceof TextNode) {
126 | if (!"".equals(childNode.outerHtml().trim())) {
127 | Element p = childNode.ownerDocument().createElement("p");
128 | p.html(childNode.outerHtml());
129 | childNode.replaceWith(p);
130 | }
131 | }
132 | });
133 | }
134 | }
135 | node = getNextNode(node, false);
136 | }
137 |
138 | final List candidates = new LinkedList<>();
139 | elementsToScore.forEach(elementToScore -> {
140 | if (elementToScore.parent() == null || isEmpty(elementToScore.parent().tagName())) {
141 | return;
142 | }
143 | String innerText = getInnerText(elementToScore, true);
144 | if (innerText.length() < 10) {
145 | return;
146 | }
147 | List ancestors = getNodeAncestors(elementToScore, 3);
148 | if (ancestors.size() == 0) {
149 | return;
150 | }
151 | int contentScore = 0;
152 | contentScore += 1;
153 | contentScore += innerText.split(",").length;
154 | contentScore += Math.min(innerText.length() / 100, 3);
155 |
156 | // TODO
157 | });
158 |
159 | return "";
160 | }
161 |
162 | private static List getNodeAncestors(Element node, int maxDepth) {
163 | int i = 0;
164 | List ancestors = new LinkedList<>();
165 | while (node.parent() != null) {
166 | ancestors.add(node.parent());
167 | if (++i == maxDepth)
168 | break;
169 | node = node.parent();
170 | }
171 | return ancestors;
172 | }
173 |
174 | private static String getInnerText(Element node, boolean normalizeSpaces) {
175 | String textContent = node.text().trim();
176 | return normalizeSpaces ? textContent.replaceAll(REGEXPS.get("normalize").pattern(), " ") : textContent;
177 | }
178 |
179 | private static boolean hasChildBlockElement(Element node) {
180 | return node.children().stream().anyMatch(child ->
181 | DIV_TO_P_ELEMS.contains(child.tagName().toLowerCase()) || hasChildBlockElement(child));
182 | }
183 |
184 | private static Element removeAndGetNext(Element node) {
185 | Element nextNode = getNextNode(node, true);
186 | node.remove();
187 | return nextNode;
188 | }
189 |
190 | private static Element getNextNode(Element node, boolean ignoreSelfAndKids) {
191 | if (!ignoreSelfAndKids && node.children().size() > 0) {
192 | return node.child(0);
193 | }
194 | if (node.nextElementSibling() != null) {
195 | return node.nextElementSibling();
196 | }
197 | do {
198 | node = node.parent();
199 | } while (node != null && node.nextElementSibling() == null);
200 | return node == null ? null : node.nextElementSibling();
201 | }
202 |
203 | @Override
204 | protected Result parse(Document doc) {
205 | removeElementByTag(doc, "script");
206 | removeElementByTag(doc, "noscript");
207 |
208 | removeElementByTag(doc, "style");
209 | doc.body().getElementsByTag("font").forEach(font -> setNodeTag(font, "span"));
210 |
211 | String articleTitle = getArticleTitle(doc);
212 | String articleContent = grabArticle(doc.body());
213 |
214 | if (articleContent != null) {
215 | return new SuccessResult(articleTitle, articleContent);
216 | }
217 |
218 | return new FailureResult("Can't extract main content");
219 | }
220 | }
221 |
--------------------------------------------------------------------------------