├── .gitignore ├── README.md ├── pom.xml └── src ├── main ├── docker │ └── Dockerfile ├── java │ └── edu │ │ └── nwnu │ │ └── ququzone │ │ └── extractor │ │ ├── ExtractorConfiguration.java │ │ ├── result │ │ ├── FailureResult.java │ │ ├── Result.java │ │ └── SuccessResult.java │ │ ├── service │ │ ├── AbstractExtractor.java │ │ ├── Extractor.java │ │ ├── ParseException.java │ │ ├── ReadabilityExtractor.java │ │ └── RowBlockExtractor.java │ │ └── utlis │ │ └── StringUtils.java └── resources │ ├── application.properties │ └── index.tpl └── test ├── java └── edu │ └── nwnu │ └── ququzone │ └── extractor │ └── service │ └── ReadabilityExtractorTest.java └── resources └── test.html /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | *.log 4 | target 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | html smart extractor 2 | ==================== 3 | 4 | ### Introduction 5 | 6 | A micro-service for extracting the main content of a web page from its URL. 7 | 8 | ### Usage 9 | 10 | 1. Package 11 | 12 | ``` 13 | $ mvn clean package 14 | ``` 15 | 16 | 2. Run 17 | 18 | ``` 19 | $ java -jar target/smart-extractor.jar 20 | $ open http://localhost:8080 21 | ``` 22 | 23 | ### API 24 | 25 | 1. 
Extract `http://localhost:8080/extract?url={url}` 26 | 27 | ``` 28 | $ curl -i -X GET http://localhost:8080/extract\?url\=https://medium.com/@benjaminhardy/8-things-every-person-should-do-before-8-a-m-cc0233e15c8d 29 | ``` 30 | 31 | ### Build Docker Image 32 | 33 | $ mvn clean package 34 | $ mvn package docker:build 35 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | edu.nwnu.ququzone 8 | smart-extractor 9 | 0.0.1-SNAPSHOT 10 | jar 11 | 12 | 13 | org.springframework.boot 14 | spring-boot-starter-parent 15 | 1.2.7.RELEASE 16 | 17 | 18 | 19 | 1.8 20 | ququzone 21 | 22 | 23 | 24 | 25 | org.springframework.boot 26 | spring-boot-starter-web 27 | 28 | 29 | 30 | com.squareup.okhttp 31 | okhttp 32 | 2.5.0 33 | 34 | 35 | org.jsoup 36 | jsoup 37 | 1.8.3 38 | 39 | 40 | com.google.guava 41 | guava 42 | 18.0 43 | 44 | 45 | 46 | org.testng 47 | testng 48 | 6.9.9 49 | test 50 | 51 | 52 | 53 | com.googlecode.juniversalchardet 54 | juniversalchardet 55 | 1.0.3 56 | 57 | 58 | 59 | 60 | smart-extractor 61 | 62 | 63 | org.springframework.boot 64 | spring-boot-maven-plugin 65 | 66 | 67 | com.spotify 68 | docker-maven-plugin 69 | 0.2.3 70 | 71 | ${docker.image.prefix}/${project.artifactId} 72 | src/main/docker 73 | 74 | 75 | / 76 | ${project.build.directory} 77 | ${project.build.finalName}.jar 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /src/main/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM java:8 2 | VOLUME /tmp 3 | ADD smart-extractor.jar app.jar 4 | RUN bash -c 'touch /app.jar' 5 | ENTRYPOINT ["java","-Djava.security.egd=file:/dev/./urandom","-jar","/app.jar"] -------------------------------------------------------------------------------- 
/src/main/java/edu/nwnu/ququzone/extractor/ExtractorConfiguration.java:
--------------------------------------------------------------------------------
package edu.nwnu.ququzone.extractor;

import com.google.common.io.ByteStreams;
import edu.nwnu.ququzone.extractor.result.Result;
import edu.nwnu.ququzone.extractor.service.Extractor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;

/**
 * Spring Boot entry point and REST controller of the extractor service.
 *
 * <p>Serves the index page on {@code /} and delegates {@code /extract} requests to the
 * {@link Extractor} bean qualified as {@code rowBlockExtractor}.
 *
 * @author Yang XuePing
 */
@SpringBootApplication
@RestController
public class ExtractorConfiguration {
    /** Classpath name of the index page template. */
    private static final String INDEX_TEMPLATE_RESOURCE = "index.tpl";

    /** Text of the index page, read once while the class is initialised. */
    private static final String INDEX_TEMPLATE = loadIndexTemplate();

    @Autowired
    @Qualifier("rowBlockExtractor")
    private Extractor extractor;

    /**
     * Starts the Spring Boot application.
     *
     * @param args command line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(ExtractorConfiguration.class, args);
    }

    /**
     * Handles {@code GET /}.
     *
     * @return the text of the index page template
     */
    @RequestMapping(value = "/", method = RequestMethod.GET)
    public String index() {
        return INDEX_TEMPLATE;
    }

    /**
     * Handles {@code GET /extract}.
     *
     * @param url address of the page whose main content should be extracted
     * @return whatever the configured {@link Extractor} returns for {@code url}
     */
    @RequestMapping(value = "/extract", method = RequestMethod.GET)
    public Result extract(@RequestParam String url) {
        return extractor.extract(url);
    }

    /**
     * Reads the index template from the classpath.
     *
     * <p>A missing or unreadable template is reported with an exception that names the resource,
     * instead of surfacing as a bare {@code NullPointerException} (missing resource) or being
     * silently ignored (read error) as before.
     *
     * @return the template text
     * @throws IllegalStateException if the resource is not on the classpath
     * @throws UncheckedIOException  if the resource cannot be read
     */
    private static String loadIndexTemplate() {
        // try-with-resources closes the stream; a null resource is skipped by the implicit close.
        try (InputStream in = ExtractorConfiguration.class.getClassLoader()
                .getResourceAsStream(INDEX_TEMPLATE_RESOURCE)) {
            if (in == null) {
                throw new IllegalStateException("classpath resource not found: " + INDEX_TEMPLATE_RESOURCE);
            }
            // NOTE(review): assumes the template file is UTF-8 encoded - confirm. Decoding no
            // longer depends on the platform default charset.
            return new String(ByteStreams.toByteArray(in), StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new UncheckedIOException("cannot read classpath resource: " + INDEX_TEMPLATE_RESOURCE, e);
        }
    }
}
--------------------------------------------------------------------------------
/src/main/java/edu/nwnu/ququzone/extractor/result/FailureResult.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.result; 2 | 3 | /** 4 | * failure result 5 | * 6 | * @author Yang XuePing 7 | */ 8 | public class FailureResult extends Result { 9 | private final String msg; 10 | 11 | public FailureResult(String msg) { 12 | super(false); 13 | this.msg = msg; 14 | } 15 | 16 | public String getMsg() { 17 | return msg; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/result/Result.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.result; 2 | 3 | /** 4 | * extractor result. 5 | * 6 | * @author Yang XuePing 7 | */ 8 | public class Result { 9 | private final boolean success; 10 | 11 | public Result(boolean success) { 12 | this.success = success; 13 | } 14 | 15 | public boolean isSuccess() { 16 | return success; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/result/SuccessResult.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.result; 2 | 3 | /** 4 | * successful result. 
5 | * 6 | * @author Yang XuePing 7 | */ 8 | public class SuccessResult extends Result { 9 | private final String title; 10 | private final String html; 11 | 12 | public SuccessResult(String title, String html) { 13 | super(true); 14 | this.title = title; 15 | this.html = html; 16 | } 17 | 18 | public String getTitle() { 19 | return title; 20 | } 21 | 22 | public String getHtml() { 23 | return html; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/service/AbstractExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.service; 2 | 3 | import com.squareup.okhttp.OkHttpClient; 4 | import com.squareup.okhttp.Request; 5 | import com.squareup.okhttp.Response; 6 | import edu.nwnu.ququzone.extractor.result.FailureResult; 7 | import edu.nwnu.ququzone.extractor.result.Result; 8 | import org.jsoup.Jsoup; 9 | import org.jsoup.nodes.Document; 10 | import org.mozilla.universalchardet.UniversalDetector; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.io.IOException; 15 | import java.util.concurrent.TimeUnit; 16 | 17 | /** 18 | * abstract extractor. 
19 | * 20 | * @author Yang XuePing 21 | */ 22 | public abstract class AbstractExtractor implements Extractor { 23 | private static final Logger LOG = LoggerFactory.getLogger(AbstractExtractor.class); 24 | 25 | protected OkHttpClient client; 26 | 27 | protected AbstractExtractor() { 28 | this.client = new OkHttpClient(); 29 | this.client.setConnectTimeout(20, TimeUnit.SECONDS); 30 | this.client.setReadTimeout(20, TimeUnit.SECONDS); 31 | } 32 | 33 | @Override 34 | public Result extract(String url) { 35 | try { 36 | Document doc = getDocument(url); 37 | if (doc == null) { 38 | return new FailureResult(String.format("fetch %s document error.", url)); 39 | } 40 | return parse(doc); 41 | } catch (ParseException e) { 42 | LOG.error(String.format("parse %s document exception.", url), e); 43 | return new FailureResult(e.getMessage()); 44 | } catch (Exception e) { 45 | LOG.error(String.format("fetch %s document exception.", url), e); 46 | return new FailureResult(String.format("fetch %s document exception.", url)); 47 | } 48 | } 49 | 50 | protected abstract Result parse(Document doc); 51 | 52 | protected Document getDocument(String url) { 53 | Request request = new Request.Builder() 54 | .url(url) 55 | .build(); 56 | try { 57 | Response response = client.newCall(request).execute(); 58 | if (response.isSuccessful()) { 59 | byte[] data = response.body().bytes(); 60 | String encoding = detectEncoding(data); 61 | return Jsoup.parse(new String(data, encoding), url); 62 | } else { 63 | throw new RuntimeException(String.format("get %s document error", url)); 64 | } 65 | } catch (IOException e) { 66 | LOG.error("fetch document error:" + url, e); 67 | } 68 | return null; 69 | } 70 | 71 | protected String detectEncoding(byte[] data) { 72 | UniversalDetector detector = new UniversalDetector(null); 73 | detector.handleData(data, 0, data.length); 74 | detector.dataEnd(); 75 | String encoding = detector.getDetectedCharset(); 76 | detector.reset(); 77 | if (encoding == null) { 78 | 
encoding = "UTF-8"; 79 | } 80 | return encoding; 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/service/Extractor.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.service; 2 | 3 | import edu.nwnu.ququzone.extractor.result.Result; 4 | 5 | /** 6 | * extract page main content. 7 | * 8 | * @author Yang XuePing 9 | */ 10 | public interface Extractor { 11 | Result extract(String url); 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/service/ParseException.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.service; 2 | 3 | /** 4 | * parse exception. 5 | * 6 | * @author Yang XuePing 7 | */ 8 | public class ParseException extends RuntimeException { 9 | public ParseException(String message) { 10 | super(message); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/service/ReadabilityExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.service; 2 | 3 | import edu.nwnu.ququzone.extractor.result.FailureResult; 4 | import edu.nwnu.ququzone.extractor.result.Result; 5 | import edu.nwnu.ququzone.extractor.result.SuccessResult; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.nodes.TextNode; 9 | import org.springframework.stereotype.Component; 10 | 11 | import java.util.*; 12 | import java.util.regex.Pattern; 13 | 14 | /** 15 | * readability extractor. 
16 | * 17 | * @author Yang XuePing 18 | */ 19 | @Component("readabilityExtractor") 20 | public class ReadabilityExtractor extends AbstractExtractor { 21 | private static final Map REGEXPS = new HashMap<>(); 22 | 23 | private static final Set DEFAULT_TAGS_TO_SCORE = new HashSet<>(); 24 | 25 | private static final Set DIV_TO_P_ELEMS = new HashSet<>(); 26 | 27 | static { 28 | REGEXPS.put("unlikelyCandidates", Pattern.compile("banner|combx|comment|community|disqus|extra|foot|header|" + 29 | "menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup|nav|navbar")); 30 | REGEXPS.put("okMaybeItsACandidate", Pattern.compile("and|article|body|column|main|shadow")); 31 | REGEXPS.put("positive", Pattern.compile("article|body|content|entry|hentry|main|page|pagination|post|text|blog|story")); 32 | REGEXPS.put("negative", Pattern.compile("hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|" + 33 | "meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget")); 34 | REGEXPS.put("extraneous", Pattern.compile("print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility")); 35 | REGEXPS.put("byline", Pattern.compile("byline|author|dateline|writtenby")); 36 | REGEXPS.put("replaceFonts", Pattern.compile("<(/?)font[^>]*>")); 37 | REGEXPS.put("normalize", Pattern.compile("\\s{2,}")); 38 | REGEXPS.put("videos", Pattern.compile("//(www\\.)?(dailymotion|youtube|youtube-nocookie|player\\.vimeo)\\.com")); 39 | REGEXPS.put("nextLink", Pattern.compile("(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))")); 40 | REGEXPS.put("prevLink", Pattern.compile("(prev|earl|old|new|<|«)")); 41 | REGEXPS.put("whitespace", Pattern.compile("^\\s*$")); 42 | REGEXPS.put("hasContent", Pattern.compile("\\S$")); 43 | 44 | DEFAULT_TAGS_TO_SCORE.add("section"); 45 | DEFAULT_TAGS_TO_SCORE.add("h2"); 46 | DEFAULT_TAGS_TO_SCORE.add("h3"); 47 | DEFAULT_TAGS_TO_SCORE.add("h4"); 48 | 
DEFAULT_TAGS_TO_SCORE.add("h5"); 49 | DEFAULT_TAGS_TO_SCORE.add("h6"); 50 | DEFAULT_TAGS_TO_SCORE.add("p"); 51 | DEFAULT_TAGS_TO_SCORE.add("td"); 52 | DEFAULT_TAGS_TO_SCORE.add("pre"); 53 | 54 | DIV_TO_P_ELEMS.add("a"); 55 | DIV_TO_P_ELEMS.add("blockquote"); 56 | DIV_TO_P_ELEMS.add("dl"); 57 | DIV_TO_P_ELEMS.add("div"); 58 | DIV_TO_P_ELEMS.add("img"); 59 | DIV_TO_P_ELEMS.add("ol"); 60 | DIV_TO_P_ELEMS.add("p"); 61 | DIV_TO_P_ELEMS.add("pre"); 62 | DIV_TO_P_ELEMS.add("table"); 63 | DIV_TO_P_ELEMS.add("ul"); 64 | DIV_TO_P_ELEMS.add("select"); 65 | } 66 | 67 | private static void removeElementByTag(Element element, String tag) { 68 | element.getElementsByTag(tag).forEach(node -> node.remove()); 69 | } 70 | 71 | private static Element setNodeTag(Element element, String tag) { 72 | Element replacement = element.ownerDocument().createElement(tag); 73 | replacement.html(element.html()); 74 | element.attributes().forEach(attr -> replacement.attr(attr.getKey(), attr.getValue())); 75 | element.replaceWith(replacement); 76 | return replacement; 77 | } 78 | 79 | private static boolean isEmpty(String str) { 80 | return str == null || "".equals(str.trim()); 81 | } 82 | 83 | private static String getArticleTitle(Document doc) { 84 | String title = doc.title(); 85 | if (isEmpty(title)) { 86 | title = doc.select("head title").text().trim(); 87 | if (title.isEmpty()) { 88 | title = doc.select("head meta[name=title]").attr("content").trim(); 89 | if (title.isEmpty()) { 90 | title = doc.select("head meta[property=og:title]").attr("content").trim(); 91 | if (title.isEmpty()) { 92 | title = doc.getElementsByTag("h1").text(); 93 | } 94 | } 95 | } 96 | } 97 | return title; 98 | } 99 | 100 | private static String grabArticle(Element body) { 101 | List elementsToScore = new LinkedList<>(); 102 | Element node = body; 103 | while (node != null) { 104 | String matchString = node.className() + " " + node.id(); 105 | if (REGEXPS.get("unlikelyCandidates").matcher(matchString).find() && 106 | 
!REGEXPS.get("okMaybeItsACandidate").matcher(matchString).find() && 107 | !"body".equalsIgnoreCase(node.tagName()) && 108 | !"a".equalsIgnoreCase(node.tagName())) { 109 | node = removeAndGetNext(node); 110 | continue; 111 | } 112 | 113 | if (DEFAULT_TAGS_TO_SCORE.contains(node.tagName().toLowerCase())) { 114 | elementsToScore.add(node); 115 | } 116 | 117 | if ("div".equalsIgnoreCase(node.tagName())) { 118 | if (node.children().size() == 1 && "p".equalsIgnoreCase(node.child(0).tagName())) { 119 | node.replaceWith(node.child(0)); 120 | } else if (!hasChildBlockElement(node)) { 121 | node = setNodeTag(node, "p"); 122 | elementsToScore.add(node); 123 | } else { 124 | node.childNodes().forEach(childNode -> { 125 | if (childNode instanceof TextNode) { 126 | if (!"".equals(childNode.outerHtml().trim())) { 127 | Element p = childNode.ownerDocument().createElement("p"); 128 | p.html(childNode.outerHtml()); 129 | childNode.replaceWith(p); 130 | } 131 | } 132 | }); 133 | } 134 | } 135 | node = getNextNode(node, false); 136 | } 137 | 138 | final List candidates = new LinkedList<>(); 139 | elementsToScore.forEach(elementToScore -> { 140 | if (elementToScore.parent() == null || isEmpty(elementToScore.parent().tagName())) { 141 | return; 142 | } 143 | String innerText = getInnerText(elementToScore, true); 144 | if (innerText.length() < 10) { 145 | return; 146 | } 147 | List ancestors = getNodeAncestors(elementToScore, 3); 148 | if (ancestors.size() == 0) { 149 | return; 150 | } 151 | int contentScore = 0; 152 | contentScore += 1; 153 | contentScore += innerText.split(",").length; 154 | contentScore += Math.min(innerText.length() / 100, 3); 155 | 156 | // TODO 157 | }); 158 | 159 | return ""; 160 | } 161 | 162 | private static List getNodeAncestors(Element node, int maxDepth) { 163 | int i = 0; 164 | List ancestors = new LinkedList<>(); 165 | while (node.parent() != null) { 166 | ancestors.add(node.parent()); 167 | if (++i == maxDepth) 168 | break; 169 | node = node.parent(); 170 
| } 171 | return ancestors; 172 | } 173 | 174 | private static String getInnerText(Element node, boolean normalizeSpaces) { 175 | String textContent = node.text().trim(); 176 | return normalizeSpaces ? textContent.replaceAll(REGEXPS.get("normalize").pattern(), " ") : textContent; 177 | } 178 | 179 | private static boolean hasChildBlockElement(Element node) { 180 | return node.children().stream().anyMatch(child -> 181 | DIV_TO_P_ELEMS.contains(child.tagName().toLowerCase()) || hasChildBlockElement(child)); 182 | } 183 | 184 | private static Element removeAndGetNext(Element node) { 185 | Element nextNode = getNextNode(node, true); 186 | node.remove(); 187 | return nextNode; 188 | } 189 | 190 | private static Element getNextNode(Element node, boolean ignoreSelfAndKids) { 191 | if (!ignoreSelfAndKids && node.children().size() > 0) { 192 | return node.child(0); 193 | } 194 | if (node.nextElementSibling() != null) { 195 | return node.nextElementSibling(); 196 | } 197 | do { 198 | node = node.parent(); 199 | } while (node != null && node.nextElementSibling() == null); 200 | return node == null ? 
null : node.nextElementSibling(); 201 | } 202 | 203 | @Override 204 | protected Result parse(Document doc) { 205 | removeElementByTag(doc, "script"); 206 | removeElementByTag(doc, "noscript"); 207 | 208 | removeElementByTag(doc, "style"); 209 | doc.body().getElementsByTag("font").forEach(font -> setNodeTag(font, "span")); 210 | 211 | String articleTitle = getArticleTitle(doc); 212 | String articleContent = grabArticle(doc.body()); 213 | 214 | if (articleContent != null) { 215 | return new SuccessResult(articleTitle, articleContent); 216 | } 217 | 218 | return new FailureResult("Can't extract main content"); 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/service/RowBlockExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.service; 2 | 3 | import edu.nwnu.ququzone.extractor.result.Result; 4 | import edu.nwnu.ququzone.extractor.result.SuccessResult; 5 | import edu.nwnu.ququzone.extractor.utlis.StringUtils; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.nodes.Node; 9 | import org.jsoup.nodes.TextNode; 10 | import org.springframework.stereotype.Component; 11 | 12 | import java.util.HashMap; 13 | import java.util.LinkedList; 14 | import java.util.List; 15 | import java.util.Map; 16 | 17 | /** 18 | * row block extractor implement. 
19 | * 20 | * @author Yang XuePing 21 | */ 22 | @Component("rowBlockExtractor") 23 | public class RowBlockExtractor extends AbstractExtractor { 24 | @Override 25 | protected Result parse(Document doc) { 26 | return new SuccessResult(getArticleTitle(doc), getContentElement(doc).html()); 27 | } 28 | 29 | public Element getContentElement(Document doc) { 30 | Map infoMap = new HashMap<>(); 31 | clean(doc); 32 | computeInfo(doc.body(), infoMap); 33 | double maxScore = 0; 34 | Element content = null; 35 | for (Map.Entry entry : infoMap.entrySet()) { 36 | Element tag = entry.getKey(); 37 | if (tag.tagName().equals("a") || tag == doc.body()) { 38 | continue; 39 | } 40 | double score = computeScore(tag, infoMap); 41 | if (score > maxScore) { 42 | maxScore = score; 43 | content = tag; 44 | } 45 | } 46 | if (content == null) { 47 | throw new RuntimeException("extraction failed"); 48 | } 49 | return content; 50 | } 51 | 52 | private String getArticleTitle(Document doc) { 53 | String title = doc.title(); 54 | if (!StringUtils.isEmpty(title)) { 55 | return title; 56 | } 57 | title = doc.select("head title").text().trim(); 58 | if (!title.isEmpty()) { 59 | return title; 60 | } 61 | title = doc.select("head meta[name=title]").attr("content").trim(); 62 | if (!title.isEmpty()) { 63 | return title; 64 | } 65 | title = doc.select("head meta[property=og:title]").attr("content").trim(); 66 | if (!title.isEmpty()) { 67 | return title; 68 | } 69 | return doc.getElementsByTag("h1").text().trim(); 70 | } 71 | 72 | protected void clean(Document doc) { 73 | doc.select("script,noscript,style,iframe,br").remove(); 74 | } 75 | 76 | protected CountInfo computeInfo(Node node, Map infoMap) { 77 | if (node instanceof Element) { 78 | Element tag = (Element) node; 79 | 80 | CountInfo countInfo = new CountInfo(); 81 | for (Node childNode : tag.childNodes()) { 82 | CountInfo childCountInfo = computeInfo(childNode, infoMap); 83 | countInfo.textCount += childCountInfo.textCount; 84 | 
countInfo.linkTextCount += childCountInfo.linkTextCount; 85 | countInfo.tagCount += childCountInfo.tagCount; 86 | countInfo.linkTagCount += childCountInfo.linkTagCount; 87 | countInfo.leafList.addAll(childCountInfo.leafList); 88 | countInfo.densitySum += childCountInfo.density; 89 | countInfo.pCount += childCountInfo.pCount; 90 | } 91 | countInfo.tagCount++; 92 | String tagName = tag.tagName(); 93 | if ("a".equals(tagName)) { 94 | countInfo.linkTextCount = countInfo.textCount; 95 | countInfo.linkTagCount++; 96 | } 97 | if ("p".equals(tagName)) { 98 | countInfo.pCount++; 99 | } 100 | 101 | int pureLen = countInfo.textCount - countInfo.linkTextCount; 102 | int len = countInfo.tagCount - countInfo.linkTagCount; 103 | if (pureLen == 0 || len == 0) { 104 | countInfo.density = 0; 105 | } else { 106 | countInfo.density = (pureLen + 0.0) / len; 107 | } 108 | infoMap.put(tag, countInfo); 109 | return countInfo; 110 | } 111 | if (node instanceof TextNode) { 112 | TextNode tn = (TextNode) node; 113 | CountInfo countInfo = new CountInfo(); 114 | int len = tn.text().length(); 115 | countInfo.textCount = len; 116 | countInfo.leafList.add(len); 117 | return countInfo; 118 | } 119 | return new CountInfo(); 120 | } 121 | 122 | protected double computeScore(Element tag, Map infoMap) { 123 | CountInfo countInfo = infoMap.get(tag); 124 | double var = Math.sqrt(computeVar(countInfo.leafList) + 1); 125 | double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2); 126 | return score; 127 | } 128 | 129 | private double computeVar(List data) { 130 | if (data.size() == 0) return 0; 131 | 132 | if (data.size() == 1) { 133 | return data.get(0) / 2; 134 | } 135 | 136 | double sum = 0; 137 | for (Integer i : data) { 138 | sum += i; 139 | } 140 | double ave = sum / data.size(); 141 | sum = 0; 142 | for (Integer i : data) { 143 | sum += (i - ave) * (i - ave); 144 | } 145 | sum = sum / data.size(); 146 | 
return sum; 147 | } 148 | 149 | private static class CountInfo { 150 | int textCount = 0; 151 | int linkTextCount = 0; 152 | int tagCount = 0; 153 | int linkTagCount = 0; 154 | double density = 0; 155 | double densitySum = 0; 156 | double score = 0; 157 | int pCount = 0; 158 | List leafList = new LinkedList<>(); 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /src/main/java/edu/nwnu/ququzone/extractor/utlis/StringUtils.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.utlis; 2 | 3 | /** 4 | * String utls. 5 | * 6 | * @author Yang XuePing 7 | */ 8 | public final class StringUtils { 9 | public static boolean isEmpty(String str) { 10 | return str == null || "".equals(str.trim()); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | spring.application.name=smart-extractor 2 | spring.profiles.active=production 3 | 4 | logging.file=smart-extractor.log 5 | 6 | spring.mvc.view.prefix=/WEB-INF/jsp/ 7 | spring.mvc.view.suffix=.jsp -------------------------------------------------------------------------------- /src/main/resources/index.tpl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Smart Extractor 5 | 6 | 7 | 8 | 9 | 10 | 11 |

12 |

Extract main content from web

13 |

14 |

15 |

16 | 90 | 91 | -------------------------------------------------------------------------------- /src/test/java/edu/nwnu/ququzone/extractor/service/ReadabilityExtractorTest.java: -------------------------------------------------------------------------------- 1 | package edu.nwnu.ququzone.extractor.service; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.testng.annotations.Test; 6 | 7 | import java.net.URL; 8 | 9 | /** 10 | * test readability extractor. 11 | * 12 | * @author Yang XuePing 13 | */ 14 | public class ReadabilityExtractorTest { 15 | @Test 16 | public void testCrawl() throws Exception { 17 | Document doc = Jsoup.parse(ReadabilityExtractorTest.class.getClassLoader().getResourceAsStream("test.html"), "UTF-8", "/"); 18 | // Document doc = Jsoup.parse(new URL("http://192.168.1.102:3000"), 2000); 19 | 20 | ReadabilityExtractor extractor = new ReadabilityExtractor(); 21 | extractor.parse(doc); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/resources/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Bootstrap 101 Template 8 | 9 | 10 | 11 | 12 | 13 | 14 | 18 | 23 | 24 | 25 | 45 | 46 |

47 |

48 |

Bootstrap starter template

49 |

Use this document as a way to quickly start any new project.
All you get is this text and a mostly barebones HTML document.

50 |

51 |

52 | 53 | 56 | 57 | 58 | 59 | 60 | 61 | 66 | 67 | --------------------------------------------------------------------------------